
[DACON] 생육 환경 최적화 경진대회

Reference | CNN+CatBoost+ANN

  • It was interesting that the solution mixes DL and ML models

Abstract

  • CNN: adjust image brightness -> mask with HSV -> extract pixel ratio -> handle outliers -> train
  • CatBoost, ANN: add the image pixel ratio as a feature -> preprocess -> train

Library

import os
from glob import glob

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import groupby
import random

import seaborn as sns

import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torch.nn as nn

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanAbsoluteError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from catboost import CatBoostRegressor

import matplotlib.pyplot as plt
import koreanize_matplotlib
main_path = "./open"
train_imgs = glob(main_path + "/train/*/*/*.png") + glob(main_path + "/train/*/*/*.jpg")
train_imgs = sorted(train_imgs)
test_imgs = glob(main_path + "/test/image/*.png") + glob(main_path + "/test/image/*.jpg")
test_imgs = sorted(test_imgs)
train_data = glob(main_path + "/train/*/meta/*.csv")
train_data = sorted(train_data)
train_label = glob(main_path + "/train/*/*.csv")
train_label = sorted(train_label)
test_data = glob(main_path + "/test/meta/*.csv")
test_data = sorted(test_data)

preprocessing_train_imgs = main_path + "/PREPROCESSING-TRAIN"
preprocessing_test_imgs = main_path + "/PREPROCESSING-TEST"

if not os.path.exists(preprocessing_train_imgs):
    os.mkdir(preprocessing_train_imgs)
if not os.path.exists(preprocessing_test_imgs):
    os.mkdir(preprocessing_test_imgs)

Image Augmentation

Automatically adjust the brightness of every image using its grayscale histogram.

def automatic_brightness_and_contrast(img, clip_hist_pct=0.025):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Calculate grayscale hist
    hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
    hist_size = len(hist)
    # Calculate cumulative distribution from the hist
    accumulator = []
    accumulator.append(float(hist[0]))
    for idx in range(1, hist_size):
        accumulator.append(accumulator[idx-1]+float(hist[idx]))
    # Locate points to clip
    maximum = accumulator[-1]
    clip_hist_pct *= (maximum/100.0)
    clip_hist_pct /= 2.0
    # Locate Left Cut
    minimum_gray = 0
    while accumulator[minimum_gray]<clip_hist_pct: minimum_gray += 1
    # Locate Right Cut
    maximum_gray = hist_size-1
    while accumulator[maximum_gray]>=(maximum-clip_hist_pct): maximum_gray -= 1
    # Calculate alpha and beta val.
    alpha = 255 / (maximum_gray-minimum_gray)
    beta = -minimum_gray*alpha

    auto_result = cv2.convertScaleAbs(img, alpha=alpha, beta=beta)
    return (auto_result)
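
A quick way to eyeball the correction on a single image (a minimal sketch; assumes train_imgs from above is non-empty):

sample = cv2.imread(train_imgs[0], cv2.IMREAD_COLOR)
corrected = automatic_brightness_and_contrast(sample)

# cv2 loads BGR; convert to RGB for matplotlib
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
axes[0].imshow(cv2.cvtColor(sample, cv2.COLOR_BGR2RGB)); axes[0].set_title("original")
axes[1].imshow(cv2.cvtColor(corrected, cv2.COLOR_BGR2RGB)); axes[1].set_title("corrected")
plt.show()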

Data Processing

  • After brightness adjustment, extract the plant-colored objects with an HSV mask
  • Compute the ratio of masked pixels: white / (white + black)
def get_image_data(dir_in, dir_out):
    ratio_list = []
    for i in tqdm(dir_in):
        name = i.split("\\")[-1]  # Windows path separator; os.path.basename(i) is the portable equivalent
        img = cv2.imread(i, cv2.IMREAD_COLOR)
        img = cv2.resize(img, (615, 462))
        brightscale = automatic_brightness_and_contrast(img)
        imgcopy = brightscale.copy()
        hsvimg = cv2.cvtColor(brightscale, cv2.COLOR_BGR2HSV)
        lower = np.array([22, 40, 0])
        upper = np.array([85, 255, 245])
        mask = cv2.inRange(hsvimg, lower, upper)
        number_of_white_pix = np.sum(mask==255)
        number_of_black_pix = np.sum(mask==0)
        ratio = number_of_white_pix / (number_of_white_pix + number_of_black_pix)
        ratio_list.append(ratio)
        result = cv2.bitwise_and(imgcopy, imgcopy, mask=mask)
        cv2.imwrite(os.path.join(dir_out, name), result)
    return ratio_list
ratio_train = get_image_data(train_imgs, preprocessing_train_imgs)
ratio_test = get_image_data(test_imgs, preprocessing_test_imgs)

processed_train_imgs = glob(main_path+"/PREPROCESSING-TRAIN/*.png") + glob(main_path+"/PREPROCESSING-TRAIN/*.jpg")
processed_train_imgs = sorted(processed_train_imgs)

processed_test_imgs = glob(main_path+"/PREPROCESSING-TEST/*.png") + glob(main_path+"/PREPROCESSING-TEST/*.jpg")
processed_test_imgs = sorted(processed_test_imgs)
100%|██████████| 1592/1592 [04:44<00:00,  5.59it/s]
100%|██████████| 460/460 [01:24<00:00,  5.44it/s]
  • Variables with a fixed dosing amount (최근분무량) get missing values filled with bfill/ffill
  • Variables without a fixed amount are interpolated
  • In some cases 최근분무량 is recorded as a daily cumulative counter, so every case is converted to a daily cumulative spray amount (see the toy sketch below)
  • The pixel ratio is added to the metadata as a feature
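
The conversion in the loop below leans on itertools.groupby; a toy sketch of the two branches (values are made up, and in the real loop the branch is chosen by the column mean):

from itertools import groupby

# Branch 1 (mean > 1000): the column is already a cumulative counter that may
# reset to zero once during the day, leaving up to two nonzero runs.
water = [500.0, 520.0, 560.0, 0.0, 30.0, 75.0]
runs = [list(v) for k, v in groupby(water, key=lambda x: x != 0) if k]
# runs == [[500.0, 520.0, 560.0], [30.0, 75.0]]
daily = runs[0][-1] - runs[0][0] + runs[1][-1]   # 60 + 75 = 135

# Branch 2 (0 < mean < 1000): per-event amounts repeat between sprays, so sum
# the distinct consecutive values after the leading zero.
water = [0.0, 12.5, 12.5, 20.0, 20.0]
events = [key for key, _ in groupby(water)]      # [0.0, 12.5, 20.0]
daily = sum(events[1:])                          # 32.5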
train_df = []
for i in tqdm(train_data):
    name = i.split("\\")[-1].split(".")[0]
    df = pd.read_csv(i)
    df = df.drop('시간', axis = 1)
    case = name.split("_")[0]
    label = pd.read_csv(main_path + f"/train/{case}/label.csv")
    label_name = [i.split(".")[0] for i in label.img_name]
    label.img_name = label_name
    leaf_weight = label[label.img_name == name].leaf_weight.values[0]
    df["무게"] = leaf_weight
    df["최근분무량"] = df["최근분무량"].fillna(method='bfill', limit=1)
    df["최근분무량"] = df["최근분무량"].fillna(method='ffill', limit=1)
    df = df.interpolate()
    water = df['최근분무량'].round(2).tolist()
    if np.mean(water) > 1000:
        nums = [list(v) for k,v in groupby(water, key = lambda x: x != 0) if k != 0]
        if len(nums) == 2:
            cumulative = nums[0][-1] - nums[0][0] + nums[1][-1]
        else:
            cumulative = nums[0][-1] - nums[0][0]
            
    elif 1000 > np.mean(water) > 0:
        nums = [key for key, _ in groupby(water)]
        cumulative = sum(nums[1:])
    else:
        cumulative = 0

    df = df.mean()
    df = df.to_frame().T
    df["이미지"] = name
    df['최근분무량'] = cumulative

    train_df.append(df)

train_df = pd.concat(train_df, ignore_index=True)
train_df['비율'] = ratio_train
train_df.head()
100%|██████████| 1592/1592 [00:45<00:00, 35.23it/s]
[train_df.head(): 5 rows × 21 columns; per-case means of the sensor columns (내부온도관측치, 외부온도관측치, …), the converted 최근분무량, the target 무게, the 이미지 name, and the new 비율 column]

test_df = []
for i in tqdm(test_data):
    name = i.split("\\")[-1].split(".")[0]
    df = pd.read_csv(i)
    df = df.drop("시간", axis=1)
    df["최근분무량"] = df["최근분무량"].fillna(method="bfill", limit=1)
    df["최근분무량"] = df["최근분무량"].fillna(method="ffill", limit=1)
    df = df.interpolate()
    water = df["최근분무량"].round(2).tolist()
    if np.mean(water)>1000:
        nums = [list(v) for k,v in groupby(water, key = lambda x: x!=0) if k != 0]
        if len(nums)==2: cumulative = nums[0][-1] - nums[0][0] + nums[1][-1]
        else: cumulative = nums[0][-1] - nums[0][0]
    elif 1000>np.mean(water)>0:
        nums = [key for key, _group in groupby(water)]
        cumulative = sum(nums[1:])
    else:
        cumulative = 0
    
    df = df.mean()
    df = df.to_frame().T
    df["이미지"] = name
    df["최근분무량"] = cumulative

    test_df.append(df)

test_df = pd.concat(test_df, ignore_index=True)
test_df["비율"] = ratio_test
test_df.head()
100%|██████████| 460/460 [00:12<00:00, 37.79it/s]
[test_df.head(): the same per-case sensor means plus 이미지 and 비율, without 무게]

Image EDA

  • Use a scatter plot to check the relationship between 무게 (weight) and 비율 (pixel ratio)
  • Points far from the regression line are treated as outliers and removed (a residual-based sketch follows the plot)
  • CASE59 is a duplicate of CASE58
_ = sns.scatterplot(data=train_df, x="무게", y="비율")

[Figure: scatter plot of 무게 (x) vs. 비율 (y)]
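
The outliers were picked by eye; a residual cut against a simple linear fit gives roughly the same picture (a sketch, and the 3-sigma threshold is an assumption):

slope, intercept = np.polyfit(train_df["무게"], train_df["비율"], 1)
resid = train_df["비율"] - (slope * train_df["무게"] + intercept)
suspects = train_df.loc[resid.abs() > 3 * resid.std(), "이미지"]
print(suspects.tolist())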

image_outliers = ['CASE05_21','CASE05_22','CASE05_23', 'CASE07_07', 'CASE07_08', 'CASE16_03', 'CASE23_01', 'CASE23_02', 
'CASE23_03', 'CASE23_04', 'CASE23_05', 'CASE23_06', 'CASE23_07', 'CASE23_08', 'CASE23_09', 'CASE45_16', 'CASE45_17',
'CASE72_06',  'CASE73_10', 'CASE59_01','CASE59_02','CASE59_03','CASE59_04','CASE59_05','CASE59_06',
'CASE59_07','CASE59_08','CASE59_09','CASE59_10','CASE59_11','CASE59_12','CASE59_13','CASE59_14','CASE59_15','CASE59_16','CASE59_17','CASE59_18',
'CASE59_19','CASE59_20','CASE59_21','CASE59_22','CASE59_23','CASE59_24','CASE59_25','CASE59_26','CASE59_27','CASE59_28','CASE59_29','CASE59_30',
'CASE59_31','CASE59_32', 'CASE59_33']

train_df_image = train_df[~train_df["이미지"].isin(image_outliers)]
train_imgs_removed = [x for x in processed_train_imgs if x.split(".")[1].split("\\")[1] not in image_outliers]  # strips "./open/...\NAME.png" down to NAME; Windows separators assumed

CNN

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

if torch.cuda.is_available():
    print("GPU")
else:
    print("CPU")
GPU

I had verified earlier that Torch was running on the GPU, so I'm not sure why it was showing up as CPU at one point…
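
If it happens again, these standard torch calls narrow down whether the installed build is CPU-only (a quick diagnostic sketch):

print(torch.__version__)            # a "+cpu" suffix means a CPU-only build
print(torch.version.cuda)           # None on CPU-only builds
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))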

CFG = {
    "IMG_SIZE" : 128,
    "EPOCHS" : 80,
    "LEARNIING_RATE" : 1e-3,
    "BATCH_SIZE" : 32,
    "SEED" : 42
}
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # True lets cuDNN pick non-deterministic kernels, defeating the seed

seed_everything(CFG["SEED"])
train_len = int(len(train_imgs_removed)*0.8)
weight = train_df_image['무게'].round(3).tolist()

train_img_path = train_imgs_removed[:train_len]
train_label = weight[:train_len]

val_img_path = train_imgs_removed[train_len:]
val_label = weight[train_len:]
len(train_img_path), len(train_label), len(val_img_path), len(val_label)
(1232, 1232, 308, 308)
class CustomDataset(Dataset):
    def __init__(self, img_path_list, label_list, train_mode=True, transforms=None):
        self.transforms = transforms
        self.train_mode = train_mode
        self.img_path_list = img_path_list
        self.label_list = label_list
    def __getitem__(self, index):
        img_path = self.img_path_list[index]
        image = cv2.imread(img_path)
        if self.transforms is not None:
            image = self.transforms(image)
        if self.train_mode:
            label = self.label_list[index]
            return image, label
        else: return image
    def __len__(self):
        return len(self.img_path_list)
train_transform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Resize((CFG['IMG_SIZE'], CFG['IMG_SIZE'])),
                    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
                    ])

test_transform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Resize((CFG['IMG_SIZE'], CFG['IMG_SIZE'])),
                    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
                    ])
train_dataset = CustomDataset(train_img_path, train_label, train_mode=True, transforms=train_transform)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_img_path, val_label, train_mode=True, transforms=test_transform)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
class CNNRegressor(torch.nn.Module):
    def __init__(self):
        super(CNNRegressor, self).__init__()
        self.layer1 = torch.nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer2 = torch.nn.Sequential(
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer3 = torch.nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer4 = torch.nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=4, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.regressor = nn.Linear(3136, 1)
    def forward(self, x):
        # (Batch, 3, 128, 128) -> (Batch, 64, 7, 7)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = torch.flatten(x, start_dim=1)
        out = self.regressor(x)
        return out

def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    # Loss Func.
    criterion = nn.L1Loss().to(device)
    best_mae = 9999
    for epoch in range(1, CFG["EPOCHS"]+1):
        model.train()
        train_loss = []
        for img, label in tqdm(iter(train_loader)):
            img, label = img.float().to(device), label.float().to(device)
            optimizer.zero_grad()
            logit = model(img)
            loss = criterion(logit.squeeze(1), label)
            # backpropagation
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        if scheduler is not None:
            scheduler.step()
        
        # Evaluation
        val_mae = validation(model, val_loader, criterion, device)
        print(f"Epoch [{epoch}] Train MAE: [{np.mean(train_loss):.5f}] Val MAE: [{val_mae:.5f}]\n")
        if best_mae > val_mae:
            best_mae = val_mae
            torch.save(model.state_dict(), "./best_model.pth")
            print("Model Saved.")

def validation(model, val_loader, criterion, device):
    model.eval() # Evaluation
    val_loss = []
    with torch.no_grad():
        for img, label in tqdm(iter(val_loader)):
            img, label = img.float().to(device), label.float().to(device)

            logit = model(img)
            loss = criterion(logit.squeeze(1), label)
            
            val_loss.append(loss.item())

    val_mae_loss = np.mean(val_loss)
    return val_mae_loss
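
A quick shape sanity check for the regressor (a sketch; the 3136 above is 64 × 7 × 7 after the four conv/pool stages):

dummy = torch.randn(2, 3, CFG["IMG_SIZE"], CFG["IMG_SIZE"])
print(CNNRegressor()(dummy).shape)  # torch.Size([2, 1])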

Train and Validation

CNNmodel = CNNRegressor().to(device)

optimizer = torch.optim.SGD(params=CNNmodel.parameters(), lr=CFG["LEARNING_RATE"])
scheduler = None

train(CNNmodel, optimizer, train_loader, val_loader, scheduler, device)
100%|██████████| 39/39 [00:20<00:00,  1.90it/s]
100%|██████████| 10/10 [00:04<00:00,  2.39it/s]


Epoch [1] Train MAE: [77.63346] Val MAE: [85.81707]

Model Saved.


100%|██████████| 39/39 [00:09<00:00,  4.21it/s]
100%|██████████| 10/10 [00:02<00:00,  4.28it/s]


Epoch [2] Train MAE: [77.04038] Val MAE: [81.62168]

Model Saved.


100%|██████████| 39/39 [00:09<00:00,  4.24it/s]
100%|██████████| 10/10 [00:02<00:00,  4.85it/s]

100%|██████████| 39/39 [00:09<00:00,  4.27it/s]
100%|██████████| 10/10 [00:02<00:00,  4.58it/s]

Epoch [80] Train MAE: [7.41040] Val MAE: [13.52668]

Predict

def predict(model, test_loader, device):
    model.eval()
    model_pred = []
    with torch.no_grad():
        for img in tqdm(iter(test_loader)):
            img = img.float().to(device)
            pred_logit = model(img)
            pred_logit = pred_logit.squeeze(1).detach().cpu()
            model_pred.extend(pred_logit.tolist())
    return model_pred
test_dataset = CustomDataset(processed_test_imgs, None, train_mode=False, transforms=test_transform)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

checkpoint = torch.load('./best_model.pth', map_location=device)  # map_location keeps this working in CPU-only sessions
CNNmodel = CNNRegressor().to(device)
CNNmodel.load_state_dict(checkpoint)

# Inference
preds = predict(CNNmodel, test_loader, device)
100%|██████████| 15/15 [00:05<00:00,  2.74it/s]
submission = pd.read_csv('./open/sample_submission.csv')
submission['leaf_weight'] = preds
submission.to_csv('./CNNsubmit.csv', index=False)

Metadata EDA

  • Visualize each environment variable
  • Judge outliers
  • Drop CASE01 and CASE02: their EC관측치 and external temperature differ from the other cases
  • Drop rows with negative 최근분무량
  • Treat cases where CO2관측치 is 0 as missing data -> drop
firstfeats = ['내부온도관측치', '외부온도관측치', '내부습도관측치', '외부습도관측치', 'CO2관측치', 'EC관측치','최근분무량']
secondfeats = ['냉방온도', '냉방부하','난방온도', '난방부하', '비율']
thirdfeats = ['화이트 LED동작강도', '레드 LED동작강도', '블루 LED동작강도', '총추정광량', '백색광추정광량', '적색광추정광량', '청색광추정광량']

fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(10, 10))

idx = 0
for row in range(3):
    for col in range(3):
        try:
            # Plot each feature against itself (a diagonal colored by value)
            # to eyeball its range; once firstfeats (7 items) runs out, the
            # two unused axes raise IndexError and are skipped.
            sns.scatterplot(x=train_df[firstfeats[idx]], y=train_df[firstfeats[idx]], ax=ax[row][col], hue=train_df[firstfeats[idx]])
            idx += 1
        except IndexError:
            pass

[Figure: 3×3 grid of per-feature value plots for firstfeats]

meta_outliers = ['CASE01_01','CASE01_02','CASE01_03','CASE01_04','CASE01_05','CASE01_06','CASE01_07',
'CASE01_08','CASE01_09','CASE02_01','CASE02_02','CASE02_03','CASE02_04','CASE02_05','CASE02_06','CASE02_07',
'CASE02_08','CASE02_09','CASE02_10','CASE02_11']

train_df_meta = train_df_image[~train_df_image['이미지'].isin(meta_outliers)]

train_df_meta = train_df_meta[train_df_meta['CO2관측치'] > 0]
train_df_meta = train_df_meta[train_df_meta['최근분무량'] >= 0]
  • Drop features that are highly correlated with each other (the LED 동작강도 columns)
  • 총추정광량 is the sum of 백색광추정광량, 적색광추정광량, and 청색광추정광량, so 총추정광량 is dropped as well
corr = train_df_meta.corr()
corr.style.background_gradient(cmap='coolwarm')
[train_df_meta.corr() table, 20 × 20: 레드 LED동작강도/적색광추정광량 and 블루 LED동작강도/청색광추정광량 correlate at 1.0, 화이트 LED동작강도/백색광추정광량 at 0.95, 총추정광량 correlates strongly with its three components, and 무게 vs. 비율 comes out at 0.992]
sns.scatterplot(train_df_meta, x=train_df_meta["총추정광량"], y=train_df_meta["백색광추정광량"]+train_df_meta["적색광추정광량"]+train_df_meta["청색광추정광량"]);

[Figure: 총추정광량 vs. 백색광+적색광+청색광추정광량; the points confirm the sum relationship]
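
Reading pairs off the table works, but listing them programmatically is less error-prone (a sketch; the 0.95 cutoff is an assumption):

corr_abs = train_df_meta.corr().abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape, dtype=bool), k=1))  # keep each pair once
pairs = upper.stack().sort_values(ascending=False)
print(pairs[pairs > 0.95])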

CatBoost

features = ['내부온도관측치', '외부온도관측치', '내부습도관측치', '외부습도관측치', 'CO2관측치', 'EC관측치',
         '최근분무량', '냉방온도', '냉방부하',
         '난방온도', '난방부하', '백색광추정광량', '적색광추정광량', '청색광추정광량', '비율']

train_col = train_df_meta[features]

test_col = test_df[features]

train_target = train_df_meta["무게"]

train_x, val_x, train_y, val_y = train_test_split(train_col, train_target, test_size=0.2, random_state=32)
CATmodel = CatBoostRegressor(verbose=50, n_estimators=10000, eval_metric="MAE", early_stopping_rounds=50)
CATmodel.fit(train_x, train_y, eval_set=[(val_x, val_y)], use_best_model=True)

val_pred = CATmodel.predict(val_x)
plt.figure(figsize=(20,10))
plt.plot(np.array(val_pred),label = "pred")
plt.plot(np.array(val_y),label = "true")
plt.legend()
plt.show()

train_score = CATmodel.score(train_x, train_y)
val_score = CATmodel.score(val_x, val_y)
Learning rate set to 0.012542
0:	learn: 81.6377352	test: 85.8903420	best: 85.8903420 (0)	total: 147ms	remaining: 24m 30s
50:	learn: 47.6997804	test: 51.2987982	best: 51.2987982 (50)	total: 233ms	remaining: 45.5s
...
6300:	learn: 0.4522590	test: 3.1227911	best: 3.1225241 (6295)	total: 12.2s	remaining: 7.17s
6350:	learn: 0.4476624	test: 3.1219959	best: 3.1219880 (6349)	total: 12.3s	remaining: 7.07s
6400:	learn: 0.4438297	test: 3.1213585	best: 3.1212632 (6383)	total: 12.4s	remaining: 6.96s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 3.121263223
bestIteration = 6383

Shrink model to first 6384 iterations.

[Figure: CatBoost validation predictions vs. true values]
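
Once fit, CatBoost's built-in importances are worth a look (a sketch using the CATmodel and features defined above):

imp = pd.Series(CATmodel.get_feature_importance(), index=features).sort_values(ascending=False)
print(imp.head(10))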

CATresult = CATmodel.predict(test_col)

submission = pd.read_csv('./open/sample_submission.csv')
submission['leaf_weight'] = CATresult
submission.to_csv('./CATsubmit.csv', index=False)

ANN

  • Scale the metadata with sklearn
16
def scale_datasets(x_train, x_test):
    # Z - Score
    standard_scaler = StandardScaler()
    x_train_scaled = pd.DataFrame(
        standard_scaler.fit_transform(x_train),
        columns=x_train.columns
    )
    x_test_scaled = pd.DataFrame(
        standard_scaler.transform(x_test),
        columns = x_test.columns
    )
    return x_train_scaled, x_test_scaled

train_scaled, test_scaled = scale_datasets(train_col, test_col)

train_x_scale, val_x_scale, train_y_scale, val_y_scale = train_test_split(train_scaled, train_target, test_size=0.2, random_state=32)
tf.random.set_seed(42)

def build_model_using_sequential():
    model = Sequential([
      Dense(100, kernel_initializer='normal', activation='relu'),
      Dense(50, kernel_initializer='normal', activation='relu'),
      Dense(25, kernel_initializer='normal', activation='relu'),
      Dense(1, kernel_initializer='normal', activation='linear')
    ])
    return model

ANNmodel = build_model_using_sequential()

# Loss Func.
mae = MeanAbsoluteError()
ANNmodel.compile(
    loss=mae, 
    optimizer=Adam(learning_rate=0.001), 
    metrics=[mae]
)

early_stopping_monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=50,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True
)

# train the model
history = ANNmodel.fit(
    train_x_scale, 
    train_y_scale, 
    epochs=1000, 
    batch_size=32,
    validation_data=(val_x_scale, val_y_scale),
    callbacks=[early_stopping_monitor],
    verbose= 2
)
Epoch 1/1000
32/32 - 1s - loss: 79.6819 - mean_absolute_error: 79.6508 - val_loss: 80.7002 - val_mean_absolute_error: 80.7002 - 1s/epoch - 34ms/step
Epoch 2/1000
32/32 - 0s - loss: 78.0658 - mean_absolute_error: 78.0423 - val_loss: 77.3219 - val_mean_absolute_error: 77.3219 - 129ms/epoch - 4ms/step
...
Epoch 301/1000
Restoring model weights from the end of the best epoch: 251.
32/32 - 0s - loss: 2.7592 - mean_absolute_error: 2.7637 - val_loss: 3.2672 - val_mean_absolute_error: 3.2672 - 171ms/epoch - 5ms/step
Epoch 301: early stopping
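
A sketch for inspecting the curves EarlyStopping acted on, using the history object returned by fit above:

plt.plot(history.history["loss"], label="train MAE")
plt.plot(history.history["val_loss"], label="val MAE")
plt.xlabel("epoch")
plt.legend()
plt.show()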
val_pred = ANNmodel.predict(val_x_scale)
plt.figure(figsize=(20,10))
plt.plot(np.array(val_pred),label = "pred")
plt.plot(np.array(val_y_scale),label = "true")
plt.legend()
plt.show()
8/8 [==============================] - 0s 2ms/step

[Figure: ANN validation predictions vs. true values]

ANNresult = ANNmodel.predict(test_scaled)

submission = pd.read_csv('./open/sample_submission.csv')
submission['leaf_weight'] = ANNresult
submission.to_csv('./ANNsubmit.csv', index=False)
15/15 [==============================] - 0s 2ms/step

Ensemble

CNN = pd.read_csv('./CNNsubmit.csv')
CAT = pd.read_csv('./CATsubmit.csv')
ANN = pd.read_csv('./ANNsubmit.csv')

submission_final = pd.read_csv('./open/sample_submission.csv')
submission_final['leaf_weight'] = (CNN['leaf_weight'] * 0.65 + CAT['leaf_weight'] * 0.25 + ANN['leaf_weight'] * 0.1)
submission_final.to_csv('ENSEMBLEsubmit.csv', index=False)

Wrap-up

  • I had wondered how to preprocess and combine image data with metadata; transcribing this notebook cleared up part of that question.