피마 인디언 데이터셋 with PyCaret

  
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import koreanize_matplotlib

Data Load

피마 인디언 당뇨병 데이터셋

  
# Read the diabetes CSV from the shortened URL into a DataFrame
# (network access is required when this cell runs).
df_pima = pd.read_csv("http://bit.ly/data-diabetes-csv")
# Display (rows, columns) as the cell result; the recorded run printed (768, 9).
df_pima.shape

(768, 9)

PyCaret

당뇨병 여부 분류 문제에 적용 시

  
from pycaret.classification import *

setup

Train data, Test data, Label, Target 등을 설정하는 부분이며, 데이터에 전처리 기법들을 적용할 수 있음

  
# Initialise the PyCaret classification experiment on df_pima. The options
# below also configure the preprocessing applied before modelling.
# NOTE(review): the recorded setup summary reports 1 categorical feature and a
# transformed train set with 24 columns from a 9-column input frame, so one
# input column appears to have been inferred as categorical and expanded --
# confirm this is intended, or declare the column via `numeric_features`.
pycaret_models = setup(
    session_id=42, # random seed
    data=df_pima, # input data
    target="Outcome", # target column
    normalize=True, # whether to normalise the features
    normalize_method="minmax", # normalisation method
    transformation=True, # make feature distributions closer to a normal distribution
    fold_strategy="stratifiedkfold", # recorded summary: StratifiedKFold with 10 folds
    use_gpu=True # recorded summary: Use GPU = True
)

	Description	Value
0	session_id	42
1	Target	Outcome
2	Target Type	Binary
3	Label Encoded	None
4	Original Data	(768, 9)
5	Missing Values	False
6	Numeric Features	7
7	Categorical Features	1
8	Ordinal Features	False
9	High Cardinality Features	False
10	High Cardinality Method	None
11	Transformed Train Set	(537, 24)
12	Transformed Test Set	(231, 24)
13	Shuffle Train-Test	True
14	Stratify Train-Test	False
15	Fold Generator	StratifiedKFold
16	Fold Number	10
17	CPU Jobs	-1
18	Use GPU	True
19	Log Experiment	False
20	Experiment Name	clf-default-name
21	USI	d7e1
22	Imputation Type	simple
23	Iterative Imputation Iteration	None
24	Numeric Imputer	mean
25	Iterative Imputation Numeric Model	None
26	Categorical Imputer	constant
27	Iterative Imputation Categorical Model	None
28	Unknown Categoricals Handling	least_frequent
29	Normalize	True
30	Normalize Method	minmax
31	Transformation	True
32	Transformation Method	yeo-johnson
33	PCA	False
34	PCA Method	None
35	PCA Components	None
36	Ignore Low Variance	False
37	Combine Rare Levels	False
38	Rare Level Threshold	None
39	Numeric Binning	False
40	Remove Outliers	False
41	Outliers Threshold	None
42	Remove Multicollinearity	False
43	Multicollinearity Threshold	None
44	Remove Perfect Collinearity	True
45	Clustering	False
46	Clustering Iteration	None
47	Polynomial Features	False
48	Polynomial Degree	None
49	Trignometry Features	False
50	Polynomial Threshold	None
51	Group Features	False
52	Feature Selection	False
53	Feature Selection Method	classic
54	Features Selection Threshold	None
55	Feature Interaction	False
56	Feature Ratio	False
57	Interaction Threshold	None
58	Fix Imbalance	False
59	Fix Imbalance Method	SMOTE

models

  
# Fetch the table of estimators available in this experiment
# (indexed by model ID, as the displayed output shows).
models_list = models()

# Display the table as the cell result.
models_list

	Name	Reference	Turbo
ID
lr	Logistic Regression	sklearn.linear_model._logistic.LogisticRegression	True
knn	K Neighbors Classifier	sklearn.neighbors._classification.KNeighborsCl...	True
nb	Naive Bayes	sklearn.naive_bayes.GaussianNB	True
dt	Decision Tree Classifier	sklearn.tree._classes.DecisionTreeClassifier	True
svm	SVM - Linear Kernel	sklearn.linear_model._stochastic_gradient.SGDC...	True
rbfsvm	SVM - Radial Kernel	sklearn.svm._classes.SVC	False
gpc	Gaussian Process Classifier	sklearn.gaussian_process._gpc.GaussianProcessC...	False
mlp	MLP Classifier	sklearn.neural_network._multilayer_perceptron....	False
ridge	Ridge Classifier	sklearn.linear_model._ridge.RidgeClassifier	True
rf	Random Forest Classifier	sklearn.ensemble._forest.RandomForestClassifier	True
qda	Quadratic Discriminant Analysis	sklearn.discriminant_analysis.QuadraticDiscrim...	True
ada	Ada Boost Classifier	sklearn.ensemble._weight_boosting.AdaBoostClas...	True
gbc	Gradient Boosting Classifier	sklearn.ensemble._gb.GradientBoostingClassifier	True
lda	Linear Discriminant Analysis	sklearn.discriminant_analysis.LinearDiscrimina...	True
et	Extra Trees Classifier	sklearn.ensemble._forest.ExtraTreesClassifier	True
lightgbm	Light Gradient Boosting Machine	lightgbm.sklearn.LGBMClassifier	True
dummy	Dummy Classifier	sklearn.dummy.DummyClassifier	True

pycaret에서 사용 가능한 모델 목록을 확인할 수 있음

compare_models

  
# Cross-validate and rank every estimator listed in models_list. Passing the
# full ID list also brings in the entries whose Turbo flag is False.
pc_clf_models = compare_models(
    include=list(models_list.index),  # every model ID in the table
    n_select=25,  # number of models to return
)

	Model	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC	TT (Sec)
gpc	Gaussian Process Classifier	0.7710	0.8098	0.5485	0.7305	0.6223	0.4645	0.4764	0.1060
et	Extra Trees Classifier	0.7691	0.8185	0.5643	0.7206	0.6279	0.4654	0.4755	0.4720
lr	Logistic Regression	0.7653	0.8368	0.5801	0.7055	0.6320	0.4632	0.4704	0.0260
rf	Random Forest Classifier	0.7615	0.8406	0.5693	0.6962	0.6202	0.4511	0.4591	0.4840
ada	Ada Boost Classifier	0.7597	0.8199	0.6061	0.6741	0.6360	0.4580	0.4609	0.0820
lightgbm	Light Gradient Boosting Machine	0.7597	0.8174	0.6333	0.6677	0.6437	0.4643	0.4691	0.9150
lda	Linear Discriminant Analysis	0.7596	0.8319	0.5798	0.6866	0.6223	0.4501	0.4565	0.0140
rbfsvm	SVM - Radial Kernel	0.7577	0.8418	0.5263	0.7080	0.6004	0.4331	0.4445	0.0300
ridge	Ridge Classifier	0.7559	0.0000	0.5693	0.6790	0.6151	0.4405	0.4457	0.0090
gbc	Gradient Boosting Classifier	0.7541	0.8396	0.6225	0.6566	0.6332	0.4502	0.4544	0.1010
knn	K Neighbors Classifier	0.7466	0.7800	0.5424	0.6789	0.6000	0.4183	0.4262	0.3870
mlp	MLP Classifier	0.7411	0.8044	0.5860	0.6483	0.6105	0.4186	0.4230	1.6220
svm	SVM - Linear Kernel	0.7299	0.0000	0.6284	0.6121	0.6119	0.4073	0.4127	0.0090
dt	Decision Tree Classifier	0.7187	0.6909	0.5965	0.6013	0.5919	0.3799	0.3848	0.0100
nb	Naive Bayes	0.6610	0.7666	0.1123	0.4499	0.1719	0.0824	0.1127	0.0090
dummy	Dummy Classifier	0.6499	0.5000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0060
qda	Quadratic Discriminant Analysis	0.5529	0.5573	0.5865	0.4949	0.4345	0.1167	0.1724	0.0090

create_model

여러 모델이 아닌 하나의 모델에 대해서 setup 설정으로 학습 및 결과 확인

  
# Train and cross-validate a single LightGBM model under the current setup()
# configuration; the recorded run shows per-fold metrics for 10 folds.
clf_lgbm = create_model("lightgbm")

	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC
Fold
0	0.8148	0.8932	0.7895	0.7143	0.7500	0.6035	0.6054
1	0.7593	0.8045	0.4737	0.7500	0.5806	0.4236	0.4456
2	0.7222	0.8466	0.6316	0.6000	0.6154	0.3982	0.3985
3	0.6852	0.7278	0.6316	0.5455	0.5854	0.3338	0.3361
4	0.7778	0.8451	0.7368	0.6667	0.7000	0.5242	0.5259
5	0.8519	0.9023	0.7895	0.7895	0.7895	0.6752	0.6752
6	0.7222	0.7158	0.4737	0.6429	0.5455	0.3520	0.3605
7	0.7358	0.8079	0.5556	0.6250	0.5882	0.3948	0.3963
8	0.8113	0.8492	0.7778	0.7000	0.7368	0.5904	0.5924
9	0.7170	0.7786	0.4737	0.6429	0.5455	0.3468	0.3553
Mean	0.7597	0.8171	0.6333	0.6677	0.6437	0.4643	0.4691
Std	0.0503	0.0597	0.1276	0.0688	0.0866	0.1173	0.1152

tune_model

하이퍼파라미터 튜닝을 도와주는 메서드

  
# Hyperparameter search starting from clf_lgbm (n_iter=10), optimising for
# Accuracy. In the recorded runs the mean CV accuracy moved from 0.7597
# (create_model) to 0.7652 (tuned).
tuned_clf_lgbm = tune_model(clf_lgbm, n_iter=10, optimize="Accuracy")

	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC
Fold
0	0.8333	0.9308	0.8421	0.7273	0.7805	0.6473	0.6518
1	0.8148	0.8782	0.6316	0.8000	0.7059	0.5735	0.5820
2	0.8148	0.8496	0.7368	0.7368	0.7368	0.5940	0.5940
3	0.6852	0.7353	0.4211	0.5714	0.4848	0.2656	0.2720
4	0.7222	0.8226	0.6316	0.6000	0.6154	0.3982	0.3985
5	0.8333	0.8977	0.6842	0.8125	0.7429	0.6209	0.6259
6	0.7593	0.7805	0.5263	0.7143	0.6061	0.4384	0.4490
7	0.7170	0.8500	0.5556	0.5882	0.5714	0.3604	0.3607
8	0.7547	0.8492	0.5556	0.6667	0.6061	0.4301	0.4339
9	0.7170	0.7724	0.5263	0.6250	0.5714	0.3625	0.3655
Mean	0.7652	0.8366	0.6111	0.6842	0.6421	0.4691	0.4733
Std	0.0522	0.0571	0.1149	0.0826	0.0897	0.1238	0.1241

save_model

학습한 모델을 저장

  
# Persist the tuned model. The recorded output shows the transformation
# pipeline is saved together with it, to ./tuned_clf_lgbm.pkl.
save_model(tuned_clf_lgbm, "./tuned_clf_lgbm")

Transformation Pipeline and Model Successfully Saved





(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='Outcome',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_stra...
                                 colsample_bytree=1.0, device='gpu',
                                 feature_fraction=1.0, importance_type='split',
                                 learning_rate=0.1, max_depth=-1,
                                 min_child_samples=71, min_child_weight=0.001,
                                 min_split_gain=0.6, n_estimators=130, n_jobs=-1,
                                 num_leaves=4, objective=None, random_state=42,
                                 reg_alpha=0.3, reg_lambda=4, silent='warn',
                                 subsample=1.0, subsample_for_bin=200000,
                                 subsample_freq=0)]],
          verbose=False),
 './tuned_clf_lgbm.pkl')

load_model

  
# Reload the saved pipeline (same path as save_model, without ".pkl").
# NOTE(review): this rebinds clf_lgbm, which previously held the untuned
# create_model() result, to the loaded tuned pipeline.
clf_lgbm = load_model("./tuned_clf_lgbm")

Transformation Pipeline and Model Successfully Loaded

  
# Index the loaded pipeline by the "trained_model" key; the recorded result is
# the tuned LGBMClassifier with its hyperparameters.
clf_lgbm["trained_model"]

LGBMClassifier(bagging_fraction=0.6, bagging_freq=5, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, device='gpu',
               feature_fraction=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=71, min_child_weight=0.001,
               min_split_gain=0.6, n_estimators=130, n_jobs=-1, num_leaves=4,
               objective=None, random_state=42, reg_alpha=0.3, reg_lambda=4,
               silent='warn', subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

위와 같이 튜닝된 하이퍼파라미터 목록을 확인할 수 있음

PyCaret 맛보기