Home Olist EDA 및 RFM 분석
Post
Cancel

Olist EDA 및 RFM 분석

Kaggle - Brazilian E-Commerce Public Dataset by Olist

참고 노트북 | Customer Segmentation & LTV

사용 라이브러리

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib
import warnings
warnings.filterwarnings("ignore")

시각화 함수 정의

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def format_spines(ax, right_border=True):
    """Apply the notebook's standard axis styling.

    Colors the bottom/left spines gray, hides the top spine, and shows or
    hides the right spine depending on *right_border*.

    Parameters
    ----------
    ax : matplotlib Axes to style (modified in place).
    right_border : bool, keep the right spine (painted white) when True,
        hide it entirely when False.
    """
    ax.spines['bottom'].set_color('#666666')
    ax.spines['left'].set_color('#666666')
    ax.spines['top'].set_visible(False)
    if right_border:
        # White so it blends with the white axes background.
        ax.spines['right'].set_color('#FFFFFF')
    else:
        # BUG FIX: both branches previously set the same white color, so the
        # right_border parameter had no effect; hide the spine when False.
        ax.spines['right'].set_visible(False)
    ax.patch.set_facecolor('#FFFFFF')
    

def count_plot(feature, df, colors='Blues_d', hue=False, ax=None, title=''):
    """Draw a countplot of *feature* with each bar annotated by its share
    of all rows in *df*.

    Parameters
    ----------
    feature : column name to count.
    df : source DataFrame.
    colors : seaborn palette name.
    hue : optional column name for a grouped count (False = no hue).
    ax : optional Axes to draw on.
    title : explicit title; overrides the auto-generated one when non-empty.
    """
    ncount = len(df)
    # Normalize the sentinel: any falsy hue means "no hue" (the original
    # mixed `hue != False` and `if not hue`, which was inconsistent).
    hue_column = hue if hue else None
    ax = sns.countplot(x=feature, data=df, palette=colors, hue=hue_column, ax=ax)
    format_spines(ax)

    # Annotate each bar with its percentage of the total row count.
    for patch in ax.patches:
        x = patch.get_bbox().get_points()[:, 0]
        y = patch.get_bbox().get_points()[1, 1]
        ax.annotate('{:.1f}%'.format(100. * y / ncount), (x.mean(), y),
                ha='center', va='bottom')

    # Set the title exactly once instead of setting and then overriding it.
    if title != '':
        ax.set_title(title)
    elif hue_column:
        ax.set_title(df[feature].describe().name + ' Analysis by ' + hue, size=13, pad=15)
    else:
        ax.set_title(df[feature].describe().name + ' Analysis', size=13, pad=15)
    plt.tight_layout()
    
    
def bar_plot(x, y, df, colors='Blues_d', hue=False, ax=None, value=False, title=''):
    """Draw a barplot of *y* against *x*, annotating each bar either with
    its absolute value in thousands (value=True) or with its percentage of
    the column total (value=False).

    Parameters
    ----------
    x, y : column names for the bar positions and heights.
    df : source DataFrame.
    colors : seaborn palette name.
    hue : optional grouping column (False = no hue).
    ax : optional Axes to draw on.
    value : annotate absolute values ('{:.2f}k') instead of percentages.
    title : explicit title; overrides the auto-generated one when non-empty.
    """
    # Percentage denominator: prefer the y column; fall back to x when y is
    # missing or non-numeric. BUG FIX: was a bare `except:` which would also
    # swallow KeyboardInterrupt/SystemExit and real programming errors.
    try:
        ncount = sum(df[y])
    except (KeyError, TypeError):
        ncount = sum(df[x])
    hue_column = hue if hue else None
    # ci=None disables bootstrap error bars (deprecated spelling of
    # errorbar=None in newer seaborn, kept for compatibility).
    ax = sns.barplot(x=x, y=y, data=df, palette=colors, hue=hue_column, ax=ax, ci=None)
    format_spines(ax)
    for patch in ax.patches:
        xp = patch.get_bbox().get_points()[:, 0]
        yp = patch.get_bbox().get_points()[1, 1]
        if value:
            ax.annotate('{:.2f}k'.format(yp/1000), (xp.mean(), yp), 
                    ha='center', va='bottom') 
        else:
            ax.annotate('{:.1f}%'.format(100.*yp/ncount), (xp.mean(), yp), 
                    ha='center', va='bottom') 
    # Set the title exactly once.
    if title != '':
        ax.set_title(title)
    elif hue_column:
        ax.set_title(df[x].describe().name + ' Analysis by ' + hue, size=12, pad=15)
    else:
        ax.set_title(df[x].describe().name + ' Analysis', size=12, pad=15)
    plt.tight_layout()

Data Load

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Load the four Olist tables used throughout the analysis.
customers_ = pd.read_csv("./data/olist_customers_dataset.csv")
order_items_ = pd.read_csv("./data/olist_order_items_dataset.csv")
order_payments_ = pd.read_csv("./data/olist_order_payments_dataset.csv")
orders_ = pd.read_csv("./data/olist_orders_dataset.csv")

# Display-name -> DataFrame mapping used for the inspection loops below.
dataset = {
    'Customers': customers_,
    'Order Items': order_items_,
    'Payments': order_payments_,
    'Orders': orders_,
}

# Report each table's [rows, columns] shape.
for name, frame in dataset.items():
    print(f'{name}', (list(frame.shape)))
1
2
3
4
Customers [99441, 5]
Order Items [112650, 7]
Payments [103886, 5]
Orders [99441, 8]

EDA

Columns Names

1
2
# List the column names of every table.
for name, frame in dataset.items():
    print(f'{name}', f'{list(frame.columns)}\n')
1
2
3
4
5
6
7
Customers ['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']

Order Items ['order_id', 'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date', 'price', 'freight_value']

Payments ['order_id', 'payment_sequential', 'payment_type', 'payment_installments', 'payment_value']

Orders ['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']

결측치 확인

1
2
3
# Flag which tables contain at least one null value.
for name, frame in dataset.items():
    print(f"{name}: {frame.isnull().any().any()}")
1
2
3
4
Customers: False
Order Items: False
Payments: False
Orders: True
1
2
3
4
5
# For tables with nulls, print the per-column missing-value counts.
for name, frame in dataset.items():
    if frame.isnull().any().any():
        print(f'{name}', (list(frame.shape)),'\n')
        print(f'{frame.isnull().sum()}\n')
1
2
3
4
5
6
7
8
9
10
11
Orders [99441, 8] 

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

통합 데이터 프레임 생성

1
2
3
4
# Build the unified frame: payments + items on order_id, then orders,
# then customers (inner joins throughout, the pd.merge default).
df = (
    order_payments_
    .merge(order_items_, on="order_id")
    .merge(orders_, on="order_id")
    .merge(customers_, on="customer_id")
)
print(df.shape)
1
(117601, 22)
1
df.head()
order_idpayment_sequentialpayment_typepayment_installmentspayment_valueorder_item_idproduct_idseller_idshipping_limit_dateprice...order_statusorder_purchase_timestamporder_approved_atorder_delivered_carrier_dateorder_delivered_customer_dateorder_estimated_delivery_datecustomer_unique_idcustomer_zip_code_prefixcustomer_citycustomer_state
0b81ef226f3fe1789b1e8b2acac839d171credit_card899.331af74cc53dcffc8384b29e7abfa41902b213b25e6f54661939f11710a6fddb8712018-05-02 22:15:0979.80...delivered2018-04-25 22:01:492018-04-25 22:15:092018-05-02 15:20:002018-05-09 17:36:512018-05-22 00:00:00708ab75d2a007f0564aedd11139c770839801teofilo otoniMG
1a9810da82917af2d9aefd1278f1dcfa01credit_card124.391a630cc320a8c872f9de830cf121661a3eaf6d55068dea77334e8477d3878d89e2018-07-02 11:18:5817.00...delivered2018-06-26 11:01:382018-06-26 11:18:582018-06-28 14:18:002018-06-29 20:32:092018-07-16 00:00:00a8b9d3a27068454b1c98cc67d4e31e6f2422sao pauloSP
225e8ea4e93396b6fa0d3dd708e76c1bd1credit_card165.7112028bf1b01cafb2d2b1901fca4083222cc419e0650a3c5ba77189a1882b7556a2017-12-26 09:52:3456.99...delivered2017-12-12 11:19:552017-12-14 09:52:342017-12-15 20:13:222017-12-18 17:24:412018-01-04 00:00:006f70c0b2f7552832ba46eb57b1c5651e2652sao pauloSP
3ba78997921bbcdc1373bb41e913ab9531credit_card8107.781548e5bfe28edceab6b51fa707cc9556fda8622b14eb17ae2831f4ac5b9dab84a2017-12-12 12:13:2089.90...delivered2017-12-06 12:04:062017-12-06 12:13:202017-12-07 20:28:282017-12-21 01:35:512018-01-04 00:00:0087695ed086ebd36f20404c82d20fca8736060juiz de foraMG
442fdf880ba16b47b59251dd489d4441a1credit_card2128.451386486367c1f9d4f587a8864ccb6902bcca3071e3e9bb7d12640c9fbe23013062018-05-31 16:14:41113.57...delivered2018-05-21 13:59:172018-05-21 16:14:412018-05-22 11:46:002018-06-01 21:44:532018-06-13 00:00:004291db0da71914754618cd789aebcd5618570conchasSP

5 rows × 22 columns

Converting DateTime

1
2
3
# Parse every timestamp-like column into datetime64.
date_columns = [
    "shipping_limit_date",
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]
for column in date_columns:
    df[column] = pd.to_datetime(df[column], format="%Y-%m-%d %H:%M:%S")

Cleaning up name columns

1
2
# Title-case city names; prettify payment types ("credit_card" -> "Credit Card").
df["customer_city"] = df["customer_city"].str.title()
df["payment_type"] = df["payment_type"].str.replace("_", " ").str.title()

파생 변수 생성

1
2
3
4
5
6
7
8
9
10
11
12
# Days the delivery beat the estimate (positive = earlier than promised).
df['delivery_against_estimated'] = (df['order_estimated_delivery_date'] - df['order_delivered_customer_date']).dt.days

# Calendar components of the purchase timestamp. Vectorized .dt accessors
# replace the original per-row apply(lambda ...) calls — identical values,
# one C-level pass instead of a Python call per row.
df['order_purchase_year'] = df["order_purchase_timestamp"].dt.year
df['order_purchase_month'] = df["order_purchase_timestamp"].dt.month
df['order_purchase_dayofweek'] = df["order_purchase_timestamp"].dt.dayofweek
df['order_purchase_hour'] = df["order_purchase_timestamp"].dt.hour
df['order_purchase_day'] = df['order_purchase_dayofweek'].map({0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'})
# Reuse the already-computed month column instead of re-applying the lambda.
df['order_purchase_mon'] = df['order_purchase_month'].map({1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'})

# "YYYY-MM" label; zfill pads single-digit months (same result as the
# original manual '0' + x prepend).
df['month_year'] = df['order_purchase_year'].astype(str) + '-' + df['order_purchase_month'].astype(str).str.zfill(2)

# Numeric YYYYMM key (e.g. 201805) used for ordering and cohort joins.
df['month_y'] = df['order_purchase_year'] * 100 + df['order_purchase_month']

Summary

1
# Summary statistics for all columns (numeric, categorical and datetime).
df.describe(include="all")
order_idpayment_sequentialpayment_typepayment_installmentspayment_valueorder_item_idproduct_idseller_idshipping_limit_dateprice...customer_statedelivery_against_estimatedorder_purchase_yearorder_purchase_monthorder_purchase_dayofweekorder_purchase_hourorder_purchase_dayorder_purchase_monmonth_yearmonth_y
count117601117601.000000117601117601.000000117601.000000117601.000000117601117601117601117601.000000...117601115034.000000117601.000000117601.000000117601.000000117601.000000117601117601117601117601.000000
unique98665NaN4NaNNaNNaN32951309593317NaN...27NaNNaNNaNNaNNaN71224NaN
top895ab968e7bb0d5659d16cd74cd1650cNaNCredit CardNaNNaNNaNaca2eb7d00ea1a7b8ebd4e68314663af4a3ca9315b744ce9f8e93743614938842017-08-14 20:43:31NaN...SPNaNNaNNaNNaNNaNMonAug2017-11NaN
freq63NaN86769NaNNaNNaN536213363NaN...49566NaNNaNNaNNaNNaN19130126329016NaN
firstNaNNaNNaNNaNNaNNaNNaNNaN2016-09-19 00:15:34NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
lastNaNNaNNaNNaNNaNNaNNaNNaN2020-04-09 22:35:08NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
meanNaN1.093528NaN2.939482172.6867521.195900NaNNaNNaN120.824783...NaN11.0433262017.5381936.0281292.74575014.760002NaNNaNNaN201759.847399
stdNaN0.726692NaN2.774223267.5922900.697706NaNNaNNaN184.479323...NaN10.1623070.5050653.2295791.9612575.325670NaNNaNNaN48.798820
minNaN1.000000NaN0.0000000.0000001.000000NaNNaNNaN0.850000...NaN-189.0000002016.0000001.0000000.0000000.000000NaNNaNNaN201609.000000
25%NaN1.000000NaN1.00000060.8700001.000000NaNNaNNaN39.900000...NaN6.0000002017.0000003.0000001.00000011.000000NaNNaNNaN201709.000000
50%NaN1.000000NaN2.000000108.2100001.000000NaNNaNNaN74.900000...NaN12.0000002018.0000006.0000003.00000015.000000NaNNaNNaN201801.000000
75%NaN1.000000NaN4.000000189.2600001.000000NaNNaNNaN134.900000...NaN16.0000002018.0000008.0000004.00000019.000000NaNNaNNaN201805.000000
maxNaN29.000000NaN24.00000013664.08000021.000000NaNNaNNaN6735.000000...NaN146.0000002018.00000012.0000006.00000023.000000NaNNaNNaN201809.000000

13 rows × 31 columns

결측치 개수와 비율 확인

1
2
3
# Per-column missing-value count and percentage, widest gaps first.
missing_values = df.isnull().sum().sort_values(ascending=False)
# mean() of the boolean mask == sum()/count(), i.e. the missing fraction.
percentage = (df.isnull().mean() * 100).sort_values(ascending=False)
pd.concat([missing_values, percentage], axis=1, keys=['Values', 'Percentage']).transpose()
order_delivered_customer_datedelivery_against_estimatedorder_delivered_carrier_dateorder_approved_atmonth_yearorder_purchase_monorder_purchase_dayorder_purchase_hourorder_purchase_dayofweekorder_purchase_month...freight_valuepriceshipping_limit_dateseller_idproduct_idorder_item_idpayment_valuepayment_installmentspayment_typemonth_y
Values2567.0000002567.0000001245.00000015.0000000.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
Percentage2.1828052.1828051.0586640.0127550.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0

2 rows × 31 columns

1
2
3
# Drop every row with at least one missing value, then confirm none remain.
df = df.dropna()
df.isnull().values.any()
1
False

Monthly Revenue

1
2
# Total payment value per calendar month.
df_revenue = df.groupby("month_year", as_index=False)["payment_value"].sum()
df_revenue
month_yearpayment_value
02016-1062591.65
12016-1219.62
22017-01176376.56
32017-02323815.95
42017-03505735.83
52017-04456108.32
62017-05701119.60
72017-06585400.98
82017-07716069.98
92017-08842689.94
102017-09996085.61
112017-10998609.62
122017-111548547.86
132017-121020067.26
142018-011374064.02
152018-021280014.54
162018-031435458.33
172018-041466607.15
182018-051480667.59
192018-061285396.78
202018-071306707.42
212018-081211240.09

Monthly Revenue Growth Rate

1
2
3
# Month-over-month growth rate of total revenue.
df_revenue["MonthlyGrowth"] = df_revenue["payment_value"].pct_change()
plt.figure(figsize=(20, 4))
# Title is Korean for "monthly revenue growth rate".
_ = sns.pointplot(data=df_revenue, x="month_year", y="MonthlyGrowth", ci=None).set_title("월 매출 증감률")

png

Monthly Active Customers

1
2
3
4
5
6
7
# Monthly active customers: unique buyers per month.
df_monthly_active = df.groupby("month_year")["customer_unique_id"].nunique().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette="muted", color_codes=True, style="whitegrid")
# bar_plot draws on the current axes (its ax parameter defaults to None);
# the explicit `ax` here is only used for tick rotation below.
bar_plot(x="month_year", y="customer_unique_id", df=df_monthly_active, value=True)
ax.tick_params(axis="x", labelrotation=90)
plt.show()

png

Monthly Order Count

1
2
3
4
5
6
7
# Monthly order count. NOTE(review): df has one row per payment/item record
# after the merges, so this counts rows, not distinct orders — confirm.
df_monthly_sales = df.groupby("month_year")["order_status"].count().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette="muted", color_codes=True, style="whitegrid")
bar_plot(x="month_year", y="order_status", df=df_monthly_sales, value=True)
ax.tick_params(axis="x", labelrotation=90)
plt.show()

png

ARPU (Average Revenue per Customer Purchase)

1
2
3
4
5
6
7
# ARPU: mean payment value per month.
df_monthly_order_avg = df.groupby('month_year')['payment_value'].mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_year', y='payment_value', df=df_monthly_order_avg, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

png

신규 가입자 비율

1
2
3
4
5
6
7
8
9
10
11
12
# First purchase month (as a YYYYMM integer) for every customer.
df_min_purchase = (
    df.groupby('customer_unique_id')["order_purchase_timestamp"]
    .min()
    .reset_index()
)
df_min_purchase.columns = ['customer_unique_id', 'minpurchasedate']
df_min_purchase['minpurchasedate'] = (
    df_min_purchase['minpurchasedate'].dt.year * 100
    + df_min_purchase['minpurchasedate'].dt.month
)

df = pd.merge(df, df_min_purchase, on='customer_unique_id')

# A row is 'New' during the customer's first month, 'Existing' afterwards.
df['usertype'] = 'New'
df.loc[df['month_y'] > df['minpurchasedate'], 'usertype'] = 'Existing'

# Revenue broken down by month and user type.
df_user_type_revenue = df.groupby(['month_y', 'usertype', 'month_year'])['payment_value'].sum().reset_index()

df_user_type_revenue
month_yusertypemonth_yearpayment_value
0201610New2016-1062591.65
1201612New2016-1219.62
2201701Existing2017-0119.62
3201701New2017-01176356.94
4201702Existing2017-02111.07
5201702New2017-02323704.88
6201703Existing2017-03596.38
7201703New2017-03505139.45
8201704Existing2017-042789.06
9201704New2017-04453319.26
10201705Existing2017-056733.95
11201705New2017-05694385.65
12201706Existing2017-066956.06
13201706New2017-06578444.92
14201707Existing2017-0713632.49
15201707New2017-07702437.49
16201708Existing2017-0815000.05
17201708New2017-08827689.89
18201709Existing2017-0914067.94
19201709New2017-09982017.67
20201710Existing2017-1020695.65
21201710New2017-10977913.97
22201711Existing2017-1125261.14
23201711New2017-111523286.72
24201712Existing2017-1224133.48
25201712New2017-12995933.78
26201801Existing2018-0125079.90
27201801New2018-011348984.12
28201802Existing2018-0226661.62
29201802New2018-021253352.92
30201803Existing2018-0334684.83
31201803New2018-031400773.50
32201804Existing2018-0441982.07
33201804New2018-041424625.08
34201805Existing2018-0536272.26
35201805New2018-051444395.33
36201806Existing2018-0640940.89
37201806New2018-061244455.89
38201807Existing2018-0733086.53
39201807New2018-071273620.89
40201808Existing2018-0827898.55
41201808New2018-081183341.54
1
2
3
4
5
6
7
8
# Revenue trend: new vs existing customers per month.
fig, ax = plt.subplots(figsize=(15, 6))
sns.set(palette='muted', color_codes=True)
ax = sns.lineplot(x='month_year', y='payment_value', data=df_user_type_revenue.query("usertype == 'New'"), label='New')
ax = sns.lineplot(x='month_year', y='payment_value', data=df_user_type_revenue.query("usertype == 'Existing'"), label='Existing')
format_spines(ax, right_border=False)
ax.set_title('Existing vs New Customer Comparison')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

png

1
2
3
4
5
6
7
# Ratio of new to existing unique customers in each month.
new_monthly = df.query("usertype == 'New'").groupby(['month_year'])['customer_unique_id'].nunique()
existing_monthly = df.query("usertype == 'Existing'").groupby(['month_year'])['customer_unique_id'].nunique()
df_user_ratio = (new_monthly / existing_monthly).reset_index()

# Months with no existing customers produce NaN; drop them.
df_user_ratio = df_user_ratio.dropna()
df_user_ratio.columns = ['month_year', 'NewCusRatio']

df_user_ratio
month_yearNewCusRatio
22017-01715.000000
32017-02808.000000
42017-03500.600000
52017-04125.333333
62017-05123.214286
72017-0677.871795
82017-0775.040000
92017-0871.175439
102017-0950.670886
112017-1049.193182
122017-1157.390244
132017-1247.660714
142018-0151.833333
152018-0256.151786
162018-0348.385714
172018-0440.629630
182018-0534.791444
192018-0632.103825
202018-0739.377483
212018-0837.012048
1
2
3
4
5
# Plot the new-customer ratio per month.
fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_year', y='NewCusRatio', df=df_user_ratio, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

png

Monthly Retention Rate

1
2
# Total payment per (customer, month). NOTE: immediately overwritten by the
# count version in the next cell; only the count feeds the retention tables.
df_user_purchase = df.groupby(['customer_unique_id','month_y'])['payment_value'].sum().reset_index()
df_user_purchase.head()
customer_unique_idmonth_ypayment_value
00000366f3b9a7992bf8c76cfdf3221e2201805141.90
10000b849f77a49e4a4ce2b2a4ca5be3f20180527.19
20000f46a3911fa3c080544448333706420170386.22
30000f6ccb0745a6a4b88665a16c9f07820171043.62
40004aac84e0df4da2b147fca70cf8255201711196.89
1
2
# Payment-row count per (customer, month); this is the activity signal used
# by the retention crosstabs below.
df_user_purchase = df.groupby(['customer_unique_id','month_y'])['payment_value'].count().reset_index()
df_user_purchase.head()
customer_unique_idmonth_ypayment_value
00000366f3b9a7992bf8c76cfdf3221e22018051
10000b849f77a49e4a4ce2b2a4ca5be3f2018051
20000f46a3911fa3c08054444833370642017031
30000f6ccb0745a6a4b88665a16c9f0782017101
40004aac84e0df4da2b147fca70cf82552017111
1
2
# Customer x month activity matrix (wide format) for retention analysis.
df_retention = pd.crosstab(df_user_purchase['customer_unique_id'], df_user_purchase['month_y']).reset_index()
df_retention.head()
month_ycustomer_unique_id201610201612201701201702201703201704201705201706201707...201711201712201801201802201803201804201805201806201807201808
00000366f3b9a7992bf8c76cfdf3221e2000000000...0000001000
10000b849f77a49e4a4ce2b2a4ca5be3f000000000...0000001000
20000f46a3911fa3c0805444483337064000010000...0000000000
30000f6ccb0745a6a4b88665a16c9f078000000000...0000000000
40004aac84e0df4da2b147fca70cf8255000000000...1000000000

5 rows × 23 columns

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Month-over-month retention: of the users active in month i, how many were
# also active in month i-1. Skip columns[0:2] (customer id + first month,
# which has no predecessor).
months = df_retention.columns[2:]
retention_array = []
for prev_month, selected_month in zip(months[:-1], months[1:]):
    active_now = df_retention[selected_month] > 0
    active_before = df_retention[prev_month] > 0
    retention_array.append({
        'month_y': int(selected_month),
        'TotalUserCount': df_retention[selected_month].sum(),
        'RetainedUserCount': df_retention[active_now & active_before][selected_month].sum(),
    })

df_retention = pd.DataFrame(retention_array)
df_retention['RetentionRate'] = df_retention['RetainedUserCount'] / df_retention['TotalUserCount']

df_retention
month_yTotalUserCountRetainedUserCountRetentionRate
020170171610.001397
1201702161820.001236
2201703250830.001196
32017042274110.004837
42017053478140.004025
52017063076160.005202
62017073802160.004208
72017084114230.005591
82017094082320.007839
92017104417320.007245
102017117182370.005152
112017125450410.007523
122018016974160.002294
132018026401270.004218
142018036914230.003327
152018046744310.004597
162018056693450.006723
172018066058380.006273
182018076097260.004264
192018086310370.005864
1
2
3
4
5
# Plot the month-over-month retention rate.
fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_y', y='RetentionRate', df=df_retention, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

png

Cohort Based Retention Rate

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# Cohort-based retention: for each cohort month, the fraction of that
# month's active users who stayed active in EVERY month up to month n
# (the query is cumulative, AND-ing each successive month).
df_retention = pd.crosstab(df_user_purchase['customer_unique_id'], df_user_purchase['month_y']).reset_index()
# Prefix columns with 'm_' so the integer month labels become valid
# identifiers for DataFrame.query below.
df_retention.columns = ['m_' + str(column) for column in df_retention.columns]

# BUG FIX: the original executed this identical loop twice, resetting
# retention_array in between and discarding the first pass's work; a single
# pass produces the same final result.
retention_array = []
for i in range(len(months)):
    retention_data = {}
    selected_month = months[i]
    prev_months = months[:i]
    next_months = months[i+1:]
    # Months before the cohort month are not applicable.
    for prev_month in prev_months:
        retention_data[prev_month] = np.nan

    total_user_count = retention_data['TotalUserCount'] = df_retention['m_' + str(selected_month)].sum()
    retention_data[selected_month] = 1  # 100% of the cohort in its own month

    # Cumulative condition: active in the cohort month AND every month since.
    query = "{} > 0".format('m_' + str(selected_month))
    for next_month in next_months:
        query = query + " and {} > 0".format('m_' + str(next_month))
        retention_data[next_month] = np.round(
            df_retention.query(query)['m_' + str(next_month)].sum() / total_user_count, 2)
    retention_array.append(retention_data)

df_retention = pd.DataFrame(retention_array)
df_retention.index = months

df_retention
TotalUserCount201612201701201702201703201704201705201706201707201708...201711201712201801201802201803201804201805201806201807201808
month_y
20161211.01.00.00.00.00.000.00.000.00...0.000.000.00.00.00.00.000.000.00.00
201701716NaN1.00.00.00.00.000.00.000.00...0.000.000.00.00.00.00.000.000.00.00
2017021618NaNNaN1.00.00.00.000.00.000.00...0.000.000.00.00.00.00.000.000.00.00
2017032508NaNNaNNaN1.00.00.000.00.000.00...0.000.000.00.00.00.00.000.000.00.00
2017042274NaNNaNNaNNaN1.00.010.00.000.00...0.000.000.00.00.00.00.000.000.00.00
2017053478NaNNaNNaNNaNNaN1.000.00.000.00...0.000.000.00.00.00.00.000.000.00.00
2017063076NaNNaNNaNNaNNaNNaN1.00.010.00...0.000.000.00.00.00.00.000.000.00.00
2017073802NaNNaNNaNNaNNaNNaNNaN1.000.01...0.000.000.00.00.00.00.000.000.00.00
2017084114NaNNaNNaNNaNNaNNaNNaNNaN1.00...0.000.000.00.00.00.00.000.000.00.00
2017094082NaNNaNNaNNaNNaNNaNNaNNaNNaN...0.000.000.00.00.00.00.000.000.00.00
2017104417NaNNaNNaNNaNNaNNaNNaNNaNNaN...0.010.000.00.00.00.00.000.000.00.00
2017117182NaNNaNNaNNaNNaNNaNNaNNaNNaN...1.000.010.00.00.00.00.000.000.00.00
2017125450NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaN1.000.00.00.00.00.000.000.00.00
2018016974NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaN1.00.00.00.00.000.000.00.00
2018026401NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaN1.00.00.00.000.000.00.00
2018036914NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaN1.00.00.000.000.00.00
2018046744NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaN1.00.010.000.00.00
2018056693NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaN1.000.010.00.00
2018066058NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaN1.000.00.00
2018076097NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN1.00.01
2018086310NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaN1.00

21 rows × 22 columns

RFM

Recency

1
2
3
4
5
6
7
# Recency: days since each customer's most recent purchase, measured from
# the newest purchase date in the whole dataset.
# NOTE(review): df['customer_unique_id'] is taken WITH duplicates (one row
# per payment/item row), so the merge below yields one Recency row per
# original row rather than one per customer; later cluster sizes are
# therefore weighted by purchase volume. Presumably intentional — confirm.
df_user = pd.DataFrame(df['customer_unique_id'])
df_user.columns = ['customer_unique_id']
df_max_purchase = df.groupby('customer_unique_id')["order_purchase_timestamp"].max().reset_index()
df_max_purchase.columns = ['customer_unique_id', 'MaxPurchaseDate']
df_max_purchase['Recency'] = (df_max_purchase['MaxPurchaseDate'].max() - df_max_purchase['MaxPurchaseDate']).dt.days

df_user = pd.merge(df_user, df_max_purchase[['customer_unique_id','Recency']], on='customer_unique_id')
1
2
3
4
5
# Distribution of Recency across rows.
sns.set(palette='muted', color_codes=True, style='white')
fig, ax = plt.subplots(figsize=(12, 6))
sns.despine(left=True)
# NOTE: distplot is deprecated in newer seaborn (histplot/displot replace it).
sns.distplot(df_user['Recency'], bins=30)
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Clustering — elbow method to pick k for the Recency dimension.
from sklearn.cluster import KMeans

sse = {}
# Explicit copy so the 'clusters' assignment below cannot trigger pandas'
# SettingWithCopyWarning.
df_recency = df_user[['Recency']].copy()
for k in range(1, 10):
    # BUG FIX: fit on the Recency column only. The original fit on the
    # whole df_recency frame, which from the second iteration onward also
    # contained the previous iteration's 'clusters' labels as a feature,
    # skewing the inertia curve.
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_recency[['Recency']])
    df_recency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_

plt.figure(figsize=(10, 5))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Final KMeans (k=5, from the elbow above) on Recency; label every row.
kmeans = KMeans(n_clusters=5)
kmeans.fit(df_user[['Recency']])
df_user['RecencyCluster'] = kmeans.predict(df_user[['Recency']])

def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel arbitrary KMeans cluster ids so they are ordered by each
    cluster's mean of *target_field_name*.

    With ascending=True, cluster 0 gets the lowest mean and the highest id
    the largest mean (and vice versa for ascending=False).

    Parameters
    ----------
    cluster_field_name : name of the column holding the raw cluster ids.
    target_field_name : numeric column whose per-cluster mean defines the order.
    df : input DataFrame (not modified).
    ascending : sort direction for the cluster means.

    Returns
    -------
    A new DataFrame identical to *df* except that *cluster_field_name* now
    holds the ordered ids (row order may differ due to the merge).
    """
    # (Removed the dead `new_cluster_field_name` local from the original.)
    ranked = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    ranked = ranked.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    # The positional index after sorting IS the new, ordered cluster id.
    ranked['index'] = ranked.index
    relabeled = pd.merge(df, ranked[[cluster_field_name, 'index']], on=cluster_field_name)
    relabeled = relabeled.drop([cluster_field_name], axis=1)
    relabeled = relabeled.rename(columns={"index": cluster_field_name})
    return relabeled

# ascending=False: higher RecencyCluster id = lower mean Recency (more recent).
df_user = order_cluster('RecencyCluster', 'Recency',df_user,False)
1
# Sanity check: per-cluster Recency distribution after reordering.
df_user.groupby('RecencyCluster')['Recency'].describe()
countmeanstdmin25%50%75%max
RecencyCluster
013777.0512.65631148.794950448.0473.0506.0543.0694.0
118773.0381.15825935.208183323.0350.0380.0410.0447.0
227171.0263.26675531.174965210.0234.0267.0285.0322.0
330594.0154.05004231.607872102.0126.0155.0182.0209.0
424703.048.61664628.0301050.024.044.073.0101.0

Frequency

1
2
3
4
# Frequency: number of purchase rows per customer.
df_frequency = df.groupby('customer_unique_id')['order_purchase_timestamp'].count().reset_index()
df_frequency.columns = ['customer_unique_id', 'Frequency']

df_user = df_user.merge(df_frequency, on='customer_unique_id')
1
2
3
4
5
# Kernel-density view of Frequency (hist=False draws the KDE only).
sns.set(palette='muted', color_codes=True, style='whitegrid')
fig, ax = plt.subplots(figsize=(12, 6))
sns.despine(left=True)
sns.distplot(df_user['Frequency'], hist=False)
plt.show()

png

1
2
3
4
5
6
7
8
# Clustering
# KMeans (k=5) on Frequency; then reorder ids so a higher FrequencyCluster
# means more purchases (ascending=True).
kmeans = KMeans(n_clusters=5)
kmeans.fit(df_user[['Frequency']])
df_user['FrequencyCluster'] = kmeans.predict(df_user[['Frequency']])

df_user = order_cluster('FrequencyCluster', 'Frequency',df_user,True)

df_user.groupby('FrequencyCluster')['Frequency'].describe()
countmeanstdmin25%50%75%max
FrequencyCluster
0100150.01.2079680.4058561.01.01.01.02.0
112428.03.9213071.0483233.03.04.05.06.0
21968.09.8506102.5636867.07.09.012.016.0
3397.024.5365246.09908118.020.022.024.038.0
475.075.0000000.00000075.075.075.075.075.0

Revenue

1
2
3
# Monetary: total payment value per customer.
df_revenue = df.groupby('customer_unique_id')['payment_value'].sum().reset_index()

df_user = df_user.merge(df_revenue, on='customer_unique_id')
1
2
3
4
5
# Kernel-density view of per-customer revenue.
sns.set(palette='muted', color_codes=True, style='white')
fig, ax = plt.subplots(figsize=(12, 6))
sns.despine(left=True)
sns.distplot(df_user['payment_value'], hist=False)
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
# Elbow method to pick k for the Monetary dimension.
sse = {}
# Explicit copy so the 'clusters' assignment cannot warn about writing to a view.
df_revenue = df_user[['payment_value']].copy()
for k in range(1, 10):
    # BUG FIX: fit on payment_value only. The original fit on the whole
    # frame, which from the second iteration onward also contained the
    # previous iteration's 'clusters' labels as an extra feature.
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_revenue[['payment_value']])
    df_revenue["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_

plt.figure(figsize=(10, 5))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()

png

1
2
3
4
5
6
7
# Final KMeans (k=6) on Monetary; reorder ids so a higher RevenueCluster
# means more revenue.
kmeans = KMeans(n_clusters=6)
kmeans.fit(df_user[['payment_value']])
df_user['RevenueCluster'] = kmeans.predict(df_user[['payment_value']])

df_user = order_cluster('RevenueCluster', 'payment_value',df_user,True)

df_user.groupby('RevenueCluster')['payment_value'].describe()
countmeanstdmin25%50%75%max
RevenueCluster
0108472.0192.576355191.0573899.5967.53124.24233.5351033.12
15565.01881.126228817.8337401034.061252.811608.362218.0004415.96
2743.07000.2834192137.8604924447.805289.126317.227971.88012490.88
3184.020278.1104355207.41779314196.2816313.6019174.3825051.89030186.00
446.043587.2921742837.17707236489.2444048.0044048.0045256.00045256.00
58.0109312.6400000.000000109312.64109312.64109312.64109312.640109312.64

Overall Score

1
2
3
4
# Rename payment_value -> Monetary for the final RFM table.
df_user.columns = ['customer_unique_id', 'Recency', 'RecencyCluster', 'Frequency', 'FrequencyCluster', 'Monetary', 'RevenueCluster']

# Combined RFM score: sum of the three ordered cluster ids (higher = better,
# since each cluster id was ordered in the "good" direction above).
df_user['OverallScore'] = df_user['RecencyCluster'] + df_user['FrequencyCluster'] + df_user['RevenueCluster']
# BUG FIX: multi-column selection on a GroupBy needs a list; the bare
# tuple form ['Recency','Frequency','Monetary'] was deprecated and is
# removed in pandas 2.0.
df_user.groupby('OverallScore')[['Recency', 'Frequency', 'Monetary']].mean()
RecencyFrequencyMonetary
OverallScore
0511.7590011.207100167.861348
1390.7722101.352797195.827734
2276.3513421.468897222.252763
3170.6355851.577970251.513166
470.1355041.674742306.303808
5105.6969625.9482941120.916258
699.0653066.7850343074.301905
7110.81224511.4489805401.498551
8143.40322611.46236622318.521129
1096.45454522.18181835259.341818
1
2
3
# Map OverallScore to three named segments: 0-3 Low, 4-6 Mid, 7+ High.
df_user['Segment'] = 'Low-Value'
df_user.loc[df_user['OverallScore']>3,'Segment'] = 'Mid-Value' 
df_user.loc[df_user['OverallScore']>6,'Segment'] = 'High-Value' 
1
2
3
4
5
6
7
8
9
10
# Scatter the three RFM pairings, colored and sized by segment.
sns.set(palette='muted', color_codes=True, style='whitegrid')
fig, axs = plt.subplots(1, 3, figsize=(22, 5))
sns.despine(left=True)
sns.scatterplot(x='Recency', y='Frequency', ax=axs[0], hue='Segment', data=df_user, size='Segment', sizes=(50,150), size_order=['High-Value','Mid-Value','Low-Value'])
sns.scatterplot(x='Frequency', y='Monetary', ax=axs[1], hue='Segment', data=df_user, size='Segment' , sizes=(50,150), size_order=['High-Value','Mid-Value','Low-Value'])
sns.scatterplot(x='Recency', y='Monetary', ax=axs[2], hue='Segment', data=df_user, size='Segment' , sizes=(50,150), size_order=['High-Value','Mid-Value','Low-Value'])
axs[0].set_title('Customer Segments by Recency & Frequency')
axs[1].set_title('Customer Segments by Frequency & Monetary')
axs[2].set_title('Customer Segments by Recency & Monetary')
plt.show()

png

마무리

그래프 그리는 함수를 미리 정의해 놓고 사용했다는 점과, 등급을 나누기 위해 군집화를 사용했다는 점이 신선했다.

This post is licensed under CC BY 4.0 by the author.

차원 축소(Dimension Reduction) 정리

군집화(Clustering) 정리

Comments powered by Disqus.