-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathflo_rfm.py
213 lines (162 loc) · 9.45 KB
/
flo_rfm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# load libraries
import datetime as dt
import pandas as pd
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 500)
# pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# load data and create a copy
df_ = pd.read_csv("flo_data_20k.csv")
df = df_.copy()
# Examine the first 10 observations, variable names, descriptive statistics, missing values, variable types
def general_info(dataframe):
print("First 10 observations")
print("################################")
print(dataframe.head(10),end="\n\n")
print("Variable names")
print("################################")
print(dataframe.columns, end= "\n\n")
print("Descriptive statistics")
print("################################")
print(dataframe.describe().T, end="\n\n")
print("Missing values")
print("################################")
print(dataframe.isnull().sum(),end="\n\n")
print("Variable types")
print("################################")
print(dataframe.info())
general_info(df)
# Omnichannel means that customers shop both online and offline.
# Create new variables for the total number of purchases and spending for each customer.
df["order_num_total_ever"] = df["order_num_total_ever_online"] + df["order_num_total_ever_offline"]
df["customer_value_total_ever"] = df["customer_value_total_ever_online"] + df["customer_value_total_ever_offline"]
# examine the features types
df.info()
df.head(3)
# convert the date variables to date type
df["first_order_date"] = pd.to_datetime(df["first_order_date"])
df["last_order_date"] = pd.to_datetime(df["last_order_date"])
df["last_order_date_online"] = pd.to_datetime(df["last_order_date_online"])
df["last_order_date_offline"] = pd.to_datetime(df["last_order_date_offline"])
# look at the distrubution of the number of customers, total number of products and total spending by shopping channel
df.groupby("order_channel").agg({"master_id":"nunique",
"order_num_total_ever":"sum",
"customer_value_total_ever":"sum"})
# sort the customers by their total spending and examine the first 10 customers
df.master_id.duplicated().sum() # 0
df.sort_values("customer_value_total_ever", ascending=False).head(10)
#df.groupby("master_id").agg({"customer_value_total_ever": "sum"}).sort_values("customer_value_total_ever", ascending=False).head(10)
# sort the customers by their total number of purchases and examine the first 10 customers
df.sort_values("order_num_total_ever", ascending=False).head(10)
# create a function for data preparation
path = "flo_data_20k.csv"
def preprocessing(path):
# Step 1
df_ = pd.read_csv(path)
df = df_.copy()
# Step 2
print("Overview")
print("################################################################")
print("First 10 observations")
print("################################################################")
print(df.head(10), end="\n\n")
print("Variable names")
print("################################################################")
print(df.columns, end="\n\n")
print("Descriptive statistics")
print("################################################################")
print(df.describe().T, end="\n\n")
print("Missing values")
print("################################################################")
print(df.isnull().sum(), end="\n\n")
print("Variable types")
print("################################################################")
print(df.info(),end="\n\n")
# Step 3
df["order_num_total_ever"] = df["order_num_total_ever_online"] + df["order_num_total_ever_offline"]
df["customer_value_total_ever"] = df["customer_value_total_ever_online"] + df["customer_value_total_ever_offline"]
# Step 4
for i in df.columns:
if 'date' in i:
df[i] = pd.to_datetime(df[i])
# Step 5
print("Distribution of the number of customers, total number of products and total spending by shopping channel")
print("################################################################")
print(df.groupby("order_channel").agg({"master_id":"nunique",
"order_num_total_ever":"sum",
"customer_value_total_ever":"sum"}),end="\n\n")
# Step 6
print("Top 10 customers by total spending")
print("################################")
print(df.sort_values("customer_value_total_ever", ascending=False).head(10),end="\n\n")
# Step 7
print("Top 10 customers by total number of purchases")
print("################################################################")
print(df.sort_values("order_num_total_ever", ascending=False).head(10))
return df
df = preprocessing(path)
#########################################################################################################################
#########################################################################################################################
# recency : time since last purchase
# frequency : total number of purchases
# monetary : total spending
# select the date of analysis as the day after the maximum date
df.last_order_date.max() # Timestamp('2021-05-30 00:00:00')
analysis_date = df.last_order_date.max() + pd.DateOffset(days=2)
# Recency = analysis_date - last_order_date
# frequency = order_num_total_ever
# monetary = customer_value_total_ever
# calculate recency, frequency and monetary for each customer
df.groupby("master_id").agg({"last_order_date": lambda date: (analysis_date - date.max()).days,
"order_num_total_ever": lambda order: order.count(),
"customer_value_total_ever": lambda price: price.sum()})
#Assign your calculated metrics to a variable named rfm and change the names of the columns to recency, frequency and monetary.
rfm = df.groupby("master_id").agg({"last_order_date": lambda date: (analysis_date - date.max()).days,
"order_num_total_ever": lambda order: order.count(),
"customer_value_total_ever": lambda price: price.sum()})
rfm.columns = ["recency", "frequency", "monetary"]
#########################################################################################################################
#########################################################################################################################
# convert recency, frequency and monetary to scores between 1-5 using qcut function
# record these scores as recency_score, frequency_score and monetary_score
rfm["recent_score"] = pd.qcut(rfm["recency"], 5, labels=[5, 4, 3, 2, 1])
rfm["frequency_score"] = pd.qcut(rfm["frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])
rfm["monetary_score"] = pd.qcut(rfm["monetary"], 5, labels=[1, 2, 3, 4, 5])
# record the recency_score and frequency_score as a single variable RF_SCORE
rfm["RF_SCORE"] = (rfm["recent_score"].astype(str) + rfm["frequency_score"].astype(str))
#########################################################################################################################
#########################################################################################################################
# description of the segments for RFM analysis
seg_map = {
r'[1-2][1-2]': 'hibernating',
r'[1-2][3-4]': 'at_Risk',
r'[1-2]5': 'cant_loose',
r'3[1-2]': 'about_to_sleep',
r'33': 'need_attention',
r'[3-4][4-5]': 'loyal_customers',
r'41': 'promising',
r'51': 'new_customers',
r'[4-5][2-3]': 'potential_loyalists',
r'5[4-5]': 'champions'
}
# convert the RF_SCORE to segments using the seg_map
rfm["segment"] = rfm["RF_SCORE"].replace(seg_map, regex=True)
#########################################################################################################################
#########################################################################################################################
# examine the average recency, frequency and monetary values for each segment
rfm[["segment", "recency", "frequency", "monetary"]].groupby("segment").agg(["mean"])
# FLO includes a new women's shoe brand. The price of the products included in the brand is above the general customer preferences.
# For this reason, it is desired to communicate with the customers who will be interested in the promotion and sales of the product.
# Special communication will be made with customers who are loyal customers (champions, loyal_customers)
type1 = rfm[(rfm["segment"] == "champions") | (rfm["segment"] == "loyal_customers")]
type2 = df[(df["interested_in_categories_12"]).str.contains("KADIN")]
id1 = pd.merge(type1, type2, on=["master_id"]).master_id
# save to csv file
id1.to_csv("case1_id.csv", index=False)
# nearly 40% discount is planned for men and children products.
# It is planned to target customers who have not purchased for a long time, the customers who are about to sleep and new customers.
customer1 = rfm[(rfm["segment"] == "cant_loose") | (rfm["segment"] == "about_to_sleep") | (rfm["segment"] == "new_customers")]
customer2 = df[(df["interested_in_categories_12"]).str.contains("ERKEK | COCUK")]
id2 = pd.merge(customer1, customer2, on=["master_id"]).master_id
# save to csv file
id2.to_csv("case2_id.csv", index=False)