# Data Analytics Workflow.py

# -*- coding: utf-8 -*-
"""W6W7W8_Lukas_Wisesa_Intermediate.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/12sgjIszFR7O6n5mjYYqfLaaAmlntbSB6
"""

import pandas as pd
import numpy as np
import matplotlib as plt

"""## Data Input from Google Drive to Google Colab"""

def _drive_download_url(share_link):
    """Turn a Google Drive share link into a direct-download URL.

    The file id is taken as the second-to-last '/'-separated segment of the
    link (the part between ``/d/`` and ``/view``).
    """
    file_id = share_link.split('/')[-2]
    return 'https://drive.google.com/uc?id=' + file_id


# Each CSV is fetched over the network straight into a DataFrame; the
# sheet_url_N names end up holding the direct-download URL.
sheet_url_1 = _drive_download_url(
    'https://drive.google.com/file/d/1R9g5f310AA1Zmrnmd8P8jclnyToN54Bx/view'
)
Orders_dataset = pd.read_csv(sheet_url_1)
Orders_dataset.head()

sheet_url_2 = _drive_download_url(
    'https://drive.google.com/file/d/1S_BzA_P4v26oIvErtB4Ta64fQB53Hw1_/view?usp=sharing'
)
Customers_dataset = pd.read_csv(sheet_url_2)
Customers_dataset.head()

sheet_url_3 = _drive_download_url(
    'https://drive.google.com/file/d/1JBzMoiRBy6YsxHoS3OhysRcZZyRh5DQD/view?usp=sharing'
)
Order_payments_dataset = pd.read_csv(sheet_url_3)
Order_payments_dataset.head()

"""## Combining Data"""

# Join customers onto orders by customer_id, then payments by order_id.
# Outer joins keep unmatched rows from both sides (they surface as NaN rows).
df_combined1 = Orders_dataset.merge(Customers_dataset, how='outer', on='customer_id')
df_combined2 = df_combined1.merge(Order_payments_dataset, how='outer', on='order_id')
df_combined2.head(10)

"""## Data Cleaning (NaN Value)"""

def _report_incomplete_rows(frame):
    """Print a summary of the rows of ``frame`` that contain any NaN.

    Prints the per-column non-null counts of those rows, how many there are,
    and their share of ``frame`` itself (0.0 for an empty frame).
    Returns the incomplete rows as a DataFrame.
    """
    incomplete = frame[frame.isna().any(axis=1)]
    # The share is measured against the frame being inspected, not against a
    # different table, so it always stays within 0-100 %.
    share = len(incomplete) * 100 / len(frame) if len(frame) else 0.0

    print(incomplete.count())
    print("")
    print("_____________________________________________________")
    print("Number of incomplete data :", len(incomplete))
    print("Percentage of incomplete data:", share, "%")
    print("_____________________________________________________")
    return incomplete


df = df_combined2
df_null = _report_incomplete_rows(df)

# Remove all null
df = df.dropna()
# Sanity check: after dropna() this report is expected to show zero rows.
df_null2 = _report_incomplete_rows(df)

# Drop exact duplicate rows and renumber the index from 0.
df_ori = df.drop_duplicates().reset_index(drop=True)
df_ori.tail(10)

# Convert every order timestamp column to datetime dtype.
for stamp_col in (
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
):
    df_ori[stamp_col] = pd.to_datetime(df_ori[stamp_col])

"""## Removing Outliers --Payment Value"""

# Cancelled orders are excluded before looking at payment values.
df_ori = df_ori.loc[df_ori['order_status'] != 'cancelled']
df_ori.boxplot(column='payment_value')
Q3 = df_ori.payment_value.quantile(0.75)
Q1 = df_ori.payment_value.quantile(0.25)
IQR = Q3 - Q1

# NOTE(review): the fences sit 1.0 * IQR beyond the quartiles, which is
# tighter than the conventional 1.5 * IQR; kept as in the original analysis.
Max_value = Q3 + IQR
Min_value = Q1 - IQR
print("Quantile 3 :", Q3)
print("Quantile 1 :", Q1)
print("Batas atas", Max_value)
print("Batas bawah", Min_value)

# Count outliers with a boolean mask. The filter above leaves gaps in the
# index, so looking rows up by label 0..n-1 would fail, and the frame to
# measure against is df_ori (not the pre-deduplication df).
payment = df_ori['payment_value']
outlier_mask = (payment > Max_value) | (payment < Min_value)
a = int(outlier_mask.sum())
outlier_pct = a * 100 / len(df_ori) if len(df_ori) else 0.0

print("Number of outlier is ", a)
print("Outlier Percentage is", outlier_pct, "%")

# Distribution of payment values before trimming.
df_ori.payment_value.plot(kind='hist', subplots=True, sharex=True, sharey=True,
                          title='Payment_value', bins=200)

# Keep only rows strictly inside the (Min_value, Max_value) fences.
inside_fences = df_ori['payment_value'].gt(Min_value) & df_ori['payment_value'].lt(Max_value)
df_clean = df_ori.loc[inside_fences]

# From here on both names hold the trimmed data, each as its own re-indexed frame.
df_ori = df_clean.reset_index(drop=True)
df_clean = df_clean.reset_index(drop=True)

# Re-count values outside the fences in the trimmed data.
a = sum(
    1
    for amount in df_clean.payment_value
    if amount > Max_value or amount < Min_value
)

print("Number of outlier is ", a)
print("Outlier Percentage is", a*100/len(df_clean), "%")

# Distribution of payment values after trimming.
df_clean.payment_value.plot(kind='hist', subplots=True, sharex=True, sharey=True,
                            title='Payment_value no outlier', bins=200)
df_clean.head(5)

"""## Exploratory Data Analysis --Descriptive Analysis"""

_RULE = "______________________________________"


def _print_stat(title, values):
    """Print a heading, the statistic itself, and a separator rule."""
    print(title)
    print(values)
    print(_RULE)


# Moment- and quantile-based statistics are only defined for numeric columns;
# calling them on the full frame (which also holds text and datetime columns)
# raises TypeError on recent pandas versions.
df_numeric = df_ori.select_dtypes(include='number')

print("Count every distinct value in each column")
print(_RULE)
print(df_ori.nunique())
print(_RULE)

_print_stat("Count every min in each column", df_ori.min())
_print_stat("Count every max in each column", df_ori.max())
_print_stat("Count every median in each column", df_numeric.median())
_print_stat("Count every mode in each column", df_ori.mode().T)

# Per-column mean over all rows (a row slice such as iloc[12:14] would
# average just two records).
_print_stat("Count every mean in each column", df_numeric.mean())

numeric_q1 = df_numeric.quantile(q=0.25)
numeric_q3 = df_numeric.quantile(q=0.75)
_print_stat("Count every Q1 in each column", numeric_q1)
_print_stat("Count every Q3 in each column", numeric_q3)

# The value printed under "range" is the interquartile range (Q3 - Q1).
_print_stat("Count every range in each column", numeric_q3 - numeric_q1)

_print_stat("Count every variance in each column", df_numeric.var())
_print_stat("Count every standard deviation in each column", df_numeric.std())

# Coefficient of variation = standard deviation / mean, per numeric column.
_print_stat("Count every coef of var in each column", df_numeric.std() / df_numeric.mean())

_print_stat("Count every kurtosis in each column", df_numeric.kurtosis())
_print_stat("Count every skewness in each column", df_numeric.skew())

print("average of payment_sequential:", df_ori.payment_sequential.mean())
print("average of payment_installments:", df_ori.payment_installments.mean())
print("average of payment_value:", df_ori.payment_value.mean())

"""## Exploratory Data Analysis --Visualization

### Number of Orders/Month
"""

# Number of rows (order_id entries) per approval month
df_ori.sort_values(by=['order_approved_at'], ascending=False).head(5)

# Pair each order_id with its approval month formatted as 'YYYY/MM'.
df_month = pd.DataFrame({
    'order_approved_at': df_ori['order_approved_at'].dt.strftime('%Y/%m'),
    'order_id': df_ori['order_id'],
})

# Count rows per month, sort chronologically, and index by month for plotting.
df_month = (
    df_month[['order_approved_at', 'order_id']]
    .groupby('order_approved_at', as_index=False)
    .count()
    .rename(columns={'order_id': 'total order'})
    .sort_values(by=['order_approved_at'], ascending=True)
    .set_index('order_approved_at')
)

import matplotlib.pyplot as plt
import matplotlib

plt.style.use("dark_background")
plt.figure(figsize=(12, 10))

plt.xlabel("order_approved_at")
plt.ylabel("total_order")
plt.title("Orders/month")

plt.rcParams.update({'font.size': 8})
monthly_totals = df_month["total order"]
plt.plot(monthly_totals)

# 2018 was the busiest year, especially January, March, and May.
# 2017 shows a massive spike, indicating strong growth.
# From 2018 up to the latest data, the trend is decreasing.

"""### Busiest Day"""

# Busiest day of the week, by number of rows (order_id entries) approved.
df_day = pd.DataFrame({
    'order_approved_at': df_ori['order_approved_at'].dt.dayofweek,
    'order_id': df_ori['order_id'],
})

df_day = df_day[['order_approved_at', 'order_id']].groupby('order_approved_at', as_index=False).count()
df_day = df_day.rename(columns={'order_id': 'total order'})
# Sort by the numeric weekday code so the plot runs Monday -> Sunday.
df_day = df_day.sort_values(by=['order_approved_at'], ascending=True)

# pandas' dt.dayofweek numbers the week Monday=0 ... Sunday=6.
encode_values = {
    "Days": {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday"},
    }
df_day.order_approved_at = df_day.order_approved_at.replace(encode_values['Days'])
df_day = df_day.set_index('order_approved_at')

df_day.sort_values(by=['total order'], ascending=False)

import matplotlib.pyplot as plt
import matplotlib

plt.style.use("dark_background")
plt.figure(figsize=(12, 10))

plt.xlabel("order_approved_at")
plt.ylabel("total_order")
plt.title("Orders/day of week")

plt.rcParams.update({'font.size': 8})
plt.plot(df_day["total order"])

# NOTE(review): the earlier conclusion ("Monday, then Tuesday, then Wednesday")
# was read off labels that were shifted by one day; with the corrected mapping
# it presumably becomes Tuesday, Wednesday, Thursday -- re-run to confirm.

"""## Payment Type Percentage"""

# Rows (order_id entries) per payment type.
df_payment_type = (
    df_ori[['payment_type', 'order_id']]
    .groupby('payment_type', as_index=False)
    .count()
    .rename(columns={'order_id': 'total_order'})
)

# Share of each payment type among all counted rows, in percent.
B = df_payment_type.total_order.sum()
List_percentage = [rows * 100 / B for rows in df_payment_type.total_order]

df_payment_type['payment type percentage (%)'] = List_percentage
df_payment_type = df_payment_type.sort_values('payment type percentage (%)')
df_payment_type
## Most users prefer credit card and boleto

import matplotlib.pyplot as plt
import numpy as np

plt.style.use("fivethirtyeight")

# Wedge sizes and labels, in the ascending-percentage order of df_payment_type.
y = np.array(df_payment_type['payment type percentage (%)'])
mylabels = df_payment_type.payment_type
# Pull out the third wedge (position 2) as before, but size the list to the
# actual number of payment types so plt.pie does not fail when it is not 4.
myexplode = [0.2 if position == 2 else 0.0 for position in range(len(y))]

def func(pct, allvals):
    """Build a two-line pie label: the percentage and the amount it represents.

    pct is a percentage (0-100); allvals are the values whose sum is the
    whole. Returns e.g. "50.0%\\n(20)" -- the percentage with one decimal,
    then the rounded share of sum(allvals) in parentheses.
    """
    whole = np.sum(allvals)
    absolute = int(np.round(pct / 100. * whole))
    return f"{pct:.1f}%\n({absolute:d})"

plt.rcParams.update({'font.size': 14})
plt.figure(figsize=(8, 8))
# func() turns a wedge percentage back into an absolute number, so it must be
# given the per-type counts; given the percentage array (which sums to ~100)
# it would just repeat the rounded percentage inside the parentheses.
order_counts = np.array(df_payment_type['total_order'])
plt.pie(y, labels=mylabels, explode=myexplode,
        autopct=lambda pct: func(pct, order_counts), textprops=dict(color="k"))
plt.show()

"""## Business Question"""

# 1. Do cities differ in their average payment value and in how many orders come from them? A ranking system would make the comparison easier.
# --- This insight could be used to estimate buying power and to prioritize cities for campaigns.

# 2. Can we estimate the online shop revenue per month ?
# --- This could be used to forecast our revenue and analyze whether a campaign is successful or not.

# 3. Can we estimate delivery time based on the customer location ?
# --- This could be used to create a better user experience.

"""## Q1 : City Campaign --Data Processing"""

# Average payment value per city.
Answer_one_p1 = df_clean[['customer_city', 'payment_value']].groupby('customer_city', as_index=False).mean()
Answer_one_p1
# Number of rows (order_id entries) per city.
Answer_one_p2 = df_clean[['customer_city', 'order_id']].groupby('customer_city', as_index=False).count()
Answer_one_p2

# One row per city: mean payment value and order count. The count column is
# renamed by name (not by position) so a change in column order cannot
# silently mislabel the data.
Answer_one_final = pd.merge(Answer_one_p1, Answer_one_p2, how='outer', on='customer_city')
Answer_one_final = Answer_one_final.rename(columns={'order_id': 'total_order'})

# Each city's share of all counted orders, in percent.
B = Answer_one_final.total_order.sum()
List_percentage = (Answer_one_final['total_order'] * 100 / B).tolist()

Answer_one_final['Percentage from all of the orders (%)'] = List_percentage
Answer_one_final.sort_values(by=['total_order'], ascending=False).head(5)

"""## Q1 : City Campaign --Ranking/Priority System"""

## A ranking system can be used to set priorities between cities. This is useful when the e-commerce platform wants to advertise on a billboard or sponsor a local competition:
## it can target its advertising based on how much users spend in each market, and so generate more transactions.

# Min-max normalization of both ranking inputs to the 0-1 range
Ranks = Answer_one_final.rename(columns={"payment_value": "avg_payment_value"})

avg_pay = Ranks['avg_payment_value']
pay_low, pay_high = min(avg_pay), max(avg_pay)
Ranks['payment_normalized'] = (avg_pay - pay_low) / (pay_high - pay_low)

order_totals = Ranks['total_order']
orders_low, orders_high = min(order_totals), max(order_totals)
Ranks['total_order_normalized'] = (order_totals - orders_low) / (orders_high - orders_low)
Ranks.sort_values(by=['total_order'], ascending=False).head(5)

# Scoring weights: the score is a weighted sum of the two normalized columns
a = 0.1  # weight of the average payment value
b = 0.9  # weight of the total number of orders
Ranks['score'] = a * Ranks['payment_normalized'] + b * Ranks['total_order_normalized']
# Rank 1 is the highest score.
Ranks['rank'] = Ranks['score'].rank(ascending=False)

Ranks.sort_values(by=['score'], ascending=False).head(5)

"""## Q2 : DailyForecast --Data Process (unfinished)"""

df_ori.sort_values(by=['order_approved_at'], ascending=False).head(5)

# One row per source row: approval date as 'YYYY/MM/DD', order id, payment value.
df_month = pd.DataFrame({
    'order_approved_at': df_ori['order_approved_at'].dt.strftime('%Y/%m/%d'),
    'order_id': df_ori['order_id'],
    'payment_value': df_ori['payment_value'],
})

# Daily totals: summed payment value and number of rows per approval date.
per_day = df_month.groupby('order_approved_at', as_index=False)
df_month = per_day.agg(
    Payment_valuem=('payment_value', 'sum'),
    Total_order=('order_id', 'count'),
)
# Skip the first two dates; reset_index() keeps the old position as an 'index' column.
df_forecast = df_month.iloc[2:].reset_index()
df_forecast.head(5)
## TK -- this section is unfinished; no forecasting step follows yet

"""## Cluster Analysis"""

import seaborn as sns
from sklearn import cluster
from sklearn.metrics import silhouette_samples, silhouette_score

# Select the clustering variables.
# Business question: cluster customers by spending level (total payments and
# number of orders) and assign them to a few classes.
df_ori.head(5)

# Per customer_id: total payment value and number of rows (order_id entries).
# NOTE(review): one aggregate reads df_ori and the other df_clean; both are
# assigned from the same trimmed data earlier in this script.
df_cluster1 = df_ori[['customer_id', 'payment_value']].groupby('customer_id', as_index=False).sum()
df_cluster2 = df_clean[['customer_id', 'order_id']].groupby('customer_id', as_index=False).count()

df_cluster = pd.merge(df_cluster1, df_cluster2, how='outer', on='customer_id')
# The key column holds customer ids, so it is labeled customer_id (not
# customer_city); the two features stay in positions 1 and 2.
df_cluster = df_cluster.set_axis(['customer_id', 'total_payment_value', 'total_order'], axis=1)
df_cluster.head()

"""## K-Means Cluster"""

# Feature matrix: columns 1 and 2 of df_cluster.
X = df_cluster.iloc[:, [1, 2]].values

from sklearn.cluster import KMeans

# Within-cluster sum of squares (inertia) for k = 1..10.
candidate_ks = range(1, 11)
wcss = [
    KMeans(n_clusters=k, init='k-means++', random_state=42).fit(X).inertia_
    for k in candidate_ks
]

# Elbow plot: look for the k after which WCSS stops dropping sharply.
plt.style.use("dark_background")
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.plot(candidate_ks, wcss)
plt.show()

# Final model with 4 clusters; y_kmeans holds the cluster index of each row of X.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(X)

# One scatter series per cluster, in cluster-index order with a fixed colour each.
cluster_colours = ('red', 'blue', 'green', 'cyan')
for cluster_idx, colour in enumerate(cluster_colours):
    members = X[y_kmeans == cluster_idx]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                label=f'Cluster {cluster_idx + 1}')

centres = kmeans.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers')
plt.xlabel('total_payment')
plt.ylabel('total_order')
plt.legend()
plt.show()

"""## Cluster Interpretation"""

# The green cluster contains the highest-spending, loyal customers on the platform.
# The blue cluster contains the second-highest-spending, loyal customers.
# The cyan (light-blue) cluster contains the third-highest-spending, loyal customers.
# The red cluster contains the lowest-spending customers on the platform.

"""## Recommendation based on the cluster result"""

# Segmented ads could be given
# 1. For the reds and light blues, ads should feature promo such as cheaper price or high discount.
# 2. For the greens and blues, ads featuring lifestyle that have high price could still be effective and giving much more revenue.