From d4dc7da111173266bf60b989fa7ba37538ea9faf Mon Sep 17 00:00:00 2001
From: Nishil <63183230+Nishil07@users.noreply.github.com>
Date: Fri, 28 Aug 2020 20:59:29 +0530
Subject: [PATCH 1/2] Update Notebook.py

---
 Notebook.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Notebook.py b/Notebook.py
index acc9018..20e9cb8 100644
--- a/Notebook.py
+++ b/Notebook.py
@@ -15,7 +15,7 @@
        'social':[1,1,0,0,0,1,1,1,1,0],
        'web':[0,1,1,1,1,1,1,0,1,1]}
 
-oh = pd.DataFrame(ohe,columns = ['email','mobile','social','web']) 
+ohx = pd.DataFrame(ohe,columns = ['email','mobile','social','web']) 
 
 cleaned_portfolio = portfolio
-cleaned_portfolio = pd.concat([portfolio,oh],axis=1)
\ No newline at end of file
+cleaned_portfolio = pd.concat([portfolio,ohx],axis=1)

From 053d3e8feedb05a20682bde054487be1d993d271 Mon Sep 17 00:00:00 2001
From: Nishil <63183230+Nishil07@users.noreply.github.com>
Date: Fri, 28 Aug 2020 23:04:16 +0530
Subject: [PATCH 2/2] Update Notebook.py

---
 Notebook.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/Notebook.py b/Notebook.py
index 20e9cb8..da0ee2d 100644
--- a/Notebook.py
+++ b/Notebook.py
@@ -19,3 +19,43 @@
 
 cleaned_portfolio = portfolio
 cleaned_portfolio = pd.concat([portfolio,ohx],axis=1)
+
+# Data Cleaning of profile dataset
+
+# To check the number of NULL values in each column
+# profile.isnull().sum()
+'''
+gender              2175
+age                    0
+id                     0
+became_member_on       0
+income              2175
+'''
+# Checking the age column for all rows where gender and income are null shows
+# that the corresponding age value is 118, which is quite unusual.
+# So, to clean the data, we drop all such rows.
+
+# Dropping NULL values
+cleaned_profile = profile
+cleaned_profile = cleaned_profile.dropna()
+
+# OneHotEncoding the gender column
+ohe = pd.get_dummies(cleaned_profile['gender'])
+cleaned_profile = pd.concat([cleaned_profile,ohe],axis=1)
+cleaned_profile = cleaned_profile.drop(['gender'],axis=1)
+
+# Convert became_member_on from its integer YYYYMMDD form into date values,
+# because dates stored as plain integers cannot be used for date arithmetic.
+cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y%m%d').dt.date
+
+# Add a reference-date column (fixed at 2020-08-28) to the dataframe, used to calculate the number of days each customer has been a member of Starbucks.
+cleaned_profile['today_date'] = pd.to_datetime('20200828',format='%Y%m%d')
+cleaned_profile['today_date'] = pd.to_datetime(cleaned_profile['today_date'],format='%Y%m%d').dt.date
+cleaned_profile['days_of_membership'] = cleaned_profile['today_date'].sub(cleaned_profile['became_member_on'], axis=0)
+
+# Divide the date difference by a one-day timedelta to convert it into a number of days.
+cleaned_profile['days_of_membership'] = cleaned_profile['days_of_membership'] / np.timedelta64(1, 'D')
+cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y-%m-%d').dt.year
+
+# Then drop the reference column because it is not needed for further analysis.
+cleaned_profile = cleaned_profile.drop(['today_date'],axis=1)