From d4dc7da111173266bf60b989fa7ba37538ea9faf Mon Sep 17 00:00:00 2001 From: Nishil <63183230+Nishil07@users.noreply.github.com> Date: Fri, 28 Aug 2020 20:59:29 +0530 Subject: [PATCH 1/2] Update Notebook.py --- Notebook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Notebook.py b/Notebook.py index acc9018..20e9cb8 100644 --- a/Notebook.py +++ b/Notebook.py @@ -15,7 +15,7 @@ 'social':[1,1,0,0,0,1,1,1,1,0], 'web':[0,1,1,1,1,1,1,0,1,1]} -oh = pd.DataFrame(ohe,columns = ['email','mobile','social','web']) +ohx = pd.DataFrame(ohe,columns = ['email','mobile','social','web']) cleaned_portfolio = portfolio -cleaned_portfolio = pd.concat([portfolio,oh],axis=1) \ No newline at end of file +cleaned_portfolio = pd.concat([portfolio,ohx],axis=1) From 053d3e8feedb05a20682bde054487be1d993d271 Mon Sep 17 00:00:00 2001 From: Nishil <63183230+Nishil07@users.noreply.github.com> Date: Fri, 28 Aug 2020 23:04:16 +0530 Subject: [PATCH 2/2] Update Notebook.py --- Notebook.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/Notebook.py b/Notebook.py index 20e9cb8..da0ee2d 100644 --- a/Notebook.py +++ b/Notebook.py @@ -19,3 +19,43 @@ cleaned_portfolio = portfolio cleaned_portfolio = pd.concat([portfolio,ohx],axis=1) + +# Data Cleaning of profile dataset + +# To check the number of NULL values in each column +# profile.isnull().sum() +''' +gender 2175 +age 0 +id 0 +became_member_on 0 +income 2175 +''' +# Checking the age column for the rows where gender and income are null +# shows that the corresponding age value is 118, which is implausible. +# To clean the data we therefore drop all such rows.
+
+# Drop the rows with missing gender/income (the age == 118 records above).
+cleaned_profile = profile.dropna()
+
+# One-hot encode gender and replace the original column with the indicators.
+ohe = pd.get_dummies(cleaned_profile['gender'])
+cleaned_profile = pd.concat([cleaned_profile, ohe], axis=1)
+cleaned_profile = cleaned_profile.drop(['gender'], axis=1)
+
+# became_member_on holds dates as YYYYMMDD integers. Parse it once and keep
+# the datetime64 dtype so the arithmetic below is vectorised.
+member_since = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y%m%d')
+
+# Fixed reference date ("today" when this analysis was written). It is kept
+# constant so days_of_membership is reproducible on every run.
+reference_date = pd.Timestamp('2020-08-28')
+
+# Days each customer has been a member as of the reference date. A scalar
+# Timestamp broadcasts over the column, so no helper column is required.
+tenure = reference_date - member_since
+cleaned_profile['days_of_membership'] = tenure.dt.days.astype(float)
+
+# Only the joining year is kept in became_member_on for further analysis.
+cleaned_profile['became_member_on'] = member_since.dt.year
+# (no temporary reference column is created, so nothing has to be dropped)