Skip to content

Commit

Permalink
Merge pull request #15 from Nishil07/master
Browse files Browse the repository at this point in the history
Update Notebook.py
  • Loading branch information
Nishil07 authored Aug 28, 2020
2 parents 857af81 + 053d3e8 commit 5d67472
Showing 1 changed file with 40 additions and 0 deletions.
40 changes: 40 additions & 0 deletions Notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,43 @@

cleaned_portfolio = portfolio
cleaned_portfolio = pd.concat([portfolio,ohx],axis=1)

# Data Cleaning of profile dataset

# To check the number of NULL values in each column
# profile.isnull().sum()
'''
gender 2175
age 0
id 0
became_member_on 0
income 2175
'''
# Also on checking the age column against all the pts having gender and income
# as Null we find that the corresponding age value is 118 which is quite
# unusual. So in order to cleanse the data we drop all such points.

# Dropping NULL values
cleaned_profile = profile
cleaned_profile = cleaned_profile.dropna()

# OneHotEncoding the gender column
ohe = pd.get_dummies(cleaned_profile['gender'])
cleaned_profile = pd.concat([cleaned_profile,ohe],axis=1)
cleaned_profile = cleaned_profile.drop(['gender'],axis=1)

# To convert the became_member_on to date-time stamp because the machine will not
# understand data corresponding to date in integer form.
cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y%m%d').dt.date

# We added a column today's date in the dataframe for refereence to calculate the no of days the customer has been a member of Starbucks
cleaned_profile['today_date'] = pd.to_datetime('20200828',format='%Y%m%d')
cleaned_profile['today_date'] = pd.to_datetime(cleaned_profile['today_date'],format='%Y%m%d').dt.date
cleaned_profile['days_of_membership'] = cleaned_profile['today_date'].sub(cleaned_profile['became_member_on'], axis=0)

# Taking a ratio of the subtracted dates to convert it into no.of.days
cleaned_profile['days_of_membership'] = cleaned_profile['days_of_membership'] / np.timedelta64(1, 'D')
cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y-%m-%d').dt.year

# Then we drop the reference column because it is not useful to us further analysis
cleaned_profile = cleaned_profile.drop(['today_date'],axis=1)

0 comments on commit 5d67472

Please sign in to comment.