-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwrangle.py
51 lines (45 loc) · 2 KB
/
wrangle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import numpy as np
import pandas as pd
import re
def acquire_data():
'''
This functions acquire both datasets needed for this project via .csv files.
'''
df = pd.read_csv('bachelorette-contestants.csv')
join = pd.read_csv('bachelorette_edit-to-join.csv')
return df, join
def join_dfs(df, join):
'''
This functions joins two Bachelorette datasets together,
first by making the contestant names compatiable to join on.
'''
# Get last initials
df['last_initial'] = df['Name'].str.extract(r'( \w{1})')
df.last_initial = df.last_initial.fillna('X')
# Truncate last names / get first names
df['first_name'] = df.Name.replace(to_replace = '( )\w+.', value = '', regex = True)
# concat newly formatted first and last names with a '_' in between
df['CONTESTANT'] = df['first_name'].str.upper() + '_' + df['last_initial'].str.upper()
# take out white spaces
df.CONTESTANT = df.CONTESTANT.str.replace(" ", "")
# add season number to front of each contestant name
df['CONTESTANT'] = df['Season'].map(str) + '_' + df['CONTESTANT']
# I need to take the '0's out of join.CONTESTANT to make both dataframes match before I join them.
# replace the matching strings
join.CONTESTANT = join.CONTESTANT.replace(to_replace ='^0', value = '', regex = True)
# now join the two dataframes together
new_df = df.join(join.set_index('CONTESTANT'), on='CONTESTANT', how='inner')
# drop unneeded features
new_df = new_df.drop(columns=['last_initial', 'first_name', 'SEASON'])
# set the index
new_df = new_df.set_index('CONTESTANT')
return new_df
def drop_extra_cols(df):
'''
This function drops unneeded columns/features in
a Bachelorette data df, and returns a df.
'''
df = df.drop(columns=['ELIMINATION-1', 'ELIMINATION-2', 'ELIMINATION-3', 'ELIMINATION-4',
'ELIMINATION-5', 'ELIMINATION-6', 'ELIMINATION-7', 'ELIMINATION-8',
'ELIMINATION-9', 'ELIMINATION-10', 'Occupation', 'Name', 'DATES-1'])
return df