-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleaner.py
executable file
·158 lines (110 loc) · 4.06 KB
/
cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 29 14:18:55 2017
filename: cleaner.py
description: This file cleans the data set and makes it usable for analysis
@author: Timo
"""
import query
import variables
import credentials
import numpy as np
import pandas as pd
import time
from datetime import datetime
from dateutil.relativedelta import relativedelta
from genderize import Genderize
"""
Global Cleaning
"""
def clean_dataset(df):
    """
    Apply the global cleaning steps to the raw data frame.

    Steps, in this order:
      1. replace "Last Name" and "First Name" by their hash() value
      2. keep the first four column labels and reduce every later label
         to its second "."-separated piece
      3. turn '-' entries into NaN
      4. rename columns with the reversed query.query_dict (its values
         are the current labels, its keys become the new labels)
      5. recode every non-date column that has a mapping in
         variables.var_dict (skipped for columns that are entirely null)
      6. parse every column whose var_dict key contains "Date" as datetime

    df: pandas DataFrame; it is modified in place.
    returns: the same DataFrame object, cleaned.
    """

    def rev_dict(dictionary):
        """Return a new dict with keys and values of *dictionary* swapped."""
        return {value: key for key, value in dictionary.items()}

    def extract_colnames(dates=False):
        """Return the var_dict keys that do (dates) / do not contain "Date"."""
        feature_names = list(variables.var_dict)
        if dates:
            return [s for s in feature_names if "Date" in s]
        return [s for s in feature_names if "Date" not in s]

    def extract_recodings():
        """Return (column, mapping) pairs for non-date columns with a mapping."""
        # Name and mapping are taken from the same var_dict entry.  Filtering
        # names and mappings separately and zipping them by position pairs a
        # column with another column's mapping as soon as a non-date key has
        # no mapping (None) or a date key has one.
        return [
            (name, variables.var_dict[name])
            for name in extract_colnames()
            if variables.var_dict[name] is not None
        ]

    def feature_cleaner(df, recodings):
        """Recode each listed column with its value mapping."""
        for column, mapping in recodings:
            # Columns without a single non-null value are left untouched.
            if not df[column].isnull().all():
                # Assign the result back: an in-place replace() on the
                # df[column] selection does not reliably reach df itself
                # (it never does with pandas Copy-on-Write).
                df[column] = df[column].replace(mapping)
        return df

    def date_cleaner(df, names):
        """Parse each listed column as datetime."""
        for element in names:
            df[str(element)] = pd.to_datetime(df[str(element)])
        return df

    # NOTE(review): hash() of a str is salted per interpreter process unless
    # PYTHONHASHSEED is fixed, so these values differ between runs.
    df["Last Name"] = df["Last Name"].apply(hash)
    df["First Name"] = df["First Name"].apply(hash)

    # The first four labels are kept as they are; from the fifth column on
    # only the second "."-separated piece of the label is used.
    df.columns = df.columns[:4].append(df.columns[4:].str.split(".").str[1])
    df.replace('-', np.nan, inplace=True)
    df.rename(columns=rev_dict(query.query_dict), inplace=True)

    df = feature_cleaner(df, extract_recodings())
    df = date_cleaner(df, extract_colnames(dates=True))
    return df
def run_local_cleaner(df, col, gender=False):
    """
    Apply the column-specific cleaning steps selected by *col*.

    Each name in *col* triggers one step; names are processed in the order
    given and unknown names are ignored:
      "First Name"    - only if *gender* is true: add a "Gender" column
                        with the result of the Genderize lookup
      "Study ID"      - keep rows whose ID contains "LZR" followed by
                        digits (rows with a missing ID are dropped)
      "Date of Birth" - replace the birth dates by the age in completed
                        years and rename the column to "Age"
      "Radiologist"   - drop rows whose radiologist is "Dr. Nemo"
      "Intake", "CT Evaluation" - parse the column as datetime

    df:     pandas DataFrame; steps before the first row filter write
            into the passed object.
    col:    iterable of column names selecting the steps.
    gender: enable the Genderize lookup for "First Name".
    returns: the cleaned DataFrame (a new object once rows were filtered).
    """

    def age_calculator(df, column):
        """Return a list with the age in years for each entry of *column*."""
        now = datetime.now()
        today = datetime(now.year, now.month, now.day)  # local midnight
        age_list = []
        for element in df[column]:
            # Public pd.Timestamp + isinstance instead of an identity check
            # against the private pd._libs.tslib path, which is not part of
            # the pandas API.  NaT and other non-timestamp entries are kept
            # unchanged.
            if isinstance(element, pd.Timestamp):
                age_list.append(relativedelta(today, element).years)
            else:
                age_list.append(element)
        return age_list

    def date_cleaner(df, names):
        """Parse the column(s) in *names* as datetime, writing into *df*."""
        # A bare string is a single column name; iterating over it would
        # look up its individual characters as columns.
        if isinstance(names, str):
            names = [names]
        for element in names:
            df[str(element)] = pd.to_datetime(df[str(element)])

    def get_gender(df, column):
        """Return the 'gender' entry of the Genderize result for each name."""
        name_list = list(df[column])
        client = Genderize()
        gender_list = []
        # Names are sent in batches of 10 per request.
        for start in range(0, len(name_list), 10):
            for entry in client.get(name_list[start:start + 10]):
                gender_list.append(entry['gender'])
        return gender_list

    for name in col:
        if gender and name == "First Name":
            df["Gender"] = pd.Series(get_gender(df, name), index=df.index, name="Gender")
        elif name == "Study ID":
            # na=False: a missing ID counts as "no match" instead of putting
            # NaN into the boolean mask.  copy() so that later steps assign
            # into an independent frame.
            df = df[df['Study ID'].str.contains("[L][Z][R][0-9]+", na=False)].copy()
        elif name == "Date of Birth":
            # Column assignment; df.loc[name] would address a row label.
            df[name] = pd.Series(age_calculator(df, name), index=df.index)
            df = df.rename(columns={"Date of Birth": "Age"})
        elif name == "Radiologist":
            # Original author's note: index at 214
            df = df[df[name] != "Dr. Nemo"].copy()
        elif name == "Intake":
            date_cleaner(df, name)
        elif name == "CT Evaluation":
            date_cleaner(df, name)
    return df
# Script section: these statements run whenever the module is executed or
# imported.
# Read the raw data; credentials.csv is passed to pd.read_csv unchanged
# (presumably a path to the CSV export - confirm in credentials.py).
df = pd.read_csv(credentials.csv)
# Global cleaning: name hashing, column renaming, recoding, date parsing.
df = clean_dataset(df)
# Local cleaning steps to run, processed in this order.
collist = ["First Name", "Study ID", "Date of Birth", "Radiologist"]
# gender = False: no Genderize lookup is made for "First Name".
df = run_local_cleaner(df, collist, gender = False)