-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_data.py
50 lines (32 loc) · 911 Bytes
/
preprocess_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/bin/python
import pandas as pd
import os
import numpy as np
# Directory (relative to the current working directory) that is scanned
# for input workbooks.
data_dir = "Data"

# Path of every entry directly inside data_dir whose name ends in ".xlsx".
data_paths = [
    os.path.join(data_dir, name)
    for name in filter(lambda n: n.endswith(".xlsx"), os.listdir(data_dir))
]
def fix_age(data):
    """Replace the "Age.Bracket" column with a numeric "~Age" column.

    The column is renamed and every cell is converted to one number:

    * a string of "-"-separated integers (e.g. "20-29") becomes the mean
      of those integers, rounded up;
    * a cell that is already numeric is rounded up as-is;
    * a missing cell (None / NaN) stays NaN.

    Args:
        data: DataFrame containing an "Age.Bracket" column.

    Returns:
        A new DataFrame (the input is not modified) in which
        "Age.Bracket" has been replaced by "~Age".

    Raises:
        ValueError: if a string cell contains a part that is not an integer.
    """
    old_colname = "Age.Bracket"
    new_colname = "~Age"
    # rename() returns a new frame, so the caller's frame is left untouched.
    data = data.rename(columns={old_colname: new_colname})

    def fix_age_cell(cell):
        """Convert a single age cell to a float."""
        # Blank spreadsheet cells arrive as NaN/None; they have no .split().
        if pd.isna(cell):
            return np.nan
        # A lone age may be read as a number instead of text.
        if not isinstance(cell, str):
            return np.ceil(float(cell))
        bounds = [int(part) for part in cell.split("-")]
        return np.ceil(np.mean(bounds))

    data[new_colname] = data[new_colname].apply(fix_age_cell)
    return data
def fix_sex(data):
    """Encode the "Sex" column as numbers: "male" -> 1, "female" -> 0.

    String labels are matched ignoring case and surrounding whitespace.
    Missing cells (None / NaN) are kept as NaN.

    Args:
        data: DataFrame containing a "Sex" column. The column is
            overwritten on this same object.

    Returns:
        The same DataFrame, with "Sex" encoded.

    Raises:
        KeyError: if a non-missing cell is neither "male" nor "female".
    """
    sex_map = {
        "male": 1,
        "female": 0,
    }

    def encode(value):
        """Translate one cell to its numeric code."""
        # Blank spreadsheet cells arrive as NaN/None; keep them missing
        # rather than failing the whole file.
        if pd.isna(value):
            return np.nan
        key = value.strip().lower() if isinstance(value, str) else value
        try:
            return sex_map[key]
        except KeyError:
            raise KeyError(f"unrecognised Sex value: {value!r}") from None

    data["Sex"] = data["Sex"].apply(encode)
    return data
# Convert every workbook listed in data_paths to a cleaned CSV file.
for file_path in data_paths:
    # read_excel does not define an ``index`` keyword (that option belongs
    # to to_csv below); current pandas releases raise TypeError if it is
    # passed, so the workbook is read with default options.
    data = pd.read_excel(file_path)
    # Same directory and base name as the workbook, with a .csv extension.
    output_path = os.path.splitext(file_path)[0] + ".csv"
    data = fix_age(data)
    data = fix_sex(data)
    # index=False keeps the DataFrame's row index out of the CSV.
    data.to_csv(output_path, index=False)