-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_processing.py
100 lines (79 loc) · 2.65 KB
/
data_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import argparse
import numpy as np
from sklearn.model_selection import train_test_split
from utils import read_excel_files
def handle_missing_values(df):
    """Drop the unused columns and every incomplete row from the data.

    The 'area' and 'other' columns are removed first, so a missing value in
    either of them does not cause a row to be discarded. Afterwards every row
    that still contains a missing value in a remaining column is removed.
    The frame passed in is not modified.

    Parameters
    ----------
    df : pandas DataFrame
        Data to clean; must contain the columns 'area' and 'other'.

    Returns
    -------
    pandas DataFrame
        New frame without the two columns and without incomplete rows.
    """
    without_unused = df.drop(columns=["area", "other"])
    return without_unused.dropna()
def split_x_y(df_woz):
    """Separate the 'average woz' label column from the feature columns.

    Parameters
    ----------
    df_woz : pandas DataFrame
        Data containing an 'average woz' column; every other column is
        treated as a feature.

    Returns
    -------
    tuple of numpy arrays
        ``(x, y)``: ``x`` holds all columns except 'average woz' in their
        original order, ``y`` holds the 'average woz' values.
    """
    label_column = "average woz"
    features = df_woz.drop(columns=label_column).to_numpy()
    labels = df_woz[label_column].to_numpy()
    return features, labels
def normalize(x):
    """Convert the composition counts to percentages of each row's total.

    The first column (treated as the average m2) and the last column (treated
    as the total count) are passed through unchanged. Every column in between
    is divided by the row's total count and multiplied by 100.

    Parameters
    ----------
    x : numpy ndarray of shape (data points, features)
        Column 0 is kept as is, the last column holds the totals, and the
        columns in between hold the counts to convert. At least two columns
        are required.

    Returns
    -------
    numpy ndarray of shape (data points, features)
        Same column layout as ``x`` with the middle columns expressed as
        percentages. ``x`` itself is not modified.

    Raises
    ------
    ValueError
        If ``x`` has fewer than two columns.

    Notes
    -----
    Rows whose total count is 0 are not special-cased; the division follows
    NumPy semantics and yields ``inf`` or ``nan`` for those rows.
    """
    if x.shape[1] < 2:
        raise ValueError(
            "x needs at least two columns (average m2 and total count), "
            f"got {x.shape[1]}"
        )

    # Keep the slices two-dimensional so they broadcast and stack directly.
    average_m2 = x[:, :1]
    composition_counts = x[:, 1:-1]
    total_counts = x[:, -1:]

    # (n, k) / (n, 1) broadcasts the row total over every composition column;
    # this also works when there are no composition columns (k == 0).
    composition_percentages = composition_counts / total_counts * 100

    return np.hstack((average_m2, composition_percentages, total_counts))
def main(test_size=0.3):
    """Build the feature/label arrays from the excel data and save them.

    Steps: read the data, drop unused columns and incomplete rows, add an
    'average m2' column ('average woz' divided by 'woz per m2') and drop
    'woz per m2', split into features and labels, convert the composition
    counts to percentages, split into train and test sets, and write all
    arrays as .npy files under ``data/``.

    Parameters
    ----------
    test_size : float, optional
        Forwarded to ``train_test_split`` as its ``test_size`` argument.
        Defaults to 0.3, the same default as the ``-t`` command line option.

    Notes
    -----
    The files are written to the relative directory ``data/``; ``np.save``
    does not create that directory, so it has to exist beforehand.
    """
    df_woz = read_excel_files()
    df_woz = handle_missing_values(df_woz)

    # Derive the average surface area, then drop the column it was derived
    # from so it does not end up among the features.
    average_m2 = df_woz["average woz"].divide(df_woz["woz per m2"])
    df_woz.insert(2, "average m2", average_m2)
    df_woz.drop("woz per m2", axis=1, inplace=True)

    x, y = split_x_y(df_woz)
    x = normalize(x)

    # The split size is an explicit argument rather than a module-level
    # global, so main() also works when the module is imported.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size
    )

    np.save("data/x_all.npy", x)
    np.save("data/y_all.npy", y)
    np.save("data/x_train.npy", x_train)
    np.save("data/x_test.npy", x_test)
    np.save("data/y_train.npy", y_train)
    np.save("data/y_test.npy", y_test)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        type=float,
        default=0.3,
        help="percentage of data used for test, default = 0.3",
    )
    args = parser.parse_args()
    main(test_size=args.t)