-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample-imputation.py
45 lines (30 loc) · 1.43 KB
/
example-imputation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from open_end_to_end_ml.tasks import ArticlesToInvestigateTask
from open_end_to_end_ml.baselines import NoDataAugmentation, \
ArticlesToInvestigateTextualFeatureAdder, ArticlesToInvestigateLogisticRegressionTrainer
from open_end_to_end_ml.steps import DataCleaning
class ModeImputation(DataCleaning):
    """Cleaning step that replaces empty strings with each column's mode.

    The most frequent non-empty value of every configured column is learned
    when ``clean`` is called with ``is_train=True`` and is reused on every
    later call, so non-training data is filled with the training values.
    """

    def __init__(self, columns):
        """Remember which columns should be imputed.

        Args:
            columns: List of column names whose empty strings are filled.
        """
        self.columns = columns
        # Maps column name -> fill value learned from the training data.
        self.most_frequent_values_per_column = {}

    def __find_most_frequent_values(self, data):
        """Learn the most frequent non-empty value of each configured column.

        Args:
            data: Tabular data (used like a pandas DataFrame) that contains
                every column listed in ``self.columns``.
        """
        for column in self.columns:
            non_empty_values = data.loc[data[column] != '', column]
            modes = non_empty_values.mode()
            # A column with no non-empty values has no mode. Fall back to ''
            # so the later replacement becomes a harmless no-op instead of
            # failing on the first-element lookup.
            self.most_frequent_values_per_column[column] = (
                modes.iloc[0] if not modes.empty else ''
            )

    def clean(self, data, random_state, is_train):
        """Return a copy of ``data`` with empty strings replaced by the mode.

        Args:
            data: Tabular data (used like a pandas DataFrame) to clean. It is
                not modified; a deep copy is returned.
            random_state: Not used by this step (presumably part of the
                ``DataCleaning`` call signature; confirm against the base
                class).
            is_train: When true, the fill values are (re)learned from
                ``data`` before imputing.

        Returns:
            A deep copy of ``data`` in which every empty string in the
            configured columns is replaced by the learned fill value.

        Raises:
            KeyError: If called with ``is_train`` false before any call with
                ``is_train`` true, because no fill values are stored yet.
        """
        if is_train:
            self.__find_most_frequent_values(data)

        imputed_data = data.copy(deep=True)
        for column in self.columns:
            value_to_impute = self.most_frequent_values_per_column[column]
            # Assign the result back instead of calling replace(inplace=True)
            # on the column selection: the chained in-place form operates on
            # a temporary object under pandas Copy-on-Write and would leave
            # ``imputed_data`` untouched.
            imputed_data[column] = imputed_data[column].replace('', value_to_impute)
        return imputed_data
# Columns whose empty strings are filled with the most frequent value.
columns_to_impute = ['ActionGeo_Fullname', 'Actor1Name', 'Actor2Name', 'site_name']
imputer = ModeImputation(columns_to_impute)

# Build the remaining pipeline steps; only the cleaning step is customised.
augmentation_step = NoDataAugmentation()
feature_adder_step = ArticlesToInvestigateTextualFeatureAdder()
model_trainer_step = ArticlesToInvestigateLogisticRegressionTrainer()

experiment = ArticlesToInvestigateTask(
    random_state=42,
    run_name='mode_imputation',
    augmentation=augmentation_step,
    cleaning=imputer,
    feature_adder=feature_adder_step,
    model_trainer=model_trainer_step,
)

# Run the experiment with the steps configured above.
experiment.run()