-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathconfig_anonymize.yml
130 lines (120 loc) · 3.82 KB
/
config_anonymize.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Spark session settings. The nested keys under `spark` spell out dotted
# Spark property names (spark.app.name, spark.sql.files.maxPartitionBytes,
# spark.driver.memory, spark.driver.maxResultSize).
spark:
  app:
    name: "mm"
  sql:
    files:
      # 67108864 bytes = 64 MiB
      maxPartitionBytes: 67108864
  driver:
    memory: "8g"
    maxResultSize: "2g"
  # NOTE(review): `loglevel` is not a Spark property, so it is placed directly
  # under `spark` rather than under `driver` - confirm against the consumer.
  loglevel: "ERROR"
# Locations of input files and of the working (output) directory.
path:
  input_data:
    # TODO: replace with absolute paths, here and under `working`
    directory_path: "synthetic_data"
    # NOTE(review): file names are presumably resolved relative to
    # `directory_path` above - confirm against the loader.
    file_paths:
      antennas: 'antennas.csv'
      cdr: 'cdr.csv'
      fairness: 'fairness.csv'
      home_ground_truth: 'home_locations.csv'
      labels: 'labels.csv'
      mobiledata: 'mobiledata.csv'
      mobilemoney: 'mobilemoney.csv'
      population: 'population_tgo_2019-07-01.tif'
      poverty_scores: null
      recharges: 'recharges.csv'
      rwi: 'TGO_relative_wealth_index.csv'
      shapefiles:
        regions: 'regions.geojson'
        cantons: 'cantons.geojson'
        prefectures: 'prefectures.geojson'
      survey: 'survey.csv'
      targeting: 'targeting.csv'
      user_consent: null
      features: 'features.csv'
      # This file can contain any string - it will function as a hash salt
      # for anonymization.
      anonymization_salt: 'salt.txt'
  working:
    directory_path: "walkthrough_outputs/"
# Column-name mappings, one sub-mapping per input dataset. In this file every
# key maps to an identically named column.
col_names:
  cdr:
    txn_type: "txn_type"
    caller_id: "caller_id"
    recipient_id: "recipient_id"
    timestamp: "timestamp"
    duration: "duration"
    caller_antenna: "caller_antenna"
    recipient_antenna: "recipient_antenna"
    international: "international"
  antennas:
    antenna_id: "antenna_id"
    tower_id: "tower_id"
    latitude: "latitude"
    longitude: "longitude"
  recharges:
    caller_id: "caller_id"
    amount: "amount"
    timestamp: "timestamp"
  mobiledata:
    caller_id: "caller_id"
    volume: "volume"
    timestamp: "timestamp"
  mobilemoney:
    txn_type: "txn_type"
    caller_id: "caller_id"
    recipient_id: "recipient_id"
    timestamp: "timestamp"
    amount: "amount"
    sender_balance_before: "sender_balance_before"
    sender_balance_after: "sender_balance_after"
    recipient_balance_before: "recipient_balance_before"
    recipient_balance_after: "recipient_balance_after"
  # NOTE(review): placed as a sibling of the dataset mappings; the value
  # matches the `cantons` shapefile key under path.input_data.file_paths -
  # confirm the intended nesting against the consumer.
  geo: 'cantons'
# Survey columns grouped by variable type.
col_types:
  survey:
    continuous: ["con0", "con1", "con2", "con3", "con4", "con5", "con6", "con7", "con8", "con9"]
    categorical: ["cat0", "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9"]
    binary: ["bin0", "bin1", "bin2", "bin3", "bin4", "bin5", "bin6", "bin7", "bin8", "bin9"]
# Processing parameters.
params:
  cdr:
    # NOTE(review): presumably day-of-week numbers treated as weekend
    # (e.g. 1 = Sunday, 7 = Saturday in Spark's `dayofweek`) - confirm.
    weekend: [1, 7]
    # NOTE(review): presumably hours of the day bounding "daytime" - confirm.
    start_of_day: 7
    end_of_day: 19
  home_location:
    filter_hours: null
  automl:
    autogluon:
      # NOTE(review): presumably seconds - confirm against the AutoGluon call.
      time_limit: 300
      eval_metric: 'r2'
      # NOTE(review): `label` and `sample_weight` are grouped with the
      # AutoGluon settings here - confirm the intended nesting.
      label: 'label'
      sample_weight: 'weight'
  # NOTE(review): placed directly under `params` - confirm the intended
  # nesting against the consumer.
  opt_in_default: false
# Hyperparameter grids, one mapping per model; each key maps to the list of
# candidate values for that setting.
# NOTE(review): keys look like scikit-learn pipeline `<step>__<param>` names -
# confirm against the consumer.
# NOTE(review): the `!!python/tuple` tags are PyYAML-specific and are rejected
# by `yaml.safe_load`; this file must be read with a loader that resolves
# Python tags, and must therefore be treated as trusted input.
hyperparams:
  'linear':
    'dropmissing__threshold': [0.9, 1]
    'droplowvariance__threshold': [0, 0.01]
    'winsorizer__limits': [!!python/tuple [0.0, 1.0], !!python/tuple [0.005, 0.995]]
  'lasso':
    'dropmissing__threshold': [0.9, 1]
    'droplowvariance__threshold': [0, 0.01]
    'winsorizer__limits': [!!python/tuple [0.0, 1.0], !!python/tuple [0.005, 0.995]]
    'model__alpha': [0.001, 0.01, 0.05, 0.03, 0.1]
  'ridge':
    'dropmissing__threshold': [0.9, 1]
    'droplowvariance__threshold': [0, 0.01]
    'winsorizer__limits': [!!python/tuple [0.0, 1.0], !!python/tuple [0.005, 0.995]]
    'model__alpha': [0.001, 0.01, 0.05, 0.03, 0.1]
  'randomforest':
    'dropmissing__threshold': [0.9, 1]
    'droplowvariance__threshold': [0, 0.01]
    'winsorizer__limits': [!!python/tuple [0.0, 1.0], !!python/tuple [0.005, 0.995]]
    'model__max_depth': [2, 4, 6, 8, 10]
    'model__n_estimators': [50, 100, 200]
  'gradientboosting':
    'dropmissing__threshold': [0.99]
    'droplowvariance__threshold': [0.01]
    'winsorizer__limits': [!!python/tuple [0.0, 1.0], !!python/tuple [0.005, 0.995]]
    'model__min_data_in_leaf': [10, 20, 50]
    'model__num_leaves': [5, 10, 20]
    'model__learning_rate': [0.05, 0.075, 0.1]
    'model__n_estimators': [50, 100, 200]