# configs/config_emily.yml

# Spark session settings.
# NOTE(review): how each key is translated into a Spark option is decided by
# the consuming code, which is not visible from this file.
spark:
  app_name: 'mm'
  files:
    # 67108864 bytes = 64 MiB (64 * 1024 * 1024).
    max_partition_bytes: 67108864
  driver:
    memory: '50g'
    max_result_size: '2g'
  loglevel: 'ERROR'


# Input and output locations, grouped by pipeline stage.
# NOTE(review): every entry under `file_names` starts with '/', which suggests
# it is appended to the stage's `data` directory - confirm against the loader.
path:
  # NOTE(review): absolute, user-specific directory; other users will need to
  # change it.
  wd: '/home/em/cider/'
  survey:
    data: '../synthetic_data'
    outputs: '../tests/outputs/survey'
    file_names:
      survey: '/survey.csv'
  featurizer:
    data: '../synthetic_data'
    outputs: '../tests/outputs/featurizer'
    file_names:
      cdr: '/cdr.csv'
      antennas: '/antennas.csv'
      recharges: '/recharges.csv'
      mobiledata: '/mobiledata.csv'
      mobilemoney: '/mobilemoney.csv'
      shapefiles:
        regions: '/regions.geojson'
        prefectures: '/prefectures.geojson'
  ml:
    # Unlike the other stages, this one uses absolute paths and has no
    # `data` / `file_names` entries.
    outputs: '/home/em/cideroutputs/machinelearning'
    features: '/data/togo_anon/feats/survey_combos/survey2020_cdr2020_cider.csv'
    # NOTE(review): the directory is named survey2018 but the file is named
    # survey2020 - confirm this is the intended labels file.
    labels: '/data/togo_anon/surveys/survey2018/survey2020_labels_cider.csv'
  home_location:
    data: '../synthetic_data'
    outputs: '../tests/outputs/homelocation'
    file_names:
      cdr: '/cdr.csv'
      antennas: '/antennas.csv'
      shapefiles:
        prefectures: '/prefectures.geojson'
      # Explicitly unset.
      poverty_scores: null
      groundtruth: '/home_locations.csv'
  # The two stages below go up two directory levels ('../../') where the
  # stages above go up one.
  # NOTE(review): presumably they are resolved from a deeper directory -
  # confirm.
  targeting:
    data: '../../synthetic_data'
    outputs: '../../tests/outputs/targeting'
    file_names:
      data: '/targeting.csv'
  fairness:
    data: '../../synthetic_data'
    outputs: '../../tests/outputs/fairness'
    file_names:
      data: '/fairness.csv'


# Column-name mappings, grouped by input dataset.
# In this configuration every value is identical to its key.
# NOTE(review): presumably the key is the name the pipeline expects and the
# value is the column name found in the input file - confirm against the
# consumer.
col_names:
  cdr:
    txn_type: 'txn_type'
    caller_id: 'caller_id'
    recipient_id: 'recipient_id'
    timestamp: 'timestamp'
    duration: 'duration'
    caller_antenna: 'caller_antenna'
    recipient_antenna: 'recipient_antenna'
    international: 'international'
  antennas:
    antenna_id: 'antenna_id'
    tower_id: 'tower_id'
    latitude: 'latitude'
    longitude: 'longitude'
  recharges:
    caller_id: 'caller_id'
    amount: 'amount'
    timestamp: 'timestamp'
  mobiledata:
    caller_id: 'caller_id'
    volume: 'volume'
    timestamp: 'timestamp'
  mobilemoney:
    txn_type: 'txn_type'
    caller_id: 'caller_id'
    recipient_id: 'recipient_id'
    timestamp: 'timestamp'
    amount: 'amount'
    sender_balance_before: 'sender_balance_before'
    sender_balance_after: 'sender_balance_after'
    recipient_balance_before: 'recipient_balance_before'
    recipient_balance_after: 'recipient_balance_after'

  # Same value as antennas.tower_id above.
  # NOTE(review): presumably selects the column used as the geographic unit -
  # confirm.
  geo: 'tower_id'

# Survey columns grouped by variable type: ten names per group, numbered 0-9.
col_types:
  survey:
    continuous:
      - 'con0'
      - 'con1'
      - 'con2'
      - 'con3'
      - 'con4'
      - 'con5'
      - 'con6'
      - 'con7'
      - 'con8'
      - 'con9'
    categorical:
      - 'cat0'
      - 'cat1'
      - 'cat2'
      - 'cat3'
      - 'cat4'
      - 'cat5'
      - 'cat6'
      - 'cat7'
      - 'cat8'
      - 'cat9'
    binary:
      - 'bin0'
      - 'bin1'
      - 'bin2'
      - 'bin3'
      - 'bin4'
      - 'bin5'
      - 'bin6'
      - 'bin7'
      - 'bin8'
      - 'bin9'

# Per-component parameters.
params:
  home_location:
    # Explicitly unset.
    filter_hours: null

  automl:
    autosklearn:
      # NOTE(review): presumably a time budget in seconds - confirm.
      time_left: 3600
      # Written as an explicit null; a bare `n_jobs:` loads as null too, but
      # reads as if a value had been forgotten.
      # NOTE(review): confirm that null is the intended setting.
      n_jobs: null
      # NOTE(review): presumably megabytes - confirm.
      memory_limit: 3072
    autogluon:
      # NOTE(review): presumably a time budget in seconds - confirm.
      time_limit: 3600
      eval_metric: 'r2'
      label: 'label'
      sample_weight: 'weight'

# Lists of hyperparameter values, grouped by model name.
# NOTE(review): presumably each list is a set of candidates for a
# hyperparameter search, and the `<step>__<param>` key form follows
# scikit-learn's pipeline parameter naming - confirm against the consumer.
# List order and int/float types are kept exactly as given.
hyperparams:
  linear:
    dropmissing__threshold: [0.9, 1]
    droplowvariance__threshold: [0, 0.01]
    winsorizer__limits: [[0.0, 1.0], [0.005, 0.995]]
  lasso:
    dropmissing__threshold: [0.9, 1]
    droplowvariance__threshold: [0, 0.01]
    winsorizer__limits: [[0.0, 1.0], [0.005, 0.995]]
    model__alpha: [0.001, 0.01, 0.05, 0.03, 0.1]
  ridge:
    dropmissing__threshold: [0.9, 1]
    droplowvariance__threshold: [0, 0.01]
    winsorizer__limits: [[0.0, 1.0], [0.005, 0.995]]
    model__alpha: [0.001, 0.01, 0.05, 0.03, 0.1]
  randomforest:
    dropmissing__threshold: [0.9, 1]
    droplowvariance__threshold: [0, 0.01]
    winsorizer__limits: [[0.0, 1.0], [0.005, 0.995]]
    model__max_depth: [2, 4, 6, 8, 10]
    model__n_estimators: [50, 100, 200]
  gradientboosting:
    # This model has a single value for each of the two thresholds below,
    # where the other models list two.
    dropmissing__threshold: [0.99]
    droplowvariance__threshold: [0.01]
    winsorizer__limits: [[0.0, 1.0], [0.005, 0.995]]
    model__min_data_in_leaf: [10, 20, 50]
    model__num_leaves: [5, 10, 20]
    model__learning_rate: [0.05, 0.075, 0.1]
    model__n_estimators: [50, 100, 200]