# guild.yml

# Shared configuration; the models below that declare `extends: meta-model`
# reuse the resources defined here.
- config: meta-model
  resources:
    # Dependency on the `prepare-data` operation. `select` is the pattern
    # used to pick files from that operation's run.
    op-prepare:
      - name: prepared-data
        operation: prepare-data
        select: '.*.pkl'

- operations:
    # Data-preparation operation backed by the `trademl.modeling.prepare`
    # module. Each flag below declares its description, type, default and
    # allowed range/choices where known.
    prepare-data:
      main: trademl.modeling.prepare
      sourcecode:
        - include: '*.py'
      flags-import: all
      needed: false
      flags:
        contract:
          type: string
          default: 'SPY_IB'
        input_path:
          description: Path to read data from.
          arg_name: input_path
          type: string
          default: 'D:/market_data/usa/ohlcv_features'
        structural_break_regime:
          description: Shoud we use structural breaks and if yes, which one
          arg_name: structural_break_regime
          type: string
          default: 'all'
          choices: ['all', 'chow']
        stationarity:
          description: Use stationary columns or not. If yes which type of stationarity
          arg_name: stationarity
          type: string
          default: 'fracdiff'
          choices: ['orig', 'fracdiff']
        label_tuning:
          description: Should we tune labeling parameters or import existing labels
          arg_name: label_tuning
          default: 1
          # Key must be spelled `choices` for the allowed values to be declared.
          choices: [0, 1]
        label:
          description: If label_tuning parameter is True, choose label.
          arg_name: label
          type: string
          default: 'day_5'
          choices: ['day_1', 'day_2', 'day_5', 'day_10', 'day_20', 'day_30', 'day_60']
        labeling_technique:
          description: Labeling technique to use for classification
          arg_name: labeling_technique
          type: string
          default: 'tb'
          choices: ['tb', 'ts']
        # Flags prefixed `ts_` / `tb_` presumably apply to the 'ts' and 'tb'
        # labeling techniques respectively - confirm against the module.
        ts_look_forward_window:
          description: How much time units into the future to look when looking for trend
          arg_name: ts_look_forward_window
          min: 1
          max: 1000000
        ts_min_sample_length:
          description: Minimum sample length used to fit regression
          arg_name: ts_min_sample_length
          default: 30
        ts_step:
          description: Optimal t-value index is searched every ‘step’ indices
          arg_name: ts_step
          default: 5
        tb_triplebar_num_days:
          description: Number of days for defining vertical bars
          arg_name: tb_triplebar_num_days
          min: 0
          max: 1000
        tb_triplebar_min_ret:
          description: minimal return for barrier to be used in the sample
          arg_name: tb_triplebar_min_ret
          min: 0.001
          max: 0.1
        tb_min_pct:
          description: A fraction used to decide if the observation occurs less than that fraction.
          arg_name: tb_min_pct
          default: 0.05
        tb_volatility_lookback:
          description: Number of days in the past for calculating daily volatility
          arg_name: tb_volatility_lookback
          default: 100
          min: 1
          max: 1000000
        tb_volatility_scaler:
          arg_name: tb_volatility_scaler
          description: Scalar which multiplies daily_vol to make sampling more/less frequent
          default: 1
          min: 0.1
          max: 3.0
        correlation_threshold:
          description: Threshold for removing highly correlated columns
          arg_name: correlation_threshold
          min: 0.5
          max: 0.99
          default: 0.98
        dim_reduction:
          description: Choose dim reduction technique
          arg_name: dim_reduction
          type: string
          default: 'none'
          choices: ['none', 'pca', 'gplearn']
        scaling:
          # Description and arg_name refer to this flag itself; they were
          # previously copied from `stationarity` / `structural_break_regime`.
          description: Choose scaling technique
          arg_name: scaling
          type: string
          default: 'none'
          choices: ['none', 'expanding']
        num_threads:
          arg_name: num_threads
          description: Number of threads to use in mlfinlab multhithread function
          min: 1
          max: 32
    # Two-step pipeline: run `prepare-data`, then `random-forest:train`,
    # forwarding the pipeline flags below to the corresponding step flags.
    pipeline-rf:
      sourcecode:
        - include: '*.py'
      flags:
        data-label_tuning:
          default: 1
        data-label:
          default: 'day_5'
        data-dim_reduction:
          default: 'none'
        # NOTE(review): not referenced by any step below - confirm whether it
        # should be forwarded (e.g. to ts_look_forward_window) or removed.
        data-lookforward:
          default: 240
        data-tb_volatility_lookback:
          default: 100
        data-tb_volatility_scaler:
          default: 1
        # NOTE(review): not referenced by any step below - confirm whether it
        # should be forwarded as tb_triplebar_num_days or removed.
        data-tb_triplebar_num_days:
          default: 8
        data-correlation_threshold:
          default: 0.98
        data-scaling:
          default: 'none'
        random-forest-depth:
          default: 4
        random-forest-maxf:
          default: 10
        random-n_estimators:
          default: 350
        random-min_weight_fraction_leaf:
          default: 0.1
        random-class_weight:
          default: 'balanced_subsample'
      steps:
        - run: >
            prepare-data
            label_tuning=${data-label_tuning}
            label=${data-label}
            tb_volatility_lookback=${data-tb_volatility_lookback}
            tb_volatility_scaler=${data-tb_volatility_scaler}
            correlation_threshold=${data-correlation_threshold}
            dim_reduction=${data-dim_reduction}
            scaling=${data-scaling}
          isolate-runs: false
          needed: false
        # n_estimators is passed once (it was duplicated), and the declared
        # random-class_weight flag is now forwarded to the trainer.
        - run: >
            random-forest:train
            max_depth=${random-forest-depth}
            max_features=${random-forest-maxf}
            n_estimators=${random-n_estimators}
            min_weight_fraction_leaf=${random-min_weight_fraction_leaf}
            class_weight=${random-class_weight}
          isolate-runs: false
          needed: false
    # Search pipeline for the random forest: both steps are given lists of
    # candidate flag values, and the training step sets `max-trials: 32`.
    pipeline-rf-opt:
      sourcecode:
        - include: "*.py"
      # Pipeline-level flags are disabled for now; kept for reference.
      # flags:
        # data-label_tuning:
        #   default: 1
        # data-label:
        #   default: 'day_5'
        # data-stationarity:
        #   default: 'fracdiff'
        # data-dim_reduction:
        #   default: 'none'
        # data-lookforward:
        #   default: 240
        # data-tb_volatility_lookback:
        #   default: 100
        # data-tb_volatility_scaler:
        #   default: 1
        # data-tb_triplebar_num_days:
        #   default: 8
        # data-correlation_threshold:
        #   default: 0.98
        # data-scaling:
        #   default: 'none'
        # random-class_weight:
        #   default: 'balanced_subsample'
      steps:
        # Disabled alternative for the first step, driven by pipeline flags:
        # - run: >
        #     prepare-data label_tuning=${data-label_tuning}
        #     label=${data-label}
        #     tb_volatility_lookback=${data-tb_volatility_lookback}
        #     tb_volatility_scaler=${data-tb_volatility_scaler}
        #     correlation_threshold=${data-correlation_threshold}
        #     dim_reduction=${data-dim_reduction}
        #     scaling=${data-scaling}
        - run: prepare-data
          isolate-runs: false
          needed: false
          flags:
            scaling: ['none', 'expanding']
        - run: random-forest:train
          max-trials: 32
          isolate-runs: false
          needed: false
          flags:
            class_weight: ['balanced', 'balanced_subsample']
            max_depth: [2, 3, 4, 5, 6, 7, 10]
            n_estimators: [250, 500, 750, 1000]
            # Candidate search dimensions, currently disabled:
            # max_features: [1,5,10,20,50]
            # min_weight_fraction_leaf: range[0.0:0.1:0.02]
            # min_samples_leaf: range[0.0:0.4:0.05]
            # min_impurity_decrease: range[0.0:0.2:0.02]
    # Search pipeline for LightGBM: run `prepare-data` with the pipeline
    # flags below, then `lightgbm:train` over the listed flag values/ranges.
    pipeline-lightgbm-opt:
      sourcecode:
        - include: '*.py'
      flags:
        data-label_tuning:
          default: 1
        data-label:
          default: 'day_5'
        data-stationarity:
          default: 'fracdiff'
        data-dim_reduction:
          default: 'none'
        # NOTE(review): not referenced by any step below - confirm whether it
        # should be forwarded (e.g. to ts_look_forward_window) or removed.
        data-lookforward:
          default: 240
        data-tb_volatility_lookback:
          default: 100
        data-tb_volatility_scaler:
          default: 1
        # NOTE(review): not referenced by any step below - confirm whether it
        # should be forwarded as tb_triplebar_num_days or removed.
        data-tb_triplebar_num_days:
          default: 8
        data-correlation_threshold:
          default: 0.98
        data-scaling:
          default: 'none'
        data-num_threads:
          default: 8
      steps:
        # `label` is forwarded so the declared data-label flag takes effect,
        # matching how pipeline-rf invokes prepare-data.
        - run: >
            prepare-data
            label_tuning=${data-label_tuning}
            label=${data-label}
            stationarity=${data-stationarity}
            tb_volatility_lookback=${data-tb_volatility_lookback}
            tb_volatility_scaler=${data-tb_volatility_scaler}
            correlation_threshold=${data-correlation_threshold}
            dim_reduction=${data-dim_reduction}
            scaling=${data-scaling}
            num_threads=${data-num_threads}
          isolate-runs: false
          needed: false
        - run: lightgbm:train
          isolate-runs: false
          needed: false
          max-trials: 32
          # optimizer: gp
          # maximize: mean_score
          flags:
            boosting_type: [gbdt, dart]
            num_leaves: range[10:120]
            max_depth: range[2:10]
            # min_child_samples: range[5:50]
            subsample: range[0.5:1.0:0.05]
            # lambda_l1: linspace[0:5.0:10]
            # bagging_freq: uniform[1:7]
            # min_child_samples: uniform[5:100]

# Random forest model; reuses the `op-prepare` resource from meta-model.
- model: random-forest
  extends: meta-model
  description: Random forest model
  operations:
    train:
      description: Trainer for random forest
      main: trademl.modeling.train_rf  # Python module run by this operation
      requires: op-prepare
      sourcecode:
        - include: '*.py'
      needed: false
      # Key uses a plain ASCII hyphen; it previously contained a
      # non-breaking hyphen (U+2011) and so was a different key.
      flags-import: all
      flags:
        num_threads:
          arg_name: num_threads
          description: Number of threads to use in mlfinlab multhithread function
          min: 1
          max: 32
        sample_weights_type:
          description: Sample weights to use in training
          arg_name: sample_weights_type
          type: string
          default: 'returns'
          choices: ['returns', 'time_decay', 'none']
        cv_type:
          description: type of cv
          arg_name: cv_type
          type: string
          default: 'purged_kfold'
          choices: ['purged_kfold']
        cv_number:
          description: Number of CV folds to use in CV
          arg_name: cv_number
          min: 1
          max: 20
        max_depth:
          description: Maximum depth for the tree in random forest algorithm
          arg_name: max_depth
          min: 1
          max: 10
        max_features:
          description: maximum number of featurs in random forest
          arg_name: max_features
          min: 1
          max: 250
        n_estimators:
          description: Number of estimators (decision trees) in random forest
          arg_name: n_estimators
          min: 1
          max: 10000
        min_weight_fraction_leaf:
          description: TODO
          arg_name: min_weight_fraction_leaf
          min: 0
          max: 1
        class_weight:
          description: sklearn class_weight argument
          arg_name: class_weight
          type: string
          default: 'balanced_subsample'
          choices: ['balanced', 'balanced_subsample']
        # NOTE(review): scikit-learn documents min_samples_leaf as an int >= 1
        # or a fraction greater than 0; confirm the training module maps 0 to
        # a valid value.
        min_samples_leaf:
          default: 0
        min_impurity_decrease:
          default: 0

# XGBoost classifier; reuses the `op-prepare` resource from meta-model.
- model: xgboost
  extends: meta-model
  description: Xgboost classifier
  operations:
    train:
      description: Trainer for xgboost
      main: trademl.modeling.train_xgb  # Python module run by this operation
      requires: op-prepare
      sourcecode:
        - include: '*.py'
      # Key uses a plain ASCII hyphen; it previously contained a
      # non-breaking hyphen (U+2011) and so was a different key.
      flags-import: all
      flags:
        input_data_path:
          description: Path where prepared data and saved output exists
          arg_name: input_data_path
          type: string
          default: 'C:/Users/Mislav/Documents/GitHub/trademl/trademl/modeling'
          required: true
        cv_type:
          description: type of cv
          arg_name: cv_type
          type: string
          default: 'purged_kfold'
          choices: ['purged_kfold']
        cv_number:
          description: Number of CV folds to use in CV
          arg_name: cv_number
          min: 1
          max: 20
        max_depth:
          description: Maximum depth of a tree
          arg_name: max_depth
          min: 1
          max: 10
        learning_rate:
          description: Learning rate
          arg_name: learning_rate
          # Written as 0.1 instead of 10e-2: YAML 1.1 loaders only resolve
          # exponent notation as a float when a decimal point is present.
          default: 0.1
        booster:
          description: Xgbosst booster to use
          arg_name: booster
          type: string
          default: 'gbtree'
          choices: ['gbtree', 'gblinear', 'dart']
        colsample_bytree:
          description: look at xgbost docs
          arg_name: colsample_bytree
          default: 0.9
        subsample:
          description: >
            When constructing the additional tree, don't use the full training data set.
            Instead use a subset of randomly selected training data.
          arg_name: subsample
          default: 0.9
          max: 1
        min_child_weight:
          # Description is specific to this flag; it was previously a copy of
          # the `subsample` description.
          description: Minimum sum of instance weight (hessian) needed in a child
          arg_name: min_child_weight
          default: 1

# LightGBM classifier; reuses the `op-prepare` resource from meta-model.
- model: lightgbm
  extends: meta-model
  description: Lightgbm classifier
  operations:
    train:
      description: Trainer for lightgbm
      main: trademl.modeling.train_lgbm
      requires: op-prepare
      sourcecode:
        - include: '*.py'
      # Key uses a plain ASCII hyphen; it previously contained a
      # non-breaking hyphen (U+2011) and so was a different key.
      flags-import: all
      flags:
        input_data_path:
          description: Path where prepared data and saved output exists
          arg_name: input_data_path
          type: string
          default: 'C:/Users/Mislav/Documents/GitHub/trademl/trademl/modeling'
          required: true
        cv_type:
          description: type of cv
          arg_name: cv_type
          type: string
          default: 'purged_kfold'
          choices: ['purged_kfold']
        cv_number:
          description: Number of CV folds to use in CV
          arg_name: cv_number
          min: 1
          max: 20
        num_leaves:
          description: Max number of leaves in one tree
          arg_name: num_leaves
          default: 50
          min: 1
          max: 131072
        n_estimators:
          description: Number of boosting iterations
          arg_name: n_estimators
          default: 400
          min: 10
          max: 100000
        max_depth:
          description: Maximum depth of a tree
          arg_name: max_depth
          min: 1
          max: 10
        learning_rate:
          description: Learning rate
          arg_name: learning_rate
          # Written as 0.1 instead of 10e-2: YAML 1.1 loaders only resolve
          # exponent notation as a float when a decimal point is present.
          default: 0.1
        boosting_type:
          description: Lightgbm booster to use
          arg_name: boosting_type
          type: string
          default: 'gbdt'
          choices: ['gbdt', 'rf', 'dart', 'goss']
        colsample_bytree:
          description: Subsample ratio of columns when constructing each tree
          arg_name: colsample_bytree
          default: 0.9
        subsample:
          description: >
            When constructing the additional tree, don't use the full training data set.
            Instead use a subset of randomly selected training data.
          arg_name: subsample
          default: 0.9
          max: 1
        min_child_samples:
          description: >
            Minimal number of data in one leaf. Can be used to deal with over-fitting
          # arg_name matches the flag; it previously named `min_child_weight`.
          arg_name: min_child_samples
          default: 20
        lambda_l1:
          arg_name: lambda_l1
          default: 0.1
        

# LSTM model. Extends meta-model so that `requires` can name the shared
# `op-prepare` resource, as the other models in this file do.
- model: lstm
  extends: meta-model
  operations:
    train:
      description: Trainer a LSTM nerual network
      main: trademl.modeling.train_lstm  # Python module run by this operation
      sourcecode:
        - include: '*.py'
      # Previously `requires: prepare-data`, which names a resource this
      # model did not define. NOTE(review): confirm against Guild's
      # resource lookup rules for the version in use.
      requires: op-prepare
      # Key uses a plain ASCII hyphen; it previously contained a
      # non-breaking hyphen (U+2011) and so was a different key.
      flags-import: all
      flags:
        input_path:
          description: Path where prepared data and saved output exists
          arg_name: input_path
          type: string
          default: 'C:/Users/Mislav/Documents/GitHub/trademl/trademl/modeling'
          required: true
        batch_size:
          description: Batch size
          arg_name: batch_size
          default: 128
        n_lstm_layers:
          description: Number of layer in the model
          arg_name: n_lstm_layers
          default: 3
          min: 1
          max: 4
        n_units:
          description: Number of units in the layers
          arg_name: n_units
          default: 64
        decrease_units:
          description: decrease number of units with every layer
          arg_name: decrease_units
          default: 0
          min: 0
          max: 1
        dropout:
          description: Dropout rate
          arg_name: dropout
          default: 0.2
          min: 0.0
          max: 0.95
        lr:
          description: Learning rate
          arg_name: lr
          # Written as 0.1 instead of 10e-2: YAML 1.1 loaders only resolve
          # exponent notation as a float when a decimal point is present.
          default: 0.1
        epochs:
          description: Number of epochs
          arg_name: epochs
          default: 100

# LSTM model tuned with Keras Tuner; declares no resource dependency.
- model: lstm-kerastune
  operations:
    train:
      description: Trainer a LSTM nerual network with keras tuner
      main: trademl.modeling.train_lstm_tune  # Python module run by this operation
      sourcecode:
        - include: '*.py'
      # requires: prepared-data
      # Key uses a plain ASCII hyphen; it previously contained a
      # non-breaking hyphen (U+2011) and so was a different key.
      flags-import: all
      flags:
        input_path:
          description: Path where prepared data and saved output exists
          arg_name: input_path
          type: string
          default: 'C:/Users/Mislav/Documents/GitHub/trademl/trademl/modeling'
          required: true
        train_val_index_split:
          description: How many observations to train/test part
          arg_name: train_val_index_split
          default: 0.75
          min: 0.0
          max: 0.99
        time_step_length:
          description: Time ste length when constructing 3D data from time series
          arg_name: time_step_length
          default: 120
        batch_size:
          description: Batch size
          arg_name: batch_size
          default: 128
        n_lstm_layers:
          description: Number of layer in the model
          arg_name: n_lstm_layers
          default: 3
          min: 1
          max: 4
        n_units:
          description: Number of units in the layers
          arg_name: n_units
          default: 64
        dropout:
          description: Dropout rate
          arg_name: dropout
          default: 0.2
          min: 0.0
          max: 0.95
        lr:
          description: Learning rate
          arg_name: lr
          # Written as 0.1 instead of 10e-2: YAML 1.1 loaders only resolve
          # exponent notation as a float when a decimal point is present.
          default: 0.1
        epochs:
          description: Number of epochs
          arg_name: epochs
          default: 15
        optimizer:
          description: Kerastuner optimizer to use
          arg_name: optimizer
          default: 'random'
          choices: ['random', 'hyperband']
        max_trials:
          description: Max trials parameter for random search optimizer
          arg_name: max_trials
          default: 5
        executions_per_trial:
          description: Execution per trial parameter for random search optimizer
          arg_name: executions_per_trial
          default: 2


# - model: random_forest_sklearnopt
#   description: Random forest model with skelarn optimization
#   operations:
#     train:
#       description: Trainer for random forest with GridSearchCV 
#       main: trademl.modeling.train_rf_sklearnopt  # Python module when running the operation
#       # flags-dest: globals  #alternative is to use argparse module, see https://www-pre.guild.ai/reference/defaults/
#       flags-import: all
#       flags:
#         # DATA_PATH:
#         #   arg_name: DATA_PATH
#         #   description: Path to data
#         #   type: string
#         #   requires: yes
#         num_threads:
#           arg_name: num_threads
#           description: Number of threads to use in mlfinlab multhithread function
#           min: 1
#           max: 32
#         structural_break_regime:
#           description: Shoud we use structural breaks and if yes, which one
#           arg_name: structural_break_regime
#           type: string
#           default: 'all'
#           choices: ['all','chow']
#         std_outlier:
#           arg_name: std_outlier
#           description: Standard devaition threshold. Remove observations from X above/below std_oulier
#           min: 2
#           max: 20
#         tb_volatility_scaler:
#           arg_name: tb_volatility_scaler
#           description: Scalar which multiplies daily_vol to make sampling more/less frequent
#           min: 0.1
#           max: 3.0
#         max_depth:
#           description: Maximum depth for the tree in random forest algorithm
#           arg_name: max_depth
#           min: 1
#           max: 10
#         labeling_technique:
#           description: Labeling technique to use for classification
#           arg_name: labeling_technique
#           type: string
#           default: 'triple_barrier'
#           choices: ['triple_barrier','trend_scanning']
#         tb_volatility_lookback:
#           description: Number of days in the past for calculating daily volatility
#           arg_name: tb_volatility_lookback
#           min: 1
#           max: 1000000
#         tb_triplebar_num_days:
#           description: Number of days for defining vertical bars
#           arg_name: tb_triplebar_num_days
#           min: 1
#           max: 1000
#         tb_triplebar_min_ret:
#           description: minimal return for barrier to be used in the sample
#           arg_name: tb_triplebar_min_ret
#           min: 0.001
#           max: 0.1
#         sample_weights_type:
#           description: Sample weights to use in training
#           arg_name: sample_weights_type
#           type: string
#           default: 'returns'
#           choices: ['returns','time_decay','trend_scanning']
#         max_features:
#           description: maximum number of featurs in random forest
#           arg_name: max_features
#           min: 1
#           max: 250
#         n_estimators:
#           description: Number of estimators (decision trees) in random forest
#           arg_name: n_estimators
#           min: 1
#           max: 10000
#         ts_look_forward_window:
#           description: How much time units into the future to look when looking for trend
#           arg_name: ts_look_forward_window
#           min: 1
#           max: 1000000
#         cv_number:
#           description: Number of CV folds to use in CV
#           arg_name: cv_number
#           min: 1
#           max: 20