From 32ec264bbf48d2a24f2f942c759647caf23ed85c Mon Sep 17 00:00:00 2001 From: Gabriel Moreira Date: Mon, 10 Apr 2023 20:18:34 -0300 Subject: [PATCH] Preprocessing: Added support to control features (with no transformation), support to TSV as input, options to fill null with a median or a value --- examples/quick_start/scripts/preproc/args_parsing.py | 4 +++- examples/quick_start/scripts/preproc/preprocessing.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/quick_start/scripts/preproc/args_parsing.py b/examples/quick_start/scripts/preproc/args_parsing.py index 505205a342..5eeb31e81c 100644 --- a/examples/quick_start/scripts/preproc/args_parsing.py +++ b/examples/quick_start/scripts/preproc/args_parsing.py @@ -25,6 +25,7 @@ def build_arg_parser(): help="", ) + parser.add_argument("--control_features", default="", help="") parser.add_argument("--categorical_features", default="", help="") parser.add_argument("--continuous_features", default="", help="") @@ -94,6 +95,7 @@ def parse_arguments(): args = parser.parse_args() # Parsing list args + args.control_features = parse_list_arg(args.control_features) args.categorical_features = parse_list_arg(args.categorical_features) args.continuous_features = parse_list_arg(args.continuous_features) @@ -110,7 +112,7 @@ def parse_arguments(): if args.filter_query: args.filter_query = args.filter_query.replace('"', "") - if args.csv_sep.lower() == "": + if args.csv_sep.lower() == "[tab]": args.csv_sep = "\t" return args diff --git a/examples/quick_start/scripts/preproc/preprocessing.py b/examples/quick_start/scripts/preproc/preprocessing.py index 51d6c5d544..afcb11ab04 100644 --- a/examples/quick_start/scripts/preproc/preprocessing.py +++ b/examples/quick_start/scripts/preproc/preprocessing.py @@ -164,12 +164,14 @@ def generate_nvt_workflow_features(self): args = self.args feats = dict() + for col in args.control_features: + feats[col] = [col] for col in args.categorical_features: feats[col] = [col] >> nvt_ops.Categorify() for col in args.continuous_features: feats[col] = [col] if args.continuous_features_fillna is not None: - if args.continuous_features_fillna.lower() == "": + if args.continuous_features_fillna.lower() == "[median]": feats[col] = feats[col] >> nvt_ops.FillMedian() else: feats[col] = feats[col] >> nvt_ops.FillMissing(args.continuous_features_fillna)