From b9b131302c27f73a91c24b471939d737026fe2f1 Mon Sep 17 00:00:00 2001
From: janvanrijn
Date: Tue, 15 Oct 2024 23:01:12 +0200
Subject: [PATCH] Update hyperparameter_importance.py

updates changes made locally

---
 .../analysis/hyperparameter_importance.py     | 74 ++++++++++++-------
 1 file changed, 49 insertions(+), 25 deletions(-)

diff --git a/publications/2023-neurips/experiments/surf/snellius/analysis/hyperparameter_importance.py b/publications/2023-neurips/experiments/surf/snellius/analysis/hyperparameter_importance.py
index 01b2952..72c878a 100644
--- a/publications/2023-neurips/experiments/surf/snellius/analysis/hyperparameter_importance.py
+++ b/publications/2023-neurips/experiments/surf/snellius/analysis/hyperparameter_importance.py
@@ -16,21 +16,22 @@
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--n_trees', type=int, default=16)
-    parser.add_argument('--openml_ids', type=int, nargs='+', default=None)
+    parser.add_argument('--openml_ids', type=int, nargs='+', default=[3])
     parser.add_argument('--workflow_name', type=str, default="lcdb.workflow.sklearn.LibLinearWorkflow")
     parser.add_argument('--openml_taskid_name', type=str, default="m:openmlid")
     parser.add_argument('--output_directory', type=str, default=os.path.expanduser('~/experiments/lcdb'))
     parser.add_argument('--output_filetype', type=str, choices=['pdf', 'png'], default='png')
     parser.add_argument('--max_load', type=int, default=None)
+    parser.add_argument('--anchor_value', type=int, default=2048)
     return parser.parse_args()
 
 
 def numeric_encode(df, config_space):
     # https://automl.github.io/ConfigSpace/latest/api/ConfigSpace/configuration_space/
-    result = np.zeros((len(df), len(config_space.get_hyperparameters())), dtype=float)
+    result = np.zeros((len(df), len(config_space.values())), dtype=float)
 
     for hyperparameter_name, hyperparameter in config_space.items():
-        index = config_space.get_idx_by_hyperparameter_name(hyperparameter_name)
+        index = config_space.index_of[hyperparameter_name]
         if isinstance(hyperparameter, ConfigSpace.hyperparameters.NumericalHyperparameter):
             result[:, index] = df[hyperparameter_name].to_numpy()
         elif isinstance(hyperparameter, ConfigSpace.hyperparameters.Constant):
@@ -43,49 +44,50 @@ def numeric_encode(df, config_space):
     return result
 
 
-def fanova_on_task(task_results, performance_column_name, curve_data_column, config_space, n_trees):
+def fanova_on_task(task_results, performance_column_name, config_space, n_trees):
     fanova_results = []
 
-    # query_confusion_matrix_values = lcdb.analysis.json.QueryMetricValuesFromAnchors("confusion_matrix", split_name="val")
-    # out = task_results[performance_column_name].apply(query_confusion_matrix_values)
-    # print(out)
-    # balanced_error_rate_values_for_config = np.array(
-    #     out.apply(lambda x: list(map(lambda x: 1 - lcdb.analysis.score.balanced_accuracy_from_confusion_matrix(x), x))).to_list())
-    # print(balanced_error_rate_values_for_config.mean(axis=0))
-    # print(out)
     evaluator = fanova.fanova.fANOVA(
         X=numeric_encode(task_results, config_space),
         Y=task_results[performance_column_name].to_numpy(),
        config_space=config_space,
         n_trees=n_trees,
     )
-    for idx, pname in enumerate(config_space.get_hyperparameter_names()):
+    for idx, pname in enumerate(config_space.keys()):
         logging.info('-- hyperparameter %d %s' % (idx, pname))
         unique_values = task_results[pname].unique()
         logging.info('-- UNIQUE VALUES: %d (%s)' % (len(unique_values), unique_values))
         importance = evaluator.quantify_importance([idx])
-        fanova_results.append(
-            {
-                "hyperparameter": pname,
-                "fanova": importance[(idx,)]["individual importance"],
-            }
-        )
-
+        fanova_results.append({
+            "hyperparameter": pname,
+            "fanova": importance[(idx,)]["individual importance"],
+        })
     return fanova_results
 
 
 def run(args):
     fanova_all_results = []
     performance_column = "objective"
-    curve_data_column = "m:json"
+    anchor_size_column = "anchor_sizes"
+    learning_curve_column = "learning_curve_data"
 
     WorkflowClass = lcdb.builder.utils.import_attr_from_module(args.workflow_name)
     config_space = WorkflowClass.config_space()
-    workflow_hyperparameter_mapping = {"p:" + name: name for name in config_space.get_hyperparameter_names()}
+    workflow_hyperparameter_mapping = {"p:" + name: name for name in config_space.keys()}
    id_results = dict()
 
-    all_results_all_workflows = lcdb.db.LCDB().query(workflows=[args.workflow_name], openmlids=args.openml_ids)
+    all_results_all_workflows = lcdb.db.LCDB().query(
+        workflows=[args.workflow_name],
+        openmlids=args.openml_ids,
+        processors={
+            anchor_size_column: lcdb.analysis.json.QueryAnchorValues(),
+            learning_curve_column: lambda x: list(map(
+                lambda x: 1 - lcdb.analysis.score.balanced_accuracy_from_confusion_matrix(x),
+                lcdb.analysis.json.QueryMetricValuesFromAnchors("confusion_matrix", split_name="val")(x)
+            ))
+        }
+    )
     load_count = 0
     for frame_workflow_job_task in all_results_all_workflows:
         workflow_ids = frame_workflow_job_task['m:workflow'].unique()
@@ -95,6 +97,25 @@ def run(args):
             raise ValueError('Should not happen. %s %s' % (str(workflow_ids), str(openml_task_ids)))
         if (workflow_ids[0], openml_task_ids[0]) not in id_results:
             id_results[(workflow_ids[0], openml_task_ids[0])] = list()
+
+        performance_values_new = list()
+        for index, row in frame_workflow_job_task.iterrows():
+            anchor_sizes = row[anchor_size_column]
+            performance_value_at_anchor = np.nan
+            if args.anchor_value is not None:
+                if args.anchor_value not in anchor_sizes:
+                    logging.warning('Anchor %d not available in task %d workflow %s'
+                                    % (args.anchor_value, openml_task_ids[0], workflow_ids[0])
+                    )
+                else:
+                    anchor_index = anchor_sizes.index(args.anchor_value)
+                    performance_value_at_anchor = row[learning_curve_column][anchor_index]
+            else:
+                performance_value_at_anchor = row[learning_curve_column][-1]
+            performance_values_new.append(performance_value_at_anchor)
+        performance_values_new = np.array(performance_values_new, dtype=float)
+        frame_workflow_job_task[performance_column] = pd.Series(performance_values_new)
+
         id_results[(workflow_ids[0], openml_task_ids[0])].append(frame_workflow_job_task)
 
         load_count += 1
@@ -106,11 +127,11 @@ def run(args):
             task_ids.add(task_id)
             task_results = pd.concat(id_results[(workflow_name, task_id)])
             task_results = task_results.rename(workflow_hyperparameter_mapping, axis=1)
-            relevant_columns = list(workflow_hyperparameter_mapping.values()) + [performance_column, curve_data_column]
+            relevant_columns = list(workflow_hyperparameter_mapping.values()) + [performance_column]
             task_results = task_results[relevant_columns]
 
             logging.info("Starting with task %d (%d/%d)" % (task_id, idx + 1, len(id_results)))
-            fanova_task_results = fanova_on_task(task_results, performance_column, curve_data_column, config_space, args.n_trees)
+            fanova_task_results = fanova_on_task(task_results, performance_column, config_space, args.n_trees)
             fanova_all_results.extend(fanova_task_results)
 
     fanova_all_results = pd.DataFrame(fanova_all_results)
@@ -125,7 +146,10 @@ def run(args):
     plt.tight_layout()
 
     # save plot to file
-    output_file = args.output_directory + '/fanova_%s.%s' % (args.workflow_name, args.output_filetype)
+    filename_suffix = ""
+    if args.anchor_value is not None:
+        filename_suffix = "_anchor_%d" % args.anchor_value
+    output_file = args.output_directory + '/fanova_%s%s.%s' % (args.workflow_name, filename_suffix, args.output_filetype)
     os.makedirs(args.output_directory, exist_ok=True)
     plt.savefig(output_file)
     logging.info('saved to %s' % output_file)
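
Note on the new query processors: the learning_curve_data column is produced by converting each validation confusion matrix into a balanced error rate, i.e. 1 minus balanced accuracy. A minimal sketch of that conversion in plain numpy, assuming lcdb.analysis.score.balanced_accuracy_from_confusion_matrix follows the standard definition (mean per-class recall); balanced_error_rate and the matrix below are made up for illustration:

import numpy as np

def balanced_error_rate(confusion_matrix):
    # balanced accuracy = mean per-class recall (diagonal over row sums);
    # the processor in this patch stores 1 minus that value per anchor
    cm = np.asarray(confusion_matrix, dtype=float)
    per_class_recall = np.diag(cm) / cm.sum(axis=1)
    return 1.0 - per_class_recall.mean()

# toy 2-class matrix: class 0 gets 90/100 right, class 1 gets 60/100 right
print(balanced_error_rate([[90, 10], [40, 60]]))  # 1 - (0.9 + 0.6) / 2 = 0.25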
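
The anchor-selection loop added to run() reduces each learning curve to a single scalar: the value at --anchor_value when that anchor is present, NaN with a warning when it is absent, and the value at the last anchor when no anchor is requested. The same logic as a standalone sketch; performance_at_anchor is a hypothetical helper with made-up data, not part of the patch:

import logging
import numpy as np

def performance_at_anchor(anchor_sizes, curve_values, anchor_value=None):
    # mirrors the patched loop: value at the requested anchor, last anchor
    # as fallback when none is requested, NaN plus a warning when missing
    if anchor_value is None:
        return curve_values[-1]
    if anchor_value not in anchor_sizes:
        logging.warning('Anchor %d not available', anchor_value)
        return np.nan
    return curve_values[anchor_sizes.index(anchor_value)]

# toy curve: balanced error rate shrinking as the training-set anchor grows
sizes = [128, 256, 512, 1024, 2048]
errors = [0.41, 0.35, 0.30, 0.27, 0.25]
print(performance_at_anchor(sizes, errors, anchor_value=2048))  # 0.25
print(performance_at_anchor(sizes, errors))                     # 0.25 (last anchor)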
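
Assuming the script is invoked directly, the new flag can be exercised with, for example, python hyperparameter_importance.py --openml_ids 3 6 --anchor_value 2048 (task ids beyond the default [3] are illustrative); with the patched naming scheme this writes fanova_lcdb.workflow.sklearn.LibLinearWorkflow_anchor_2048.png into --output_directory.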