several changes
- fixed some bugs in the debug function
- fixed an import error in the dense workflow
- enabled reporting of reading errors when fetching results from pcloud
- changed numerical imputer to median
- added use case notebooks for debugging and for generating LaTeX tables for hyperparameter spaces
felix committed Nov 26, 2024
1 parent 9f2ea2f commit 2df8cfd
Showing 8 changed files with 1,346 additions and 47 deletions.
50 changes: 26 additions & 24 deletions publications/2023-neurips/lcdb/db/_database.py
@@ -242,31 +242,33 @@ def generator():
tracebacks, configs, errors = [], [], []

for df in tqdm(gen, disable=not show_progress):
# check if "traceback" column exists
if "m:traceback" in df.columns:
traceback_rows = df[df["m:traceback"].notna()]

# extract corresponding configuration parameters
if not traceback_rows.empty:
traceback_indices = traceback_rows.index.tolist()
config_cols = [c for c in df.columns if c.startswith("p:")]
# corresponding_configs = df.loc[traceback_rows.index]
# configs.append(corresponding_configs)
corresponding_configs_reset = df.loc[traceback_indices, config_cols].drop_duplicates().reset_index(drop=True)
configs.append(corresponding_configs_reset)

tracebacks.append(traceback_rows["m:traceback"])

# extract errors from traceback messages str format first
traceback_str = str(traceback_rows["m:traceback"].iloc[0])
try:
error_message = re.search(r'(\w+Error): (.*)', traceback_str).group(0)
except:
error_message = traceback_str
errors.append(error_message)

else:
print("Error: no traceback column in dataframe")
if df is not None:
# check if "traceback" column exists
if "m:traceback" in df.columns:
traceback_rows = df[df["m:traceback"].notna()]

# extract corresponding configuration parameters
if not traceback_rows.empty:
traceback_indices = traceback_rows.index.tolist()
config_cols = [c for c in df.columns if c.startswith("p:")]
# corresponding_configs = df.loc[traceback_rows.index]
# configs.append(corresponding_configs)
corresponding_configs_reset = df.loc[traceback_indices, config_cols].drop_duplicates().reset_index(drop=True)
configs.append(corresponding_configs_reset)

tracebacks.append(traceback_rows["m:traceback"])

# extract errors from traceback messages str format first
traceback_str = str(traceback_rows["m:traceback"].iloc[0])
try:
error_message = re.search(r'(\w+Error): (.*)', traceback_str).group(0)
except:
error_message = traceback_str
errors.append(error_message)

else:
print("Error: no traceback column in dataframe")

return {
"configs": pd.concat(configs, ignore_index=True) if configs else None,
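For context, a hedged caller-side sketch of how the dictionary assembled above might be summarized. Only the "configs" entry is visible in this hunk, so the assumption that the returned dict also exposes the collected errors list, as well as the helper name below, are illustrative.

from collections import Counter

import pandas as pd

# `summary` stands for the dict returned by the debug helper above; treating
# summary["errors"] as a list of extracted error messages is an assumption.
def print_error_summary(summary):
    errors = summary.get("errors") or []
    print("Most common extracted error messages:")
    for msg, count in Counter(errors).most_common(5):
        print(f"{count:4d}x  {msg}")

    configs = summary.get("configs")
    if isinstance(configs, pd.DataFrame):
        # the p:* columns collected above describe the failing configurations
        print(f"{len(configs)} failing configurations recorded (p:* columns).")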
66 changes: 47 additions & 19 deletions publications/2023-neurips/lcdb/db/_pcloud_repository.py
@@ -12,6 +12,7 @@

import requests
import jmespath
from json import JSONDecodeError


class PCloudRepository(Repository):
@@ -233,8 +234,8 @@ def get_result_files_of_workflow_and_dataset_in_campaign(
print(f"Invalid filename {filename}")
continue

result_files.append(file_data["fileid"])
return result_files
result_files.append([workflow, campaign, openmlid, _workflow_seed, _test_seed, _val_seed, file_data["fileid"]])
return pd.DataFrame(result_files, columns=["workflow", "campaign", "openmlid", "seed_workflow", "seed_test", "seed_val", "fileid"])

def get_result_files_of_workflow_in_campaign(
self,
@@ -249,17 +250,18 @@ def get_result_files_of_workflow_in_campaign(
if openmlids is None:
openmlids = self.get_datasets(workflow=workflow, campaign=campaign)

filenames = []
result_files = None
for openmlid in openmlids:
filenames.extend(self.get_result_files_of_workflow_and_dataset_in_campaign(
result_files_new = self.get_result_files_of_workflow_and_dataset_in_campaign(
workflow=workflow,
campaign=campaign,
openmlid=openmlid,
workflow_seeds=workflow_seeds,
test_seeds=test_seeds,
validation_seeds=validation_seeds
))
return filenames
)
result_files = result_files_new if result_files is None else pd.concat([result_files, result_files_new])
return result_files

def get_result_files_of_workflow(
self,
@@ -270,19 +272,20 @@ def get_result_files_of_workflow(
test_seeds=None,
validation_seeds=None
):
filenames = []
result_files = None
if campaigns is None:
campaigns = self.get_campaigns(workflow)
for campaign in campaigns:
filenames.extend(self.get_result_files_of_workflow_in_campaign(
result_files_new = self.get_result_files_of_workflow_in_campaign(
workflow=workflow,
campaign=campaign,
openmlids=openmlids,
workflow_seeds=workflow_seeds,
test_seeds=test_seeds,
validation_seeds=validation_seeds
))
return filenames
)
result_files = result_files_new if result_files is None else pd.concat([result_files, result_files_new])
return result_files

def get_result_files(
self,
@@ -296,16 +299,17 @@ def get_result_files(
if workflows is None:
workflows = self.get_workflows()

result_files = []
result_files = None
for workflow in workflows:
result_files.extend(self.get_result_files_of_workflow(
result_files_new = self.get_result_files_of_workflow(
workflow=workflow,
campaigns=campaigns,
openmlids=openmlids,
workflow_seeds=workflow_seeds,
test_seeds=test_seeds,
validation_seeds=validation_seeds
))
)
result_files = result_files_new if result_files is None else pd.concat([result_files, result_files_new])
return result_files

def query_results_as_stream(
@@ -316,7 +320,9 @@ def query_results_as_stream(
workflow_seeds=None,
test_seeds=None,
validation_seeds=None,
processors=None
processors=None,
raise_errors=False,
report_errors=True
):
"""
@@ -344,20 +350,42 @@

# read in all result files

def gen_fun():
def gen_fun(raise_errors=False):
total_entries = 0

for file in result_files:
for i, file_desc in result_files.iterrows():
if total_entries > 10 ** 6:
raise ValueError(f"Cannot read in more than 10**6 results.")
df = self.read_result_file(file)
df_deserialized = deserialize_dataframe(df)
df = self.read_result_file(file_desc["fileid"])
try:
df_deserialized = deserialize_dataframe(df)
except Exception as e:
is_parsing_error = isinstance(e, JSONDecodeError)
if is_parsing_error:
error_msg = f"Parsing error {repr(e)} for result file:"
else:
error_msg = f"{type(e)} with message '{repr(e)}' in result file:"

error_msg += ""\
f"\n\tworkflow {file_desc['workflow']}"\
f"\n\tcampaign {file_desc['campaign']}"\
f"\n\topenmlid {file_desc['openmlid']}"\
f"\n\tseed_wf {file_desc['seed_workflow']}"\
f"\n\tseed_test {file_desc['seed_test']}"\
f"\n\tseed_valid {file_desc['seed_val']}"\
f"\n\tpCloud file id {file_desc['fileid']}"
if raise_errors:
raise Exception(error_msg)
elif report_errors:
print(error_msg)
df_deserialized = None

if processors is not None:
for name, fun in processors.items():
df[name] = df.apply(fun, axis=1) # apply the function to all rows in the dataframe
df.drop(columns="m:json", inplace=True)

total_entries += len(df_deserialized)
total_entries += len(df_deserialized) if df_deserialized is not None else 0
yield df_deserialized

return CountAwareGenerator(len(result_files), gen=gen_fun())
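
A hedged caller-side sketch of the two changes in this file: the listing helpers now return a DataFrame of result-file metadata instead of a flat list of file ids, and the stream tolerates unreadable files when raise_errors=False. The repository construction, the omitted selection arguments, and the concrete filter value are assumptions, not part of the diff.

# Illustrative only: `repo` is an already-constructed PCloudRepository instance;
# any selection arguments (workflows, campaigns, datasets) are omitted here.

# 1) Listing now yields a DataFrame, so result files can be inspected and
#    filtered before anything is downloaded.
files = repo.get_result_files()
print(files.columns.tolist())
# ['workflow', 'campaign', 'openmlid', 'seed_workflow', 'seed_test', 'seed_val', 'fileid']
subset = files[files["seed_test"] == 0]   # example filter, arbitrary seed value

# 2) Streaming now reports (or raises on) files that cannot be deserialized.
stream = repo.query_results_as_stream(
    raise_errors=False,   # keep streaming past corrupt result files ...
    report_errors=True,   # ... but print their workflow/campaign/openmlid/seed context
)
for df in stream:
    if df is None:        # a file that failed to deserialize is yielded as None
        continue
    # process df ...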
@@ -189,7 +189,7 @@ def get_pp_steps(self, X, y, metadata, **kwargs):
# step 1: imputation
if np.any(pd.isnull(X)):
cat_steps.append(("cat_imputer", SimpleImputer(strategy="most_frequent")))
num_steps.append(("num_imputer", SimpleImputer(strategy="most_frequent")))
num_steps.append(("num_imputer", SimpleImputer(strategy="median")))

# step 2: encoding of categorical attributes
if has_cat:
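For context on the imputer change above, a small standalone sketch with illustrative data showing where the median and most_frequent strategies diverge for a numerical feature:

import numpy as np
from sklearn.impute import SimpleImputer

# Continuous feature with a missing value; values rarely repeat, so the mode is
# just an arbitrary observed value, while the median is a robust central estimate.
X_num = np.array([[1.0], [1.0], [5.0], [9.0], [11.0], [np.nan]])

old = SimpleImputer(strategy="most_frequent")   # previous behavior: fills with 1.0
new = SimpleImputer(strategy="median")          # new behavior: fills with 5.0

print(old.fit_transform(X_num).ravel())   # [ 1.  1.  5.  9. 11.  1.]
print(new.fit_transform(X_num).ravel())   # [ 1.  1.  5.  9. 11.  5.]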
6 changes: 3 additions & 3 deletions publications/2023-neurips/lcdb/workflow/keras/_dense.py
@@ -4,9 +4,9 @@
import tensorflow as tf
from tensorflow.keras.utils import Sequence
from ConfigSpace import Categorical, ConfigurationSpace, Float, Integer
from ...scorer import ClassificationScorer
from ...timer import Timer
from ...utils import get_schedule, filter_keys_with_prefix
from lcdb.builder.scorer import ClassificationScorer
from lcdb.builder.timer import Timer
from lcdb.builder.utils import get_schedule, filter_keys_with_prefix
from .._base_workflow import BaseWorkflow
from .._preprocessing_workflow import PreprocessedWorkflow
from ._augmentation import MixUpAugmentation, CutMixAugmentation, CutOutAugmentation
252 changes: 252 additions & 0 deletions publications/2023-neurips/use cases/1b - successive halving.ipynb

Large diffs are not rendered by default.
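
The commit message also mentions a use-case notebook for generating LaTeX tables from hyperparameter spaces; that notebook is not rendered here. The following is only a hedged sketch of the general idea with a hand-written space description, not the notebook's actual code, and it deliberately avoids depending on a specific ConfigSpace version.

# Purely illustrative: describe a hyperparameter space as plain records and emit
# a LaTeX tabular from it.
space = [
    {"name": "learning_rate", "type": "float", "range": (1e-5, 1e-1), "log": True},
    {"name": "num_units",     "type": "int",   "range": (16, 1024),   "log": True},
    {"name": "activation",    "type": "cat",   "choices": ["relu", "tanh", "sigmoid"]},
]

def space_to_latex(space):
    lines = [r"\begin{tabular}{lll}", r"hyperparameter & type & domain \\ \hline"]
    for hp in space:
        if hp["type"] == "cat":
            domain = r"\{" + ", ".join(hp["choices"]) + r"\}"
        else:
            lo, hi = hp["range"]
            domain = f"$[{lo}, {hi}]$" + (" (log scale)" if hp.get("log") else "")
        lines.append(f'{hp["name"]} & {hp["type"]} & {domain} ' + r"\\")
    lines.append(r"\end{tabular}")
    return "\n".join(lines)

print(space_to_latex(space))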
