Merge branch 'main' into deephyper
fmohr authored Nov 18, 2024
2 parents 20cb057 + e105db0 commit 9c018e8
Showing 15 changed files with 189 additions and 18 deletions.
14 changes: 14 additions & 0 deletions publications/2023-neurips/cluster/slurm/apptainer/batchjoblarge.sh
@@ -0,0 +1,14 @@
#!/bin/sh
#SBATCH --partition=general --qos=long
#SBATCH --time=168:00:00
#SBATCH --mincpus=2
#SBATCH --mem=36000
#SBATCH --job-name=lcdbL
#SBATCH --output=lcdbL%a.txt
#SBATCH --error=lcdbL%a.txt
#SBATCH --array=1-83
ulimit -n 8000
cd /tudelft.net/staff-bulk/ewi/insy/PRLab/Staff/tjviering/lcdbpyexp/code/publications/2023-neurips/
rsync openml_cache /tmp/tjviering/ -r -v --ignore-existing
cd /tudelft.net/staff-bulk/ewi/insy/PRLab/Staff/tjviering/
srun apptainer exec -c --bind /tudelft.net/staff-bulk/ewi/insy/PRLab/Staff/tjviering/lcdbpyexp/code/publications/2023-neurips:/mnt,/tmp:/tmp test6_re2.sif /bin/bash -c "mkdir -p ~/.config/ && mkdir -p ~/.config/openml/ && echo 'cachedir=/tmp/tjviering/openml_cache/' > ~/.config/openml/config && source activate /opt/conda/envs/lcdb && pip install py_experimenter==1.2 pynisher && mkdir -p /tmp/tjviering/ && mkdir -p /tmp/tjviering/${SLURM_ARRAY_TASK_ID} && rm -rf /tmp/tjviering/${SLURM_ARRAY_TASK_ID}/lcdb && cd /tmp/tjviering/${SLURM_ARRAY_TASK_ID} && git clone https://github.com/fmohr/lcdb.git && source activate /opt/conda/envs/lcdb && cd lcdb/publications/2023-neurips && pip install . && cd /mnt && ~/.local/bin/lcdb run --config config/knn_large.cfg --executor-name B{$SLURM_ARRAY_TASK_ID}"
@@ -0,0 +1,14 @@
#!/bin/sh
#SBATCH --partition=general --qos=long
#SBATCH --time=168:00:00
#SBATCH --mincpus=2
#SBATCH --mem=12000
#SBATCH --job-name=lcdbM
#SBATCH --output=lcdbM%a.txt
#SBATCH --error=lcdbM%a.txt
#SBATCH --array=1-146
ulimit -n 8000
cd /tudelft.net/staff-bulk/ewi/insy/PRLab/Staff/tjviering/lcdbpyexp/code/publications/2023-neurips/
rsync openml_cache /tmp/tjviering/ -r -v --ignore-existing
cd /tudelft.net/staff-bulk/ewi/insy/PRLab/Staff/tjviering/
srun apptainer exec -c --bind /tudelft.net/staff-bulk/ewi/insy/PRLab/Staff/tjviering/lcdbpyexp/code/publications/2023-neurips:/mnt,/tmp:/tmp test6_re2.sif /bin/bash -c "mkdir -p ~/.config/ && mkdir -p ~/.config/openml/ && echo 'cachedir=/tmp/tjviering/openml_cache/' > ~/.config/openml/config && source activate /opt/conda/envs/lcdb && pip install py_experimenter==1.2 pynisher && mkdir -p /tmp/tjviering/ && mkdir -p /tmp/tjviering/${SLURM_ARRAY_TASK_ID} && rm -rf /tmp/tjviering/${SLURM_ARRAY_TASK_ID}/lcdb && cd /tmp/tjviering/${SLURM_ARRAY_TASK_ID} && git clone https://github.com/fmohr/lcdb.git && source activate /opt/conda/envs/lcdb && cd lcdb/publications/2023-neurips && pip install . && cd /mnt && ~/.local/bin/lcdb run --config config/knn_medium.cfg --executor-name B{$SLURM_ARRAY_TASK_ID}"
14 changes: 14 additions & 0 deletions publications/2023-neurips/cluster/slurm/apptainer/batchjobsmall.sh
@@ -0,0 +1,14 @@
#!/bin/sh
#SBATCH --partition=general --qos=long
#SBATCH --time=168:00:00
#SBATCH --mincpus=2
#SBATCH --mem=6000
#SBATCH --job-name=lcdbS
#SBATCH --output=lcdbS%a.txt
#SBATCH --error=lcdbS%a.txt
#SBATCH --array=1-115
ulimit -n 8000
cd /tudelft.net/staff-bulk/ewi/insy/PRLab/Staff/tjviering/lcdbpyexp/code/publications/2023-neurips/
rsync openml_cache /tmp/tjviering/ -r -v --ignore-existing
cd /tudelft.net/staff-bulk/ewi/insy/PRLab/Staff/tjviering/
srun apptainer exec -c --bind /tudelft.net/staff-bulk/ewi/insy/PRLab/Staff/tjviering/lcdbpyexp/code/publications/2023-neurips:/mnt,/tmp:/tmp test6_re2.sif /bin/bash -c "mkdir -p ~/.config/ && mkdir -p ~/.config/openml/ && echo 'cachedir=/tmp/tjviering/openml_cache/' > ~/.config/openml/config && source activate /opt/conda/envs/lcdb && pip install py_experimenter==1.2 pynisher && mkdir -p /tmp/tjviering/ && mkdir -p /tmp/tjviering/${SLURM_ARRAY_TASK_ID} && rm -rf /tmp/tjviering/${SLURM_ARRAY_TASK_ID}/lcdb && cd /tmp/tjviering/${SLURM_ARRAY_TASK_ID} && git clone https://github.com/fmohr/lcdb.git && source activate /opt/conda/envs/lcdb && cd lcdb/publications/2023-neurips && pip install . && cd /mnt && ~/.local/bin/lcdb run --config config/knn_small.cfg --executor-name B{$SLURM_ARRAY_TASK_ID}"
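The three batch scripts are identical except for the memory request (--mem), the array range (--array), the job-name suffix, and the config file passed to lcdb run (the second script above, with job name lcdbM and knn_medium.cfg, is the medium variant). One detail worth flagging: inside the double-quoted command, B{$SLURM_ARRAY_TASK_ID} expands to a literal brace around the task id, e.g. B{17}, because the braces sit outside the $ expansion; the executor names are still unique per task, but B${SLURM_ARRAY_TASK_ID} was likely intended. To keep the three variants from drifting apart, a small generator along the following lines could emit them from one template. This is a hypothetical helper, not part of this commit; the per-variant values are copied from the scripts above.

# Hypothetical helper (not in the repository): regenerate the three SLURM
# batch scripts from a single template so the shared body cannot drift.
HEADER = """#!/bin/sh
#SBATCH --partition=general --qos=long
#SBATCH --time=168:00:00
#SBATCH --mincpus=2
#SBATCH --mem={mem}
#SBATCH --job-name=lcdb{suffix}
#SBATCH --output=lcdb{suffix}%a.txt
#SBATCH --error=lcdb{suffix}%a.txt
#SBATCH --array=1-{array_max}
"""

# (memory in MB, array size, config file) per size class, taken from the scripts above
VARIANTS = {
    "small":  {"suffix": "S", "mem": 6000,  "array_max": 115, "config": "config/knn_small.cfg"},
    "medium": {"suffix": "M", "mem": 12000, "array_max": 146, "config": "config/knn_medium.cfg"},
    "large":  {"suffix": "L", "mem": 36000, "array_max": 83,  "config": "config/knn_large.cfg"},
}

for name, v in VARIANTS.items():
    with open("batchjob%s.sh" % name, "w") as f:
        f.write(HEADER.format(**v))
        # the shared body (ulimit, rsync of the OpenML cache, the apptainer
        # call) would be appended here, parameterized only by v["config"]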
@@ -0,0 +1,71 @@
import json
import time

import pandas as pd
import pymysql

# database password (placeholder, not the real credential)
pw2 = 'database_password'


def postprocess_table(table_name):
    # note: queries are assembled via string formatting and assume trusted,
    # internally generated values
    cnx = pymysql.connect(host='lcdb_experiments.ewi.tudelft.nl', user='lcdb', passwd=pw2, db='db_lcdb')
    query = '''select * from %s where postprocess=1;''' % table_name
    to_process = pd.read_sql_query(query, cnx)

    print('found %d rows for processing...' % len(to_process))

    query_list = []

    for i in range(0, len(to_process)):
        print('working on row %d' % i)

        row = to_process.iloc[i]

        # all still-open jobs for the same workflow/dataset/hyperparameter combination
        query = '''select * from %s where workflow='%s' and openmlid=%d and hyperparameters='%s' and status='created';''' % (
            table_name, row.workflow, row.openmlid, row.hyperparameters)

        datas = pd.read_sql_query(query, cnx)
        if len(datas) < 1:
            print("this row doesn't have any jobs remaining... too bad!")
        else:
            # smallest training-set anchor of the row that triggered postprocessing
            trainsize_small = json.loads(row.train_sizes)[0]

            # collect every train_sizes setting whose smallest anchor is larger
            trainsizes_todo = []
            for train_size in datas['train_sizes'].unique():
                train_size_ = json.loads(train_size)
                if train_size_[0] > trainsize_small:
                    trainsizes_todo.append(train_size)

            for trainsize in trainsizes_todo:
                query_list.append(
                    '''update %s set status='skipped' where workflow='%s' and openmlid=%d and hyperparameters='%s' and status='created' and train_sizes='%s';''' % (
                        table_name, row.workflow, row.openmlid, row.hyperparameters, trainsize))

        # clear the flag so this row is not picked up again
        query_list.append('''update %s set postprocess=0 where id=%d''' % (table_name, row.ID))

    print("I have to execute %d queries... Let's get to work!" % len(query_list))

    affected_rows = []
    if len(query_list) > 0:
        cursor = cnx.cursor()
        for query in query_list:
            print('performing query: %s' % query)
            tmp = cursor.execute(query)
            print('rows affected: %d' % tmp)
            affected_rows.append(tmp)
        cursor.close()
        cnx.commit()
    cnx.close()


while True:
    try:
        print('trying small...')
        postprocess_table('jobs_small')
        print('trying medium...')
        postprocess_table('jobs_medium')
        print('trying large...')
        postprocess_table('jobs_large')
    except Exception as e:
        print('failed with error %s' % str(e))
    print('going to sleep for 5 min...')
    time.sleep(60 * 5)
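The script polls the three job tables for rows whose postprocess flag is set, then marks as skipped every still-open job of the same (workflow, openmlid, hyperparameters) triple whose smallest training-set anchor exceeds that of the flagged row. A minimal illustration of that comparison on the stored JSON strings (made-up values):

import json

# train_sizes as stored in the database: JSON lists, smallest anchor first
flagged_row = '[16, 32, 64]'                   # row that set postprocess=1
open_jobs = ['[16, 32]', '[128, 256, 512]']    # still-open jobs (status='created')

smallest_done = json.loads(flagged_row)[0]     # 16

for ts in open_jobs:
    if json.loads(ts)[0] > smallest_done:
        print('skip jobs with train_sizes=%s' % ts)   # only '[128, 256, 512]'
    else:
        print('keep jobs with train_sizes=%s' % ts)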
4 changes: 3 additions & 1 deletion publications/2023-neurips/config/knn_large.cfg
@@ -4,6 +4,8 @@ provider = mysql
database = db_lcdb
table = jobs_large

n_jobs = 2

# train_size and hyperparameters are omitted since they are computed automatically
keyfields = workflow:text, openmlid:int, valid_prop: float, test_prop: float, seed_outer:int, seed_inner:int, train_sizes:text, hyperparameters:text, monotonic:boolean, maxruntime:int, measure_memory:boolean,
workflow = lcdb.workflow.sklearn.KNNWorkflow
@@ -19,5 +21,5 @@ monotonic = 1
maxruntime = 1800
measure_memory = 0

resultfields = result:LONGTEXT
resultfields = result:LONGTEXT, postprocess:boolean
resultfields.timestamps = false
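The same two-line change appears in every config below: n_jobs = 2 lets each worker process two jobs in parallel, and the new postprocess:boolean resultfield is exactly the flag the polling script above selects on (where postprocess=1). A quick standard-library check of such a file; this is a sketch that assumes the keys live under a [PY_EXPERIMENTER] section, as the config.get("PY_EXPERIMENTER", ...) lookups in _util.py below suggest:

from configparser import ConfigParser

cfg = ConfigParser()
cfg.read("config/knn_large.cfg")
section = cfg["PY_EXPERIMENTER"]

print(section.getint("n_jobs"))    # 2
print(section["resultfields"])     # result:LONGTEXT, postprocess:boolean

# split the "name:type" pairs and confirm the new flag is declared
names = [f.strip().split(":")[0] for f in section["resultfields"].split(",")]
assert "postprocess" in names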
4 changes: 3 additions & 1 deletion publications/2023-neurips/config/knn_medium.cfg
@@ -4,6 +4,8 @@ provider = mysql
database = db_lcdb
table = jobs_medium

n_jobs = 2

# train_size and hyperparameters are omitted since they are computed automatically
keyfields = workflow:text, openmlid:int, valid_prop: float, test_prop: float, seed_outer:int, seed_inner:int, train_sizes:text, hyperparameters:text, monotonic:boolean, maxruntime:int, measure_memory:boolean,
workflow = lcdb.workflow.sklearn.KNNWorkflow
@@ -19,5 +21,5 @@ monotonic = 1
maxruntime = 1800
measure_memory = 0

resultfields = result:LONGTEXT
resultfields = result:LONGTEXT, postprocess:boolean
resultfields.timestamps = false
4 changes: 3 additions & 1 deletion publications/2023-neurips/config/knn_small.cfg
@@ -4,6 +4,8 @@ provider = mysql
database = db_lcdb
table = jobs_small

n_jobs = 2

# train_size and hyperparameters are omitted since they are computed automatically
keyfields = workflow:text, openmlid:int, valid_prop: float, test_prop: float, seed_outer:int, seed_inner:int, train_sizes:text, hyperparameters:text, monotonic:boolean, maxruntime:int, measure_memory:boolean,
workflow = lcdb.workflow.sklearn.KNNWorkflow
@@ -19,5 +21,5 @@ monotonic = 1
maxruntime = 1800
measure_memory = 0

resultfields = result:LONGTEXT
resultfields = result:LONGTEXT, postprocess:boolean
resultfields.timestamps = false
4 changes: 3 additions & 1 deletion publications/2023-neurips/config/liblinear_large.cfg
@@ -4,6 +4,8 @@ provider = mysql
database = db_lcdb
table = jobs_large

n_jobs = 2

# train_size and hyperparameters are omitted since they are computed automatically
keyfields = workflow:text, openmlid:int, valid_prop: float, test_prop: float, seed_outer:int, seed_inner:int, train_sizes:text, hyperparameters:text, monotonic:boolean, maxruntime:int, measure_memory:boolean,
workflow = lcdb.workflow.sklearn.LibLinearWorkflow
@@ -19,5 +21,5 @@ monotonic = 1
maxruntime = 1800
measure_memory = 0

resultfields = result:LONGTEXT
resultfields = result:LONGTEXT, postprocess:boolean
resultfields.timestamps = false
4 changes: 3 additions & 1 deletion publications/2023-neurips/config/liblinear_medium.cfg
@@ -4,6 +4,8 @@ provider = mysql
database = db_lcdb
table = jobs_medium

n_jobs = 2

# train_size and hyperparameters are omitted since they are computed automatically
keyfields = workflow:text, openmlid:int, valid_prop: float, test_prop: float, seed_outer:int, seed_inner:int, train_sizes:text, hyperparameters:text, monotonic:boolean, maxruntime:int, measure_memory:boolean,
workflow = lcdb.workflow.sklearn.LibLinearWorkflow
@@ -19,5 +21,5 @@ monotonic = 1
maxruntime = 1800
measure_memory = 0

resultfields = result:LONGTEXT
resultfields = result:LONGTEXT, postprocess:boolean
resultfields.timestamps = false
4 changes: 3 additions & 1 deletion publications/2023-neurips/config/liblinear_small.cfg
@@ -4,6 +4,8 @@ provider = mysql
database = db_lcdb
table = jobs_small

n_jobs = 2

# train_size and hyperparameters are omitted since they are computed automatically
keyfields = workflow:text, openmlid:int, valid_prop: float, test_prop: float, seed_outer:int, seed_inner:int, train_sizes:text, hyperparameters:text, monotonic:boolean, maxruntime:int, measure_memory:boolean,
workflow = lcdb.workflow.sklearn.LibLinearWorkflow
@@ -19,5 +21,5 @@ monotonic = 1
maxruntime = 1800
measure_memory = 0

resultfields = result:LONGTEXT
resultfields = result:LONGTEXT, postprocess:boolean
resultfields.timestamps = false
4 changes: 3 additions & 1 deletion publications/2023-neurips/config/libsvm_large.cfg
@@ -4,6 +4,8 @@ provider = mysql
database = db_lcdb
table = jobs_large

n_jobs = 2

# train_size and hyperparameters are omitted since they are computed automatically
keyfields = workflow:text, openmlid:int, valid_prop: float, test_prop: float, seed_outer:int, seed_inner:int, train_sizes:text, hyperparameters:text, monotonic:boolean, maxruntime:int, measure_memory:boolean,
workflow = lcdb.workflow.sklearn.LibSVMWorkflow
@@ -19,5 +21,5 @@ monotonic = 1
maxruntime = 1800
measure_memory = 0

resultfields = result:LONGTEXT
resultfields = result:LONGTEXT, postprocess:boolean
resultfields.timestamps = false
4 changes: 3 additions & 1 deletion publications/2023-neurips/config/libsvm_medium.cfg
@@ -4,6 +4,8 @@ provider = mysql
database = db_lcdb
table = jobs_medium

n_jobs = 2

# train_size and hyperparameters are omitted since they are computed automatically
keyfields = workflow:text, openmlid:int, valid_prop: float, test_prop: float, seed_outer:int, seed_inner:int, train_sizes:text, hyperparameters:text, monotonic:boolean, maxruntime:int, measure_memory:boolean,
workflow = lcdb.workflow.sklearn.LibSVMWorkflow
@@ -19,5 +21,5 @@ monotonic = 1
maxruntime = 1800
measure_memory = 0

resultfields = result:LONGTEXT
resultfields = result:LONGTEXT, postprocess:boolean
resultfields.timestamps = false
4 changes: 3 additions & 1 deletion publications/2023-neurips/config/libsvm_small.cfg
@@ -4,6 +4,8 @@ provider = mysql
database = db_lcdb
table = jobs_small

n_jobs = 2

# train_size and hyperparameters are omitted since they are computed automatically
keyfields = workflow:text, openmlid:int, valid_prop: float, test_prop: float, seed_outer:int, seed_inner:int, train_sizes:text, hyperparameters:text, monotonic:boolean, maxruntime:int, measure_memory:boolean,
workflow = lcdb.workflow.sklearn.LibSVMWorkflow
@@ -19,5 +21,5 @@ monotonic = 1
maxruntime = 1800
measure_memory = 0

resultfields = result:LONGTEXT
resultfields = result:LONGTEXT, postprocess:boolean
resultfields.timestamps = false
1 change: 0 additions & 1 deletion publications/2023-neurips/lcdb/cli/_create.py
@@ -88,6 +88,5 @@ def main(
    pd.DataFrame(configs, columns=skopt_space.dimension_names).to_csv(
        output_file, index=False
    )

    if verbose:
        print(f"Experiments written to {output_file}")
57 changes: 49 additions & 8 deletions publications/2023-neurips/lcdb/workflow/_util.py
@@ -245,6 +245,7 @@ def get_all_experiments(
    seed: int,
    max_num_anchors_per_row: int,
    LHS: bool,
    random_hps_per_dataset: bool,
) -> List[Dict]:
    """Create a sample of experimental configurations for a given workflow.
@@ -264,14 +265,54 @@
        max_num_anchors_per_row=max_num_anchors_per_row,
    )

    # import the workflow class
    workflow_path = config.get("PY_EXPERIMENTER", "workflow")
    workflow_class = import_attr_from_module(workflow_path)

    config_space = workflow_class.get_config_space()
    default_config = get_default_config(config_space)

    config_space.seed(seed)
    df_experiments_grouped = df_experiments.groupby("openmlid")

    experiments = []

    for name, group in df_experiments_grouped:
        print('working on dataset %d...' % name)
        # import the workflow class
        workflow_path = config.get("PY_EXPERIMENTER", "workflow")
        workflow_class = import_attr_from_module(workflow_path)

        config_space = workflow_class.get_config_space()
        default_config = get_default_config(config_space)

        # offset the seed by the dataset id so that, when requested, every
        # dataset draws its own hyperparameter sample
        seed_post_processed = seed
        if random_hps_per_dataset:
            seed_post_processed = seed_post_processed + int(name)
        config_space.seed(seed_post_processed)

        if LHS:
            print('using LHS with seed %d...' % seed_post_processed)
            lhs_generator = LHSGenerator(config_space, n=num_configs, seed=seed_post_processed)
            hp_samples = lhs_generator.generate()
        else:
            print('using random sampling with seed %d...' % seed_post_processed)
            hp_samples = config_space.sample_configuration(num_configs)
            if num_configs == 1:
                hp_samples = [hp_samples]
        hp_samples.insert(0, default_config)

        # create all rows for the experiments
        experiments = experiments + [
            {
                "workflow": workflow_path,
                "openmlid": openmlid,
                "valid_prop": v_p,
                "test_prop": t_p,
                "seed_outer": s_o,
                "seed_inner": s_i,
                "train_sizes": train_sizes,
                "maxruntime": maxruntime,
                "hyperparameters": dict(hp),
                "monotonic": mon,
                "measure_memory": measure_memory,
            }
            for (openmlid, v_p, t_p, s_o, s_i, train_sizes, mon, maxruntime, measure_memory), hp in it.product(
                group.values, hp_samples
            )
        ]

    if LHS:
        print("using LHS...")

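The main change in get_all_experiments is the new random_hps_per_dataset switch: hyperparameter sampling now happens per dataset group, and when the switch is on, the sampling seed is offset by the OpenML dataset id, so every dataset draws its own reproducible configurations instead of all datasets sharing one sample. Below is a self-contained sketch of the seeding effect, using ConfigSpace (which the config_space.seed(...) and sample_configuration(...) calls above appear to come from); the search space itself is made up:

from ConfigSpace import ConfigurationSpace

base_seed = 42
cs = ConfigurationSpace({"n_neighbors": (1, 50)})  # toy space, not the real workflow's

# per-dataset offset, mirroring seed_post_processed = seed + int(name)
for openmlid in (3, 6):
    cs.seed(base_seed + openmlid)
    print(openmlid, cs.sample_configuration(2))    # a different sample per dataset

cs.seed(base_seed + 3)
print(3, cs.sample_configuration(2))               # re-seeding reproduces dataset 3's sample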