From 4a22a065c22b8da492a7e8ee7dbfbb2ceb532cfc Mon Sep 17 00:00:00 2001
From: Henry Addison
Date: Mon, 18 Mar 2024 15:35:46 +0000
Subject: [PATCH] clean up bin dir and move blue pebble helpers into separate
 dir

---
 bin/add-ensemble-member-dim-to-predictions | 119 ---------------------
 bin/{ => bp}/bp-tb                         |   0
 bin/{ => bp}/queue-sampling                |   0
 bin/{ => bp}/queue-training                |   0
 bin/{ => bp}/train-sample                  |   0
 bin/deterministic/{ => bp}/queue-sampling  |   0
 bin/deterministic/{ => bp}/queue-training  |   0
 bin/deterministic/{ => bp}/train-sample    |   0
 bin/deterministic/mv-xfm-to-keyed-dir      |  55 ----------
 bin/deterministic/netcdf-to-numpy.py       |  62 -----------
 bin/mv-xfm-to-keyed-dir                    |  60 -----------
 bin/split-samples-by-time-period           |  56 ----------
 12 files changed, 352 deletions(-)
 delete mode 100755 bin/add-ensemble-member-dim-to-predictions
 rename bin/{ => bp}/bp-tb (100%)
 rename bin/{ => bp}/queue-sampling (100%)
 rename bin/{ => bp}/queue-training (100%)
 rename bin/{ => bp}/train-sample (100%)
 rename bin/deterministic/{ => bp}/queue-sampling (100%)
 rename bin/deterministic/{ => bp}/queue-training (100%)
 rename bin/deterministic/{ => bp}/train-sample (100%)
 delete mode 100755 bin/deterministic/mv-xfm-to-keyed-dir
 delete mode 100644 bin/deterministic/netcdf-to-numpy.py
 delete mode 100755 bin/mv-xfm-to-keyed-dir
 delete mode 100755 bin/split-samples-by-time-period

diff --git a/bin/add-ensemble-member-dim-to-predictions b/bin/add-ensemble-member-dim-to-predictions
deleted file mode 100755
index 4626d9469..000000000
--- a/bin/add-ensemble-member-dim-to-predictions
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/usr/bin/env python
-# make sure all datasets have ensemble member and add default to any that don't
-
-import glob
-import logging
-import os
-import shutil
-import xarray as xr
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO, format="%(levelname)s %(asctime)s: %(message)s")
-
-DEFAULT_ENSEMBLE_MEMBER = "01"
-
-
-def fix_file(nc_filepath):
-    logger.info(f"Working on {nc_filepath}")
-    ds = xr.open_dataset(nc_filepath)
-
-    if "ensemble_member" in ds.dims:
-        logger.info(f"Already has ensemble member: {nc_filepath}")
-        ds.close()
-        return
-
-    logger.info(f"Fixing {nc_filepath}")
-    ds = ds.load()
-    ds.close()
-    ds = ds.expand_dims(dict(ensemble_member=[DEFAULT_ENSEMBLE_MEMBER]))
-
-    dirpath, filename = os.path.split(nc_filepath)
-    new_filepath = os.path.join(dirpath, "01", filename)
-    os.makedirs(os.path.dirname(new_filepath), exist_ok=True)
-
-    if "sample_id" in ds.dims:
-        ds = ds.squeeze(dim="sample_id")
-
-    ds.to_netcdf(new_filepath)
-
-    ds = xr.open_dataset(new_filepath)
-    assert list(ds["pred_pr"].dims) == [
-        "ensemble_member",
-        "time",
-        "grid_latitude",
-        "grid_longitude",
-    ], list(ds["pred_pr"].dims)
-    assert ds["pred_pr"].shape[0] == 1
-    assert ds["pred_pr"].shape[1] > 0
-    assert ds["pred_pr"].shape[2] == 64
-    assert ds["pred_pr"].shape[3] == 64
-    assert ds["pred_pr"].isnull().sum().values.item() == 0
-    logger.info(f"Removing original prediction file: {nc_filepath}")
-    os.remove(nc_filepath)
-
-
-def main():
-    diff_models_glob = os.path.join(
-        os.getenv("DERIVED_DATA"),
-        "workdirs",
-        "score-sde",
-        "*",  # sde
-        "xarray_cncsnpp_continuous",
-        "*",  # model name
-    )
-
-    unet_models_glob = os.path.join(
-        os.getenv("DERIVED_DATA"),
-        "workdirs",
-        "u-net",
-        "*",  # model name
-    )
-
-    id_linpr_models_glob = os.path.join(
-        os.getenv("DERIVED_DATA"),
-        "workdirs",
-        "id-linpr",  # model name
-    )
-
-    model_dirs = (
-        glob.glob(diff_models_glob)
-        + glob.glob(unet_models_glob)
-        + glob.glob(id_linpr_models_glob)
-    )
-    for model_dir in model_dirs:
-        if os.path.basename(model_dir) == "archive":
-            continue
-
-        samples_glob = os.path.join(
-            model_dir,
-            "samples",
-            "*",  # checkpoint
-            "*",  # dataset
-            "*",  # input_xfm
-            "*",  # split
-            "predictions-*.nc",
-        )
-        sample_filepaths = glob.glob(samples_glob)
-        for sample_filepath in sample_filepaths:
-            fix_file(sample_filepath)
-
-        sample_config_files_glob = os.path.join(
-            model_dir,
-            "samples",
-            "*",  # checkpoint
-            "*",  # dataset
-            "*",  # input_xfm
-            "*",  # split
-            "config.yml",
-        )
-
-        for sample_config_filepath in glob.glob(sample_config_files_glob):
-            dirpath, filename = os.path.split(sample_config_filepath)
-            new_filepath = os.path.join(dirpath, "01", filename)
-            os.makedirs(os.path.dirname(new_filepath), exist_ok=True)
-            logger.info(f"moving config {sample_config_filepath} to {new_filepath}")
-            shutil.move(sample_config_filepath, new_filepath)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/bin/bp-tb b/bin/bp/bp-tb
similarity index 100%
rename from bin/bp-tb
rename to bin/bp/bp-tb
diff --git a/bin/queue-sampling b/bin/bp/queue-sampling
similarity index 100%
rename from bin/queue-sampling
rename to bin/bp/queue-sampling
diff --git a/bin/queue-training b/bin/bp/queue-training
similarity index 100%
rename from bin/queue-training
rename to bin/bp/queue-training
diff --git a/bin/train-sample b/bin/bp/train-sample
similarity index 100%
rename from bin/train-sample
rename to bin/bp/train-sample
diff --git a/bin/deterministic/queue-sampling b/bin/deterministic/bp/queue-sampling
similarity index 100%
rename from bin/deterministic/queue-sampling
rename to bin/deterministic/bp/queue-sampling
diff --git a/bin/deterministic/queue-training b/bin/deterministic/bp/queue-training
similarity index 100%
rename from bin/deterministic/queue-training
rename to bin/deterministic/bp/queue-training
diff --git a/bin/deterministic/train-sample b/bin/deterministic/bp/train-sample
similarity index 100%
rename from bin/deterministic/train-sample
rename to bin/deterministic/bp/train-sample
diff --git a/bin/deterministic/mv-xfm-to-keyed-dir b/bin/deterministic/mv-xfm-to-keyed-dir
deleted file mode 100755
index 4b8eb249c..000000000
--- a/bin/deterministic/mv-xfm-to-keyed-dir
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-# move cached input transforms and samples to folder that includes the transform key
-
-import glob
-import os
-from ml_collections import config_dict
-import shutil
-import yaml
-
-
-def main():
-    model_run_wildcard_path = os.path.join(
-        os.getenv("DERIVED_DATA"), "workdirs", "u-net", "*"
-    )
-    print(model_run_wildcard_path)
-    model_run_glob = glob.glob(model_run_wildcard_path)
-    for model_run_path in model_run_glob:
-        # skip the archive directory
-        if model_run_path.endswith("archive"):
-            continue
-        print(f"Working on {model_run_path}", flush=True)
-        # extract input transform key from config
-        model_run_config_path = os.path.join(model_run_path, "config.yml")
-        with open(model_run_config_path) as f:
-            config = config_dict.ConfigDict(yaml.unsafe_load(f))
-        try:
-            input_xfm_key = config["input_transform_key"]
-        except AttributeError:
-            input_xfm_key = "v1"
-
-        # move the transform
-        input_xfm_glob = glob.glob(
-            os.path.join(model_run_path, "transforms", "*", "input.pickle")
-        )
-        for input_xfm_path in input_xfm_glob:
-            new_xfm_path = input_xfm_path.replace(
-                "input.pickle", f"{input_xfm_key}/input.pickle"
-            )
-            os.makedirs(os.path.dirname(new_xfm_path), exist_ok=True)
-            print(f"Moving {input_xfm_path} to {new_xfm_path}")
-            shutil.move(input_xfm_path, new_xfm_path)
-
-        # move the samples
-        samples_glob = glob.glob(os.path.join(model_run_path, "samples", "*", "*"))
-        for sample_set_path in samples_glob:
-            new_sample_path = os.path.join(sample_set_path, input_xfm_key)
-            existing_sample_splits_glob = glob.glob(os.path.join(sample_set_path, "*"))
-            os.makedirs(new_sample_path, exist_ok=True)
-            for sample_split_path in existing_sample_splits_glob:
-                print(f"Moving {sample_split_path} to {new_sample_path}")
-                shutil.move(sample_split_path, new_sample_path)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/bin/deterministic/netcdf-to-numpy.py b/bin/deterministic/netcdf-to-numpy.py
deleted file mode 100644
index cac535973..000000000
--- a/bin/deterministic/netcdf-to-numpy.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import argparse
-import logging
-import os
-from pathlib import Path
-
-import numpy as np
-import xarray as xr
-
-
-def get_args():
-    parser = argparse.ArgumentParser(
-        description="Save a netcdf dataset to raw numpy on disk",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--input",
-        dest="input_dir",
-        type=Path,
-        required=True,
-        help="Base path to input storage",
-    )
-    parser.add_argument(
-        "--output",
-        dest="output_dir",
-        type=Path,
-        required=True,
-        help="Base path to storage output",
-    )
-    parser.add_argument(
-        "--variable",
-        dest="variable",
-        type=str,
-        required=True,
-        help="Name of variable to extract and save from netCDF dataset",
-    )
-
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    logging.basicConfig(
-        level=logging.INFO, format="%(levelname)s %(asctime)s: %(message)s"
-    )
-
-    args = get_args()
-
-    output_path = args.output_dir / f"{args.variable}.npy"
-
-    logging.info(
-        f"Saving {args.variable} from dataset in {args.input_dir} to {output_path}"
-    )
-
-    os.makedirs(args.output_dir, exist_ok=True)
-
-    ds = xr.open_mfdataset(str(args.input_dir / "*.nc"))
-
-    # don't need the ensemble member dimension for training
-    np_array = ds.isel(ensemble_member=0)[args.variable].values
-
-    np.save(output_path, np_array)
-
-    logging.info("All done")
diff --git a/bin/mv-xfm-to-keyed-dir b/bin/mv-xfm-to-keyed-dir
deleted file mode 100755
index 3f90399ae..000000000
--- a/bin/mv-xfm-to-keyed-dir
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/env python
-# move cached input transforms and samples to folder that includes the transform key
-
-import glob
-import os
-from ml_collections import config_dict
-import shutil
-import yaml
-
-
-def main():
-    model_run_wildcard_path = os.path.join(
-        os.getenv("DERIVED_DATA"),
-        "workdirs",
-        "score-sde",
-        "*",
-        "xarray_cncsnpp_continuous",
-        "*",
-    )
-    print(model_run_wildcard_path)
-    model_run_glob = glob.glob(model_run_wildcard_path)
-    for model_run_path in model_run_glob:
-        # skip the archive directory
-        if model_run_path.endswith("archive"):
-            continue
-        print(f"Working on {model_run_path}", flush=True)
-        # extract input transform key from config
-        model_run_config_path = os.path.join(model_run_path, "config.yml")
-        with open(model_run_config_path) as f:
-            config = config_dict.ConfigDict(yaml.unsafe_load(f))
-        try:
-            input_xfm_key = config.data.input_transform_key
-        except AttributeError:
-            input_xfm_key = "v1"
-
-        # move the transform
-        input_xfm_glob = glob.glob(
-            os.path.join(model_run_path, "transforms", "*", "input.pickle")
-        )
-        for input_xfm_path in input_xfm_glob:
-            new_xfm_path = input_xfm_path.replace(
-                "input.pickle", f"{input_xfm_key}/input.pickle"
-            )
-            os.makedirs(os.path.dirname(new_xfm_path), exist_ok=True)
-            print(f"Moving {input_xfm_path} to {new_xfm_path}")
-            shutil.move(input_xfm_path, new_xfm_path)
-
-        # move the samples
-        samples_glob = glob.glob(os.path.join(model_run_path, "samples", "*", "*"))
-        for sample_set_path in samples_glob:
-            new_sample_path = os.path.join(sample_set_path, input_xfm_key)
-            existing_sample_splits_glob = glob.glob(os.path.join(sample_set_path, "*"))
-            os.makedirs(new_sample_path, exist_ok=True)
-            for sample_split_path in existing_sample_splits_glob:
-                print(f"Moving {sample_split_path} to {new_sample_path}")
-                shutil.move(sample_split_path, new_sample_path)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/bin/split-samples-by-time-period b/bin/split-samples-by-time-period
deleted file mode 100755
index 83c656471..000000000
--- a/bin/split-samples-by-time-period
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/env python
-# Concatenate samples from a bunch of subsets of training set
-
-import cftime
-import glob
-import os
-import typer
-import xarray as xr
-
-app = typer.Typer()
-
-TIME_PERIODS = {
-    "historic": slice(
-        cftime.Datetime360Day(1980, 12, 1, 12, 0, 0, 0, has_year_zero=True),
-        cftime.Datetime360Day(2000, 11, 30, 12, 0, 0, 0, has_year_zero=True),
-    ),
-    "present": slice(
-        cftime.Datetime360Day(2020, 12, 1, 12, 0, 0, 0, has_year_zero=True),
-        cftime.Datetime360Day(2040, 11, 30, 12, 0, 0, 0, has_year_zero=True),
-    ),
-    "future": slice(
-        cftime.Datetime360Day(2060, 12, 1, 12, 0, 0, 0, has_year_zero=True),
-        cftime.Datetime360Day(2080, 11, 30, 12, 0, 0, 0, has_year_zero=True),
-    ),
-}
-
-
-def samples_dir(workdir, checkpoint, dataset, input_xfm, split):
-    return f"{os.getenv('DERIVED_DATA')}/workdirs/{workdir}/samples/{checkpoint}/{dataset}/{input_xfm}/{split}"
-
-
-@app.command()
-def main(
-    workdir: str, dataset: str, checkpoint="epoch-100", input_xfm="stan", split="val"
-):
-    pred_glob = os.path.join(
-        samples_dir(workdir, checkpoint, dataset, input_xfm, split), "predictions-*.nc"
-    )
-    typer.echo(pred_glob)
-    pred_paths = glob.glob(pred_glob)
-
-    for pred_path in pred_paths:
-        ds = xr.open_dataset(pred_path)
-        for tp in ["historic", "present", "future"]:
-            tp_output_path = os.path.join(
-                samples_dir(workdir, checkpoint, f"{dataset}-{tp}", input_xfm, split),
-                os.path.basename(pred_path),
-            )
-
-            typer.echo(f"save to {tp_output_path}")
-            os.makedirs(os.path.dirname(tp_output_path), exist_ok=True)
-            ds.sel(time=TIME_PERIODS[tp]).to_netcdf(tp_output_path)
-
-
-if __name__ == "__main__":
-    app()
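
Notes on the deleted scripts (reviewer commentary, not part of the diff):

1. add-ensemble-member-dim-to-predictions: the heart of the script was the
   expand_dims() call that retrofits a singleton ensemble_member dimension
   onto old prediction files. A minimal sketch of that transform, in case
   pre-fix files ever resurface (the filename here is illustrative, not a
   real path in this repo):

       import xarray as xr

       ds = xr.open_dataset("predictions-example.nc")  # hypothetical file
       if "ensemble_member" not in ds.dims:
           # prepend a length-1 ensemble_member dimension labelled "01",
           # matching DEFAULT_ENSEMBLE_MEMBER in the deleted script
           ds = ds.expand_dims({"ensemble_member": ["01"]})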
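
2. The two mv-xfm-to-keyed-dir variants read the transform key differently.
   The bin/ variant uses attribute access (config.data.input_transform_key),
   which raises AttributeError on old configs, so the fallback to "v1" works.
   The bin/deterministic/ variant uses item access
   (config["input_transform_key"]), which raises KeyError rather than
   AttributeError when the key is missing, so its except clause likely never
   fired. Should either helper be resurrected, a sketch of a lookup that
   sidesteps the mismatch (read_input_xfm_key is a hypothetical helper,
   assuming the config is an ml_collections ConfigDict):

       from ml_collections import config_dict

       def read_input_xfm_key(config: config_dict.ConfigDict) -> str:
           # U-Net configs kept the key at the top level; score-sde configs
           # nest it under config.data. Dict-style .get() handles both
           # layouts and makes the "v1" default explicit.
           section = config.get("data", config)
           return section.get("input_transform_key", "v1")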
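
3. split-samples-by-time-period: the header comment ("Concatenate samples
   from a bunch of subsets of training set") appears to describe an earlier
   role; what the script actually did was split prediction files into the
   three 20-year time slices. The cftime-based selection it relied on, as a
   usage sketch (dataset path illustrative):

       import cftime
       import xarray as xr

       historic = slice(
           cftime.Datetime360Day(1980, 12, 1, 12, has_year_zero=True),
           cftime.Datetime360Day(2000, 11, 30, 12, has_year_zero=True),
       )
       ds = xr.open_dataset("predictions-example.nc")  # hypothetical file
       ds.sel(time=historic).to_netcdf("predictions-historic.nc")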