From 4a22a065c22b8da492a7e8ee7dbfbb2ceb532cfc Mon Sep 17 00:00:00 2001
From: Henry Addison
Date: Mon, 18 Mar 2024 15:35:46 +0000
Subject: [PATCH] clean up bin dir and move blue pebble helpers into separate
 dir

---
 bin/add-ensemble-member-dim-to-predictions | 119 ---------------------
 bin/{ => bp}/bp-tb                         |   0
 bin/{ => bp}/queue-sampling                |   0
 bin/{ => bp}/queue-training                |   0
 bin/{ => bp}/train-sample                  |   0
 bin/deterministic/{ => bp}/queue-sampling  |   0
 bin/deterministic/{ => bp}/queue-training  |   0
 bin/deterministic/{ => bp}/train-sample    |   0
 bin/deterministic/mv-xfm-to-keyed-dir      |  55 ----------
 bin/deterministic/netcdf-to-numpy.py       |  62 -----------
 bin/mv-xfm-to-keyed-dir                    |  60 -----------
 bin/split-samples-by-time-period           |  56 ----------
 12 files changed, 352 deletions(-)
 delete mode 100755 bin/add-ensemble-member-dim-to-predictions
 rename bin/{ => bp}/bp-tb (100%)
 rename bin/{ => bp}/queue-sampling (100%)
 rename bin/{ => bp}/queue-training (100%)
 rename bin/{ => bp}/train-sample (100%)
 rename bin/deterministic/{ => bp}/queue-sampling (100%)
 rename bin/deterministic/{ => bp}/queue-training (100%)
 rename bin/deterministic/{ => bp}/train-sample (100%)
 delete mode 100755 bin/deterministic/mv-xfm-to-keyed-dir
 delete mode 100644 bin/deterministic/netcdf-to-numpy.py
 delete mode 100755 bin/mv-xfm-to-keyed-dir
 delete mode 100755 bin/split-samples-by-time-period

diff --git a/bin/add-ensemble-member-dim-to-predictions b/bin/add-ensemble-member-dim-to-predictions
deleted file mode 100755
index 4626d9469..000000000
--- a/bin/add-ensemble-member-dim-to-predictions
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/usr/bin/env python
-# make sure all datasets have ensemble member and add default to any that don't
-
-import glob
-import logging
-import os
-import shutil
-import xarray as xr
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO, format="%(levelname)s %(asctime)s: %(message)s")
-
-DEFAULT_ENSEMBLE_MEMBER = "01"
-
-
-def fix_file(nc_filepath):
-    logger.info(f"Working on {nc_filepath}")
-    ds = xr.open_dataset(nc_filepath)
-
-    if "ensemble_member" in ds.dims:
-        logger.info(f"Already has ensemble member: {nc_filepath}")
-        ds.close()
-        return
-
-    logger.info(f"Fixing {nc_filepath}")
-    ds = ds.load()
-    ds.close()
-    ds = ds.expand_dims(dict(ensemble_member=[DEFAULT_ENSEMBLE_MEMBER]))
-
-    dirpath, filename = os.path.split(nc_filepath)
-    new_filepath = os.path.join(dirpath, "01", filename)
-    os.makedirs(os.path.dirname(new_filepath), exist_ok=True)
-
-    if "sample_id" in ds.dims:
-        ds = ds.squeeze(dim="sample_id")
-
-    ds.to_netcdf(new_filepath)
-
-    ds = xr.open_dataset(new_filepath)
-    assert list(ds["pred_pr"].dims) == [
-        "ensemble_member",
-        "time",
-        "grid_latitude",
-        "grid_longitude",
-    ], list(ds["pred_pr"].dims)
-    assert ds["pred_pr"].shape[0] == 1
-    assert ds["pred_pr"].shape[1] > 0
-    assert ds["pred_pr"].shape[2] == 64
-    assert ds["pred_pr"].shape[3] == 64
-    assert ds["pred_pr"].isnull().sum().values.item() == 0
-    logger.info(f"Removing original prediction file: {nc_filepath}")
-    os.remove(nc_filepath)
-
-
-def main():
-    diff_models_glob = os.path.join(
-        os.getenv("DERIVED_DATA"),
-        "workdirs",
-        "score-sde",
-        "*",  # sde
-        "xarray_cncsnpp_continuous",
-        "*",  # model name
-    )
-
-    unet_models_glob = os.path.join(
-        os.getenv("DERIVED_DATA"),
-        "workdirs",
-        "u-net",
-        "*",  # model name
-    )
-
-    id_linpr_models_glob = os.path.join(
-        os.getenv("DERIVED_DATA"),
-        "workdirs",
-        "id-linpr",  # model name
-    )
-
-    model_dirs = (
-        glob.glob(diff_models_glob)
-        + glob.glob(unet_models_glob)
-        + glob.glob(id_linpr_models_glob)
-    )
-    for model_dir in model_dirs:
-        if os.path.basename(model_dir) == "archive":
-            continue
-
-        samples_glob = os.path.join(
-            model_dir,
-            "samples",
-            "*",  # checkpoint
-            "*",  # dataset
-            "*",  # input_xfm
-            "*",  # split
-            "predictions-*.nc",
-        )
-        sample_filepaths = glob.glob(samples_glob)
-        for sample_filepath in sample_filepaths:
-            fix_file(sample_filepath)
-
-        sample_config_files_glob = os.path.join(
-            model_dir,
-            "samples",
-            "*",  # checkpoint
-            "*",  # dataset
-            "*",  # input_xfm
-            "*",  # split
-            "config.yml",
-        )
-
-        for sample_config_filepath in glob.glob(sample_config_files_glob):
-            dirpath, filename = os.path.split(sample_config_filepath)
-            new_filepath = os.path.join(dirpath, "01", filename)
-            os.makedirs(os.path.dirname(new_filepath), exist_ok=True)
-            logger.info(f"moving config {sample_config_filepath} to {new_filepath}")
-            shutil.move(sample_config_filepath, new_filepath)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/bin/bp-tb b/bin/bp/bp-tb
similarity index 100%
rename from bin/bp-tb
rename to bin/bp/bp-tb
diff --git a/bin/queue-sampling b/bin/bp/queue-sampling
similarity index 100%
rename from bin/queue-sampling
rename to bin/bp/queue-sampling
diff --git a/bin/queue-training b/bin/bp/queue-training
similarity index 100%
rename from bin/queue-training
rename to bin/bp/queue-training
diff --git a/bin/train-sample b/bin/bp/train-sample
similarity index 100%
rename from bin/train-sample
rename to bin/bp/train-sample
diff --git a/bin/deterministic/queue-sampling b/bin/deterministic/bp/queue-sampling
similarity index 100%
rename from bin/deterministic/queue-sampling
rename to bin/deterministic/bp/queue-sampling
diff --git a/bin/deterministic/queue-training b/bin/deterministic/bp/queue-training
similarity index 100%
rename from bin/deterministic/queue-training
rename to bin/deterministic/bp/queue-training
diff --git a/bin/deterministic/train-sample b/bin/deterministic/bp/train-sample
similarity index 100%
rename from bin/deterministic/train-sample
rename to bin/deterministic/bp/train-sample
diff --git a/bin/deterministic/mv-xfm-to-keyed-dir b/bin/deterministic/mv-xfm-to-keyed-dir
deleted file mode 100755
index 4b8eb249c..000000000
--- a/bin/deterministic/mv-xfm-to-keyed-dir
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-# move cached input transforms and samples to folder that includes the transform key
-
-import glob
-import os
-from ml_collections import config_dict
-import shutil
-import yaml
-
-
-def main():
-    model_run_wildcard_path = os.path.join(
-        os.getenv("DERIVED_DATA"), "workdirs", "u-net", "*"
-    )
-    print(model_run_wildcard_path)
-    model_run_glob = glob.glob(model_run_wildcard_path)
-    for model_run_path in model_run_glob:
-        # skip the archive directory
-        if model_run_path.endswith("archive"):
-            continue
-        print(f"Working on {model_run_path}", flush=True)
-        # extract input transform key from config
-        model_run_config_path = os.path.join(model_run_path, "config.yml")
-        with open(model_run_config_path) as f:
-            config = config_dict.ConfigDict(yaml.unsafe_load(f))
-        try:
-            input_xfm_key = config["input_transform_key"]
-        except AttributeError:
-            input_xfm_key = "v1"
-
-        # move the transform
-        input_xfm_glob = glob.glob(
-            os.path.join(model_run_path, "transforms", "*", "input.pickle")
-        )
-        for input_xfm_path in input_xfm_glob:
-            new_xfm_path = input_xfm_path.replace(
-                "input.pickle", f"{input_xfm_key}/input.pickle"
-            )
-            os.makedirs(os.path.dirname(new_xfm_path), exist_ok=True)
-            print(f"Moving {input_xfm_path} to {new_xfm_path}")
-            shutil.move(input_xfm_path, new_xfm_path)
-
-        # move the samples
-        samples_glob = glob.glob(os.path.join(model_run_path, "samples", "*", "*"))
-        for sample_set_path in samples_glob:
-            new_sample_path = os.path.join(sample_set_path, input_xfm_key)
-            existing_sample_splits_glob = glob.glob(os.path.join(sample_set_path, "*"))
-            os.makedirs(new_sample_path, exist_ok=True)
-            for sample_split_path in existing_sample_splits_glob:
-                print(f"Moving {sample_split_path} to {new_sample_path}")
-                shutil.move(sample_split_path, new_sample_path)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/bin/deterministic/netcdf-to-numpy.py b/bin/deterministic/netcdf-to-numpy.py
deleted file mode 100644
index cac535973..000000000
--- a/bin/deterministic/netcdf-to-numpy.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import argparse
-import logging
-import os
-from pathlib import Path
-
-import numpy as np
-import xarray as xr
-
-
-def get_args():
-    parser = argparse.ArgumentParser(
-        description="Save a netcdf dataset to raw numpy on disk",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--input",
-        dest="input_dir",
-        type=Path,
-        required=True,
-        help="Base path to input storage",
-    )
-    parser.add_argument(
-        "--output",
-        dest="output_dir",
-        type=Path,
-        required=True,
-        help="Base path to storage output",
-    )
-    parser.add_argument(
-        "--variable",
-        dest="variable",
-        type=str,
-        required=True,
-        help="Name of variable to extract and save from netCDF dataset",
-    )
-
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    logging.basicConfig(
-        level=logging.INFO, format="%(levelname)s %(asctime)s: %(message)s"
-    )
-
-    args = get_args()
-
-    output_path = args.output_dir / f"{args.variable}.npy"
-
-    logging.info(
-        f"Saving {args.variable} from dataset in {args.input_dir} to {output_path}"
-    )
-
-    os.makedirs(args.output_dir, exist_ok=True)
-
-    ds = xr.open_mfdataset(str(args.input_dir / "*.nc"))
-
-    # don't need the ensemble member dimension for training
-    np_array = ds.isel(ensemble_member=0)[args.variable].values
-
-    np.save(output_path, np_array)
-
-    logging.info("All done")
diff --git a/bin/mv-xfm-to-keyed-dir b/bin/mv-xfm-to-keyed-dir
deleted file mode 100755
index 3f90399ae..000000000
--- a/bin/mv-xfm-to-keyed-dir
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/env python
-# move cached input transforms and samples to folder that includes the transform key
-
-import glob
-import os
-from ml_collections import config_dict
-import shutil
-import yaml
-
-
-def main():
-    model_run_wildcard_path = os.path.join(
-        os.getenv("DERIVED_DATA"),
-        "workdirs",
-        "score-sde",
-        "*",
-        "xarray_cncsnpp_continuous",
-        "*",
-    )
-    print(model_run_wildcard_path)
-    model_run_glob = glob.glob(model_run_wildcard_path)
-    for model_run_path in model_run_glob:
-        # skip the archive directory
-        if model_run_path.endswith("archive"):
-            continue
-        print(f"Working on {model_run_path}", flush=True)
-        # extract input transform key from config
-        model_run_config_path = os.path.join(model_run_path, "config.yml")
-        with open(model_run_config_path) as f:
-            config = config_dict.ConfigDict(yaml.unsafe_load(f))
-        try:
-            input_xfm_key = config.data.input_transform_key
-        except AttributeError:
-            input_xfm_key = "v1"
-
-        # move the transform
-        input_xfm_glob = glob.glob(
-            os.path.join(model_run_path, "transforms", "*", "input.pickle")
-        )
-        for input_xfm_path in input_xfm_glob:
-            new_xfm_path = input_xfm_path.replace(
-                "input.pickle", f"{input_xfm_key}/input.pickle"
-            )
-            os.makedirs(os.path.dirname(new_xfm_path), exist_ok=True)
-            print(f"Moving {input_xfm_path} to {new_xfm_path}")
-            shutil.move(input_xfm_path, new_xfm_path)
-
-        # move the samples
-        samples_glob = glob.glob(os.path.join(model_run_path, "samples", "*", "*"))
-        for sample_set_path in samples_glob:
-            new_sample_path = os.path.join(sample_set_path, input_xfm_key)
-            existing_sample_splits_glob = glob.glob(os.path.join(sample_set_path, "*"))
-            os.makedirs(new_sample_path, exist_ok=True)
-            for sample_split_path in existing_sample_splits_glob:
-                print(f"Moving {sample_split_path} to {new_sample_path}")
-                shutil.move(sample_split_path, new_sample_path)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/bin/split-samples-by-time-period b/bin/split-samples-by-time-period
deleted file mode 100755
index 83c656471..000000000
--- a/bin/split-samples-by-time-period
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/env python
-# Concatenate samples from a bunch of subsets of training set
-
-import cftime
-import glob
-import os
-import typer
-import xarray as xr
-
-app = typer.Typer()
-
-TIME_PERIODS = {
-    "historic": slice(
-        cftime.Datetime360Day(1980, 12, 1, 12, 0, 0, 0, has_year_zero=True),
-        cftime.Datetime360Day(2000, 11, 30, 12, 0, 0, 0, has_year_zero=True),
-    ),
-    "present": slice(
-        cftime.Datetime360Day(2020, 12, 1, 12, 0, 0, 0, has_year_zero=True),
-        cftime.Datetime360Day(2040, 11, 30, 12, 0, 0, 0, has_year_zero=True),
-    ),
-    "future": slice(
-        cftime.Datetime360Day(2060, 12, 1, 12, 0, 0, 0, has_year_zero=True),
-        cftime.Datetime360Day(2080, 11, 30, 12, 0, 0, 0, has_year_zero=True),
-    ),
-}
-
-
-def samples_dir(workdir, checkpoint, dataset, input_xfm, split):
-    return f"{os.getenv('DERIVED_DATA')}/workdirs/{workdir}/samples/{checkpoint}/{dataset}/{input_xfm}/{split}"
-
-
-@app.command()
-def main(
-    workdir: str, dataset: str, checkpoint="epoch-100", input_xfm="stan", split="val"
-):
-    pred_glob = os.path.join(
-        samples_dir(workdir, checkpoint, dataset, input_xfm, split), "predictions-*.nc"
-    )
-    typer.echo(pred_glob)
-    pred_paths = glob.glob(pred_glob)
-
-    for pred_path in pred_paths:
-        ds = xr.open_dataset(pred_path)
-        for tp in ["historic", "present", "future"]:
-            tp_output_path = os.path.join(
-                samples_dir(workdir, checkpoint, f"{dataset}-{tp}", input_xfm, split),
-                os.path.basename(pred_path),
-            )
-
-            typer.echo(f"save to {tp_output_path}")
-            os.makedirs(os.path.dirname(tp_output_path), exist_ok=True)
-            ds.sel(time=TIME_PERIODS[tp]).to_netcdf(tp_output_path)
-
-
-if __name__ == "__main__":
-    app()
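
Notes on the deleted scripts (reviewer commentary, not part of the diff):

1. add-ensemble-member-dim-to-predictions: the heart of the script was the
   expand_dims() call that retrofits a singleton ensemble_member dimension
   onto old prediction files. A minimal sketch of that transform, in case
   pre-fix files ever resurface (the filename here is illustrative, not a
   real path in this repo):

       import xarray as xr

       ds = xr.open_dataset("predictions-example.nc")  # hypothetical file
       if "ensemble_member" not in ds.dims:
           # prepend a length-1 ensemble_member dimension labelled "01",
           # matching DEFAULT_ENSEMBLE_MEMBER in the deleted script
           ds = ds.expand_dims({"ensemble_member": ["01"]})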
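
2. The two mv-xfm-to-keyed-dir variants read the transform key differently.
   The bin/ variant uses attribute access (config.data.input_transform_key),
   which raises AttributeError on old configs, so the fallback to "v1" works.
   The bin/deterministic/ variant uses item access
   (config["input_transform_key"]), which raises KeyError rather than
   AttributeError when the key is missing, so its except clause likely never
   fired. Should either helper be resurrected, a sketch of a lookup that
   sidesteps the mismatch (read_input_xfm_key is a hypothetical helper,
   assuming the config is an ml_collections ConfigDict):

       from ml_collections import config_dict

       def read_input_xfm_key(config: config_dict.ConfigDict) -> str:
           # U-Net configs kept the key at the top level; score-sde configs
           # nest it under config.data. Dict-style .get() handles both
           # layouts and makes the "v1" default explicit.
           section = config.get("data", config)
           return section.get("input_transform_key", "v1")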
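
3. split-samples-by-time-period: the header comment ("Concatenate samples
   from a bunch of subsets of training set") appears to describe an earlier
   role; what the script actually did was split prediction files into the
   three 20-year time slices. The cftime-based selection it relied on, as a
   usage sketch (dataset path illustrative):

       import cftime
       import xarray as xr

       historic = slice(
           cftime.Datetime360Day(1980, 12, 1, 12, has_year_zero=True),
           cftime.Datetime360Day(2000, 11, 30, 12, has_year_zero=True),
       )
       ds = xr.open_dataset("predictions-example.nc")  # hypothetical file
       ds.sel(time=historic).to_netcdf("predictions-historic.nc")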