From 584e302c13f5714ec80ee4c4837c34c4c02c18a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20K=C3=B6hler?= <27728103+Ceyron@users.noreply.github.com>
Date: Mon, 24 Jun 2024 10:55:44 +0200
Subject: [PATCH] Dataset scraper (#1)

* Simple scraping utility

* Scraper can save to disk

* Add curated list of scenarios in apebench and remove issues

* Also produce and write test data

* Release memory after creation

* Only export 10 test trajectories in 3D

* Add simple docstring
---
 apebench/__init__.py |   2 +
 apebench/_scraper.py | 255 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 257 insertions(+)
 create mode 100644 apebench/_scraper.py

diff --git a/apebench/__init__.py b/apebench/__init__.py
index 58136ae..05d8fa9 100644
--- a/apebench/__init__.py
+++ b/apebench/__init__.py
@@ -1,3 +1,4 @@
+from . import _scraper as scraper
 from . import scenarios
 from ._extensions import arch_extensions
 from ._run import (
@@ -34,4 +35,5 @@
     "melt_sample_rollouts",
     "read_in_kwargs",
     "arch_extensions",
+    "scraper",
 ]
diff --git a/apebench/_scraper.py b/apebench/_scraper.py
new file mode 100644
index 0000000..c85aa0c
--- /dev/null
+++ b/apebench/_scraper.py
@@ -0,0 +1,255 @@
+"""
+Utilities to scrape APEBench datasets into numpy arrays and save them to disk.
+"""
+
+import json
+import logging
+from dataclasses import asdict
+from typing import Optional
+
+import jax.numpy as jnp
+
+from .scenarios import scenario_dict
+
+
+def scrape_data_and_metadata(
+    folder: Optional[str] = None,
+    *,
+    scenario: str,
+    name: str = "auto",
+    **scenario_kwargs,
+):
+    """
+    Produce train data, test data, and metadata for a given scenario. Optionally
+    write them to disk.
+
+    **Arguments:**
+
+    - `folder` (str, optional): Folder to save the data and metadata to. If
+        None, returns the data and metadata as jax arrays and a dictionary,
+        respectively.
+    - `scenario` (str): Name of the scenario to produce data for. Must be one of
+        `apebench.scenarios.scenario_dict`.
+    - `name` (str, optional): Name of the scenario. If "auto", the name is
+        automatically generated based on the scenario and its additional
+        arguments.
+    - `**scenario_kwargs`: Additional arguments to pass to the scenario. All
+        attributes of a scenario can be modified by passing them as keyword
+        arguments.
+
+    **Returns:**
+
+    - If `folder` is None: the tuple `(train_data, metadata)`. The test data is
+        still generated and checked for NaNs, but it is not returned.
+    - Otherwise: None. The files `{name}.json`, `{name}_train.npy` and
+        `{name}_test.npy` are written into `folder`.
+    """
+    scenario = scenario_dict[scenario](**scenario_kwargs)
+    if name == "auto":
+        name = scenario.get_scenario_name()
+
+        # Append every overridden attribute so that differently configured
+        # variants of one scenario end up with distinct names (and files).
+        additional_infos = ", ".join(
+            f"{key}={value}" for key, value in scenario_kwargs.items()
+        )
+        if additional_infos:
+            name += "__" + additional_infos
+
+    logging.info(f"Producing train data for {name}")
+    train_data = scenario.get_train_data()
+    train_num_nans = jnp.sum(jnp.isnan(train_data))
+    if train_num_nans > 0:
+        logging.warning(f"Train data contains {train_num_nans} NaNs")
+
+    logging.info(f"Producing test data for {name}")
+    test_data = scenario.get_test_data()
+    test_num_nans = jnp.sum(jnp.isnan(test_data))
+    if test_num_nans > 0:
+        logging.warning(f"Test data contains {test_num_nans} NaNs")
+
+    metadata = {
+        "name": name,
+        "info": asdict(scenario),
+    }
+
+    if folder is None:
+        return train_data, metadata
+
+    with open(f"{folder}/{name}.json", "w") as f:
+        json.dump(metadata, f)
+    jnp.save(f"{folder}/{name}_train.npy", train_data)
+    jnp.save(f"{folder}/{name}_test.npy", test_data)
+
+    # Release the local references to the arrays once they are on disk.
+    del train_data, test_data
+    return None
+
+
+CURATION_APEBENCH_V1 = [
+    # 1D - Linear
+    {
+        "scenario": "diff_adv",
+    },
+    {
+        "scenario": "diff_diff",
+    },
+    {
+        "scenario": "diff_adv_diff",
+    },
+    {
+        "scenario": "diff_disp",
+    },
+    {
+        "scenario": "diff_hyp_diff",
+    },
+    # 1D - Nonlinear
+    {"scenario": "diff_burgers"},
+    {"scenario": "diff_kdv"},
+    {"scenario": "diff_ks"},
+    {"scenario": "diff_ks_cons"},
+    # 1D - Reaction-Diffusion
+    {"scenario": "diff_fisher"},
+    # 2D - Linear
+    {"scenario": "diff_adv", "num_spatial_dims": 2},
+    {"scenario": "diff_diff", "num_spatial_dims": 2},
+    {"scenario": "diff_adv_diff", "num_spatial_dims": 2},
+    {"scenario": "diff_disp", "num_spatial_dims": 2},
+    {"scenario": "diff_hyp_diff", "num_spatial_dims": 2},
+    # 2D - Linear Special
+    {
+        "scenario": "phy_unbal_adv",
+        "num_spatial_dims": 2,
+        "advection_coef_vector": (0.01, -0.04),
+    },
+    {"scenario": "phy_diag_diff", "num_spatial_dims": 2},
+    {"scenario": "phy_aniso_diff", "num_spatial_dims": 2},
+    {"scenario": "phy_mix_disp", "num_spatial_dims": 2},
+    {"scenario": "phy_mix_hyp", "num_spatial_dims": 2},
+    # 2D - Nonlinear
+    {"scenario": "diff_burgers", "num_spatial_dims": 2},
+    {"scenario": "diff_burgers_sc", "num_spatial_dims": 2},
+    {"scenario": "diff_kdv", "num_spatial_dims": 2},
+    {"scenario": "diff_ks", "num_spatial_dims": 2},
+    {"scenario": "phy_decay_turb", "num_spatial_dims": 2},
+    {"scenario": "phy_kolm_flow", "num_spatial_dims": 2},
+    # 2D - Reaction-Diffusion
+    {"scenario": "diff_fisher", "num_spatial_dims": 2},
+    {"scenario": "phy_gs_type", "num_spatial_dims": 2},
+    {"scenario": "phy_sh", "num_spatial_dims": 2},
+    # 3D - Linear
+    {
+        "scenario": "diff_adv",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "diff_diff",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "diff_adv_diff",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "diff_disp",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "diff_hyp_diff",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    # 3D - Linear Special
+    {
+        "scenario": "phy_unbal_adv",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "phy_diag_diff",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "diffusion_coef_vector": (0.001, 0.002, 0.0004),
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "phy_aniso_diff",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "diffusion_coef_matrix": (
+            (0.001, 0.0005, 0.0003),
+            (0.0005, 0.002, 0.0002),
+            (0.0003, 0.0002, 0.0004),
+        ),
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "phy_mix_disp",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "phy_mix_hyp",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    # 3D - Nonlinear
+    {
+        "scenario": "diff_burgers",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "diff_burgers_sc",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "diff_kdv",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "diff_ks",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    # 3D - Reaction-Diffusion
+    {
+        "scenario": "diff_fisher",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "phy_gs_type",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+    {
+        "scenario": "phy_sh",
+        "num_spatial_dims": 3,
+        "num_points": 32,
+        "num_test_samples": 10,
+    },
+]
+"""
+Collection of default scenarios as used in the original APEBench paper
+"""