Skip to content

Commit

Permalink
Dataset scraper (#1)
Browse files Browse the repository at this point in the history
* Simple scraping utility

* Scraper can save to disk

* Add curated list of scenarios in apebench and remove issues

* Also produce and write test data

* Release memory after creation

* Only export 10 test trajectories in 3D

* Add simple docstring
  • Loading branch information
Ceyron authored Jun 24, 2024
1 parent 35073c4 commit 584e302
Show file tree
Hide file tree
Showing 2 changed files with 252 additions and 0 deletions.
2 changes: 2 additions & 0 deletions apebench/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from . import _scraper as scraper
from . import scenarios
from ._extensions import arch_extensions
from ._run import (
Expand Down Expand Up @@ -34,4 +35,5 @@
"melt_sample_rollouts",
"read_in_kwargs",
"arch_extensions",
"scraper",
]
250 changes: 250 additions & 0 deletions apebench/_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
"""
Utilities to scrape APEBench datasets into numpy arrays and save them to disk.
"""

import json
import logging
from dataclasses import asdict

import jax.numpy as jnp

from .scenarios import scenario_dict


def scrape_data_and_metadata(
folder: str = None,
*,
scenario: str,
name: str = "auto",
**scenario_kwargs,
):
"""
Produce train data, test data, and metadata for a given scenario. Optionally
write them to disk.
**Arguments:**
- `folder` (str, optional): Folder to save the data and metadata to. If
None, returns the data and metadata as jax arrays and a dictionary,
respectively.
- `scenario` (str): Name of the scenario to produce data for. Must be one of
`apebench.scenarios.scenario_dict`.
- `name` (str, optional): Name of the scenario. If "auto", the name is
automatically generated based on the scenario and its additional
arguments.
- `**scenario_kwargs`: Additional arguments to pass to the scenario. All
attributes of a scenario can be modified by passing them as keyword
arguments.
"""
scenario = scenario_dict[scenario](**scenario_kwargs)
if name == "auto":
name = scenario.get_scenario_name()

additional_infos = []
for key, value in scenario_kwargs.items():
additional_infos.append(f"{key}={value}")
if len(additional_infos) > 0:
additional_infos = ", ".join(additional_infos)
additional_infos = "__" + additional_infos
else:
additional_infos = ""

name += additional_infos

logging.info(f"Producing train data for {name}")
train_data = scenario.get_train_data()
train_num_nans = jnp.sum(jnp.isnan(train_data))
if train_num_nans > 0:
logging.warning(f"Train data contains {train_num_nans} NaNs")

logging.info(f"Producing test data for {name}")
test_data = scenario.get_test_data()
test_num_nans = jnp.sum(jnp.isnan(test_data))
if test_num_nans > 0:
logging.warning(f"Test data contains {test_num_nans} NaNs")

info = asdict(scenario)

metadata = {
"name": name,
"info": info,
}

if folder is not None:
with open(f"{folder}/{name}.json", "w") as f:
json.dump(metadata, f)
jnp.save(f"{folder}/{name}_train.npy", train_data)
jnp.save(f"{folder}/{name}_test.npy", test_data)

del train_data, test_data
else:
return train_data, metadata


CURATION_APEBENCH_V1 = [
# 1D - Linear
{
"scenario": "diff_adv",
},
{
"scenario": "diff_diff",
},
{
"scenario": "diff_adv_diff",
},
{
"scenario": "diff_disp",
},
{
"scenario": "diff_hyp_diff",
},
# 1D - Nonlinear
{"scenario": "diff_burgers"},
{"scenario": "diff_kdv"},
{"scenario": "diff_ks"},
{"scenario": "diff_ks_cons"},
# 1D - Reaction-Diffusion
{"scenario": "diff_fisher"},
# 2D - Linear
{"scenario": "diff_adv", "num_spatial_dims": 2},
{"scenario": "diff_diff", "num_spatial_dims": 2},
{"scenario": "diff_adv_diff", "num_spatial_dims": 2},
{"scenario": "diff_disp", "num_spatial_dims": 2},
{"scenario": "diff_hyp_diff", "num_spatial_dims": 2},
# 2D - Linear Special
{
"scenario": "phy_unbal_adv",
"num_spatial_dims": 2,
"advection_coef_vector": (0.01, -0.04),
},
{"scenario": "phy_diag_diff", "num_spatial_dims": 2},
{"scenario": "phy_aniso_diff", "num_spatial_dims": 2},
{"scenario": "phy_mix_disp", "num_spatial_dims": 2},
{"scenario": "phy_mix_hyp", "num_spatial_dims": 2},
# 2D - Nonlinear
{"scenario": "diff_burgers", "num_spatial_dims": 2},
{"scenario": "diff_burgers_sc", "num_spatial_dims": 2},
{"scenario": "diff_kdv", "num_spatial_dims": 2},
{"scenario": "diff_ks", "num_spatial_dims": 2},
{"scenario": "phy_decay_turb", "num_spatial_dims": 2},
{"scenario": "phy_kolm_flow", "num_spatial_dims": 2},
# 2D - Reaction-Diffusion
{"scenario": "diff_fisher", "num_spatial_dims": 2},
{"scenario": "phy_gs_type", "num_spatial_dims": 2},
{"scenario": "phy_sh", "num_spatial_dims": 2},
# 3D - Linear
{
"scenario": "diff_adv",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
{
"scenario": "diff_diff",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
{
"scenario": "diff_adv_diff",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
{
"scenario": "diff_disp",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
{
"scenario": "diff_hyp_diff",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
# 3D - Linear Special
{
"scenario": "phy_unbal_adv",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
{
"scenario": "phy_diag_diff",
"num_spatial_dims": 3,
"num_points": 32,
"diffusion_coef_vector": (0.001, 0.002, 0.0004),
"num_test_samples": 10,
},
{
"scenario": "phy_aniso_diff",
"num_spatial_dims": 3,
"num_points": 32,
"diffusion_coef_matrix": (
(0.001, 0.0005, 0.0003),
(0.0005, 0.002, 0.0002),
(0.0003, 0.0002, 0.0004),
),
"num_test_samples": 10,
},
{
"scenario": "phy_mix_disp",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
{
"scenario": "phy_mix_hyp",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
# 3D - Nonlinear
{
"scenario": "diff_burgers",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
{
"scenario": "diff_burgers_sc",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
{
"scenario": "diff_kdv",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
{
"scenario": "diff_ks",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
# 3D - Reaction-Diffusion
{
"scenario": "diff_fisher",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
{
"scenario": "phy_gs_type",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
{
"scenario": "phy_sh",
"num_spatial_dims": 3,
"num_points": 32,
"num_test_samples": 10,
},
]
"""
Collection of default scenarios as used in the original APEBench paper
"""

0 comments on commit 584e302

Please sign in to comment.