give up on openml.org (#16)

* add dataset directly to git repo
* get data from the local copy, not openml.org
hsf-training · Jan 24, 2025 · 7c653ec · 7c653ec
1 parent 9453667
commit 7c653ec
Show file tree

Hide file tree

Showing 7 changed files with 11 additions and 23 deletions.
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -48,11 +48,6 @@ jobs:
           cache-environment: true
           post-cleanup: "all"
 
-      # Preload the main project data
-      - name: Preload main project data
-        run: |
-          python -c 'import sklearn.datasets; d = sklearn.datasets.fetch_openml("hls4ml_lhc_jets_hlf"); d["data"], d["target"]'
-
       # Build the book
       - name: Build the book
         run: |

diff --git a/deep-learning-intro-for-hep/20-main-project.md b/deep-learning-intro-for-hep/20-main-project.md
@@ -34,7 +34,6 @@ import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 
-import sklearn.datasets
 import torch
 from torch import nn, optim
 from torch.utils.data import TensorDataset, DataLoader, random_split
@@ -48,12 +47,10 @@ The data comes from an online catalog: [hls4ml_lhc_jets_hlf](https://openml.org/
 
 The full description is online, with references to the paper in which it was published.
 
-Scikit-Learn has a tool for downloading it, which takes a minute or two.
-
 ```{code-cell} ipython3
-hls4ml_lhc_jets_hlf = sklearn.datasets.fetch_openml("hls4ml_lhc_jets_hlf")
-
-features, targets = hls4ml_lhc_jets_hlf["data"], hls4ml_lhc_jets_hlf["target"]
+hls4ml_lhc_jets_hlf = pd.read_parquet("data/hls4ml_lhc_jets_hlf.parquet")
+features = hls4ml_lhc_jets_hlf.drop("jet_type", axis=1)
+targets = hls4ml_lhc_jets_hlf["jet_type"]
 ```
 
 View the features (16 numerical properties of jets) as a Pandas DataFrame:

diff --git a/deep-learning-intro-for-hep/21-main-project-solutions.md b/deep-learning-intro-for-hep/21-main-project-solutions.md
@@ -26,7 +26,6 @@ import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 
-import sklearn.datasets
 import torch
 from torch import nn, optim
 from torch.utils.data import TensorDataset, DataLoader, random_split
@@ -90,9 +89,9 @@ expected_ROC = np.array([
 ## Step 1: download and understand the data
 
 ```{code-cell} ipython3
-hls4ml_lhc_jets_hlf = sklearn.datasets.fetch_openml("hls4ml_lhc_jets_hlf")
-
-features, targets = hls4ml_lhc_jets_hlf["data"], hls4ml_lhc_jets_hlf["target"]
+hls4ml_lhc_jets_hlf = pd.read_parquet("data/hls4ml_lhc_jets_hlf.parquet")
+features = hls4ml_lhc_jets_hlf.drop("jet_type", axis=1)
+targets = hls4ml_lhc_jets_hlf["jet_type"]
 ```
 
 ## Step 2: split the data into training, validation, and test samples

diff --git a/deep-learning-intro-for-hep/23-autoencoders.md b/deep-learning-intro-for-hep/23-autoencoders.md
@@ -45,7 +45,6 @@ import pandas as pd
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 
-import sklearn.datasets
 import torch
 from torch import nn, optim
 ```
@@ -57,12 +56,10 @@ from torch import nn, optim
 Let's use the jet data from the main project.
 
 ```{code-cell} ipython3
-hls4ml_lhc_jets_hlf = sklearn.datasets.fetch_openml("hls4ml_lhc_jets_hlf")
-
+hls4ml_lhc_jets_hlf = pd.read_parquet("data/hls4ml_lhc_jets_hlf.parquet")
 features_unnormalized = torch.tensor(
-    hls4ml_lhc_jets_hlf["data"].values, dtype=torch.float32,
+    hls4ml_lhc_jets_hlf.drop("jet_type", axis=1).values, dtype=torch.float32
 )
-
 features = (features_unnormalized - features_unnormalized.mean(axis=0)) / features_unnormalized.std(axis=0)
 ```
 
@@ -189,7 +186,7 @@ The exact distribution isn't meaningful (and it would change if we used a differ
 How well do these clumps correspond to the known jet sources?
 
 ```{code-cell} ipython3
-hidden_truth = hls4ml_lhc_jets_hlf["target"].values
+hidden_truth = hls4ml_lhc_jets_hlf["jet_type"].values
 ```
 
 ```{code-cell} ipython3

diff --git a/deep-learning-intro-for-hep/24-convolutional.md b/deep-learning-intro-for-hep/24-convolutional.md
@@ -30,7 +30,6 @@ import matplotlib as mpl
 import matplotlib.pyplot as plt
 
 import h5py
-import sklearn.datasets
 import torch
 from torch import nn, optim
 ```
@@ -42,7 +41,7 @@ from torch import nn, optim
 The jet dataset that you used for your [main project](20-main-project.md) is based on 16 hand-crafted features:
 
 ```{code-cell} ipython3
-list(sklearn.datasets.fetch_openml("hls4ml_lhc_jets_hlf")["data"].columns)
+list(pd.read_parquet("data/hls4ml_lhc_jets_hlf.parquet").columns[:-1])
 ```
 
 Suppose we didn't know that these are a useful way to characterize jet substructure, or suppose that there are better ways not listed here (very plausible!). A model trained on these 16 features wouldn't have as much discriminating power as it could.

diff --git a/deep-learning-intro-for-hep/data/hls4ml_lhc_jets_hlf.parquet b/deep-learning-intro-for-hep/data/hls4ml_lhc_jets_hlf.parquet
diff --git a/environment.yml b/environment.yml
@@ -16,6 +16,7 @@ dependencies:
   - pandas
   - iminuit
   - scikit-learn
+  - fastparquet
   - pytorch-cpu  # this is `torch` in pip
 
   # used in very few sections (optional)