diff --git a/soccer_xg/features.py b/soccer_xg/attributes.py
similarity index 87%
rename from soccer_xg/features.py
rename to soccer_xg/attributes.py
index c7c066d..e53260d 100644
--- a/soccer_xg/features.py
+++ b/soccer_xg/attributes.py
@@ -1,9 +1,9 @@
-"""A collection of feature generators.
+"""A collection of attribute (i.e., feature and label) generators.
 
-There are three types of feature generators:
+There are three types of generators:
 
 gamestates
-   Feature generators which calculate a set of features based on the shot and
+   Generators which calculate a set of attributes based on the shot and
    the N previous actions (i.e., shot context). The input is a list of
    gamestates. Internally each game state is represented as a list of SPADL
    action dataframes :math:`[a_0, a_1, ...]` where each row in the :math:`a_i`
@@ -11,22 +11,22 @@
    :math:`a_{i-1}` dataframe. :math:`a_0` is the shot action.
 
 actions
-   Feature generators which calculate a set of features based on the shot and
+   Generators which calculate a set of attributes based on the shot and
    all preceding actions. The input is a :class:`pandas.DataFrame` of actions
-   in SPADL format and a boolean mask to select the shots for which features
+   in SPADL format and a boolean mask to select the shots for which attributes
    should be computed.
 
 events
-   Feature generators which calculate a set of features based on the original
-   event data. These feature generators are provider-specific. The input is
-   a :class:`pandas.DataFrame` of events and a boolean mask to select the
-   shots for which features should be computed.
+   Generators which calculate a set of attributes based on the original
+   event data. These generators are provider-specific. The input is
+   a :class:`pandas.DataFrame` of events and a series with event IDs to select
+   the shots for which attributes should be computed.
 
 The types are specified using the ``ftype`` decorator. Only functions, which
-have a parameter called "ftype" are seen by soccer-xg as a feature generator.
-Others will not be calculated.
+have an attribute called "ftype" are seen by soccer-xg as a generator. Others
+will not be calculated.
 
-As the "gamestates" and "actions" feature generators compute features from
+As the "gamestates" and "actions" generators compute attributes from
 SPADL actions, they work for all data providers that are supported by the
 SoccerAction library.
 """
@@ -36,10 +36,10 @@
 import numpy as np
 import pandas as pd
 
+from socceraction import spadl
 import socceraction.spadl.config as spadlcfg
 import socceraction.vaep.features as fs
 from socceraction.vaep.features import simple
-from . import utils
 
 _spadl_cfg = {
     "length": 105,
@@ -70,7 +70,7 @@ def decorate_func(func):
 
 
 # ############################################################################
-# SoccerAction-style gamestate features
+# SoccerAction-style gamestate attributes
 # ############################################################################
 
 actiontype = ftype("gamestates")(fs.actiontype)
@@ -107,7 +107,7 @@ def speed(gamestates):
         between each <nb_prev_actions> action ai and action a0.
     """
     a0 = gamestates[0]
-    spaced = pd.DataFrame()
+    spaced = pd.DataFrame(index=a0.index)
     for i, a in enumerate(gamestates[1:]):
         dt = a0.time_seconds - a.time_seconds
         dt[dt < 1] = 1
@@ -120,10 +120,36 @@ def speed(gamestates):
 
 
 # ############################################################################
-# Features on SPADL shots
+# Attributes on SPADL shots
 # ############################################################################
 
 
+@ftype("actions")
+def goal_from_shot(actions, shot_mask):
+    """Determine whether a goal was scored from the current action.
+
+    This label can be used to train an xG model.
+
+    Parameters
+    ----------
+    actions : pd.DataFrame
+        The actions of a game in SPADL format.
+    shot_mask : pd.Series
+        A boolean mask to select the shots for which attributes should be
+        computed.
+
+    Returns
+    -------
+    pd.DataFrame
+        A dataframe with a column 'goal' and a row for each shot set to
+        True if a goal was scored from the current shot; otherwise False.
+    """
+    shots = actions.loc[shot_mask]
+    goaldf = pd.DataFrame(index=shots.index)
+    goaldf["goal"] = shots["result_name"] == "success"
+    return goaldf
+
+
 @ftype("actions")
 def shot_dist(actions, shot_mask):
     """Compute the distance to the middle of the goal.
@@ -133,7 +159,7 @@ def shot_dist(actions, shot_mask):
     actions : pd.DataFrame
         The actions of a game in SPADL format.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -143,7 +169,7 @@ def shot_dist(actions, shot_mask):
         ('dist_shot').
     """
     shots = actions.loc[shot_mask]
-    distdf = pd.DataFrame()
+    distdf = pd.DataFrame(index=shots.index)
     dx = (_spadl_cfg["length"] - shots["start_x"]).values
     dy = (_spadl_cfg["width"] / 2 - shots["start_y"]).values
     distdf["dist_shot"] = np.sqrt(dx**2 + dy**2)
@@ -162,7 +188,7 @@ def shot_location(actions, shot_mask):
     actions : pd.DataFrame
         The actions of a game in SPADL format.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -172,7 +198,7 @@ def shot_location(actions, shot_mask):
         and a column for the distance to the goal line ('dx_shot').
     """
     shots = actions.loc[shot_mask]
-    locationdf = pd.DataFrame()
+    locationdf = pd.DataFrame(index=shots.index)
     locationdf["dx_shot"] = _spadl_cfg["length"] - shots["start_x"]
     locationdf["dy_shot"] = (_spadl_cfg["width"] / 2 - shots["start_y"]).abs()
     return locationdf
@@ -190,7 +216,7 @@ def shot_angle(actions, shot_mask):
     actions : pd.DataFrame
         The actions of a game in SPADL format.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -200,7 +226,7 @@ def shot_angle(actions, shot_mask):
         ('angle_shot').
     """
     shots = actions.loc[shot_mask]
-    polardf = pd.DataFrame()
+    polardf = pd.DataFrame(index=shots.index)
     dx = (_spadl_cfg["length"] - shots["start_x"]).abs().values
     dy = (_spadl_cfg["width"] / 2 - shots["start_y"]).abs().values
     with np.errstate(divide="ignore", invalid="ignore"):
@@ -217,7 +243,7 @@ def shot_visible_angle(actions, shot_mask):
     actions : pd.DataFrame
         The actions of a game in SPADL format.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -234,7 +260,7 @@ def shot_visible_angle(actions, shot_mask):
     shots = actions.loc[shot_mask]
     dx = _spadl_cfg["length"] - shots["start_x"]
     dy = _spadl_cfg["width"] / 2 - shots["start_y"]
-    angledf = pd.DataFrame()
+    angledf = pd.DataFrame(index=shots.index)
     angledf["visible_angle_shot"] = np.arctan(
         _spadl_cfg["goal_width"] * dx / (dx**2 + dy**2 - (_spadl_cfg["goal_width"] / 2) ** 2)
     )
@@ -268,7 +294,7 @@ def shot_relative_angle(actions, shot_mask):
     actions : pd.DataFrame
         The actions of a game in SPADL format.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -309,7 +335,7 @@ def shot_bodypart(actions, shot_mask):
     actions : pd.DataFrame
         The actions of a game in SPADL format.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -319,9 +345,11 @@ def shot_bodypart(actions, shot_mask):
         ('bodypart_name_shot').
     """
     shots = actions.loc[shot_mask]
-    bodypartdf = pd.DataFrame()
+    bodypartdf = pd.DataFrame(index=shots.index)
     bodypartdf["bodypart_name_shot"] = pd.Categorical(
-        shots["bodypart_name"].replace(["foot_left", "foot_right"], "foot"), categories=["foot", "head", "other"], ordered=False
+        shots["bodypart_name"].replace(["foot_left", "foot_right"], "foot"),
+        categories=["foot", "head", "other"],
+        ordered=False,
     )
     return bodypartdf
 
@@ -336,7 +364,7 @@ def shot_bodypart_detailed(actions, shot_mask):
     actions : pd.DataFrame
         The actions of a game in SPADL format.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -346,7 +374,7 @@ def shot_bodypart_detailed(actions, shot_mask):
         ('bodypart_name_shot').
     """
     shots = actions.loc[shot_mask]
-    bodypartdf = pd.DataFrame()
+    bodypartdf = pd.DataFrame(index=shots.index)
     bodypartdf["bodypart_name_shot"] = pd.Categorical(
         shots["bodypart_name"], categories=spadlcfg.bodyparts, ordered=False
     )
@@ -362,7 +390,7 @@ def shot_bodypart_onehot(actions, shot_mask):
     actions : pd.DataFrame
         The actions of a game in SPADL format.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -372,11 +400,13 @@ def shot_bodypart_onehot(actions, shot_mask):
         to take a shot.
     """
     shots = actions.loc[shot_mask]
-    X = pd.DataFrame()
+    X = pd.DataFrame(index=shots.index)
     for bodypart_name in spadlcfg.bodyparts:
         col = "bodypart_" + bodypart_name + "_shot"
         if bodypart_name == "head/other":
             X[col] = shots["bodypart_name"].isin(["head", "other", "head/other"])
+        elif bodypart_name == "foot":
+            X[col] = shots["bodypart_name"].isin(["foot", "foot_left", "foot_right"])
         else:
             X[col] = shots["bodypart_name"] == bodypart_name
     return X
@@ -397,7 +427,7 @@ def post_dribble(actions, shot_mask):
     actions : pd.DataFrame
         The actions of a game in SPADL format.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -434,7 +464,7 @@ def assist_type(actions, shot_mask):
     actions : pd.DataFrame
         The actions of a game in SPADL format.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -491,7 +521,7 @@ def fastbreak(actions, shot_mask):
     actions : pd.DataFrame
         The actions of a game in SPADL format.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -738,7 +768,7 @@ def fn(actions):
 caley_grid = ftype("gamestates")(custom_grid("caley_zone", _caley_shot_matrix(), _point_in_rect))
 
 # ############################################################################
-# StatsBomb-specific features
+# StatsBomb-specific attributes
 # ############################################################################
 
 
@@ -818,7 +848,7 @@ def statsbomb_open_goal(events, shot_mask):
     events : pd.DataFrame
         The StatsBomb events of a game.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -846,7 +876,7 @@ def statsbomb_first_touch(events, shot_mask):
     events : pd.DataFrame
         The StatsBomb events of a game.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -877,7 +907,7 @@ def statsbomb_free_projection(events, shot_mask):
     events : pd.DataFrame
         The StatsBomb events of a game.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -950,7 +980,7 @@ def statsbomb_goalkeeper_position(events, shot_mask):
     events : pd.DataFrame
         The StatsBomb events of a game.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -1031,7 +1061,7 @@ def statsbomb_defenders_position(events, shot_mask):
     events : pd.DataFrame
         The StatsBomb events of a game.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -1105,7 +1135,7 @@ def statsbomb_assist(events, shot_mask):
     events : pd.DataFrame
         The StatsBomb events of a game.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -1198,7 +1228,7 @@ def statsbomb_counterattack(events, shot_mask):
     events : pd.DataFrame
         The StatsBomb events of a game.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -1228,7 +1258,7 @@ def statsbomb_shot_impact_height(events, shot_mask):
     events : pd.DataFrame
         The StatsBomb events of a game.
     shot_mask : pd.Series
-        A boolean mask to select the shots for which features should be
+        A boolean mask to select the shots for which attributes should be
         computed.
 
     Returns
@@ -1265,7 +1295,7 @@ def statsbomb_shot_impact_height(events, shot_mask):
     return output
 
 
-all_features = [
+default_features = [
     actiontype,
     bodypart,
     result,
@@ -1290,23 +1320,19 @@ def statsbomb_shot_impact_height(events, shot_mask):
     ),
 ]
 
-simple_features = []
-statsbomb_features = []
+default_labels = [goal_from_shot]
 
 
-def extract_features_on_game(
-    game, actions, events=None, xfns=all_features, shotfilter=None, nb_prev_actions=3
+def compute_attributes(
+    game,
+    actions,
+    events=None,
+    xfns=default_features,
+    yfns=default_labels,
+    shotfilter=None,
+    nb_prev_actions=3,
 ):
-    """
-    Extract features from
-
-    * a :class:`pandas.DataFrame` containing SPADL actions
-
-    and / or
-
-    * a :class:`pandas.DataFrame` containing provider-specific event data
-
-    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.
+    """Compute xG features and labels for a given game.
 
     Parameters
     ----------
@@ -1315,10 +1341,12 @@ def extract_features_on_game(
     actions : pd.DataFrame
         A DataFrame containing SPADL actions.
     events: pd.DataFrame
-        A DataFrame containing the raw events. Can be used to calculate
-        provider-specific features.
+        A DataFrame containing the raw provider-specific events corresponding
+        to ``actions``. Can be used to calculate provider-specific features.
     xfns : list(callable)
-        List of feature generators to apply.
+        List of feature generators to apply. Defaults to ``default_features``.
+    yfns : list(callable)
+        List of label generators to apply. Defaults to ``default_labels``.
     shotfilter: callable(pd.Series) -> bool
         A function that takes a shot (in SPADL format) and returns True if the
         shot should be used for feature extraction. If None, all shots will be
@@ -1332,46 +1360,60 @@ def extract_features_on_game(
     pd.DataFrame
         A DataFrame with the calculated features.
     """
-    actions = utils.enhance_actions(actions)
-    # get shot index
+    # add names for result, bodypart and type
+    actions = spadl.utils.add_names(actions)
+
+    # select shots
     if shotfilter is None:
         # filter shots and ignore own goals
-        shot_idx = actions.type_name.isin(
+        shot_mask = actions.type_name.isin(
             ["shot", "shot_penalty", "shot_freekick"]
         ) & actions.result_name.isin(["fail", "success"])
     else:
-        shot_idx = actions.apply(lambda a: shotfilter(a), axis=1)
-    shot_actions_idx = actions.index.values[shot_idx]
-    shot_events_idx = actions.loc[shot_idx, "original_event_id"]
-    # handle inputs with no shots or no features
-    if shot_idx.sum() < 1:
+        shot_mask = actions.apply(lambda a: shotfilter(a), axis=1)
+    shot_actions_idx = actions.index[shot_mask]
+    shot_events_idx = actions.loc[shot_mask, "original_event_id"]
+
+    # handle inputs with no shots or no attributes
+    if shot_mask.sum() < 1:
         # TODO: create the expected columns
         return pd.DataFrame()
-    if len(xfns) < 1:
+    if len(xfns + yfns) < 1:
         return pd.DataFrame(index=shot_actions_idx)
-    # convert actions to ltr gamestates
+
+    # convert actions to ltr orientation
+    actions_ltr = spadl.utils.play_left_to_right(actions, game.home_team_id)
+    # convert actions to ltr shot gamestates
     gamestates = fs.gamestates(actions, nb_prev_actions)
-    gamestates = fs.play_left_to_right(gamestates, game.home_team_id)
-    # remove post-shot attributes
-    gamestates[0].loc[shot_idx, "end_x"] = float("NaN")
-    gamestates[0].loc[shot_idx, "end_y"] = float("NaN")
-    gamestates[0].loc[shot_idx, "result_id"] = float("NaN")
-    # get gamestates corresponding to shots
-    shot_gamestates = [states.loc[shot_idx] for states in gamestates]
-    # compute features
-    X = []
-    for fn in xfns:
-        if getattr(fn, "ftype", None) == "gamestates":
-            X.append(fn(shot_gamestates).set_index(shot_events_idx.values))
-        elif getattr(fn, "ftype", None) == "actions":
-            X.append(fn(gamestates[0], shot_idx).set_index(shot_events_idx.values))
-        elif getattr(fn, "ftype", None) == "events":
-            X.append(fn(events, shot_events_idx))
-        else:
-            warnings.warn("Unknown feature type for {}.".format(fn.__name__))
-    X = pd.concat(X, axis=1).loc[shot_events_idx].set_index(shot_actions_idx)
-    missing_bool = X.select_dtypes(include=['boolean']).columns
-    X[missing_bool] = X[missing_bool].fillna(False).astype(bool)
-    # replace 'a0' by 'shot' in each feature name
-    X.rename(columns=lambda s: s.replace("a0", "shot"), inplace=True)
-    return X
+    gamestates_ltr = fs.play_left_to_right(gamestates, game.home_team_id)
+    shot_gamestates_ltr = [states.loc[shot_mask].copy() for states in gamestates_ltr]
+    # remove post-shot attributes to avoid target leakage
+    shot_gamestates_ltr[0]["end_x"] = float("NaN")
+    shot_gamestates_ltr[0]["end_y"] = float("NaN")
+    shot_gamestates_ltr[0]["result_id"] = float("NaN")
+
+    # compute features and labels
+    def _apply_fns(fns):
+        attrs = []
+        for fn in fns:
+            if getattr(fn, "ftype", None) == "gamestates":
+                attrs.append(fn(shot_gamestates_ltr).set_index(shot_events_idx))
+            elif getattr(fn, "ftype", None) == "actions":
+                attrs.append(fn(actions_ltr, shot_mask).set_index(shot_events_idx))
+            elif getattr(fn, "ftype", None) == "events":
+                attrs.append(fn(events, shot_events_idx))
+            else:
+                warnings.warn("Unknown attribute type for {}.".format(fn.__name__))
+        attrs = pd.concat(attrs, axis=1).loc[shot_events_idx].set_index(shot_actions_idx)
+        attrs.index.name = "action_id"
+        # fill missing values
+        missing_bool = attrs.select_dtypes(include=['boolean']).columns
+        attrs[missing_bool] = attrs[missing_bool].fillna(False).astype(bool)
+        # replace 'a0' by 'shot' in each feature name
+        attrs.rename(columns=lambda s: s.replace("a0", "shot"), inplace=True)
+        return attrs
+
+    X = _apply_fns(xfns)
+    y = _apply_fns(yfns)
+
+    return X, y
diff --git a/soccer_xg/xg.py b/soccer_xg/xg.py
index aef6626..5de3d77 100644
--- a/soccer_xg/xg.py
+++ b/soccer_xg/xg.py
@@ -1,18 +1,21 @@
 """Tools for creating and analyzing xG models."""
-import os
+from __future__ import annotations
+
+import warnings
+from typing import Tuple, List, Optional
+from pathlib import Path
 
 import joblib
 import pandas as pd
-import socceraction.spadl.config as spadlcfg
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import brier_score_loss, roc_auc_score
 from sklearn.pipeline import make_pipeline
 from sklearn.utils.validation import NotFittedError
-from tqdm import tqdm
+from tqdm.auto import tqdm
 
-from soccer_xg import features as fs
-from soccer_xg import metrics, utils
-from soccer_xg.api import DataApi
+from soccer_xg import attributes as fs
+from soccer_xg import metrics
+from soccer_xg.data.base import Dataset
 from soccer_xg.ml.preprocessing import simple_proc_for_linear_algoritms
 
 
@@ -22,41 +25,45 @@ class XGModel:
     Parameters
     ----------
     copy_data : boolean (default=``True``)
-        Whether or not to copy data when fitting and applying the model. Running the model
-        in-place (``copy_data=False``) will be faster and have a smaller memory footprint,
-        but if not done carefully can lead to data integrity issues.
+        Whether or not to copy data when fitting and applying the model.
+        Running the model in-place (``copy_data=False``) will be faster and
+        have a smaller memory footprint, but if not done carefully can lead to
+        data integrity issues.
 
     Attributes
     ----------
     model : A Scikit-learn pipeline (or equivalent)
-        The actual model used to compute xG. Upon initialization it will be set to
-        a default model, but can be overridden by the user.
+        The actual model used to compute xG. Upon initialization it will be
+        set to a default model, but can be overridden by the user.
     column_descriptions : dictionary
-        A dictionary whose keys are the names of the columns used in the model, and the values are
-        string descriptions of what the columns mean. Set at initialization to be the default model,
-        if you create your own model you'll need to update this attribute manually.
+        A dictionary whose keys are the names of the columns used in the
+        model, and the values are string descriptions of what the columns
+        mean. Set at initialization to be the default model, if you create
+        your own model you'll need to update this attribute manually.
     training_seasons : A list of tuples, or ``None`` (default=``None``)
-        If the model was trained using data from the DataApi, a list of (competition_id, season_id) tuples
-        used to train the model. If the DataApi was **not** used, an empty list. If no model
-        has been trained yet, ``None``.
-    validation_seasons : same as ``training_seasons``, but for validation data.
+        If the model was trained using data from a Dataset, a list of
+        (competition_id, season_id) tuples used to train the model. If no
+        Dataset was used, an empty list. If no model has been trained yet,
+        ``None``.
+    validation_seasons : A list of tuples, or ``None`` (default=``None``)
+        Same as ``training_seasons``, but for validation data.
     sample_probabilities : A numpy array of floats or ``None`` (default=``None``)
-        After the model has been validated, contains the sampled predicted probabilities used to
-        compute the validation statistic.
+        After the model has been validated, contains the sampled predicted
+        probabilities used to compute the validation statistic.
     predicted_goal_percents : A numpy array of floats or ``None`` (default=``None``)
-        After the model has been validated, contains the actual probabilities in the test
-        set at each probability in ``sample_probabilities``.
+        After the model has been validated, contains the actual probabilities
+        in the test set at each probability in ``sample_probabilities``.
     num_shots_used : A numpy array of floats or ``None`` (default=``None``)
-        After the model has been validated, contains the number of shots used to compute each
-        element of ``predicted_goal_percents``.
+        After the model has been validated, contains the number of shots used
+        to compute each element of ``predicted_goal_percents``.
     model_directory : string
         The directory where all models will be saved to or loaded from.
     """
 
-    model_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
+    model_directory = Path(__file__).resolve().parent / "models"
     _default_model_filename = "default_model.xg"
 
-    def __init__(self, copy_data=True):
+    def __init__(self, copy_data: bool = True):
         self.copy_data = copy_data
         self.column_descriptions = None
 
@@ -70,7 +77,7 @@ def __init__(self, copy_data=True):
         self._num_shots_used = None
 
     @property
-    def training_seasons(self):
+    def training_seasons(self) -> Optional[List[Tuple[str, str]]]:
         return self._training_seasons
 
     @property
@@ -89,11 +96,18 @@ def predicted_goal_percents(self):
     def num_shots_used(self):
         return self._num_shots_used
 
+    @classmethod
+    def filter_shots(cls, df_actions):
+        shot_mask = df_actions.type_name.isin(
+            ["shot", "shot_penalty", "shot_freekick"]
+        ) & df_actions.result_name.isin(["fail", "success"])
+        return shot_mask
+
     def train(
         self,
-        source_data,
-        training_seasons=(("ENG", "1617"), ("ENG", "1718")),
-        target_colname="goal",
+        source_data: Dataset | pd.DataFrame,
+        training_seasons: List[Tuple[str, str]] = (("ENG", "1617"), ("ENG", "1718")),
+        target_colname: str = "goal",
     ):
         """Train the model.
 
@@ -103,7 +117,7 @@ def train(
         This method implements a simple wrapper around the core Scikit-learn functionality
         which does this.
 
-        The default is to use data from a DataApi object, however that can be changed
+        The default is to use data from a Dataset object, however that can be changed
         to a simple Pandas DataFrame with precomputed features and labels if desired.
 
         There is no particular output from this function, rather the parameters governing
@@ -113,19 +127,19 @@ def train(
 
         Parameters
         ----------
-        source_data : ``DataApi`` or a Pandas DataFrame
+        source_data : ``Dataset`` or a Pandas DataFrame
             The data to be used to train the model. If an instance of
-            ``DataApi`` is given, will query the api database for the training data.
+            ``Dataset`` is given, will query the api database for the training data.
         training_seasons : list of tuples (default=``[('ENG', '1617'), ('ENG', '1718')]``)
-            What seasons to use to train the model if getting data from a DataApi instance.
-            If ``source_data`` is not a ``DataApi``, this argument will be ignored.
+            What seasons to use to train the model if getting data from a Dataset instance.
+            If ``source_data`` is not a ``Dataset``, this argument will be ignored.
             **NOTE:** it is critical not to use all possible data in order to train the
             model - some will need to be reserved for a final validation (see the
             ``validate_model`` method). A good dataset to reserve
             for validation is the most recent one or two seasons.
         target_colname : string or integer (default=``"goal"``)
             The name of the target variable column. This is only relevant if
-            ``source_data`` is not a ``DataApi``.
+            ``source_data`` is not a ``Dataset``.
 
         Returns
         -------
@@ -136,15 +150,14 @@ def train(
                 model.train(source_data, training_seasons, target_colname)
         else:
             self._training_seasons = []
-            if isinstance(source_data, DataApi):
-                game_ids = source_data.games[
-                    source_data.games.season_id.astype(str).isin([s[1] for s in training_seasons])
-                    & source_data.games.competition_id.astype(str).isin(
-                        [s[0] for s in training_seasons]
-                    )
-                ].index
-                feature_cols = get_features(source_data, game_ids)
-                target_col = get_labels(source_data, game_ids)
+            if isinstance(source_data, Dataset):
+                game_ids = pd.concat(
+                    source_data.games(competition_id=s[0], season_id=s[1])
+                    for s in training_seasons
+                ).index
+                feature_cols, target_col = prepare(
+                    source_data, game_ids, shotfilter=self.filter_shots
+                )
                 self._training_seasons = training_seasons
             else:
                 target_col = source_data[target_colname]
@@ -161,30 +174,34 @@ def validate(
     ):
         """Validate the model.
 
-        Once a modeling pipeline is trained, a different dataset must be fed into the trained model
-        to validate the quality of the fit.
-        This method implements a simple wrapper around the core Scikit-learn functionality
+        Once a modeling pipeline is trained, a different dataset must be fed
+        into the trained model to validate the quality of the fit. This method
+        implements a simple wrapper around the core Scikit-learn functionality
         which does this.
 
-        The default is to use data from a DataApi object, however that can be changed
-        to a simple Pandas DataFrame with precomputed features and labels if desired.
+        The default is to use data from a Dataset object, however that can be
+        changed to a simple Pandas DataFrame with precomputed features and
+        labels if desired.
 
-        The output of this method is a dictionary with relevant error metrics (see ``soccer_xg.metrics``).
+        The output of this method is a dictionary with relevant error metrics
+        (see ``soccer_xg.metrics``).
 
         Parameters
         ----------
-        source_data : ``DataApi`` or a Pandas DataFrame
+        source_data : ``Dataset`` or a Pandas DataFrame
             The data to be used to validate the model. If an instance of
-            ``DataApi`` is given, will query the api database for the training data.
+            ``Dataset`` is given, will query the api database for the validation
+            data.
         validation_seasons : list of tuples (default=``[('ENG', '1819')]``)
-            What seasons to use to validated the model if getting data from a DataApi instance.
-            If ``source_data`` is not a ``DataApi``, this argument will be ignored.
-            **NOTE:** it is critical not to use the same data to validate the model as was used
-            in the fit. Generally a good data set to use for validation is one from a time
-            period more recent than was used to train the model.
+            What seasons to use to validate the model if getting data from
+            a Dataset instance. If ``source_data`` is not a ``Dataset``, this
+            argument will be ignored. **NOTE:** it is critical not to use the
+            same data to validate the model as was used in the fit. Generally
+            a good data set to use for validation is one from a time period
+            more recent than was used to train the model.
         target_colname : string or integer (default=``"goal"``)
             The name of the target variable column. This is only relevant if
-            ``source_data`` is not a ``DataApi``.
+            ``source_data`` is not a ``Dataset``.
         plot: bool (default=true)
             Whether to plot the AUROC and probability calibration curves.
 
@@ -202,14 +219,11 @@ def validate(
         if not self._fitted:
             raise NotFittedError("Must fit model before validating.")
 
-        if isinstance(source_data, DataApi):
-            game_ids = source_data.games[
-                source_data.games.season_id.astype(str).isin([s[1] for s in validation_seasons])
-                & source_data.games.competition_id.astype(str).isin(
-                    [s[0] for s in validation_seasons]
-                )
-            ].index
-            target_col = get_labels(source_data, game_ids)
+        if isinstance(source_data, Dataset):
+            game_ids = pd.concat(
+                source_data.games(competition_id=s[0], season_id=s[1]) for s in validation_seasons
+            ).index
+            _, target_col = prepare(source_data, game_ids)
             self._validation_seasons = validation_seasons
         else:
             game_ids = None
@@ -273,18 +287,18 @@ def validate(
     def estimate(self, source_data, game_ids=None):
         """Estimate the xG values for all shots in a set of games.
 
-        The default is to use data from a DataApi object, however that can be changed
+        The default is to use data from a Dataset object, however that can be changed
         to a simple Pandas DataFrame with precomputed features and labels if desired.
 
         Parameters
         ----------
-        source_data : ``DataApi`` or a Pandas DataFrame
+        source_data : ``Dataset`` or a Pandas DataFrame
             The data to be used to validate the model. If an instance of
-            ``DataApi`` is given, will query the api database for the training data.
+            ``Dataset`` is given, the features are computed from the dataset.
         game_ids : list of ints (default=None)
             Only xG values for the games in this list are returned. By default,
             xG values are computed for all games in the source data.
-            If ``source_data`` is not a ``DataApi``, this argument will be ignored.
+            If ``source_data`` is not a ``Dataset``, this argument will be ignored.
 
         Returns
         -------
@@ -307,10 +321,9 @@ def estimate(self, source_data, game_ids=None):
                 xg.append(model.estimate(source_data, game_ids))
             return pd.concat(xg).sort_index()
         else:
-            if isinstance(source_data, DataApi):
-                if game_ids is None:
-                    game_ids = source_data.games.index if game_ids is None else game_ids
-                source_data = get_features(source_data, game_ids)
+            if isinstance(source_data, Dataset):
+                game_ids = source_data.games().index if game_ids is None else game_ids
+                source_data, _ = prepare(source_data, game_ids)
 
             xg = pd.DataFrame(index=source_data.index)
             xg["xG"] = self.model.predict_proba(source_data)[:, 1]
@@ -350,7 +363,7 @@ def save_model(self, filename=None):
         """
         if filename is None:
             filename = self._default_model_filename
-        joblib.dump(self, os.path.join(self.model_directory, filename))
+        joblib.dump(self, self.model_directory / filename)
 
     @classmethod
     def load_model(cls, filename=None):
@@ -370,48 +383,12 @@ def load_model(cls, filename=None):
         if filename is None:
             filename = cls._default_model_filename
 
-        return joblib.load(os.path.join(cls.model_directory, filename))
+        return joblib.load(cls.model_directory / filename)
 
 
 class OpenplayXGModel(XGModel):
     _default_model_filename = "default_openplay_model.xg"
 
-    def train(
-        self,
-        source_data,
-        training_seasons=(("ENG", "1617"), ("ENG", "1718")),
-        target_colname="goal",
-    ):
-        self._training_seasons = []
-        if isinstance(source_data, DataApi):
-            game_ids = source_data.games[
-                source_data.games.season_id.astype(str).isin([s[1] for s in training_seasons])
-                & source_data.games.competition_id.astype(str).isin(
-                    [s[0] for s in training_seasons]
-                )
-            ].index
-            feature_cols = get_features(
-                source_data, game_ids, shotfilter=OpenplayXGModel.filter_shots
-            )
-            target_col = get_labels(source_data, game_ids, shotfilter=OpenplayXGModel.filter_shots)
-            self._training_seasons = training_seasons
-        else:
-            target_col = source_data[target_colname]
-            feature_cols = source_data.drop(target_colname, axis=1)
-        self.model.fit(feature_cols, target_col)
-        self._fitted = True
-
-    def estimate(self, source_data, game_ids=None):
-        if isinstance(source_data, DataApi):
-            game_ids = source_data.games.index if game_ids is None else game_ids
-            source_data = get_features(
-                source_data, game_ids, shotfilter=OpenplayXGModel.filter_shots
-            )
-
-        xg = pd.DataFrame(index=source_data.index)
-        xg["xG"] = self.model.predict_proba(source_data)[:, 1]
-        return xg
-
     def create_default_pipeline(self):
         bodypart_colname = "bodypart_id_a0"
         dist_to_goal_colname = "start_dist_to_goal_a0"
@@ -430,8 +407,8 @@ def create_default_pipeline(self):
         pipe = make_pipeline(preprocess_pipeline, base_model)
         return pipe
 
-    @staticmethod
-    def filter_shots(df_actions):
+    @classmethod
+    def filter_shots(cls, df_actions):
         shot_idx = (df_actions.type_name == "shot") & df_actions.result_name.isin(
             ["fail", "success"]
         )
@@ -454,9 +431,9 @@ def train(
         pass
 
     def estimate(self, source_data, game_ids=None):
-        if isinstance(source_data, DataApi):
-            game_ids = source_data.games.index if game_ids is None else game_ids
-            source_data = get_features(
+        if isinstance(source_data, Dataset):
+            game_ids = source_data.games().index if game_ids is None else game_ids
+            source_data, _ = prepare(
                 source_data,
                 game_ids,
                 xfns=[],
@@ -471,8 +448,8 @@ def estimate(self, source_data, game_ids=None):
     def create_default_pipeline(self):
         return None
 
-    @staticmethod
-    def filter_shots(df_actions):
+    @classmethod
+    def filter_shots(cls, df_actions):
         shot_idx = df_actions.type_name == "shot_penalty"
         return shot_idx
 
@@ -480,42 +457,6 @@ def filter_shots(df_actions):
 class FreekickXGModel(XGModel):
     _default_model_filename = "default_freekick_model.xg"
 
-    def train(
-        self,
-        source_data,
-        training_seasons=(("ENG", "1617"), ("ENG", "1718")),
-        target_colname="goal",
-    ):
-        self._training_seasons = []
-        if isinstance(source_data, DataApi):
-            game_ids = source_data.games[
-                source_data.games.season_id.astype(str).isin([s[1] for s in training_seasons])
-                & source_data.games.competition_id.astype(str).isin(
-                    [s[0] for s in training_seasons]
-                )
-            ].index
-            feature_cols = get_features(
-                source_data, game_ids, shotfilter=FreekickXGModel.filter_shots
-            )
-            target_col = get_labels(source_data, game_ids, shotfilter=FreekickXGModel.filter_shots)
-            self._training_seasons = training_seasons
-        else:
-            target_col = source_data[target_colname]
-            feature_cols = source_data.drop(target_colname, axis=1)
-        self.model.fit(feature_cols, target_col)
-        self._fitted = True
-
-    def estimate(self, source_data, game_ids=None):
-        if isinstance(source_data, DataApi):
-            game_ids = source_data.games.index if game_ids is None else game_ids
-            source_data = get_features(
-                source_data, game_ids, shotfilter=FreekickXGModel.filter_shots
-            )
-
-        xg = pd.DataFrame(index=source_data.index)
-        xg["xG"] = self.model.predict_proba(source_data)[:, 1]
-        return xg
-
     def create_default_pipeline(self):
         dist_to_goal_colname = "start_dist_to_goal_a0"
         angle_to_goal_colname = "start_angle_to_goal_a0"
@@ -532,109 +473,75 @@ def create_default_pipeline(self):
         pipe = make_pipeline(preprocess_pipeline, base_model)
         return pipe
 
-    @staticmethod
-    def filter_shots(df_actions):
+    @classmethod
+    def filter_shots(cls, df_actions):
         shot_idx = df_actions.type_name == "shot_freekick"
         return shot_idx
 
 
-def get_features(
-    api,
+def prepare(
+    dataset: Dataset,
     game_ids=None,
-    xfns=fs.all_features,
+    xfns=fs.default_features,
+    yfns=fs.default_labels,
     shotfilter=None,
     nb_prev_actions=3,
+    on_fail="raise",
 ):
-    game_ids = api.games.index if game_ids is None else game_ids
-    X = {}
-    for game_id in tqdm(game_ids, desc=f"Generating features"):
-        # try:
-        game = api.games.loc[game_id]
-        game_actions = utils.enhance_actions(api.get_actions(game_id))
-        game_events = api.get_events(game_id)
-        X[game_id] = _compute_features_game(game, game_actions, xfns, shotfilter, nb_prev_actions)
-        X[game_id].index.name = "action_id"
-        X[game_id]["game_id"] = game_id
-        # except Exception as e:
-        # print(f"Failed for game with id={game_id}: {e}")
-    X = pd.concat(X.values()).reset_index().set_index(["game_id", "action_id"])
-    # remove post-shot features (these will all have a single unique value)
-    f = X.columns[X.nunique() > 1]
-    return X[f]
-
+    """Prepare a dataset for training and validation.
 
-def _compute_features_game(
-    game, actions, xfns=fs.all_features, shotfilter=None, nb_prev_actions=3
-):
-    if shotfilter is None:
-        # filter shots and ignore own goals
-        shot_idx = actions.type_name.isin(
-            ["shot", "shot_penalty", "shot_freekick"]
-        ) & actions.result_name.isin(["fail", "success"])
-    else:
-        shot_idx = shotfilter(actions)
-    if shot_idx.sum() < 1:
-        return pd.DataFrame()
-    if len(xfns) < 1:
-        return pd.DataFrame(index=actions.index.values[shot_idx])
-    # convert actions to gamestates
-    gamestates = [
-        states.loc[shot_idx].copy() for states in fs.gamestates(actions, nb_prev_actions)
-    ]
-    gamestates = fs.play_left_to_right(gamestates, game.home_team_id)
-    # remove post-shot attributes
-    gamestates[0].loc[shot_idx, "end_x"] = float("NaN")
-    gamestates[0].loc[shot_idx, "end_y"] = float("NaN")
-    gamestates[0].loc[shot_idx, "result_id"] = float("NaN")
-    # compute features
-    X = pd.concat([fn(gamestates).reset_index(drop=True) for fn in xfns], axis=1).set_index(
-        actions.loc[shot_idx].index
-    )
-    # fix data types
-    for c in [c for c in X.columns.values if c.startswith("type_id")]:
-        X[c] = pd.Categorical(
-            X[c].replace(spadlcfg.actiontypes_df().type_name.to_dict()),
-            categories=spadlcfg.actiontypes,
-            ordered=False,
-        )
-    for c in [c for c in X.columns.values if c.startswith("result_id")]:
-        X[c] = pd.Categorical(
-            X[c].replace(spadlcfg.results_df().result_name.to_dict()),
-            categories=spadlcfg.results,
-            ordered=False,
-        )
-    for c in [c for c in X.columns.values if c.startswith("bodypart_id")]:
-        X[c] = pd.Categorical(
-            X[c].replace(spadlcfg.bodyparts_df().bodypart_name.to_dict()),
-            categories=spadlcfg.bodyparts,
-            ordered=False,
-        )
-    return X
-
-
-def get_labels(api, game_ids=None, shotfilter=None):
-    game_ids = api.games.index if game_ids is None else game_ids
-    y = {}
-    for game_id in tqdm(game_ids, desc=f"Generating labels"):
+    Parameters
+    ----------
+    dataset : Dataset
+        The dataset to use.
+    game_ids : list of ints (default=None)
+        Only use data from the games in this list. By default, all games
+        in the dataset are used.
+    xfns : list(callable)
+        List of feature generators to apply. Defaults to ``default_features``.
+    yfns : list(callable)
+        List of label generators to apply. Defaults to ``default_labels``.
+    shotfilter: callable(pd.Series) -> bool
+        A function that takes a shot (in SPADL format) and returns True if the
+        shot should be used for feature extraction. If None, all shots will be
+        used (excluding own-goals).
+    nb_prev_actions: int
+        The number of previous actions to consider when computing attributes.
+    on_fail: 'raise' or 'warn'
+        What to do if a feature or label function fails on a specific game.
+
+    Returns
+    -------
+    X : pd.DataFrame
+        A dataframe containing the features.
+    y : pd.DataFrame
+        A dataframe containing the labels.
+    """
+    game_ids = dataset.games().index if game_ids is None else game_ids
+    X, y = {}, {}
+    for game_id in tqdm(game_ids, desc="Preparing dataset"):
         try:
-            game = api.games.loc[game_id]
-            game_actions = utils.enhance_actions(api.get_actions(game_id))
-            y[game_id] = _compute_labels_game(game, game_actions, shotfilter)
-            y[game_id].index.name = "action_id"
+            game = dataset.games().loc[game_id]
+            game_actions = dataset.actions(game_id)
+            game_events = dataset.events(game_id)
+            X[game_id], y[game_id] = fs.compute_attributes(
+                game,
+                game_actions,
+                events=game_events,
+                xfns=xfns,
+                yfns=yfns,
+                shotfilter=shotfilter,
+                nb_prev_actions=nb_prev_actions,
+            )
+            X[game_id]["game_id"] = game_id
             y[game_id]["game_id"] = game_id
         except Exception as e:
-            print(e)
-    return pd.concat(y.values()).reset_index().set_index(["game_id", "action_id"])["goal"]
-
-
-def _compute_labels_game(game, actions, shotfilter=None):
-    # compute labels
-    y = actions["result_name"] == "success"
-    if shotfilter is None:
-        # filter shots and ignore own goals
-        shot_idx = actions.type_name.isin(
-            ["shot", "shot_penalty", "shot_freekick"]
-        ) & actions.result_name.isin(["fail", "success"])
-    else:
-        shot_idx = shotfilter(actions)
-    return y.loc[shot_idx].to_frame("goal")
+            if on_fail == "warn":
+                warnings.warn(f"Failed for game with id={game_id}: {e}")
+            else:
+                raise RuntimeError(f"Failed for game with id={game_id}.") from e
+    X = pd.concat(X.values()).reset_index().set_index(["game_id", "action_id"])
+    # remove post-shot features (these will all have a single unique value)
+    f = X.columns[X.nunique() > 1]
+    y = pd.concat(y.values()).reset_index().set_index(["game_id", "action_id"])
+    return X[f], y