diff --git a/soccer_xg/features.py b/soccer_xg/attributes.py similarity index 87% rename from soccer_xg/features.py rename to soccer_xg/attributes.py index c7c066d..e53260d 100644 --- a/soccer_xg/features.py +++ b/soccer_xg/attributes.py @@ -1,9 +1,9 @@ -"""A collection of feature generators. +"""A collection of attribute (i.e., feature and label) generators. -There are three types of feature generators: +There are three types of generators: gamestates - Feature generators which calculate a set of features based on the shot and + Generators which calculate a set of attributes based on the shot and the N previous actions (i.e., shot context). The input is a list of gamestates. Internally each game state is represented as a list of SPADL action dataframes :math:`[a_0, a_1, ...]` where each row in the :math:`a_i` @@ -11,22 +11,22 @@ :math:`a_{i-1}` dataframe. :math:`a_0` is the shot action. actions - Feature generators which calculate a set of features based on the shot and + Generators which calculate a set of attributes based on the shot and all preceding actions. The input is a :class:`pandas.DataFrame` of actions - in SPADL format and a boolean mask to select the shots for which features + in SPADL format and a boolean mask to select the shots for which attributes should be computed. events - Feature generators which calculate a set of features based on the original - event data. These feature generators are provider-specific. The input is - a :class:`pandas.DataFrame` of events and a boolean mask to select the - shots for which features should be computed. + Generators which calculate a set of attributes based on the original + event data. These generators are provider-specific. The input is + a :class:`pandas.DataFrame` of events and a series with event IDs to select + the shots for which attributes should be computed. The types are specified using the ``ftype`` decorator. 
Only functions, which -have a parameter called "ftype" are seen by soccer-xg as a feature generator. -Others will not be calculated. +have a parameter called "ftype" are seen by soccer-xg as a generator. Others +will not be calculated. -As the "gamestates" and "actions" feature generators compute features from +As the "gamestates" and "actions" generators compute attributes from SPADL actions, they work for all data providers that are supported by the SoccerAction library. """ @@ -36,10 +36,10 @@ import numpy as np import pandas as pd +from socceraction import spadl import socceraction.spadl.config as spadlcfg import socceraction.vaep.features as fs from socceraction.vaep.features import simple -from . import utils _spadl_cfg = { "length": 105, @@ -70,7 +70,7 @@ def decorate_func(func): # ############################################################################ -# SoccerAction-style gamestate features +# SoccerAction-style gamestate attributes # ############################################################################ actiontype = ftype("gamestates")(fs.actiontype) @@ -107,7 +107,7 @@ def speed(gamestates): between each action ai and action a0. """ a0 = gamestates[0] - spaced = pd.DataFrame() + spaced = pd.DataFrame(index=a0.index) for i, a in enumerate(gamestates[1:]): dt = a0.time_seconds - a.time_seconds dt[dt < 1] = 1 @@ -120,10 +120,36 @@ def speed(gamestates): # ############################################################################ -# Features on SPADL shots +# Attributes on SPADL shots # ############################################################################ +@ftype("actions") +def goal_from_shot(actions, shot_mask): + """Determine whether a goal was scored from the current action. + + This label can be used to train an xG model. + + Parameters + ---------- + actions : pd.DataFrame + The actions of a game in SPADL format. + shot_mask : pd.Series + A boolean mask to select the shots for which attributes should be + computed. 
+ + Returns + ------- + pd.DataFrame + A dataframe with a column 'goal' and a row for each shot set to + True if a goal was scored from the current shot; otherwise False. + """ + shots = actions.loc[shot_mask] + goaldf = pd.DataFrame(index=shots.index) + goaldf["goal"] = shots["result_name"] == "success" + return goaldf + + @ftype("actions") def shot_dist(actions, shot_mask): """Compute the distance to the middle of the goal. @@ -133,7 +159,7 @@ def shot_dist(actions, shot_mask): actions : pd.DataFrame The actions of a game in SPADL format. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -143,7 +169,7 @@ def shot_dist(actions, shot_mask): ('dist_shot'). """ shots = actions.loc[shot_mask] - distdf = pd.DataFrame() + distdf = pd.DataFrame(index=shots.index) dx = (_spadl_cfg["length"] - shots["start_x"]).values dy = (_spadl_cfg["width"] / 2 - shots["start_y"]).values distdf["dist_shot"] = np.sqrt(dx**2 + dy**2) @@ -162,7 +188,7 @@ def shot_location(actions, shot_mask): actions : pd.DataFrame The actions of a game in SPADL format. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -172,7 +198,7 @@ def shot_location(actions, shot_mask): and a column for the distance to the goal line ('dx_shot'). """ shots = actions.loc[shot_mask] - locationdf = pd.DataFrame() + locationdf = pd.DataFrame(index=shots.index) locationdf["dx_shot"] = _spadl_cfg["length"] - shots["start_x"] locationdf["dy_shot"] = (_spadl_cfg["width"] / 2 - shots["start_y"]).abs() return locationdf @@ -190,7 +216,7 @@ def shot_angle(actions, shot_mask): actions : pd.DataFrame The actions of a game in SPADL format. 
shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -200,7 +226,7 @@ def shot_angle(actions, shot_mask): ('angle_shot'). """ shots = actions.loc[shot_mask] - polardf = pd.DataFrame() + polardf = pd.DataFrame(index=shots.index) dx = (_spadl_cfg["length"] - shots["start_x"]).abs().values dy = (_spadl_cfg["width"] / 2 - shots["start_y"]).abs().values with np.errstate(divide="ignore", invalid="ignore"): @@ -217,7 +243,7 @@ def shot_visible_angle(actions, shot_mask): actions : pd.DataFrame The actions of a game in SPADL format. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -234,7 +260,7 @@ def shot_visible_angle(actions, shot_mask): shots = actions.loc[shot_mask] dx = _spadl_cfg["length"] - shots["start_x"] dy = _spadl_cfg["width"] / 2 - shots["start_y"] - angledf = pd.DataFrame() + angledf = pd.DataFrame(index=shots.index) angledf["visible_angle_shot"] = np.arctan( _spadl_cfg["goal_width"] * dx / (dx**2 + dy**2 - (_spadl_cfg["goal_width"] / 2) ** 2) ) @@ -268,7 +294,7 @@ def shot_relative_angle(actions, shot_mask): actions : pd.DataFrame The actions of a game in SPADL format. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -309,7 +335,7 @@ def shot_bodypart(actions, shot_mask): actions : pd.DataFrame The actions of a game in SPADL format. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -319,9 +345,11 @@ def shot_bodypart(actions, shot_mask): ('bodypart_name_shot'). 
""" shots = actions.loc[shot_mask] - bodypartdf = pd.DataFrame() + bodypartdf = pd.DataFrame(index=shots.index) bodypartdf["bodypart_name_shot"] = pd.Categorical( - shots["bodypart_name"].replace(["foot_left", "foot_right"], "foot"), categories=["foot", "head", "other"], ordered=False + shots["bodypart_name"].replace(["foot_left", "foot_right"], "foot"), + categories=["foot", "head", "other"], + ordered=False, ) return bodypartdf @@ -336,7 +364,7 @@ def shot_bodypart_detailed(actions, shot_mask): actions : pd.DataFrame The actions of a game in SPADL format. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -346,7 +374,7 @@ def shot_bodypart_detailed(actions, shot_mask): ('bodypart_name_shot'). """ shots = actions.loc[shot_mask] - bodypartdf = pd.DataFrame() + bodypartdf = pd.DataFrame(index=shots.index) bodypartdf["bodypart_name_shot"] = pd.Categorical( shots["bodypart_name"], categories=spadlcfg.bodyparts, ordered=False ) @@ -362,7 +390,7 @@ def shot_bodypart_onehot(actions, shot_mask): actions : pd.DataFrame The actions of a game in SPADL format. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -372,11 +400,13 @@ def shot_bodypart_onehot(actions, shot_mask): to take a shot. 
""" shots = actions.loc[shot_mask] - X = pd.DataFrame() + X = pd.DataFrame(index=shots.index) for bodypart_name in spadlcfg.bodyparts: col = "bodypart_" + bodypart_name + "_shot" if bodypart_name == "head/other": X[col] = shots["bodypart_name"].isin(["head", "other", "head/other"]) + elif bodypart_name == "foot": + X[col] = shots["bodypart_name"].isin(["foot", "foot_left", "foot_right"]) else: X[col] = shots["bodypart_name"] == bodypart_name return X @@ -397,7 +427,7 @@ def post_dribble(actions, shot_mask): actions : pd.DataFrame The actions of a game in SPADL format. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -434,7 +464,7 @@ def assist_type(actions, shot_mask): actions : pd.DataFrame The actions of a game in SPADL format. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -491,7 +521,7 @@ def fastbreak(actions, shot_mask): actions : pd.DataFrame The actions of a game in SPADL format. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -738,7 +768,7 @@ def fn(actions): caley_grid = ftype("gamestates")(custom_grid("caley_zone", _caley_shot_matrix(), _point_in_rect)) # ############################################################################ -# StatsBomb-specific features +# StatsBomb-specific attributes # ############################################################################ @@ -818,7 +848,7 @@ def statsbomb_open_goal(events, shot_mask): events : pd.DataFrame The StatsBomb events of a game. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. 
Returns @@ -846,7 +876,7 @@ def statsbomb_first_touch(events, shot_mask): events : pd.DataFrame The StatsBomb events of a game. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -877,7 +907,7 @@ def statsbomb_free_projection(events, shot_mask): events : pd.DataFrame The StatsBomb events of a game. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -950,7 +980,7 @@ def statsbomb_goalkeeper_position(events, shot_mask): events : pd.DataFrame The StatsBomb events of a game. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -1031,7 +1061,7 @@ def statsbomb_defenders_position(events, shot_mask): events : pd.DataFrame The StatsBomb events of a game. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -1105,7 +1135,7 @@ def statsbomb_assist(events, shot_mask): events : pd.DataFrame The StatsBomb events of a game. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -1198,7 +1228,7 @@ def statsbomb_counterattack(events, shot_mask): events : pd.DataFrame The StatsBomb events of a game. shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -1228,7 +1258,7 @@ def statsbomb_shot_impact_height(events, shot_mask): events : pd.DataFrame The StatsBomb events of a game. 
shot_mask : pd.Series - A boolean mask to select the shots for which features should be + A boolean mask to select the shots for which attributes should be computed. Returns @@ -1265,7 +1295,7 @@ def statsbomb_shot_impact_height(events, shot_mask): return output -all_features = [ +default_features = [ actiontype, bodypart, result, @@ -1290,23 +1320,19 @@ def statsbomb_shot_impact_height(events, shot_mask): ), ] -simple_features = [] -statsbomb_features = [] +default_labels = [goal_from_shot] -def extract_features_on_game( - game, actions, events=None, xfns=all_features, shotfilter=None, nb_prev_actions=3 +def compute_attributes( + game, + actions, + events=None, + xfns=default_features, + yfns=default_labels, + shotfilter=None, + nb_prev_actions=3, ): - """ - Extract features from - - * a :class:`pandas.DataFrame` containing SPADL actions - - and / or - - * a :class:`pandas.DataFrame` containing provider-specific event data - - In both cases a :class:`pandas.DataFrame` with the calculated features will be returned. + """Extract xG features for a given game. Parameters ---------- @@ -1315,10 +1341,12 @@ def extract_features_on_game( actions : pd.DataFrame A DataFrame containing SPADL actions. events: pd.DataFrame - A DataFrame containing the raw events. Can be used to calculate - provider-specific features. + A DataFrame containing the raw provider-specific events corresponding + to ``actions``. Can be used to calculate provider-specific features. xfns : list(callable) - List of feature generators to apply. + List of feature generators to apply. Defaults to ``default_features``. + yfns : list(callable) + List of label generators to apply. Defaults to ``default_labels``. shotfilter: callable(pd.Series) -> bool A function that takes a shot (in SPADL format) and returns True if the shot should be used for feature extraction. If None, all shots will be @@ -1332,46 +1360,60 @@ def extract_features_on_game( pd.DataFrame A DataFrame with the calculated features. 
""" - actions = utils.enhance_actions(actions) - # get shot index + # add names for result, bodypart and type + actions = spadl.utils.add_names(actions) + + # select shots if shotfilter is None: # filter shots and ignore own goals - shot_idx = actions.type_name.isin( + shot_mask = actions.type_name.isin( ["shot", "shot_penalty", "shot_freekick"] ) & actions.result_name.isin(["fail", "success"]) else: - shot_idx = actions.apply(lambda a: shotfilter(a), axis=1) - shot_actions_idx = actions.index.values[shot_idx] - shot_events_idx = actions.loc[shot_idx, "original_event_id"] - # handle inputs with no shots or no features - if shot_idx.sum() < 1: + shot_mask = actions.apply(lambda a: shotfilter(a), axis=1) + shot_actions_idx = actions.index[shot_mask] + shot_events_idx = actions.loc[shot_mask, "original_event_id"] + + # handle inputs with no shots or no attributes + if shot_mask.sum() < 1: # TODO: create the expected columns return pd.DataFrame() - if len(xfns) < 1: + if len(xfns + yfns) < 1: return pd.DataFrame(index=shot_actions_idx) - # convert actions to ltr gamestates + + # convert actions to ltr orientation + actions_ltr = spadl.utils.play_left_to_right(actions, game.home_team_id) + # convert actions to ltr shot gamestates gamestates = fs.gamestates(actions, nb_prev_actions) - gamestates = fs.play_left_to_right(gamestates, game.home_team_id) - # remove post-shot attributes - gamestates[0].loc[shot_idx, "end_x"] = float("NaN") - gamestates[0].loc[shot_idx, "end_y"] = float("NaN") - gamestates[0].loc[shot_idx, "result_id"] = float("NaN") - # get gamestates corresponding to shots - shot_gamestates = [states.loc[shot_idx] for states in gamestates] - # compute features - X = [] - for fn in xfns: - if getattr(fn, "ftype", None) == "gamestates": - X.append(fn(shot_gamestates).set_index(shot_events_idx.values)) - elif getattr(fn, "ftype", None) == "actions": - X.append(fn(gamestates[0], shot_idx).set_index(shot_events_idx.values)) - elif getattr(fn, "ftype", None) == 
"events": - X.append(fn(events, shot_events_idx)) - else: - warnings.warn("Unknown feature type for {}.".format(fn.__name__)) - X = pd.concat(X, axis=1).loc[shot_events_idx].set_index(shot_actions_idx) - missing_bool = X.select_dtypes(include=['boolean']).columns - X[missing_bool] = X[missing_bool].fillna(False).astype(bool) - # replace 'a0' by 'shot' in each feature name - X.rename(columns=lambda s: s.replace("a0", "shot"), inplace=True) - return X + gamestates_ltr = fs.play_left_to_right(gamestates, game.home_team_id) + shot_gamestates_ltr = [states.loc[shot_mask].copy() for states in gamestates_ltr] + # remove post-shot attributes to avoid target leakage + shot_gamestates_ltr[0]["end_x"] = float("NaN") + shot_gamestates_ltr[0]["end_y"] = float("NaN") + shot_gamestates_ltr[0]["result_id"] = float("NaN") + + # compute features and labels + def _apply_fns(fns): + attrs = [] + for fn in fns: + if getattr(fn, "ftype", None) == "gamestates": + attrs.append(fn(shot_gamestates_ltr).set_index(shot_events_idx)) + elif getattr(fn, "ftype", None) == "actions": + attrs.append(fn(actions_ltr, shot_mask).set_index(shot_events_idx)) + elif getattr(fn, "ftype", None) == "events": + attrs.append(fn(events, shot_events_idx)) + else: + warnings.warn("Unknown attribute type for {}.".format(fn.__name__)) + attrs = pd.concat(attrs, axis=1).loc[shot_events_idx].set_index(shot_actions_idx) + attrs.index.name = "action_id" + # fill missing values + missing_bool = attrs.select_dtypes(include=['boolean']).columns + attrs[missing_bool] = attrs[missing_bool].fillna(False).astype(bool) + # replace 'a0' by 'shot' in each feature name + attrs.rename(columns=lambda s: s.replace("a0", "shot"), inplace=True) + return attrs + + X = _apply_fns(xfns) + y = _apply_fns(yfns) + + return X, y diff --git a/soccer_xg/xg.py b/soccer_xg/xg.py index aef6626..5de3d77 100644 --- a/soccer_xg/xg.py +++ b/soccer_xg/xg.py @@ -1,18 +1,21 @@ """Tools for creating and analyzing xG models.""" -import os +from 
__future__ import annotations + +import warnings +from typing import Tuple, List, Optional +from pathlib import Path import joblib import pandas as pd -import socceraction.spadl.config as spadlcfg from sklearn.linear_model import LogisticRegression from sklearn.metrics import brier_score_loss, roc_auc_score from sklearn.pipeline import make_pipeline from sklearn.utils.validation import NotFittedError -from tqdm import tqdm +from tqdm.auto import tqdm -from soccer_xg import features as fs -from soccer_xg import metrics, utils -from soccer_xg.api import DataApi +from soccer_xg import attributes as fs +from soccer_xg import metrics +from soccer_xg.data.base import Dataset from soccer_xg.ml.preprocessing import simple_proc_for_linear_algoritms @@ -22,41 +25,45 @@ class XGModel: Parameters ---------- copy_data : boolean (default=``True``) - Whether or not to copy data when fitting and applying the model. Running the model - in-place (``copy_data=False``) will be faster and have a smaller memory footprint, - but if not done carefully can lead to data integrity issues. + Whether or not to copy data when fitting and applying the model. + Running the model in-place (``copy_data=False``) will be faster and + have a smaller memory footprint, but if not done carefully can lead to + data integrity issues. Attributes ---------- model : A Scikit-learn pipeline (or equivalent) - The actual model used to compute xG. Upon initialization it will be set to - a default model, but can be overridden by the user. + The actual model used to compute xG. Upon initialization it will be + set to a default model, but can be overridden by the user. column_descriptions : dictionary - A dictionary whose keys are the names of the columns used in the model, and the values are - string descriptions of what the columns mean. Set at initialization to be the default model, - if you create your own model you'll need to update this attribute manually. 
+ A dictionary whose keys are the names of the columns used in the + model, and the values are string descriptions of what the columns + mean. Set at initialization to be the default model, if you create + your own model you'll need to update this attribute manually. training_seasons : A list of tuples, or ``None`` (default=``None``) - If the model was trained using data from the DataApi, a list of (competition_id, season_id) tuples - used to train the model. If the DataApi was **not** used, an empty list. If no model - has been trained yet, ``None``. - validation_seasons : same as ``training_seasons``, but for validation data. + If the model was trained using data from a Dataset, a list of + (competition_id, season_id) tuples used to train the model. If no + Dataset was used, an empty list. If no model has been trained yet, + ``None``. + validation_seasons : A list of tuples, or ``None`` (default=``None``) + Same as ``training_seasons``, but for validation data. sample_probabilities : A numpy array of floats or ``None`` (default=``None``) - After the model has been validated, contains the sampled predicted probabilities used to - compute the validation statistic. + After the model has been validated, contains the sampled predicted + probabilities used to compute the validation statistic. predicted_goal_percents : A numpy array of floats or ``None`` (default=``None``) - After the model has been validated, contains the actual probabilities in the test - set at each probability in ``sample_probabilities``. + After the model has been validated, contains the actual probabilities + in the test set at each probability in ``sample_probabilities``. num_shots_used : A numpy array of floats or ``None`` (default=``None``) - After the model has been validated, contains the number of shots used to compute each - element of ``predicted_goal_percents``. 
+ After the model has been validated, contains the number of shots used + to compute each element of ``predicted_goal_percents``. model_directory : string The directory where all models will be saved to or loaded from. """ - model_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models") + model_directory = Path(__file__).resolve().parent / "models" _default_model_filename = "default_model.xg" - def __init__(self, copy_data=True): + def __init__(self, copy_data: bool = True): self.copy_data = copy_data self.column_descriptions = None @@ -70,7 +77,7 @@ def __init__(self, copy_data=True): self._num_shots_used = None @property - def training_seasons(self): + def training_seasons(self) -> Optional[List[Tuple[str, str]]]: return self._training_seasons @property @@ -89,11 +96,18 @@ def predicted_goal_percents(self): def num_shots_used(self): return self._num_shots_used + @classmethod + def filter_shots(cls, df_actions): + shot_mask = df_actions.type_name.isin( + ["shot", "shot_penalty", "shot_freekick"] + ) & df_actions.result_name.isin(["fail", "success"]) + return shot_mask + def train( self, - source_data, - training_seasons=(("ENG", "1617"), ("ENG", "1718")), - target_colname="goal", + source_data: Dataset | pd.DataFrame, + training_seasons: List[Tuple[str, str]] = (("ENG", "1617"), ("ENG", "1718")), + target_colname: str = "goal", ): """Train the model. @@ -103,7 +117,7 @@ def train( This method implements a simple wrapper around the core Scikit-learn functionality which does this. - The default is to use data from a DataApi object, however that can be changed + The default is to use data from a Dataset object, however that can be changed to a simple Pandas DataFrame with precomputed features and labels if desired. 
There is no particular output from this function, rather the parameters governing @@ -113,19 +127,19 @@ def train( Parameters ---------- - source_data : ``DataApi`` or a Pandas DataFrame + source_data : ``Dataset`` or a Pandas DataFrame The data to be used to train the model. If an instance of - ``DataApi`` is given, will query the api database for the training data. + ``Dataset`` is given, will query the api database for the training data. training_seasons : list of tuples (default=``[('ENG', '1617'), ('ENG', '1718')]``) - What seasons to use to train the model if getting data from a DataApi instance. - If ``source_data`` is not a ``DataApi``, this argument will be ignored. + What seasons to use to train the model if getting data from a Dataset instance. + If ``source_data`` is not a ``Dataset``, this argument will be ignored. **NOTE:** it is critical not to use all possible data in order to train the model - some will need to be reserved for a final validation (see the ``validate_model`` method). A good dataset to reserve for validation is the most recent one or two seasons. target_colname : string or integer (default=``"goal"``) The name of the target variable column. This is only relevant if - ``source_data`` is not a ``DataApi``. + ``source_data`` is not a ``Dataset``. 
Returns ------- @@ -136,15 +150,14 @@ def train( model.train(source_data, training_seasons, target_colname) else: self._training_seasons = [] - if isinstance(source_data, DataApi): - game_ids = source_data.games[ - source_data.games.season_id.astype(str).isin([s[1] for s in training_seasons]) - & source_data.games.competition_id.astype(str).isin( - [s[0] for s in training_seasons] - ) - ].index - feature_cols = get_features(source_data, game_ids) - target_col = get_labels(source_data, game_ids) + if isinstance(source_data, Dataset): + game_ids = pd.concat( + source_data.games(competition_id=s[0], season_id=s[1]) + for s in training_seasons + ).index + feature_cols, target_col = prepare( + source_data, game_ids, shotfilter=self.filter_shots + ) self._training_seasons = training_seasons else: target_col = source_data[target_colname] @@ -161,30 +174,34 @@ def validate( ): """Validate the model. - Once a modeling pipeline is trained, a different dataset must be fed into the trained model - to validate the quality of the fit. - This method implements a simple wrapper around the core Scikit-learn functionality + Once a modeling pipeline is trained, a different dataset must be fed + into the trained model to validate the quality of the fit. This method + implements a simple wrapper around the core Scikit-learn functionality which does this. - The default is to use data from a DataApi object, however that can be changed - to a simple Pandas DataFrame with precomputed features and labels if desired. + The default is to use data from a Dataset object, however that can be + changed to a simple Pandas DataFrame with precomputed features and + labels if desired. - The output of this method is a dictionary with relevant error metrics (see ``soccer_xg.metrics``). + The output of this method is a dictionary with relevant error metrics + (see ``soccer_xg.metrics``). 
Parameters ---------- - source_data : ``DataApi`` or a Pandas DataFrame + source_data : ``Dataset`` or a Pandas DataFrame The data to be used to validate the model. If an instance of - ``DataApi`` is given, will query the api database for the training data. + ``Dataset`` is given, will query the api database for the validation + data. validation_seasons : list of tuples (default=``[('ENG', '1819')]``) - What seasons to use to validated the model if getting data from a DataApi instance. - If ``source_data`` is not a ``DataApi``, this argument will be ignored. - **NOTE:** it is critical not to use the same data to validate the model as was used - in the fit. Generally a good data set to use for validation is one from a time - period more recent than was used to train the model. + What seasons to use to validate the model if getting data from + a Dataset instance. If ``source_data`` is not a ``Dataset``, this + argument will be ignored. **NOTE:** it is critical not to use the + same data to validate the model as was used in the fit. Generally + a good data set to use for validation is one from a time period + more recent than was used to train the model. target_colname : string or integer (default=``"goal"``) The name of the target variable column. This is only relevant if - ``source_data`` is not a ``DataApi``. + ``source_data`` is not a ``Dataset``. plot: bool (default=true) Whether to plot the AUROC and probability calibration curves.
@@ -202,14 +219,11 @@ def validate( if not self._fitted: raise NotFittedError("Must fit model before validating.") - if isinstance(source_data, DataApi): - game_ids = source_data.games[ - source_data.games.season_id.astype(str).isin([s[1] for s in validation_seasons]) - & source_data.games.competition_id.astype(str).isin( - [s[0] for s in validation_seasons] - ) - ].index - target_col = get_labels(source_data, game_ids) + if isinstance(source_data, Dataset): + game_ids = pd.concat( + source_data.games(competition_id=s[0], season_id=s[1]) for s in validation_seasons + ).index + _, target_col = prepare(source_data, game_ids) self._validation_seasons = validation_seasons else: game_ids = None @@ -273,18 +287,18 @@ def validate( def estimate(self, source_data, game_ids=None): """Estimate the xG values for all shots in a set of games. - The default is to use data from a DataApi object, however that can be changed + The default is to use data from a Dataset object, however that can be changed to a simple Pandas DataFrame with precomputed features and labels if desired. Parameters ---------- - source_data : ``DataApi`` or a Pandas DataFrame + source_data : ``Dataset`` or a Pandas DataFrame The data to be used to validate the model. If an instance of - ``DataApi`` is given, will query the api database for the training data. + ``Dataset`` is given, will query the api database for the training data. game_ids : list of ints (default=None) Only xG values for the games in this list are returned. By default, xG values are computed for all games in the source data. - If ``source_data`` is not a ``DataApi``, this argument will be ignored. + If ``source_data`` is not a ``Dataset``, this argument will be ignored. 
Returns ------- @@ -307,10 +321,9 @@ def estimate(self, source_data, game_ids=None): xg.append(model.estimate(source_data, game_ids)) return pd.concat(xg).sort_index() else: - if isinstance(source_data, DataApi): - if game_ids is None: - game_ids = source_data.games.index if game_ids is None else game_ids - source_data = get_features(source_data, game_ids) + if isinstance(source_data, Dataset): + game_ids = source_data.games().index if game_ids is None else game_ids + source_data, _ = prepare(source_data, game_ids) xg = pd.DataFrame(index=source_data.index) xg["xG"] = self.model.predict_proba(source_data)[:, 1] @@ -350,7 +363,7 @@ def save_model(self, filename=None): """ if filename is None: filename = self._default_model_filename - joblib.dump(self, os.path.join(self.model_directory, filename)) + joblib.dump(self, self.model_directory / filename) @classmethod def load_model(cls, filename=None): @@ -370,48 +383,12 @@ def load_model(cls, filename=None): if filename is None: filename = cls._default_model_filename - return joblib.load(os.path.join(cls.model_directory, filename)) + return joblib.load(cls.model_directory / filename) class OpenplayXGModel(XGModel): _default_model_filename = "default_openplay_model.xg" - def train( - self, - source_data, - training_seasons=(("ENG", "1617"), ("ENG", "1718")), - target_colname="goal", - ): - self._training_seasons = [] - if isinstance(source_data, DataApi): - game_ids = source_data.games[ - source_data.games.season_id.astype(str).isin([s[1] for s in training_seasons]) - & source_data.games.competition_id.astype(str).isin( - [s[0] for s in training_seasons] - ) - ].index - feature_cols = get_features( - source_data, game_ids, shotfilter=OpenplayXGModel.filter_shots - ) - target_col = get_labels(source_data, game_ids, shotfilter=OpenplayXGModel.filter_shots) - self._training_seasons = training_seasons - else: - target_col = source_data[target_colname] - feature_cols = source_data.drop(target_colname, axis=1) - 
self.model.fit(feature_cols, target_col) - self._fitted = True - - def estimate(self, source_data, game_ids=None): - if isinstance(source_data, DataApi): - game_ids = source_data.games.index if game_ids is None else game_ids - source_data = get_features( - source_data, game_ids, shotfilter=OpenplayXGModel.filter_shots - ) - - xg = pd.DataFrame(index=source_data.index) - xg["xG"] = self.model.predict_proba(source_data)[:, 1] - return xg - def create_default_pipeline(self): bodypart_colname = "bodypart_id_a0" dist_to_goal_colname = "start_dist_to_goal_a0" @@ -430,8 +407,8 @@ def create_default_pipeline(self): pipe = make_pipeline(preprocess_pipeline, base_model) return pipe - @staticmethod - def filter_shots(df_actions): + @classmethod + def filter_shots(cls, df_actions): shot_idx = (df_actions.type_name == "shot") & df_actions.result_name.isin( ["fail", "success"] ) @@ -454,9 +431,9 @@ def train( pass def estimate(self, source_data, game_ids=None): - if isinstance(source_data, DataApi): + if isinstance(source_data, Dataset): game_ids = source_data.games.index if game_ids is None else game_ids - source_data = get_features( + source_data, _ = prepare( source_data, game_ids, xfns=[], @@ -471,8 +448,8 @@ def estimate(self, source_data, game_ids=None): def create_default_pipeline(self): return None - @staticmethod - def filter_shots(df_actions): + @classmethod + def filter_shots(cls, df_actions): shot_idx = df_actions.type_name == "shot_penalty" return shot_idx @@ -480,42 +457,6 @@ def filter_shots(df_actions): class FreekickXGModel(XGModel): _default_model_filename = "default_freekick_model.xg" - def train( - self, - source_data, - training_seasons=(("ENG", "1617"), ("ENG", "1718")), - target_colname="goal", - ): - self._training_seasons = [] - if isinstance(source_data, DataApi): - game_ids = source_data.games[ - source_data.games.season_id.astype(str).isin([s[1] for s in training_seasons]) - & source_data.games.competition_id.astype(str).isin( - [s[0] for s in 
training_seasons] - ) - ].index - feature_cols = get_features( - source_data, game_ids, shotfilter=FreekickXGModel.filter_shots - ) - target_col = get_labels(source_data, game_ids, shotfilter=FreekickXGModel.filter_shots) - self._training_seasons = training_seasons - else: - target_col = source_data[target_colname] - feature_cols = source_data.drop(target_colname, axis=1) - self.model.fit(feature_cols, target_col) - self._fitted = True - - def estimate(self, source_data, game_ids=None): - if isinstance(source_data, DataApi): - game_ids = source_data.games.index if game_ids is None else game_ids - source_data = get_features( - source_data, game_ids, shotfilter=FreekickXGModel.filter_shots - ) - - xg = pd.DataFrame(index=source_data.index) - xg["xG"] = self.model.predict_proba(source_data)[:, 1] - return xg - def create_default_pipeline(self): dist_to_goal_colname = "start_dist_to_goal_a0" angle_to_goal_colname = "start_angle_to_goal_a0" @@ -532,109 +473,75 @@ def create_default_pipeline(self): pipe = make_pipeline(preprocess_pipeline, base_model) return pipe - @staticmethod - def filter_shots(df_actions): + @classmethod + def filter_shots(cls, df_actions): shot_idx = df_actions.type_name == "shot_freekick" return shot_idx -def get_features( - api, +def prepare( + dataset: Dataset, game_ids=None, - xfns=fs.all_features, + xfns=fs.default_features, + yfns=fs.default_labels, shotfilter=None, nb_prev_actions=3, + on_fail="raise", ): - game_ids = api.games.index if game_ids is None else game_ids - X = {} - for game_id in tqdm(game_ids, desc=f"Generating features"): - # try: - game = api.games.loc[game_id] - game_actions = utils.enhance_actions(api.get_actions(game_id)) - game_events = api.get_events(game_id) - X[game_id] = _compute_features_game(game, game_actions, xfns, shotfilter, nb_prev_actions) - X[game_id].index.name = "action_id" - X[game_id]["game_id"] = game_id - # except Exception as e: - # print(f"Failed for game with id={game_id}: {e}") - X = 
pd.concat(X.values()).reset_index().set_index(["game_id", "action_id"]) - # remove post-shot features (these will all have a single unique value) - f = X.columns[X.nunique() > 1] - return X[f] - + """Prepare a dataset for training and validation. -def _compute_features_game( - game, actions, xfns=fs.all_features, shotfilter=None, nb_prev_actions=3 -): - if shotfilter is None: - # filter shots and ignore own goals - shot_idx = actions.type_name.isin( - ["shot", "shot_penalty", "shot_freekick"] - ) & actions.result_name.isin(["fail", "success"]) - else: - shot_idx = shotfilter(actions) - if shot_idx.sum() < 1: - return pd.DataFrame() - if len(xfns) < 1: - return pd.DataFrame(index=actions.index.values[shot_idx]) - # convert actions to gamestates - gamestates = [ - states.loc[shot_idx].copy() for states in fs.gamestates(actions, nb_prev_actions) - ] - gamestates = fs.play_left_to_right(gamestates, game.home_team_id) - # remove post-shot attributes - gamestates[0].loc[shot_idx, "end_x"] = float("NaN") - gamestates[0].loc[shot_idx, "end_y"] = float("NaN") - gamestates[0].loc[shot_idx, "result_id"] = float("NaN") - # compute features - X = pd.concat([fn(gamestates).reset_index(drop=True) for fn in xfns], axis=1).set_index( - actions.loc[shot_idx].index - ) - # fix data types - for c in [c for c in X.columns.values if c.startswith("type_id")]: - X[c] = pd.Categorical( - X[c].replace(spadlcfg.actiontypes_df().type_name.to_dict()), - categories=spadlcfg.actiontypes, - ordered=False, - ) - for c in [c for c in X.columns.values if c.startswith("result_id")]: - X[c] = pd.Categorical( - X[c].replace(spadlcfg.results_df().result_name.to_dict()), - categories=spadlcfg.results, - ordered=False, - ) - for c in [c for c in X.columns.values if c.startswith("bodypart_id")]: - X[c] = pd.Categorical( - X[c].replace(spadlcfg.bodyparts_df().bodypart_name.to_dict()), - categories=spadlcfg.bodyparts, - ordered=False, - ) - return X - - -def get_labels(api, game_ids=None, shotfilter=None): - 
game_ids = api.games.index if game_ids is None else game_ids - y = {} - for game_id in tqdm(game_ids, desc=f"Generating labels"): + Parameters + ---------- + dataset : Dataset + The dataset to use. + game_ids : list of ints (default=None) + Only use data from the games in this list. By default, all games + in the dataset are used. + xfns : list(callable) + List of feature generators to apply. Defaults to ``default_features``. + yfns : list(callable) + List of label generators to apply. Defaults to ``default_labels``. + shotfilter: callable(pd.DataFrame) -> pd.Series + A function that takes the actions of a game (in SPADL format) and + returns a boolean mask selecting the shots to use. If None, all shots + will be used (excluding own-goals). + nb_prev_actions: int + The number of previous actions to include in each shot's game state. + on_fail: 'raise' or 'warn' + What to do if a feature or label function fails on a specific game. + + Returns + ------- + X : pd.DataFrame + A dataframe containing the features. + y : pd.DataFrame + A dataframe containing the labels. 
+ """ + game_ids = dataset.games().index if game_ids is None else game_ids + X, y = {}, {} + for game_id in tqdm(game_ids, desc="Preparing dataset"): try: - game = api.games.loc[game_id] - game_actions = utils.enhance_actions(api.get_actions(game_id)) - y[game_id] = _compute_labels_game(game, game_actions, shotfilter) - y[game_id].index.name = "action_id" + game = dataset.games().loc[game_id] + game_actions = dataset.actions(game_id) + game_events = dataset.events(game_id) + X[game_id], y[game_id] = fs.compute_attributes( + game, + game_actions, + events=game_events, + xfns=xfns, + yfns=yfns, + shotfilter=shotfilter, + nb_prev_actions=nb_prev_actions, + ) + X[game_id]["game_id"] = game_id y[game_id]["game_id"] = game_id except Exception as e: - print(e) - return pd.concat(y.values()).reset_index().set_index(["game_id", "action_id"])["goal"] - - -def _compute_labels_game(game, actions, shotfilter=None): - # compute labels - y = actions["result_name"] == "success" - if shotfilter is None: - # filter shots and ignore own goals - shot_idx = actions.type_name.isin( - ["shot", "shot_penalty", "shot_freekick"] - ) & actions.result_name.isin(["fail", "success"]) - else: - shot_idx = shotfilter(actions) - return y.loc[shot_idx].to_frame("goal") + if on_fail == "warn": + warnings.warn(f"Failed for game with id={game_id}: {e}") + else: + raise RuntimeError(f"Failed for game with id={game_id}.") from e + X = pd.concat(X.values()).reset_index().set_index(["game_id", "action_id"]) + # remove post-shot features (these will all have a single unique value) + f = X.columns[X.nunique() > 1] + y = pd.concat(y.values()).reset_index().set_index(["game_id", "action_id"]) + return X[f], y