From 97fc6f813c331462035d221e1959a1e09acc561a Mon Sep 17 00:00:00 2001 From: Pieter Robberechts Date: Tue, 4 Jun 2024 15:32:20 +0200 Subject: [PATCH] dev --- soccer_xg/__init__.py | 24 +- soccer_xg/attributes.py | 408 ++++++++++++++-------- soccer_xg/calibration.py | 110 +----- soccer_xg/data/hdf.py | 2 +- soccer_xg/metrics.py | 179 ++++++++-- soccer_xg/ml/logreg.py | 82 ++--- soccer_xg/ml/mlp.py | 57 +--- soccer_xg/ml/pipeline.py | 21 ++ soccer_xg/ml/preprocessing.py | 3 +- soccer_xg/ml/xgboost.py | 60 +--- soccer_xg/utils.py | 29 +- soccer_xg/visualisation.py | 512 ++++++++++++++++++++++------ soccer_xg/xg.py | 613 ++++++++++++++++++++++------------ 13 files changed, 1309 insertions(+), 791 deletions(-) diff --git a/soccer_xg/__init__.py b/soccer_xg/__init__.py index e17d960..569ab07 100644 --- a/soccer_xg/__init__.py +++ b/soccer_xg/__init__.py @@ -4,7 +4,27 @@ :copyright: (c) 2023 by DTAI KU Leuven. :license: Apache v2, see LICENSE for more details. """ -from soccer_xg.xg import XGModel +from soccer_xg.data import Dataset, HDFDataset, SQLDataset +from soccer_xg.xg import ( + DatasetTransformer, + XGModel, + PenaltyXGModel, + FreekickXGModel, + BasicOpenplayXGModel, + AdvancedOpenplayXGModel, + StatsBombOpenplayXGModel, +) __version__ = '0.0.1' -__all__ = ['XGModel'] +__all__ = [ + 'Dataset', + 'HDFDataset', + 'SQLDataset', + 'DatasetTransformer', + 'XGModel', + 'PenaltyXGModel', + 'FreekickXGModel', + 'BasicOpenplayXGModel', + 'AdvancedOpenplayXGModel', + 'StatsBombOpenplayXGModel', +] diff --git a/soccer_xg/attributes.py b/soccer_xg/attributes.py index 06e8d16..9dfe207 100644 --- a/soccer_xg/attributes.py +++ b/soccer_xg/attributes.py @@ -22,7 +22,7 @@ a :class:`pandas.DataFrame` of events and a series with event IDs to select the shots for which attributes should be computed. -The types are specified using the ``ftype`` decorator. Only functions, which +The types are specified using the ``feature`` decorator. 
Only functions, which have a parameter called "ftype" are seen by soccer-xg as a generator. Others will not be calculated. @@ -32,13 +32,13 @@ """ import math import warnings +from typing import Callable import numpy as np import pandas as pd - -from socceraction import spadl import socceraction.spadl.config as spadlcfg import socceraction.vaep.features as fs +from socceraction import spadl from socceraction.vaep.features import simple _spadl_cfg = { @@ -56,14 +56,43 @@ "circle_radius": 9.15, } +# Typing +Events = pd.DataFrame +Actions = pd.DataFrame +GameStates = list[Actions] +GameStateAttributeGenerator = Callable[[GameStates], pd.DataFrame] +ActionsAttributeGenerator = Callable[[Actions, pd.Series], pd.DataFrame] +EventsAttributeGenerator = Callable[[Events, pd.Series], pd.DataFrame] +AttributeGenerator = ( + EventsAttributeGenerator | ActionsAttributeGenerator | GameStateAttributeGenerator +) -def ftype(value): - """ - This method returns a decorator that sets the property key of the function to value - """ + +# Decorators + +_FEATURE_REGISTRY = {} +_LABEL_REGISTRY = {} + +def feature(ftype, features=None): + """A decorator that sets the property 'ftype' of the function to value.""" def decorate_func(func): - setattr(func, "ftype", value) + setattr(func, "ftype", ftype) + if features is not None: + for feature in features: + _FEATURE_REGISTRY[feature] = func + return func + + return decorate_func + +def label(ftype, labels=None): + """A decorator that sets the property 'ftype' of the function to value.""" + + def decorate_func(func): + setattr(func, "ftype", ftype) + if labels is not None: + for label in labels: + _LABEL_REGISTRY[label] = func return func return decorate_func @@ -73,50 +102,22 @@ def decorate_func(func): # SoccerAction-style gamestate attributes # ############################################################################ -actiontype = ftype("gamestates")(fs.actiontype) -actiontype_onehot = ftype("gamestates")(fs.actiontype_onehot) -result = 
ftype("gamestates")(fs.result) -result_onehot = ftype("gamestates")(fs.result_onehot) -actiontype_result_onehot = ftype("gamestates")(fs.actiontype_result_onehot) -bodypart = ftype("gamestates")(fs.bodypart) -bodypart_onehot = ftype("gamestates")(fs.bodypart_onehot) -startlocation = ftype("gamestates")(fs.startlocation) -endlocation = ftype("gamestates")(fs.endlocation) -startpolar = ftype("gamestates")(fs.startpolar) -endpolar = ftype("gamestates")(fs.endpolar) -team = ftype("gamestates")(fs.team) -movement = ftype("gamestates")(fs.movement) -time_delta = ftype("gamestates")(fs.time_delta) -space_delta = ftype("gamestates")(fs.space_delta) - - -@ftype("gamestates") -def speed(gamestates): - """Get the movement speed of the ball between the last and previous actions. - - Parameters - ---------- - gamestates : list(pd.DataFrame) - The game states of a game. - - Returns - ------- - pd.DataFrame - A dataframe with a column for the horizontal ('speedx_a0i'), vertical - ('speedy_a0i') and total ('speed_a0i') movement speed of the ball - between each action ai and action a0. 
- """ - a0 = gamestates[0] - spaced = pd.DataFrame(index=a0.index) - for i, a in enumerate(gamestates[1:]): - dt = a0.time_seconds - a.time_seconds - dt[dt < 1] = 1 - dx = a.end_x - a0.start_x - spaced["speedx_a0" + (str(i + 1))] = dx.abs() / dt - dy = a.end_y - a0.start_y - spaced["speedy_a0" + (str(i + 1))] = dy.abs() / dt - spaced["speed_a0" + (str(i + 1))] = np.sqrt(dx**2 + dy**2) / dt - return spaced +actiontype = feature("gamestates")(fs.actiontype) +actiontype_onehot = feature("gamestates")(fs.actiontype_onehot) +result = feature("gamestates")(fs.result) +result_onehot = feature("gamestates")(fs.result_onehot) +actiontype_result_onehot = feature("gamestates")(fs.actiontype_result_onehot) +bodypart = feature("gamestates", ["bodypart"])(fs.bodypart) +bodypart_onehot = feature("gamestates")(fs.bodypart_onehot) +startlocation = feature("gamestates")(fs.startlocation) +endlocation = feature("gamestates")(fs.endlocation) +startpolar = feature("gamestates")(fs.startpolar) +endpolar = feature("gamestates")(fs.endpolar) +team = feature("gamestates")(fs.team) +movement = feature("gamestates")(fs.movement) +time_delta = feature("gamestates")(fs.time_delta) +space_delta = feature("gamestates")(fs.space_delta) +speed = feature("gamestates")(fs.speed) # ############################################################################ @@ -124,7 +125,7 @@ def speed(gamestates): # ############################################################################ -@ftype("actions") +@label("actions", ["goal"]) def goal_from_shot(actions, shot_mask): """Determine whether a goal was scored from the current action. @@ -150,7 +151,34 @@ def goal_from_shot(actions, shot_mask): return goaldf -@ftype("actions") +@feature("actions", ["type_name_shot"]) +def shot_type(actions, shot_mask): + """Compute the shot's action type. + + Parameters + ---------- + actions : pd.DataFrame + The actions of a game in SPADL format. 
+ shot_mask : pd.Series + A boolean mask to select the shots for which attributes should be + computed. + + Returns + ------- + pd.DataFrame + A dataframe with a column for the shot's action type ('type_name_shot'). + """ + shots = actions.loc[shot_mask] + shottypedf = pd.DataFrame(index=shots.index) + shottypedf["type_name_shot"] = pd.Categorical( + shots["type_name"], + categories=["shot", "shot_penalty", "shot_freekick"], + ordered=False, + ) + return shottypedf + + +@feature("actions", ["dist_shot"]) def shot_dist(actions, shot_mask): """Compute the distance to the middle of the goal. @@ -176,7 +204,7 @@ def shot_dist(actions, shot_mask): return distdf -@ftype("actions") +@feature("actions", ["dx_shot", "dy_shot"]) def shot_location(actions, shot_mask): """Compute the distance to the mid line and goal line. @@ -204,7 +232,7 @@ def shot_location(actions, shot_mask): return locationdf -@ftype("actions") +@feature("actions", ["angle_shot"]) def shot_angle(actions, shot_mask): """Compute the angle to the middle of the goal. @@ -234,7 +262,7 @@ def shot_angle(actions, shot_mask): return polardf -@ftype("actions") +@feature("actions", ["visible_angle_shot"]) def shot_visible_angle(actions, shot_mask): """Compute the angle formed between the shot location and the two goal posts. @@ -280,7 +308,7 @@ def shot_visible_angle(actions, shot_mask): return angledf -@ftype("actions") +@feature("actions", ["relative_angle_shot"]) def shot_relative_angle(actions, shot_mask): """Compute the relative angle to goal. @@ -326,7 +354,7 @@ def shot_relative_angle(actions, shot_mask): return angledf[["relative_angle_shot"]] -@ftype("actions") +@feature("actions", ["bodypart_name_shot"]) def shot_bodypart(actions, shot_mask): """Return the body part used to take the shot. 
@@ -354,7 +382,7 @@ def shot_bodypart(actions, shot_mask): return bodypartdf -@ftype("actions") +@feature("actions", ["detailed_bodypart_name_shot"]) def shot_bodypart_detailed(actions, shot_mask): """Return the body part used to take the shot, distinguishing between the left and right foot. @@ -375,13 +403,13 @@ def shot_bodypart_detailed(actions, shot_mask): """ shots = actions.loc[shot_mask] bodypartdf = pd.DataFrame(index=shots.index) - bodypartdf["bodypart_name_shot"] = pd.Categorical( + bodypartdf["detailed_bodypart_name_shot"] = pd.Categorical( shots["bodypart_name"], categories=spadlcfg.bodyparts, ordered=False ) return bodypartdf -@ftype("actions") +@feature("actions", [f"bodypart_{type_name}_shot" for type_name in spadlcfg.bodyparts]) def shot_bodypart_onehot(actions, shot_mask): """Return the one-hot encoded body part used to take the shot. @@ -412,7 +440,7 @@ def shot_bodypart_onehot(actions, shot_mask): return X -@ftype("actions") +@feature("actions", ["post_dribble", "carry_length"]) def post_dribble(actions, shot_mask): """Compute features describing the dribble before the shot. @@ -442,15 +470,16 @@ def post_dribble(actions, shot_mask): for idx in actions.loc[shot_mask].index: carry_length = 0 maybe_carry = actions.loc[:idx].iloc[-1] - if maybe_carry.type_name == "dribble": + post_dribble = maybe_carry.type_name == "dribble" + if post_dribble: dx = maybe_carry.end_x - maybe_carry.start_x dy = maybe_carry.end_y - maybe_carry.start_y carry_length = math.sqrt(dx**2 + dy**2) - df[idx] = {"carry_length": carry_length} + df[idx] = {"carry_length": carry_length, "post_dribble": post_dribble} return pd.DataFrame.from_dict(df, orient="index") -@ftype("actions") +@feature("actions", ["type_name_assist"]) def assist_type(actions, shot_mask): """Return the assist type. 
@@ -504,11 +533,11 @@ def assist_type(actions, shot_mask): assist_type = assist.type_name if assist else "direct" # TODO (assist_technique): The technique for crosses one of straight, # inswinging, or out swinging and whether the pass was a through ball - df[idx] = {"type_assist": assist_type} + df[idx] = {"type_name_assist": assist_type} return pd.DataFrame.from_dict(df, orient="index") -@ftype("actions") +@feature("actions", ["fastbreak"]) def fastbreak(actions, shot_mask): """Get whether the shot was part of a counter attack. @@ -559,7 +588,7 @@ def fastbreak(actions, shot_mask): return pd.DataFrame.from_dict(df, orient="index") -@ftype("actions") +@feature("actions", ["rebound", "time_prev_shot"]) def rebound(actions, shot_mask): """Get whether the shot was a rebound. @@ -718,6 +747,33 @@ def fn(point): def triangular_grid(name, angle_bins, dist_bins, symmetrical=False): + """Get the location of a shot as a cell index from a triangular grid. + + Parameters + ---------- + name : str + Name of the feature. + angle_bins : list + A monotonically increasing array of bin edges for the angle of the + shot wrt the center of the goal, including the rightmost edge. In degrees. + dist_bins : list + A monotonically increasing array of bin edges for the distance of the + shot to the center of the goal, including the rightmost edge. In meters. + symmetrical : bool + Whether to use a symmetrical grid. Default: False. + + Examples + -------- + >>> shot_zone = feature("gamestates", ["shot_zone"])( + >>> triangular_grid( + >>> "shot_zone", + >>> [-50, -20, 20, 50], + >>> [2, 4, 8, 11, 16, 24, 34, 50], + >>> symmetrical=True, + >>> ) + >>> ) + """ + @simple def fn(actions): zonedf = startpolar(actions) @@ -741,12 +797,39 @@ def fn(actions): return fn -def rectangular_grid(name, x_bins, y_bins, symmetrical=False, cfg=_spadl_cfg): +def rectangular_grid(name, x_bins, y_bins, symmetrical=False): + """Get the location of a shot as a cell index from a rectangular grid. 
+ + Parameters + ---------- + name : str + Name of the feature. + x_bins : list + A monotonically increasing array of bin edges for the length of the + pitch, including the rightmost edge. In meters. + y_bins : list + A monotonically increasing array of bin edges for the width of the + pitch, including the rightmost edge. In meters. + symmetrical : bool + Whether to use a symmetrical grid. Default: False. + + Examples + -------- + >>> shot_zone = feature("gamestates", ["shot_zone"])( + >>> rectangular_grid( + >>> "shot_zone", + >>> np.arange(0, 105, 2), + >>> np.arange(0, 68, 2), + >>> symmetrical=False, + >>> ) + >>> ) + """ + @simple def fn(actions): zonedf = actions[["start_x", "start_y"]].copy() if symmetrical: - m = (cfg["origin_y"] + cfg["width"]) / 2 + m = (_spadl_cfg["origin_y"] + _spadl_cfg["width"]) / 2 zonedf.loc[zonedf.start_y > m, "start_y"] -= m x_bin = np.digitize(zonedf.start_x, x_bins) y_bin = np.digitize(zonedf.start_y, y_bins) @@ -762,6 +845,26 @@ def fn(actions): def custom_grid(name, zones, is_in_zone): + """Get the location of a shot as a cell index from a custom grid. + + Parameters + ---------- + name : str + Name of the feature. 
+ zones : list + is_in_zone : callable + + Examples + -------- + >>> shot_zone = feature("gamestates", ["shot_zone"])( + >>> custom_grid( + >>> "shot_zone", + >>> _caley_shot_matrix(), + >>> _point_in_rect, + >>> ) + >>> ) + """ + @simple def fn(actions): zonedf = actions[["start_x", "start_y"]].copy() @@ -786,7 +889,7 @@ def fn(actions): return fn -caley_grid = ftype("gamestates")(custom_grid("caley_zone", _caley_shot_matrix(), _point_in_rect)) +caley_grid = feature("gamestates", ["caley_zone"])(custom_grid("caley_zone", _caley_shot_matrix(), _point_in_rect)) # ############################################################################ # StatsBomb-specific attributes @@ -857,7 +960,7 @@ def _is_inside_triangle(point, tri_points): return False -@ftype("events") +@feature("events", ["sb_open_goal"]) def statsbomb_open_goal(events, shot_mask): """Get whether the shot was taken into an open goal. @@ -879,13 +982,13 @@ def statsbomb_open_goal(events, shot_mask): output = {} for idx, shot in events.loc[shot_mask].iterrows(): if "shot" in shot.extra: - output[idx] = {"open_goal": "open_goal" in shot.extra['shot']} + output[idx] = {"sb_open_goal": "open_goal" in shot.extra['shot']} output = pd.DataFrame.from_dict(output, orient="index") return output -@ftype("events") +@feature("events", ["sb_first_touch"]) def statsbomb_first_touch(events, shot_mask): """Get whether the shot was a first-touch shot. @@ -907,13 +1010,13 @@ def statsbomb_first_touch(events, shot_mask): output = {} for idx, shot in events.loc[shot_mask].iterrows(): if "shot" in shot.extra: - output[idx] = {"first_touch": "first_time" in shot.extra['shot']} + output[idx] = {"sb_first_touch": "first_time" in shot.extra['shot']} output = pd.DataFrame.from_dict(output, orient="index") return output -@ftype("events") +@feature("events", ["sb_free_projection_gaps", "sb_free_projection_pct"]) def statsbomb_free_projection(events, shot_mask): """Get the free projection area. 
@@ -985,14 +1088,14 @@ def statsbomb_free_projection(events, shot_mask): new_free_projection.append(projection) free_projection = [p for p in new_free_projection if p[1] - p[0] > 0] output[idx] = { - "free_projection_gaps": len(free_projection), - "free_projection_pct": np.sum(np.diff(free_projection)) / np.diff(goal)[0], + "sb_free_projection_gaps": len(free_projection), + "sb_free_projection_pct": np.sum(np.diff(free_projection)) / np.diff(goal)[0], } output = pd.DataFrame.from_dict(output, orient="index") return output -@ftype("events") +@feature("events", ["sb_goalkeeper_x", "sb_goalkeeper_y", "sb_goalkeeper_dist_to_ball", "sb_goalkeeper_dist_to_goal", "sb_goalkeeper_angle_to_goal"]) def statsbomb_goalkeeper_position(events, shot_mask): """Get the goalkeeper's position. @@ -1049,17 +1152,17 @@ def statsbomb_goalkeeper_position(events, shot_mask): goalkeeper_dist_to_ball = math.sqrt(dx_kb**2 + dy_kb**2) output[idx] = { - "goalkeeper_x": goalkeeper_x, - "goalkeeper_y": goalkeeper_y, - "goalkeeper_dist_to_ball": goalkeeper_dist_to_ball, - "goalkeeper_dist_to_goal": goalkeeper_dist_to_goal, - "goalkeeper_angle_to_goal": goalkeeper_angle_to_goal, + "sb_goalkeeper_x": goalkeeper_x, + "sb_goalkeeper_y": goalkeeper_y, + "sb_goalkeeper_dist_to_ball": goalkeeper_dist_to_ball, + "sb_goalkeeper_dist_to_goal": goalkeeper_dist_to_goal, + "sb_goalkeeper_angle_to_goal": goalkeeper_angle_to_goal, } output = pd.DataFrame.from_dict(output, orient="index") return output -@ftype("events") +@feature("events", ["sb_dist_to_defender", "sb_under_pressure", "sb_nb_defenders_in_shot_line", "sb_nb_defenders_behind_ball", "sb_one_on_one"]) def statsbomb_defenders_position(events, shot_mask): """Get features describing the position of the defending players. 
@@ -1120,23 +1223,23 @@ def statsbomb_defenders_position(events, shot_mask): ) behind_ball.append(defender_x > ball_x) output[idx] = { - "dist_to_defender": min(distances, default=float("inf")), - "under_pressure": shot.under_pressure, - "nb_defenders_in_shot_line": sum(in_shot_line), - "nb_defenders_behind_ball": sum(behind_ball), - "one_on_one": ( + "sb_dist_to_defender": min(distances, default=float("inf")), + "sb_under_pressure": shot.under_pressure, + "sb_nb_defenders_in_shot_line": sum(in_shot_line), + "sb_nb_defenders_behind_ball": sum(behind_ball), + "sb_one_on_one": ( sum(behind_ball) == 0 and sum(in_shot_line) == 0 and shot.extra["shot"]["body_part"]["name"] in ["Left Foot", "Right Foot"] ), } output = pd.DataFrame.from_dict(output, orient="index") - output["one_on_one"] = output["one_on_one"].astype('boolean') - output["under_pressure"] = output["under_pressure"].astype('boolean') + output["sb_one_on_one"] = output["sb_one_on_one"].astype('boolean') + output["sb_under_pressure"] = output["sb_under_pressure"].astype('boolean') return output -@ftype("events") +@feature("events", ["sb_end_x_assist", "sb_end_y_assist", "sb_carry_dist", "sb_type_name_assist", "sb_height_assist"]) def statsbomb_assist(events, shot_mask): # noqa: C901 """Get features describing the assist. @@ -1145,7 +1248,7 @@ def statsbomb_assist(events, shot_mask): # noqa: C901 - end_y_assist: The assisting pass' y-coordinate - carry_dist: The distance between the end location of the assisting pass and the location of the shot. - - type_assist: The assist type, which is one of 'standard_pass', + - type_name_assist: The assist type, which is one of 'standard_pass', 'free_kick', 'corner', 'throw_in', 'cross', 'cut_back' or 'through_ball'. 
- height_assist: The peak height of the assisting pass, which is one of 'ground', 'low' (under shoulder level) or 'high' (above shoulder @@ -1210,16 +1313,16 @@ def statsbomb_assist(events, shot_mask): # noqa: C901 assist_height = m[assist.extra["pass"]["height"]["name"]] output[event_id] = { - "end_x_assist": assist_x, - "end_y_assist": assist_y, - "carry_dist": math.sqrt((shot_x - assist_x) ** 2 + (shot_y - assist_y) ** 2), - "type_assist": assist_type, - "height_assist": assist_height, + "sb_end_x_assist": assist_x, + "sb_end_y_assist": assist_y, + "sb_carry_dist": math.sqrt((shot_x - assist_x) ** 2 + (shot_y - assist_y) ** 2), + "sb_type_name_assist": assist_type, + "sb_height_assist": assist_height, } output = pd.DataFrame.from_dict(output, orient="index") - output["type_assist"] = pd.Categorical( - output["type_assist"], + output["sb_type_name_assist"] = pd.Categorical( + output["sb_type_name_assist"], categories=[ "standard_pass", "free_kick", @@ -1231,13 +1334,13 @@ def statsbomb_assist(events, shot_mask): # noqa: C901 ], ordered=False, ) - output["height_assist"] = pd.Categorical( - output["height_assist"], categories=["ground", "low", "high"], ordered=True + output["sb_height_assist"] = pd.Categorical( + output["sb_height_assist"], categories=["ground", "low", "high"], ordered=True ) return output -@ftype("events") +@feature("events", ["sb_from_counterattack"]) def statsbomb_counterattack(events, shot_mask): """Get whether a shot was from a counterattack. 
@@ -1259,14 +1362,14 @@ def statsbomb_counterattack(events, shot_mask): output = {} for idx, shot in events.loc[shot_mask].iterrows(): output[idx] = { - "from_counterattack": shot.play_pattern_name == "From Counter", + "sb_from_counterattack": shot.play_pattern_name == "From Counter", } output = pd.DataFrame.from_dict(output, orient="index") return output -@ftype("events") +@feature("events", ["sb_impact_height"]) def statsbomb_shot_impact_height(events, shot_mask): """Get the height of the ball when the shot was taken. @@ -1307,41 +1410,23 @@ def statsbomb_shot_impact_height(events, shot_mask): height = "low" elif shot.extra["shot"]["technique"]["name"] == "Overhead Kick": height = "high" - output[idx] = {"impact_height": height} + output[idx] = {"sb_impact_height": height} output = pd.DataFrame.from_dict(output, orient="index") - output["impact_height"] = pd.Categorical( - output["impact_height"], categories=["ground", "low", "high"], ordered=True + output["sb_impact_height"] = pd.Categorical( + output["sb_impact_height"], categories=["ground", "low", "high"], ordered=True ) return output -default_features = [ - actiontype, - bodypart, - result, - startlocation, - endlocation, - movement, - space_delta, - startpolar, - endpolar, - team, - time_delta, - speed, - shot_angle, - caley_grid, - ftype("gamestates")( - triangular_grid( - "angle_zone", - [-50, -20, 20, 50], - [2, 4, 8, 11, 16, 24, 34, 50], - symmetrical=True, - ) - ), +default_features : list[AttributeGenerator | str] = [ + shot_type, + shot_location, + shot_dist, + shot_visible_angle, ] -default_labels = [goal_from_shot] +default_labels : list[AttributeGenerator | str] = [goal_from_shot] def compute_attributes( @@ -1353,7 +1438,7 @@ def compute_attributes( shotfilter=None, nb_prev_actions=3, ): - """Extract xG features for a given game. + """Extract xG features and labels for a given game. 
Parameters ---------- @@ -1364,9 +1449,9 @@ def compute_attributes( events: pd.DataFrame A DataFrame containing the raw provider-specific events corresponding to ``actions``. Can be used to calculate provider-specific features. - xfns : list(callable) + xfns : list(callable or str) List of feature generators to apply. Defaults to ``default_features``. - yfns : list(callable) + yfns : list(callable or str) List of label generators to apply. Defaults to ``default_labels``. shotfilter: callable(pd.Series) -> bool A function that takes a shot (in SPADL format) and returns True if the @@ -1396,11 +1481,11 @@ def compute_attributes( shot_events_idx = actions.loc[shot_mask, "original_event_id"] # handle inputs with no shots or no attributes - if shot_mask.sum() < 1: + if shot_mask.sum() == 0: # TODO: create the expected columns - return pd.DataFrame() - if len(xfns + yfns) < 1: - return pd.DataFrame(index=shot_actions_idx) + return pd.DataFrame(index=pd.Index([], name="action_id")), pd.DataFrame(index=pd.Index([], name="action_id")) + if len(xfns + yfns) == 0: + return pd.DataFrame(index=shot_actions_idx), pd.DataFrame(index=shot_actions_idx) # convert actions to ltr orientation actions_ltr = spadl.utils.play_left_to_right(actions, game.home_team_id) @@ -1414,17 +1499,37 @@ def compute_attributes( shot_gamestates_ltr[0]["result_id"] = float("NaN") # compute features and labels - def _apply_fns(fns): + def _apply_fns(fns, registry): attrs = [] + _fns = {} for fn in fns: + if isinstance(fn, str): + if fn not in registry: + raise ValueError( + f"Unknown feature '{fn}'. 
Valid feature names are [{', '.join(registry)}]" + ) + _fn = registry[fn] + if _fn not in _fns: + _fns[_fn] = [fn] + else: + _fns[_fn].append(fn) + else: + _fns[fn] = None + for fn, cols in _fns.items(): + new_attrs = None if getattr(fn, "ftype", None) == "gamestates": - attrs.append(fn(shot_gamestates_ltr).set_index(shot_events_idx)) + new_attrs = fn(shot_gamestates_ltr).set_index(shot_events_idx) elif getattr(fn, "ftype", None) == "actions": - attrs.append(fn(actions_ltr, shot_mask).set_index(shot_events_idx)) + new_attrs = fn(actions_ltr, shot_mask).set_index(shot_events_idx) elif getattr(fn, "ftype", None) == "events": - attrs.append(fn(events, shot_events_idx)) + new_attrs = fn(events, shot_events_idx) else: - warnings.warn("Unknown attribute type for {}.".format(fn.__name__), stacklevel=2) + warnings.warn(f"Unknown attribute type for {fn.__name__}.", stacklevel=2) + if new_attrs is not None: + if cols is not None: + attrs.append(new_attrs[cols]) + else: + attrs.append(new_attrs) attrs = pd.concat(attrs, axis=1).loc[shot_events_idx].set_index(shot_actions_idx) attrs.index.name = "action_id" # fill missing values @@ -1434,7 +1539,14 @@ def _apply_fns(fns): attrs.rename(columns=lambda s: s.replace("a0", "shot"), inplace=True) return attrs - X = _apply_fns(xfns) - y = _apply_fns(yfns) + X = _apply_fns(xfns, _FEATURE_REGISTRY) if len(xfns) > 0 else None + y = _apply_fns(yfns, _LABEL_REGISTRY) if len(yfns) > 0 else None + + if X is None: + assert y is not None + X = pd.DataFrame(index=y.index) + if y is None: + assert X is not None + y = pd.DataFrame(index=X.index) return X, y diff --git a/soccer_xg/calibration.py b/soccer_xg/calibration.py index 7612baf..941aaf6 100644 --- a/soccer_xg/calibration.py +++ b/soccer_xg/calibration.py @@ -7,9 +7,9 @@ from sklearn.isotonic import IsotonicRegression from sklearn.linear_model import LogisticRegression from sklearn.model_selection import check_cv -from sklearn.preprocessing import LabelBinarizer, label_binarize +from 
sklearn.preprocessing import LabelBinarizer from sklearn.svm import LinearSVC -from sklearn.utils import check_X_y, column_or_1d, indexable, check_consistent_length +from sklearn.utils import column_or_1d, indexable from sklearn.utils.validation import check_is_fitted @@ -93,7 +93,7 @@ def __init__(self, base_estimator=None, method=None, cv=3, score_type=None): self.score_type = score_type def fit(self, X, y, sample_weight=None): - """Fit the calibrated model + """Fit the calibrated model. Parameters ---------- @@ -187,7 +187,7 @@ def fit(self, X, y, sample_weight=None): return self def predict_proba(self, X): - """Posterior probabilities of classification + """Posterior probabilities of classification. This function returns posterior probabilities of classification according to each class on an array of test vectors X. @@ -217,7 +217,7 @@ def predict_proba(self, X): return mean_proba def calibrate_scores(self, df): - """Posterior probabilities of classification + """Posterior probabilities of classification. This function returns posterior probabilities of classification according to each class on an array of test vectors X. @@ -246,8 +246,9 @@ def calibrate_scores(self, df): return mean_proba def predict(self, X): - """Predict the target of new samples. Can be different from the - prediction of the uncalibrated classifier. + """Predict the target of new samples. + + Can be different from the prediction of the uncalibrated classifier. Parameters ---------- @@ -336,7 +337,7 @@ def _preproc(self, X): return df, idx_pos_class def fit(self, X, y, sample_weight=None): - """Calibrate the fitted model + """Calibrate the fitted model. Parameters ---------- @@ -425,7 +426,7 @@ def predict_proba(self, X): return proba def calibrate_scores(self, df): - """Posterior probabilities of classification + """Posterior probabilities of classification. This function returns posterior probabilities of classification according to each class on an array of test vectors X. 
@@ -522,15 +523,15 @@ def predict(self, T): class _DummyCalibration(BaseEstimator, RegressorMixin): - """Dummy regression model. The purpose of this class is to give + """Dummy regression model. + + The purpose of this class is to give the CalibratedClassifierCV class the option to just return the probabilities of the base classifier. - - """ def fit(self, X, y, sample_weight=None): - """Does nothing. + """Do nothing. Parameters ---------- @@ -564,86 +565,3 @@ def predict(self, T): The predicted data. """ return T - - -def calibration_curve(y_true, y_prob, normalize=False, n_bins=5): - """Compute true and predicted probabilities for a calibration curve. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - y_true : array, shape (n_samples,) - True targets. - - y_prob : array, shape (n_samples,) - Probabilities of the positive class. - - normalize : bool, optional, default=False - Whether y_prob needs to be normalized into the bin [0, 1], i.e. is not - a proper probability. If True, the smallest value in y_prob is mapped - onto 0 and the largest one onto 1. - - n_bins : int - Number of bins. A bigger number requires more data. - - Returns - ------- - prob_true : array, shape (n_bins,) - The true probability in each bin (fraction of positives). - - prob_pred : array, shape (n_bins,) - The mean predicted probability in each bin. - - References - ---------- - Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good - Probabilities With Supervised Learning, in Proceedings of the 22nd - International Conference on Machine Learning (ICML). - See section 4 (Qualitative Analysis of Predictions). 
- """ - y_true = column_or_1d(y_true) - y_prob = column_or_1d(y_prob) - - if normalize: # Normalize predicted values into interval [0, 1] - y_prob = (y_prob - y_prob.min()) / (y_prob.max() - y_prob.min()) - elif y_prob.min() < 0 or y_prob.max() > 1: - raise ValueError('y_prob has values outside [0, 1] and normalize is ' 'set to False.') - - y_true = _check_binary_probabilistic_predictions(y_true, y_prob) - - bins = np.linspace(0.0, 1.0 + 1e-8, n_bins + 1) - binids = np.digitize(y_prob, bins) - 1 - - bin_sums = np.bincount(binids, weights=y_prob, minlength=len(bins)) - bin_true = np.bincount(binids, weights=y_true, minlength=len(bins)) - bin_total = np.bincount(binids, minlength=len(bins)) - - zero = bin_total == 0 - bin_total[zero] = 2 - # nonzero = bin_total != 0 - - prob_true = bin_true / bin_total - prob_pred = bin_sums / bin_total - - return prob_true, prob_pred - - -def _check_binary_probabilistic_predictions(y_true, y_prob): - """Check that y_true is binary and y_prob contains valid probabilities""" - check_consistent_length(y_true, y_prob) - - labels = np.unique(y_true) - - if len(labels) != 2: - raise ValueError( - 'Only binary classification is supported. ' 'Provided labels %s.' 
% labels - ) - - if y_prob.max() > 1: - raise ValueError('y_prob contains values greater than 1.') - - if y_prob.min() < 0: - raise ValueError('y_prob contains values less than 0.') - - return label_binarize(y_true, classes=labels)[:, 0] diff --git a/soccer_xg/data/hdf.py b/soccer_xg/data/hdf.py index a6fb859..1ffb22f 100644 --- a/soccer_xg/data/hdf.py +++ b/soccer_xg/data/hdf.py @@ -133,7 +133,7 @@ def players(self, game_id: Optional[int] = None) -> pd.DataFrame: raise IndexError(f"No game found with ID={game_id}") else: players = self["player_games"] - cols = ["team_id", "player_id", "player_name", "nickname"] + cols = ["team_id", "player_id", "player_name"] return players[cols].drop_duplicates().set_index(["player_id"]) def events(self, game_id: int) -> pd.DataFrame: diff --git a/soccer_xg/metrics.py b/soccer_xg/metrics.py index 12dc49e..e4d56bb 100644 --- a/soccer_xg/metrics.py +++ b/soccer_xg/metrics.py @@ -1,7 +1,38 @@ """A collection of metrics for evaluation xG models.""" +import warnings + import numpy as np from scipy import integrate from sklearn.neighbors import KernelDensity +from sklearn.preprocessing import label_binarize +from sklearn.utils import check_consistent_length, column_or_1d + + +def _check_binary_probabilistic_predictions(y_true, y_prob): + """Check that y_true is binary and y_prob contains valid probabilities""" + # convert to 1D numpy array + y_true = column_or_1d(y_true) + y_prob = column_or_1d(y_prob) + + # check equal length + check_consistent_length(y_true, y_prob) + + if y_prob.min() < 0 or y_prob.max() > 1: + warnings.warn( + "y_prob has values outside [0, 1] and normalize is set to False. " + "Probalities outside [0, 1] will be clipped." + ) + y_prob = np.clip(y_prob, a_min=0, a_max=1) + + # check if binary classification + labels = np.unique(y_true) + if len(labels) != 2: + raise ValueError( + 'Only binary classification is supported. Provided labels %s.' 
% labels + ) + + return label_binarize(y_true, classes=labels)[:, 0], y_prob + def expected_calibration_error(y_true, y_prob, n_bins=5, strategy='uniform'): @@ -41,31 +72,21 @@ def expected_calibration_error(y_true, y_prob, n_bins=5, strategy='uniform'): References ---------- - [1]: Chuan Guo, Geoff Pleiss, Yu Sun, Kilian Q. Weinberger, - On Calibration of Modern Neural Networks. - Proceedings of the 34th International Conference on Machine Learning - (ICML 2017). - arXiv:1706.04599 - https://arxiv.org/pdf/1706.04599.pdf - [2]: Nixon, Jeremy, et al., - Measuring calibration in deep learning. - arXiv:1904.01685 - https://arxiv.org/abs/1904.01685 - + .. [1] Chuan Guo, Geoff Pleiss, Yu Sun, Kilian Q. Weinberger, On + Calibration of Modern Neural Networks. Proceedings of the 34th + International Conference on Machine Learning (ICML 2017). + arXiv:1706.04599 https://arxiv.org/pdf/1706.04599.pdf + .. [2] Nixon, Jeremy, et al., Measuring calibration in deep learning. + arXiv:1904.01685 https://arxiv.org/abs/1904.01685 """ - if y_prob.shape != y_true.shape: - raise ValueError('Shapes must match') - if y_prob.min() < 0 or y_prob.max() > 1: - raise ValueError('y_prob has values outside [0, 1].') - labels = np.unique(y_true) - if len(labels) > 2: - raise ValueError('Only binary classification is supported.') + y_true, y_prob = _check_binary_probabilistic_predictions(y_true, y_prob) if strategy == 'quantile': # Determine bin edges by distribution of data quantiles = np.linspace(0, 1, n_bins + 1) bins = np.percentile(y_prob, quantiles * 100) - bins[-1] = bins[-1] + 1e-8 + bins[0] = 0 - 1e-8 + bins[-1] = 1 + 1e-8 elif strategy == 'uniform': bins = np.linspace(0.0, 1.0 + 1e-8, n_bins + 1) else: @@ -99,7 +120,93 @@ def _reliability(y_true, y_prob, bins): return accs, confs, counts -def bayesian_calibration_curve(y_true, y_pred, n_bins=100): +def calibration_curve(y_true, y_prob, bins=10, bin_strategy="uniform"): + """Compute true and predicted probabilities for a calibration 
curve. + + Parameters + ---------- + y_true : array (n_samples, ) + Labels indicating the true class. + y_prob : array (n_samples, ) + Output probability scores. + bins : int or list of floats + Number of bins to create in the scores' space, or list of bin + boundaries. More bins require more data. + bin_strategy : {'uniform', 'quantile'}, default='uniform' + Strategy used to define the widths of the bins. + + uniform + The bins have identical widths. + quantile + The bins have the same number of samples and depend on `y_prob`. + + Returns + ------- + avg_true : array, shape (n_bins,) + The true probability in each bin (fraction of positives). + + avg_pred : array, shape (n_bins,) + The mean predicted probability in each bin. + + bin_true : array, shape (n_bins,) + Number of true samples in each bin. + + bin_total : array, shape (n_bins,) + Number of samples in each bin. + + References + ---------- + .. [1] Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good + Probabilities With Supervised Learning, in Proceedings of the 22nd + International Conference on Machine Learning (ICML). + See section 4 (Qualitative Analysis of Predictions). + """ + y_true, y_prob = _check_binary_probabilistic_predictions(y_true, y_prob) + + if isinstance(bins, int): + n_bins = bins + if bin_strategy == "quantile": # Determine bin edges by distribution of data + quantiles = np.linspace(0, 1, n_bins + 1) + bins = np.percentile(y_prob, quantiles * 100) + bins[0] = 0 - 1e-8 + bins[-1] = 1 + 1e-8 + elif bin_strategy == "uniform": + bins = np.linspace(0, 1 + 1e-8, n_bins + 1) + else: + raise ValueError( + "Invalid entry to 'strategy' input. Strategy " + "must be either 'quantile' or 'uniform'." + ) + elif isinstance(bins, list) or isinstance(bins, np.ndarray): + n_bins = len(bins) - 1 + bins = np.array(bins) + if bins[0] == 0.0: + bins[0] = 0 - 1e-8 + if bins[-1] == 1.0: + bins[-1] = 1 + 1e-8 + else: + raise ValueError( + "Invalid entry to 'bins' input. 
The must be either " + "a list of bin boundaries or the number of bins." + ) + + bin_idx = np.digitize(y_prob, bins) - 1 + + bin_true = np.bincount(bin_idx, weights=y_true, minlength=n_bins) + bin_pred = np.bincount(bin_idx, weights=y_prob, minlength=n_bins) + bin_total = np.bincount(bin_idx, minlength=n_bins) + + zero_idx = bin_total == 0 + avg_true = np.empty(bin_total.shape[0]) + avg_true.fill(np.nan) + avg_true[~zero_idx] = np.divide(bin_true[~zero_idx], bin_total[~zero_idx]) + avg_pred = np.empty(bin_total.shape[0]) + avg_pred.fill(np.nan) + avg_pred[~zero_idx] = np.divide(bin_pred[~zero_idx], bin_total[~zero_idx]) + return avg_true, avg_pred, bin_true, bin_total + + +def bayesian_calibration_curve(y_true, y_prob, n_bins=100): """Compute true and predicted probabilities for a calibration curve using kernel density estimation instead of bins with a fixed width. @@ -115,21 +222,24 @@ def bayesian_calibration_curve(y_true, y_pred, n_bins=100): Returns ------- - prob_true : ndarray of shape (n_bins,) - The proportion of samples whose class is the positive class, in each - bin (fraction of positives). - prob_pred : ndarray of shape (n_bins,) + avg_true : array, shape (n_bins,) + The true probability in each bin (fraction of positives). + + avg_pred : array, shape (n_bins,) The mean predicted probability in each bin. - number_total : ndarray of shape (n_bins,) - The number of examples in each bin. + + bin_true : array, shape (n_bins,) + Number of true samples in each bin. + + bin_total : array, shape (n_bins,) + Number of samples in each bin. 
""" - y_pred = np.array(y_pred) - y_true = np.array(y_true, dtype=bool) + y_true, y_prob = _check_binary_probabilistic_predictions(y_true, y_prob) + y_true = y_true.astype(bool) + bandwidth = 1 / n_bins - kde_pos = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit( - (y_pred[y_true])[:, np.newaxis] - ) - kde_total = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(y_pred[:, np.newaxis]) + kde_pos = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit((y_prob[y_true])[:, np.newaxis]) + kde_total = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(y_prob[:, np.newaxis]) sample_probabilities = np.linspace(0.01, 0.99, 99) number_density_offense_won = np.exp( kde_pos.score_samples(sample_probabilities[:, np.newaxis]) @@ -142,8 +252,9 @@ def bayesian_calibration_curve(y_true, y_pred, n_bins=100): predicted_pos_percents = np.nan_to_num(number_pos / number_total, 1) return ( - 100.0 * sample_probabilities, - 100.0 * predicted_pos_percents, + predicted_pos_percents, + sample_probabilities, + number_pos, number_total, ) diff --git a/soccer_xg/ml/logreg.py b/soccer_xg/ml/logreg.py index 5c72518..d0a4b98 100644 --- a/soccer_xg/ml/logreg.py +++ b/soccer_xg/ml/logreg.py @@ -6,70 +6,32 @@ def logreg_gridsearch_classifier( - numeric_features, - categoric_features, - learning_rate=0.08, - use_dask=False, - n_iter=100, - scoring='roc_auc', + numeric_features: list[str], + categoric_features: list[str], + learning_rate: float | str | None = 0.08, + use_dask: bool = False, + n_iter: int = 100, + scoring: str = 'roc_auc', ): - """ - Simple classification pipeline using hyperband to optimize logreg hyper-parameters - Parameters - ---------- - `numeric_features` : The list of numeric features - `categoric_features` : The list of categoric features - `learning_rate` : The learning rate - """ + """Simple classification pipeline using random search to optimize logreg hyper-parameters. 
- return _logreg_gridsearch_model( - 'classification', - numeric_features, - categoric_features, - learning_rate, - use_dask, - n_iter, - scoring, - ) - - -def logreg_gridsearch_regressor( - numeric_features, - categoric_features, - learning_rate=0.08, - use_dask=False, - n_iter=100, - scoring='roc_auc', -): - """ - Simple regression pipeline using hyperband to optimize logreg hyper-parameters Parameters ---------- - `numeric_features` : The list of numeric features - `categoric_features` : The list of categoric features - `learning_rate` : The learning rate - """ - - return _logreg_gridsearch_model( - 'regression', - numeric_features, - categoric_features, - learning_rate, - use_dask, - n_iter, - scoring, - ) + numeric_features : list(str) + The list of numeric features + categoric_features : list(str) + The list of categoric features + learning_rate : float + The learning rate. If None, a sklearn.linear_model.LogisticRegression is used. + Otherwise, a sklearn.linear_model.SGDClassifier is used. + use_dask : bool + If True, use dask for parallelizing a grid search over the hyper-parameters. - -def _logreg_gridsearch_model( - task, - numeric_features, - categoric_features, - learning_rate, - use_dask, - n_iter, - scoring, -): + Returns + ------- + sklearn.model_selection.RandomizedSearchCV + The classifier with the best hyper-parameters found by the random search. 
+ """ if learning_rate is None: param_space = { 'clf__C': np.logspace(-5, 5, 100), @@ -87,7 +49,7 @@ def _logreg_gridsearch_model( model = SGDClassifier( learning_rate=learning_rate_schedule, eta0=eta0, - loss='log', + loss='log_loss', max_iter=10000, fit_intercept=False, ) diff --git a/soccer_xg/ml/mlp.py b/soccer_xg/ml/mlp.py index 1a1ce35..03e4a3f 100644 --- a/soccer_xg/ml/mlp.py +++ b/soccer_xg/ml/mlp.py @@ -1,5 +1,5 @@ from scipy.stats.distributions import randint, uniform -from sklearn.neural_network import MLPClassifier, MLPRegressor +from sklearn.neural_network import MLPClassifier from sklearn.pipeline import Pipeline from .preprocessing import simple_proc_for_linear_algoritms @@ -21,55 +21,6 @@ def mlp_gridsearch_classifier( `categoric_features` : The list of categoric features `learning_rate` : The learning rate """ - - return _mlp_gridsearch_model( - 'classification', - numeric_features, - categoric_features, - learning_rate, - use_dask, - n_iter, - scoring, - ) - - -def mlp_gridsearch_regressor( - numeric_features, - categoric_features, - learning_rate=0.08, - use_dask=False, - n_iter=100, - scoring='roc_auc', -): - """ - Simple regression pipeline using hyperband to optimize mlp hyper-parameters - Parameters - ---------- - `numeric_features` : The list of numeric features - `categoric_features` : The list of categoric features - `learning_rate` : The learning rate - """ - - return _mlp_gridsearch_model( - 'regression', - numeric_features, - categoric_features, - learning_rate, - use_dask, - n_iter, - scoring, - ) - - -def _mlp_gridsearch_model( - task, - numeric_features, - categoric_features, - learning_rate, - use_dask, - n_iter, - scoring, -): param_space = { 'clf__hidden_layer_sizes': [ (24,), @@ -84,11 +35,7 @@ def _mlp_gridsearch_model( 'clf__learning_rate': ['constant', 'adaptive'], } - model = ( - MLPClassifier(learning_rate_init=learning_rate) - if task == 'classification' - else MLPRegressor(learning_rate_init=learning_rate) - ) + model = 
MLPClassifier(learning_rate_init=learning_rate) pipe = Pipeline( [ diff --git a/soccer_xg/ml/pipeline.py b/soccer_xg/ml/pipeline.py index 4672869..dd44eab 100644 --- a/soccer_xg/ml/pipeline.py +++ b/soccer_xg/ml/pipeline.py @@ -306,3 +306,24 @@ def fit(self, X, y=None): def transform(self, X): row = self.rowle.transform(X[self.rowname]) return self.embedding[row, :] + + +class InteractionFeature(BaseEstimator, TransformerMixin): + def __init__(self, columns_to_multiply, new_column_name=None): + self.columns_to_multiply = columns_to_multiply + if new_column_name is None: + self.new_column_name = "_x_".join(columns_to_multiply) + else: + self.new_column_name = new_column_name + + def fit(self, X, y=None): + return self + + def transform(self, X): + X_copy = X.copy() + X_copy[self.new_column_name] = X_copy[self.columns_to_multiply[0]] + + for col in self.columns_to_multiply[1:]: + X_copy[self.new_column_name] *= X_copy[col] + + return X_copy diff --git a/soccer_xg/ml/preprocessing.py b/soccer_xg/ml/preprocessing.py index 319caf9..245fec5 100644 --- a/soccer_xg/ml/preprocessing.py +++ b/soccer_xg/ml/preprocessing.py @@ -19,7 +19,6 @@ def simple_proc_for_tree_algoritms(numeric_features, categoric_features): numpipe = make_pipeline( ColumnsSelector(numeric_features), SimpleImputer(strategy='mean'), - StandardScaler(), ) if numeric_features and categoric_features: return make_union(catpipe, numpipe) @@ -45,7 +44,7 @@ def simple_proc_for_linear_algoritms(numeric_features, categoric_features): numpipe = make_pipeline( ColumnsSelector(numeric_features), SimpleImputer(strategy='mean'), - StandardScaler(), + # StandardScaler(), ) if numeric_features and categoric_features: return make_union(catpipe, numpipe) diff --git a/soccer_xg/ml/xgboost.py b/soccer_xg/ml/xgboost.py index 4b4fb5a..ea330f1 100644 --- a/soccer_xg/ml/xgboost.py +++ b/soccer_xg/ml/xgboost.py @@ -13,63 +13,17 @@ def xgboost_gridsearch_classifier( n_iter=100, scoring='roc_auc', ): - """ - Simple classification 
pipeline using hyperband to optimize xgboost hyper-parameters - Parameters - ---------- - `numeric_features` : The list of numeric features - `categoric_features` : The list of categoric features - `learning_rate` : The learning rate - """ + """Simple classification pipeline using random search to optimize xgboost hyper-parameters. - return _xgboost_gridsearch_model( - 'classification', - numeric_features, - categoric_features, - learning_rate, - use_dask, - n_iter, - scoring, - ) - - -def xgboost_gridsearch_regressor( - numeric_features, - categoric_features, - learning_rate=0.08, - use_dask=False, - n_iter=100, - scoring='roc_auc', -): - """ - Simple regression pipeline using hyperband to optimize xgboost hyper-parameters Parameters ---------- `numeric_features` : The list of numeric features `categoric_features` : The list of categoric features `learning_rate` : The learning rate + `use_dask` : Whether to use dask or not + `n_iter` : The number of iterations for the random search + `scoring` : The scoring function to use """ - - return _xgboost_gridsearch_model( - 'regression', - numeric_features, - categoric_features, - learning_rate, - use_dask, - n_iter, - scoring, - ) - - -def _xgboost_gridsearch_model( - task, - numeric_features, - categoric_features, - learning_rate, - use_dask, - n_iter, - scoring, -): param_space = { 'clf__max_depth': randint(2, 11), 'clf__min_child_weight': randint(1, 11), @@ -83,11 +37,7 @@ def _xgboost_gridsearch_model( 'clf__scale_pos_weight': uniform(0.1, 9.9), } - model = ( - xgbsk.XGBClassifier(learning_rate=learning_rate) - if task == 'classification' - else xgbsk.XGBRegressor(learning_rate=learning_rate) - ) + model = xgbsk.XGBClassifier(learning_rate=learning_rate) pipe = Pipeline( [ diff --git a/soccer_xg/utils.py b/soccer_xg/utils.py index 69190cf..0b7c3b2 100644 --- a/soccer_xg/utils.py +++ b/soccer_xg/utils.py @@ -1,8 +1,11 @@ +"""Utility functions.""" import math import pandas as pd from fuzzywuzzy import fuzz +from .data 
import Dataset + def match_name(name, list_names, min_score=0): # -1 score incase we don't get any matches @@ -31,9 +34,7 @@ def map_names( ): # List for dicts for easy dataframe creation dict_list = [] - for _, (id, name) in df1[ - [df1_output_colname, df1_match_colname] - ].iterrows(): + for _, (id, name) in df1[[df1_output_colname, df1_match_colname]].iterrows(): # Use our method to find best match, we can set a threshold here match = match_name(name, df2[df2_match_colname], threshold) # New dict for storing data @@ -43,11 +44,7 @@ def map_names( if match[1] > threshold: dict_.update({'df2_name': match[0]}) dict_.update( - { - 'df2_id': df2.loc[ - df2[df2_match_colname] == match[0], df2_output_colname - ].iloc[0] - } + {'df2_id': df2.loc[df2[df2_match_colname] == match[0], df2_output_colname].iloc[0]} ) else: dict_.update({'df2_name': 'unknown'}) @@ -133,25 +130,21 @@ def get_matching_shot( # Get shots that happened around the same time ts = shot.time_seconds best_match = other_shots_by_player_in_period.iloc[ - (other_shots_by_player_in_period['time_seconds'] - ts) - .abs() - .argsort()[:1] + (other_shots_by_player_in_period['time_seconds'] - ts).abs().argsort()[:1] ].iloc[0] if abs(ts - best_match.time_seconds) < 3: return best_match return None -def sample_temporal(dataset, size_val=0.0, size_test=0.2): +def sample_temporal( + dataset: Dataset, size_val: float = 0.0, size_test: float = 0.2 +) -> tuple[list[int], list[int], list[int]]: game_ids = dataset.games().sort_values(by='game_date').index.values nb_games = len(game_ids) - games_train = game_ids[ - 0 : math.floor((1 - size_val - size_test) * nb_games) - ] + games_train = game_ids[0 : math.floor((1 - size_val - size_test) * nb_games)] games_val = game_ids[ - math.ceil((1 - size_val - size_test) * nb_games) : math.floor( - (1 - size_test) * nb_games - ) + math.ceil((1 - size_val - size_test) * nb_games) : math.floor((1 - size_test) * nb_games) ] games_test = game_ids[math.ceil((1 - size_test) * nb_games) + 
1 : -1] return games_train, games_val, games_test diff --git a/soccer_xg/visualisation.py b/soccer_xg/visualisation.py index c048992..90094a5 100644 --- a/soccer_xg/visualisation.py +++ b/soccer_xg/visualisation.py @@ -1,138 +1,444 @@ import matplotlib.pyplot as plt -import matplotsoccer as mps import numpy as np import numpy.ma as ma from matplotlib.ticker import MultipleLocator from sklearn.metrics import auc, roc_curve +from sklearn.preprocessing import label_binarize +from statsmodels.stats.proportion import proportion_confint +import matplotlib.ticker as mticker +from matplotlib.ticker import MaxNLocator +from matplotlib import gridspec +from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable + from soccer_xg import metrics -def plot_calibration_curve( - y_true, - y_pred, - name='Calibration curve', +def plot_reliability_diagram( + labels, + scores, + legend=None, + show_histogram=True, + bins=10, + bin_strategy="uniform", + bayesian=False, min_samples=None, - axis=None, - **kwargs, + fig=None, + show_counts=False, + ci=None, + shaded_ci=False, + interval_method='beta', + fmt='s-', + show_correction=False, + show_gaps=False, + sample_proportion=0, + color_list=None, + show_bars=False, + invert_histogram=False, + overlay_histogram=False, + color_gaps='lightcoral', + ax=None, ): - """Plot the validation data. + """Plot the reliability diagram of the given scores and true labels. Parameters ---------- - axis : matplotlib.pyplot.axis object or ``None`` (default=``None``) - If provided, the validation line will be overlaid on ``axis``. - Otherwise, a new figure and axis will be generated and plotted on. - **kwargs - Arguments to ``axis.plot``. + labels : array (n_samples, ) + Labels indicating the true class. + scores : array (n_samples,) or list of matrices + Output probability scores for one or several methods. + legend : list of strings or None + Text to use for the legend. 
+ show_histogram : boolean + If True, it generates an additional figure showing the number of + samples in each bin. + bins : int or list of floats + Number of bins to create in the scores' space, or list of bin + boundaries. + bin_strategy : {'uniform', 'quantile'}, default='uniform' + Strategy used to define the widths of the bins. + + uniform + The bins have identical widths. + quantile + The bins have the same number of samples and depend on `y_prob`. + bayesian : bool, default=False + Compute true and predicted probabilities for a calibration curve using + kernel density estimation instead of bins with a fixed width. + min_samples : int or None + Hide bins with less than 'min_samples'. + fig : matplotlib.pyplot.Figure or None + Figure to use for the plots, if None a new figure is created. + show_counts : boolean + If True shows the number of samples of each bin in its corresponding + line marker. + ci : float or None + If a float between 0 and 1 is passed, it shows an errorbar + corresponding to a confidence interval containing the specified + percentile of the data. + shaded_ci : boolean + If True, the confidence interval is shown as a shaded area instead of + error bars. + interval_method : string (default: 'beta') + Method to estimate the confidence interval which uses the function + proportion_confint from statsmodels.stats.proportion + fmt : string (default: 's-') + Format of the lines following the matplotlib.pyplot.plot standard. + show_correction : boolean + If True shows an arrow for each bin indicating the necessary correction + to the average scores in order to be perfectly calibrated. + show_gaps : boolean + If True shows the gap between the average predictions and the true + proportion of positive samples. + sample_proportion : float in the interval [0, 1] (default 0) + If bigger than 0, it shows the labels of the specified proportion of + samples. + color_list : list of strings or None + List of string colors indicating the color of each method. 
+ show_bars : boolean + If True shows bars instead of lines. + invert_histogram : boolean + If True shows the histogram with the zero on top and highest number of + bin samples at the bottom. + overlay_histogram : boolean + If True, shows the histogram on the same plot as the reliability diagram. + color_gaps : string + Color of the gaps (if shown). Returns ------- - matplotlib.pylot.axis - The axis the plot was made on. - - Raises - ------ - NotFittedError - If the model hasn't been fit **and** validated. + fig : matplotlib.pyplot.figure + Figure with the reliability diagram """ + if isinstance(scores, list): + scores_list = scores + else: + scores_list = [ + scores, + ] + n_scores = len(scores_list) + if color_list is None: + color_list = plt.rcParams['axes.prop_cycle'].by_key()['color'] - if axis is None: - axis = plt.figure(figsize=(5, 5)).add_subplot(111) - - axis.set_title(name) - axis.plot([0, 100], [0, 100], ls='--', lw=1, color='grey') - axis.set_xlabel('Predicted probability') - axis.set_ylabel('True probability in each bin') - axis.set_xlim((0, 100)) - axis.xaxis.set_major_locator(MultipleLocator(20)) - axis.xaxis.set_minor_locator(MultipleLocator(10)) - axis.set_ylim((0, 100)) - axis.yaxis.set_major_locator(MultipleLocator(20)) - axis.yaxis.set_minor_locator(MultipleLocator(10)) - # axis.set_aspect(1) - axis.grid(which='both') - - ( - sample_probabilities, - predicted_pos_percents, - num_plays_used, - ) = metrics.bayesian_calibration_curve(y_true, y_pred) - - if min_samples is not None: - axis.plot( - sample_probabilities, - predicted_pos_percents, - c='c', - alpha=0.3, - **kwargs, + classes = np.unique(labels) + n_classes = len(classes) + if n_classes != 2: + raise ValueError( + 'Only binary classification is supported. Provided labels %s.' 
% labels ) - sample_probabilities = ma.array(sample_probabilities) - sample_probabilities[num_plays_used < min_samples] = ma.masked - predicted_pos_percents = ma.array(predicted_pos_percents) - predicted_pos_percents[num_plays_used < min_samples] = ma.masked - - max_deviation = metrics.max_deviation(sample_probabilities, predicted_pos_percents) - residual_area = metrics.residual_area(sample_probabilities, predicted_pos_percents) - - axis.plot( - sample_probabilities, - predicted_pos_percents, - c='c', - label='Calibration curve\n(area = %0.2f, max dev = %0.2f)' - % (residual_area, max_deviation), - **kwargs, - ) - - axis.legend(loc='lower right') - - ax2 = axis.twinx() - ax2.hist( - y_pred * 100, - bins=np.arange(0, 101, 1), - density=True, - alpha=0.4, - facecolor='grey', - ) - ax2.set_ylim([0, 0.2]) - ax2.set_yticks([0, 0.1, 0.2]) - - plt.tight_layout() - return axis + labels = label_binarize(labels, classes=classes)[:, 0] + labels_list = [] -def plot_roc_curve(y_true, y_prob, name='Calibration curve', axis=None): - fpr, tpr, _ = roc_curve(y_true, y_prob) - roc_auc = auc(fpr, tpr) + if fig is None: + fig = plt.figure(figsize=(4, 4)) + + if show_histogram: + spec = gridspec.GridSpec( + ncols=1, nrows=2, height_ratios=[5, 1], wspace=0.02, hspace=0.04, left=0.15 + ) + else: + spec = gridspec.GridSpec(ncols=1, nrows=1, hspace=0.04, left=0.15) + + if isinstance(bins, int): + n_bins = bins + if bin_strategy == "quantile": # Determine bin edges by distribution of data + quantiles = np.linspace(0, 1, n_bins + 1) + bins = np.percentile(scores_list[0], quantiles * 100) + bins[0] = 0 - 1e-8 + bins[-1] = 1 + 1e-8 + elif bin_strategy == "uniform": + bins = np.linspace(0, 1 + 1e-8, n_bins + 1) + else: + raise ValueError( + "Invalid entry to 'strategy' input. Strategy " + "must be either 'quantile' or 'uniform'." 
+ ) + elif isinstance(bins, list) or isinstance(bins, np.ndarray): + n_bins = len(bins) - 1 + bins = np.array(bins) + if bins[0] == 0.0: + bins[0] = 0 - 1e-8 + if bins[-1] == 1.0: + bins[-1] = 1 + 1e-8 + else: + raise ValueError( + "Invalid entry to 'bins' input. The must be either " + "a list of bin boundaries or the number of bins." + ) + + if ax is not None: + ax1 = ax + else: + ax1 = fig.add_subplot(spec[0]) + # Perfect calibration + ax1.plot([0, 1], [0, 1], "--", color='lightgrey', zorder=10) + for j, score in enumerate(scores_list): + if labels_list: + labels = labels_list[j] + + if bayesian: + avg_true, avg_pred, bin_true, bin_total = metrics.bayesian_calibration_curve(labels, score) + bins = np.linspace(0.01, 0.99, 99) + else: + avg_true, avg_pred, bin_true, bin_total = metrics.calibration_curve(labels, score, bins=bins) + + zero_idx = bin_total == 0 + + if min_samples is not None: + avg_true = ma.array(avg_true) + avg_true[bin_total < min_samples] = ma.masked + avg_pred = ma.array(avg_pred) + avg_pred[bin_total < min_samples] = ma.masked + + name = legend[j] if legend else None + if show_bars: + ax1.bar( + x=bins[:-1][~zero_idx], + height=avg_true[~zero_idx], + align='edge', + width=(bins[1:] - bins[:-1])[~zero_idx], + edgecolor='black', + color=color_list[j], + ) + else: + if ci is None: + ax1.plot(avg_pred, avg_true, fmt, label=name, color=color_list[j]) + else: + nozero_intervals = proportion_confint( + count=bin_true[~zero_idx], + nobs=bin_total[~zero_idx], + alpha=1 - ci, + method=interval_method, + ) + nozero_intervals = np.array(nozero_intervals) + + intervals = np.empty((2, bin_total.shape[0])) + intervals.fill(np.nan) + intervals[:, ~zero_idx] = nozero_intervals + + yerr = np.abs(intervals - avg_true) + if shaded_ci: + ax1.fill_between(avg_pred, avg_true-yerr[0], avg_true+yerr[1], color=color_list[j], alpha=0.2) + ax1.plot(avg_pred, avg_true, fmt, label=name, color=color_list[j]) + else: + ax1.errorbar( + avg_pred, avg_true, yerr=yerr, label=name, 
fmt=fmt, color=color_list[j] + ) # markersize=5) + + if show_counts: + for ap, at, count in zip(avg_pred, avg_true, bin_total): + if np.isfinite(ap) and np.isfinite(at): + ax1.text( + ap, + at, + str(count), + fontsize=6, + ha='center', + va='center', + zorder=11, + bbox=dict(boxstyle='square,pad=0.3', fc='white', ec=color_list[j]), + ) + + if show_correction: + for ap, at in zip(avg_pred, avg_true): + ax1.arrow( + ap, + at, + at - ap, + 0, + color=color_gaps, + head_width=0.02, + length_includes_head=True, + width=0.01, + ) + + if show_gaps: + for ap, at in zip(avg_pred, avg_true): + error = avg_pred - avg_true + negative_values = error < 0 + ygaps = np.zeros(shape=(2, avg_true.shape[0])) + ygaps[0, negative_values] = -error[negative_values] + ygaps[1, ~negative_values] = error[~negative_values] + ax1.errorbar( + avg_pred, + avg_true, + yerr=ygaps, + fmt=" ", + color=color_gaps, + lw=4, + capsize=5, + capthick=1, + zorder=10, + ) + + if sample_proportion > 0: + idx = np.random.choice(labels.shape[0], int(sample_proportion * labels.shape[0])) + ax1.scatter( + score[idx], + labels[idx], + marker='|', + s=100, + alpha=0.2, + color=color_list[j], + ) + + ax1.set_xlim((0, 1)) + ax1.xaxis.set_major_locator(MultipleLocator(.20)) + ax1.xaxis.set_minor_locator(MultipleLocator(.10)) + ax1.set_ylim((0, 1)) + ax1.yaxis.set_major_locator(MultipleLocator(.20)) + ax1.yaxis.set_minor_locator(MultipleLocator(.10)) + if not show_histogram or overlay_histogram: + ax1.set_xlabel('Average score') + elif show_histogram: + ax1.set_xticklabels([]) + ax1.set_ylabel('Fraction of positives') + ax1.grid(which='both') + # ax1.set_aspect(1) + ax1.set_axisbelow(True) + + if show_histogram: + + if overlay_histogram: + ax2 = ax1.twinx() + else: + divider = make_axes_locatable(ax1) + ax2 = divider.append_axes("bottom", size="20%", pad=0.1, sharex=ax1) - if axis is None: - axis = plt.figure(figsize=(5, 5)).add_subplot(111) - axis.plot(fpr, tpr, linewidth=1, label='ROC curve (area = %0.2f)' % 
roc_auc) + # ax2 = fig.add_subplot(spec[1], label='{}'.format(i)) + for j, score in enumerate(scores_list): + # lines = ax1.get_lines() + # ax2.set_xticklabels([]) + + name = legend[j] if legend else None + if n_scores > 1: + kwargs = {'histtype': 'step', 'edgecolor': color_list[j]} + else: + kwargs = {'histtype': 'bar', 'edgecolor': 'black', 'color': color_list[j]} + if overlay_histogram: + kwargs = {**kwargs, 'alpha': 0.4 } + + ax2.hist(score, range=(0, 1), bins=bins, label=name, lw=1, **kwargs) + ax2.set_xlim((0, 1)) + ax2.set_xlabel('Average score') + ax2.yaxis.set_major_locator(MaxNLocator(integer=True, prune='upper', nbins=3)) + ax2.set_ylabel('Count') + if not overlay_histogram: + ytickloc = ax2.get_yticks() + ax2.yaxis.set_major_locator(mticker.FixedLocator(ytickloc)) + yticklabels = ['{:0.0f}'.format(value) for value in ytickloc] + ax2.set_yticklabels(labels=yticklabels, fontdict=dict(verticalalignment='top')) + ax2.grid(True, which='both') + ax2.set_axisbelow(True) + if invert_histogram: + ylim = ax2.get_ylim() + ax2.set_ylim(reversed(ylim)) + + if legend is not None: + lines, labels = fig.axes[0].get_legend_handles_labels() + fig.legend( + lines, + labels, + loc='upper center', + bbox_to_anchor=(0, 0, 1, 1), + bbox_transform=fig.transFigure, + ncol=6, + ) + + fig.align_labels() + return fig + + +def plot_roc_curve( + labels, + scores, + legend=None, + color_list=None, + fmt='-', + fig=None, + ax=None +): + """Plot the ROC curve of the given scores and true labels. + + Parameters + ---------- + labels : array (n_samples, ) + Labels indicating the true class. + scores : array (n_samples,) or list of matrices + Output probability scores for one or several methods. + legend : list of strings or None + Text to use for the legend. + color_list : list of strings or None + List of string colors indicating the color of each method. + fmt : string (default: 's-') + Format of the lines following the matplotlib.pyplot.plot standard. 
+ fig : matplotlib.pyplot.Figure or None + Figure to use for the plots, if None a new figure is created. + + Returns + ------- + fig : matplotlib.pyplot.figure + Figure with the ROC curve + """ + if isinstance(scores, list): + scores_list = scores + else: + scores_list = [ + scores, + ] + + if color_list is None: + color_list = plt.rcParams['axes.prop_cycle'].by_key()['color'] + + if fig is None: + fig = plt.figure(figsize=(4, 4)) + spec = gridspec.GridSpec(ncols=1, nrows=1, hspace=0.04, left=0.15) + + if ax is None: + ax = fig.add_subplot(spec[0]) + + for j, score in enumerate(scores_list): + fpr, tpr, _ = roc_curve(labels, score) + roc_auc = auc(fpr, tpr) + name = f"{legend[j]} (AUC = {roc_auc:.2f})" if legend else None + + ax.plot(fpr, tpr, fmt, linewidth=1, label=name, color=color_list[j]) # reference line, legends, and axis labels - axis.plot([0, 1], [0, 1], linestyle='--', color='gray') - axis.set_title('ROC curve') - axis.set_xlabel('False Positive Rate') - axis.set_ylabel('True Positive Rate') - axis.set_xlim(0, 1) - axis.xaxis.set_major_locator(MultipleLocator(0.20)) - axis.xaxis.set_minor_locator(MultipleLocator(0.10)) - axis.set_ylim(0, 1) - axis.yaxis.set_major_locator(MultipleLocator(0.20)) - axis.yaxis.set_minor_locator(MultipleLocator(0.10)) - axis.grid(which='both') - - # sns.despine() + ax.plot([0, 1], [0, 1], linestyle='--', color='gray') + ax.set_xlabel('False Positive Rate') + ax.set_ylabel('True Positive Rate') + ax.set_xlim(0, 1) + ax.xaxis.set_major_locator(MultipleLocator(0.20)) + ax.xaxis.set_minor_locator(MultipleLocator(0.10)) + ax.set_ylim(0, 1) + ax.yaxis.set_major_locator(MultipleLocator(0.20)) + ax.yaxis.set_minor_locator(MultipleLocator(0.10)) + ax.grid(which='both') + # plt.gca().xaxis.set_ticks_position('none') # plt.gca().yaxis.set_ticks_position('none') - plt.gca().legend() - axis.legend(loc='lower right') - plt.tight_layout() + if legend is not None: + lines, labels = fig.axes[0].get_legend_handles_labels() + fig.legend( + 
lines, + labels, + loc='lower right', + bbox_to_anchor=(0, 0, 1, 1), + bbox_transform=fig.transFigure, + ncol=6, + ) + + fig.align_labels() + return fig def plot_heatmap(model, data, axis=None): + import matplotsoccer as mps + if axis is None: axis = plt.figure(figsize=(8, 10)).add_subplot(111) diff --git a/soccer_xg/xg.py b/soccer_xg/xg.py index 5de3d77..4f6d26e 100644 --- a/soccer_xg/xg.py +++ b/soccer_xg/xg.py @@ -2,14 +2,14 @@ from __future__ import annotations import warnings -from typing import Tuple, List, Optional from pathlib import Path +from typing import Callable, Literal import joblib import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.metrics import brier_score_loss, roc_auc_score -from sklearn.pipeline import make_pipeline +from sklearn.pipeline import Pipeline, make_pipeline from sklearn.utils.validation import NotFittedError from tqdm.auto import tqdm @@ -17,6 +17,7 @@ from soccer_xg import metrics from soccer_xg.data.base import Dataset from soccer_xg.ml.preprocessing import simple_proc_for_linear_algoritms +from soccer_xg.ml.pipeline import InteractionFeature class XGModel: @@ -24,22 +25,17 @@ class XGModel: Parameters ---------- - copy_data : boolean (default=``True``) - Whether or not to copy data when fitting and applying the model. - Running the model in-place (``copy_data=False``) will be faster and - have a smaller memory footprint, but if not done carefully can lead to - data integrity issues. - - Attributes - ---------- - model : A Scikit-learn pipeline (or equivalent) - The actual model used to compute xG. Upon initialization it will be - set to a default model, but can be overridden by the user. + dataset_transformer : DatasetTransformer + A dataset transformer to convert a dataset to features. + pipeline : Pipeline + Scikit-Learn pipeline to use for the model. 
column_descriptions : dictionary A dictionary whose keys are the names of the columns used in the model, and the values are string descriptions of what the columns - mean. Set at initialization to be the default model, if you create - your own model you'll need to update this attribute manually. + mean. + + Attributes + ---------- training_seasons : A list of tuples, or ``None`` (default=``None``) If the model was trained using data from a Dataset, a list of (competition_id, season_id) tuples used to train the model. If no @@ -63,11 +59,14 @@ class XGModel: model_directory = Path(__file__).resolve().parent / "models" _default_model_filename = "default_model.xg" - def __init__(self, copy_data: bool = True): - self.copy_data = copy_data - self.column_descriptions = None - - self.model = self.create_default_pipeline() + def __init__( + self, + dataset_transformer: DatasetTransformer, + pipeline: Pipeline, + column_descriptions: dict[str, str] | None = None, + ): + self.dataset_transformer = dataset_transformer + self.pipeline = pipeline self._fitted = False self._training_seasons = None self._validation_seasons = None @@ -77,27 +76,43 @@ def __init__(self, copy_data: bool = True): self._num_shots_used = None @property - def training_seasons(self) -> Optional[List[Tuple[str, str]]]: + def training_seasons(self) -> list[tuple[str, str]] | None: return self._training_seasons @property - def validation_seasons(self): + def validation_seasons(self) -> list[tuple[str, str]] | None: return self._validation_seasons @property - def sample_probabilities(self): + def sample_probabilities(self) -> list[float] | None: return self._sample_probabilities @property - def predicted_goal_percents(self): + def predicted_goal_percents(self) -> list[float] | None: return self._predicted_goal_percents @property - def num_shots_used(self): + def num_shots_used(self) -> int | None: return self._num_shots_used @classmethod - def filter_shots(cls, df_actions): + def filter_shots(cls, 
df_actions) -> pd.Series: + """Return a boolean mask indicating which shots to handle. + + This method is used to filter out shots that should not be used + for training, validation, or prediction. By default, it filters + out own-goals only. + + Parameters + ---------- + df_actions : pd.DataFrame + A dataframe containing the SPADL actions. + + Returns + ------- + pd.Series + A boolean mask indicating which shots to handle. + """ shot_mask = df_actions.type_name.isin( ["shot", "shot_penalty", "shot_freekick"] ) & df_actions.result_name.isin(["fail", "success"]) @@ -106,30 +121,34 @@ def filter_shots(cls, df_actions): def train( self, source_data: Dataset | pd.DataFrame, - training_seasons: List[Tuple[str, str]] = (("ENG", "1617"), ("ENG", "1718")), target_colname: str = "goal", + training_seasons: list[tuple[str, str]] | None = None, ): """Train the model. Once a modeling pipeline is set up (either the default or something custom-generated), historical data needs to be fed into it in order to "fit" the model so that it can then be used to predict future results. - This method implements a simple wrapper around the core Scikit-learn functionality - which does this. + This method implements a simple wrapper around the core Scikit-learn + functionality which does this. - The default is to use data from a Dataset object, however that can be changed - to a simple Pandas DataFrame with precomputed features and labels if desired. + The default is to use data from a Dataset object, however that can be + changed to a simple Pandas DataFrame with precomputed features and + labels if desired. - There is no particular output from this function, rather the parameters governing - the fit of the model are saved inside the model object itself. If you want to get an - estimate of the quality of the fit, use the ``validate_model`` method after running - this method. 
+ There is no particular output from this function, rather the + parameters governing the fit of the model are saved inside the model + object itself. If you want to get an estimate of the quality of the + fit, use the ``validate_model`` method after running this method. Parameters ---------- source_data : ``Dataset`` or a Pandas DataFrame The data to be used to train the model. If an instance of - ``Dataset`` is given, will query the api database for the training data. + ``Dataset`` is given, will query the database for the training data. + target_colname : string or integer (default=``"goal"``) + The name of the target variable column. This is only relevant if + ``source_data`` is not a ``Dataset``. training_seasons : list of tuples (default=``[('ENG', '1617'), ('ENG', '1718')]``) What seasons to use to train the model if getting data from a Dataset instance. If ``source_data`` is not a ``Dataset``, this argument will be ignored. @@ -137,40 +156,35 @@ def train( model - some will need to be reserved for a final validation (see the ``validate_model`` method). A good dataset to reserve for validation is the most recent one or two seasons. - target_colname : string or integer (default=``"goal"``) - The name of the target variable column. This is only relevant if - ``source_data`` is not a ``Dataset``. 
Returns ------- ``None`` """ - if isinstance(self.model, list): - for model in self.model: - model.train(source_data, training_seasons, target_colname) + self._training_seasons = [] + if isinstance(source_data, Dataset): + game_ids = pd.concat( + source_data.games(competition_id=s[0], season_id=s[1]) for s in training_seasons + ).index.tolist() + feature_cols, target_col = self.dataset_transformer.transform( + dataset=source_data, + game_ids=game_ids, + ) + self._training_seasons = training_seasons else: - self._training_seasons = [] - if isinstance(source_data, Dataset): - game_ids = pd.concat( - source_data.games(competition_id=s[0], season_id=s[1]) - for s in training_seasons - ).index - feature_cols, target_col = prepare( - source_data, game_ids, shotfilter=self.filter_shots - ) - self._training_seasons = training_seasons - else: - target_col = source_data[target_colname] - feature_cols = source_data.drop(target_colname, axis=1) - self.model.fit(feature_cols, target_col) + target_col = source_data[target_colname] + feature_cols = source_data.drop(target_colname, axis=1) + self.pipeline.fit(feature_cols, target_col.squeeze()) self._fitted = True def validate( self, - source_data, - validation_seasons=(("ENG", "1819")), - target_colname="goal", - plot=True, + source_data: Dataset | pd.DataFrame, + target_colname: str = "goal", + validation_seasons: list[tuple[str, str]] | None = None, + n_bins=10, + bin_strategy="quantile", + plot: bool = True, ): """Validate the model. @@ -202,6 +216,15 @@ def validate( target_colname : string or integer (default=``"goal"``) The name of the target variable column. This is only relevant if ``source_data`` is not a ``Dataset``. + n_bins : int, default=10 + Number of bins to discretize the [0, 1] interval. A bigger number + requires more data. + strategy : {'uniform', 'quantile'}, default='uniform' + Strategy used to define the widths of the bins. + uniform + The bins have identical widths. 
+ quantile + The bins have the same number of samples and depend on `y_prob`. plot: bool (default=true) Whether to plot the AUROC and probability calibration curves. @@ -222,23 +245,26 @@ def validate( if isinstance(source_data, Dataset): game_ids = pd.concat( source_data.games(competition_id=s[0], season_id=s[1]) for s in validation_seasons - ).index - _, target_col = prepare(source_data, game_ids) + ).index.tolist() + feature_cols, target_col = self.dataset_transformer.transform( + dataset=source_data, game_ids=game_ids + ) self._validation_seasons = validation_seasons else: game_ids = None + feature_cols = source_data.drop(target_colname, axis=1) target_col = source_data[target_colname] self._validation_seasons = [] - df_predictions = self.estimate(source_data, game_ids) - predicted_probabilities = df_predictions["xG"] - target_col = target_col.loc[df_predictions.index] + predicted_probabilities = self.estimate(feature_cols)["xG"] + target_col = target_col.squeeze() ( - self._sample_probabilities, self._predicted_goal_percents, + self._sample_probabilities, + _, self._num_shots_used, - ) = metrics.bayesian_calibration_curve(target_col.values, predicted_probabilities) + ) = metrics.calibration_curve(target_col, predicted_probabilities, n_bins, bin_strategy) # Compute the maximal deviation from a perfect prediction as well as the area under the # curve of the residual between |predicted - perfect|: @@ -251,28 +277,45 @@ def validate( roc = roc_auc_score(target_col, predicted_probabilities) brier = brier_score_loss(target_col, predicted_probabilities) ece = metrics.expected_calibration_error( - target_col, predicted_probabilities, 10, "uniform" + target_col, predicted_probabilities, n_bins, "uniform" ) ace = metrics.expected_calibration_error( - target_col, predicted_probabilities, 10, "quantile" + target_col, predicted_probabilities, n_bins, "quantile" ) if plot: import matplotlib.pyplot as plt from soccer_xg.visualisation import ( - plot_calibration_curve, + 
plot_reliability_diagram, plot_roc_curve, ) fig, ax = plt.subplots(1, 2, figsize=(10, 5)) - plot_roc_curve(target_col, predicted_probabilities, axis=ax[0]) - plot_calibration_curve( + plot_roc_curve(target_col, predicted_probabilities, fig=fig, ax=ax[0]) + plot_reliability_diagram( target_col, predicted_probabilities, + fig=fig, + ax=ax[1], + bayesian=False, + bins=n_bins, + bin_strategy=bin_strategy, + show_counts=False, + fmt='s-', min_samples=100, - axis=ax[1], + show_histogram=False, + overlay_histogram=False, + invert_histogram=False, + ci=0.95, + shaded_ci=True, + show_gaps=False, + show_bars=False, ) + ax[0].set_title("ROC curve") + ax[1].set_title("Reliability diagram") + plt.tight_layout() + plt.show() return { "max_dev": max_deviation, @@ -281,10 +324,9 @@ def validate( "brier": brier, "ece": ece, "ace": ace, - "fig": fig if plot else None, } - def estimate(self, source_data, game_ids=None): + def estimate(self, source_data: Dataset | pd.DataFrame, game_ids: list[int] | None = None): """Estimate the xG values for all shots in a set of games. The default is to use data from a Dataset object, however that can be changed @@ -315,32 +357,15 @@ def estimate(self, source_data, game_ids=None): if not self._fitted: raise NotFittedError("Must fit model before predicting WP.") - if isinstance(self.model, list): - xg = [] - for model in self.model: - xg.append(model.estimate(source_data, game_ids)) - return pd.concat(xg).sort_index() - else: - if isinstance(source_data, Dataset): - game_ids = source_data.games().index if game_ids is None else game_ids - source_data, _ = prepare(source_data, game_ids) - - xg = pd.DataFrame(index=source_data.index) - xg["xG"] = self.model.predict_proba(source_data)[:, 1] - return xg - - def create_default_pipeline(self): - """Create the default xG estimation pipeline. 
+ if isinstance(source_data, Dataset): + game_ids = source_data.games().index.tolist() if game_ids is None else game_ids + source_data, _ = self.dataset_transformer.transform( + dataset=source_data, game_ids=game_ids + ) - Returns - ------- - Scikit-learn pipeline - The default pipeline, suitable for computing xG - but by no means the best possible model. - """ - models = [OpenplayXGModel(), FreekickXGModel(), PenaltyXGModel()] - self.column_descriptions = {m.__class__.__name__: m.column_descriptions for m in models} - return models + xg = pd.DataFrame(index=source_data.index) + xg["xG"] = self.pipeline.predict_proba(source_data)[:, 1] + return xg def save_model(self, filename=None): """Save the XGModel instance to disk. @@ -386,58 +411,29 @@ def load_model(cls, filename=None): return joblib.load(cls.model_directory / filename) -class OpenplayXGModel(XGModel): - _default_model_filename = "default_openplay_model.xg" - - def create_default_pipeline(self): - bodypart_colname = "bodypart_id_a0" - dist_to_goal_colname = "start_dist_to_goal_a0" - angle_to_goal_colname = "start_angle_to_goal_a0" - - self.column_descriptions = { - bodypart_colname: "Bodypart used for the shot (head, foot or other)", - dist_to_goal_colname: "Distance to goal", - angle_to_goal_colname: "Angle to goal", - } - - preprocess_pipeline = simple_proc_for_linear_algoritms( - [dist_to_goal_colname, angle_to_goal_colname], [bodypart_colname] - ) - base_model = LogisticRegression(max_iter=10000, solver="lbfgs", fit_intercept=False) - pipe = make_pipeline(preprocess_pipeline, base_model) - return pipe - - @classmethod - def filter_shots(cls, df_actions): - shot_idx = (df_actions.type_name == "shot") & df_actions.result_name.isin( - ["fail", "success"] - ) - return shot_idx - - class PenaltyXGModel(XGModel): _default_model_filename = "default_penalty_model.xg" - def __init__(self, copy_data=True): - super().__init__(copy_data) + def __init__(self): self._fitted = True + self.dataset_transformer = 
DatasetTransformer( + xfns=[], shotfilter=lambda x: x.type_name == "shot_penalty" + ) + self.pipeline = Pipeline([]) def train( self, - source_data, - training_seasons=(("ENG", "1617"), ("ENG", "1718")), - target_colname="goal", + source_data: Dataset | pd.DataFrame, + target_colname: str = "goal", + training_seasons: list[tuple[str, str]] | None = None, ): pass - def estimate(self, source_data, game_ids=None): + def estimate(self, source_data: Dataset | pd.DataFrame, game_ids: list[int] | None = None): if isinstance(source_data, Dataset): - game_ids = source_data.games.index if game_ids is None else game_ids - source_data, _ = prepare( - source_data, - game_ids, - xfns=[], - shotfilter=PenaltyXGModel.filter_shots, + game_ids = source_data.games().index.tolist() if game_ids is None else game_ids + source_data, _ = self.dataset_transformer.transform( + dataset=source_data, game_ids=game_ids ) xg = pd.DataFrame(index=source_data.index) @@ -445,103 +441,286 @@ def estimate(self, source_data, game_ids=None): return xg - def create_default_pipeline(self): - return None - - @classmethod - def filter_shots(cls, df_actions): - shot_idx = df_actions.type_name == "shot_penalty" - return shot_idx - class FreekickXGModel(XGModel): _default_model_filename = "default_freekick_model.xg" - def create_default_pipeline(self): - dist_to_goal_colname = "start_dist_to_goal_a0" - angle_to_goal_colname = "start_angle_to_goal_a0" + def __init__(self): + self.dataset_transformer = DatasetTransformer( + xfns=[fs.shot_dist, fs.shot_visible_angle], + shotfilter=lambda x: x.type_name == "shot_freekick", + ) + self.pipeline = self._build_pipeline() + + def _build_pipeline(self) -> Pipeline: + dist_colname = "dist_shot" + angle_colname = "visible_angle_shot" + dist_x_angle_colname = "dist_x_visible_angle_shot" self.column_descriptions = { - dist_to_goal_colname: "Distance to goal", - angle_to_goal_colname: "Angle to goal", + dist_colname: "Distance to goal", + angle_colname: "Angle to goal", + 
dist_x_angle_colname: "Distance * angle to goal", } + feature_pipeline = InteractionFeature([dist_colname, angle_colname], dist_x_angle_colname) preprocess_pipeline = simple_proc_for_linear_algoritms( - [dist_to_goal_colname, angle_to_goal_colname], [] + numeric_features=[dist_colname, angle_colname, dist_x_angle_colname], + categoric_features=[], ) base_model = LogisticRegression(max_iter=10000, solver="lbfgs", fit_intercept=True) - pipe = make_pipeline(preprocess_pipeline, base_model) - return pipe + return make_pipeline(feature_pipeline, preprocess_pipeline, base_model) + + +class BasicOpenplayXGModel(XGModel): + _default_model_filename = "default_openplay_model.xg" + + def __init__(self): + self.dataset_transformer = DatasetTransformer( + xfns=[fs.shot_dist, fs.shot_visible_angle, fs.shot_bodypart], + shotfilter=lambda x: x.type_name == 'shot' and x.result_name in ["fail", "success"], + ) + self.pipeline = self._build_pipeline() + + def _build_pipeline(self) -> Pipeline: + bodypart_colname = "bodypart_name_shot" + dist_colname = "dist_shot" + angle_colname = "visible_angle_shot" + dist_x_angle_colname = "dist_x_visible_angle_shot" + + self.column_descriptions = { + bodypart_colname: "Bodypart used for the shot (head, foot or other)", + dist_colname: "Distance to goal", + angle_colname: "Angle to goal", + dist_x_angle_colname: "Distance * angle to goal", + } + + feature_pipeline = InteractionFeature([dist_colname, angle_colname], dist_x_angle_colname) + preprocess_pipeline = simple_proc_for_linear_algoritms( + numeric_features=[dist_colname, angle_colname, dist_x_angle_colname], + categoric_features=[bodypart_colname], + ) + base_model = LogisticRegression(max_iter=10000, solver="lbfgs", fit_intercept=False) + return make_pipeline(feature_pipeline, preprocess_pipeline, base_model) + + +class AdvancedOpenplayXGModel(XGModel): + _default_model_filename = "default_openplay_model.xg" + + def __init__(self): + self.dataset_transformer = DatasetTransformer( + 
xfns=[fs.shot_dist, fs.shot_visible_angle, fs.shot_bodypart], + shotfilter=lambda x: x.type_name == 'shot' and x.result_name in ["fail", "success"], + ) + self.pipeline = self._build_pipeline() + + def _build_pipeline(self) -> Pipeline: + bodypart_colname = "bodypart_name_shot" + dist_colname = "dist_shot" + angle_colname = "visible_angle_shot" + dist_x_angle_colname = "dist_x_visible_angle_shot" + + self.column_descriptions = { + bodypart_colname: "Bodypart used for the shot (head, foot or other)", + dist_colname: "Distance to goal", + angle_colname: "Angle to goal", + dist_x_angle_colname: "Distance * angle to goal", + } + + preprocess_pipeline = simple_proc_for_linear_algoritms( + numeric_features=[dist_colname, angle_colname, dist_x_angle_colname], + categoric_features=[bodypart_colname], + ) + base_model = LogisticRegression(max_iter=10000, solver="lbfgs", fit_intercept=False) + return make_pipeline(preprocess_pipeline, base_model) + + +class StatsBombOpenplayXGModel(XGModel): + _default_model_filename = "default_openplay_model.xg" + + def __init__(self): + self.dataset_transformer = DatasetTransformer( + xfns=[fs.shot_dist, fs.shot_visible_angle, fs.shot_bodypart], + shotfilter=lambda x: x.type_name == 'shot' and x.result_name in ["fail", "success"], + ) + self.pipeline = self._build_pipeline() + + def _build_pipeline(self) -> Pipeline: + bodypart_colname = "bodypart_name_shot" + dist_colname = "dist_shot" + angle_colname = "visible_angle_shot" + dist_x_angle_colname = "dist_x_visible_angle_shot" + + self.column_descriptions = { + bodypart_colname: "Bodypart used for the shot (head, foot or other)", + dist_colname: "Distance to goal", + angle_colname: "Angle to goal", + dist_x_angle_colname: "Distance * angle to goal", + } + + feature_pipeline = make_pipeline( + InteractionFeature([dist_colname, angle_colname], dist_x_angle_colname) + ) + preprocess_pipeline = simple_proc_for_linear_algoritms( + numeric_features=[dist_colname, angle_colname, 
dist_x_angle_colname], + categoric_features=[bodypart_colname], + ) + base_model = LogisticRegression(max_iter=10000, solver="lbfgs", fit_intercept=False) + return make_pipeline(feature_pipeline, preprocess_pipeline, base_model) + + +class XGModelEnsemble: + def __init__(self, models=None): + super().__init__() + if models is None: + models = [BasicOpenplayXGModel(), FreekickXGModel(), PenaltyXGModel()] + self.model = models + self.column_descriptions = {m.__class__.__name__: m.column_descriptions for m in models} + + def train(self, source_data, training_seasons, target_colname="goal"): + for model in self.model: + model.train(source_data, training_seasons, target_colname) + + def validate(self, source_data, validation_seasons, target_colname="goal", plot=True): + results = {} + for model in self.model: + results.update(model.validate(source_data, validation_seasons, target_colname, plot)) + return results + + def estimate(self, source_data, game_ids=None): + xg = [] + for model in self.model: + xg.append(model.estimate(source_data, game_ids)) + return pd.concat(xg).sort_index() + + def save_model(self, filename=None): + if filename is None: + filename = self._default_model_filename + for i, model in enumerate(self.model): + model.save_model(filename=f"{filename}_{i}") @classmethod - def filter_shots(cls, df_actions): - shot_idx = df_actions.type_name == "shot_freekick" - return shot_idx - - -def prepare( - dataset: Dataset, - game_ids=None, - xfns=fs.default_features, - yfns=fs.default_labels, - shotfilter=None, - nb_prev_actions=3, - on_fail="raise", -): - """Prepare a dataset for training and validation. 
+ def load_model(cls, filename=None): + if filename is None: + filename = cls._default_model_filename + + models = [] + i = 0 + while True: + try: + models.append(joblib.load(cls.model_directory / f"{filename}_{i}")) + i += 1 + except FileNotFoundError: + break + return cls(models) + + +def is_shot(action) -> bool: + """Return a boolean mask indicating which shots to handle. + + This method is used to filter out shots that should not be used + for training, validation, or prediction. By default, it filters + out own-goals only. + + Parameters + ---------- + df_actions : pd.Series + A dataframe containing the SPADL actions. + + Returns + ------- + pd.Series + A boolean mask indicating which shots to handle. + """ + return (action.type_name in ["shot", "shot_penalty", "shot_freekick"]) and ( + action.result_name in ["fail", "success"] + ) + + +class DatasetTransformer: + """Transforms a dataset to xG features and labels. Parameters ---------- - dataset : Dataset - The dataset to use. - game_ids : list of ints (default=None) - Only use data from the games in this list. By default, all games - in the dataset are used. xfns : list(callable) List of feature generators to apply. Defaults to ``default_features``. yfns : list(callable) List of label generators to apply. Defaults to ``default_labels``. shotfilter: callable(pd.Series) -> bool - A function that takes a shot (in SPADL format) and returns True if the - shot should be used for feature extraction. If None, all shots will be - used (excluding own-goals). + A function that takes a SPADL action and returns True if the + action should be used for feature extraction. If None, all shots will + be used (excluding own-goals). nb_prev_actions: int The number of previous actions to consider when calculating labels - on_fail: 'raise' or 'warn' - What to do if a feature or label function fails on a specific game. - - Returns - ------- - X : pd.DataFrame - A dataframe containing the features. 
- y : pd.DataFrame - A dataframe containing the labels. """ - game_ids = dataset.games().index if game_ids is None else game_ids - X, y = {}, {} - for game_id in tqdm(game_ids, desc="Preparing dataset"): - try: - game = dataset.games().loc[game_id] - game_actions = dataset.actions(game_id) - game_events = dataset.events(game_id) - X[game_id], y[game_id] = fs.compute_attributes( - game, - game_actions, - events=game_events, - xfns=xfns, - yfns=yfns, - shotfilter=shotfilter, - nb_prev_actions=nb_prev_actions, - ) - X[game_id]["game_id"] = game_id - y[game_id]["game_id"] = game_id - except Exception as e: - if on_fail == "warn": - warnings.warn(f"Failed for game with id={game_id}: {e}") - else: - raise RuntimeError(f"Failed for game with id={game_id}.") from e - X = pd.concat(X.values()).reset_index().set_index(["game_id", "action_id"]) - # remove post-shot features (these will all have a single unique value) - f = X.columns[X.nunique() > 1] - y = pd.concat(y.values()).reset_index().set_index(["game_id", "action_id"]) - return X[f], y + + def __init__( + self, + xfns: list[fs.AttributeGenerator | str] = fs.default_features, + yfns: list[fs.AttributeGenerator | str] = fs.default_labels, + shotfilter: Callable[[pd.Series], bool] = is_shot, + nb_prev_actions: int = 3, + ): + self.xfns = xfns + self.yfns = yfns + self.shotfilter = shotfilter + self.nb_prev_actions = nb_prev_actions + + def transform( + self, + dataset: Dataset, + game_ids: list[int] | None = None, + on_fail: Literal["raise", "warn"] = "raise", + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """Prepare a dataset for training and validation. + + Parameters + ---------- + dataset : Dataset + The dataset to use. + game_ids : list of ints (default=None) + Only use data from the games in this list. By default, all games + in the dataset are used. + on_fail: 'raise' or 'warn' + What to do if a feature or label function fails on a specific game. 
+ + Returns + ------- + X : pd.DataFrame + A dataframe containing the features. + y : pd.DataFrame + A dataframe containing the labels. + """ + games = dataset.games() + game_ids = games.index.tolist() if game_ids is None else game_ids + X, y = {}, {} + for game_id in tqdm(game_ids, desc="Preparing dataset"): + try: + game = games.loc[game_id] + game_actions = dataset.actions(game_id) + game_events = dataset.events(game_id) + _X, _y = fs.compute_attributes( + game, + game_actions, + events=game_events, + xfns=self.xfns, + yfns=self.yfns, + shotfilter=self.shotfilter, + nb_prev_actions=self.nb_prev_actions, + ) + _X["game_id"] = game_id + _y["game_id"] = game_id + if len(_X) and len(_y): + X[game_id] = _X + y[game_id] = _y + except Exception as e: + if on_fail == "warn": + warnings.warn(f"Failed for game with id={game_id}: {e}", stacklevel=2) + else: + raise RuntimeError(f"Failed for game with id={game_id}.") from e + X = pd.concat(X.values()).reset_index().set_index(["game_id", "action_id"]) + # remove post-shot features (these will all have a single unique value) + # f = X.columns[X.nunique() > 1] + f = X.columns + y = pd.concat(y.values()).reset_index().set_index(["game_id", "action_id"]) + return X[f], y