diff --git a/.gitignore b/.gitignore index 93f68a8..708b8ab 100644 --- a/.gitignore +++ b/.gitignore @@ -183,9 +183,9 @@ build.py tests/files/models/my-test-gnn/* tests/files/test.pickle.gz +examples/models/* /models /dev - -*.ipynb +/*.ipynb .env \ No newline at end of file diff --git a/examples/0_quick_start_guide.ipynb b/examples/0_quick_start_guide.ipynb index f26fd7b..73787e3 100644 --- a/examples/0_quick_start_guide.ipynb +++ b/examples/0_quick_start_guide.ipynb @@ -80,7 +80,7 @@ "converter = GraphConverter(dataset=kloppy_dataset, labels=dummy_labels(kloppy_dataset))\n", "\n", "# Compute the graphs and add them to the CustomSpektralDataset\n", - "dataset = CustomSpektralDataset(graph=converter.to_spektral_graphs())" + "dataset = CustomSpektralDataset(graphs=converter.to_spektral_graphs())" ] }, { diff --git a/examples/1_kloppy_gnn_train.ipynb b/examples/1_kloppy_gnn_train.ipynb index 475e1af..3a87e50 100644 --- a/examples/1_kloppy_gnn_train.ipynb +++ b/examples/1_kloppy_gnn_train.ipynb @@ -75,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -184,9 +184,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing frames: 100%|██████████| 500/500 [00:02<00:00, 244.81it/s]\n", + "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 285.65it/s]\n", + "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 343.58it/s] \n", + "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 285.17it/s]\n" + ] + } + ], "source": [ "from os.path import exists\n", "\n", @@ -266,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -302,9 +313,19 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + 
"execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train: CustomSpektralDataset(n_graphs=791)\n", + "Test: CustomSpektralDataset(n_graphs=477)\n", + "Validation: CustomSpektralDataset(n_graphs=336)\n" + ] + } + ], "source": [ "train, test, val = dataset.split_test_train_validation(\n", " split_train=4, split_test=1, split_validation=1, by_graph_id=True, random_seed=42\n", @@ -330,7 +351,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -358,7 +379,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -417,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -479,7 +500,9 @@ "\n", "1. We have a a [`DisjointLoader`](https://graphneural.network/loaders/#disjointloader) for training and validation sets.\n", "2. Fit the model. \n", - "3. We add `EarlyStopping` and a `validation_data` dataset to monitor performance, and set `use_multiprocessing=True` to improve training speed." + "3. We add `EarlyStopping` and a `validation_data` dataset to monitor performance, and set `use_multiprocessing=True` to improve training speed.\n", + "\n", + "⚠️ When trying to fit the model _again_ make sure to reload Data Loaders in [Section 6.4](#64-create-dataloaders), because they are generators." ] }, { @@ -491,7 +514,7 @@ "model.fit(\n", " loader_tr.load(),\n", " steps_per_epoch=loader_tr.steps_per_epoch,\n", - " epochs=10,\n", + " epochs=5,\n", " use_multiprocessing=True,\n", " validation_data=loader_va.load(),\n", " callbacks=[EarlyStopping(monitor=\"loss\", patience=5, restore_best_weights=True)],\n", @@ -529,14 +552,24 @@ "1. Create another `DisjointLoader`, this time for the test set.\n", "2. Evaluate model performance on the test set. 
This evaluation function uses the `metrics` passed to `model.compile`\n", "\n", - "Note: Our performance is really bad because we're using random labels, very few epochs and a small dataset." + "🗒️ Our performance is really bad because we're using random labels, very few epochs and a small dataset.\n", + "\n", + "📖 For more information on evaluation in sports analytics see: [Methodology and evaluation in sports analytics: challenges, approaches, and lessons learned {J. Davis et. al. (2024)}](https://link.springer.com/article/10.1007/s10994-024-06585-0)\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15/15 [==============================] - 0s 4ms/step - loss: 0.7250 - auc: 0.5309 - binary_accuracy: 0.5241\n" + ] + } + ], "source": [ "loader_te = DisjointLoader(test, epochs=1, shuffle=False, batch_size=batch_size)\n", "results = model.evaluate(loader_te.load())" @@ -555,7 +588,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -595,9 +628,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 326.02it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11/11 [==============================] - 0s 4ms/step\n" + ] + } + ], "source": [ "# Compute the graphs and add them to the CustomSpektralDataset\n", "pred_dataset = CustomSpektralDataset(graphs=preds_converter.to_spektral_graphs())\n", @@ -612,16 +660,95 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "5. 
Convert Klopy dataset to a dataframe and merge back the pedictions using the frame_ids.\n", - "\n", - "Note: Not all frames have a prediction because of missing (ball) data." + "5. Convert Kloppy dataset to a dataframe and merge back the predictions using the frame_ids." ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
frame_idperiod_idtimestampy
300216610 days 00:00:33.3000000.259016
301216710 days 00:00:33.4000000.251124
302216810 days 00:00:33.5000000.258305
303216910 days 00:00:33.6000000.256378
304217010 days 00:00:33.7000000.305434
\n", + "
" + ], + "text/plain": [ + " frame_id period_id timestamp y\n", + "300 2166 1 0 days 00:00:33.300000 0.259016\n", + "301 2167 1 0 days 00:00:33.400000 0.251124\n", + "302 2168 1 0 days 00:00:33.500000 0.258305\n", + "303 2169 1 0 days 00:00:33.600000 0.256378\n", + "304 2170 1 0 days 00:00:33.700000 0.305434" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", @@ -631,7 +758,16 @@ " {\"frame_id\": [x.id for x in pred_dataset], \"y\": preds.flatten()}\n", ")\n", "\n", - "kloppy_df = pd.merge(kloppy_df, preds_df, on=\"frame_id\", how=\"left\")" + "kloppy_df = pd.merge(kloppy_df, preds_df, on=\"frame_id\", how=\"left\")\n", + "\n", + "kloppy_df[300: 305][['frame_id', 'period_id', 'timestamp', 'y']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "🗒️ Not all frames have a prediction because of missing (ball) data, so we look at the 300th frame." ] } ], diff --git a/examples/graphs_faq.ipynb b/examples/graphs_faq.ipynb new file mode 100644 index 0000000..20f6a7d --- /dev/null +++ b/examples/graphs_faq.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### A. What is a Graph?\n", + "\n", + "
\n", + " 🌀 Expand for a short explanation of Graphs \n", + "
\n", + "
\n", + "\n", + "Before we continue it might be good to briefly explain what a Graph even is!\n", + "\n", + "A Graph is a data structure consisting of:\n", + "- Nodes: Individual elements in the graph\n", + "- Edges: Connections between nodes\n", + "\n", + "The graph is typically represented by:\n", + "- [Adjacency matrix](https://en.wikipedia.org/wiki/Adjacency_matrix): Shows connections between nodes\n", + "- Node features: Attributes or properties of each node\n", + "- Edge features: Attributes of the connections between nodes\n", + "\n", + "The image on the right represents a stylized version of a frame of tracking data in soccer.\n", + "\n", + "In section 6.1 we can see what this looks like in Python.\n", + "\n", + "
\n", + "
\n", + "\n", + "![Graph representation](https://github.com/UnravelSports/unravelsports.github.io/blob/main/imgs/what-is-a-graph-4.png?raw=true)\n", + "\n", + "
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### B. What are all GraphConverter settings?\n", + "\n", + "
\n", + " 🌀 Expand for a full table of additional optional GraphConverter parameters
\n", + "\n", + "| Parameter | Type | Description | Default |\n", + "|-----------|------|-------------|---------|\n", + "| `prediction` | bool | When True use the converter to create Graph dataset to apply a pre-trained model to, no labels required. Defaults to False. | False |\n", + "| `ball_carrier_threshold` | float | The distance threshold to determine the ball carrier in meters. If no ball carrier within ball_carrier_threshold, we skip the frame. | 25.0 |\n", + "| `max_player_speed` | float | The maximum speed of a player in meters per second. Used for normalizing node features. | 12.0 |\n", + "| `max_ball_speed` | float | The maximum speed of the ball in meters per second. Used for normalizing node features. | 28.0 |\n", + "| `boundary_correction` | float | A correction factor for boundary calculations, used to correct out of bounds as a percentage (Used as 1+boundary_correction, i.e., 0.05). Not setting this might lead to players outside the pitch markings to have values that fall slightly outside of our normalization range. When we set boundary_correction, any players outside the pitch will be moved to be on the closest line. | None |\n", + "| `self_loop_ball` | bool | Flag to indicate if the ball node should have a self-loop, aka be connected with itself and not only player(s) | True |\n", + "| `adjacency_matrix_connect_type` | str | The type of connection used in the adjacency matrix, typically related to the ball. Choose from 'ball', 'ball_carrier' or 'no_connection' | 'ball' |\n", + "| `adjacency_matrix_type` | str | The type of adjacency matrix, indicating how connections are structured, such as split by team. Choose from 'delaunay', 'split_by_team', 'dense', 'dense_ap' or 'dense_dp' | 'split_by_team' |\n", + "| `infer_ball_ownership` | bool | Infers 'attacking_team' if no 'ball_owning_team' exist (in Kloppy TrackingDataset) by finding the player closest to the ball using ball xyz, uses 'ball_carrier_threshold' as a cut-off. 
| True |\n", + "| `infer_goalkeepers` | bool | Set True if no GK label is provided, set False for incomplete (broadcast tracking) data that might not have a GK in every frame. | True |\n", + "| `defending_team_node_value` | float | Value for the node feature when player is on defending team. Should be between 0 and 1 including. | 0.1 |\n", + "| `non_potential_receiver_node_value` | float | Value for the node feature when player is NOT a potential receiver of a pass (when on opposing team or in possession of the ball). Should be between 0 and 1 including. | 0.1 |\n", + "| `label_type` | str | The type of prediction label used. Currently only supports 'binary' | 'binary' |\n", + "| `random_seed` | int, bool | When a random_seed is given, it will randomly shuffle an individual Graph without changing the underlying structure. When set to True, it will shuffle every frame differently; False won't shuffle. Advised to set True when creating an actual dataset to support Permutation Invariance. | False |\n", + "| `pad` | bool | True pads to a total amount of 22 players and ball (so 23x23 adjacency matrix). It dynamically changes the edge feature padding size based on the combination of AdjacencyMatrixConnectType and AdjacencyMatrixType, and self_loop_ball. No need to set padding because smaller and larger graphs can all be used in the same dataset. | False |\n", + "| `verbose` | bool | The converter logs warnings / error messages when specific frames have no coordinates, or other missing information. False mutes all of these warnings. | False |\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### C. What features does each Graph have?\n", + "\n", + "
\n", + " 🌀 Expand for a full list of features \n", + "
\n", + " \n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### D. What is a CustomSpektralDataset?\n", + "\n", + "
\n", + " 🌀 Expand for a short explanation on CustomSpektralDataset
\n", + "\n", + "Let's have a look at the internals of our `CustomSpektralDataset`. This dataset class contains a list of graphs, available through `dataset.graphs`.\n", + "\n", + "The first item in our dataset has 23 nodes, 12 features per node and 7 features per edge.\n", + "\n", + "
\n", + "\n", + "```python\n", + "dataset.graphs[0]\n", + "\n", + ">>> Graph(n_nodes=23, n_node_features=12, n_edge_features=7, n_labels=1)\n", + "```\n", + "\n", + "The `CustomSpektralDataset` also allows us to split our data into train and test sets (and validation set if required) by using either:\n", + "- `dataset.split_test_train_validation()`\n", + "- `dataset.split_test_train()`\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + " 🌀 Expand for a short explanation on the representation of adjacency matrix
\n", + "\n", + "##### Adjacency Matrix\n", + "The **adjacency matrix** is represented as a [compressed sparse row matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix), as required by Spektral. A 'normal' version of this same matrix would be of shape 23x23 filled with zeros, with ones in places where two players (or a player and the ball) are connected. \n", + "\n", + "Because we set `adjacency_matrix_type='split_by_team'` and `adjacency_matrix_connect_type=\"ball\"` this results in a total of 287 connections (ones), namely between:\n", + "- `adjacency_matrix_type='split_by_team'`:\n", + " - All players on team A (11 * 11) \n", + " - All players on team B (11 * 11)\n", + " - Ball connected to ball (1)\n", + "- `adjacency_matrix_connect_type=\"ball\"`\n", + " - All players and the ball (22) \n", + " - The ball and all players (22)\n", + "\n", + "
\n", + "\n", + "```python\n", + "dataset.graphs[0].a\n", + ">>> \n", + "```\n", + "
\n", + "
\n", + "
\n", + "
\n", + " 🌀 Expand for a short explanation on the representation of node feature matrix
\n", + "\n", + "##### Node Features\n", + "The **node features** are described using a regular Numpy array. Each column represents one feature and every row represents one player. \n", + "\n", + "The ball is presented in the last row, unless we set `random_seed=True`, in which case every Graph gets randomly shuffled (while leaving connections intact).\n", + "\n", + "See the bullet points in **5. Load Kloppy Data, Convert and Store** to learn which column represents which feature.\n", + "\n", + "The rows filled with zeros are 'empty' players created because we set `pad=True`. Graph Neural Networks are flexible enough to deal with all sorts of different graph shapes in the same dataset, so normally it's not actually necessary to add these empty players, even for incomplete data with only a couple of players in frame.\n", + "\n", + "
\n", + "\n", + "```python\n", + "dataset.graphs[0].x\n", + ">>> [[-0.163 -0.135 0.245 -0.97 0.007 0.289 0.959 0.191 0.059 0.376 1. 1. ]\n", + " [-0.332 0.011 -0.061 0.998 0.02 0.76 1.015 0.177 0.029 0.009 1. 0.1 ]\n", + " [ 0.021 -0.072 0.987 -0.162 0.017 0.474 0.88 0.203 0.121 0.468 1. 1. ]\n", + " [-0.144 0.232 0.343 0.939 0.024 0.694 0.924 0.186 0.077 0.638 1. 1. ]\n", + " [-0.252 0.302 0.99 0.141 0.032 0.523 0.964 0.176 0.078 0.741 1. 1. ]\n", + " [ 0.012 0.573 0.834 -0.551 0.035 0.407 0.842 0.191 0.19 0.646 1. 1. ]\n", + " [-0.293 0.686 0.999 -0.045 0.044 0.493 0.966 0.163 0.182 0.761 1. 1. ]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n", + " ...\n", + " [ 0.202 0.124 -0.874 0.486 0.024 0.919 0.791 0.214 0.197 0.524 0.1 0.1 ]\n", + " [ 0.404 0.143 -0.997 0.08 0.029 0.987 0.709 0.23 0.281 0.519 0.1 0.1 ]\n", + " [ 0.195 -0.391 0.48 -0.877 0.014 0.33 0.847 0.218 0.222 0.417 0.1 0.1 ]\n", + " [ 0.212 -0.063 0.982 -0.187 0.009 0.47 0.804 0.217 0.2 0.483 0.1 0.1 ]\n", + " [-0.03 0.248 -0.996 0.091 0.021 0.986 0.876 0.194 0.116 0.591 0.1 0.1 ]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n", + " [-0.262 0.016 0.937 -0.35 0.036 0.443 0.986 0.044 0. 0. 0. 0. ]]\n", + "\n", + " \n", + "dataset.graphs[0].x.shape\n", + ">>> (23, 12)\n", + "```\n", + "
\n", + "
\n", + "
\n", + "
\n", + " 🌀 Expand for a short explanation on the representation of edge feature matrix
\n", + "\n", + "##### Edge Features\n", + "The **edge features** are also represented in a regular Numpy array. Again, each column represents one feature, and every row describes the connection between two players, or player and ball.\n", + "\n", + "We saw before how the **adjacency matrix** was presented in a Sparse Row Matrix with 287 rows. It is no coincidence this lines up perfectly with the **edge feature matrix**. \n", + "\n", + "
\n", + "\n", + "```python\n", + "dataset.graphs[0].e\n", + ">>> [[ 0. 0. 1. 0.5 0.5 1. 0. ]\n", + " [ 0.081 0.006 0.936 0.255 0.21 0.907 1. ]\n", + " [ 0.079 0.004 0.012 0.391 0. 0.515 1. ]\n", + " [ 0.1 0.007 0.46 0.002 0.005 0.571 1. ]\n", + " [ 0.125 0.011 0.65 0.023 0.474 0.999 0. ]\n", + " [ 0.206 0.012 0.322 0.033 0.535 0.999 0. ]\n", + " [ 0.23 0.016 0.619 0.014 0.567 0.996 0. ]\n", + " [ 0. 0. 0. 0. 0. 0. 0. ]\n", + " [ 0. 0. 0. 0. 0. 0. 0. ]\n", + " [ 0. 0. 0. 0. 0. 0. 0. ]\n", + " ...\n", + " [ 0.197 -0.025 0.005 0.426 0.929 0.757 1. ]\n", + " [ 0.281 -0.023 0.004 0.439 0.959 0.699 1. ]\n", + " [ 0.222 -0.03 0.067 0.75 0.979 0.643 1. ]\n", + " [ 0.2 -0.032 0.003 0.554 0.982 0.633 1. ]\n", + " [ 0.116 -0.026 0.08 0.229 0.82 0.884 1. ]\n", + " [ 0. 0. 0. 0. 0. 0. 1. ]\n", + " [ 0. 0. 0. 0. 0. 0. 1. ]\n", + " [ 0. 0. 0. 0. 0. 0. 1. ]\n", + " [ 0. 0. 0. 0. 0. 0. 1. ]\n", + " [ 0. 0. 1. 0.5 0.5 1. 1. ]]\n", + "\n", + " dataset.graphs[0].e.shape\n", + " (287, 7)\n", + "```\n", + "
\n", + "
\n", + "\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/setup.py b/setup.py index 49c203a..3b62c5a 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,19 @@ from setuptools import setup, find_packages +import os +import re + +# Read the version from the __init__.py file +def read_version(): + version_file = os.path.join(os.path.dirname(__file__), "unravel", "__init__.py") + with open(version_file, "r") as f: + version_match = re.search(r'^__version__ = ["\']([^"\']*)["\']', f.read(), re.M) + if version_match: + return version_match.group(1) + raise RuntimeError("Unable to find version string.") setup( name="unravelsports", - version="0.1.0", + version=read_version(), author="Joris Bekkers", author_email="joris@unravelsports.com", description="A project to analyze sports event and tracking data", diff --git a/unravel/__init__.py b/unravel/__init__.py index 64e01cc..93e998b 100644 --- a/unravel/__init__.py +++ b/unravel/__init__.py @@ -1,3 +1,5 @@ +__version__ = "0.1.1" + from .soccer import * from .utils import * from .classifiers import *