diff --git a/README.md b/README.md index 36850173..5e82cb76 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ from kloppy import ( load_metrica_tracking_data, load_tracab_tracking_data, load_epts_tracking_data, + load_statsbomb_event_data, to_pandas, transform ) @@ -53,6 +54,10 @@ dataset = load_tracab_tracking_data('meta.xml', 'raw_data.txt') # or epts dataset = load_epts_tracking_data('meta.xml', 'raw_data.txt') +# or event data +dataset = load_statsbomb_event_data('event_data.json', 'lineup.json') + + dataset = transform(dataset, pitch_dimensions=[[0, 108], [-34, 34]]) pandas_data_frame = to_pandas(dataset) ``` @@ -143,6 +148,30 @@ with open("raw_data.txt", "rb") as raw, \ ``` +or StatsBomb event data +```python +from kloppy import StatsBombSerializer + +serializer = StatsBombSerializer() + +with open("events/123123.json", "rb") as event_data, \ + open("lineup/123123.json", "rb") as lineup_data: + + dataset = serializer.deserialize( + inputs={ + 'event_data': event_data, + 'lineup_data': lineup_data + }, + options={ + "event_types": ["pass", "shot", "carry", "take_on"] + } + ) + + # start working with dataset +``` + + + ### Transform the pitch dimensions Data providers use their own pitch dimensions. Some use actual meters while others use 100x100. Use the Transformer to get from one pitch dimensions to another one. 
```python @@ -195,7 +224,7 @@ Data models - [ ] Automated tests - [x] Pitch - [x] Tracking -- [ ] Event +- [x] Event Tracking data (de)serializers - [x] Automated tests @@ -207,7 +236,7 @@ Tracking data (de)serializers Event data (de)serializers - [ ] Automated tests - [ ] OPTA -- [ ] StatsBomb +- [x] StatsBomb - [ ] MetricaSports Transformers diff --git a/examples/datasets/metrica.py b/examples/datasets/metrica.py index 3564119b..e0ab51b4 100644 --- a/examples/datasets/metrica.py +++ b/examples/datasets/metrica.py @@ -1,3 +1,6 @@ +import logging +import sys + from kloppy import datasets, to_pandas @@ -6,6 +9,9 @@ def main(): This example shows the use of Metrica datasets, and how we can pass argument to the dataset loader. """ + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") + # The metrica dataset loader loads by default the 'game1' dataset dataset = datasets.load("metrica_tracking", options={'sample_rate': 1./12, 'limit': 10}) diff --git a/examples/datasets/statsbomb.py b/examples/datasets/statsbomb.py new file mode 100644 index 00000000..880fb5c1 --- /dev/null +++ b/examples/datasets/statsbomb.py @@ -0,0 +1,43 @@ +import logging +import sys + +from kloppy import datasets, transform, to_pandas, load_statsbomb_event_data +from kloppy.infra.utils import performance_logging + + +def main(): + """ + This example shows the use of Statsbomb datasets, and how we can pass argument + to the dataset loader. 
+ """ + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") + + logger = logging.getLogger(__name__) + + dataset = datasets.load("statsbomb", { + #"event_types": ["pass", "take_on", "carry", "shot"] + })#, match_id=15946) + + with performance_logging("transform", logger=logger): + # convert to TRACAB coordinates + dataset = transform( + dataset, + to_orientation="FIXED_HOME_AWAY", + to_pitch_dimensions=[(-5500, 5500), (-3300, 3300)] + ) + + with performance_logging("to pandas", logger=logger): + dataframe = to_pandas(dataset) + + print(dataframe[:100].to_string()) + + # or load it using the helper from disk + dataset = load_statsbomb_event_data( + "events/15946.json", + "lineups/15946.json" + ) + + +if __name__ == "__main__": + main() diff --git a/examples/epts/load_epts_into_pandas.py b/examples/epts/load_epts_into_pandas.py index 3fd40ba8..0ace950a 100644 --- a/examples/epts/load_epts_into_pandas.py +++ b/examples/epts/load_epts_into_pandas.py @@ -1,3 +1,6 @@ +import logging +import sys + from pandas import DataFrame from kloppy.infra.serializers.tracking.epts.meta_data import load_meta_data as epts_load_meta_data @@ -17,6 +20,9 @@ def main(): 4. Try to consume items from generator twice 4. 
Convert the records into a pandas dataframe for easy data mangling """ + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") + # step 1: load metadata with open("epts_meta.xml", "rb") as meta_fp: diff --git a/examples/playing_time.py b/examples/playing_time.py index 746e6034..1b664467 100644 --- a/examples/playing_time.py +++ b/examples/playing_time.py @@ -1,3 +1,5 @@ +import logging +import sys from collections import Counter from kloppy import datasets @@ -8,6 +10,8 @@ def main(): """ This example shows how to determine playing time """ + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") dataset = datasets.load("metrica_tracking", options={'sample_rate': 1./25}) diff --git a/kloppy/domain/models/__init__.py b/kloppy/domain/models/__init__.py index 370e0da7..f1d6ecaf 100644 --- a/kloppy/domain/models/__init__.py +++ b/kloppy/domain/models/__init__.py @@ -1,5 +1,5 @@ from .common import * from .pitch import * from .tracking import * -# NOT YET: from .event import * +from .event import * diff --git a/kloppy/domain/models/common.py b/kloppy/domain/models/common.py index 03a9bf53..949c1181 100644 --- a/kloppy/domain/models/common.py +++ b/kloppy/domain/models/common.py @@ -35,6 +35,9 @@ class Orientation(Enum): # change when possession changes BALL_OWNING_TEAM = "ball-owning-team" + # depends on team which executed the action + ACTION_EXECUTING_TEAM = "action-executing-team" + # changes during half-time HOME_TEAM = "home-team" AWAY_TEAM = "away-team" @@ -45,7 +48,8 @@ class Orientation(Enum): def get_orientation_factor(self, attacking_direction: AttackingDirection, - ball_owning_team: Team): + ball_owning_team: Team, + action_executing_team: Team): if self == Orientation.FIXED_HOME_AWAY: return -1 elif self == Orientation.FIXED_AWAY_HOME: @@ -65,14 +69,22 @@ def get_orientation_factor(self, else: raise 
Exception("AttackingDirection not set") elif self == Orientation.BALL_OWNING_TEAM: - if ((ball_owning_team == Team.HOME - and attacking_direction == AttackingDirection.HOME_AWAY) - or - (ball_owning_team == Team.AWAY - and attacking_direction == AttackingDirection.AWAY_HOME)): + if ball_owning_team == Team.HOME: return -1 + elif ball_owning_team == Team.AWAY: + return 1 else: + raise Exception(f"Invalid ball_owning_team: {ball_owning_team}") + elif self == Orientation.ACTION_EXECUTING_TEAM: + if action_executing_team == Team.HOME: + return -1 + elif action_executing_team == Team.AWAY: return 1 + else: + raise Exception(f"Invalid action_executing_team: {action_executing_team}") + else: + raise Exception(f"Unknown orientation: {self}") + @dataclass @@ -100,11 +112,11 @@ class DatasetFlag(Flag): @dataclass class DataRecord(ABC): + period: Period timestamp: float ball_owning_team: Team ball_state: BallState - period: Period @dataclass diff --git a/kloppy/domain/models/event.py b/kloppy/domain/models/event.py index b5cf33ae..acbca495 100644 --- a/kloppy/domain/models/event.py +++ b/kloppy/domain/models/event.py @@ -1,156 +1,82 @@ # Metrica Documentation https://github.com/metrica-sports/sample-data/blob/master/documentation/events-definitions.pdf -from abc import ABC, abstractmethod +from abc import ABC, abstractmethod, abstractproperty, ABCMeta from dataclasses import dataclass from enum import Enum -from csv import reader -from typing import List, Union +from typing import List, Union, Dict from .pitch import Point from .common import DataRecord, Dataset, Team -class SubType(Enum): - pass - - -class ChallengeType(SubType): - Ground = "GROUND" - - -class ChallengeResult(SubType): - Won = "Won" - Lost = "LOST" - - -class Fault(SubType): - Fault = "FAULT" - Advantage = "ADVANTAGE" - - -class Interference1(SubType): - Interception = "INTERCEPTION" - Theft = "THEFT" - - -class Interference2(SubType): - Blocked = "BLOCKED" - Saved = "SAVED" - - -class Intervention(SubType): 
- Voluntary = "VOLUNTARY" - Forced = "FORCED" - End_Half = "END HALF" - - -class Attempt(SubType): - Clearance = "CLEARANCE" - Cross = "CROSS" - Through_Ball = "THROUGH BALL" - Deep_Ball = "DEEP BALL" - Goal_Kick = "GOAL KICK" - - -class Offside(SubType): - Offside = "OFFSIDE" - - -class BodyPart(SubType): - Head = "HEAD" - Foot = "FOOT" - - -class Deflection(SubType): - Woodwork = "WOODWORK" - Referee_hit = "REFEREE HIT" - Handball = "HANDBALL" - - -class ShotDirection(SubType): - On_Target = "ON TARGET" - Off_Target = "OFF TARGET" - - -class ShotResult(SubType): - Goal = "GOAL" - Out = "OUT" - Blocked = "BLOCKED" - Saved = "SAVED" - - -class Challenge(SubType): - Tackle = "TACKLE" - Dribble = "DRIBBLE" - Ground = "GROUND" - Aerial = "AERIAL" - - -class Card(SubType): - Yellow = "YELLOW" - Red = "RED" - Dismissal = "DISMISSAL" - +class ResultType(Enum): + @property + @abstractmethod + def is_success(self): + raise NotImplementedError -class SetPiece(SubType): - Kick_off = "KICK OFF" - Throw_In = "THROW IN" - Corner_Kick = "CORNER KICK" - Goal_Kick = "GOAL KICK" - Free_Kick = "FREE KICK" +class ShotResult(ResultType): + GOAL = "GOAL" + OFF_TARGET = "OFF_TARGET" + POST = "POST" + BLOCKED = "BLOCKED" + SAVED = "SAVED" -class FKAttempt(SubType): - Direct = "DIRECT" - Indirect = "INDIRECT" + @property + def is_success(self): + return self == self.GOAL -class Retaken(SubType): - Retaken = "RETAKEN" +class PassResult(ResultType): + COMPLETE = "COMPLETE" + INCOMPLETE = "INCOMPLETE" + OUT = "OUT" + OFFSIDE = "OFFSIDE" + @property + def is_success(self): + return self == self.COMPLETE -class OwnGoal(SubType): - OwnGoal = "OWN GOAL" +class TakeOnResult(ResultType): + COMPLETE = "COMPLETE" + INCOMPLETE = "INCOMPLETE" + OUT = "OUT" + @property + def is_success(self): + return self == self.COMPLETE -""" -@dataclass -class Frame: - frame_id: int - timestamp: float - ball_owning_team: Team - ball_state: BallState - period: Period +class CarryResult(ResultType): + COMPLETE = 
"COMPLETE" + INCOMPLETE = "INCOMPLETE" - home_team_player_positions: Dict[str, Point] - away_team_player_positions: Dict[str, Point] - ball_position: Point - -""" + @property + def is_success(self): + return self == self.COMPLETE class EventType(Enum): - SET_PIECE = "SET PIECE" - RECOVERY = "RECOVERY" + GENERIC = "generic" + PASS = "PASS" - BALL_LOST = "BALL LOST" - BALL_OUT = "BALL OUT" SHOT = "SHOT" - FAULT_RECEIVED = "FAULT RECEIVED" - CHALLENGE = "CHALLENGE" - CARD = "CARD" + TAKE_ON = "TAKE_ON" + CARRY = "CARRY" @dataclass class Event(DataRecord, ABC): - event_id: int + event_id: str team: Team - end_timestamp: float # allowed to be same as timestamp - player_jersey_no: str position: Point + result: ResultType + + raw_event: Dict + @property @abstractmethod def event_type(self) -> EventType: @@ -158,77 +84,49 @@ def event_type(self) -> EventType: @dataclass -class SetPieceEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.SET_PIECE +class GenericEvent(Event): + event_type: EventType = EventType.GENERIC @dataclass class ShotEvent(Event): - shot_result: ShotResult - - @property - def event_type(self) -> EventType: - return EventType.PASS - + result: ShotResult -@dataclass -class FaultReceivedEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.FAULT_RECEIVED + event_type: EventType = EventType.SHOT @dataclass -class ChallengeEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.CHALLENGE +class PassEvent(Event): + receive_timestamp: float + receiver_player_jersey_no: str + receiver_position: Point + result: PassResult -@dataclass -class CardEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.CARD + event_type: EventType = EventType.PASS @dataclass -class RecoveryEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.RECOVERY +class TakeOnEvent(Event): + result: TakeOnResult - -@dataclass -class BallLossEvent(Event): - 
@property - def event_type(self) -> EventType: - return EventType.BALL_LOST + event_type: EventType = EventType.TAKE_ON @dataclass -class BallOutEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.BALL_OUT +class CarryEvent(Event): + end_timestamp: float + end_position: Point + result: CarryResult -@dataclass -class PassEvent(Event): - receiver_player_jersey_no: str - receiver_position: Point - - @property - def event_type(self) -> EventType: - return EventType.PASS + event_type: EventType = EventType.CARRY @dataclass class EventDataset(Dataset): records: List[Union[ - SetPieceEvent, ShotEvent + GenericEvent, ShotEvent, PassEvent, TakeOnEvent, CarryEvent ]] @property @@ -236,52 +134,9 @@ def events(self): return self.records -if __name__ == '__main__': - - - data_file = "Sample_Game_1_RawEventsData.csv" - - with open(data_file, 'r') as read_obj: - csv_reader = reader(read_obj) - next(csv_reader) # skip the header - - for team_, Type, subtype, period, start_f, start_t, end_f, end_t, From, to, start_x, start_y, end_x, end_y in csv_reader: - - ## iron out any formatting issues - Type = Type.upper() - subtype = subtype.upper() - period = int(period) - team_ = team_.title() - From = From.title() - to = to.title() - - - team = Team.HOME if team_ == "Home" else Team.AWAY - - eventtype = EventType_map[Type] - - periodid = PeriodEvent(period) - - player = Player(From) - next_player = Player(to) - - start_frame = frame_id(start_f) - end_frame = frame_id(end_t) - - start_time = time_id(start_t) - end_time = time_id(end_f) - - start_location = Point(start_x, start_y) - end_location = Point(end_x, end_y) - - - print("-"*50) - print(team, eventtype, periodid, player, next_player, start_frame, end_frame, start_time, end_time, start_location, end_location) - - subtypes = subtype.split('-') - - if subtype == "": - pass - else: - challenge_type, fault, result, intf1, intf2, intv, atmp, ofsid, bdy, dflc, shtdir, shotres, chall, crd, setp, fk, rtake= 
build_subtypes(subtypes, [ChallengeType, Fault, ChallengeResult, Interference1, Interference2, Intervention, Attempt, Offside, BodyPart, Deflection, ShotDirection, ShotResult, Challenge, Card, SetPiece, FKAttempt, Retaken]) - print(challenge_type, fault, result, intf1, intf2, intv, atmp, ofsid, bdy, dflc, shtdir, shotres, chall, crd, setp, fk, rtake) \ No newline at end of file +__all__ = [ + "ResultType", "EventType", + "ShotResult", "PassResult", "TakeOnResult", "CarryResult", + "Event", "GenericEvent", "ShotEvent", "PassEvent", "TakeOnEvent", "CarryEvent", + "EventDataset" +] diff --git a/kloppy/domain/models/tracking.py b/kloppy/domain/models/tracking.py index 01d35ed1..46b63a80 100644 --- a/kloppy/domain/models/tracking.py +++ b/kloppy/domain/models/tracking.py @@ -24,3 +24,9 @@ class TrackingDataset(Dataset): @property def frames(self): return self.records + + +__all__ = [ + "Frame", "TrackingDataset" +] + diff --git a/kloppy/domain/services/transformers/__init__.py b/kloppy/domain/services/transformers/__init__.py index 9fb772f5..45cecd7f 100644 --- a/kloppy/domain/services/transformers/__init__.py +++ b/kloppy/domain/services/transformers/__init__.py @@ -1,3 +1,4 @@ +from dataclasses import asdict, replace, fields from typing import TypeVar from kloppy.domain import ( @@ -6,9 +7,9 @@ Orientation, Frame, Team, AttackingDirection, - - TrackingDataset, DatasetFlag, Dataset, # NOT YET: EventDataset + TrackingDataset, DatasetFlag, Dataset, EventDataset ) +from kloppy.domain.models.event import Event class Transformer: @@ -35,17 +36,22 @@ def transform_point(self, point: Point, flip: bool) -> Point: y=self._to_pitch_dimensions.y_dim.from_base(y_base) ) - def __needs_flip(self, ball_owning_team: Team, attacking_direction: AttackingDirection) -> bool: + def __needs_flip(self, + ball_owning_team: Team, + attacking_direction: AttackingDirection, + action_executing_team: Team = None) -> bool: if self._from_orientation == self._to_orientation: flip = False else: 
orientation_factor_from = self._from_orientation.get_orientation_factor( ball_owning_team=ball_owning_team, - attacking_direction=attacking_direction + attacking_direction=attacking_direction, + action_executing_team=action_executing_team ) orientation_factor_to = self._to_orientation.get_orientation_factor( ball_owning_team=ball_owning_team, - attacking_direction=attacking_direction + attacking_direction=attacking_direction, + action_executing_team=action_executing_team ) flip = orientation_factor_from != orientation_factor_to return flip @@ -78,6 +84,23 @@ def transform_frame(self, frame: Frame) -> Frame: } ) + EventType = TypeVar('EventType') + + def transform_event(self, event: EventType) -> EventType: + flip = self.__needs_flip( + ball_owning_team=event.ball_owning_team, + attacking_direction=event.period.attacking_direction, + action_executing_team=event.team + ) + + position_changes = { + field.name: self.transform_point(getattr(event, field.name), flip) + for field in fields(event) + if field.name.endswith('position') and getattr(event, field.name) + } + + return replace(event, **position_changes) + DatasetType = TypeVar('DatasetType') @classmethod @@ -114,7 +137,15 @@ def transform_dataset(cls, orientation=to_orientation, records=frames ) - #elif isinstance(dataset, EventDataset): - # raise Exception("EventDataset transformer not implemented yet") + elif isinstance(dataset, EventDataset): + events = list(map(transformer.transform_event, dataset.records)) + + return EventDataset( + flags=dataset.flags, + periods=dataset.periods, + pitch_dimensions=to_pitch_dimensions, + orientation=to_orientation, + records=events + ) else: raise Exception("Unknown Dataset type") diff --git a/kloppy/helpers.py b/kloppy/helpers.py index eb65611d..d9115fbd 100644 --- a/kloppy/helpers.py +++ b/kloppy/helpers.py @@ -1,10 +1,13 @@ -from typing import Callable, TypeVar +from typing import Callable, TypeVar, Dict -from . 
import TRACABSerializer, MetricaTrackingSerializer, EPTSSerializer -from .domain import Dataset, Frame, TrackingDataset, Transformer, Orientation, PitchDimensions, Dimension +from . import TRACABSerializer, MetricaTrackingSerializer, EPTSSerializer, StatsBombSerializer +from .domain import ( + Dataset, Frame, Event, TrackingDataset, Transformer, Orientation, PitchDimensions, + Dimension, EventDataset, PassEvent, CarryEvent, PassResult, EventType +) -def load_tracab_tracking_data(meta_data_filename: str, raw_data_filename: str, options: dict = None) -> Dataset: +def load_tracab_tracking_data(meta_data_filename: str, raw_data_filename: str, options: dict = None) -> TrackingDataset: serializer = TRACABSerializer() with open(meta_data_filename, "rb") as meta_data, \ open(raw_data_filename, "rb") as raw_data: @@ -18,7 +21,7 @@ def load_tracab_tracking_data(meta_data_filename: str, raw_data_filename: str, o ) -def load_metrica_tracking_data(raw_data_home_filename: str, raw_data_away_filename: str, options: dict = None) -> Dataset: +def load_metrica_tracking_data(raw_data_home_filename: str, raw_data_away_filename: str, options: dict = None) -> TrackingDataset: serializer = MetricaTrackingSerializer() with open(raw_data_home_filename, "rb") as raw_data_home, \ open(raw_data_away_filename, "rb") as raw_data_away: @@ -32,7 +35,7 @@ def load_metrica_tracking_data(raw_data_home_filename: str, raw_data_away_filena ) -def load_epts_tracking_data(meta_data_filename: str, raw_data_filename: str, options: dict = None) -> Dataset: +def load_epts_tracking_data(meta_data_filename: str, raw_data_filename: str, options: dict = None) -> TrackingDataset: serializer = EPTSSerializer() with open(meta_data_filename, "rb") as meta_data, \ open(raw_data_filename, "rb") as raw_data: @@ -46,6 +49,20 @@ def load_epts_tracking_data(meta_data_filename: str, raw_data_filename: str, opt ) +def load_statsbomb_event_data(event_data_filename: str, lineup_data_filename: str, options: dict = None) -> 
EventDataset: + serializer = StatsBombSerializer() + with open(event_data_filename, "rb") as event_data, \ + open(lineup_data_filename, "rb") as lineup_data: + + return serializer.deserialize( + inputs={ + 'event_data': event_data, + 'lineup_data': lineup_data + }, + options=options + ) + + DatasetType = TypeVar('DatasetType') @@ -64,12 +81,12 @@ def transform(dataset: DatasetType, to_orientation=None, to_pitch_dimensions=Non ) -def _frame_to_pandas_row_converter(frame: Frame) -> dict: +def _frame_to_pandas_row_converter(frame: Frame) -> Dict: row = dict( period_id=frame.period.id, timestamp=frame.timestamp, - ball_state=frame.ball_state, - ball_owning_team=frame.ball_owning_team, + ball_state=frame.ball_state.value if frame.ball_state else None, + ball_owning_team=frame.ball_owning_team.value if frame.ball_owning_team else None, ball_x=frame.ball_position.x if frame.ball_position else None, ball_y=frame.ball_position.y if frame.ball_position else None ) @@ -87,6 +104,44 @@ def _frame_to_pandas_row_converter(frame: Frame) -> dict: return row +def _event_to_pandas_row_converter(event: Event) -> Dict: + row = dict( + event_id=event.event_id, + event_type=( + event.event_type.value + if event.event_type != EventType.GENERIC else + f"GENERIC:{event.raw_event['type']['name']}" + ), + result=event.result.value if event.result else None, + success=event.result.is_success if event.result else None, + + period_id=event.period.id, + timestamp=event.timestamp, + end_timestamp=None, + ball_state=event.ball_state.value if event.ball_state else None, + ball_owning_team=event.ball_owning_team.value if event.ball_owning_team else None, + + team=event.team.value, + player_jersey_no=event.player_jersey_no, + position_x=event.position.x if event.position else None, + position_y=event.position.y if event.position else None + ) + if isinstance(event, PassEvent) and event.result == PassResult.COMPLETE: + row.update({ + 'end_timestamp': event.receive_timestamp, + 'end_position_x': 
event.receiver_position.x, + 'end_position_y': event.receiver_position.y, + 'receiver_jersey_no': event.receiver_player_jersey_no + }) + elif isinstance(event, CarryEvent): + row.update({ + 'end_timestamp': event.end_timestamp, + 'end_position_x': event.end_position.x, + 'end_position_y': event.end_position.y + }) + return row + + def to_pandas(dataset: Dataset, _record_converter: Callable = None) -> 'DataFrame': try: import pandas as pd @@ -97,6 +152,8 @@ def to_pandas(dataset: Dataset, _record_converter: Callable = None) -> 'DataFram if not _record_converter: if isinstance(dataset, TrackingDataset): _record_converter = _frame_to_pandas_row_converter + elif isinstance(dataset, EventDataset): + _record_converter = _event_to_pandas_row_converter else: raise Exception("Unknown dataset type") @@ -109,6 +166,7 @@ def to_pandas(dataset: Dataset, _record_converter: Callable = None) -> 'DataFram 'load_tracab_tracking_data', 'load_metrica_tracking_data', 'load_epts_tracking_data', + 'load_statsbomb_event_data', 'to_pandas', 'transform' ] diff --git a/kloppy/infra/datasets/__init__.py b/kloppy/infra/datasets/__init__.py index 370a5b63..6bc8a103 100644 --- a/kloppy/infra/datasets/__init__.py +++ b/kloppy/infra/datasets/__init__.py @@ -1,5 +1,6 @@ # import for registration from . import tracking +from . 
import event from .core.loading import load diff --git a/kloppy/infra/datasets/core/loading.py b/kloppy/infra/datasets/core/loading.py index 260218d9..7b608bd5 100644 --- a/kloppy/infra/datasets/core/loading.py +++ b/kloppy/infra/datasets/core/loading.py @@ -1,14 +1,17 @@ -import os +import os, logging import requests from typing import Dict, Union -from kloppy.domain import TrackingDataset +from kloppy.domain import TrackingDataset, EventDataset from .registered import _DATASET_REGISTRY +logger = logging.getLogger(__name__) + + def download_file(url, local_filename): with requests.get(url, stream=True) as r: r.raise_for_status() @@ -28,17 +31,19 @@ def get_local_files(dataset_name: str, files: Dict[str, str]) -> Dict[str, str]: local_files = {} for file_key, file_url in files.items(): - filename = file_url.split('/')[-1] + filename = f"{file_key}={file_url.split('/')[-1]}" local_filename = f'{dataset_base_dir}/{filename}' if not os.path.exists(local_filename): - print(f'Downloading {filename}...') + logger.info(f'Downloading {filename}') download_file(file_url, local_filename) - print('Done') + logger.info('Download complete') + else: + logger.info(f'Using local cached file {local_filename}') local_files[file_key] = local_filename return local_files -def load(dataset_name: str, options=None, **dataset_kwargs) -> Union[TrackingDataset]: +def load(dataset_name: str, options=None, **dataset_kwargs) -> Union[TrackingDataset, EventDataset]: if dataset_name not in _DATASET_REGISTRY: raise ValueError(f"Dataset {dataset_name} not found") diff --git a/kloppy/infra/datasets/event/__init__.py b/kloppy/infra/datasets/event/__init__.py new file mode 100644 index 00000000..2b578e61 --- /dev/null +++ b/kloppy/infra/datasets/event/__init__.py @@ -0,0 +1 @@ +from .statsbomb import Statsbomb diff --git a/kloppy/infra/datasets/event/statsbomb.py b/kloppy/infra/datasets/event/statsbomb.py new file mode 100644 index 00000000..3abdaa0d --- /dev/null +++ 
b/kloppy/infra/datasets/event/statsbomb.py @@ -0,0 +1,23 @@ +import warnings +from typing import Dict, Type + +from ..core.builder import DatasetBuilder +from ...serializers.event import EventDataSerializer, StatsBombSerializer + + +# 3749133 / 38412 +class Statsbomb(DatasetBuilder): + def get_dataset_urls(self,**kwargs) -> Dict[str, str]: + warnings.warn("\n\nYou are about to use StatsBomb public data." + "\nBy using this data, you are agreeing to the user agreement. " + "\nThe user agreement can be found here: https://github.com/statsbomb/open-data/blob/master/LICENSE.pdf" + "\n") + + match_id = kwargs.get('match_id', '15946') + return { + 'event_data': f'https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/{match_id}.json', + 'lineup_data': f'https://raw.githubusercontent.com/statsbomb/open-data/master/data/lineups/{match_id}.json' + } + + def get_serializer_cls(self) -> Type[EventDataSerializer]: + return StatsBombSerializer diff --git a/kloppy/infra/serializers/__init__.py b/kloppy/infra/serializers/__init__.py index e590614a..cb142202 100644 --- a/kloppy/infra/serializers/__init__.py +++ b/kloppy/infra/serializers/__init__.py @@ -1,2 +1,3 @@ from .tracking import TrackingDataSerializer, TRACABSerializer, MetricaTrackingSerializer, EPTSSerializer +from .event import StatsBombSerializer # NOT YET: from .event import EventDataSerializer, MetricaEventSerializer diff --git a/kloppy/infra/serializers/event/__init__.py b/kloppy/infra/serializers/event/__init__.py index 57bdc2df..ec6d16c8 100644 --- a/kloppy/infra/serializers/event/__init__.py +++ b/kloppy/infra/serializers/event/__init__.py @@ -1,2 +1,3 @@ from .base import EventDataSerializer -from .metrica import MetricaEventSerializer \ No newline at end of file +#from .metrica import MetricaEventSerializer +from .statsbomb import StatsBombSerializer \ No newline at end of file diff --git a/kloppy/infra/serializers/event/statsbomb/__init__.py 
b/kloppy/infra/serializers/event/statsbomb/__init__.py new file mode 100644 index 00000000..87c7f42e --- /dev/null +++ b/kloppy/infra/serializers/event/statsbomb/__init__.py @@ -0,0 +1 @@ +from .serializer import StatsBombSerializer \ No newline at end of file diff --git a/kloppy/infra/serializers/event/statsbomb/serializer.py b/kloppy/infra/serializers/event/statsbomb/serializer.py new file mode 100644 index 00000000..77abfd60 --- /dev/null +++ b/kloppy/infra/serializers/event/statsbomb/serializer.py @@ -0,0 +1,373 @@ +from typing import Tuple, Dict, List +import logging +import json + +from kloppy.domain import ( + EventDataset, Team, Period, Point, BallState, + DatasetFlag, Orientation, PitchDimensions, Dimension, + + PassEvent, ShotEvent, TakeOnEvent, CarryEvent, GenericEvent, + PassResult, ShotResult, TakeOnResult, CarryResult, EventType +) +from kloppy.infra.serializers.event import EventDataSerializer +from kloppy.infra.utils import Readable, performance_logging + +logger = logging.getLogger(__name__) + + +SB_EVENT_TYPE_DRIBBLE = 14 +SB_EVENT_TYPE_SHOT = 16 +SB_EVENT_TYPE_PASS = 30 +SB_EVENT_TYPE_CARRY = 43 + +SB_EVENT_TYPE_HALF_START = 18 +SB_EVENT_TYPE_HALF_END = 34 + +SB_PASS_OUTCOME_COMPLETE = 8 +SB_PASS_OUTCOME_INCOMPLETE = 9 +SB_PASS_OUTCOME_INJURY_CLEARANCE = 74 +SB_PASS_OUTCOME_OUT = 75 +SB_PASS_OUTCOME_OFFSIDE = 76 +SB_PASS_OUTCOME_UNKNOWN = 77 + +SB_SHOT_OUTCOME_BLOCKED = 96 +SB_SHOT_OUTCOME_GOAL = 97 +SB_SHOT_OUTCOME_OFF_TARGET = 98 +SB_SHOT_OUTCOME_POST = 99 +SB_SHOT_OUTCOME_SAVED = 100 +SB_SHOT_OUTCOME_OFF_WAYWARD = 101 + + +def parse_str_ts(timestamp: str) -> float: + h, m, s = timestamp.split(":") + return int(h) * 3600 + int(m) * 60 + float(s) + + +def _parse_position(position: Dict, fidelity_version: int) -> Point: + # location is cell based + # [1, 120] x [1, 80] + # +-----+------+ + # | 1,1 | 2, 1 | + # +-----+------+ + # | 1,2 | 2,2 | + # +-----+------+ + cell_side = 0.1 if fidelity_version == 2 else 1.0 + cell_relative_center = cell_side 
def _parse_pass(pass_dict: Dict, current_team_map: Dict[int, str], fidelity_version: int) -> Dict:
    """
    Translate the 'pass' part of a StatsBomb event into `PassEvent` kwargs.

    Parameters
    ----------
    pass_dict : dict
        The value of the 'pass' key of a raw StatsBomb event.
    current_team_map : dict
        Mapping of StatsBomb player id to jersey number for the team that
        executed the pass.
    fidelity_version : int
        Coordinate fidelity version, forwarded to `_parse_position`.

    Returns
    -------
    dict
        Keys `result`, `receiver_position` and `receiver_player_jersey_no`.

    Raises
    ------
    Exception
        When the pass carries an outcome id that is not handled here.
    """
    if 'outcome' in pass_dict:
        outcome_id = pass_dict['outcome']['id']
        if outcome_id == SB_PASS_OUTCOME_OUT:
            result = PassResult.OUT
        elif outcome_id == SB_PASS_OUTCOME_INCOMPLETE:
            result = PassResult.INCOMPLETE
        elif outcome_id == SB_PASS_OUTCOME_OFFSIDE:
            result = PassResult.OFFSIDE
        elif outcome_id == SB_PASS_OUTCOME_INJURY_CLEARANCE:
            # An injury clearance is reported as a ball played out.
            result = PassResult.OUT
        elif outcome_id == SB_PASS_OUTCOME_UNKNOWN:
            result = None
        else:
            raise Exception(f"Unknown pass outcome: {outcome_id}")

        # Any pass that has an explicit outcome is treated as "not received".
        receiver_player_jersey_no = None
        receiver_position = None
    else:
        # No 'outcome' key means the pass was completed.
        result = PassResult.COMPLETE
        receiver_player_jersey_no = current_team_map[
            pass_dict['recipient']['id']
        ]
        receiver_position = _parse_position(
            pass_dict['end_location'],
            fidelity_version
        )

    return dict(
        result=result,
        receiver_position=receiver_position,
        receiver_player_jersey_no=receiver_player_jersey_no
    )


def _parse_shot(shot_dict: Dict) -> Dict:
    """
    Translate the 'shot' part of a StatsBomb event into `ShotEvent` kwargs.

    Returns a dict with the single key `result`. Raises `Exception` when the
    shot outcome id is not handled here.
    """
    outcome_id = shot_dict['outcome']['id']
    if outcome_id == SB_SHOT_OUTCOME_OFF_TARGET:
        result = ShotResult.OFF_TARGET
    elif outcome_id == SB_SHOT_OUTCOME_SAVED:
        result = ShotResult.SAVED
    elif outcome_id == SB_SHOT_OUTCOME_POST:
        result = ShotResult.POST
    elif outcome_id == SB_SHOT_OUTCOME_OFF_WAYWARD:
        # Wayward shots are folded into the off-target result.
        result = ShotResult.OFF_TARGET
    elif outcome_id == SB_SHOT_OUTCOME_BLOCKED:
        result = ShotResult.BLOCKED
    elif outcome_id == SB_SHOT_OUTCOME_GOAL:
        result = ShotResult.GOAL
    else:
        raise Exception(f"Unknown shot outcome: {outcome_id}")

    return dict(
        result=result
    )


def _parse_carry(carry_dict: Dict, fidelity_version: int) -> Dict:
    """
    Translate the 'carry' part of a StatsBomb event into `CarryEvent` kwargs.

    The result is always `CarryResult.COMPLETE`; `end_position` is parsed
    from the carry's 'end_location' using the given fidelity version.
    """
    return dict(
        result=CarryResult.COMPLETE,
        end_position=_parse_position(
            carry_dict['end_location'],
            fidelity_version
        )
    )


def _parse_take_on(take_on_dict: Dict) -> Dict:
    """
    Translate the 'dribble' part of a StatsBomb event into `TakeOnEvent` kwargs.

    Returns a dict with the single key `result`. A dribble without an
    'outcome' key is treated as complete. Raises `Exception` when the outcome
    id is not handled here.
    """
    if 'outcome' in take_on_dict:
        outcome_id = take_on_dict['outcome']['id']
        # NOTE(review): these are the *pass* outcome constants; confirm that
        # dribble outcomes really share those ids or introduce dedicated
        # constants for them.
        if outcome_id == SB_PASS_OUTCOME_OUT:
            result = TakeOnResult.OUT
        elif outcome_id == SB_PASS_OUTCOME_INCOMPLETE:
            result = TakeOnResult.INCOMPLETE
        elif outcome_id == SB_PASS_OUTCOME_COMPLETE:
            result = TakeOnResult.COMPLETE
        else:
            raise Exception(
                f"Unknown take-on outcome: {take_on_dict['outcome']['name']}({outcome_id})"
            )
    else:
        result = TakeOnResult.COMPLETE

    return dict(
        result=result
    )


def _determine_xy_fidelity_versions(events: List[Dict]) -> Tuple[int, int]:
    """
    Find out if x and y are integers disguised as floats.

    Both versions start at 1. As soon as an event with a non-integral x or y
    is seen, the version for that kind of event (shot, or carry / dribble /
    pass) is bumped to 2.

    Returns
    -------
    (shot_fidelity_version, xy_fidelity_version) : tuple of int
    """
    shot_fidelity_version = 1
    xy_fidelity_version = 1
    for event in events:
        if 'location' not in event:
            continue

        # Tolerate extra trailing coordinates (e.g. a z value); only x and y
        # are inspected.
        x, y, *_ = event['location']

        # Whole numbers in the JSON may be decoded as `int`, which has no
        # `is_integer()` before Python 3.12, so normalise to float first.
        if float(x).is_integer() and float(y).is_integer():
            continue

        event_type = event['type']['id']
        if event_type == SB_EVENT_TYPE_SHOT:
            shot_fidelity_version = 2
        elif event_type in (SB_EVENT_TYPE_CARRY, SB_EVENT_TYPE_DRIBBLE, SB_EVENT_TYPE_PASS):
            xy_fidelity_version = 2

        # Versions only ever go up, so nothing more can be learned.
        if shot_fidelity_version == 2 and xy_fidelity_version == 2:
            break
    return shot_fidelity_version, xy_fidelity_version


class StatsBombSerializer(EventDataSerializer):
    """(De)serializer for StatsBomb event data; only deserialization is implemented."""

    @staticmethod
    def __validate_inputs(inputs: Dict[str, Readable]):
        """Raise `ValueError` unless both required inputs are present."""
        if "event_data" not in inputs:
            raise ValueError("Please specify a value for input 'event_data'")
        if "lineup_data" not in inputs:
            raise ValueError("Please specify a value for input 'lineup_data'")

    def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> EventDataset:
        """
        Deserialize StatsBomb event data into a `EventDataset`.

        Parameters
        ----------
        inputs : dict
            input `event_data` should point to a `Readable` object containing
            the 'json' formatted event data. input `lineup_data` should point
            to a `Readable` object containing the 'json' formatted lineup data.
            The lineup data must contain exactly two entries; the first one is
            used as the home team, the second one as the away team.
        options : dict
            Options for deserialization of the StatsBomb file. Possible options are
            `event_types` (list of event types) to specify the event types that
            should be returned. Valid types: "shot", "pass", "carry", "take_on" and
            "generic". Generic is everything other than the first 4. Those events
            are barely parsed. This type of event can be used to do the parsing
            yourself. When `event_types` is missing or empty, all events are
            returned.
            Every event has a 'raw_event' attribute which contains the original
            dictionary.

        Returns
        -------
        dataset : EventDataset

        Raises
        ------
        ValueError
            When `event_data` or `lineup_data` is missing from `inputs`.
        KeyError
            When an entry of `event_types` is not the name of an `EventType`,
            or when an event refers to a player that is not in the lineup.
        Exception
            When an event refers to a team that is in neither lineup, or
            carries a pass / shot / take-on outcome that is not handled.

        Examples
        --------
        >>> serializer = StatsBombSerializer()
        >>> with open("events/12312312.json", "rb") as event_data:
        ...     with open("lineups/123123123.json", "rb") as lineup_data:
        ...         dataset = serializer.deserialize(
        ...             inputs={
        ...                 'event_data': event_data,
        ...                 'lineup_data': lineup_data
        ...             },
        ...             options={
        ...                 'event_types': ["pass", "take_on", "carry", "shot"]
        ...             }
        ...         )
        """
        self.__validate_inputs(inputs)
        if not options:
            options = {}

        with performance_logging("load data", logger=logger):
            raw_events = json.load(inputs['event_data'])
            home_lineup, away_lineup = json.load(inputs['lineup_data'])
            shot_fidelity_version, xy_fidelity_version = _determine_xy_fidelity_versions(raw_events)
            logger.info(f"Determined Fidelity versions: shot v{shot_fidelity_version} / XY v{xy_fidelity_version}")

        with performance_logging("parse data", logger=logger):
            # player id -> jersey number (as string), one map per team
            home_player_map = {
                player['player_id']: str(player['jersey_number'])
                for player in home_lineup['lineup']
            }
            away_player_map = {
                player['player_id']: str(player['jersey_number'])
                for player in away_lineup['lineup']
            }

            # An empty list means "no filtering".
            wanted_event_types = [
                EventType[event_type.upper()] for event_type in options.get('event_types', [])
            ]

            periods = []
            period = None
            events = []
            for raw_event in raw_events:
                if raw_event['team']['id'] == home_lineup['team_id']:
                    team = Team.HOME
                    current_team_map = home_player_map
                elif raw_event['team']['id'] == away_lineup['team_id']:
                    team = Team.AWAY
                    current_team_map = away_player_map
                else:
                    raise Exception(f"Unknown team_id {raw_event['team']['id']}")

                if raw_event['possession_team']['id'] == home_lineup['team_id']:
                    possession_team = Team.HOME
                elif raw_event['possession_team']['id'] == away_lineup['team_id']:
                    possession_team = Team.AWAY
                else:
                    raise Exception(f"Unknown possession_team_id: {raw_event['possession_team']}")

                timestamp = parse_str_ts(raw_event['timestamp'])
                period_id = int(raw_event['period'])
                if not period or period.id != period_id:
                    # First event of a new period: open it with a zero-length
                    # span that is stretched by every following event.
                    period = Period(
                        id=period_id,
                        start_timestamp=timestamp,
                        end_timestamp=timestamp
                    )
                    periods.append(period)
                else:
                    period.end_timestamp = timestamp

                player_jersey_no = None
                if 'player' in raw_event:
                    player_jersey_no = current_team_map[raw_event['player']['id']]

                event_type = raw_event['type']['id']
                if event_type == SB_EVENT_TYPE_SHOT:
                    fidelity_version = shot_fidelity_version
                elif event_type in (SB_EVENT_TYPE_CARRY, SB_EVENT_TYPE_DRIBBLE, SB_EVENT_TYPE_PASS):
                    fidelity_version = xy_fidelity_version
                else:
                    # TODO: Uh ohhhh.. don't know which one to pick
                    fidelity_version = xy_fidelity_version

                generic_event_kwargs = dict(
                    # from DataRecord
                    period=period,
                    timestamp=timestamp,
                    ball_owning_team=possession_team,
                    ball_state=BallState.ALIVE,
                    # from Event
                    event_id=raw_event['id'],
                    team=team,
                    player_jersey_no=player_jersey_no,
                    position=(
                        _parse_position(
                            raw_event['location'],
                            fidelity_version
                        )
                        if 'location' in raw_event
                        else None
                    ),
                    raw_event=raw_event
                )

                if event_type == SB_EVENT_TYPE_PASS:
                    pass_event_kwargs = _parse_pass(
                        pass_dict=raw_event['pass'],
                        current_team_map=current_team_map,
                        fidelity_version=fidelity_version
                    )

                    event = PassEvent(
                        # TODO: Consider moving this to _parse_pass
                        receive_timestamp=timestamp + raw_event['duration'],
                        **pass_event_kwargs,
                        **generic_event_kwargs
                    )
                elif event_type == SB_EVENT_TYPE_SHOT:
                    shot_event_kwargs = _parse_shot(
                        shot_dict=raw_event['shot']
                    )
                    event = ShotEvent(
                        **shot_event_kwargs,
                        **generic_event_kwargs
                    )

                # For dribble and carry the definitions
                # are flipped between Statsbomb and kloppy
                elif event_type == SB_EVENT_TYPE_DRIBBLE:
                    take_on_event_kwargs = _parse_take_on(
                        take_on_dict=raw_event['dribble']
                    )
                    event = TakeOnEvent(
                        **take_on_event_kwargs,
                        **generic_event_kwargs
                    )
                elif event_type == SB_EVENT_TYPE_CARRY:
                    carry_event_kwargs = _parse_carry(
                        carry_dict=raw_event['carry'],
                        fidelity_version=fidelity_version
                    )
                    event = CarryEvent(
                        # TODO: Consider moving this to _parse_carry
                        end_timestamp=timestamp + raw_event['duration'],
                        **carry_event_kwargs,
                        **generic_event_kwargs
                    )
                else:
                    event = GenericEvent(
                        result=None,
                        **generic_event_kwargs
                    )

                if not wanted_event_types or event.event_type in wanted_event_types:
                    events.append(event)

        return EventDataset(
            flags=DatasetFlag.BALL_OWNING_TEAM,
            orientation=Orientation.ACTION_EXECUTING_TEAM,
            pitch_dimensions=PitchDimensions(
                x_dim=Dimension(0, 120),
                y_dim=Dimension(0, 80)
            ),
            periods=periods,
            records=events
        )

    def serialize(self, data_set: EventDataset) -> Tuple[str, str]:
        """Not supported: writing StatsBomb event data is not implemented."""
        raise NotImplementedError
import TrackingDataSerializer +logger = logging.getLogger(__name__) + class MetricaTrackingSerializer(TrackingDataSerializer): __PartialFrame = namedtuple("PartialFrame", "team period frame_id player_positions ball_position") @@ -148,13 +151,13 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Trac # consider reading this from data frame_rate = 25 - with performance_logging("prepare"): + with performance_logging("prepare", logger=logger): home_iterator = self.__create_iterator(inputs['raw_data_home'], sample_rate, frame_rate) away_iterator = self.__create_iterator(inputs['raw_data_away'], sample_rate, frame_rate) partial_frames = zip(home_iterator, away_iterator) - with performance_logging("loading"): + with performance_logging("loading", logger=logger): frames = [] periods = [] diff --git a/kloppy/infra/serializers/tracking/tracab.py b/kloppy/infra/serializers/tracking/tracab.py index 1fc94b97..080a2b7e 100644 --- a/kloppy/infra/serializers/tracking/tracab.py +++ b/kloppy/infra/serializers/tracking/tracab.py @@ -1,3 +1,4 @@ +import logging from typing import Tuple, Dict from lxml import objectify @@ -19,6 +20,8 @@ from . 
@contextmanager
def performance_logging(description: str, counter: int = None, logger=None):
    """
    Time the enclosed ``with`` block and report how long it took.

    The report is emitted when the block exits, also when it exits with an
    exception.

    Parameters
    ----------
    description : str
        Text the report starts with.
    counter : int, optional
        Number of items handled inside the block. When given, and the elapsed
        time is non-zero, an items-per-second figure is appended.
    logger : optional
        Object with an ``info(msg)`` method. When given (and truthy) the
        report goes to ``logger.info``; otherwise it is printed.
    """
    # perf_counter() is monotonic and high resolution; the wall clock can
    # jump and would then yield negative or wildly wrong durations.
    start = time.perf_counter()
    try:
        yield
    finally:
        took = (time.perf_counter() - start) * 1000  # milliseconds

        extra = ""
        # A zero elapsed time must not divide: raising here, inside `finally`,
        # would also hide any exception coming out of the timed block.
        if counter is not None and took > 0:
            extra = f" ({int(counter / took * 1000)}items/sec)"

        # Very short durations read better in microseconds.
        unit = "ms"
        if took < 0.1:
            took *= 1000
            unit = "us"

        msg = f"{description} took: {took:.2f}{unit} {extra}"
        if logger:
            logger.info(msg)
        else:
            print(msg)
class TestStatsbomb:
    """Placeholder suite for the StatsBomb event data serializer."""

    def test_correct_deserialization(self):
        """Intentionally empty for now: it performs no checks and always passes."""
        return None