From bc18eec4f9fd713217d094d013c27b6bab2ca94b Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Mon, 1 Jun 2020 21:31:39 +0200 Subject: [PATCH 1/7] Statsbomb: WIP --- examples/datasets/statsbomb.py | 14 + kloppy/domain/models/__init__.py | 2 +- kloppy/domain/models/common.py | 2 +- kloppy/domain/models/event.py | 279 ++++-------------- kloppy/domain/models/tracking.py | 6 + kloppy/infra/datasets/__init__.py | 1 + kloppy/infra/datasets/core/loading.py | 4 +- kloppy/infra/datasets/event/__init__.py | 1 + kloppy/infra/datasets/event/statsbomb.py | 17 ++ kloppy/infra/serializers/event/__init__.py | 3 +- .../serializers/event/statsbomb/__init__.py | 1 + .../serializers/event/statsbomb/serializer.py | 147 +++++++++ kloppy/infra/utils.py | 9 +- 13 files changed, 261 insertions(+), 225 deletions(-) create mode 100644 examples/datasets/statsbomb.py create mode 100644 kloppy/infra/datasets/event/__init__.py create mode 100644 kloppy/infra/datasets/event/statsbomb.py create mode 100644 kloppy/infra/serializers/event/statsbomb/__init__.py create mode 100644 kloppy/infra/serializers/event/statsbomb/serializer.py diff --git a/examples/datasets/statsbomb.py b/examples/datasets/statsbomb.py new file mode 100644 index 00000000..afca33b8 --- /dev/null +++ b/examples/datasets/statsbomb.py @@ -0,0 +1,14 @@ +from kloppy import datasets + + +def main(): + """ + This example shows the use of Statsbomb datasets, and how we can pass argument + to the dataset loader. 
+ """ + + data_set = datasets.load("statsbomb") + + +if __name__ == "__main__": + main() diff --git a/kloppy/domain/models/__init__.py b/kloppy/domain/models/__init__.py index 370e0da7..f1d6ecaf 100644 --- a/kloppy/domain/models/__init__.py +++ b/kloppy/domain/models/__init__.py @@ -1,5 +1,5 @@ from .common import * from .pitch import * from .tracking import * -# NOT YET: from .event import * +from .event import * diff --git a/kloppy/domain/models/common.py b/kloppy/domain/models/common.py index 2f345dcc..1a7aa94b 100644 --- a/kloppy/domain/models/common.py +++ b/kloppy/domain/models/common.py @@ -100,11 +100,11 @@ class DataSetFlag(Flag): @dataclass class DataRecord(ABC): + period: Period timestamp: float ball_owning_team: Team ball_state: BallState - period: Period @dataclass diff --git a/kloppy/domain/models/event.py b/kloppy/domain/models/event.py index 02035e70..1daccef3 100644 --- a/kloppy/domain/models/event.py +++ b/kloppy/domain/models/event.py @@ -1,234 +1,118 @@ # Metrica Documentation https://github.com/metrica-sports/sample-data/blob/master/documentation/events-definitions.pdf -from abc import ABC, abstractmethod +from abc import ABC, abstractmethod, abstractproperty, ABCMeta from dataclasses import dataclass from enum import Enum -from csv import reader from typing import List, Union from .pitch import Point from .common import DataRecord, DataSet, Team -class SubType(Enum): - pass - - -class ChallengeType(SubType): - Ground = "GROUND" - - -class ChallengeResult(SubType): - Won = "Won" - Lost = "LOST" - - -class Fault(SubType): - Fault = "FAULT" - Advantage = "ADVANTAGE" - - -class Interference1(SubType): - Interception = "INTERCEPTION" - Theft = "THEFT" - - -class Interference2(SubType): - Blocked = "BLOCKED" - Saved = "SAVED" - - -class Intervention(SubType): - Voluntary = "VOLUNTARY" - Forced = "FORCED" - End_Half = "END HALF" - - -class Attempt(SubType): - Clearance = "CLEARANCE" - Cross = "CROSS" - Through_Ball = "THROUGH BALL" - Deep_Ball = 
"DEEP BALL" - Goal_Kick = "GOAL KICK" - - -class Offside(SubType): - Offside = "OFFSIDE" - - -class BodyPart(SubType): - Head = "HEAD" - Foot = "FOOT" - - -class Deflection(SubType): - Woodwork = "WOODWORK" - Referee_hit = "REFEREE HIT" - Handball = "HANDBALL" - - -class ShotDirection(SubType): - On_Target = "ON TARGET" - Off_Target = "OFF TARGET" - - -class ShotResult(SubType): - Goal = "GOAL" - Out = "OUT" - Blocked = "BLOCKED" - Saved = "SAVED" - - -class Challenge(SubType): - Tackle = "TACKLE" - Dribble = "DRIBBLE" - Ground = "GROUND" - Aerial = "AERIAL" - - -class Card(SubType): - Yellow = "YELLOW" - Red = "RED" - Dismissal = "DISMISSAL" - - -class SetPiece(SubType): - Kick_off = "KICK OFF" - Throw_In = "THROW IN" - Corner_Kick = "CORNER KICK" - Goal_Kick = "GOAL KICK" - Free_Kick = "FREE KICK" - - -class FKAttempt(SubType): - Direct = "DIRECT" - Indirect = "INDIRECT" +class ResultType(Enum): + @property + @abstractmethod + def is_success(self): + raise NotImplementedError -class Retaken(SubType): - Retaken = "RETAKEN" +class ShotResult(ResultType): + GOAL = "GOAL" + OFF_TARGET = "OFF_TARGET" + POST = "POST" + BLOCKED = "BLOCKED" + SAVED = "SAVED" + @property + def is_success(self): + return self == self.GOAL -class OwnGoal(SubType): - OwnGoal = "OWN GOAL" +class PassResult(ResultType): + COMPLETE = "COMPLETE" + INCOMPLETE = "INCOMPLETE" + OUT = "OUT" + OFFSIDE = "OFFSIDE" + @property + def is_success(self): + return self == self.COMPLETE -""" -@dataclass -class Frame: - frame_id: int - timestamp: float - ball_owning_team: Team - ball_state: BallState - period: Period +class DribbleCarryResult(ResultType): + COMPLETE = "COMPLETE" + INCOMPLETE = "INCOMPLETE" + OUT = "OUT" - home_team_player_positions: Dict[str, Point] - away_team_player_positions: Dict[str, Point] - ball_position: Point - -""" + @property + def is_success(self): + return self == self.COMPLETE class EventType(Enum): - SET_PIECE = "SET PIECE" - RECOVERY = "RECOVERY" PASS = "PASS" - BALL_LOST = 
"BALL LOST" - BALL_OUT = "BALL OUT" SHOT = "SHOT" - FAULT_RECEIVED = "FAULT RECEIVED" - CHALLENGE = "CHALLENGE" - CARD = "CARD" + DRIBBLE = "DRIBBLE" + CARRY = "CARRY" @dataclass class Event(DataRecord, ABC): - event_id: int + event_id: str team: Team - end_timestamp: float # allowed to be same as timestamp player_jersey_no: str position: Point + result: ResultType + @property @abstractmethod def event_type(self) -> EventType: raise NotImplementedError -@dataclass -class SetPieceEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.SET_PIECE - - @dataclass class ShotEvent(Event): - shot_result: ShotResult + result: ShotResult - @property - def event_type(self) -> EventType: - return EventType.PASS + event_type: EventType = EventType.SHOT @dataclass -class FaultReceivedEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.FAULT_RECEIVED +class PassEvent(Event): + end_timestamp: float + receiver_player_jersey_no: str + receiver_position: Point + result: PassResult -@dataclass -class ChallengeEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.CHALLENGE + event_type: EventType = EventType.PASS @dataclass -class CardEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.CARD +class DribbleEvent(Event): + end_timestamp: float + end_position: Point + result: DribbleCarryResult -@dataclass -class RecoveryEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.RECOVERY + event_type: EventType = EventType.DRIBBLE @dataclass -class BallLossEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.BALL_LOST +class CarryEvent(Event): + end_timestamp: float + end_position: Point + result: DribbleCarryResult -@dataclass -class BallOutEvent(Event): - @property - def event_type(self) -> EventType: - return EventType.BALL_OUT - - -@dataclass -class PassEvent(Event): - receiver_player_jersey_no: str - receiver_position: 
Point - - @property - def event_type(self) -> EventType: - return EventType.PASS + event_type: EventType = EventType.CARRY @dataclass class EventDataSet(DataSet): records: List[Union[ - SetPieceEvent, ShotEvent + ShotEvent, PassEvent, DribbleEvent, CarryEvent ]] @property @@ -236,52 +120,9 @@ def events(self): return self.records -if __name__ == '__main__': - - - data_file = "Sample_Game_1_RawEventsData.csv" - - with open(data_file, 'r') as read_obj: - csv_reader = reader(read_obj) - next(csv_reader) # skip the header - - for team_, Type, subtype, period, start_f, start_t, end_f, end_t, From, to, start_x, start_y, end_x, end_y in csv_reader: - - ## iron out any formatting issues - Type = Type.upper() - subtype = subtype.upper() - period = int(period) - team_ = team_.title() - From = From.title() - to = to.title() - - - team = Team.HOME if team_ == "Home" else Team.AWAY - - eventtype = EventType_map[Type] - - periodid = PeriodEvent(period) - - player = Player(From) - next_player = Player(to) - - start_frame = frame_id(start_f) - end_frame = frame_id(end_t) - - start_time = time_id(start_t) - end_time = time_id(end_f) - - start_location = Point(start_x, start_y) - end_location = Point(end_x, end_y) - - - print("-"*50) - print(team, eventtype, periodid, player, next_player, start_frame, end_frame, start_time, end_time, start_location, end_location) - - subtypes = subtype.split('-') - - if subtype == "": - pass - else: - challenge_type, fault, result, intf1, intf2, intv, atmp, ofsid, bdy, dflc, shtdir, shotres, chall, crd, setp, fk, rtake= build_subtypes(subtypes, [ChallengeType, Fault, ChallengeResult, Interference1, Interference2, Intervention, Attempt, Offside, BodyPart, Deflection, ShotDirection, ShotResult, Challenge, Card, SetPiece, FKAttempt, Retaken]) - print(challenge_type, fault, result, intf1, intf2, intv, atmp, ofsid, bdy, dflc, shtdir, shotres, chall, crd, setp, fk, rtake) \ No newline at end of file +__all__ = [ + "ResultType", "EventType", + 
"ShotResult", "PassResult", "DribbleCarryResult", + "ShotEvent", "PassEvent", "DribbleEvent", "CarryEvent", + "EventDataSet" +] diff --git a/kloppy/domain/models/tracking.py b/kloppy/domain/models/tracking.py index 59bf5280..b797e3d8 100644 --- a/kloppy/domain/models/tracking.py +++ b/kloppy/domain/models/tracking.py @@ -24,3 +24,9 @@ class TrackingDataSet(DataSet): @property def frames(self): return self.records + + +__all__ = [ + "Frame", "TrackingDataSet" +] + diff --git a/kloppy/infra/datasets/__init__.py b/kloppy/infra/datasets/__init__.py index 370a5b63..6bc8a103 100644 --- a/kloppy/infra/datasets/__init__.py +++ b/kloppy/infra/datasets/__init__.py @@ -1,5 +1,6 @@ # import for registration from . import tracking +from . import event from .core.loading import load diff --git a/kloppy/infra/datasets/core/loading.py b/kloppy/infra/datasets/core/loading.py index c0005145..0298f673 100644 --- a/kloppy/infra/datasets/core/loading.py +++ b/kloppy/infra/datasets/core/loading.py @@ -28,12 +28,14 @@ def get_local_files(data_set_name: str, files: Dict[str, str]) -> Dict[str, str] local_files = {} for file_key, file_url in files.items(): - filename = file_url.split('/')[-1] + filename = f"{file_key}={file_url.split('/')[-1]}" local_filename = f'{dataset_base_dir}/{filename}' if not os.path.exists(local_filename): print(f'Downloading {filename}...') download_file(file_url, local_filename) print('Done') + else: + print(f'Using local cached file {local_filename}') local_files[file_key] = local_filename return local_files diff --git a/kloppy/infra/datasets/event/__init__.py b/kloppy/infra/datasets/event/__init__.py new file mode 100644 index 00000000..2b578e61 --- /dev/null +++ b/kloppy/infra/datasets/event/__init__.py @@ -0,0 +1 @@ +from .statsbomb import Statsbomb diff --git a/kloppy/infra/datasets/event/statsbomb.py b/kloppy/infra/datasets/event/statsbomb.py new file mode 100644 index 00000000..504109ef --- /dev/null +++ b/kloppy/infra/datasets/event/statsbomb.py @@ -0,0 
+1,17 @@ +from typing import Dict, Type + +from ..core.builder import DatasetBuilder +from ...serializers.event import EventDataSerializer, StatsbombSerializer + + +# 3749133 / 38412 +class Statsbomb(DatasetBuilder): + def get_data_set_files(self,**kwargs) -> Dict[str, str]: + match_id = kwargs.get('match_id', '15946') + return { + 'raw_data': f'https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/{match_id}.json', + 'lineup': f'https://raw.githubusercontent.com/statsbomb/open-data/master/data/lineups/{match_id}.json' + } + + def get_serializer_cls(self) -> Type[EventDataSerializer]: + return StatsbombSerializer diff --git a/kloppy/infra/serializers/event/__init__.py b/kloppy/infra/serializers/event/__init__.py index 57bdc2df..008eb204 100644 --- a/kloppy/infra/serializers/event/__init__.py +++ b/kloppy/infra/serializers/event/__init__.py @@ -1,2 +1,3 @@ from .base import EventDataSerializer -from .metrica import MetricaEventSerializer \ No newline at end of file +#from .metrica import MetricaEventSerializer +from .statsbomb import StatsbombSerializer \ No newline at end of file diff --git a/kloppy/infra/serializers/event/statsbomb/__init__.py b/kloppy/infra/serializers/event/statsbomb/__init__.py new file mode 100644 index 00000000..1cf8711e --- /dev/null +++ b/kloppy/infra/serializers/event/statsbomb/__init__.py @@ -0,0 +1 @@ +from .serializer import StatsbombSerializer \ No newline at end of file diff --git a/kloppy/infra/serializers/event/statsbomb/serializer.py b/kloppy/infra/serializers/event/statsbomb/serializer.py new file mode 100644 index 00000000..9af64c80 --- /dev/null +++ b/kloppy/infra/serializers/event/statsbomb/serializer.py @@ -0,0 +1,147 @@ +from typing import Tuple, Dict + +import json + +from kloppy.domain import EventDataSet, PassEvent, Team, Period, Point, PassResult +from kloppy.infra.serializers.event import EventDataSerializer +from kloppy.infra.utils import Readable, performance_logging + + +SB_EVENT_TYPE_DRIBBLE = 14 
+SB_EVENT_TYPE_SHOT = 16 +SB_EVENT_TYPE_PASS = 30 +SB_EVENT_TYPE_CARRY = 43 + +SB_EVENT_TYPE_HALF_START = 18 +SB_EVENT_TYPE_HALF_END = 34 + +SB_PASS_OUTCOME_INCOMPLETE = 9 +SB_PASS_OUTCOME_INJURY_CLEARANCE = 74 +SB_PASS_OUTCOME_OUT = 75 +SB_PASS_OUTCOME_OFFSIDE = 76 +SB_PASS_OUTCOME_UNKNOWN = 77 + + +def parse_str_ts(timestamp: str) -> float: + h, m, s = timestamp.split(":") + return int(h) * 3600 + int(m) * 60 + float(s) + + +def _parse_pass(pass_dict: Dict, current_team_map: Dict[int, int]) -> Dict: + if 'outcome' in pass_dict: + outcome_id = pass_dict['outcome']['id'] + if outcome_id == SB_PASS_OUTCOME_OUT: + result = PassResult.OUT + elif outcome_id == SB_PASS_OUTCOME_INCOMPLETE: + result = PassResult.INCOMPLETE + elif outcome_id == SB_PASS_OUTCOME_OFFSIDE: + result = PassResult.OFFSIDE + elif outcome_id == SB_PASS_OUTCOME_INJURY_CLEARANCE: + result = PassResult.OUT + elif outcome_id == SB_PASS_OUTCOME_UNKNOWN: + result = None + else: + raise Exception(f"Unknown pass outcome: {outcome_id}") + + receiver_player_jersey_no = None + receiver_position = None + else: + result = PassResult.COMPLETE + receiver_player_jersey_no = current_team_map[ + pass_dict['recipient']['id'] + ] + receiver_position = Point( + x=pass_dict['end_location'][0], + y=pass_dict['end_location'][1] + ) + + return dict( + result=result, + receiver_position=receiver_position, + receiver_player_jersey_no=receiver_player_jersey_no + ) + + +class StatsbombSerializer(EventDataSerializer): + def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> EventDataSet: + with performance_logging("load data"): + raw_events = json.load(inputs['raw_data']) + home_lineup, away_lineup = json.load(inputs['lineup']) + + + + with performance_logging("parse data"): + home_player_map = { + player['player_id']: player['jersey_number'] + for player in home_lineup['lineup'] + } + away_player_map = { + player['player_id']: player['jersey_number'] + for player in away_lineup['lineup'] + } + + periods = 
[] + period = None + events = [] + for raw_event in raw_events: + if raw_event['team']['id'] == home_lineup['team_id']: + team = Team.HOME + current_team_map = home_player_map + elif raw_event['team']['id'] == away_lineup['team_id']: + team = Team.AWAY + current_team_map = away_player_map + else: + raise Exception(f"Unknown team_id {raw_event['team']['id']}") + + timestamp = parse_str_ts(raw_event['timestamp']) + period_id = int(raw_event['period']) + if not period or period.id != period_id: + period = Period( + id=period_id, + start_timestamp=timestamp, + end_timestamp=timestamp + ) + periods.append(period) + else: + period.end_timestamp = timestamp + + player_jersey_no = None + if 'player' in raw_event: + player_jersey_no = current_team_map[raw_event['player']['id']] + + event_kwargs = dict( + # from DataRecord + period=period, + timestamp=timestamp, + ball_owning_team=None, + ball_state=None, + # from Event + event_id=raw_event['id'], + team=team, + player_jersey_no=player_jersey_no, + position=( + Point( + x=raw_event['location'][0], + y=raw_event['location'][1] + ) + if 'location' in raw_event + else None + ) + ) + + event_type = raw_event['type']['id'] + if event_type == SB_EVENT_TYPE_PASS: + pass_event_kwargs = _parse_pass(raw_event['pass'], current_team_map) + + event = PassEvent( + end_timestamp=timestamp + raw_event['duration'], + **pass_event_kwargs, + **event_kwargs + ) + else: + continue + + events.append(event) + + def serialize(self, data_set: EventDataSet) -> Tuple[str, str]: + raise NotImplementedError diff --git a/kloppy/infra/utils.py b/kloppy/infra/utils.py index 83a6aa43..01c659d9 100644 --- a/kloppy/infra/utils.py +++ b/kloppy/infra/utils.py @@ -18,10 +18,15 @@ def performance_logging(description: str, counter: int = None): try: yield finally: - took = int((time.time() - start) * 1000) + took = (time.time() - start) * 1000 extra = "" if counter is not None: extra = f" ({int(counter / took * 1000)}items/sec)" - print(f"{description} took: 
{took:.2f}ms {extra}") + + unit = "ms" + if took < 0.01: + took *= 1000 + unit = "us" + print(f"{description} took: {took:.2f}{unit} {extra}") From b6b221f44651c44b978cdddff50b5c98e0fa623c Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Mon, 1 Jun 2020 22:18:13 +0200 Subject: [PATCH 2/7] Statsbomb: WIP shot --- examples/datasets/statsbomb.py | 11 +- kloppy/domain/models/common.py | 11 +- .../domain/services/transformers/__init__.py | 32 +++++- kloppy/infra/datasets/core/loading.py | 9 +- .../serializers/event/statsbomb/serializer.py | 100 ++++++++++++++++-- kloppy/infra/utils.py | 9 +- 6 files changed, 146 insertions(+), 26 deletions(-) diff --git a/examples/datasets/statsbomb.py b/examples/datasets/statsbomb.py index afca33b8..13f0a694 100644 --- a/examples/datasets/statsbomb.py +++ b/examples/datasets/statsbomb.py @@ -1,4 +1,8 @@ -from kloppy import datasets +import logging +import sys + +from kloppy import datasets, transform +from kloppy.infra.utils import performance_logging def main(): @@ -6,8 +10,13 @@ def main(): This example shows the use of Statsbomb datasets, and how we can pass argument to the dataset loader. 
""" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") data_set = datasets.load("statsbomb") + with performance_logging("transform"): + data_set = transform(data_set, to_orientation="FIXED_HOME_AWAY") + a = 1 if __name__ == "__main__": diff --git a/kloppy/domain/models/common.py b/kloppy/domain/models/common.py index 1a7aa94b..7d98be5f 100644 --- a/kloppy/domain/models/common.py +++ b/kloppy/domain/models/common.py @@ -65,14 +65,13 @@ def get_orientation_factor(self, else: raise Exception("AttackingDirection not set") elif self == Orientation.BALL_OWNING_TEAM: - if ((ball_owning_team == Team.HOME - and attacking_direction == AttackingDirection.HOME_AWAY) - or - (ball_owning_team == Team.AWAY - and attacking_direction == AttackingDirection.AWAY_HOME)): + if ball_owning_team == Team.HOME: return -1 - else: + elif ball_owning_team == Team.AWAY: return 1 + else: + raise Exception(f"Invalid ball_owning_team: {ball_owning_team}") + @dataclass diff --git a/kloppy/domain/services/transformers/__init__.py b/kloppy/domain/services/transformers/__init__.py index a7b79b24..23ea5296 100644 --- a/kloppy/domain/services/transformers/__init__.py +++ b/kloppy/domain/services/transformers/__init__.py @@ -1,3 +1,4 @@ +from dataclasses import asdict, replace, fields from typing import TypeVar from kloppy.domain import ( @@ -7,8 +8,9 @@ Frame, Team, AttackingDirection, - TrackingDataSet, DataSetFlag, DataSet, # NOT YET: EventDataSet + TrackingDataSet, DataSetFlag, DataSet, EventDataSet, # NOT YET: EventDataSet ) +from kloppy.domain.models.event import Event class Transformer: @@ -78,6 +80,22 @@ def transform_frame(self, frame: Frame) -> Frame: } ) + EventType = TypeVar('EventType') + + def transform_event(self, event: EventType) -> EventType: + flip = self.__needs_flip( + ball_owning_team=event.ball_owning_team, + attacking_direction=event.period.attacking_direction + ) + + position_changes = { + field.name: 
self.transform_point(getattr(event, field.name), flip) + for field in fields(event) + if field.name.endswith('position') and getattr(event, field.name) + } + + return replace(event, **position_changes) + DataSetType = TypeVar('DataSetType') @classmethod @@ -114,7 +132,15 @@ def transform_data_set(cls, orientation=to_orientation, records=frames ) - #elif isinstance(data_set, EventDataSet): - # raise Exception("EventDataSet transformer not implemented yet") + elif isinstance(data_set, EventDataSet): + events = list(map(transformer.transform_event, data_set.records)) + + return EventDataSet( + flags=data_set.flags, + periods=data_set.periods, + pitch_dimensions=to_pitch_dimensions, + orientation=to_orientation, + records=events + ) else: raise Exception("Unknown DataSet type") diff --git a/kloppy/infra/datasets/core/loading.py b/kloppy/infra/datasets/core/loading.py index 0298f673..d322b2ce 100644 --- a/kloppy/infra/datasets/core/loading.py +++ b/kloppy/infra/datasets/core/loading.py @@ -1,14 +1,17 @@ -import os +import os, logging import requests from typing import Dict, Union -from kloppy.domain import DataSet, TrackingDataSet +from kloppy.domain import TrackingDataSet from .registered import _DATASET_REGISTRY +logger = logging.getLogger(__name__) + + def download_file(url, local_filename): with requests.get(url, stream=True) as r: r.raise_for_status() @@ -40,7 +43,7 @@ def get_local_files(data_set_name: str, files: Dict[str, str]) -> Dict[str, str] return local_files -def load(data_set_name: str, options=None, **dataset_kwargs) -> Union[TrackingDataSet]: +def load(data_set_name: str, options=None, **dataset_kwargs) -> Union[TrackingDataSet, EventDataSet]: if data_set_name not in _DATASET_REGISTRY: raise ValueError(f"Dataset {data_set_name} not found") diff --git a/kloppy/infra/serializers/event/statsbomb/serializer.py b/kloppy/infra/serializers/event/statsbomb/serializer.py index 9af64c80..b32e0fe3 100644 --- a/kloppy/infra/serializers/event/statsbomb/serializer.py 
+++ b/kloppy/infra/serializers/event/statsbomb/serializer.py @@ -1,8 +1,9 @@ from typing import Tuple, Dict - +import logging import json -from kloppy.domain import EventDataSet, PassEvent, Team, Period, Point, PassResult +from kloppy.domain import EventDataSet, PassEvent, Team, Period, Point, PassResult, ShotEvent, BallState, ShotResult, \ + DataSetFlag, Orientation, PitchDimensions, Dimension, DribbleEvent, CarryEvent from kloppy.infra.serializers.event import EventDataSerializer from kloppy.infra.utils import Readable, performance_logging @@ -21,6 +22,13 @@ SB_PASS_OUTCOME_OFFSIDE = 76 SB_PASS_OUTCOME_UNKNOWN = 77 +SB_SHOT_OUTCOME_BLOCKED = 96 +SB_SHOT_OUTCOME_GOAL = 97 +SB_SHOT_OUTCOME_OFF_TARGET = 98 +SB_SHOT_OUTCOME_POST = 99 +SB_SHOT_OUTCOME_SAVED = 100 +SB_SHOT_OUTCOME_OFF_WAYWARD = 101 + def parse_str_ts(timestamp: str) -> float: h, m, s = timestamp.split(":") @@ -62,15 +70,38 @@ def _parse_pass(pass_dict: Dict, current_team_map: Dict[int, int]) -> Dict: ) +def _parse_shot(shot_dict: Dict) -> Dict: + outcome_id = shot_dict['outcome']['id'] + if outcome_id == SB_SHOT_OUTCOME_OFF_TARGET: + result = ShotResult.OFF_TARGET + elif outcome_id == SB_SHOT_OUTCOME_SAVED: + result = ShotResult.SAVED + elif outcome_id == SB_SHOT_OUTCOME_POST: + result = ShotResult.POST + elif outcome_id == SB_SHOT_OUTCOME_OFF_WAYWARD: + result = ShotResult.OFF_TARGET + elif outcome_id == SB_SHOT_OUTCOME_BLOCKED: + result = ShotResult.BLOCKED + elif outcome_id == SB_SHOT_OUTCOME_GOAL: + result = ShotResult.GOAL + else: + raise Exception(f"Unknown shot outcome: {outcome_id}") + + return dict( + result=result + ) + + +logger = logging.getLogger(__name__) + + class StatsbombSerializer(EventDataSerializer): def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> EventDataSet: - with performance_logging("load data"): + with performance_logging("load data", logger=logger): raw_events = json.load(inputs['raw_data']) home_lineup, away_lineup = json.load(inputs['lineup']) - 
- - with performance_logging("parse data"): + with performance_logging("parse data", logger=logger): home_player_map = { player['player_id']: player['jersey_number'] for player in home_lineup['lineup'] @@ -93,6 +124,13 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Even else: raise Exception(f"Unknown team_id {raw_event['team']['id']}") + if raw_event['possession_team']['id'] == home_lineup['team_id']: + possession_team = Team.HOME + elif raw_event['possession_team']['id'] == away_lineup['team_id']: + possession_team = Team.AWAY + else: + raise Exception(f"Unknown possession_team_id: {raw_event['possession_team']}") + timestamp = parse_str_ts(raw_event['timestamp']) period_id = int(raw_event['period']) if not period or period.id != period_id: @@ -109,12 +147,12 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Even if 'player' in raw_event: player_jersey_no = current_team_map[raw_event['player']['id']] - event_kwargs = dict( + generic_event_kwargs = dict( # from DataRecord period=period, timestamp=timestamp, - ball_owning_team=None, - ball_state=None, + ball_owning_team=possession_team, + ball_state=BallState.ALIVE, # from Event event_id=raw_event['id'], team=team, @@ -131,17 +169,57 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Even event_type = raw_event['type']['id'] if event_type == SB_EVENT_TYPE_PASS: - pass_event_kwargs = _parse_pass(raw_event['pass'], current_team_map) + pass_event_kwargs = _parse_pass( + pass_dict=raw_event['pass'], + current_team_map=current_team_map + ) event = PassEvent( + # TODO: Consider moving this to _parse_pass end_timestamp=timestamp + raw_event['duration'], **pass_event_kwargs, - **event_kwargs + **generic_event_kwargs ) + elif event_type == SB_EVENT_TYPE_SHOT: + shot_event_kwargs = _parse_shot( + shot_dict=raw_event['shot'] + ) + event = ShotEvent( + **shot_event_kwargs, + **generic_event_kwargs + ) + # elif event_type == 
SB_EVENT_TYPE_DRIBBLE: + # dribble_event_kwargs = _parse_dribble( + # dribble_dict=raw_event['dribble'] + # ) + # event = DribbleEvent( + # **dribble_event_kwargs, + # **generic_event_kwargs + # ) + # elif event_type == SB_EVENT_TYPE_CARRY: + # carry_event_kwargs = _parse_carry( + # carry_dict=raw_event['carry'] + # ) + # event = CarryEvent( + # **carry_event_kwargs, + # **generic_event_kwargs + # ) else: + logger.debug(f"Skipping event with type {raw_event['type']['name']} (id: {event_type})") continue events.append(event) + return EventDataSet( + flags=DataSetFlag.BALL_OWNING_TEAM, + orientation=Orientation.BALL_OWNING_TEAM, + pitch_dimensions=PitchDimensions( + x_dim=Dimension(0, 100), + y_dim=Dimension(0, 100) + ), + periods=periods, + records=events + ) + def serialize(self, data_set: EventDataSet) -> Tuple[str, str]: raise NotImplementedError diff --git a/kloppy/infra/utils.py b/kloppy/infra/utils.py index 01c659d9..5dfa0d5e 100644 --- a/kloppy/infra/utils.py +++ b/kloppy/infra/utils.py @@ -13,7 +13,7 @@ def to_file_object(s: Readable) -> BinaryIO: @contextmanager -def performance_logging(description: str, counter: int = None): +def performance_logging(description: str, counter: int = None, logger=None): start = time.time() try: yield @@ -27,6 +27,11 @@ def performance_logging(description: str, counter: int = None): if took < 0.01: took *= 1000 unit = "us" - print(f"{description} took: {took:.2f}{unit} {extra}") + + msg = f"{description} took: {took:.2f}{unit} {extra}" + if logger: + logger.info(msg) + else: + print(msg) From 0dfd5bf8d2d4bab3473c1277a07b1c7d0fc931f0 Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Tue, 2 Jun 2020 15:24:18 +0200 Subject: [PATCH 3/7] Statsbomb almost done --- examples/datasets/statsbomb.py | 19 +- kloppy/domain/models/event.py | 44 +++-- kloppy/domain/models/tracking.py | 2 +- kloppy/helpers.py | 55 +++++- kloppy/infra/datasets/event/statsbomb.py | 10 +- kloppy/infra/serializers/__init__.py | 1 + 
.../serializers/event/statsbomb/serializer.py | 183 ++++++++++++++---- 7 files changed, 241 insertions(+), 73 deletions(-) diff --git a/examples/datasets/statsbomb.py b/examples/datasets/statsbomb.py index 13f0a694..9f55cecb 100644 --- a/examples/datasets/statsbomb.py +++ b/examples/datasets/statsbomb.py @@ -1,7 +1,7 @@ import logging import sys -from kloppy import datasets, transform +from kloppy import datasets, transform, to_pandas from kloppy.infra.utils import performance_logging @@ -13,10 +13,19 @@ def main(): logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") - data_set = datasets.load("statsbomb") - with performance_logging("transform"): - data_set = transform(data_set, to_orientation="FIXED_HOME_AWAY") - a = 1 + logger = logging.getLogger(__name__) + + dataset = datasets.load("statsbomb", { + "event_types": ["pass", "take_on", "carry", "shot"] + }, match_id=3749052) #16079) + + with performance_logging("transform", logger=logger): + dataset = transform(dataset, to_orientation="FIXED_HOME_AWAY") + + with performance_logging("to pandas", logger=logger): + dataframe = to_pandas(dataset) + + print(dataframe.head()) if __name__ == "__main__": diff --git a/kloppy/domain/models/event.py b/kloppy/domain/models/event.py index f9fae9e9..bb43e9aa 100644 --- a/kloppy/domain/models/event.py +++ b/kloppy/domain/models/event.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod, abstractproperty, ABCMeta from dataclasses import dataclass from enum import Enum -from typing import List, Union +from typing import List, Union, Dict from .pitch import Point from .common import DataRecord, Dataset, Team @@ -38,7 +38,7 @@ def is_success(self): return self == self.COMPLETE -class DribbleCarryResult(ResultType): +class TakeOnResult(ResultType): COMPLETE = "COMPLETE" INCOMPLETE = "INCOMPLETE" OUT = "OUT" @@ -48,10 +48,21 @@ def is_success(self): return self == self.COMPLETE +class CarryResult(ResultType): + 
COMPLETE = "COMPLETE" + INCOMPLETE = "INCOMPLETE" + + @property + def is_success(self): + return self == self.COMPLETE + + class EventType(Enum): + GENERIC = "generic" + PASS = "PASS" SHOT = "SHOT" - DRIBBLE = "DRIBBLE" + TAKE_ON = "TAKE_ON" CARRY = "CARRY" @@ -59,18 +70,24 @@ class EventType(Enum): class Event(DataRecord, ABC): event_id: str team: Team - player_jersey_no: str position: Point result: ResultType + raw_event: Dict + @property @abstractmethod def event_type(self) -> EventType: raise NotImplementedError +@dataclass +class GenericEvent(Event): + event_type: EventType = EventType.GENERIC + + @dataclass class ShotEvent(Event): result: ShotResult @@ -90,13 +107,10 @@ class PassEvent(Event): @dataclass -class DribbleEvent(Event): - end_timestamp: float - end_position: Point - - result: DribbleCarryResult +class TakeOnEvent(Event): + result: TakeOnResult - event_type: EventType = EventType.DRIBBLE + event_type: EventType = EventType.TAKE_ON @dataclass @@ -104,7 +118,7 @@ class CarryEvent(Event): end_timestamp: float end_position: Point - result: DribbleCarryResult + result: CarryResult event_type: EventType = EventType.CARRY @@ -112,7 +126,7 @@ class CarryEvent(Event): @dataclass class EventDataset(Dataset): records: List[Union[ - ShotEvent, PassEvent, DribbleEvent, CarryEvent + ShotEvent, PassEvent, TakeOnEvent, CarryEvent ]] @property @@ -122,7 +136,7 @@ def events(self): __all__ = [ "ResultType", "EventType", - "ShotResult", "PassResult", "DribbleCarryResult", - "ShotEvent", "PassEvent", "DribbleEvent", "CarryEvent", - "EventDataSet" + "ShotResult", "PassResult", "TakeOnResult", "CarryResult", + "Event", "GenericEvent", "ShotEvent", "PassEvent", "TakeOnEvent", "CarryEvent", + "EventDataset" ] diff --git a/kloppy/domain/models/tracking.py b/kloppy/domain/models/tracking.py index c0ab9eb8..46b63a80 100644 --- a/kloppy/domain/models/tracking.py +++ b/kloppy/domain/models/tracking.py @@ -27,6 +27,6 @@ def frames(self): __all__ = [ - "Frame", "TrackingDataSet" 
+ "Frame", "TrackingDataset" ] diff --git a/kloppy/helpers.py b/kloppy/helpers.py index eb65611d..e1ff9bd7 100644 --- a/kloppy/helpers.py +++ b/kloppy/helpers.py @@ -1,10 +1,13 @@ -from typing import Callable, TypeVar +from typing import Callable, TypeVar, Dict -from . import TRACABSerializer, MetricaTrackingSerializer, EPTSSerializer -from .domain import Dataset, Frame, TrackingDataset, Transformer, Orientation, PitchDimensions, Dimension +from . import TRACABSerializer, MetricaTrackingSerializer, EPTSSerializer, StatsbombSerializer +from .domain import ( + Dataset, Frame, Event, TrackingDataset, Transformer, Orientation, PitchDimensions, + Dimension, EventDataset +) -def load_tracab_tracking_data(meta_data_filename: str, raw_data_filename: str, options: dict = None) -> Dataset: +def load_tracab_tracking_data(meta_data_filename: str, raw_data_filename: str, options: dict = None) -> TrackingDataset: serializer = TRACABSerializer() with open(meta_data_filename, "rb") as meta_data, \ open(raw_data_filename, "rb") as raw_data: @@ -18,7 +21,7 @@ def load_tracab_tracking_data(meta_data_filename: str, raw_data_filename: str, o ) -def load_metrica_tracking_data(raw_data_home_filename: str, raw_data_away_filename: str, options: dict = None) -> Dataset: +def load_metrica_tracking_data(raw_data_home_filename: str, raw_data_away_filename: str, options: dict = None) -> TrackingDataset: serializer = MetricaTrackingSerializer() with open(raw_data_home_filename, "rb") as raw_data_home, \ open(raw_data_away_filename, "rb") as raw_data_away: @@ -32,7 +35,7 @@ def load_metrica_tracking_data(raw_data_home_filename: str, raw_data_away_filena ) -def load_epts_tracking_data(meta_data_filename: str, raw_data_filename: str, options: dict = None) -> Dataset: +def load_epts_tracking_data(meta_data_filename: str, raw_data_filename: str, options: dict = None) -> TrackingDataset: serializer = EPTSSerializer() with open(meta_data_filename, "rb") as meta_data, \ open(raw_data_filename, "rb") as 
raw_data: @@ -46,6 +49,20 @@ def load_epts_tracking_data(meta_data_filename: str, raw_data_filename: str, opt ) +def load_statsbomb_event_data(lineup_filename: str, raw_data_filename: str, options: dict = None) -> EventDataset: + serializer = StatsbombSerializer() + with open(lineup_filename, "rb") as lineup_data, \ + open(raw_data_filename, "rb") as raw_data: + + return serializer.deserialize( + inputs={ + 'lineup_data': lineup_data, + 'raw_data': raw_data + }, + options=options + ) + + DatasetType = TypeVar('DatasetType') @@ -64,12 +81,12 @@ def transform(dataset: DatasetType, to_orientation=None, to_pitch_dimensions=Non ) -def _frame_to_pandas_row_converter(frame: Frame) -> dict: +def _frame_to_pandas_row_converter(frame: Frame) -> Dict: row = dict( period_id=frame.period.id, timestamp=frame.timestamp, - ball_state=frame.ball_state, - ball_owning_team=frame.ball_owning_team, + ball_state=frame.ball_state.value if frame.ball_state else None, + ball_owning_team=frame.ball_owning_team.value if frame.ball_owning_team else None, ball_x=frame.ball_position.x if frame.ball_position else None, ball_y=frame.ball_position.y if frame.ball_position else None ) @@ -87,6 +104,24 @@ def _frame_to_pandas_row_converter(frame: Frame) -> dict: return row +def _event_to_pandas_row_converter(event: Event) -> Dict: + row = dict( + period_id=event.period.id, + timestamp=event.timestamp, + ball_state=event.ball_state.value if event.ball_state else None, + ball_owning_team=event.ball_owning_team.value if event.ball_owning_team else None, + + event_id=event.event_id, + event_type=event.event_type.value, + team=event.team.value, + player_jersey_no=event.player_jersey_no, + position_x=event.position.x, + position_y=event.position.y, + result=event.result.value if event.result else None + ) + return row + + def to_pandas(dataset: Dataset, _record_converter: Callable = None) -> 'DataFrame': try: import pandas as pd @@ -97,6 +132,8 @@ def to_pandas(dataset: Dataset, _record_converter: 
Callable = None) -> 'DataFram if not _record_converter: if isinstance(dataset, TrackingDataset): _record_converter = _frame_to_pandas_row_converter + elif isinstance(dataset, EventDataset): + _record_converter = _event_to_pandas_row_converter else: raise Exception("Unknown dataset type") diff --git a/kloppy/infra/datasets/event/statsbomb.py b/kloppy/infra/datasets/event/statsbomb.py index 504109ef..f6031763 100644 --- a/kloppy/infra/datasets/event/statsbomb.py +++ b/kloppy/infra/datasets/event/statsbomb.py @@ -1,3 +1,4 @@ +import warnings from typing import Dict, Type from ..core.builder import DatasetBuilder @@ -6,11 +7,16 @@ # 3749133 / 38412 class Statsbomb(DatasetBuilder): - def get_data_set_files(self,**kwargs) -> Dict[str, str]: + def get_dataset_urls(self,**kwargs) -> Dict[str, str]: + warnings.warn("\n\nYou are about to use StatsBomb public data." + "\nBy using this data, you are agreeing to the user agreement. " + "\nThe user agreement can be found here: https://github.com/statsbomb/open-data/blob/master/LICENSE.pdf" + "\n") + match_id = kwargs.get('match_id', '15946') return { 'raw_data': f'https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/{match_id}.json', - 'lineup': f'https://raw.githubusercontent.com/statsbomb/open-data/master/data/lineups/{match_id}.json' + 'lineup_data': f'https://raw.githubusercontent.com/statsbomb/open-data/master/data/lineups/{match_id}.json' } def get_serializer_cls(self) -> Type[EventDataSerializer]: diff --git a/kloppy/infra/serializers/__init__.py b/kloppy/infra/serializers/__init__.py index e590614a..652d684f 100644 --- a/kloppy/infra/serializers/__init__.py +++ b/kloppy/infra/serializers/__init__.py @@ -1,2 +1,3 @@ from .tracking import TrackingDataSerializer, TRACABSerializer, MetricaTrackingSerializer, EPTSSerializer +from .event import StatsbombSerializer # NOT YET: from .event import EventDataSerializer, MetricaEventSerializer diff --git a/kloppy/infra/serializers/event/statsbomb/serializer.py 
b/kloppy/infra/serializers/event/statsbomb/serializer.py index b32e0fe3..6742cbd0 100644 --- a/kloppy/infra/serializers/event/statsbomb/serializer.py +++ b/kloppy/infra/serializers/event/statsbomb/serializer.py @@ -1,9 +1,14 @@ -from typing import Tuple, Dict +from typing import Tuple, Dict, List import logging import json -from kloppy.domain import EventDataSet, PassEvent, Team, Period, Point, PassResult, ShotEvent, BallState, ShotResult, \ - DataSetFlag, Orientation, PitchDimensions, Dimension, DribbleEvent, CarryEvent +from kloppy.domain import ( + EventDataset, Team, Period, Point, BallState, + DatasetFlag, Orientation, PitchDimensions, Dimension, + + PassEvent, ShotEvent, TakeOnEvent, CarryEvent, GenericEvent, + PassResult, ShotResult, TakeOnResult, CarryResult, EventType +) from kloppy.infra.serializers.event import EventDataSerializer from kloppy.infra.utils import Readable, performance_logging @@ -16,6 +21,7 @@ SB_EVENT_TYPE_HALF_START = 18 SB_EVENT_TYPE_HALF_END = 34 +SB_PASS_OUTCOME_COMPLETE = 8 SB_PASS_OUTCOME_INCOMPLETE = 9 SB_PASS_OUTCOME_INJURY_CLEARANCE = 74 SB_PASS_OUTCOME_OUT = 75 @@ -35,7 +41,23 @@ def parse_str_ts(timestamp: str) -> float: return int(h) * 3600 + int(m) * 60 + float(s) -def _parse_pass(pass_dict: Dict, current_team_map: Dict[int, int]) -> Dict: +def _parse_position(position: Dict, fidelity_version: int) -> Point: + # location is cell based + # [1, 120] x [1, 80] + # +-----+------+ + # | 1,1 | 2, 1 | + # +-----+------+ + # | 1,2 | 2,2 | + # +-----+------+ + cell_side = 0.1 if fidelity_version == 2 else 1.0 + cell_relative_center = cell_side / 2 + return Point( + x=position[0] - cell_relative_center, + y=position[1] - cell_relative_center + ) + + +def _parse_pass(pass_dict: Dict, current_team_map: Dict[int, int], fidelity_version: int) -> Dict: if 'outcome' in pass_dict: outcome_id = pass_dict['outcome']['id'] if outcome_id == SB_PASS_OUTCOME_OUT: @@ -58,9 +80,9 @@ def _parse_pass(pass_dict: Dict, current_team_map: Dict[int, int]) 
-> Dict: receiver_player_jersey_no = current_team_map[ pass_dict['recipient']['id'] ] - receiver_position = Point( - x=pass_dict['end_location'][0], - y=pass_dict['end_location'][1] + receiver_position = _parse_position( + pass_dict['end_location'], + fidelity_version ) return dict( @@ -92,25 +114,89 @@ def _parse_shot(shot_dict: Dict) -> Dict: ) +def _parse_carry(carry_dict: Dict, fidelity_version: int) -> Dict: + return dict( + result=CarryResult.COMPLETE, + end_position=_parse_position( + carry_dict['end_location'], + fidelity_version + ) + ) + + +def _parse_take_on(take_on_dict: Dict) -> Dict: + if 'outcome' in take_on_dict: + outcome_id = take_on_dict['outcome']['id'] + if outcome_id == SB_PASS_OUTCOME_OUT: + result = TakeOnResult.OUT + elif outcome_id == SB_PASS_OUTCOME_INCOMPLETE: + result = TakeOnResult.INCOMPLETE + elif outcome_id == SB_PASS_OUTCOME_COMPLETE: + result = TakeOnResult.COMPLETE + else: + raise Exception(f"Unknown pass outcome: {take_on_dict['outcome']['name']}({outcome_id})") + else: + result = TakeOnResult.COMPLETE + + return dict( + result=result + ) + + +def _determine_xy_fidelity_versions(events: List[Dict]) -> Tuple[int, int]: + """ + match_id=15946, not high fidelty from metadata, high fidelty from data + match_id=70303, high fidelty from metadata, high fidelty from data + + """ + shot_fidelity_version = 1 + xy_fidelity_version = 1 + for event in events: + if 'location' in event: + x, y = event['location'] + if abs(int(x) - x) + abs(int(y) - y) > 0: + event_type = event['type']['id'] + if event_type == SB_EVENT_TYPE_SHOT: + shot_fidelity_version = 2 + elif event_type in (SB_EVENT_TYPE_CARRY, SB_EVENT_TYPE_DRIBBLE, SB_EVENT_TYPE_PASS): + xy_fidelity_version = 2 + return shot_fidelity_version, xy_fidelity_version + + logger = logging.getLogger(__name__) class StatsbombSerializer(EventDataSerializer): - def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> EventDataSet: + @staticmethod + def __validate_inputs(inputs: 
Dict[str, Readable]): + if "raw_data" not in inputs: + raise ValueError("Please specify a value for input 'raw_data'") + if "lineup_data" not in inputs: + raise ValueError("Please specify a value for input 'lineup_data'") + + def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> EventDataset: + self.__validate_inputs(inputs) + with performance_logging("load data", logger=logger): raw_events = json.load(inputs['raw_data']) - home_lineup, away_lineup = json.load(inputs['lineup']) + home_lineup, away_lineup = json.load(inputs['lineup_data']) + shot_fidelity_version, xy_fidelity_version = _determine_xy_fidelity_versions(raw_events) + logger.info(f"Determined Fidelity versions to shot: {shot_fidelity_version} / XY: {xy_fidelity_version}") with performance_logging("parse data", logger=logger): home_player_map = { - player['player_id']: player['jersey_number'] + player['player_id']: str(player['jersey_number']) for player in home_lineup['lineup'] } away_player_map = { - player['player_id']: player['jersey_number'] + player['player_id']: str(player['jersey_number']) for player in away_lineup['lineup'] } + wanted_event_types = [ + EventType[event_type.upper()] for event_type in options.get('event_types', []) + ] + periods = [] period = None events = [] @@ -147,6 +233,12 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Even if 'player' in raw_event: player_jersey_no = current_team_map[raw_event['player']['id']] + event_type = raw_event['type']['id'] + if event_type == SB_EVENT_TYPE_SHOT: + fidelity_version = shot_fidelity_version + else: + fidelity_version = xy_fidelity_version + generic_event_kwargs = dict( # from DataRecord period=period, @@ -158,20 +250,21 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Even team=team, player_jersey_no=player_jersey_no, position=( - Point( - x=raw_event['location'][0], - y=raw_event['location'][1] + _parse_position( + raw_event.get('location'), + 
fidelity_version ) if 'location' in raw_event else None - ) + ), + raw_event=raw_event ) - event_type = raw_event['type']['id'] if event_type == SB_EVENT_TYPE_PASS: pass_event_kwargs = _parse_pass( pass_dict=raw_event['pass'], - current_team_map=current_team_map + current_team_map=current_team_map, + fidelity_version=fidelity_version ) event = PassEvent( @@ -188,38 +281,46 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Even **shot_event_kwargs, **generic_event_kwargs ) - # elif event_type == SB_EVENT_TYPE_DRIBBLE: - # dribble_event_kwargs = _parse_dribble( - # dribble_dict=raw_event['dribble'] - # ) - # event = DribbleEvent( - # **dribble_event_kwargs, - # **generic_event_kwargs - # ) - # elif event_type == SB_EVENT_TYPE_CARRY: - # carry_event_kwargs = _parse_carry( - # carry_dict=raw_event['carry'] - # ) - # event = CarryEvent( - # **carry_event_kwargs, - # **generic_event_kwargs - # ) + + # For dribble and carry the definitions + # are flipped between Statsbomb and kloppy + elif event_type == SB_EVENT_TYPE_DRIBBLE: + take_on_event_kwargs = _parse_take_on( + take_on_dict=raw_event['dribble'] + ) + event = TakeOnEvent( + **take_on_event_kwargs, + **generic_event_kwargs + ) + elif event_type == SB_EVENT_TYPE_CARRY: + carry_event_kwargs = _parse_carry( + carry_dict=raw_event['carry'], + fidelity_version=fidelity_version + ) + event = CarryEvent( + end_timestamp=timestamp + raw_event['duration'], + **carry_event_kwargs, + **generic_event_kwargs + ) else: - logger.debug(f"Skipping event with type {raw_event['type']['name']} (id: {event_type})") - continue + event = GenericEvent( + result=None, + **generic_event_kwargs + ) - events.append(event) + if not wanted_event_types or event.event_type in wanted_event_types: + events.append(event) - return EventDataSet( - flags=DataSetFlag.BALL_OWNING_TEAM, + return EventDataset( + flags=DatasetFlag.BALL_OWNING_TEAM, orientation=Orientation.BALL_OWNING_TEAM, pitch_dimensions=PitchDimensions( - 
x_dim=Dimension(0, 100), - y_dim=Dimension(0, 100) + x_dim=Dimension(0, 120), + y_dim=Dimension(0, 80) ), periods=periods, records=events ) - def serialize(self, data_set: EventDataSet) -> Tuple[str, str]: + def serialize(self, data_set: EventDataset) -> Tuple[str, str]: raise NotImplementedError From 760d5de056eeb04e2e69639984b6fb39b268e2cb Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Tue, 2 Jun 2020 16:21:21 +0200 Subject: [PATCH 4/7] StatsBomb: almost finished --- examples/datasets/statsbomb.py | 6 ++-- kloppy/domain/models/common.py | 15 ++++++++- kloppy/domain/models/event.py | 4 +-- .../domain/services/transformers/__init__.py | 14 +++++--- kloppy/helpers.py | 32 +++++++++++++++---- .../serializers/event/statsbomb/serializer.py | 13 ++++---- kloppy/tests/files/statsbomb_lineup.json | 0 kloppy/tests/files/statsbomb_raw.json | 0 kloppy/tests/test_helpers.py | 8 ++--- kloppy/tests/test_statsbomb.py | 3 ++ 10 files changed, 68 insertions(+), 27 deletions(-) create mode 100644 kloppy/tests/files/statsbomb_lineup.json create mode 100644 kloppy/tests/files/statsbomb_raw.json create mode 100644 kloppy/tests/test_statsbomb.py diff --git a/examples/datasets/statsbomb.py b/examples/datasets/statsbomb.py index 9f55cecb..fe5f2157 100644 --- a/examples/datasets/statsbomb.py +++ b/examples/datasets/statsbomb.py @@ -16,8 +16,8 @@ def main(): logger = logging.getLogger(__name__) dataset = datasets.load("statsbomb", { - "event_types": ["pass", "take_on", "carry", "shot"] - }, match_id=3749052) #16079) + #"event_types": ["pass", "take_on", "carry", "shot"] + }, match_id=15946) with performance_logging("transform", logger=logger): dataset = transform(dataset, to_orientation="FIXED_HOME_AWAY") @@ -25,7 +25,7 @@ def main(): with performance_logging("to pandas", logger=logger): dataframe = to_pandas(dataset) - print(dataframe.head()) + print(dataframe[:100].to_string()) if __name__ == "__main__": diff --git a/kloppy/domain/models/common.py b/kloppy/domain/models/common.py index 
9621fe54..949c1181 100644 --- a/kloppy/domain/models/common.py +++ b/kloppy/domain/models/common.py @@ -35,6 +35,9 @@ class Orientation(Enum): # change when possession changes BALL_OWNING_TEAM = "ball-owning-team" + # depends on team which executed the action + ACTION_EXECUTING_TEAM = "action-executing-team" + # changes during half-time HOME_TEAM = "home-team" AWAY_TEAM = "away-team" @@ -45,7 +48,8 @@ class Orientation(Enum): def get_orientation_factor(self, attacking_direction: AttackingDirection, - ball_owning_team: Team): + ball_owning_team: Team, + action_executing_team: Team): if self == Orientation.FIXED_HOME_AWAY: return -1 elif self == Orientation.FIXED_AWAY_HOME: @@ -71,6 +75,15 @@ def get_orientation_factor(self, return 1 else: raise Exception(f"Invalid ball_owning_team: {ball_owning_team}") + elif self == Orientation.ACTION_EXECUTING_TEAM: + if action_executing_team == Team.HOME: + return -1 + elif action_executing_team == Team.AWAY: + return 1 + else: + raise Exception(f"Invalid action_executing_team: {action_executing_team}") + else: + raise Exception(f"Unknown orientation: {self}") diff --git a/kloppy/domain/models/event.py b/kloppy/domain/models/event.py index bb43e9aa..acbca495 100644 --- a/kloppy/domain/models/event.py +++ b/kloppy/domain/models/event.py @@ -97,7 +97,7 @@ class ShotEvent(Event): @dataclass class PassEvent(Event): - end_timestamp: float + receive_timestamp: float receiver_player_jersey_no: str receiver_position: Point @@ -126,7 +126,7 @@ class CarryEvent(Event): @dataclass class EventDataset(Dataset): records: List[Union[ - ShotEvent, PassEvent, TakeOnEvent, CarryEvent + GenericEvent, ShotEvent, PassEvent, TakeOnEvent, CarryEvent ]] @property diff --git a/kloppy/domain/services/transformers/__init__.py b/kloppy/domain/services/transformers/__init__.py index 0f000a27..45cecd7f 100644 --- a/kloppy/domain/services/transformers/__init__.py +++ b/kloppy/domain/services/transformers/__init__.py @@ -36,17 +36,22 @@ def 
transform_point(self, point: Point, flip: bool) -> Point: y=self._to_pitch_dimensions.y_dim.from_base(y_base) ) - def __needs_flip(self, ball_owning_team: Team, attacking_direction: AttackingDirection) -> bool: + def __needs_flip(self, + ball_owning_team: Team, + attacking_direction: AttackingDirection, + action_executing_team: Team = None) -> bool: if self._from_orientation == self._to_orientation: flip = False else: orientation_factor_from = self._from_orientation.get_orientation_factor( ball_owning_team=ball_owning_team, - attacking_direction=attacking_direction + attacking_direction=attacking_direction, + action_executing_team=action_executing_team ) orientation_factor_to = self._to_orientation.get_orientation_factor( ball_owning_team=ball_owning_team, - attacking_direction=attacking_direction + attacking_direction=attacking_direction, + action_executing_team=action_executing_team ) flip = orientation_factor_from != orientation_factor_to return flip @@ -84,7 +89,8 @@ def transform_frame(self, frame: Frame) -> Frame: def transform_event(self, event: EventType) -> EventType: flip = self.__needs_flip( ball_owning_team=event.ball_owning_team, - attacking_direction=event.period.attacking_direction + attacking_direction=event.period.attacking_direction, + action_executing_team=event.team ) position_changes = { diff --git a/kloppy/helpers.py b/kloppy/helpers.py index e1ff9bd7..d3b0ed9a 100644 --- a/kloppy/helpers.py +++ b/kloppy/helpers.py @@ -3,7 +3,7 @@ from . 
import TRACABSerializer, MetricaTrackingSerializer, EPTSSerializer, StatsbombSerializer from .domain import ( Dataset, Frame, Event, TrackingDataset, Transformer, Orientation, PitchDimensions, - Dimension, EventDataset + Dimension, EventDataset, PassEvent, CarryEvent, PassResult, EventType ) @@ -106,19 +106,39 @@ def _frame_to_pandas_row_converter(frame: Frame) -> Dict: def _event_to_pandas_row_converter(event: Event) -> Dict: row = dict( + event_id=event.event_id, + event_type=( + event.event_type.value + if event.event_type != EventType.GENERIC else + f"GENERIC:{event.raw_event['type']['name']}" + ), + result=event.result.value if event.result else None, + success=event.result.is_success if event.result else None, + period_id=event.period.id, timestamp=event.timestamp, + end_timestamp=None, ball_state=event.ball_state.value if event.ball_state else None, ball_owning_team=event.ball_owning_team.value if event.ball_owning_team else None, - event_id=event.event_id, - event_type=event.event_type.value, team=event.team.value, player_jersey_no=event.player_jersey_no, - position_x=event.position.x, - position_y=event.position.y, - result=event.result.value if event.result else None + position_x=event.position.x if event.position else None, + position_y=event.position.y if event.position else None ) + if isinstance(event, PassEvent) and event.result == PassResult.COMPLETE: + row.update({ + 'end_timestamp': event.receive_timestamp, + 'end_position_x': event.receiver_position.x, + 'end_position_y': event.receiver_position.y, + 'receiver_jersey_no': event.receiver_player_jersey_no + }) + elif isinstance(event, CarryEvent): + row.update({ + 'end_timestamp': event.end_timestamp, + 'end_position_x': event.end_position.x, + 'end_position_y': event.end_position.y + }) return row diff --git a/kloppy/infra/serializers/event/statsbomb/serializer.py b/kloppy/infra/serializers/event/statsbomb/serializer.py index 6742cbd0..1fbe1410 100644 --- 
a/kloppy/infra/serializers/event/statsbomb/serializer.py +++ b/kloppy/infra/serializers/event/statsbomb/serializer.py @@ -144,11 +144,6 @@ def _parse_take_on(take_on_dict: Dict) -> Dict: def _determine_xy_fidelity_versions(events: List[Dict]) -> Tuple[int, int]: - """ - match_id=15946, not high fidelty from metadata, high fidelty from data - match_id=70303, high fidelty from metadata, high fidelty from data - - """ shot_fidelity_version = 1 xy_fidelity_version = 1 for event in events: @@ -236,7 +231,10 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Even event_type = raw_event['type']['id'] if event_type == SB_EVENT_TYPE_SHOT: fidelity_version = shot_fidelity_version + elif event_type in (SB_EVENT_TYPE_CARRY, SB_EVENT_TYPE_DRIBBLE, SB_EVENT_TYPE_PASS): + fidelity_version = xy_fidelity_version else: + # TODO: Uh ohhhh.. don't know which one to pick fidelity_version = xy_fidelity_version generic_event_kwargs = dict( @@ -269,7 +267,7 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Even event = PassEvent( # TODO: Consider moving this to _parse_pass - end_timestamp=timestamp + raw_event['duration'], + receive_timestamp=timestamp + raw_event['duration'], **pass_event_kwargs, **generic_event_kwargs ) @@ -298,6 +296,7 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Even fidelity_version=fidelity_version ) event = CarryEvent( + # TODO: Consider moving this to _parse_carry end_timestamp=timestamp + raw_event['duration'], **carry_event_kwargs, **generic_event_kwargs @@ -313,7 +312,7 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Even return EventDataset( flags=DatasetFlag.BALL_OWNING_TEAM, - orientation=Orientation.BALL_OWNING_TEAM, + orientation=Orientation.ACTION_EXECUTING_TEAM, pitch_dimensions=PitchDimensions( x_dim=Dimension(0, 120), y_dim=Dimension(0, 80) diff --git a/kloppy/tests/files/statsbomb_lineup.json 
b/kloppy/tests/files/statsbomb_lineup.json new file mode 100644 index 00000000..e69de29b diff --git a/kloppy/tests/files/statsbomb_raw.json b/kloppy/tests/files/statsbomb_raw.json new file mode 100644 index 00000000..e69de29b diff --git a/kloppy/tests/test_helpers.py b/kloppy/tests/test_helpers.py index 41d10dab..86cecf5b 100644 --- a/kloppy/tests/test_helpers.py +++ b/kloppy/tests/test_helpers.py @@ -7,7 +7,7 @@ from kloppy.domain import ( Period, DatasetFlag, Point, AttackingDirection, TrackingDataset, PitchDimensions, Dimension, - Orientation, Frame + Orientation, Frame, EventDataset, PassEvent ) @@ -30,7 +30,7 @@ def test_load_tracab_tracking_data(self): assert len(dataset.records) == 5 # only alive=True assert len(dataset.periods) == 2 - def _get_dataset(self): + def _get_tracking_dataset(self): periods = [ Period(id=1, start_timestamp=0.0, end_timestamp=10.0, attacking_direction=AttackingDirection.HOME_AWAY), Period(id=2, start_timestamp=15.0, end_timestamp=25.0, attacking_direction=AttackingDirection.AWAY_HOME) @@ -72,7 +72,7 @@ def _get_dataset(self): return tracking_data def test_transform(self): - tracking_data = self._get_dataset() + tracking_data = self._get_tracking_dataset() # orientation change AND dimension scale transformed_dataset = transform( @@ -85,7 +85,7 @@ def test_transform(self): assert transformed_dataset.frames[1].ball_position == Point(x=1, y=0) def test_to_pandas(self): - tracking_data = self._get_dataset() + tracking_data = self._get_tracking_dataset() data_frame = to_pandas(tracking_data) diff --git a/kloppy/tests/test_statsbomb.py b/kloppy/tests/test_statsbomb.py new file mode 100644 index 00000000..20c3372c --- /dev/null +++ b/kloppy/tests/test_statsbomb.py @@ -0,0 +1,3 @@ +class TestStatsbomb: + def test_correct_deserialization(self): + pass From e782abe45d5bc0bebc1c90e34e0ed506f554b141 Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Tue, 2 Jun 2020 16:27:08 +0200 Subject: [PATCH 5/7] Fix some logging --- 
examples/datasets/statsbomb.py | 7 ++++++- kloppy/infra/serializers/event/statsbomb/serializer.py | 5 ++--- kloppy/infra/serializers/tracking/epts/serializer.py | 9 +++++---- kloppy/infra/serializers/tracking/metrica.py | 7 +++++-- kloppy/infra/serializers/tracking/tracab.py | 7 +++++-- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/examples/datasets/statsbomb.py b/examples/datasets/statsbomb.py index fe5f2157..a52a65b8 100644 --- a/examples/datasets/statsbomb.py +++ b/examples/datasets/statsbomb.py @@ -20,7 +20,12 @@ def main(): }, match_id=15946) with performance_logging("transform", logger=logger): - dataset = transform(dataset, to_orientation="FIXED_HOME_AWAY") + # convert to TRACAB coordinates + dataset = transform( + dataset, + to_orientation="FIXED_HOME_AWAY", + to_pitch_dimensions=[(-5500, 5500), (-3300, 3300)] + ) with performance_logging("to pandas", logger=logger): dataframe = to_pandas(dataset) diff --git a/kloppy/infra/serializers/event/statsbomb/serializer.py b/kloppy/infra/serializers/event/statsbomb/serializer.py index 1fbe1410..32a01aba 100644 --- a/kloppy/infra/serializers/event/statsbomb/serializer.py +++ b/kloppy/infra/serializers/event/statsbomb/serializer.py @@ -12,6 +12,8 @@ from kloppy.infra.serializers.event import EventDataSerializer from kloppy.infra.utils import Readable, performance_logging +logger = logging.getLogger(__name__) + SB_EVENT_TYPE_DRIBBLE = 14 SB_EVENT_TYPE_SHOT = 16 @@ -158,9 +160,6 @@ def _determine_xy_fidelity_versions(events: List[Dict]) -> Tuple[int, int]: return shot_fidelity_version, xy_fidelity_version -logger = logging.getLogger(__name__) - - class StatsbombSerializer(EventDataSerializer): @staticmethod def __validate_inputs(inputs: Dict[str, Readable]): diff --git a/kloppy/infra/serializers/tracking/epts/serializer.py b/kloppy/infra/serializers/tracking/epts/serializer.py index 8b751119..43bbeb33 100644 --- a/kloppy/infra/serializers/tracking/epts/serializer.py +++ 
b/kloppy/infra/serializers/tracking/epts/serializer.py @@ -1,3 +1,4 @@ +import logging from typing import Tuple, Dict from kloppy.domain import ( @@ -7,8 +8,6 @@ Point, Team, Orientation, - PitchDimensions, - Dimension, attacking_direction_from_frame, ) from kloppy.infra.utils import Readable, performance_logging @@ -18,6 +17,8 @@ from .. import TrackingDataSerializer +logger = logging.getLogger(__name__) + class EPTSSerializer(TrackingDataSerializer): @staticmethod @@ -109,12 +110,12 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Trac sample_rate = float(options.get('sample_rate', 1.0)) limit = int(options.get('limit', 0)) - with performance_logging("Loading metadata"): + with performance_logging("Loading metadata", logger=logger): meta_data = load_meta_data(inputs['meta_data']) periods = meta_data.periods - with performance_logging("Loading data"): + with performance_logging("Loading data", logger=logger): # assume they are sorted frames = [ self._frame_from_row(row, meta_data) diff --git a/kloppy/infra/serializers/tracking/metrica.py b/kloppy/infra/serializers/tracking/metrica.py index 4dcd4350..87b3af6a 100644 --- a/kloppy/infra/serializers/tracking/metrica.py +++ b/kloppy/infra/serializers/tracking/metrica.py @@ -1,3 +1,4 @@ +import logging from collections import namedtuple from typing import Tuple, Dict, Iterator @@ -15,6 +16,8 @@ from . 
import TrackingDataSerializer +logger = logging.getLogger(__name__) + class MetricaTrackingSerializer(TrackingDataSerializer): __PartialFrame = namedtuple("PartialFrame", "team period frame_id player_positions ball_position") @@ -148,13 +151,13 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Trac # consider reading this from data frame_rate = 25 - with performance_logging("prepare"): + with performance_logging("prepare", logger=logger): home_iterator = self.__create_iterator(inputs['raw_data_home'], sample_rate, frame_rate) away_iterator = self.__create_iterator(inputs['raw_data_away'], sample_rate, frame_rate) partial_frames = zip(home_iterator, away_iterator) - with performance_logging("loading"): + with performance_logging("loading", logger=logger): frames = [] periods = [] diff --git a/kloppy/infra/serializers/tracking/tracab.py b/kloppy/infra/serializers/tracking/tracab.py index 1fc94b97..080a2b7e 100644 --- a/kloppy/infra/serializers/tracking/tracab.py +++ b/kloppy/infra/serializers/tracking/tracab.py @@ -1,3 +1,4 @@ +import logging from typing import Tuple, Dict from lxml import objectify @@ -19,6 +20,8 @@ from . 
import TrackingDataSerializer +logger = logging.getLogger(__name__) + class TRACABSerializer(TrackingDataSerializer): @classmethod @@ -125,7 +128,7 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Trac limit = int(options.get('limit', 0)) only_alive = bool(options.get('only_alive', True)) - with performance_logging("Loading metadata"): + with performance_logging("Loading metadata", logger=logger): match = objectify.fromstring(inputs['meta_data'].read()).match frame_rate = int(match.attrib['iFrameRateFps']) pitch_size_width = float(match.attrib['fPitchXSizeMeters']) @@ -144,7 +147,7 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Trac ) ) - with performance_logging("Loading data"): + with performance_logging("Loading data", logger=logger): def _iter(): n = 0 sample = 1. / sample_rate From 351156d43573b60c49dba34b9e8ffcdb3a92eca1 Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Tue, 2 Jun 2020 16:42:29 +0200 Subject: [PATCH 6/7] Some minor renaming + some docs --- examples/datasets/statsbomb.py | 2 +- kloppy/helpers.py | 4 +- kloppy/infra/datasets/event/statsbomb.py | 6 +- kloppy/infra/serializers/__init__.py | 2 +- kloppy/infra/serializers/event/__init__.py | 2 +- .../serializers/event/statsbomb/__init__.py | 2 +- .../serializers/event/statsbomb/serializer.py | 59 +++++++++++++++++-- 7 files changed, 62 insertions(+), 15 deletions(-) diff --git a/examples/datasets/statsbomb.py b/examples/datasets/statsbomb.py index a52a65b8..10cc69c7 100644 --- a/examples/datasets/statsbomb.py +++ b/examples/datasets/statsbomb.py @@ -17,7 +17,7 @@ def main(): dataset = datasets.load("statsbomb", { #"event_types": ["pass", "take_on", "carry", "shot"] - }, match_id=15946) + })#, match_id=15946) with performance_logging("transform", logger=logger): # convert to TRACAB coordinates diff --git a/kloppy/helpers.py b/kloppy/helpers.py index d3b0ed9a..2738b011 100644 --- a/kloppy/helpers.py +++ b/kloppy/helpers.py @@ -1,6 +1,6 @@ 
from typing import Callable, TypeVar, Dict -from . import TRACABSerializer, MetricaTrackingSerializer, EPTSSerializer, StatsbombSerializer +from . import TRACABSerializer, MetricaTrackingSerializer, EPTSSerializer, StatsBombSerializer from .domain import ( Dataset, Frame, Event, TrackingDataset, Transformer, Orientation, PitchDimensions, Dimension, EventDataset, PassEvent, CarryEvent, PassResult, EventType @@ -50,7 +50,7 @@ def load_epts_tracking_data(meta_data_filename: str, raw_data_filename: str, opt def load_statsbomb_event_data(lineup_filename: str, raw_data_filename: str, options: dict = None) -> EventDataset: - serializer = StatsbombSerializer() + serializer = StatsBombSerializer() with open(lineup_filename, "rb") as lineup_data, \ open(raw_data_filename, "rb") as raw_data: diff --git a/kloppy/infra/datasets/event/statsbomb.py b/kloppy/infra/datasets/event/statsbomb.py index f6031763..3abdaa0d 100644 --- a/kloppy/infra/datasets/event/statsbomb.py +++ b/kloppy/infra/datasets/event/statsbomb.py @@ -2,7 +2,7 @@ from typing import Dict, Type from ..core.builder import DatasetBuilder -from ...serializers.event import EventDataSerializer, StatsbombSerializer +from ...serializers.event import EventDataSerializer, StatsBombSerializer # 3749133 / 38412 @@ -15,9 +15,9 @@ def get_dataset_urls(self,**kwargs) -> Dict[str, str]: match_id = kwargs.get('match_id', '15946') return { - 'raw_data': f'https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/{match_id}.json', + 'event_data': f'https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/{match_id}.json', 'lineup_data': f'https://raw.githubusercontent.com/statsbomb/open-data/master/data/lineups/{match_id}.json' } def get_serializer_cls(self) -> Type[EventDataSerializer]: - return StatsbombSerializer + return StatsBombSerializer diff --git a/kloppy/infra/serializers/__init__.py b/kloppy/infra/serializers/__init__.py index 652d684f..cb142202 100644 --- 
a/kloppy/infra/serializers/__init__.py +++ b/kloppy/infra/serializers/__init__.py @@ -1,3 +1,3 @@ from .tracking import TrackingDataSerializer, TRACABSerializer, MetricaTrackingSerializer, EPTSSerializer -from .event import StatsbombSerializer +from .event import StatsBombSerializer # NOT YET: from .event import EventDataSerializer, MetricaEventSerializer diff --git a/kloppy/infra/serializers/event/__init__.py b/kloppy/infra/serializers/event/__init__.py index 008eb204..ec6d16c8 100644 --- a/kloppy/infra/serializers/event/__init__.py +++ b/kloppy/infra/serializers/event/__init__.py @@ -1,3 +1,3 @@ from .base import EventDataSerializer #from .metrica import MetricaEventSerializer -from .statsbomb import StatsbombSerializer \ No newline at end of file +from .statsbomb import StatsBombSerializer \ No newline at end of file diff --git a/kloppy/infra/serializers/event/statsbomb/__init__.py b/kloppy/infra/serializers/event/statsbomb/__init__.py index 1cf8711e..87c7f42e 100644 --- a/kloppy/infra/serializers/event/statsbomb/__init__.py +++ b/kloppy/infra/serializers/event/statsbomb/__init__.py @@ -1 +1 @@ -from .serializer import StatsbombSerializer \ No newline at end of file +from .serializer import StatsBombSerializer \ No newline at end of file diff --git a/kloppy/infra/serializers/event/statsbomb/serializer.py b/kloppy/infra/serializers/event/statsbomb/serializer.py index 32a01aba..cc279d6f 100644 --- a/kloppy/infra/serializers/event/statsbomb/serializer.py +++ b/kloppy/infra/serializers/event/statsbomb/serializer.py @@ -146,12 +146,16 @@ def _parse_take_on(take_on_dict: Dict) -> Dict: def _determine_xy_fidelity_versions(events: List[Dict]) -> Tuple[int, int]: + """ + Find out if x and y are integers disguised as floats + """ shot_fidelity_version = 1 xy_fidelity_version = 1 for event in events: if 'location' in event: x, y = event['location'] - if abs(int(x) - x) + abs(int(y) - y) > 0: + + if not x.is_integer() or not y.is_integer(): event_type = event['type']['id'] 
if event_type == SB_EVENT_TYPE_SHOT: shot_fidelity_version = 2 @@ -160,22 +164,65 @@ def _determine_xy_fidelity_versions(events: List[Dict]) -> Tuple[int, int]: return shot_fidelity_version, xy_fidelity_version -class StatsbombSerializer(EventDataSerializer): +class StatsBombSerializer(EventDataSerializer): @staticmethod def __validate_inputs(inputs: Dict[str, Readable]): - if "raw_data" not in inputs: - raise ValueError("Please specify a value for input 'raw_data'") + if "event_data" not in inputs: + raise ValueError("Please specify a value for input 'event_data'") if "lineup_data" not in inputs: raise ValueError("Please specify a value for input 'lineup_data'") def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> EventDataset: + """ + Deserialize StatsBomb event data into an `EventDataset`. + + Parameters + ---------- + inputs : dict + input `event_data` should point to a `Readable` object containing + the 'json' formatted event data. input `lineup_data` should point + to a `Readable` object containing the 'json' formatted lineup data. + options : dict + Options for deserialization of the StatsBomb file. Possible options are + `event_types` (list of event types) to specify the event types that + should be returned. Valid types: "shot", "pass", "carry", "take_on" and + "generic". Generic is everything other than the first 4. Those events + are barely parsed. This type of event can be used to do the parsing + yourself. + Every event has a 'raw_event' attribute which contains the original + dictionary. 
+ Returns + ------- + dataset : EventDataset + Raises + ------ + + See Also + -------- + + Examples + -------- + >>> serializer = StatsBombSerializer() + >>> with open("events/12312312.json", "rb") as event_data, \ + >>> open("lineups/123123123.json", "rb") as lineup_data: + >>> + >>> dataset = serializer.deserialize( + >>> inputs={ + >>> 'event_data': event_data, + >>> 'lineup_data': lineup_data + >>> }, + >>> options={ + >>> 'event_types': ["pass", "take_on", "carry", "shot"] + >>> } + >>> ) + """ self.__validate_inputs(inputs) with performance_logging("load data", logger=logger): - raw_events = json.load(inputs['raw_data']) + raw_events = json.load(inputs['event_data']) home_lineup, away_lineup = json.load(inputs['lineup_data']) shot_fidelity_version, xy_fidelity_version = _determine_xy_fidelity_versions(raw_events) - logger.info(f"Determined Fidelity versions to shot: {shot_fidelity_version} / XY: {xy_fidelity_version}") + logger.info(f"Determined Fidelity versions: shot v{shot_fidelity_version} / XY v{xy_fidelity_version}") with performance_logging("parse data", logger=logger): home_player_map = { From 201e9bd63f7d75761a58f235e40e642b27e5f1b4 Mon Sep 17 00:00:00 2001 From: Koen Vossen Date: Tue, 2 Jun 2020 16:59:38 +0200 Subject: [PATCH 7/7] Statsbomb: even more fixes --- README.md | 33 +++++++++++++++++-- examples/datasets/metrica.py | 6 ++++ examples/datasets/statsbomb.py | 8 ++++- examples/epts/load_epts_into_pandas.py | 6 ++++ examples/playing_time.py | 4 +++ kloppy/helpers.py | 11 ++++--- .../serializers/event/statsbomb/serializer.py | 2 ++ kloppy/infra/utils.py | 2 +- 8 files changed, 63 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 36850173..5e82cb76 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ from kloppy import ( load_metrica_tracking_data, load_tracab_tracking_data, load_epts_tracking_data, + load_statsbomb_event_data, to_pandas, transform ) @@ -53,6 +54,10 @@ dataset = load_tracab_tracking_data('meta.xml', 
'raw_data.txt') # or epts dataset = load_epts_tracking_data('meta.xml', 'raw_data.txt') +# or event data +dataset = load_statsbomb_event_data('event_data.json', 'lineup.json') + + dataset = transform(dataset, pitch_dimensions=[[0, 108], [-34, 34]]) pandas_data_frame = to_pandas(dataset) ``` @@ -143,6 +148,30 @@ with open("raw_data.txt", "rb") as raw, \ ``` +or StatsBomb event data +```python +from kloppy import StatsBombSerializer + +serializer = StatsBombSerializer() + +with open("events/123123.json", "rb") as event_data, \ + open("lineup/123123.json", "rb") as lineup_data: + + dataset = serializer.deserialize( + inputs={ + 'event_data': event_data, + 'lineup_data': lineup_data + }, + options={ + "event_types": ["pass", "shot", "carry", "take_on"] + } + ) + + # start working with dataset +``` + + + ### Transform the pitch dimensions Data providers use their own pitch dimensions. Some use actual meters while others use 100x100. Use the Transformer to get from one pitch dimensions to another one. ```python @@ -195,7 +224,7 @@ Data models - [ ] Automated tests - [x] Pitch - [x] Tracking -- [ ] Event +- [x] Event Tracking data (de)serializers - [x] Automated tests @@ -207,7 +236,7 @@ Tracking data (de)serializers Event data (de)serializers - [ ] Automated tests - [ ] OPTA -- [ ] StatsBomb +- [x] StatsBomb - [ ] MetricaSports Transformers diff --git a/examples/datasets/metrica.py b/examples/datasets/metrica.py index 3564119b..e0ab51b4 100644 --- a/examples/datasets/metrica.py +++ b/examples/datasets/metrica.py @@ -1,3 +1,6 @@ +import logging +import sys + from kloppy import datasets, to_pandas @@ -6,6 +9,9 @@ def main(): This example shows the use of Metrica datasets, and how we can pass argument to the dataset loader. 
""" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") + # The metrica dataset loader loads by default the 'game1' dataset dataset = datasets.load("metrica_tracking", options={'sample_rate': 1./12, 'limit': 10}) diff --git a/examples/datasets/statsbomb.py b/examples/datasets/statsbomb.py index 10cc69c7..880fb5c1 100644 --- a/examples/datasets/statsbomb.py +++ b/examples/datasets/statsbomb.py @@ -1,7 +1,7 @@ import logging import sys -from kloppy import datasets, transform, to_pandas +from kloppy import datasets, transform, to_pandas, load_statsbomb_event_data from kloppy.infra.utils import performance_logging @@ -32,6 +32,12 @@ def main(): print(dataframe[:100].to_string()) + # or load it using the helper from disk + dataset = load_statsbomb_event_data( + "events/15946.json", + "lineups/15946.json" + ) + if __name__ == "__main__": main() diff --git a/examples/epts/load_epts_into_pandas.py b/examples/epts/load_epts_into_pandas.py index 3fd40ba8..0ace950a 100644 --- a/examples/epts/load_epts_into_pandas.py +++ b/examples/epts/load_epts_into_pandas.py @@ -1,3 +1,6 @@ +import logging +import sys + from pandas import DataFrame from kloppy.infra.serializers.tracking.epts.meta_data import load_meta_data as epts_load_meta_data @@ -17,6 +20,9 @@ def main(): 4. Try to consume items from generator twice 4. 
Convert the records into a pandas dataframe for easy data mangling """ + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") + # step 1: load metadata with open("epts_meta.xml", "rb") as meta_fp: diff --git a/examples/playing_time.py b/examples/playing_time.py index 746e6034..1b664467 100644 --- a/examples/playing_time.py +++ b/examples/playing_time.py @@ -1,3 +1,5 @@ +import logging +import sys from collections import Counter from kloppy import datasets @@ -8,6 +10,8 @@ def main(): """ This example shows how to determine playing time """ + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") dataset = datasets.load("metrica_tracking", options={'sample_rate': 1./25}) diff --git a/kloppy/helpers.py b/kloppy/helpers.py index 2738b011..d9115fbd 100644 --- a/kloppy/helpers.py +++ b/kloppy/helpers.py @@ -49,15 +49,15 @@ def load_epts_tracking_data(meta_data_filename: str, raw_data_filename: str, opt ) -def load_statsbomb_event_data(lineup_filename: str, raw_data_filename: str, options: dict = None) -> EventDataset: +def load_statsbomb_event_data(event_data_filename: str, lineup_data_filename: str, options: dict = None) -> EventDataset: serializer = StatsBombSerializer() - with open(lineup_filename, "rb") as lineup_data, \ - open(raw_data_filename, "rb") as raw_data: + with open(event_data_filename, "rb") as event_data, \ + open(lineup_data_filename, "rb") as lineup_data: return serializer.deserialize( inputs={ - 'lineup_data': lineup_data, - 'raw_data': raw_data + 'event_data': event_data, + 'lineup_data': lineup_data }, options=options ) @@ -166,6 +166,7 @@ def to_pandas(dataset: Dataset, _record_converter: Callable = None) -> 'DataFram 'load_tracab_tracking_data', 'load_metrica_tracking_data', 'load_epts_tracking_data', + 'load_statsbomb_event_data', 'to_pandas', 'transform' ] diff --git 
a/kloppy/infra/serializers/event/statsbomb/serializer.py b/kloppy/infra/serializers/event/statsbomb/serializer.py index cc279d6f..77abfd60 100644 --- a/kloppy/infra/serializers/event/statsbomb/serializer.py +++ b/kloppy/infra/serializers/event/statsbomb/serializer.py @@ -217,6 +217,8 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Even >>> ) """ self.__validate_inputs(inputs) + if not options: + options = {} with performance_logging("load data", logger=logger): raw_events = json.load(inputs['event_data']) diff --git a/kloppy/infra/utils.py b/kloppy/infra/utils.py index 5dfa0d5e..18a2b6b4 100644 --- a/kloppy/infra/utils.py +++ b/kloppy/infra/utils.py @@ -24,7 +24,7 @@ def performance_logging(description: str, counter: int = None, logger=None): extra = f" ({int(counter / took * 1000)}items/sec)" unit = "ms" - if took < 0.01: + if took < 0.1: took *= 1000 unit = "us"