Skip to content

Commit

Permalink
fix: common bug in parsing of UTC datetimes (#373)
Browse files (browse the repository at this point in the history)
  • Loading branch information
probberechts authored Dec 17, 2024
1 parent 45ab84c commit b0f56e1
Show file tree
Hide file tree
Showing 15 changed files with 84 additions and 71 deletions.
14 changes: 7 additions & 7 deletions kloppy/infra/serializers/event/datafactory/deserializer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import json
import logging
from datetime import timedelta, datetime, timezone
from dateutil.parser import parse, _parser
from dataclasses import replace
from typing import Dict, List, Tuple, Union, IO, NamedTuple
from datetime import datetime, timedelta, timezone
from typing import IO, Dict, List, NamedTuple, Tuple, Union

from kloppy.domain import (
AttackingDirection,
Expand Down Expand Up @@ -41,7 +40,6 @@
from kloppy.infra.serializers.event.deserializer import EventDataDeserializer
from kloppy.utils import Readable, performance_logging


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -435,7 +433,7 @@ def deserialize(self, inputs: DatafactoryInputs) -> EventDataset:
+ status_update["time"]
+ match["stadiumGMT"],
"%Y%m%d%H:%M:%S%z",
).astimezone(timezone.utc)
)
half = status_update["t"]["half"]
if status_update["type"] == DF_EVENT_TYPE_STATUS_MATCH_START:
half = 1
Expand All @@ -458,8 +456,10 @@ def deserialize(self, inputs: DatafactoryInputs) -> EventDataset:
date = match["date"]
if date:
# TODO: scheduledStart and stadiumGMT should probably be used here too
date = parse(date).astimezone(timezone.utc)
except _parser.ParserError:
date = datetime.strptime(date, "%Y%m%d").replace(
tzinfo=timezone.utc
)
except ValueError:
date = None
game_week = match.get("week", None)
if game_week:
Expand Down
7 changes: 3 additions & 4 deletions kloppy/infra/serializers/event/sportec/deserializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from typing import Dict, List, NamedTuple, IO
from datetime import timedelta, datetime, timezone
import logging
from dateutil.parser import parse
from lxml import objectify

from kloppy.domain import (
Expand Down Expand Up @@ -314,7 +313,7 @@ def _event_chain_from_xml_elm(event_elm):


def _parse_datetime(dt_str: str) -> datetime:
return parse(dt_str).astimezone(timezone.utc)
return datetime.fromisoformat(dt_str)


def _get_event_qualifiers(event_chain: Dict) -> List[Qualifier]:
Expand Down Expand Up @@ -469,9 +468,9 @@ def deserialize(self, inputs: SportecEventDataInputs) -> EventDataset:
event_root = objectify.fromstring(inputs.event_data.read())

with performance_logging("parse data", logger=logger):
date = parse(
date = datetime.fromisoformat(
match_root.MatchInformation.General.attrib["KickoffTime"]
).astimezone(timezone.utc)
)
game_week = match_root.MatchInformation.General.attrib["MatchDay"]
game_id = match_root.MatchInformation.General.attrib["MatchId"]

Expand Down
18 changes: 13 additions & 5 deletions kloppy/infra/serializers/event/statsperform/deserializer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import pytz
import math
from typing import Dict, List, NamedTuple, IO, Optional
import logging
from datetime import datetime, timedelta

import pytz

from kloppy.domain import (
EventDataset,
Team,
Expand Down Expand Up @@ -795,11 +796,18 @@ def deserialize(self, inputs: StatsPerformInputs) -> EventDataset:
):
if raw_event.type_id == EVENT_TYPE_SHOT_GOAL:
if 374 in raw_event.qualifiers:
# Qualifier 374 specifies the actual time of the shot for all goal events.
# It is expressed in the London time zone (Europe/London) for both MA3 and F24 feeds.
naive_datetime = datetime.strptime(
raw_event.qualifiers[374],
"%Y-%m-%d %H:%M:%S.%f",
)
timezone = pytz.timezone("Europe/London")
aware_datetime = timezone.localize(
naive_datetime
)
generic_event_kwargs["timestamp"] = (
datetime.strptime(
raw_event.qualifiers[374],
"%Y-%m-%d %H:%M:%S.%f",
).replace(tzinfo=pytz.utc)
aware_datetime.astimezone(pytz.utc)
- period.start_timestamp
)
shot_event_kwargs = _parse_shot(raw_event)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def extract_score(self) -> Optional[Score]:
"""Return the score of the game."""
return None

def extract_date(self) -> Optional[str]:
def extract_date(self) -> Optional[datetime]:
"""Return the date of the game."""
return None

Expand Down
25 changes: 16 additions & 9 deletions kloppy/infra/serializers/event/statsperform/parsers/f24_xml.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""XML parser for Opta F24 feeds."""
import pytz
from datetime import datetime, timezone

from datetime import datetime
from typing import List, Optional
from dateutil.parser import parse

from .base import OptaXMLParser, OptaEvent
import pytz

from .base import OptaEvent, OptaXMLParser


def _parse_f24_datetime(dt_str: str) -> datetime:
Expand All @@ -15,9 +16,10 @@ def zero_pad_milliseconds(timestamp):
return ".".join(parts[:-1] + ["{:03d}".format(int(parts[-1]))])

dt_str = zero_pad_milliseconds(dt_str)
return datetime.strptime(dt_str, "%Y-%m-%dT%H:%M:%S.%f").replace(
tzinfo=pytz.utc
)
naive_datetime = datetime.strptime(dt_str, "%Y-%m-%dT%H:%M:%S.%f")
timezone = pytz.timezone("Europe/London")
aware_datetime = timezone.localize(naive_datetime)
return aware_datetime.astimezone(pytz.utc)


class F24XMLParser(OptaXMLParser):
Expand Down Expand Up @@ -54,11 +56,16 @@ def extract_events(self) -> List[OptaEvent]:
for event in game_elm.iterchildren("Event")
]

def extract_date(self) -> Optional[str]:
def extract_date(self) -> Optional[datetime]:
"""Return the date of the game."""
game_elm = self.root.find("Game")
if game_elm and "game_date" in game_elm.attrib:
return parse(game_elm.attrib["game_date"]).astimezone(timezone.utc)
naive_datetime = datetime.strptime(
game_elm.attrib["game_date"], "%Y-%m-%dT%H:%M:%S"
)
timezone = pytz.timezone("Europe/London")
aware_datetime = timezone.localize(naive_datetime)
return aware_datetime.astimezone(pytz.utc)
else:
return None

Expand Down
15 changes: 8 additions & 7 deletions kloppy/infra/serializers/event/statsperform/parsers/ma1_json.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""JSON parser for Stats Perform MA1 feeds."""
import pytz

from datetime import datetime, timezone
from typing import Any, Optional, List, Tuple, Dict
from typing import Any, Dict, List, Optional, Tuple

from kloppy.domain import Period, Score, Team, Ground, Player
from kloppy.domain import Ground, Period, Player, Score, Team
from kloppy.exceptions import DeserializationError

from .base import OptaJSONParser


Expand All @@ -30,12 +31,12 @@ def extract_periods(self) -> List[Period]:
id=period["id"],
start_timestamp=datetime.strptime(
period_start_raw, "%Y-%m-%dT%H:%M:%SZ"
).replace(tzinfo=pytz.utc)
).replace(tzinfo=timezone.utc)
if period_start_raw
else None,
end_timestamp=datetime.strptime(
period_end_raw, "%Y-%m-%dT%H:%M:%SZ"
).replace(tzinfo=pytz.utc)
).replace(tzinfo=timezone.utc)
if period_end_raw
else None,
)
Expand Down Expand Up @@ -95,12 +96,12 @@ def extract_lineups(self) -> Tuple[Team, Team]:
raise DeserializationError("Lineup incomplete")
return home_team, away_team

def extract_date(self) -> Optional[str]:
def extract_date(self) -> Optional[datetime]:
"""Return the date of the game."""
if "matchInfo" in self.root and "date" in self.root["matchInfo"]:
return datetime.strptime(
self.root["matchInfo"]["date"], "%Y-%m-%dZ"
).astimezone(timezone.utc)
).replace(tzinfo=timezone.utc)
else:
return None

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""XML parser for Stats Perform MA1 feeds."""
import pytz
from datetime import datetime
from datetime import datetime, timezone
from typing import Any, Optional, List, Dict, Tuple

from kloppy.domain import Period, Score, Team, Ground, Player
Expand All @@ -22,10 +21,10 @@ def extract_periods(self) -> List[Period]:
id=int(period.get("id")),
start_timestamp=datetime.strptime(
period.get("start"), "%Y-%m-%dT%H:%M:%SZ"
).replace(tzinfo=pytz.utc),
).replace(tzinfo=timezone.utc),
end_timestamp=datetime.strptime(
period.get("end"), "%Y-%m-%dT%H:%M:%SZ"
).replace(tzinfo=pytz.utc),
).replace(tzinfo=timezone.utc),
)
)
return parsed_periods
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""JSON parser for Stats Perform MA3 feeds."""
import pytz
from datetime import datetime
from datetime import datetime, timezone
from typing import List

from .base import OptaJSONParser, OptaEvent
Expand All @@ -9,12 +8,12 @@
def _parse_ma3_datetime(dt_str: str) -> datetime:
try:
return datetime.strptime(dt_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(
tzinfo=pytz.utc
tzinfo=timezone.utc
)

except ValueError:
return datetime.strptime(dt_str, "%Y-%m-%dT%H:%M:%SZ").replace(
tzinfo=pytz.utc
tzinfo=timezone.utc
)


Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""XML parser for Stats Perform MA3 feeds."""
import pytz
from datetime import datetime
from datetime import datetime, timezone
from typing import List

from .base import OptaXMLParser, OptaEvent
Expand All @@ -9,11 +8,11 @@
def _parse_ma3_datetime(dt_str: str) -> datetime:
try:
return datetime.strptime(dt_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(
tzinfo=pytz.utc
tzinfo=timezone.utc
)
except ValueError:
return datetime.strptime(dt_str, "%Y-%m-%dT%H:%M:%SZ").replace(
tzinfo=pytz.utc
tzinfo=timezone.utc
)


Expand Down
8 changes: 4 additions & 4 deletions kloppy/infra/serializers/event/wyscout/deserializer_v3.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import json
import logging
from dataclasses import replace
from datetime import timedelta, timezone
from datetime import datetime, timedelta, timezone
from enum import Enum
from typing import Dict, List, Optional

from dateutil.parser import parse

from kloppy.domain import (
BodyPart,
BodyPartQualifier,
Expand Down Expand Up @@ -709,7 +707,9 @@ def deserialize(self, inputs: WyscoutInputs) -> EventDataset:
)
date = raw_events["match"].get("dateutc")
if date:
date = parse(date).astimezone(timezone.utc)
date = datetime.strptime(date, "%Y-%m-%d %H:%M:%S").replace(
tzinfo=timezone.utc
)
game_week = raw_events["match"].get("gameweek")
if game_week:
game_week = str(game_week)
Expand Down
18 changes: 10 additions & 8 deletions kloppy/infra/serializers/tracking/skillcorner.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import json
import logging
from datetime import timedelta, timezone
from dateutil.parser import parse
import warnings
from typing import NamedTuple, IO, Optional, Union, Dict
from collections import Counter
import numpy as np
import json
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import IO, Dict, NamedTuple, Optional, Union

import numpy as np

from kloppy.domain import (
attacking_direction_from_frame,
AttackingDirection,
DatasetFlag,
Frame,
Expand All @@ -18,14 +17,15 @@
Orientation,
Period,
Player,
PlayerData,
Point,
Point3D,
PositionType,
Provider,
Score,
Team,
TrackingDataset,
PlayerData,
attacking_direction_from_frame,
)
from kloppy.infra.serializers.tracking.deserializer import (
TrackingDataDeserializer,
Expand Down Expand Up @@ -367,7 +367,9 @@ def deserialize(self, inputs: SkillCornerInputs) -> TrackingDataset:

date = metadata.get("date_time")
if date:
date = parse(date).astimezone(timezone.utc)
date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ").replace(
tzinfo=timezone.utc
)

game_id = metadata.get("id")
if game_id:
Expand Down
7 changes: 3 additions & 4 deletions kloppy/infra/serializers/tracking/sportec/deserializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
import warnings
from collections import defaultdict
from typing import NamedTuple, Optional, Union, IO
from datetime import timedelta, timezone
from dateutil.parser import parse
from datetime import datetime, timedelta

from lxml import objectify

Expand Down Expand Up @@ -138,9 +137,9 @@ def deserialize(
]

with performance_logging("parse raw data", logger=logger):
date = parse(
date = datetime.fromisoformat(
match_root.MatchInformation.General.attrib["KickoffTime"]
).astimezone(timezone.utc)
)
game_week = match_root.MatchInformation.General.attrib["MatchDay"]
game_id = match_root.MatchInformation.General.attrib["MatchId"]

Expand Down
Loading

0 comments on commit b0f56e1

Please sign in to comment.