From d36e3182414af5f5a0ead8f2f080fdd0750ed5f5 Mon Sep 17 00:00:00 2001 From: Marigold Date: Mon, 27 May 2024 15:07:48 +0200 Subject: [PATCH] :hammer: refactor metadata classes --- lib/catalog/owid/catalog/meta.py | 160 +++++++++---------------------- 1 file changed, 46 insertions(+), 114 deletions(-) diff --git a/lib/catalog/owid/catalog/meta.py b/lib/catalog/owid/catalog/meta.py index 3cebfed2db40..63bc6a1e9c5b 100644 --- a/lib/catalog/owid/catalog/meta.py +++ b/lib/catalog/owid/catalog/meta.py @@ -10,11 +10,12 @@ import re from dataclasses import dataclass, field, is_dataclass from pathlib import Path -from typing import Any, Dict, List, Literal, NewType, Optional, Union +from typing import Any, Dict, List, Literal, NewType, Optional, Type, TypeVar, Union import mistune import pandas as pd from dataclasses_json import dataclass_json +from typing_extensions import Self from .processing_log import ProcessingLog from .utils import pruned_json @@ -26,24 +27,52 @@ YearDateLatest = NewType("YearDateLatest", str) -@pruned_json -@dataclass_json -@dataclass -class License: - name: Optional[str] = None - url: Optional[str] = None +T = TypeVar("T") + +class Base: def __hash__(self): - """Hash that uniquely identifies a License.""" + """Hash that uniquely identifies an object (without needing frozen dataclass).""" return _hash_dataclass(self) def to_dict(self) -> Dict[str, Any]: ... - @staticmethod - def from_dict(d: Dict[str, Any]) -> "License": + @classmethod + def from_dict(cls: Type[T], d: Dict[str, Any]) -> T: ... + def update(self, **kwargs: Dict[str, Any]) -> None: + """Update object with new values.""" + for key, value in kwargs.items(): + if value is not None: + setattr(self, key, value) + + def copy(self, deep=True) -> Self: + """Return a copy of the object.""" + if not deep: + return dataclasses.replace(self) # type: ignore + else: + return _deepcopy_dataclass(self) + + def save(self, filename: Union[str, Path]) -> None: + filename = Path(filename).as_posix() + with open(filename, "w") as ostream: + json.dump(self.to_dict(), ostream, indent=2, default=str) + + @classmethod + def load(cls, filename: str) -> Self: + with open(filename) as istream: + return cls.from_dict(json.load(istream)) + + +@pruned_json +@dataclass_json +@dataclass +class License(Base): + name: Optional[str] = None + url: Optional[str] = None + def __bool__(self): return bool(self.name or self.url) @@ -52,7 +81,7 @@ def __bool__(self): @pruned_json @dataclass_json @dataclass -class Source: +class Source(Base): """Notes on importing sources to grapher: - Field `source.description` gets mapped to `Internal notes`, but we rather use it for `additional_info` - The most important fields are `published_by` and `additional_info` @@ -73,27 +102,11 @@ class Source: # we're keeping both for the time being. We might consolidate them in the future published_by: Optional[str] = None - def __hash__(self): - """Hash that uniquely identifies a source.""" - return _hash_dataclass(self) - - def to_dict(self) -> Dict[str, Any]: - ... - - @staticmethod - def from_dict(d: Dict[str, Any]) -> "Source": - ... - - def update(self, **kwargs: Dict[str, Any]) -> None: - for key, value in kwargs.items(): - if value is not None: - setattr(self, key, value) - @pruned_json @dataclass_json @dataclass -class Origin: +class Origin(Base): # Producer name # Name of the institution or the author(s) that produced the data product. producer: str @@ -127,10 +140,6 @@ class Origin: # License of the dataset license: Optional[License] = None - def __hash__(self): - """Hash that uniquely identifies an origin.""" - return _hash_dataclass(self) - def __post_init__(self): if self.date_published: # convert date or int to string @@ -140,18 +149,6 @@ def __post_init__(self): if self.date_published != "latest" and not is_year_or_date(self.date_published): raise ValueError("date_published should be either a year or a date or latest") - def to_dict(self) -> Dict[str, Any]: - ... - - @staticmethod - def from_dict(d: Dict[str, Any]) -> "Origin": - ... - - def update(self, **kwargs: Dict[str, Any]) -> None: - for key, value in kwargs.items(): - if value is not None: - setattr(self, key, value) - # Minor is for cases where we only harmonized the countries or similar # Major is for cases where we do more, like create new aggregations, combine multiple indicators, etc. @@ -167,7 +164,7 @@ def update(self, **kwargs: Dict[str, Any]) -> None: @pruned_json @dataclass_json @dataclass -class FaqLink: +class FaqLink(Base): gdoc_id: str fragment_id: str @@ -178,7 +175,7 @@ class FaqLink: @pruned_json @dataclass_json @dataclass -class VariablePresentationMeta: +class VariablePresentationMeta(Base): # Any fields of grapher config can be set here - title and subtitle *should* be set whenever possible grapher_config: Optional[GrapherConfig] = None # The text for the header of the data page @@ -197,19 +194,11 @@ class VariablePresentationMeta: # List of google doc ids + fragment id faqs: List[FaqLink] = field(default_factory=list) - def __hash__(self): - """Hash that uniquely identifies VariablePresentationMeta.""" - return _hash_dataclass(self) - - @staticmethod - def from_dict(d: Dict[str, Any]) -> "VariablePresentationMeta": - ... - @pruned_json @dataclass_json @dataclass -class VariableMeta: +class VariableMeta(Base): """Allowed fields for `display` attribute used for grapher: name zeroDay @@ -267,17 +256,6 @@ class VariableMeta: # List of categories for ordinal type indicators sort: List[str] = field(default_factory=list) - def __hash__(self): - """Hash that uniquely identifies VariableMeta.""" - return _hash_dataclass(self) - - def to_dict(self) -> Dict[str, Any]: - ... - - @staticmethod - def from_dict(d: Dict[str, Any]) -> "VariableMeta": - ... - @property def schema_version(self) -> int: """Schema version is used to easily understand everywhere what metadata standard was used @@ -298,18 +276,11 @@ def _repr_html_(self): {} """.format(getattr(self, "_name", None), to_html(record)) - def copy(self, deep=True) -> "VariableMeta": - """Return a copy of the VariableMeta object.""" - if not deep: - return dataclasses.replace(self) - else: - return _deepcopy_dataclass(self) - @pruned_json @dataclass_json @dataclass -class DatasetMeta: +class DatasetMeta(Base): """ The metadata for this entire dataset kept in JSON (e.g. mydataset/index.json). @@ -339,10 +310,6 @@ class DatasetMeta: # an md5 checksum of the ingredients used to make this dataset source_checksum: Optional[str] = None - def __hash__(self): - """Hash that uniquely identifies DatasetMeta.""" - return _hash_dataclass(self) - def __post_init__(self) -> None: """Imply version from publication_date or publication_year if not given in __init__.""" @@ -356,23 +323,6 @@ def __post_init__(self) -> None: else: self.version = None - def save(self, filename: Union[str, Path]) -> None: - filename = Path(filename).as_posix() - with open(filename, "w") as ostream: - json.dump(self.to_dict(), ostream, indent=2, default=str) - - @classmethod - def load(cls, filename: str) -> "DatasetMeta": - with open(filename) as istream: - return cls.from_dict(json.load(istream)) - - def to_dict(self) -> Dict[str, Any]: - ... - - @staticmethod - def from_dict(d: Dict[str, Any]) -> "DatasetMeta": - ... - def _params_yaml(self) -> dict: """Parameters passed to YAML for dynamic interpolation.""" params = {} @@ -432,7 +382,7 @@ def uri(self) -> str: @pruned_json @dataclass_json @dataclass -class TableMeta: +class TableMeta(Base): # data about this table short_name: Optional[str] = None title: Optional[str] = None @@ -442,10 +392,6 @@ class TableMeta: dataset: Optional[DatasetMeta] = field(compare=False, default=None) primary_key: List[str] = field(default_factory=list) - def __hash__(self): - """Hash that uniquely identifies TableMeta.""" - return _hash_dataclass(self) - @property def checked_name(self) -> str: if not self.short_name: @@ -453,13 +399,6 @@ def checked_name(self) -> str: return self.short_name - def to_dict(self) -> Dict[str, Any]: - ... - - @staticmethod - def from_dict(dict: Dict[str, Any]) -> "TableMeta": - ... - def _repr_html_(self): # Render a nice display of the table metadata record = self.to_dict() @@ -470,13 +409,6 @@ def _repr_html_(self): {} """.format(short_name, to_html(record)) - def copy(self, deep=True) -> "TableMeta": - """Return a copy of the TableMeta object.""" - if not deep: - return dataclasses.replace(self) - else: - return _deepcopy_dataclass(self) - def to_html(record: Any) -> Optional[str]: if isinstance(record, dict):