Skip to content

Commit

Permalink
🔨 refactor metadata classes
Browse files Browse the repository at this point in the history
  • Loading branch information
Marigold committed May 27, 2024
1 parent ceff3c0 commit d36e318
Showing 1 changed file with 46 additions and 114 deletions.
160 changes: 46 additions & 114 deletions lib/catalog/owid/catalog/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@
import re
from dataclasses import dataclass, field, is_dataclass
from pathlib import Path
from typing import Any, Dict, List, Literal, NewType, Optional, Union
from typing import Any, Dict, List, Literal, NewType, Optional, Type, TypeVar, Union

import mistune
import pandas as pd
from dataclasses_json import dataclass_json
from typing_extensions import Self

from .processing_log import ProcessingLog
from .utils import pruned_json
Expand All @@ -26,24 +27,52 @@
YearDateLatest = NewType("YearDateLatest", str)


@pruned_json
@dataclass_json
@dataclass
class License:
name: Optional[str] = None
url: Optional[str] = None
T = TypeVar("T")


class Base:
    """Behaviour shared by the metadata dataclasses in this module.

    Bundles hashing, in-place updates, copying and JSON file (de)serialisation
    so that each metadata class does not have to repeat them.
    """

    def __init_subclass__(cls, **kwargs: Any) -> None:
        """Pin the inherited ``__hash__`` into every subclass namespace.

        ``@dataclass`` with the default ``eq=True`` / ``frozen=False`` sets
        ``__hash__`` to ``None`` unless ``__hash__`` is defined in the class's
        *own* body; a hash that is merely inherited would be discarded and the
        subclass would become unhashable. This hook runs at class creation,
        i.e. before the decorator is applied, so the decorator then sees an
        explicit ``__hash__`` and leaves it alone.
        """
        super().__init_subclass__(**kwargs)
        # Leave alone any subclass that declares its own __hash__, including
        # the implicit ``__hash__ = None`` Python adds when only __eq__ is defined.
        if "__hash__" not in cls.__dict__:
            cls.__hash__ = Base.__hash__  # type: ignore[assignment]

    def __hash__(self):
        """Hash that uniquely identifies an object (without needing frozen dataclass)."""
        return _hash_dataclass(self)

    # NOTE(review): `to_dict` / `from_dict` are stubs here; presumably the real
    # implementations are injected by `@dataclass_json` on each subclass - confirm.
    def to_dict(self) -> Dict[str, Any]:
        ...

    @classmethod
    def from_dict(cls: Type[T], d: Dict[str, Any]) -> T:
        ...

    def update(self, **kwargs: Any) -> None:
        """Update object with new values.

        Keyword arguments whose value is ``None`` are ignored, so an attribute
        cannot be reset to ``None`` through this method.
        """
        for key, value in kwargs.items():
            if value is not None:
                setattr(self, key, value)

    def copy(self, deep=True) -> Self:
        """Return a copy of the object.

        Args:
            deep: when true (default) copy via ``_deepcopy_dataclass``;
                otherwise make a shallow copy with ``dataclasses.replace``.
        """
        if not deep:
            return dataclasses.replace(self)  # type: ignore
        else:
            return _deepcopy_dataclass(self)

    def save(self, filename: Union[str, Path]) -> None:
        """Write ``self.to_dict()`` to ``filename`` as indented JSON.

        Values the JSON encoder cannot serialise are converted with ``str``.
        """
        filename = Path(filename).as_posix()
        with open(filename, "w", encoding="utf-8") as ostream:
            json.dump(self.to_dict(), ostream, indent=2, default=str)

    @classmethod
    def load(cls, filename: Union[str, Path]) -> Self:
        """Read a JSON file and build an instance from it via ``from_dict``."""
        with open(filename, encoding="utf-8") as istream:
            return cls.from_dict(json.load(istream))


@pruned_json
@dataclass_json
@dataclass
class License(Base):
    """License described by an optional name and an optional URL."""

    name: Optional[str] = None
    url: Optional[str] = None

    # Declared in the class body on purpose: `@dataclass` (eq=True, frozen=False)
    # sets `__hash__` to None unless the class itself defines it, which would
    # make instances unhashable even though `Base` provides a hash.
    __hash__ = Base.__hash__

    def __bool__(self):
        """Return True when at least one of `name` or `url` is non-empty."""
        return bool(self.name or self.url)

Expand All @@ -52,7 +81,7 @@ def __bool__(self):
@pruned_json
@dataclass_json
@dataclass
class Source:
class Source(Base):
"""Notes on importing sources to grapher:
- Field `source.description` gets mapped to `Internal notes`, but we rather use it for `additional_info`
- The most important fields are `published_by` and `additional_info`
Expand All @@ -73,27 +102,11 @@ class Source:
# we're keeping both for the time being. We might consolidate them in the future
published_by: Optional[str] = None

def __hash__(self):
"""Hash that uniquely identifies a source."""
return _hash_dataclass(self)

def to_dict(self) -> Dict[str, Any]:
...

@staticmethod
def from_dict(d: Dict[str, Any]) -> "Source":
...

def update(self, **kwargs: Dict[str, Any]) -> None:
for key, value in kwargs.items():
if value is not None:
setattr(self, key, value)


@pruned_json
@dataclass_json
@dataclass
class Origin:
class Origin(Base):
# Producer name
# Name of the institution or the author(s) that produced the data product.
producer: str
Expand Down Expand Up @@ -127,10 +140,6 @@ class Origin:
# License of the dataset
license: Optional[License] = None

def __hash__(self):
"""Hash that uniquely identifies an origin."""
return _hash_dataclass(self)

def __post_init__(self):
if self.date_published:
# convert date or int to string
Expand All @@ -140,18 +149,6 @@ def __post_init__(self):
if self.date_published != "latest" and not is_year_or_date(self.date_published):
raise ValueError("date_published should be either a year or a date or latest")

def to_dict(self) -> Dict[str, Any]:
...

@staticmethod
def from_dict(d: Dict[str, Any]) -> "Origin":
...

def update(self, **kwargs: Dict[str, Any]) -> None:
for key, value in kwargs.items():
if value is not None:
setattr(self, key, value)


# Minor is for cases where we only harmonized the countries or similar
# Major is for cases where we do more, like create new aggregations, combine multiple indicators, etc.
Expand All @@ -167,7 +164,7 @@ def update(self, **kwargs: Dict[str, Any]) -> None:
@pruned_json
@dataclass_json
@dataclass
class FaqLink:
class FaqLink(Base):
gdoc_id: str
fragment_id: str

Expand All @@ -178,7 +175,7 @@ class FaqLink:
@pruned_json
@dataclass_json
@dataclass
class VariablePresentationMeta:
class VariablePresentationMeta(Base):
# Any fields of grapher config can be set here - title and subtitle *should* be set whenever possible
grapher_config: Optional[GrapherConfig] = None
# The text for the header of the data page
Expand All @@ -197,19 +194,11 @@ class VariablePresentationMeta:
# List of google doc ids + fragment id
faqs: List[FaqLink] = field(default_factory=list)

def __hash__(self):
"""Hash that uniquely identifies VariablePresentationMeta."""
return _hash_dataclass(self)

@staticmethod
def from_dict(d: Dict[str, Any]) -> "VariablePresentationMeta":
...


@pruned_json
@dataclass_json
@dataclass
class VariableMeta:
class VariableMeta(Base):
"""Allowed fields for `display` attribute used for grapher:
name
zeroDay
Expand Down Expand Up @@ -267,17 +256,6 @@ class VariableMeta:
# List of categories for ordinal type indicators
sort: List[str] = field(default_factory=list)

def __hash__(self):
"""Hash that uniquely identifies VariableMeta."""
return _hash_dataclass(self)

def to_dict(self) -> Dict[str, Any]:
...

@staticmethod
def from_dict(d: Dict[str, Any]) -> "VariableMeta":
...

@property
def schema_version(self) -> int:
"""Schema version is used to easily understand everywhere what metadata standard was used
Expand All @@ -298,18 +276,11 @@ def _repr_html_(self):
{}
""".format(getattr(self, "_name", None), to_html(record))

def copy(self, deep=True) -> "VariableMeta":
"""Return a copy of the VariableMeta object."""
if not deep:
return dataclasses.replace(self)
else:
return _deepcopy_dataclass(self)


@pruned_json
@dataclass_json
@dataclass
class DatasetMeta:
class DatasetMeta(Base):
"""
The metadata for this entire dataset kept in JSON (e.g. mydataset/index.json).
Expand Down Expand Up @@ -339,10 +310,6 @@ class DatasetMeta:
# an md5 checksum of the ingredients used to make this dataset
source_checksum: Optional[str] = None

def __hash__(self):
"""Hash that uniquely identifies DatasetMeta."""
return _hash_dataclass(self)

def __post_init__(self) -> None:
"""Imply version from publication_date or publication_year if not given
in __init__."""
Expand All @@ -356,23 +323,6 @@ def __post_init__(self) -> None:
else:
self.version = None

def save(self, filename: Union[str, Path]) -> None:
filename = Path(filename).as_posix()
with open(filename, "w") as ostream:
json.dump(self.to_dict(), ostream, indent=2, default=str)

@classmethod
def load(cls, filename: str) -> "DatasetMeta":
with open(filename) as istream:
return cls.from_dict(json.load(istream))

def to_dict(self) -> Dict[str, Any]:
...

@staticmethod
def from_dict(d: Dict[str, Any]) -> "DatasetMeta":
...

def _params_yaml(self) -> dict:
"""Parameters passed to YAML for dynamic interpolation."""
params = {}
Expand Down Expand Up @@ -432,7 +382,7 @@ def uri(self) -> str:
@pruned_json
@dataclass_json
@dataclass
class TableMeta:
class TableMeta(Base):
# data about this table
short_name: Optional[str] = None
title: Optional[str] = None
Expand All @@ -442,24 +392,13 @@ class TableMeta:
dataset: Optional[DatasetMeta] = field(compare=False, default=None)
primary_key: List[str] = field(default_factory=list)

def __hash__(self):
"""Hash that uniquely identifies TableMeta."""
return _hash_dataclass(self)

@property
def checked_name(self) -> str:
if not self.short_name:
raise Exception("table has no short_name")

return self.short_name

def to_dict(self) -> Dict[str, Any]:
...

@staticmethod
def from_dict(dict: Dict[str, Any]) -> "TableMeta":
...

def _repr_html_(self):
# Render a nice display of the table metadata
record = self.to_dict()
Expand All @@ -470,13 +409,6 @@ def _repr_html_(self):
{}
""".format(short_name, to_html(record))

def copy(self, deep=True) -> "TableMeta":
"""Return a copy of the TableMeta object."""
if not deep:
return dataclasses.replace(self)
else:
return _deepcopy_dataclass(self)


def to_html(record: Any) -> Optional[str]:
if isinstance(record, dict):
Expand Down

0 comments on commit d36e318

Please sign in to comment.