From f042a528adb40c29d7d5bc3a1df3478709795729 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 11:59:20 +0100 Subject: [PATCH 01/18] =?UTF-8?q?=E2=9C=A8=20mdim/explorer=20model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From 22dc47b1b4db292282c9a1367bfa893a20ce3326 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 12:01:40 +0100 Subject: [PATCH 02/18] wip --- etl/collections/base.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/etl/collections/base.py b/etl/collections/base.py index 0a260f47769..bad21a1aac3 100644 --- a/etl/collections/base.py +++ b/etl/collections/base.py @@ -18,18 +18,14 @@ class ViewConfig: """MDIM/Explorer view configuration.""" dimensions: Dict[str, str] - indicators: "IndicatorSelection" + indicators: "ViewIndicators" config: Optional[Any] metadata: Optional[Any] -class DimensionConfig: - """MDIM/Explorer dimension configuration.""" - - choices: List["ChoiceConfig"] - +class ViewIndicators: + """Indicators in a MDIM/Explorer view.""" -class IndicatorSelection: y: Optional[List["Indicator"]] x: Optional[List["Indicator"]] size: Optional[List["Indicator"]] @@ -41,6 +37,12 @@ class Indicator: display: Dict[str, Any] +class DimensionConfig: + """MDIM/Explorer dimension configuration.""" + + choices: List["ChoiceConfig"] + + class ChoiceConfig: slug: str name: str From e35186a47dcd4d6f292445c265d81f50b3176e71 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 15:33:37 +0100 Subject: [PATCH 03/18] wip --- etl/steps/export/explorers/covid/latest/covid.config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/etl/steps/export/explorers/covid/latest/covid.config.yml b/etl/steps/export/explorers/covid/latest/covid.config.yml index 2a7c737f710..22a63714577 100644 --- a/etl/steps/export/explorers/covid/latest/covid.config.yml +++ b/etl/steps/export/explorers/covid/latest/covid.config.yml @@ -126,10 
+126,10 @@ views: relative: false indicators: y: - - excess_mortality_economist#cumulative_estimated_daily_excess_deaths - - excess_mortality_economist#cumulative_estimated_daily_excess_deaths_ci_95_bot - - excess_mortality_economist#cumulative_estimated_daily_excess_deaths_ci_95_top - - cases_deaths#total_deaths + - path: excess_mortality_economist#cumulative_estimated_daily_excess_deaths + - path: excess_mortality_economist#cumulative_estimated_daily_excess_deaths_ci_95_bot + - path: excess_mortality_economist#cumulative_estimated_daily_excess_deaths_ci_95_top + - path: cases_deaths#total_deaths config: title: Estimated cumulative excess deaths during COVID-19 subtitle: For countries that have not reported all-cause mortality data for a given week, an estimate is shown, with uncertainty interval. If reported data is available, that value only is shown. From 349cdbd6da600a3cc25a453c6ac7549954c75c00 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 16:25:37 +0100 Subject: [PATCH 04/18] wip --- etl/collections/base.py | 103 ++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 25 deletions(-) diff --git a/etl/collections/base.py b/etl/collections/base.py index bad21a1aac3..46e729301cb 100644 --- a/etl/collections/base.py +++ b/etl/collections/base.py @@ -3,47 +3,100 @@ This should be aligned with the MDIM schema. """ +from dataclasses import dataclass from typing import Any, Dict, List, Optional +import yaml +from owid.catalog.meta import MetaBase -class Config: - """Overall MDIM/Explorer config""" - config: Dict[str, str] - views: List["ViewConfig"] - dimensions: List["DimensionConfig"] +@dataclass +class Indicator(MetaBase): + path: str + display: Optional[Dict[str, Any]] = None + def __post_init__(self): + print("POST ViewIndicators!") -class ViewConfig: + +@dataclass +class ViewIndicators(MetaBase): + """Indicators in a MDIM/Explorer view.""" + + # TODO: these attributes should ALL be Optional. 
+ # NOTE: currently MetaBase.from_dict not loading Optional fields with appropriate class + y: List[Indicator] + x: Optional[List[Indicator]] = None + size: Optional[List[Indicator]] = None + color: Optional[List[Indicator]] = None + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "ViewIndicators": + """Coerce the dictionary into the expected shape before passing it to the parent class.""" + # Make a shallow copy so we don't mutate the user's dictionary in-place + data = dict(d) + + # Coerce each dimension field (y, x, size, color) from [str, ...] -> [{'path': str}, ...] + for dim in ("y", "x", "size", "color"): + if dim in data: + if isinstance(data[dim], str): + data[dim] = [{"path": data[dim]}] + if isinstance(data[dim], list): + coerced_items = [] + for item in data[dim]: + if isinstance(item, str): + coerced_items.append({"path": item}) + else: + # If already a dict or something else, leave it as-is + coerced_items.append(item) + data[dim] = coerced_items + + # Now that data is in the expected shape, let the parent class handle the rest + return super().from_dict(data) + + +@dataclass +class View(MetaBase): """MDIM/Explorer view configuration.""" dimensions: Dict[str, str] - indicators: "ViewIndicators" - config: Optional[Any] - metadata: Optional[Any] + indicators: ViewIndicators + # NOTE: currently MetaBase.from_dict not loading Optional fields with appropriate class + config: Optional[Any] = None + metadata: Optional[Any] = None -class ViewIndicators: - """Indicators in a MDIM/Explorer view.""" +@dataclass +class DimensionChoice(MetaBase): + slug: str + name: str + description: Optional[str] = None - y: Optional[List["Indicator"]] - x: Optional[List["Indicator"]] - size: Optional[List["Indicator"]] - color: Optional[List["Indicator"]] +@dataclass +class Dimension(MetaBase): + """MDIM/Explorer dimension configuration.""" -class Indicator: - path: str - display: Dict[str, Any] + slug: str + name: str + # NOTE: currently MetaBase.from_dict not 
loading Optional fields with appropriate class + choices: Optional[List[DimensionChoice]] = None # Only allowed to be None if checkbox + presentation: Optional[Dict[str, Any]] = None -class DimensionConfig: - """MDIM/Explorer dimension configuration.""" +@dataclass +class Collection(MetaBase): + """Overall MDIM/Explorer config""" - choices: List["ChoiceConfig"] + config: Dict[str, str] + dimensions: List[Dimension] + views: List[View] -class ChoiceConfig: - slug: str - name: str - description: str +# def main(): +# filename = "/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases.yml" +filename = "/home/lucas/repos/etl/etl/steps/export/explorers/covid/latest/covid.config.yml" +with open(filename) as istream: + yml = yaml.safe_load(istream) +# cfg = Config.from_dict(yml) +# cfg.views[0].indicators.y From bd73edb255b94120664a5b834fe4873291546889 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 16:58:18 +0100 Subject: [PATCH 05/18] use class from_dict --- lib/catalog/owid/catalog/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/catalog/owid/catalog/utils.py b/lib/catalog/owid/catalog/utils.py index 6e2920d6108..e16caec686b 100644 --- a/lib/catalog/owid/catalog/utils.py +++ b/lib/catalog/owid/catalog/utils.py @@ -314,7 +314,7 @@ def dataclass_from_dict(cls: Optional[Type[T]], d: Dict[str, Any]) -> T: key_type, value_type = args init_args[field_name] = {k: dataclass_from_dict(value_type, item) for k, item in v.items()} elif dataclasses.is_dataclass(field_type): - init_args[field_name] = dataclass_from_dict(field_type, v) # type: ignore + init_args[field_name] = field_type.from_dict(v) # type: ignore elif isinstance(field_type, type) and field_type not in (Any,): try: init_args[field_name] = field_type(v) From a5abfc41434fd5d0892c4fa1a741ba23d1c567a6 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 16:58:24 +0100 Subject: [PATCH 06/18] wip --- etl/collections/base.py | 13 ++++++++++++- 1 
file changed, 12 insertions(+), 1 deletion(-) diff --git a/etl/collections/base.py b/etl/collections/base.py index 46e729301cb..d13fe649872 100644 --- a/etl/collections/base.py +++ b/etl/collections/base.py @@ -1,6 +1,10 @@ """WIP: Drafting a model for dealing with MDIM/Explorer configuration. This should be aligned with the MDIM schema. + +THINGS TO SOLVE: + + - If an attribute is Optional, MetaBase.from_dict is not correctly loading it as the appropriate class when given. """ from dataclasses import dataclass @@ -8,8 +12,10 @@ import yaml from owid.catalog.meta import MetaBase +from owid.catalog.utils import pruned_json +@pruned_json @dataclass class Indicator(MetaBase): path: str @@ -19,6 +25,7 @@ def __post_init__(self): print("POST ViewIndicators!") +@pruned_json @dataclass class ViewIndicators(MetaBase): """Indicators in a MDIM/Explorer view.""" @@ -55,6 +62,7 @@ def from_dict(cls, d: Dict[str, Any]) -> "ViewIndicators": return super().from_dict(data) +@pruned_json @dataclass class View(MetaBase): """MDIM/Explorer view configuration.""" @@ -66,6 +74,7 @@ class View(MetaBase): metadata: Optional[Any] = None +@pruned_json @dataclass class DimensionChoice(MetaBase): slug: str @@ -73,6 +82,7 @@ class DimensionChoice(MetaBase): description: Optional[str] = None +@pruned_json @dataclass class Dimension(MetaBase): """MDIM/Explorer dimension configuration.""" @@ -84,6 +94,7 @@ class Dimension(MetaBase): presentation: Optional[Dict[str, Any]] = None +@pruned_json @dataclass class Collection(MetaBase): """Overall MDIM/Explorer config""" @@ -98,5 +109,5 @@ class Collection(MetaBase): filename = "/home/lucas/repos/etl/etl/steps/export/explorers/covid/latest/covid.config.yml" with open(filename) as istream: yml = yaml.safe_load(istream) -# cfg = Config.from_dict(yml) +collection = Collection.from_dict(yml) # cfg.views[0].indicators.y From 4f49d5c9b3e8c4eb9f09d8698fcb0519fbcdafd0 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 17:29:00 +0100 Subject: 
[PATCH 07/18] wip --- etl/collections/base.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/etl/collections/base.py b/etl/collections/base.py index d13fe649872..ab189489a9f 100644 --- a/etl/collections/base.py +++ b/etl/collections/base.py @@ -21,9 +21,6 @@ class Indicator(MetaBase): path: str display: Optional[Dict[str, Any]] = None - def __post_init__(self): - print("POST ViewIndicators!") - @pruned_json @dataclass @@ -99,15 +96,33 @@ class Dimension(MetaBase): class Collection(MetaBase): """Overall MDIM/Explorer config""" - config: Dict[str, str] dimensions: List[Dimension] views: List[View] +@pruned_json +@dataclass +class Explorer(Collection): + config: Dict[str, str] + + +@pruned_json +@dataclass +class Multidim(Collection): + title: Dict[str, str] + defaultSelection: List[str] + topicTags: Optional[List[str]] = None + definitions: Optional[Any] = None + + # def main(): -# filename = "/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases.yml" -filename = "/home/lucas/repos/etl/etl/steps/export/explorers/covid/latest/covid.config.yml" -with open(filename) as istream: - yml = yaml.safe_load(istream) -collection = Collection.from_dict(yml) +f_mdim = "/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases.yml" +with open(f_mdim) as istream: + cfg_mdim = yaml.safe_load(istream) +mdim = Multidim.from_dict(cfg_mdim) + +f_explorer = "/home/lucas/repos/etl/etl/steps/export/explorers/covid/latest/covid.config.yml" +with open(f_explorer) as istream: + cfg_explorer = yaml.safe_load(istream) +explorer = Explorer.from_dict(cfg_explorer) # cfg.views[0].indicators.y From a8801336290c966e51b2aa20de7300e43387f4bb Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 17:37:02 +0100 Subject: [PATCH 08/18] wip --- etl/collections/base.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/etl/collections/base.py b/etl/collections/base.py 
index ab189489a9f..8fa804f2747 100644 --- a/etl/collections/base.py +++ b/etl/collections/base.py @@ -103,26 +103,30 @@ class Collection(MetaBase): @pruned_json @dataclass class Explorer(Collection): + """Model for Explorer configuration.""" + config: Dict[str, str] @pruned_json @dataclass class Multidim(Collection): + """Model for MDIM configuration.""" + title: Dict[str, str] defaultSelection: List[str] topicTags: Optional[List[str]] = None definitions: Optional[Any] = None -# def main(): -f_mdim = "/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases.yml" -with open(f_mdim) as istream: - cfg_mdim = yaml.safe_load(istream) -mdim = Multidim.from_dict(cfg_mdim) +# # def main(): +# f_mdim = "/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases.yml" +# with open(f_mdim) as istream: +# cfg_mdim = yaml.safe_load(istream) +# mdim = Multidim.from_dict(cfg_mdim) -f_explorer = "/home/lucas/repos/etl/etl/steps/export/explorers/covid/latest/covid.config.yml" -with open(f_explorer) as istream: - cfg_explorer = yaml.safe_load(istream) -explorer = Explorer.from_dict(cfg_explorer) -# cfg.views[0].indicators.y +# f_explorer = "/home/lucas/repos/etl/etl/steps/export/explorers/covid/latest/covid.config.yml" +# with open(f_explorer) as istream: +# cfg_explorer = yaml.safe_load(istream) +# explorer = Explorer.from_dict(cfg_explorer) +# # cfg.views[0].indicators.y From cd7cc347bdc4b89757faf10c97241677388d3219 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 18:04:45 +0100 Subject: [PATCH 09/18] missing type --- etl/collections/base.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/etl/collections/base.py b/etl/collections/base.py index 8fa804f2747..345a536fd3f 100644 --- a/etl/collections/base.py +++ b/etl/collections/base.py @@ -18,7 +18,7 @@ @pruned_json @dataclass class Indicator(MetaBase): - path: str + catalogPath: str display: Optional[Dict[str, Any]] = None @@ -43,17 +43,20 @@ def 
from_dict(cls, d: Dict[str, Any]) -> "ViewIndicators": # Coerce each dimension field (y, x, size, color) from [str, ...] -> [{'path': str}, ...] for dim in ("y", "x", "size", "color"): if dim in data: - if isinstance(data[dim], str): - data[dim] = [{"path": data[dim]}] if isinstance(data[dim], list): coerced_items = [] for item in data[dim]: if isinstance(item, str): - coerced_items.append({"path": item}) + coerced_items.append({"catalogPath": item}) else: # If already a dict or something else, leave it as-is coerced_items.append(item) data[dim] = coerced_items + else: + if isinstance(data[dim], str): + data[dim] = [{"catalogPath": data[dim]}] + else: + data[dim] = [data[dim]] # Now that data is in the expected shape, let the parent class handle the rest return super().from_dict(data) @@ -120,10 +123,10 @@ class Multidim(Collection): # # def main(): -# f_mdim = "/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases.yml" -# with open(f_mdim) as istream: -# cfg_mdim = yaml.safe_load(istream) -# mdim = Multidim.from_dict(cfg_mdim) +f_mdim = "/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml" +with open(f_mdim) as istream: + cfg_mdim = yaml.safe_load(istream) +mdim = Multidim.from_dict(cfg_mdim) # f_explorer = "/home/lucas/repos/etl/etl/steps/export/explorers/covid/latest/covid.config.yml" # with open(f_explorer) as istream: From cca934d220cf4fd8e244dd62d4a7074bb2259540 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 18:15:10 +0100 Subject: [PATCH 10/18] remove unused file --- .../export/multidim/covid/latest/covid.population.yml | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 etl/steps/export/multidim/covid/latest/covid.population.yml diff --git a/etl/steps/export/multidim/covid/latest/covid.population.yml b/etl/steps/export/multidim/covid/latest/covid.population.yml deleted file mode 100644 index c0e89f2740d..00000000000 --- a/etl/steps/export/multidim/covid/latest/covid.population.yml 
+++ /dev/null @@ -1,7 +0,0 @@ -title: - title: Population - titleVariant: by sex, age, variant -defaultSelection: - - United Kingdom -topicTags: - - COVID-19 From 54ed3ff29903c6c97f0b2a57c958aec577fd5180 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 18:16:52 +0100 Subject: [PATCH 11/18] demo covid mdim with model --- etl/collections/base.py | 45 ++++++++++++++++--- etl/collections/multidim.py | 33 ++++++++++++++ .../export/multidim/covid/latest/covid.py | 12 +++-- 3 files changed, 80 insertions(+), 10 deletions(-) diff --git a/etl/collections/base.py b/etl/collections/base.py index 345a536fd3f..cad2bac5673 100644 --- a/etl/collections/base.py +++ b/etl/collections/base.py @@ -8,11 +8,36 @@ """ from dataclasses import dataclass -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, TypeVar import yaml from owid.catalog.meta import MetaBase -from owid.catalog.utils import pruned_json + +T = TypeVar("T") + + +def prune_dict(d: dict) -> dict: + """Remove all keys starting with underscore and all empty values from a dictionary.""" + out = {} + for k, v in d.items(): + if not k.startswith("_") and v not in [None, {}]: + if isinstance(v, dict): + out[k] = prune_dict(v) + elif isinstance(v, list): + out[k] = [prune_dict(x) if isinstance(x, dict) else x for x in v if x not in [None, {}]] + else: + out[k] = v + return out + + +def pruned_json(cls: T) -> T: + orig = cls.to_dict # type: ignore + + # only keep non-null public variables + # calling original to_dict returns dictionaries, not objects + cls.to_dict = lambda self, **kwargs: prune_dict(orig(self, **kwargs)) # type: ignore + + return cls @pruned_json @@ -30,9 +55,9 @@ class ViewIndicators(MetaBase): # TODO: these attributes should ALL be Optional. 
# NOTE: currently MetaBase.from_dict not loading Optional fields with appropriate class y: List[Indicator] - x: Optional[List[Indicator]] = None - size: Optional[List[Indicator]] = None - color: Optional[List[Indicator]] = None + x: Optional[Indicator] = None + size: Optional[Indicator] = None + color: Optional[Indicator] = None @classmethod def from_dict(cls, d: Dict[str, Any]) -> "ViewIndicators": @@ -54,9 +79,15 @@ def from_dict(cls, d: Dict[str, Any]) -> "ViewIndicators": data[dim] = coerced_items else: if isinstance(data[dim], str): - data[dim] = [{"catalogPath": data[dim]}] + if dim == "y": + data[dim] = [{"catalogPath": data[dim]}] + else: + data[dim] = {"catalogPath": data[dim]} else: - data[dim] = [data[dim]] + if dim == "y": + data[dim] = [data[dim]] + else: + data[dim] = data[dim] # Now that data is in the expected shape, let the parent class handle the rest return super().from_dict(data) diff --git a/etl/collections/multidim.py b/etl/collections/multidim.py index f2541e38efc..335669aa72b 100644 --- a/etl/collections/multidim.py +++ b/etl/collections/multidim.py @@ -19,6 +19,7 @@ from structlog import get_logger from apps.chart_sync.admin_api import AdminAPI +from etl.collections.base import Multidim from etl.collections.utils import ( expand_catalog_paths, extract_catalog_path, @@ -191,6 +192,38 @@ def upsert_multidim_data_page( _upsert_multidim_data_page(mdim_catalog_path, config, owid_env) +def upsert_multidim_data_page_2( + mdim: Multidim, paths: PathFinder, mdim_name: Optional[str] = None, owid_env: Optional[OWIDEnv] = None +) -> None: + """Import MDIM config to DB. + + Args: + ----- + + slug: str + Slug of the MDIM page. MDIM will be published at /slug + config: dict + MDIM configuration. + paths: PathFinder + Pass `paths = PathFinder(__file__)` from the script where this function is called. + mdim_name: str + Name of the MDIM page. Default is short_name from mdim catalog path. 
+ owid_env: Optional[OWIDEnv] + Environment where to publish the MDIM page. + """ + dependencies = paths.dependencies + mdim_catalog_path = f"{paths.namespace}/{paths.version}/{paths.short_name}#{mdim_name or paths.short_name}" + + config = mdim.to_dict() + # Edit views + process_mdim_views(config, dependencies=dependencies) + + # TODO: Possibly add other edits (to dimensions?) + + # Upsert to DB + _upsert_multidim_data_page(mdim_catalog_path, config, owid_env) + + def process_mdim_views(config: dict, dependencies: Set[str]): """Process views in MDIM configuration. diff --git a/etl/steps/export/multidim/covid/latest/covid.py b/etl/steps/export/multidim/covid/latest/covid.py index a249a6aa07d..e906d48c34b 100644 --- a/etl/steps/export/multidim/covid/latest/covid.py +++ b/etl/steps/export/multidim/covid/latest/covid.py @@ -1,4 +1,5 @@ from etl.collections import multidim +from etl.collections.base import Multidim from etl.helpers import PathFinder # Get paths and naming conventions for current step. 
@@ -36,12 +37,17 @@ def run(dest_dir: str) -> None: paths.log.info(fname) config = paths.load_mdim_config(fname) - multidim.upsert_multidim_data_page( - config=config, + mdim = Multidim.from_dict(config) + # multidim.upsert_multidim_data_page( + # config=config, + # paths=paths, + # mdim_name=fname_to_mdim_name(fname), + # ) + multidim.upsert_multidim_data_page_2( + mdim=mdim, paths=paths, mdim_name=fname_to_mdim_name(fname), ) - # PART 2: MDIMs hybridly generated (mix of YAML file + data) ds = paths.load_dataset("google_mobility") tb = ds.read("google_mobility") From d8cef1ae81033f8df8c3e4b19c94515012313629 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 18:20:18 +0100 Subject: [PATCH 12/18] comment debugging --- etl/collections/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/etl/collections/base.py b/etl/collections/base.py index cad2bac5673..a0453380c2d 100644 --- a/etl/collections/base.py +++ b/etl/collections/base.py @@ -154,10 +154,10 @@ class Multidim(Collection): # # def main(): -f_mdim = "/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml" -with open(f_mdim) as istream: - cfg_mdim = yaml.safe_load(istream) -mdim = Multidim.from_dict(cfg_mdim) +# f_mdim = "/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml" +# with open(f_mdim) as istream: +# cfg_mdim = yaml.safe_load(istream) +# mdim = Multidim.from_dict(cfg_mdim) # f_explorer = "/home/lucas/repos/etl/etl/steps/export/explorers/covid/latest/covid.config.yml" # with open(f_explorer) as istream: From d007f6b300f5e2b01e4b1a48820aca1ec847459f Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 26 Feb 2025 18:23:59 +0100 Subject: [PATCH 13/18] irrelevant import --- etl/collections/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/etl/collections/base.py b/etl/collections/base.py index a0453380c2d..68fe2873cca 100644 --- a/etl/collections/base.py +++ b/etl/collections/base.py @@ -10,7 +10,6 
@@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, TypeVar -import yaml from owid.catalog.meta import MetaBase T = TypeVar("T") From fba5d5a531e34637354586bf0843482e128c4b38 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 27 Feb 2025 19:58:17 +0100 Subject: [PATCH 14/18] wip --- etl/collections/base.py | 165 ---------- etl/collections/common.py | 89 ++++++ etl/collections/model.py | 302 ++++++++++++++++++ etl/collections/multidim.py | 51 ++- etl/collections/utils.py | 167 +++++++--- .../export/multidim/covid/latest/covid.py | 11 +- 6 files changed, 543 insertions(+), 242 deletions(-) delete mode 100644 etl/collections/base.py create mode 100644 etl/collections/common.py create mode 100644 etl/collections/model.py diff --git a/etl/collections/base.py b/etl/collections/base.py deleted file mode 100644 index 68fe2873cca..00000000000 --- a/etl/collections/base.py +++ /dev/null @@ -1,165 +0,0 @@ -"""WIP: Drafting a model for dealing with MDIM/Explorer configuration. - -This should be aligned with the MDIM schema. - -THINGS TO SOLVE: - - - If an attribute is Optional, MetaBase.from_dict is not correctly loading it as the appropriate class when given. 
-""" - -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, TypeVar - -from owid.catalog.meta import MetaBase - -T = TypeVar("T") - - -def prune_dict(d: dict) -> dict: - """Remove all keys starting with underscore and all empty values from a dictionary.""" - out = {} - for k, v in d.items(): - if not k.startswith("_") and v not in [None, {}]: - if isinstance(v, dict): - out[k] = prune_dict(v) - elif isinstance(v, list): - out[k] = [prune_dict(x) if isinstance(x, dict) else x for x in v if x not in [None, {}]] - else: - out[k] = v - return out - - -def pruned_json(cls: T) -> T: - orig = cls.to_dict # type: ignore - - # only keep non-null public variables - # calling original to_dict returns dictionaries, not objects - cls.to_dict = lambda self, **kwargs: prune_dict(orig(self, **kwargs)) # type: ignore - - return cls - - -@pruned_json -@dataclass -class Indicator(MetaBase): - catalogPath: str - display: Optional[Dict[str, Any]] = None - - -@pruned_json -@dataclass -class ViewIndicators(MetaBase): - """Indicators in a MDIM/Explorer view.""" - - # TODO: these attributes should ALL be Optional. - # NOTE: currently MetaBase.from_dict not loading Optional fields with appropriate class - y: List[Indicator] - x: Optional[Indicator] = None - size: Optional[Indicator] = None - color: Optional[Indicator] = None - - @classmethod - def from_dict(cls, d: Dict[str, Any]) -> "ViewIndicators": - """Coerce the dictionary into the expected shape before passing it to the parent class.""" - # Make a shallow copy so we don't mutate the user's dictionary in-place - data = dict(d) - - # Coerce each dimension field (y, x, size, color) from [str, ...] -> [{'path': str}, ...] 
- for dim in ("y", "x", "size", "color"): - if dim in data: - if isinstance(data[dim], list): - coerced_items = [] - for item in data[dim]: - if isinstance(item, str): - coerced_items.append({"catalogPath": item}) - else: - # If already a dict or something else, leave it as-is - coerced_items.append(item) - data[dim] = coerced_items - else: - if isinstance(data[dim], str): - if dim == "y": - data[dim] = [{"catalogPath": data[dim]}] - else: - data[dim] = {"catalogPath": data[dim]} - else: - if dim == "y": - data[dim] = [data[dim]] - else: - data[dim] = data[dim] - - # Now that data is in the expected shape, let the parent class handle the rest - return super().from_dict(data) - - -@pruned_json -@dataclass -class View(MetaBase): - """MDIM/Explorer view configuration.""" - - dimensions: Dict[str, str] - indicators: ViewIndicators - # NOTE: currently MetaBase.from_dict not loading Optional fields with appropriate class - config: Optional[Any] = None - metadata: Optional[Any] = None - - -@pruned_json -@dataclass -class DimensionChoice(MetaBase): - slug: str - name: str - description: Optional[str] = None - - -@pruned_json -@dataclass -class Dimension(MetaBase): - """MDIM/Explorer dimension configuration.""" - - slug: str - name: str - # NOTE: currently MetaBase.from_dict not loading Optional fields with appropriate class - choices: Optional[List[DimensionChoice]] = None # Only allowed to be None if checkbox - presentation: Optional[Dict[str, Any]] = None - - -@pruned_json -@dataclass -class Collection(MetaBase): - """Overall MDIM/Explorer config""" - - dimensions: List[Dimension] - views: List[View] - - -@pruned_json -@dataclass -class Explorer(Collection): - """Model for Explorer configuration.""" - - config: Dict[str, str] - - -@pruned_json -@dataclass -class Multidim(Collection): - """Model for MDIM configuration.""" - - title: Dict[str, str] - defaultSelection: List[str] - topicTags: Optional[List[str]] = None - definitions: Optional[Any] = None - - -# # def main(): 
-# f_mdim = "/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml" -# with open(f_mdim) as istream: -# cfg_mdim = yaml.safe_load(istream) -# mdim = Multidim.from_dict(cfg_mdim) - -# f_explorer = "/home/lucas/repos/etl/etl/steps/export/explorers/covid/latest/covid.config.yml" -# with open(f_explorer) as istream: -# cfg_explorer = yaml.safe_load(istream) -# explorer = Explorer.from_dict(cfg_explorer) -# # cfg.views[0].indicators.y diff --git a/etl/collections/common.py b/etl/collections/common.py new file mode 100644 index 00000000000..ee9e3cfb7b1 --- /dev/null +++ b/etl/collections/common.py @@ -0,0 +1,89 @@ +from typing import Any, Dict, List, Union + +from owid.catalog import Table + +from etl.collections.model import DIMENSIONS, View + + +def expand_catalog_paths(view: View, tables_by_name: Dict[str, List[Table]]) -> Dict[Any, Any]: + """Expand catalog paths in views to full dataset URIs. + + This function updates the given configuration dictionary in-place by modifying the dimension ('y', 'x', 'size', 'color') entries under "indicators" in each view. If an entry does not contain a '/', + it is assumed to be a table name that must be expanded to a full dataset URI based on + the provided dependencies. + + NOTE: Possible improvements for internal function `_expand`: + - we should make this function a bit more robust when checking the URIs. + - currently we only allow for 'table#indicator' format. We should also allow for other cases that could be useful in the event of name collisions, e.g. 'dataset/indicator#subindicator'. + + Args: + config (dict): Configuration dictionary containing views. + tables_by_name (Dict[str, List[Table]]): Mapping of table short names to tables. + """ + + def _expand_catalog_path(indicator: Union[str, Dict[str, str]]) -> Union[str, Dict[str, str]]: + """Return same indicator, but with complete catalog path.""" + + def _expand(indicator: str): + assert "#" in indicator, f"Missing '#' in indicator! 
'{indicator}'" + + # Complete dataset URI + if "/" in indicator: + return indicator + # table#indicator format + else: + indicator_split = indicator.split("#") + + # Check format is actually table#indicator + assert (len(indicator_split) == 2) & ( + indicator_split[0] != "" + ), f"Expected 'table#indicator' format. Instead found {indicator}" + + # Check table is in any of the datasets! + assert ( + indicator_split[0] in tables_by_name + ), f"Table name `{indicator_split[0]}` not found in dependency tables! Available tables are: {', '.join(tables_by_name.keys())}" + + # Check table name to table mapping is unique + assert ( + len(tables_by_name[indicator_split[0]]) == 1 + ), f"There are multiple dependencies (datasets) with a table named {indicator_split[0]}. Please use the complete dataset URI in this case." + + # Check dataset in table metadata is not None + tb = tables_by_name[indicator_split[0]][0] + assert tb.m.dataset is not None, f"Dataset not found for table {indicator_split[0]}" + + # Build URI + return tb.m.dataset.uri + "/" + indicator + + # Expand catalog path if it's a string + if isinstance(indicator, str): + return _expand(indicator) + # Expand catalog path if it's a dictionary + elif isinstance(indicator, dict): + assert "catalogPath" in indicator, "Expected 'catalogPath' key in indicator dictionary" + indicator["catalogPath"] = _expand(indicator["catalogPath"]) + return indicator + + # Update indicators for each dimension + view.indicators.expand_paths(tables_by_name) + + for dim in DIMENSIONS: + if dim in view["indicators"]: + if isinstance(view["indicators"][dim], list): + view["indicators"][dim] = [_expand_catalog_path(dim) for dim in view["indicators"][dim]] + else: + view["indicators"][dim] = _expand_catalog_path(view["indicators"][dim]) + + # Update indicators from sortColumnSlug + if "config" in view: + if "sortColumnSlug" in view["config"]: + view["config"]["sortColumnSlug"] = _expand_catalog_path(view["config"]["sortColumnSlug"]) + + # 
Update indicators from map.columnSlug + if "config" in view: + if "map" in view["config"]: + if "columnSlug" in view["config"]["map"]: + view["config"]["map"]["columnSlug"] = _expand_catalog_path(view["config"]["map"]["columnSlug"]) + + return view diff --git a/etl/collections/model.py b/etl/collections/model.py new file mode 100644 index 00000000000..5e4c6a3ddd0 --- /dev/null +++ b/etl/collections/model.py @@ -0,0 +1,302 @@ +"""WIP: Drafting a model for dealing with MDIM/Explorer configuration. + +This should be aligned with the MDIM schema. + +THINGS TO SOLVE: + + - If an attribute is Optional, MetaBase.from_dict is not correctly loading it as the appropriate class when given. +""" + +import re +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, TypeVar + +from owid.catalog import Table +from owid.catalog.meta import MetaBase + +DIMENSIONS = ["y", "x", "size", "color"] +T = TypeVar("T") +REGEX_CATALOG_PATH = ( + r"^(?:grapher/[A-Za-z0-9_]+/(?:\d{4}-\d{2}-\d{2}|\d{4}|latest)/[A-Za-z0-9_]+/)?[A-Za-z0-9_]+#[A-Za-z0-9_]+$" +) + + +def prune_dict(d: dict) -> dict: + """Remove all keys starting with underscore and all empty values from a dictionary. + + NOTE: This method was copied from owid.catalog.utils. It is slightly different in the sense that it does not remove fields with empty lists! This is because there are some fields which are mandatory and can be empty! 
(TODO: should probably fix the schema / engineering side) + + """ + out = {} + for k, v in d.items(): + if not k.startswith("_") and v not in [None, {}]: + if isinstance(v, dict): + out[k] = prune_dict(v) + elif isinstance(v, list): + out[k] = [prune_dict(x) if isinstance(x, dict) else x for x in v if x not in [None, {}]] + else: + out[k] = v + return out + + +def pruned_json(cls: T) -> T: + orig = cls.to_dict # type: ignore + + # only keep non-null public variables + # calling original to_dict returns dictionaries, not objects + cls.to_dict = lambda self, **kwargs: prune_dict(orig(self, **kwargs)) # type: ignore + + return cls + + +@pruned_json +@dataclass +class Indicator(MetaBase): + catalogPath: str + display: Optional[Dict[str, Any]] = None + + def __post_init__(self): + # Validate that the catalog path is either (i) complete or (ii) in the format table#indicator. + if not self.is_a_valid_path(self.catalogPath): + raise ValueError(f"Invalid catalog path: {self.catalogPath}") + + def has_complete_path(self) -> bool: + return "/" in self.catalogPath + + @classmethod + def is_a_valid_path(cls, path: str) -> bool: + pattern = re.compile(REGEX_CATALOG_PATH) + valid = bool(pattern.match(path)) + return valid + + def __setattr__(self, name, value): + """Validate that the catalog path is either (i) complete or (ii) in the format table#indicator.""" + if hasattr(self, name): + if (name == "catalogPath") and (not self.is_a_valid_path(value)): + raise ValueError(f"Invalid catalog path: {value}") + return super().__setattr__(name, value) + + def expand_path(self, tables_by_name: Dict[str, List[Table]]) -> None: + # Do nothing if path is already complete + if self.has_complete_path(): + return + + # If path is not complete, we need to expand it! + table_name = self.catalogPath.split("#")[0] + + # Check table is in any of the datasets! + assert ( + table_name in tables_by_name + ), f"Table name `{table_name}` not found in dependency tables! 
Available tables are: {', '.join(tables_by_name.keys())}" + + # Check table name to table mapping is unique + assert ( + len(tables_by_name[table_name]) == 1 + ), f"There are multiple dependencies (datasets) with a table named {table_name}. Please use the complete dataset URI in this case." + + # Check dataset in table metadata is not None + tb = tables_by_name[table_name][0] + assert tb.m.dataset is not None, f"Dataset not found for table {table_name}" + + # Build URI + self.catalogPath = tb.m.dataset.uri + "/" + self.catalogPath + + +@pruned_json +@dataclass +class ViewIndicators(MetaBase): + """Indicators in a MDIM/Explorer view.""" + + y: Optional[List[Indicator]] = None + x: Optional[Indicator] = None + size: Optional[Indicator] = None + color: Optional[Indicator] = None + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "ViewIndicators": + """Coerce the dictionary into the expected shape before passing it to the parent class.""" + # Make a shallow copy so we don't mutate the user's dictionary in-place + data = dict(d) + + # Coerce each dimension field (y, x, size, color) from [str, ...] -> [{'path': str}, ...] 
+ for dim in DIMENSIONS: + if dim in data: + if isinstance(data[dim], list): + data[dim] = [{"catalogPath": item} if isinstance(item, str) else item for item in data[dim]] + else: + if isinstance(data[dim], str): + data[dim] = [{"catalogPath": data[dim]}] if dim == "y" else {"catalogPath": data[dim]} + elif dim == "y": + data[dim] = [data[dim]] + # Now that data is in the expected shape, let the parent class handle the rest + return super().from_dict(data) + + def to_records(self) -> List[Dict[str, str]]: + indicators = [] + for dim in DIMENSIONS: + dimension_val = getattr(self, dim, None) + if dimension_val is None: + continue + if isinstance(dimension_val, list): + for d in dimension_val: + indicators.append({"path": d.catalogPath, "dimension": dim}) + else: + indicators.append({"path": dimension_val.catalogPath, "dimension": dim}) + return indicators + + def expand_paths(self, tables_by_name: Dict[str, List[Table]]): + """Expand the catalog paths of all indicators in the view.""" + for dim in DIMENSIONS: + dimension_val = getattr(self, dim, None) + if dimension_val is None: + continue + if isinstance(dimension_val, list): + for indicator in dimension_val: + indicator.expand_path(tables_by_name) + else: + dimension_val.expand_path(tables_by_name) + + +@pruned_json +@dataclass +class View(MetaBase): + """MDIM/Explorer view configuration.""" + + dimensions: Dict[str, str] + indicators: ViewIndicators + # NOTE: currently MetaBase.from_dict not loading Optional fields with appropriate class + config: Optional[Any] = None + metadata: Optional[Any] = None + + @property + def has_multiple_indicators(self) -> bool: + # Get list of indicators + indicators = self.indicators.to_records() + return len(indicators) > 1 + + @property + def metadata_is_needed(self) -> bool: + return self.has_multiple_indicators and (self.metadata is None) + + # def indicators_in_view(self): + # """Get the list of indicators in use in a view. 
+ + # It returns the list as a list of records: + + # [ + # { + # "path": "data://path/to/dataset#indicator", + # "dimension": "y" + # }, + # ... + # ] + + # TODO: This is being called twice, maybe there is a way to just call it once. Maybe if it is an attribute of a class? + # """ + # indicators_view = [] + # # Get indicators from dimensions + # for dim in DIMENSIONS: + # if dim in self.indicators: + # indicator_raw = view["indicators"][dim] + # if isinstance(indicator_raw, list): + # assert dim == "y", "Only `y` can come as a list" + # indicators_view += [ + # { + # "path": extract_catalog_path(ind), + # "dimension": dim, + # } + # for ind in indicator_raw + # ] + # else: + # indicators_view.append( + # { + # "path": extract_catalog_path(indicator_raw), + # "dimension": dim, + # } + # ) + # return indicators_view + + +@pruned_json +@dataclass +class DimensionChoice(MetaBase): + slug: str + name: str + description: Optional[str] = None + + +@pruned_json +@dataclass +class Dimension(MetaBase): + """MDIM/Explorer dimension configuration.""" + + slug: str + name: str + # NOTE: currently MetaBase.from_dict not loading Optional fields with appropriate class + choices: Optional[List[DimensionChoice]] = None # Only allowed to be None if checkbox + presentation: Optional[Dict[str, Any]] = None + + +@pruned_json +@dataclass +class Collection(MetaBase): + """Overall MDIM/Explorer config""" + + dimensions: List[Dimension] + views: List[View] + + +@pruned_json +@dataclass +class Explorer(Collection): + """Model for Explorer configuration.""" + + config: Dict[str, str] + + +@pruned_json +@dataclass +class Multidim(Collection): + """Model for MDIM configuration.""" + + title: Dict[str, str] + defaultSelection: List[str] + topicTags: Optional[List[str]] = None + definitions: Optional[Any] = None + + +# # def main(): +# import yaml + +# from etl.collections.utils import ( +# get_tables_by_name_mapping, +# ) + +# f_mdim = 
"/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml" +# with open(f_mdim) as istream: +# cfg_mdim = yaml.safe_load(istream) +# mdim = Multidim.from_dict(cfg_mdim) + +# dependencies = { +# "data://grapher/covid/latest/hospital", +# "data://grapher/covid/latest/vaccinations_global", +# "data://grapher/covid/latest/vaccinations_manufacturer", +# "data://grapher/covid/latest/testing", +# "data://grapher/excess_mortality/latest/excess_mortality", +# "data-private://grapher/excess_mortality/latest/excess_mortality_economist", +# "data://grapher/covid/latest/xm_who", +# "data://grapher/covid/latest/cases_deaths", +# "data://grapher/covid/latest/covax", +# "data://grapher/covid/latest/infections_model", +# "data://grapher/covid/latest/google_mobility", +# "data://grapher/regions/2023-01-01/regions", +# } +# tables_by_name = get_tables_by_name_mapping(dependencies) + +# mdim.views[0].indicators.expand_paths(tables_by_name) + +# f_explorer = "/home/lucas/repos/etl/etl/steps/export/explorers/covid/latest/covid.config.yml" +# with open(f_explorer) as istream: +# cfg_explorer = yaml.safe_load(istream) +# explorer = Explorer.from_dict(cfg_explorer) +# # cfg.views[0].indicators.y diff --git a/etl/collections/multidim.py b/etl/collections/multidim.py index 335669aa72b..b2d08e1b6e7 100644 --- a/etl/collections/multidim.py +++ b/etl/collections/multidim.py @@ -19,9 +19,10 @@ from structlog import get_logger from apps.chart_sync.admin_api import AdminAPI -from etl.collections.base import Multidim +from etl.collections.model import Multidim from etl.collections.utils import ( expand_catalog_paths, + expand_catalog_paths_2, extract_catalog_path, get_indicators_in_view, get_tables_by_name_mapping, @@ -183,6 +184,9 @@ def upsert_multidim_data_page( dependencies = paths.dependencies mdim_catalog_path = f"{paths.namespace}/{paths.version}/{paths.short_name}#{mdim_name or paths.short_name}" + mdim = Multidim.from_dict(config) + config = mdim.to_dict() + # 
Edit views process_mdim_views(config, dependencies=dependencies) @@ -192,36 +196,29 @@ def upsert_multidim_data_page( _upsert_multidim_data_page(mdim_catalog_path, config, owid_env) -def upsert_multidim_data_page_2( - mdim: Multidim, paths: PathFinder, mdim_name: Optional[str] = None, owid_env: Optional[OWIDEnv] = None -) -> None: - """Import MDIM config to DB. - - Args: - ----- +def process_mdim_views_2(mdim: Multidim, dependencies: Set[str]): + """Process views in MDIM configuration. - slug: str - Slug of the MDIM page. MDIM will be published at /slug - config: dict - MDIM configuration. - paths: PathFinder - Pass `paths = PathFinder(__file__)` from the script where this function is called. - mdim_name: str - Name of the MDIM page. Default is short_name from mdim catalog path. - owid_env: Optional[OWIDEnv] - Environment where to publish the MDIM page. + This includes: + - Make sure that catalog paths for indicators are complete. + - TODO: Process views with multiple indicators to have adequate metadata """ - dependencies = paths.dependencies - mdim_catalog_path = f"{paths.namespace}/{paths.version}/{paths.short_name}#{mdim_name or paths.short_name}" - - config = mdim.to_dict() - # Edit views - process_mdim_views(config, dependencies=dependencies) + # Get table information by table name, and table URI + tables_by_name = get_tables_by_name_mapping(dependencies) + # tables_by_uri = get_tables_by_uri_mapping(tables_by_name) # This is to be used when processing views with multiple indicators - # TODO: Possibly add other edits (to dimensions?) 
+ # Go through all views and expand catalog paths + for view in mdim.views: + # Update indicators for each dimension, making sure they have the complete URI + expand_catalog_paths_2(view, tables_by_name=tables_by_name) - # Upsert to DB - _upsert_multidim_data_page(mdim_catalog_path, config, owid_env) + # Combine metadata in views which contain multiple indicators + if view.metadata_is_needed: # Check if view "contains multiple indicators" + # TODO + # view["metadata"] = build_view_metadata_multi(indicators, tables_by_uri) + log.info( + f"View with multiple indicators detected. You should edit its `metadata` field to reflect that! This will be done programmatically in the future. Check view with dimensions {view.dimensions}" + ) def process_mdim_views(config: dict, dependencies: Set[str]): diff --git a/etl/collections/utils.py b/etl/collections/utils.py index 80bd2c86eb7..d86b9dd6f93 100644 --- a/etl/collections/utils.py +++ b/etl/collections/utils.py @@ -6,8 +6,6 @@ from etl.paths import DATA_DIR -DIMENSIONS = ["y", "x", "size", "color"] - def records_to_dictionary(records, key: str): """Transform: [{key: ..., a: ..., b: ...}, ...] -> {key: {a: ..., b: ...}, ...}.""" @@ -20,45 +18,6 @@ def records_to_dictionary(records, key: str): return dix -def get_indicators_in_view(view): - """Get the list of indicators in use in a view. - - It returns the list as a list of records: - - [ - { - "path": "data://path/to/dataset#indicator", - "dimension": "y" - }, - ... - ] - - TODO: This is being called twice, maybe there is a way to just call it once. Maybe if it is an attribute of a class? 
- """ - indicators_view = [] - # Get indicators from dimensions - for dim in DIMENSIONS: - if dim in view["indicators"]: - indicator_raw = view["indicators"][dim] - if isinstance(indicator_raw, list): - assert dim == "y", "Only `y` can come as a list" - indicators_view += [ - { - "path": extract_catalog_path(ind), - "dimension": dim, - } - for ind in indicator_raw - ] - else: - indicators_view.append( - { - "path": extract_catalog_path(indicator_raw), - "dimension": dim, - } - ) - return indicators_view - - def get_tables_by_name_mapping(dependencies: Set[str]) -> Dict[str, List[Table]]: """Dictionary mapping table short name to table object. @@ -79,6 +38,88 @@ def get_tables_by_name_mapping(dependencies: Set[str]) -> Dict[str, List[Table]] return tb_name_to_tb +def expand_catalog_paths_2(view: "View", tables_by_name: Dict[str, List[Table]]) -> Dict[Any, Any]: + """Expand catalog paths in views to full dataset URIs. + + This function updates the given configuration dictionary in-place by modifying the dimension ('y', 'x', 'size', 'color') entries under "indicators" in each view. If an entry does not contain a '/', + it is assumed to be a table name that must be expanded to a full dataset URI based on + the provided dependencies. + + NOTE: Possible improvements for internal function `_expand`: + - we should make this function a bit more robust when checking the URIs. + - currently we only allow for 'table#indicator' format. We should also allow for other cases that could be useful in the event of name collisions, e.g. 'dataset/indicator#subindicator'. + + Args: + config (dict): Configuration dictionary containing views. + tables_by_name (Dict[str, List[Table]]): Mapping of table short names to tables. + """ + + def _expand_catalog_path(indicator: Union[str, Dict[str, str]]) -> Union[str, Dict[str, str]]: + """Return same indicator, but with complete catalog path.""" + + def _expand(indicator: str): + assert "#" in indicator, f"Missing '#' in indicator! 
'{indicator}'" + + # Complete dataset URI + if "/" in indicator: + return indicator + # table#indicator format + else: + indicator_split = indicator.split("#") + + # Check format is actually table#indicator + assert (len(indicator_split) == 2) & ( + indicator_split[0] != "" + ), f"Expected 'table#indicator' format. Instead found {indicator}" + + # Check table is in any of the datasets! + assert ( + indicator_split[0] in tables_by_name + ), f"Table name `{indicator_split[0]}` not found in dependency tables! Available tables are: {', '.join(tables_by_name.keys())}" + + # Check table name to table mapping is unique + assert ( + len(tables_by_name[indicator_split[0]]) == 1 + ), f"There are multiple dependencies (datasets) with a table named {indicator_split[0]}. Please use the complete dataset URI in this case." + + # Check dataset in table metadata is not None + tb = tables_by_name[indicator_split[0]][0] + assert tb.m.dataset is not None, f"Dataset not found for table {indicator_split[0]}" + + # Build URI + return tb.m.dataset.uri + "/" + indicator + + # Expand catalog path if it's a string + if isinstance(indicator, str): + return _expand(indicator) + # Expand catalog path if it's a dictionary + elif isinstance(indicator, dict): + assert "catalogPath" in indicator, "Expected 'catalogPath' key in indicator dictionary" + indicator["catalogPath"] = _expand(indicator["catalogPath"]) + return indicator + + # Update indicators for each dimension + for dim in DIMENSIONS: + if dim in view["indicators"]: + if isinstance(view["indicators"][dim], list): + view["indicators"][dim] = [_expand_catalog_path(dim) for dim in view["indicators"][dim]] + else: + view["indicators"][dim] = _expand_catalog_path(view["indicators"][dim]) + + # Update indicators from sortColumnSlug + if "config" in view: + if "sortColumnSlug" in view["config"]: + view["config"]["sortColumnSlug"] = _expand_catalog_path(view["config"]["sortColumnSlug"]) + + # Update indicators from map.columnSlug + if "config" 
in view: + if "map" in view["config"]: + if "columnSlug" in view["config"]["map"]: + view["config"]["map"]["columnSlug"] = _expand_catalog_path(view["config"]["map"]["columnSlug"]) + + return view + + def expand_catalog_paths(view: Dict[Any, Any], tables_by_name: Dict[str, List[Table]]) -> Dict[Any, Any]: """Expand catalog paths in views to full dataset URIs. @@ -170,3 +211,47 @@ def extract_catalog_path(indicator_raw): return indicator_raw["catalogPath"] else: raise ValueError(f"Unexpected indicator property type: {indicator_raw}") + + +################################################ +# DEPRECATE +################################################ + + +def get_indicators_in_view(view): + """Get the list of indicators in use in a view. + + It returns the list as a list of records: + + [ + { + "path": "data://path/to/dataset#indicator", + "dimension": "y" + }, + ... + ] + + TODO: This is being called twice, maybe there is a way to just call it once. Maybe if it is an attribute of a class? + """ + indicators_view = [] + # Get indicators from dimensions + for dim in DIMENSIONS: + if dim in view["indicators"]: + indicator_raw = view["indicators"][dim] + if isinstance(indicator_raw, list): + assert dim == "y", "Only `y` can come as a list" + indicators_view += [ + { + "path": extract_catalog_path(ind), + "dimension": dim, + } + for ind in indicator_raw + ] + else: + indicators_view.append( + { + "path": extract_catalog_path(indicator_raw), + "dimension": dim, + } + ) + return indicators_view diff --git a/etl/steps/export/multidim/covid/latest/covid.py b/etl/steps/export/multidim/covid/latest/covid.py index e906d48c34b..573bfdb0b11 100644 --- a/etl/steps/export/multidim/covid/latest/covid.py +++ b/etl/steps/export/multidim/covid/latest/covid.py @@ -1,5 +1,4 @@ from etl.collections import multidim -from etl.collections.base import Multidim from etl.helpers import PathFinder # Get paths and naming conventions for current step. 
@@ -37,14 +36,8 @@ def run(dest_dir: str) -> None: paths.log.info(fname) config = paths.load_mdim_config(fname) - mdim = Multidim.from_dict(config) - # multidim.upsert_multidim_data_page( - # config=config, - # paths=paths, - # mdim_name=fname_to_mdim_name(fname), - # ) - multidim.upsert_multidim_data_page_2( - mdim=mdim, + multidim.upsert_multidim_data_page( + config=config, paths=paths, mdim_name=fname_to_mdim_name(fname), ) From f0d47bfd8518b5cee1deb66602e614c194c5d39f Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 28 Feb 2025 00:51:16 +0100 Subject: [PATCH 15/18] wip --- etl/collections/model.py | 31 ++++++- etl/collections/multidim.py | 37 +------- etl/collections/utils.py | 84 +------------------ .../export/multidim/covid/latest/covid.py | 2 + 4 files changed, 36 insertions(+), 118 deletions(-) diff --git a/etl/collections/model.py b/etl/collections/model.py index 5e4c6a3ddd0..47d54ff5ca1 100644 --- a/etl/collections/model.py +++ b/etl/collections/model.py @@ -76,10 +76,10 @@ def __setattr__(self, name, value): raise ValueError(f"Invalid catalog path: {value}") return super().__setattr__(name, value) - def expand_path(self, tables_by_name: Dict[str, List[Table]]) -> None: + def expand_path(self, tables_by_name: Dict[str, List[Table]]): # Do nothing if path is already complete if self.has_complete_path(): - return + return self # If path is not complete, we need to expand it! 
table_name = self.catalogPath.split("#")[0] @@ -101,6 +101,8 @@ def expand_path(self, tables_by_name: Dict[str, List[Table]]) -> None: # Build URI self.catalogPath = tb.m.dataset.uri + "/" + self.catalogPath + return self + @pruned_json @dataclass @@ -156,6 +158,8 @@ def expand_paths(self, tables_by_name: Dict[str, List[Table]]): else: dimension_val.expand_path(tables_by_name) + return self + @pruned_json @dataclass @@ -164,7 +168,7 @@ class View(MetaBase): dimensions: Dict[str, str] indicators: ViewIndicators - # NOTE: currently MetaBase.from_dict not loading Optional fields with appropriate class + # NOTE: Maybe worth putting as classes at some point? config: Optional[Any] = None metadata: Optional[Any] = None @@ -178,6 +182,27 @@ def has_multiple_indicators(self) -> bool: def metadata_is_needed(self) -> bool: return self.has_multiple_indicators and (self.metadata is None) + def expand_paths(self, tables_by_name: Dict[str, List[Table]]): + """Expand all indicator paths in the view. + + Make sure that they are all complete paths. This includes indicators in view, but also those in config (if any). + """ + # Expand paths in indicators + self.indicators.expand_paths(tables_by_name) + + # Expand paths in config fields + if self.config is not None: + if "sortColumnSlug" in self.config: + indicator = Indicator(self.config["sortColumnSlug"]).expand_path(tables_by_name) + self.config["sortColumnSlug"] = indicator.catalogPath + + if "map" in self.config: + if "columnSlug" in self.config["map"]: + indicator = Indicator(self.config["map"]["columnSlug"]).expand_path(tables_by_name) + self.config["map"]["columnSlug"] = indicator.catalogPath + + return self + # def indicators_in_view(self): # """Get the list of indicators in use in a view. 
diff --git a/etl/collections/multidim.py b/etl/collections/multidim.py index b2d08e1b6e7..fe74994c6b7 100644 --- a/etl/collections/multidim.py +++ b/etl/collections/multidim.py @@ -22,7 +22,6 @@ from etl.collections.model import Multidim from etl.collections.utils import ( expand_catalog_paths, - expand_catalog_paths_2, extract_catalog_path, get_indicators_in_view, get_tables_by_name_mapping, @@ -185,18 +184,17 @@ def upsert_multidim_data_page( mdim_catalog_path = f"{paths.namespace}/{paths.version}/{paths.short_name}#{mdim_name or paths.short_name}" mdim = Multidim.from_dict(config) - config = mdim.to_dict() # Edit views - process_mdim_views(config, dependencies=dependencies) + process_mdim_views(mdim, dependencies=dependencies) # TODO: Possibly add other edits (to dimensions?) # Upsert to DB - _upsert_multidim_data_page(mdim_catalog_path, config, owid_env) + _upsert_multidim_data_page(mdim_catalog_path, mdim.to_dict(), owid_env) -def process_mdim_views_2(mdim: Multidim, dependencies: Set[str]): +def process_mdim_views(mdim: Multidim, dependencies: Set[str]): """Process views in MDIM configuration. This includes: @@ -210,7 +208,7 @@ def process_mdim_views_2(mdim: Multidim, dependencies: Set[str]): # Go through all views and expand catalog paths for view in mdim.views: # Update indicators for each dimension, making sure they have the complete URI - expand_catalog_paths_2(view, tables_by_name=tables_by_name) + view.expand_paths(tables_by_name) # Combine metadata in views which contain multiple indicators if view.metadata_is_needed: # Check if view "contains multiple indicators" @@ -221,33 +219,6 @@ def process_mdim_views_2(mdim: Multidim, dependencies: Set[str]): ) -def process_mdim_views(config: dict, dependencies: Set[str]): - """Process views in MDIM configuration. - - This includes: - - Make sure that catalog paths for indicators are complete. 
- - TODO: Process views with multiple indicators to have adequate metadata - """ - # Get table information by table name, and table URI - tables_by_name = get_tables_by_name_mapping(dependencies) - # tables_by_uri = get_tables_by_uri_mapping(tables_by_name) # This is to be used when processing views with multiple indicators - - # Go through all views and expand catalog paths - for view in config["views"]: - # Update indicators for each dimension, making sure they have the complete URI - expand_catalog_paths(view, tables_by_name=tables_by_name) - - # Combine metadata in views which contain multiple indicators - indicators = get_indicators_in_view(view) - if (len(indicators) > 1) and ("metadata" not in view): # Check if view "contains multiple indicators" - # TODO - # view["metadata"] = build_view_metadata_multi(indicators, tables_by_uri) - log.info( - f"View with multiple indicators detected. You should edit its `metadata` field to reflect that! This will be done programmatically in the future. Check view with dimensions {view['dimensions']}" - ) - pass - - def _upsert_multidim_data_page(mdim_catalog_path: str, config: dict, owid_env: Optional[OWIDEnv] = None) -> None: """Actual upsert to DB.""" # Ensure we have an environment set diff --git a/etl/collections/utils.py b/etl/collections/utils.py index d86b9dd6f93..aa324169c7c 100644 --- a/etl/collections/utils.py +++ b/etl/collections/utils.py @@ -38,88 +38,6 @@ def get_tables_by_name_mapping(dependencies: Set[str]) -> Dict[str, List[Table]] return tb_name_to_tb -def expand_catalog_paths_2(view: "View", tables_by_name: Dict[str, List[Table]]) -> Dict[Any, Any]: - """Expand catalog paths in views to full dataset URIs. - - This function updates the given configuration dictionary in-place by modifying the dimension ('y', 'x', 'size', 'color') entries under "indicators" in each view. 
If an entry does not contain a '/', - it is assumed to be a table name that must be expanded to a full dataset URI based on - the provided dependencies. - - NOTE: Possible improvements for internal function `_expand`: - - we should make this function a bit more robust when checking the URIs. - - currently we only allow for 'table#indicator' format. We should also allow for other cases that could be useful in the event of name collisions, e.g. 'dataset/indicator#subindicator'. - - Args: - config (dict): Configuration dictionary containing views. - tables_by_name (Dict[str, List[Table]]): Mapping of table short names to tables. - """ - - def _expand_catalog_path(indicator: Union[str, Dict[str, str]]) -> Union[str, Dict[str, str]]: - """Return same indicator, but with complete catalog path.""" - - def _expand(indicator: str): - assert "#" in indicator, f"Missing '#' in indicator! '{indicator}'" - - # Complete dataset URI - if "/" in indicator: - return indicator - # table#indicator format - else: - indicator_split = indicator.split("#") - - # Check format is actually table#indicator - assert (len(indicator_split) == 2) & ( - indicator_split[0] != "" - ), f"Expected 'table#indicator' format. Instead found {indicator}" - - # Check table is in any of the datasets! - assert ( - indicator_split[0] in tables_by_name - ), f"Table name `{indicator_split[0]}` not found in dependency tables! Available tables are: {', '.join(tables_by_name.keys())}" - - # Check table name to table mapping is unique - assert ( - len(tables_by_name[indicator_split[0]]) == 1 - ), f"There are multiple dependencies (datasets) with a table named {indicator_split[0]}. Please use the complete dataset URI in this case." 
- - # Check dataset in table metadata is not None - tb = tables_by_name[indicator_split[0]][0] - assert tb.m.dataset is not None, f"Dataset not found for table {indicator_split[0]}" - - # Build URI - return tb.m.dataset.uri + "/" + indicator - - # Expand catalog path if it's a string - if isinstance(indicator, str): - return _expand(indicator) - # Expand catalog path if it's a dictionary - elif isinstance(indicator, dict): - assert "catalogPath" in indicator, "Expected 'catalogPath' key in indicator dictionary" - indicator["catalogPath"] = _expand(indicator["catalogPath"]) - return indicator - - # Update indicators for each dimension - for dim in DIMENSIONS: - if dim in view["indicators"]: - if isinstance(view["indicators"][dim], list): - view["indicators"][dim] = [_expand_catalog_path(dim) for dim in view["indicators"][dim]] - else: - view["indicators"][dim] = _expand_catalog_path(view["indicators"][dim]) - - # Update indicators from sortColumnSlug - if "config" in view: - if "sortColumnSlug" in view["config"]: - view["config"]["sortColumnSlug"] = _expand_catalog_path(view["config"]["sortColumnSlug"]) - - # Update indicators from map.columnSlug - if "config" in view: - if "map" in view["config"]: - if "columnSlug" in view["config"]["map"]: - view["config"]["map"]["columnSlug"] = _expand_catalog_path(view["config"]["map"]["columnSlug"]) - - return view - - def expand_catalog_paths(view: Dict[Any, Any], tables_by_name: Dict[str, List[Table]]) -> Dict[Any, Any]: """Expand catalog paths in views to full dataset URIs. @@ -217,6 +135,8 @@ def extract_catalog_path(indicator_raw): # DEPRECATE ################################################ +DIMENSIONS = ["y", "x", "size", "color"] + def get_indicators_in_view(view): """Get the list of indicators in use in a view. 
diff --git a/etl/steps/export/multidim/covid/latest/covid.py b/etl/steps/export/multidim/covid/latest/covid.py index 573bfdb0b11..962fdee7380 100644 --- a/etl/steps/export/multidim/covid/latest/covid.py +++ b/etl/steps/export/multidim/covid/latest/covid.py @@ -15,6 +15,8 @@ "addCountryMode": "change-country", } +print(1) + def run(dest_dir: str) -> None: # PART 1: MDIMs entirely from YAML files From 309313ea3e4857e8b656a4988ece774579d93500 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 28 Feb 2025 01:15:27 +0100 Subject: [PATCH 16/18] wip --- etl/collections/common.py | 90 +------------------ etl/collections/explorers.py | 87 ++++++++++++++++++ etl/collections/model.py | 60 +++++-------- etl/collections/multidim.py | 15 ++-- etl/collections/utils.py | 82 ----------------- .../export/explorers/covid/latest/covid.py | 2 +- .../export/multidim/covid/latest/covid.py | 2 +- 7 files changed, 120 insertions(+), 218 deletions(-) create mode 100644 etl/collections/explorers.py diff --git a/etl/collections/common.py b/etl/collections/common.py index ee9e3cfb7b1..73ab5b00a5b 100644 --- a/etl/collections/common.py +++ b/etl/collections/common.py @@ -1,89 +1 @@ -from typing import Any, Dict, List, Union - -from owid.catalog import Table - -from etl.collections.model import DIMENSIONS, View - - -def expand_catalog_paths(view: View, tables_by_name: Dict[str, List[Table]]) -> Dict[Any, Any]: - """Expand catalog paths in views to full dataset URIs. - - This function updates the given configuration dictionary in-place by modifying the dimension ('y', 'x', 'size', 'color') entries under "indicators" in each view. If an entry does not contain a '/', - it is assumed to be a table name that must be expanded to a full dataset URI based on - the provided dependencies. - - NOTE: Possible improvements for internal function `_expand`: - - we should make this function a bit more robust when checking the URIs. - - currently we only allow for 'table#indicator' format. 
We should also allow for other cases that could be useful in the event of name collisions, e.g. 'dataset/indicator#subindicator'. - - Args: - config (dict): Configuration dictionary containing views. - tables_by_name (Dict[str, List[Table]]): Mapping of table short names to tables. - """ - - def _expand_catalog_path(indicator: Union[str, Dict[str, str]]) -> Union[str, Dict[str, str]]: - """Return same indicator, but with complete catalog path.""" - - def _expand(indicator: str): - assert "#" in indicator, f"Missing '#' in indicator! '{indicator}'" - - # Complete dataset URI - if "/" in indicator: - return indicator - # table#indicator format - else: - indicator_split = indicator.split("#") - - # Check format is actually table#indicator - assert (len(indicator_split) == 2) & ( - indicator_split[0] != "" - ), f"Expected 'table#indicator' format. Instead found {indicator}" - - # Check table is in any of the datasets! - assert ( - indicator_split[0] in tables_by_name - ), f"Table name `{indicator_split[0]}` not found in dependency tables! Available tables are: {', '.join(tables_by_name.keys())}" - - # Check table name to table mapping is unique - assert ( - len(tables_by_name[indicator_split[0]]) == 1 - ), f"There are multiple dependencies (datasets) with a table named {indicator_split[0]}. Please use the complete dataset URI in this case." 
def expand_catalog_paths(view: Dict[Any, Any], tables_by_name: Dict[str, List[Table]]) -> Dict[Any, Any]:
    """Expand catalog paths in a view to full dataset URIs.

    Updates `view` in place: every indicator referenced under "indicators"
    (dimensions 'y', 'x', 'size', 'color'), plus the special config fields
    `sortColumnSlug` and `map.columnSlug`, is rewritten from the short
    'table#indicator' form to the complete catalog URI of the dataset owning
    that table. Entries that already contain a '/' are assumed to be complete
    URIs and are left untouched.

    NOTE: Possible improvements for internal function `_expand`:
    - we should make this function a bit more robust when checking the URIs.
    - currently we only allow for 'table#indicator' format. We should also allow for other cases that could be useful in the event of name collisions, e.g. 'dataset/indicator#subindicator'.

    Args:
        view (dict): A single view configuration. Mutated in place and returned.
        tables_by_name (Dict[str, List[Table]]): Mapping of table short names to tables.

    Returns:
        dict: The same `view` object, with catalog paths expanded.
    """

    def _expand_catalog_path(indicator: Union[str, Dict[str, str]]) -> Union[str, Dict[str, str]]:
        """Return same indicator, but with complete catalog path."""

        def _expand(indicator: str):
            assert "#" in indicator, f"Missing '#' in indicator! '{indicator}'"

            # Complete dataset URI: nothing to expand.
            if "/" in indicator:
                return indicator

            # table#indicator format
            table_name, *rest = indicator.split("#")

            # Check format is actually table#indicator
            assert (len(rest) == 1) and (
                table_name != ""
            ), f"Expected 'table#indicator' format. Instead found {indicator}"

            # Check table is in any of the datasets!
            assert (
                table_name in tables_by_name
            ), f"Table name `{table_name}` not found in dependency tables! Available tables are: {', '.join(tables_by_name.keys())}"

            # Check table name to table mapping is unique
            assert (
                len(tables_by_name[table_name]) == 1
            ), f"There are multiple dependencies (datasets) with a table named {table_name}. Please use the complete dataset URI in this case."

            # Check dataset in table metadata is not None
            tb = tables_by_name[table_name][0]
            assert tb.m.dataset is not None, f"Dataset not found for table {table_name}"

            # Build URI
            return tb.m.dataset.uri + "/" + indicator

        # Expand catalog path if it's a string
        if isinstance(indicator, str):
            return _expand(indicator)
        # Expand catalog path if it's a dictionary
        elif isinstance(indicator, dict):
            assert "catalogPath" in indicator, "Expected 'catalogPath' key in indicator dictionary"
            indicator["catalogPath"] = _expand(indicator["catalogPath"])
            return indicator

    # Update indicators for each dimension. NOTE: the indicator loop variable
    # is named `ind` (the original shadowed `dim`, which worked only by
    # accident of comprehension scoping and was misleading to read).
    for dim in DIMENSIONS:
        if dim in view["indicators"]:
            if isinstance(view["indicators"][dim], list):
                view["indicators"][dim] = [_expand_catalog_path(ind) for ind in view["indicators"][dim]]
            else:
                view["indicators"][dim] = _expand_catalog_path(view["indicators"][dim])

    # Expand special indicator references in the chart config (the two
    # separate `if "config" in view` blocks are merged into one).
    if "config" in view:
        # Update indicators from sortColumnSlug
        if "sortColumnSlug" in view["config"]:
            view["config"]["sortColumnSlug"] = _expand_catalog_path(view["config"]["sortColumnSlug"])

        # Update indicators from map.columnSlug
        if "map" in view["config"]:
            if "columnSlug" in view["config"]["map"]:
                view["config"]["map"]["columnSlug"] = _expand_catalog_path(view["config"]["map"]["columnSlug"])

    return view
- # """ - # indicators_view = [] - # # Get indicators from dimensions - # for dim in DIMENSIONS: - # if dim in self.indicators: - # indicator_raw = view["indicators"][dim] - # if isinstance(indicator_raw, list): - # assert dim == "y", "Only `y` can come as a list" - # indicators_view += [ - # { - # "path": extract_catalog_path(ind), - # "dimension": dim, - # } - # for ind in indicator_raw - # ] - # else: - # indicators_view.append( - # { - # "path": extract_catalog_path(indicator_raw), - # "dimension": dim, - # } - # ) - # return indicators_view + def all_indicators(self): + """Return all indicators in the view.""" + indicators_main = self.indicators.to_records() + + # Auxiliary + # indicators_aux = [] + + # validation? @pruned_json @@ -261,6 +232,17 @@ class Dimension(MetaBase): choices: Optional[List[DimensionChoice]] = None # Only allowed to be None if checkbox presentation: Optional[Dict[str, Any]] = None + def __post_init__(self): + # Validate that choices are defined for checkbox type + if self.choices is None: + if (self.presentation is None) or (self.presentation["type"] != "checkbox"): + raise ValueError(f"Choices not found for dimension: {self.slug}") + + @property + def choice_slugs(self): + if self.choices is not None: + return [choice.slug for choice in self.choices] + @pruned_json @dataclass @@ -270,6 +252,10 @@ class Collection(MetaBase): dimensions: List[Dimension] views: List[View] + def validate_views_with_dimensions(self): + """Validate that the dimension choices in all views are defined.""" + dix = {dim.slug: dim.choice_slugs for dim in self.dimensions} + @pruned_json @dataclass diff --git a/etl/collections/multidim.py b/etl/collections/multidim.py index fe74994c6b7..becc4e4a856 100644 --- a/etl/collections/multidim.py +++ b/etl/collections/multidim.py @@ -21,7 +21,6 @@ from apps.chart_sync.admin_api import AdminAPI from etl.collections.model import Multidim from etl.collections.utils import ( - expand_catalog_paths, extract_catalog_path, 
get_indicators_in_view, get_tables_by_name_mapping, @@ -191,7 +190,7 @@ def upsert_multidim_data_page( # TODO: Possibly add other edits (to dimensions?) # Upsert to DB - _upsert_multidim_data_page(mdim_catalog_path, mdim.to_dict(), owid_env) + _upsert_multidim_data_page(mdim_catalog_path, mdim, owid_env) def process_mdim_views(mdim: Multidim, dependencies: Set[str]): @@ -219,19 +218,19 @@ def process_mdim_views(mdim: Multidim, dependencies: Set[str]): ) -def _upsert_multidim_data_page(mdim_catalog_path: str, config: dict, owid_env: Optional[OWIDEnv] = None) -> None: +def _upsert_multidim_data_page(mdim_catalog_path: str, mdim: Multidim, owid_env: Optional[OWIDEnv] = None) -> None: """Actual upsert to DB.""" # Ensure we have an environment set if owid_env is None: owid_env = OWID_ENV # Validate config - validate_schema(config) - validate_multidim_config(config, owid_env.engine) + validate_schema(mdim.to_dict()) + validate_multidim_config(mdim, owid_env.engine) # Replace especial fields URIs with IDs (e.g. sortColumnSlug). # TODO: I think we could move this to the Grapher side. 
- config = replace_catalog_paths_with_ids(config) + config = replace_catalog_paths_with_ids(mdim.to_dict()) # Upsert config via Admin API admin_api = AdminAPI(owid_env) @@ -289,9 +288,9 @@ def validate_schema(config: dict) -> None: raise ValueError(f"Config validation error: {e.message}") # type: ignore -def validate_multidim_config(config: dict, engine: Engine) -> None: +def validate_multidim_config(mdim: Multidim, engine: Engine) -> None: # Ensure that all views are in choices - for dim in config["dimensions"]: + for dim in mdim.dimensions: allowed_slugs = [choice["slug"] for choice in dim["choices"]] for view in config["views"]: diff --git a/etl/collections/utils.py b/etl/collections/utils.py index aa324169c7c..78eda0a7c51 100644 --- a/etl/collections/utils.py +++ b/etl/collections/utils.py @@ -38,88 +38,6 @@ def get_tables_by_name_mapping(dependencies: Set[str]) -> Dict[str, List[Table]] return tb_name_to_tb -def expand_catalog_paths(view: Dict[Any, Any], tables_by_name: Dict[str, List[Table]]) -> Dict[Any, Any]: - """Expand catalog paths in views to full dataset URIs. - - This function updates the given configuration dictionary in-place by modifying the dimension ('y', 'x', 'size', 'color') entries under "indicators" in each view. If an entry does not contain a '/', - it is assumed to be a table name that must be expanded to a full dataset URI based on - the provided dependencies. - - NOTE: Possible improvements for internal function `_expand`: - - we should make this function a bit more robust when checking the URIs. - - currently we only allow for 'table#indicator' format. We should also allow for other cases that could be useful in the event of name collisions, e.g. 'dataset/indicator#subindicator'. - - Args: - config (dict): Configuration dictionary containing views. - tables_by_name (Dict[str, List[Table]]): Mapping of table short names to tables. 
- """ - - def _expand_catalog_path(indicator: Union[str, Dict[str, str]]) -> Union[str, Dict[str, str]]: - """Return same indicator, but with complete catalog path.""" - - def _expand(indicator: str): - assert "#" in indicator, f"Missing '#' in indicator! '{indicator}'" - - # Complete dataset URI - if "/" in indicator: - return indicator - # table#indicator format - else: - indicator_split = indicator.split("#") - - # Check format is actually table#indicator - assert (len(indicator_split) == 2) & ( - indicator_split[0] != "" - ), f"Expected 'table#indicator' format. Instead found {indicator}" - - # Check table is in any of the datasets! - assert ( - indicator_split[0] in tables_by_name - ), f"Table name `{indicator_split[0]}` not found in dependency tables! Available tables are: {', '.join(tables_by_name.keys())}" - - # Check table name to table mapping is unique - assert ( - len(tables_by_name[indicator_split[0]]) == 1 - ), f"There are multiple dependencies (datasets) with a table named {indicator_split[0]}. Please use the complete dataset URI in this case." 
- - # Check dataset in table metadata is not None - tb = tables_by_name[indicator_split[0]][0] - assert tb.m.dataset is not None, f"Dataset not found for table {indicator_split[0]}" - - # Build URI - return tb.m.dataset.uri + "/" + indicator - - # Expand catalog path if it's a string - if isinstance(indicator, str): - return _expand(indicator) - # Expand catalog path if it's a dictionary - elif isinstance(indicator, dict): - assert "catalogPath" in indicator, "Expected 'catalogPath' key in indicator dictionary" - indicator["catalogPath"] = _expand(indicator["catalogPath"]) - return indicator - - # Update indicators for each dimension - for dim in DIMENSIONS: - if dim in view["indicators"]: - if isinstance(view["indicators"][dim], list): - view["indicators"][dim] = [_expand_catalog_path(dim) for dim in view["indicators"][dim]] - else: - view["indicators"][dim] = _expand_catalog_path(view["indicators"][dim]) - - # Update indicators from sortColumnSlug - if "config" in view: - if "sortColumnSlug" in view["config"]: - view["config"]["sortColumnSlug"] = _expand_catalog_path(view["config"]["sortColumnSlug"]) - - # Update indicators from map.columnSlug - if "config" in view: - if "map" in view["config"]: - if "columnSlug" in view["config"]["map"]: - view["config"]["map"]["columnSlug"] = _expand_catalog_path(view["config"]["map"]["columnSlug"]) - - return view - - def extract_catalog_path(indicator_raw): "Indicator spec can come either as a plain string, or a dictionary." 
def extract_catalog_path(indicator_raw):
    """Return the catalog path of an indicator spec (plain string or dictionary)."""
    if isinstance(indicator_raw, dict):
        assert "catalogPath" in indicator_raw
        return indicator_raw["catalogPath"]
    if isinstance(indicator_raw, str):
        return indicator_raw
    raise ValueError(f"Unexpected indicator property type: {indicator_raw}")


def get_indicators_in_view(view):
    """Collect every indicator used in a view, as a list of records.

    Each record pairs the indicator's catalog path with the dimension it is
    plotted on:

        [
            {"path": "data://path/to/dataset#indicator", "dimension": "y"},
            ...
        ]

    TODO: This is being called twice, maybe there is a way to just call it once. Maybe if it is an attribute of a class?
    """
    records = []
    for dimension in DIMENSIONS:
        if dimension not in view["indicators"]:
            continue
        raw = view["indicators"][dimension]
        if isinstance(raw, list):
            # Lists of indicators are only valid on the y-dimension.
            assert dimension == "y", "Only `y` can come as a list"
            records.extend({"path": extract_catalog_path(ind), "dimension": dimension} for ind in raw)
        else:
            records.append({"path": extract_catalog_path(raw), "dimension": dimension})
    return records
""" +import json import re from dataclasses import dataclass from typing import Any, Dict, List, Optional, TypeVar +import fastjsonschema +import yaml from owid.catalog import Table from owid.catalog.meta import MetaBase @@ -203,14 +206,40 @@ def expand_paths(self, tables_by_name: Dict[str, List[Table]]): return self - def all_indicators(self): - """Return all indicators in the view.""" - indicators_main = self.indicators.to_records() + @property + def indicators_in_config(self): + indicators = [] + if self.config is not None: + # Get indicators from sortColumnSlug + if "sortColumnSlug" in self.config: + indicators.append(self.config["sortColumnSlug"]) + + # Update indicators from map.columnSlug + if ("map" in self.config) and "columnSlug" in self.config["map"]: + indicators.append((self.config["map"]["columnSlug"])) + + return indicators - # Auxiliary - # indicators_aux = [] + def indicators_used(self): + """Get a flatten list of all indicators used in the view. - # validation? + In addition, it also validates that indicators used in config are also in the view. + + NOTE: Use this method after expanding paths! Otherwise, it will not work as expected. E.g. view.expand_paths(tables_by_name).indicators_used() + """ + # Validate indicators in view + indicators = self.indicators.to_records() + indicators = [ind["path"] for ind in indicators] + + # All indicators in `indicators_extra` should be in `indicators`! E.g. you can't sort by an indicator that is not in the chart! + ## E.g. the indicator used to sort, should be in use in the chart! Or, the indicator in the map tab should be in use in the chart! + invalid_indicators = set(self.indicators_in_config).difference(set(indicators)) + if invalid_indicators: + raise ValueError( + f"Extra indicators not in use. This means that some indicators are referenced in the chart config (e.g. map.columnSlug or sortColumnSlug), but never used in the chart tab. 
Unexpected indicators: {invalid_indicators}" + ) + + return indicators @pruned_json @@ -228,20 +257,25 @@ class Dimension(MetaBase): slug: str name: str - # NOTE: currently MetaBase.from_dict not loading Optional fields with appropriate class - choices: Optional[List[DimensionChoice]] = None # Only allowed to be None if checkbox + choices: List[DimensionChoice] + # This is just for explorers at the moment! presentation: Optional[Dict[str, Any]] = None - def __post_init__(self): - # Validate that choices are defined for checkbox type - if self.choices is None: - if (self.presentation is None) or (self.presentation["type"] != "checkbox"): - raise ValueError(f"Choices not found for dimension: {self.slug}") + @property + def ui_type(self): + default = "dropdown" + if self.presentation is not None: + return self.presentation.get("type", default) + return default + + @property + def ui_is_bool(self): + return self.ui_type in ["checkbox"] @property def choice_slugs(self): - if self.choices is not None: - return [choice.slug for choice in self.choices] + # if self.choices is not None: + return [choice.slug for choice in self.choices] @pruned_json @@ -256,6 +290,50 @@ def validate_views_with_dimensions(self): """Validate that the dimension choices in all views are defined.""" dix = {dim.slug: dim.choice_slugs for dim in self.dimensions} + for view in self.views: + for slug, value in view.dimensions.items(): + assert slug in dix, f"Dimension {slug} not found in dimensions! View: {self.to_dict()}" + assert value in dix[slug], f"Choice {value} not found for dimension {slug}! 
View: {self.to_dict()}" + + def validate_schema(self, schema_path): + """Validate class against schema.""" + with open(schema_path) as f: + schema = json.load(f) + + validator = fastjsonschema.compile(schema) + + try: + validator(self.to_dict()) # type: ignore + except fastjsonschema.JsonSchemaException as e: + raise ValueError(f"Config validation error: {e.message}") # type: ignore + + def indicators_in_use(self): + # Get all indicators used in all views + indicators = [] + for view in self.views: + indicators.extend(view.indicators_used()) + + # Make sure indicators are unique + indicators = list(set(indicators)) + + return indicators + + def check_duplicate_views(self): + """Check for duplicate views in the collection.""" + seen_dims = set() + for view in self.views: + dims = tuple(view.dimensions.items()) + if dims in seen_dims: + raise ValueError(f"Duplicate view:\n\n{yaml.dump(view.dimensions)}") + seen_dims.add(dims) + + # NOTE: this is allowed, some views might contain other views + # Check uniqueness + # inds = pd.Series(indicators) + # vc = inds.value_counts() + # if vc[vc > 1].any(): + # raise ValueError(f"Duplicate indicators: {vc[vc > 1].index.tolist()}") + @pruned_json @dataclass diff --git a/etl/collections/multidim.py b/etl/collections/multidim.py index becc4e4a856..4af759c5b51 100644 --- a/etl/collections/multidim.py +++ b/etl/collections/multidim.py @@ -12,7 +12,6 @@ import fastjsonschema import pandas as pd -import yaml from deprecated import deprecated from owid.catalog import Table from sqlalchemy.engine import Engine @@ -21,8 +20,6 @@ from apps.chart_sync.admin_api import AdminAPI from etl.collections.model import Multidim from etl.collections.utils import ( - extract_catalog_path, - get_indicators_in_view, get_tables_by_name_mapping, records_to_dictionary, ) @@ -38,6 +35,7 @@ DIMENSIONS = ["y", "x", "size", "color"] +# TODO: Return List[Dimensions] and List[Views] instead of {"dimensions": [...], "views": [...]} def expand_config( tb: Table, 
indicator_name: Optional[str] = None, @@ -225,7 +223,7 @@ def _upsert_multidim_data_page(mdim_catalog_path: str, mdim: Multidim, owid_env: owid_env = OWID_ENV # Validate config - validate_schema(mdim.to_dict()) + mdim.validate_schema(SCHEMAS_DIR / "multidim-schema.json") validate_multidim_config(mdim, owid_env.engine) # Replace especial fields URIs with IDs (e.g. sortColumnSlug). @@ -290,64 +288,18 @@ def validate_schema(config: dict) -> None: def validate_multidim_config(mdim: Multidim, engine: Engine) -> None: # Ensure that all views are in choices - for dim in mdim.dimensions: - allowed_slugs = [choice["slug"] for choice in dim["choices"]] - - for view in config["views"]: - for dim_name, dim_value in view["dimensions"].items(): - if dim_name == dim["slug"] and dim_value not in allowed_slugs: - raise ValueError( - f"Slug `{dim_value}` does not exist in dimension `{dim_name}`. View:\n\n{yaml.dump(view)}" - ) - - # Get all used indicators - indicators = [] - for view in config["views"]: - # Get indicators from dimensions - indicators_view = get_indicators_in_view(view) - indicators_view = [ind["path"] for ind in indicators_view] - indicators_extra = [] - - # Get indicators from sortColumnSlug - if "config" in view: - if "sortColumnSlug" in view["config"]: - indicators_extra.append(extract_catalog_path(view["config"]["sortColumnSlug"])) - - # Update indicators from map.columnSlug - if "config" in view: - if "map" in view["config"]: - if "columnSlug" in view["config"]["map"]: - indicators_extra.append(extract_catalog_path(view["config"]["map"]["columnSlug"])) - - # All indicators in `indicators_extra` should be in `indicators`! E.g. you can't sort by an indicator that is not in the chart! - ## E.g. the indicator used to sort, should be in use in the chart! Or, the indicator in the map tab should be in use in the chart! 
- invalid_indicators = set(indicators_extra).difference(set(indicators_view)) - if invalid_indicators: - raise ValueError( - f"Extra indicators not in use. This means that some indicators are referenced in the chart config (e.g. map.columnSlug or sortColumnSlug), but never used in the chart tab. Unexpected indicators: {invalid_indicators}" - ) + mdim.validate_views_with_dimensions() - indicators.extend(indicators_view) + # Validate duplicate views + mdim.check_duplicate_views() - # Make sure indicators are unique - indicators = list(set(indicators)) + # Check that all indicators in mdim exist + indicators = mdim.indicators_in_use() + validate_indicators_in_db(indicators, engine) - # Validate duplicate views - seen_dims = set() - for view in config["views"]: - dims = tuple(view["dimensions"].items()) - if dims in seen_dims: - raise ValueError(f"Duplicate view:\n\n{yaml.dump(view['dimensions'])}") - seen_dims.add(dims) - - # NOTE: this is allowed, some views might contain other views - # Check uniqueness - # inds = pd.Series(indicators) - # vc = inds.value_counts() - # if vc[vc > 1].any(): - # raise ValueError(f"Duplicate indicators: {vc[vc > 1].index.tolist()}") - - # Check that all indicators exist + +def validate_indicators_in_db(indicators, engine): + """Check that indicators are in DB!""" q = """ select id, diff --git a/etl/collections/utils.py b/etl/collections/utils.py index 78eda0a7c51..2f232a49b54 100644 --- a/etl/collections/utils.py +++ b/etl/collections/utils.py @@ -1,6 +1,6 @@ import re from collections import defaultdict -from typing import Any, Dict, List, Set, Union +from typing import Dict, List, Set from owid.catalog import Dataset, Table @@ -36,60 +36,3 @@ def get_tables_by_name_mapping(dependencies: Set[str]) -> Dict[str, List[Table]] tb_name_to_tb[table_name].append(ds.read(table_name, load_data=False)) return tb_name_to_tb - - -def extract_catalog_path(indicator_raw): - "Indicator spec can come either as a plain string, or a dictionary." 
- if isinstance(indicator_raw, str): - return indicator_raw - elif isinstance(indicator_raw, dict): - assert "catalogPath" in indicator_raw - return indicator_raw["catalogPath"] - else: - raise ValueError(f"Unexpected indicator property type: {indicator_raw}") - - -################################################ -# DEPRECATE -################################################ - -DIMENSIONS = ["y", "x", "size", "color"] - - -def get_indicators_in_view(view): - """Get the list of indicators in use in a view. - - It returns the list as a list of records: - - [ - { - "path": "data://path/to/dataset#indicator", - "dimension": "y" - }, - ... - ] - - TODO: This is being called twice, maybe there is a way to just call it once. Maybe if it is an attribute of a class? - """ - indicators_view = [] - # Get indicators from dimensions - for dim in DIMENSIONS: - if dim in view["indicators"]: - indicator_raw = view["indicators"][dim] - if isinstance(indicator_raw, list): - assert dim == "y", "Only `y` can come as a list" - indicators_view += [ - { - "path": extract_catalog_path(ind), - "dimension": dim, - } - for ind in indicator_raw - ] - else: - indicators_view.append( - { - "path": extract_catalog_path(indicator_raw), - "dimension": dim, - } - ) - return indicators_view diff --git a/etl/steps/export/explorers/covid/latest/covid.config.yml b/etl/steps/export/explorers/covid/latest/covid.config.yml index 22a63714577..0809688a890 100644 --- a/etl/steps/export/explorers/covid/latest/covid.config.yml +++ b/etl/steps/export/explorers/covid/latest/covid.config.yml @@ -112,6 +112,11 @@ dimensions: type: dropdown - slug: relative name: Relative to population + choices: + - slug: false + name: "True" + - slug: true + name: Per 100,000 people presentation: type: checkbox diff --git a/etl/steps/export/explorers/covid/latest/covid.py b/etl/steps/export/explorers/covid/latest/covid.py index 33dce25e3f3..28dfd59b9b2 100644 --- a/etl/steps/export/explorers/covid/latest/covid.py +++ 
b/etl/steps/export/explorers/covid/latest/covid.py @@ -4,9 +4,8 @@ import pandas as pd -from etl.collections.explorers import expand_catalog_paths +from etl.collections.explorers import expand_catalog_paths, get_indicators_in_view from etl.collections.utils import ( - get_indicators_in_view, get_tables_by_name_mapping, records_to_dictionary, ) From bc3ed1465ea6af30052534fb45c7bbe2012f3792 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 28 Feb 2025 19:23:13 +0100 Subject: [PATCH 18/18] wip explorers --- etl/collections/explorers.py | 142 +----------------- etl/collections/model.py | 60 +++++++- .../explorers/covid/latest/covid.config.yml | 14 +- .../export/explorers/covid/latest/covid.py | 43 ++---- 4 files changed, 80 insertions(+), 179 deletions(-) diff --git a/etl/collections/explorers.py b/etl/collections/explorers.py index 4557c033209..565450734e6 100644 --- a/etl/collections/explorers.py +++ b/etl/collections/explorers.py @@ -1,137 +1,11 @@ -from typing import Any, Dict, List, Union +from typing import Optional -from owid.catalog import Table +from etl.config import OWIDEnv +from etl.helpers import PathFinder -from etl.collections.model import DIMENSIONS - -def expand_catalog_paths(view: Dict[Any, Any], tables_by_name: Dict[str, List[Table]]) -> Dict[Any, Any]: - """Expand catalog paths in views to full dataset URIs. - - This function updates the given configuration dictionary in-place by modifying the dimension ('y', 'x', 'size', 'color') entries under "indicators" in each view. If an entry does not contain a '/', - it is assumed to be a table name that must be expanded to a full dataset URI based on - the provided dependencies. - - NOTE: Possible improvements for internal function `_expand`: - - we should make this function a bit more robust when checking the URIs. - - currently we only allow for 'table#indicator' format. We should also allow for other cases that could be useful in the event of name collisions, e.g. 'dataset/indicator#subindicator'. 
- - Args: - config (dict): Configuration dictionary containing views. - tables_by_name (Dict[str, List[Table]]): Mapping of table short names to tables. - """ - - def _expand_catalog_path(indicator: Union[str, Dict[str, str]]) -> Union[str, Dict[str, str]]: - """Return same indicator, but with complete catalog path.""" - - def _expand(indicator: str): - assert "#" in indicator, f"Missing '#' in indicator! '{indicator}'" - - # Complete dataset URI - if "/" in indicator: - return indicator - # table#indicator format - else: - indicator_split = indicator.split("#") - - # Check format is actually table#indicator - assert (len(indicator_split) == 2) & ( - indicator_split[0] != "" - ), f"Expected 'table#indicator' format. Instead found {indicator}" - - # Check table is in any of the datasets! - assert ( - indicator_split[0] in tables_by_name - ), f"Table name `{indicator_split[0]}` not found in dependency tables! Available tables are: {', '.join(tables_by_name.keys())}" - - # Check table name to table mapping is unique - assert ( - len(tables_by_name[indicator_split[0]]) == 1 - ), f"There are multiple dependencies (datasets) with a table named {indicator_split[0]}. Please use the complete dataset URI in this case." 
- - # Check dataset in table metadata is not None - tb = tables_by_name[indicator_split[0]][0] - assert tb.m.dataset is not None, f"Dataset not found for table {indicator_split[0]}" - - # Build URI - return tb.m.dataset.uri + "/" + indicator - - # Expand catalog path if it's a string - if isinstance(indicator, str): - return _expand(indicator) - # Expand catalog path if it's a dictionary - elif isinstance(indicator, dict): - assert "catalogPath" in indicator, "Expected 'catalogPath' key in indicator dictionary" - indicator["catalogPath"] = _expand(indicator["catalogPath"]) - return indicator - - # Update indicators for each dimension - for dim in DIMENSIONS: - if dim in view["indicators"]: - if isinstance(view["indicators"][dim], list): - view["indicators"][dim] = [_expand_catalog_path(dim) for dim in view["indicators"][dim]] - else: - view["indicators"][dim] = _expand_catalog_path(view["indicators"][dim]) - - # Update indicators from sortColumnSlug - if "config" in view: - if "sortColumnSlug" in view["config"]: - view["config"]["sortColumnSlug"] = _expand_catalog_path(view["config"]["sortColumnSlug"]) - - # Update indicators from map.columnSlug - if "config" in view: - if "map" in view["config"]: - if "columnSlug" in view["config"]["map"]: - view["config"]["map"]["columnSlug"] = _expand_catalog_path(view["config"]["map"]["columnSlug"]) - - return view - - -def extract_catalog_path(indicator_raw): - "Indicator spec can come either as a plain string, or a dictionary." - if isinstance(indicator_raw, str): - return indicator_raw - elif isinstance(indicator_raw, dict): - assert "catalogPath" in indicator_raw - return indicator_raw["catalogPath"] - else: - raise ValueError(f"Unexpected indicator property type: {indicator_raw}") - - -def get_indicators_in_view(view): - """Get the list of indicators in use in a view. - - It returns the list as a list of records: - - [ - { - "path": "data://path/to/dataset#indicator", - "dimension": "y" - }, - ... 
@pruned_json
@dataclass
class DimensionPresentation(MetaBase):
    """Presentation (UI widget) options for a dimension.

    `type` selects the widget used to render the dimension in the explorer UI.
    """

    type: Literal["dropdown", "checkbox", "radio"]

    def __post_init__(self):
        # Runtime guard for configs loaded from YAML/JSON, where the static
        # `Literal` annotation cannot help. The accepted values are derived
        # from the annotation itself, so this check can never drift out of
        # sync with the type hint (resolves the old TODO about redundancy).
        # NOTE(review): if `from __future__ import annotations` is ever added
        # to this module, resolve the string annotation via
        # typing.get_type_hints instead.
        from typing import get_args

        accepted = get_args(self.__dataclass_fields__["type"].type)
        if self.type not in accepted:
            # Raise (not assert): asserts are stripped under `python -O`,
            # and this validates external input.
            raise ValueError(f"Invalid type: {self.type}. Accepted are {list(accepted)}")
# NOTE(review): method of `Explorer` (etl/collections/model.py); shown
# unindented because the enclosing class header is outside this chunk.
def display_config_names(self):
    """Get display names for all dimensions and choices.

    The structure of the output is:

        {
            dimension_slug: {
                "widget_name": "...",
                "choices": {choice_slug: choice_name, ...},
            },
            ...
        }

    where `widget_name` is actually not displayed anywhere, but used as header name in explorer config.
    """
    return {
        dim.slug: {
            "widget_name": f"{dim.name} {dim.ui_type.title()}",
            "choices": {choice.slug: choice.name for choice in dim.choices},
        }
        for dim in self.dimensions
    }
@@ -560,7 +560,7 @@ views: # CFR ####################### - indicators: - y: cases_deaths/cases_deaths#cfr + y: cases_deaths#cfr dimensions: metric: cfr interval: cum @@ -656,7 +656,7 @@ views: interval: cum relative: false indicators: - y: vaccinations_global/vaccinations_global#total_boosters + y: vaccinations_global#total_boosters - dimensions: metric: boosters @@ -788,7 +788,7 @@ views: y: - vaccinations_global#total_vaccinations_per_hundred - cases_deaths#new_cases_per_million_7_day_avg_right - - grapher/covid/latest/hospital/hospital#daily_occupancy_icu_per_1m + - hospital#daily_occupancy_icu_per_1m - cases_deaths#new_deaths_per_million_7_day_avg_right dimensions: metric: vax_cases_icu_deaths diff --git a/etl/steps/export/explorers/covid/latest/covid.py b/etl/steps/export/explorers/covid/latest/covid.py index 28dfd59b9b2..662d33e51f5 100644 --- a/etl/steps/export/explorers/covid/latest/covid.py +++ b/etl/steps/export/explorers/covid/latest/covid.py @@ -4,20 +4,15 @@ import pandas as pd -from etl.collections.explorers import expand_catalog_paths, get_indicators_in_view +from etl.collections.model import Explorer from etl.collections.utils import ( get_tables_by_name_mapping, - records_to_dictionary, ) from etl.helpers import PathFinder, create_explorer # Get paths and naming conventions for current step. paths = PathFinder(__file__) -OPTION_TYPES = { - "dropdown": "Dropdown", - "checkbox": "Checkbox", -} RELATED = { "deaths": { "text": "Since 8 March, we rely on data from the WHO for confirmed cases and deaths", @@ -69,9 +64,9 @@ def run(dest_dir: str) -> None: # Load grapher config from YAML config = paths.load_explorer_config() - header = config["config"] - grapher_views = config["views"] - grapher_dimensions = config["dimensions"] + explorer = Explorer.from_dict(config) + + header = explorer.config # Load necessary tables # ds = paths.load_dataset("cases_deaths") @@ -88,38 +83,23 @@ def run(dest_dir: str) -> None: # 3. 
Obtain `df_grapher`: This is the final DataFrame that will be saved as the Explorer dataset. It is basically a different presentation of the config # 1. Prepare Dimension display dictionary - dimensions_display = records_to_dictionary(grapher_dimensions, key="slug") - for slug, values in dimensions_display.items(): - # Sanity checks - assert "name" in values, f"name not found for dimension: {slug}!" - assert "presentation" in values, f"presentation not found for dimension: {slug}!" - assert "type" in values["presentation"], f"type not found for dimension: {slug}!" - - # Index choices - if "choices" not in values: - assert values["presentation"]["type"] == "checkbox", f"Choices not found for dimension: {slug}!" - else: - values["choices"] = records_to_dictionary(values["choices"], key="slug") - - # Widget name - values["widget_name"] = f"{values['name']} {values['presentation']['type'].title()}" + dimensions_display = explorer.display_config_names() # 2. Get table information by table name, and table URI tables_by_name = get_tables_by_name_mapping(paths.dependencies) # 3. Remix configuration to generate explorer-friendly graphers table. records = [] - for view in grapher_views: - # Expand catalog paths - expand_catalog_paths(view, tables_by_name) + for view in explorer.views: + view.expand_paths(tables_by_name) # Build dimensions dictionary for a view dimensions = bake_dimensions_view( dimensions_display=dimensions_display, - view=view, + view=view.to_dict(), ) # Get options and variable IDs - indicator_paths = get_indicators_in_view(view) + indicator_paths = view.indicators.to_records() # Build record record = { @@ -212,7 +192,10 @@ def bake_dimensions_view(dimensions_display, view): Given is dimension_slug: choice_slug. We need to convert it to dimension_name: choice_name (using dimensions_display). 
""" view_dimensions = {} - for slug_dim, slug_choice in view["dimensions"].items(): + for slug_dim, slug_choice in view.dimensions.items(): + # dim_name = f"{}" + # choice_name = "" + if "choices" in dimensions_display[slug_dim]: view_dimensions[dimensions_display[slug_dim]["widget_name"]] = dimensions_display[slug_dim]["choices"][ slug_choice