diff --git a/etl/collections/base.py b/etl/collections/base.py
deleted file mode 100644
index 0a260f47769..00000000000
--- a/etl/collections/base.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""WIP: Drafting a model for dealing with MDIM/Explorer configuration.
-
-This should be aligned with the MDIM schema.
-"""
-
-from typing import Any, Dict, List, Optional
-
-
-class Config:
-    """Overall MDIM/Explorer config"""
-
-    config: Dict[str, str]
-    views: List["ViewConfig"]
-    dimensions: List["DimensionConfig"]
-
-
-class ViewConfig:
-    """MDIM/Explorer view configuration."""
-
-    dimensions: Dict[str, str]
-    indicators: "IndicatorSelection"
-    config: Optional[Any]
-    metadata: Optional[Any]
-
-
-class DimensionConfig:
-    """MDIM/Explorer dimension configuration."""
-
-    choices: List["ChoiceConfig"]
-
-
-class IndicatorSelection:
-    y: Optional[List["Indicator"]]
-    x: Optional[List["Indicator"]]
-    size: Optional[List["Indicator"]]
-    color: Optional[List["Indicator"]]
-
-
-class Indicator:
-    path: str
-    display: Dict[str, Any]
-
-
-class ChoiceConfig:
-    slug: str
-    name: str
-    description: str
diff --git a/etl/collections/common.py b/etl/collections/common.py
new file mode 100644
index 00000000000..73ab5b00a5b
--- /dev/null
+++ b/etl/collections/common.py
@@ -0,0 +1 @@
+"""Common tooling for MDIMs/Explorers."""
diff --git a/etl/collections/explorers.py b/etl/collections/explorers.py
new file mode 100644
index 00000000000..565450734e6
--- /dev/null
+++ b/etl/collections/explorers.py
@@ -0,0 +1,11 @@
+from typing import Optional
+
+from etl.config import OWIDEnv
+from etl.helpers import PathFinder
+
+
+def upsert_explorer(
+    config: dict, paths: PathFinder, explorer_name: Optional[str] = None, owid_env: Optional[OWIDEnv] = None
+) -> None:
+    """TODO: Replicate `etl.collections.multidim.upsert_multidim_data_page`."""
+    pass
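`upsert_explorer` is only a stub so far; its docstring points at `upsert_multidim_data_page` as the template. A rough sketch of the shape it might take, reusing only methods that exist in `etl/collections/model.py` below (the final write step is left out, since it is not implemented yet, and `upsert_explorer_sketch` is a hypothetical name):

    from typing import Optional

    from etl.collections.model import Explorer
    from etl.config import OWIDEnv
    from etl.helpers import PathFinder


    def upsert_explorer_sketch(
        config: dict, paths: PathFinder, explorer_name: Optional[str] = None, owid_env: Optional[OWIDEnv] = None
    ) -> None:
        # Parse the YAML config into the dataclass model.
        explorer = Explorer.from_dict(config)

        # Structural checks shared with MDIMs (defined on Collection).
        explorer.validate_views_with_dimensions()
        explorer.check_duplicate_views()

        # Expanding catalog paths and writing the explorer config would follow here.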
diff --git a/etl/collections/model.py b/etl/collections/model.py
new file mode 100644
index 00000000000..be1f64078d7
--- /dev/null
+++ b/etl/collections/model.py
@@ -0,0 +1,435 @@
+"""WIP: Drafting a model for dealing with MDIM/Explorer configuration.
+
+This should be aligned with the MDIM schema.
+
+THINGS TO SOLVE:
+
+    - If an attribute is Optional, MetaBase.from_dict does not correctly load it as the appropriate class when given.
+"""
+
+import json
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List, Literal, Optional, TypeVar
+
+import fastjsonschema
+import yaml
+from owid.catalog import Table
+from owid.catalog.meta import MetaBase
+
+DIMENSIONS = ["y", "x", "size", "color"]
+T = TypeVar("T")
+REGEX_CATALOG_PATH = (
+    r"^(?:grapher/[A-Za-z0-9_]+/(?:\d{4}-\d{2}-\d{2}|\d{4}|latest)/[A-Za-z0-9_]+/)?[A-Za-z0-9_]+#[A-Za-z0-9_]+$"
+)
+
+
+def prune_dict(d: dict) -> dict:
+    """Remove all keys starting with underscore and all empty values from a dictionary.
+
+    NOTE: This method was copied from owid.catalog.utils. It differs in one respect: it does not remove fields
+    with empty lists, because some mandatory fields can legitimately be empty.
+    (TODO: should probably fix this on the schema / engineering side)
+    """
+    out = {}
+    for k, v in d.items():
+        if not k.startswith("_") and v not in [None, {}]:
+            if isinstance(v, dict):
+                out[k] = prune_dict(v)
+            elif isinstance(v, list):
+                out[k] = [prune_dict(x) if isinstance(x, dict) else x for x in v if x not in [None, {}]]
+            else:
+                out[k] = v
+    return out
+
+
+def pruned_json(cls: T) -> T:
+    orig = cls.to_dict  # type: ignore
+
+    # Only keep non-null public variables.
+    # Calling the original to_dict returns dictionaries, not objects.
+    cls.to_dict = lambda self, **kwargs: prune_dict(orig(self, **kwargs))  # type: ignore
+
+    return cls
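For reference, a quick illustration of `prune_dict` on a made-up input:

    # Underscore-prefixed keys and None/{} values are dropped, recursively;
    # empty lists are kept on purpose (see the NOTE above).
    prune_dict({"_private": 1, "a": None, "b": [], "c": [{"x": 1}, None], "d": {"e": None}})
    # -> {"b": [], "c": [{"x": 1}], "d": {}}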
+@pruned_json
+@dataclass
+class Indicator(MetaBase):
+    catalogPath: str
+    display: Optional[Dict[str, Any]] = None
+
+    def __post_init__(self):
+        # Validate that the catalog path is either (i) complete or (ii) in the format table#indicator.
+        if not self.is_a_valid_path(self.catalogPath):
+            raise ValueError(f"Invalid catalog path: {self.catalogPath}")
+
+    def has_complete_path(self) -> bool:
+        return "/" in self.catalogPath
+
+    @classmethod
+    def is_a_valid_path(cls, path: str) -> bool:
+        pattern = re.compile(REGEX_CATALOG_PATH)
+        valid = bool(pattern.match(path))
+        return valid
+
+    def __setattr__(self, name, value):
+        """Validate that the catalog path is either (i) complete or (ii) in the format table#indicator."""
+        if hasattr(self, name):
+            if (name == "catalogPath") and (not self.is_a_valid_path(value)):
+                raise ValueError(f"Invalid catalog path: {value}")
+        return super().__setattr__(name, value)
+
+    def expand_path(self, tables_by_name: Dict[str, List[Table]]):
+        # Do nothing if the path is already complete
+        if self.has_complete_path():
+            return self
+
+        # If the path is not complete, we need to expand it!
+        table_name = self.catalogPath.split("#")[0]
+
+        # Check that the table is in one of the dependency datasets!
+        assert (
+            table_name in tables_by_name
+        ), f"Table name `{table_name}` not found in dependency tables! Available tables are: {', '.join(tables_by_name.keys())}"
+
+        # Check that the table name to table mapping is unique
+        assert (
+            len(tables_by_name[table_name]) == 1
+        ), f"There are multiple dependencies (datasets) with a table named {table_name}. Please use the complete dataset URI in this case."
+
+        # Check that the dataset in the table metadata is not None
+        tb = tables_by_name[table_name][0]
+        assert tb.m.dataset is not None, f"Dataset not found for table {table_name}"
+
+        # Build URI
+        self.catalogPath = tb.m.dataset.uri + "/" + self.catalogPath
+
+        return self
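To make the accepted path formats concrete, here is how `Indicator` validation and expansion behave under `REGEX_CATALOG_PATH`; the indicator names are illustrative, and the expanded URI assumes a `cases_deaths` table provided by a `grapher/covid/latest/cases_deaths` dependency:

    Indicator("cases_deaths#cfr")  # OK: short "table#indicator" form
    Indicator("grapher/covid/latest/cases_deaths/cases_deaths#cfr")  # OK: complete path
    Indicator("cases_deaths")  # raises ValueError: missing "#indicator" part

    # Short paths get completed against the step's dependencies:
    # Indicator("cases_deaths#cfr").expand_path(tables_by_name).catalogPath
    # -> "grapher/covid/latest/cases_deaths/cases_deaths#cfr"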
+@pruned_json
+@dataclass
+class ViewIndicators(MetaBase):
+    """Indicators in an MDIM/Explorer view."""
+
+    y: Optional[List[Indicator]] = None
+    x: Optional[Indicator] = None
+    size: Optional[Indicator] = None
+    color: Optional[Indicator] = None
+
+    @classmethod
+    def from_dict(cls, d: Dict[str, Any]) -> "ViewIndicators":
+        """Coerce the dictionary into the expected shape before passing it to the parent class."""
+        # Make a shallow copy so we don't mutate the user's dictionary in-place
+        data = dict(d)
+
+        # Coerce each dimension field (y, x, size, color) from [str, ...] -> [{"catalogPath": str}, ...]
+        for dim in DIMENSIONS:
+            if dim in data:
+                if isinstance(data[dim], list):
+                    data[dim] = [{"catalogPath": item} if isinstance(item, str) else item for item in data[dim]]
+                else:
+                    if isinstance(data[dim], str):
+                        data[dim] = [{"catalogPath": data[dim]}] if dim == "y" else {"catalogPath": data[dim]}
+                    elif dim == "y":
+                        data[dim] = [data[dim]]
+
+        # Now that data is in the expected shape, let the parent class handle the rest
+        return super().from_dict(data)
+
+    def to_records(self) -> List[Dict[str, str]]:
+        indicators = []
+        for dim in DIMENSIONS:
+            dimension_val = getattr(self, dim, None)
+            if dimension_val is None:
+                continue
+            if isinstance(dimension_val, list):
+                for d in dimension_val:
+                    indicators.append({"path": d.catalogPath, "dimension": dim})
+            else:
+                indicators.append({"path": dimension_val.catalogPath, "dimension": dim})
+        return indicators
+
+    def expand_paths(self, tables_by_name: Dict[str, List[Table]]):
+        """Expand the catalog paths of all indicators in the view."""
+        for dim in DIMENSIONS:
+            dimension_val = getattr(self, dim, None)
+            if dimension_val is None:
+                continue
+            if isinstance(dimension_val, list):
+                for indicator in dimension_val:
+                    indicator.expand_path(tables_by_name)
+            else:
+                dimension_val.expand_path(tables_by_name)
+
+        return self
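The `from_dict` coercion means the following spellings all parse to the same model (paths illustrative; nested parsing is subject to the Optional-loading caveat flagged under THINGS TO SOLVE above):

    ViewIndicators.from_dict({"y": "cases_deaths#cfr"})
    ViewIndicators.from_dict({"y": ["cases_deaths#cfr"]})
    ViewIndicators.from_dict({"y": [{"catalogPath": "cases_deaths#cfr"}]})
    # all -> ViewIndicators(y=[Indicator(catalogPath="cases_deaths#cfr")])
    # and .to_records() -> [{"path": "cases_deaths#cfr", "dimension": "y"}]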
+@pruned_json
+@dataclass
+class View(MetaBase):
+    """MDIM/Explorer view configuration."""
+
+    dimensions: Dict[str, str]
+    indicators: ViewIndicators
+    # NOTE: Maybe worth modelling these as classes at some point?
+    config: Optional[Any] = None
+    metadata: Optional[Any] = None
+
+    @property
+    def has_multiple_indicators(self) -> bool:
+        # Get list of indicators
+        indicators = self.indicators.to_records()
+        return len(indicators) > 1
+
+    @property
+    def metadata_is_needed(self) -> bool:
+        return self.has_multiple_indicators and (self.metadata is None)
+
+    def expand_paths(self, tables_by_name: Dict[str, List[Table]]):
+        """Expand all indicator paths in the view.
+
+        Make sure that they are all complete paths. This includes indicators in the view, but also those in the config (if any).
+        """
+        # Expand paths in indicators
+        self.indicators.expand_paths(tables_by_name)
+
+        # Expand paths in config fields
+        if self.config is not None:
+            if "sortColumnSlug" in self.config:
+                indicator = Indicator(self.config["sortColumnSlug"]).expand_path(tables_by_name)
+                self.config["sortColumnSlug"] = indicator.catalogPath
+
+            if "map" in self.config:
+                if "columnSlug" in self.config["map"]:
+                    indicator = Indicator(self.config["map"]["columnSlug"]).expand_path(tables_by_name)
+                    self.config["map"]["columnSlug"] = indicator.catalogPath
+
+        return self
+
+    @property
+    def indicators_in_config(self):
+        indicators = []
+        if self.config is not None:
+            # Get indicators from sortColumnSlug
+            if "sortColumnSlug" in self.config:
+                indicators.append(self.config["sortColumnSlug"])
+
+            # Get indicators from map.columnSlug
+            if ("map" in self.config) and "columnSlug" in self.config["map"]:
+                indicators.append(self.config["map"]["columnSlug"])
+
+        return indicators
+
+    def indicators_used(self):
+        """Get a flattened list of all indicators used in the view.
+
+        It also validates that indicators referenced in the config are present in the view.
+
+        NOTE: Use this method after expanding paths, e.g. `view.expand_paths(tables_by_name).indicators_used()`.
+        Otherwise it will not work as expected.
+        """
+        # Get indicators in the view
+        indicators = self.indicators.to_records()
+        indicators = [ind["path"] for ind in indicators]
+
+        # All indicators in `indicators_in_config` should be in `indicators`! E.g. you can't sort by an indicator that is not in the chart!
+        ## E.g. the indicator used to sort should be in use in the chart, and so should the indicator in the map tab!
+        invalid_indicators = set(self.indicators_in_config).difference(set(indicators))
+        if invalid_indicators:
+            raise ValueError(
+                f"Extra indicators not in use. This means that some indicators are referenced in the chart config (e.g. map.columnSlug or sortColumnSlug), but never used in the chart tab. Unexpected indicators: {invalid_indicators}"
+            )
+
+        return indicators
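`indicators_used` is what keeps config-level indicator references consistent with the chart. A small illustration (paths illustrative, and assuming nested parsing via `MetaBase.from_dict` works, see the owid-catalog change at the end of this diff):

    view = View.from_dict(
        {
            "dimensions": {"metric": "cases"},
            "indicators": {"y": "cases_deaths#total_cases"},
            "config": {"sortColumnSlug": "cases_deaths#total_deaths"},  # not plotted anywhere
        }
    )
    # Normally you would call view.expand_paths(tables_by_name) first; here both
    # sides are unexpanded, so the comparison is still consistent.
    view.indicators_used()  # raises ValueError: config references an indicator not in use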
+@pruned_json
+@dataclass
+class DimensionChoice(MetaBase):
+    slug: str
+    name: str
+    description: Optional[str] = None
+
+
+@pruned_json
+@dataclass
+class DimensionPresentation(MetaBase):
+    type: Literal["dropdown", "checkbox", "radio"]
+
+    def __post_init__(self):
+        # TODO: is there a cleaner way of validating this? Feels redundant with the Literal type specified above
+        UI_TYPE_ACCEPTED = ["dropdown", "checkbox", "radio"]
+        assert self.type in UI_TYPE_ACCEPTED, f"Invalid type: {self.type}. Accepted types are {UI_TYPE_ACCEPTED}"
+
+
+@pruned_json
+@dataclass
+class Dimension(MetaBase):
+    """MDIM/Explorer dimension configuration."""
+
+    slug: str
+    name: str
+    choices: List[DimensionChoice]
+    presentation: Optional[DimensionPresentation] = None
+
+    @property
+    def ui_type(self):
+        default = "dropdown"
+        if self.presentation is not None:
+            return self.presentation.type
+        return default
+
+    @property
+    def choice_slugs(self):
+        return [choice.slug for choice in self.choices]
+
+    @property
+    def ppt(self):
+        return self.presentation
+
+
+@pruned_json
+@dataclass
+class Collection(MetaBase):
+    """Overall MDIM/Explorer config"""
+
+    dimensions: List[Dimension]
+    views: List[View]
+
+    @property
+    def v(self):
+        return self.views
+
+    @property
+    def d(self):
+        return self.dimensions
+
+    def validate_views_with_dimensions(self):
+        """Validate that the dimension choices used in all views are defined."""
+        dix = {dim.slug: dim.choice_slugs for dim in self.dimensions}
+
+        for view in self.views:
+            for slug, value in view.dimensions.items():
+                assert slug in dix, f"Dimension {slug} not found in dimensions! View: {self.to_dict()}"
+                assert value in dix[slug], f"Choice {value} not found for dimension {slug}! View: {self.to_dict()}"
+
+    def validate_schema(self, schema_path):
+        """Validate class against schema."""
+        with open(schema_path) as f:
+            schema = json.load(f)
+
+        validator = fastjsonschema.compile(schema)
+
+        try:
+            validator(self.to_dict())  # type: ignore
+        except fastjsonschema.JsonSchemaException as e:
+            raise ValueError(f"Config validation error: {e.message}")  # type: ignore
+
+    def indicators_in_use(self):
+        # Get all indicators used in all views
+        indicators = []
+        for view in self.views:
+            indicators.extend(view.indicators_used())
+
+        # Make sure indicators are unique
+        indicators = list(set(indicators))
+
+        return indicators
+
+    def check_duplicate_views(self):
+        """Check for duplicate views in the collection."""
+        seen_dims = set()
+        for view in self.views:
+            dims = tuple(view.dimensions.items())
+            if dims in seen_dims:
+                raise ValueError(f"Duplicate view:\n\n{yaml.dump(view.dimensions)}")
+            seen_dims.add(dims)
+
+        # NOTE: this is allowed, some views might contain other views
+        # Check uniqueness
+        # inds = pd.Series(indicators)
+        # vc = inds.value_counts()
+        # if vc[vc > 1].any():
+        #     raise ValueError(f"Duplicate indicators: {vc[vc > 1].index.tolist()}")
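The two `Collection` checks catch the most common config mistakes. A minimal sketch with made-up slugs (again assuming nested `MetaBase.from_dict` parsing works):

    cfg = {
        "dimensions": [
            {"slug": "metric", "name": "Metric", "choices": [{"slug": "cases", "name": "Cases"}]},
        ],
        "views": [
            {"dimensions": {"metric": "cases"}, "indicators": {"y": "cases_deaths#total_cases"}},
            {"dimensions": {"metric": "cases"}, "indicators": {"y": "cases_deaths#total_deaths"}},
        ],
    }
    collection = Collection.from_dict(cfg)
    collection.validate_views_with_dimensions()  # passes: "cases" is a declared choice
    collection.check_duplicate_views()  # raises ValueError: both views resolve to {"metric": "cases"}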
+@pruned_json
+@dataclass
+class Explorer(Collection):
+    """Model for Explorer configuration."""
+
+    config: Dict[str, str]
+
+    def display_config_names(self):
+        """Get display names for all dimensions and choices.
+
+        The structure of the output is:
+
+            {
+                dimension_slug: {
+                    "widget_name": "...",
+                    "choices": {
+                        choice_slug: choice_name,
+                        ...
+                    }
+                },
+                ...
+            }
+
+        where `widget_name` is not displayed anywhere, but is used as a header name in the explorer config.
+        """
+        mapping = {}
+        for dim in self.dimensions:
+            mapping[dim.slug] = {
+                "widget_name": f"{dim.name} {dim.ui_type.title()}",
+                "choices": {choice.slug: choice.name for choice in dim.choices},
+            }
+        return mapping
+
+
+@pruned_json
+@dataclass
+class Multidim(Collection):
+    """Model for MDIM configuration."""
+
+    title: Dict[str, str]
+    defaultSelection: List[str]
+    topicTags: Optional[List[str]] = None
+    definitions: Optional[Any] = None
+
+
+# # def main():
+# import yaml
+
+# from etl.collections.utils import (
+#     get_tables_by_name_mapping,
+# )
+
+# f_mdim = "/home/lucas/repos/etl/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml"
+# with open(f_mdim) as istream:
+#     cfg_mdim = yaml.safe_load(istream)
+# mdim = Multidim.from_dict(cfg_mdim)
+
+# dependencies = {
+#     "data://grapher/covid/latest/hospital",
+#     "data://grapher/covid/latest/vaccinations_global",
+#     "data://grapher/covid/latest/vaccinations_manufacturer",
+#     "data://grapher/covid/latest/testing",
+#     "data://grapher/excess_mortality/latest/excess_mortality",
+#     "data-private://grapher/excess_mortality/latest/excess_mortality_economist",
+#     "data://grapher/covid/latest/xm_who",
+#     "data://grapher/covid/latest/cases_deaths",
+#     "data://grapher/covid/latest/covax",
+#     "data://grapher/covid/latest/infections_model",
+#     "data://grapher/covid/latest/google_mobility",
+#     "data://grapher/regions/2023-01-01/regions",
+# }
+# tables_by_name = get_tables_by_name_mapping(dependencies)
+
+# mdim.views[0].indicators.expand_paths(tables_by_name)
+
+# f_explorer = "/home/lucas/repos/etl/etl/steps/export/explorers/covid/latest/covid.config.yml"
+# with open(f_explorer) as istream:
+#     cfg_explorer = yaml.safe_load(istream)
+# explorer = Explorer.from_dict(cfg_explorer)
+# # cfg.views[0].indicators.y
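Putting the model together, a minimal MDIM config now round-trips like this (YAML keys follow the shapes used elsewhere in this diff; the values and indicator path are illustrative):

    import yaml

    from etl.collections.model import Multidim

    cfg = yaml.safe_load('''
    title:
      title: COVID-19 cases
      titleVariant: by interval
    defaultSelection:
      - World
    dimensions:
      - slug: interval
        name: Interval
        choices:
          - slug: weekly
            name: Weekly
    views:
      - dimensions:
          interval: weekly
        indicators:
          y: cases_deaths#weekly_cases
    ''')
    mdim = Multidim.from_dict(cfg)
    mdim.validate_views_with_dimensions()
    assert mdim.indicators_in_use() == ["cases_deaths#weekly_cases"]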
diff --git a/etl/collections/multidim.py b/etl/collections/multidim.py
index f2541e38efc..4af759c5b51 100644
--- a/etl/collections/multidim.py
+++ b/etl/collections/multidim.py
@@ -12,17 +12,14 @@
 import fastjsonschema
 import pandas as pd
-import yaml
 from deprecated import deprecated
 from owid.catalog import Table
 from sqlalchemy.engine import Engine
 from structlog import get_logger
 
 from apps.chart_sync.admin_api import AdminAPI
+from etl.collections.model import Multidim
 from etl.collections.utils import (
-    expand_catalog_paths,
-    extract_catalog_path,
-    get_indicators_in_view,
     get_tables_by_name_mapping,
     records_to_dictionary,
 )
@@ -38,6 +35,7 @@
 DIMENSIONS = ["y", "x", "size", "color"]
 
 
+# TODO: Return List[Dimensions] and List[Views] instead of {"dimensions": [...], "views": [...]}
 def expand_config(
     tb: Table,
     indicator_name: Optional[str] = None,
@@ -182,16 +180,18 @@ def upsert_multidim_data_page(
     dependencies = paths.dependencies
     mdim_catalog_path = f"{paths.namespace}/{paths.version}/{paths.short_name}#{mdim_name or paths.short_name}"
 
+    mdim = Multidim.from_dict(config)
+
     # Edit views
-    process_mdim_views(config, dependencies=dependencies)
+    process_mdim_views(mdim, dependencies=dependencies)
 
     # TODO: Possibly add other edits (to dimensions?)
 
     # Upsert to DB
-    _upsert_multidim_data_page(mdim_catalog_path, config, owid_env)
+    _upsert_multidim_data_page(mdim_catalog_path, mdim, owid_env)
 
 
-def process_mdim_views(config: dict, dependencies: Set[str]):
+def process_mdim_views(mdim: Multidim, dependencies: Set[str]):
     """Process views in MDIM configuration.
 
     This includes:
@@ -203,34 +203,32 @@
     # tables_by_uri = get_tables_by_uri_mapping(tables_by_name)  # This is to be used when processing views with multiple indicators
 
     # Go through all views and expand catalog paths
-    for view in config["views"]:
+    for view in mdim.views:
         # Update indicators for each dimension, making sure they have the complete URI
-        expand_catalog_paths(view, tables_by_name=tables_by_name)
+        view.expand_paths(tables_by_name)
 
         # Combine metadata in views which contain multiple indicators
-        indicators = get_indicators_in_view(view)
-        if (len(indicators) > 1) and ("metadata" not in view):  # Check if view "contains multiple indicators"
+        if view.metadata_is_needed:  # Check if view "contains multiple indicators"
            # TODO
            # view["metadata"] = build_view_metadata_multi(indicators, tables_by_uri)
            log.info(
-                f"View with multiple indicators detected. You should edit its `metadata` field to reflect that! This will be done programmatically in the future. Check view with dimensions {view['dimensions']}"
+                f"View with multiple indicators detected. You should edit its `metadata` field to reflect that! This will be done programmatically in the future. Check view with dimensions {view.dimensions}"
            )
-        pass
 
 
-def _upsert_multidim_data_page(mdim_catalog_path: str, config: dict, owid_env: Optional[OWIDEnv] = None) -> None:
+def _upsert_multidim_data_page(mdim_catalog_path: str, mdim: Multidim, owid_env: Optional[OWIDEnv] = None) -> None:
     """Actual upsert to DB."""
     # Ensure we have an environment set
     if owid_env is None:
         owid_env = OWID_ENV
 
     # Validate config
-    validate_schema(config)
-    validate_multidim_config(config, owid_env.engine)
+    mdim.validate_schema(SCHEMAS_DIR / "multidim-schema.json")
+    validate_multidim_config(mdim, owid_env.engine)
 
     # Replace special field URIs with IDs (e.g. sortColumnSlug).
     # TODO: I think we could move this to the Grapher side.
-    config = replace_catalog_paths_with_ids(config)
+    config = replace_catalog_paths_with_ids(mdim.to_dict())
 
     # Upsert config via Admin API
     admin_api = AdminAPI(owid_env)
@@ -288,66 +286,20 @@ def validate_schema(config: dict) -> None:
         raise ValueError(f"Config validation error: {e.message}")  # type: ignore
 
 
-def validate_multidim_config(config: dict, engine: Engine) -> None:
+def validate_multidim_config(mdim: Multidim, engine: Engine) -> None:
     # Ensure that all views are in choices
-    for dim in config["dimensions"]:
-        allowed_slugs = [choice["slug"] for choice in dim["choices"]]
-
-        for view in config["views"]:
-            for dim_name, dim_value in view["dimensions"].items():
-                if dim_name == dim["slug"] and dim_value not in allowed_slugs:
-                    raise ValueError(
-                        f"Slug `{dim_value}` does not exist in dimension `{dim_name}`. View:\n\n{yaml.dump(view)}"
-                    )
-
-    # Get all used indicators
-    indicators = []
-    for view in config["views"]:
-        # Get indicators from dimensions
-        indicators_view = get_indicators_in_view(view)
-        indicators_view = [ind["path"] for ind in indicators_view]
-        indicators_extra = []
-
-        # Get indicators from sortColumnSlug
-        if "config" in view:
-            if "sortColumnSlug" in view["config"]:
-                indicators_extra.append(extract_catalog_path(view["config"]["sortColumnSlug"]))
-
-        # Update indicators from map.columnSlug
-        if "config" in view:
-            if "map" in view["config"]:
-                if "columnSlug" in view["config"]["map"]:
-                    indicators_extra.append(extract_catalog_path(view["config"]["map"]["columnSlug"]))
-
-        # All indicators in `indicators_extra` should be in `indicators`! E.g. you can't sort by an indicator that is not in the chart!
-        ## E.g. the indicator used to sort, should be in use in the chart! Or, the indicator in the map tab should be in use in the chart!
-        invalid_indicators = set(indicators_extra).difference(set(indicators_view))
-        if invalid_indicators:
-            raise ValueError(
-                f"Extra indicators not in use. This means that some indicators are referenced in the chart config (e.g. map.columnSlug or sortColumnSlug), but never used in the chart tab. Unexpected indicators: {invalid_indicators}"
-            )
+    mdim.validate_views_with_dimensions()
 
-        indicators.extend(indicators_view)
+    # Validate duplicate views
+    mdim.check_duplicate_views()
 
-    # Make sure indicators are unique
-    indicators = list(set(indicators))
+    # Check that all indicators in mdim exist
+    indicators = mdim.indicators_in_use()
+    validate_indicators_in_db(indicators, engine)
 
-    # Validate duplicate views
-    seen_dims = set()
-    for view in config["views"]:
-        dims = tuple(view["dimensions"].items())
-        if dims in seen_dims:
-            raise ValueError(f"Duplicate view:\n\n{yaml.dump(view['dimensions'])}")
-        seen_dims.add(dims)
-
-    # NOTE: this is allowed, some views might contain other views
-    # Check uniqueness
-    # inds = pd.Series(indicators)
-    # vc = inds.value_counts()
-    # if vc[vc > 1].any():
-    #     raise ValueError(f"Duplicate indicators: {vc[vc > 1].index.tolist()}")
-
-    # Check that all indicators exist
+
+def validate_indicators_in_db(indicators, engine):
+    """Check that indicators are in DB!"""
     q = """
     select
         id,
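For context, the public entry point keeps the same calling convention as before; an export step still does roughly the following (the YAML file name is borrowed from the commented example in model.py, and the loading code is a sketch, not taken from this diff):

    import yaml

    from etl.collections.multidim import upsert_multidim_data_page

    with open("covid.cases_tests.yml") as f:
        config = yaml.safe_load(f)

    # `paths` is the step's PathFinder; its dependencies drive catalog-path expansion.
    upsert_multidim_data_page(config=config, paths=paths, mdim_name="covid_cases")
    # Internally: Multidim.from_dict -> expand paths -> validate schema and DB -> Admin API upsert.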
diff --git a/etl/collections/utils.py b/etl/collections/utils.py
index 80bd2c86eb7..2f232a49b54 100644
--- a/etl/collections/utils.py
+++ b/etl/collections/utils.py
@@ -1,13 +1,11 @@
 import re
 from collections import defaultdict
-from typing import Any, Dict, List, Set, Union
+from typing import Dict, List, Set
 
 from owid.catalog import Dataset, Table
 
 from etl.paths import DATA_DIR
 
-DIMENSIONS = ["y", "x", "size", "color"]
-
 
 def records_to_dictionary(records, key: str):
     """Transform: [{key: ..., a: ..., b: ...}, ...] -> {key: {a: ..., b: ...}, ...}."""
@@ -20,45 +18,6 @@ def records_to_dictionary(records, key: str):
     return dix
 
 
-def get_indicators_in_view(view):
-    """Get the list of indicators in use in a view.
-
-    It returns the list as a list of records:
-
-    [
-        {
-            "path": "data://path/to/dataset#indicator",
-            "dimension": "y"
-        },
-        ...
-    ]
-
-    TODO: This is being called twice, maybe there is a way to just call it once. Maybe if it is an attribute of a class?
-    """
-    indicators_view = []
-    # Get indicators from dimensions
-    for dim in DIMENSIONS:
-        if dim in view["indicators"]:
-            indicator_raw = view["indicators"][dim]
-            if isinstance(indicator_raw, list):
-                assert dim == "y", "Only `y` can come as a list"
-                indicators_view += [
-                    {
-                        "path": extract_catalog_path(ind),
-                        "dimension": dim,
-                    }
-                    for ind in indicator_raw
-                ]
-            else:
-                indicators_view.append(
-                    {
-                        "path": extract_catalog_path(indicator_raw),
-                        "dimension": dim,
-                    }
-                )
-    return indicators_view
-
-
 def get_tables_by_name_mapping(dependencies: Set[str]) -> Dict[str, List[Table]]:
     """Dictionary mapping table short name to table object.
 
@@ -77,96 +36,3 @@ def get_tables_by_name_mapping(dependencies: Set[str]) -> Dict[str, List[Table]]:
         tb_name_to_tb[table_name].append(ds.read(table_name, load_data=False))
 
     return tb_name_to_tb
-
-
-def expand_catalog_paths(view: Dict[Any, Any], tables_by_name: Dict[str, List[Table]]) -> Dict[Any, Any]:
-    """Expand catalog paths in views to full dataset URIs.
-
-    This function updates the given configuration dictionary in-place by modifying the dimension ('y', 'x', 'size', 'color') entries under "indicators" in each view. If an entry does not contain a '/',
-    it is assumed to be a table name that must be expanded to a full dataset URI based on
-    the provided dependencies.
-
-    NOTE: Possible improvements for internal function `_expand`:
-        - we should make this function a bit more robust when checking the URIs.
-        - currently we only allow for 'table#indicator' format. We should also allow for other cases that could be useful in the event of name collisions, e.g. 'dataset/indicator#subindicator'.
-
-    Args:
-        config (dict): Configuration dictionary containing views.
-        tables_by_name (Dict[str, List[Table]]): Mapping of table short names to tables.
-    """
-
-    def _expand_catalog_path(indicator: Union[str, Dict[str, str]]) -> Union[str, Dict[str, str]]:
-        """Return same indicator, but with complete catalog path."""
-
-        def _expand(indicator: str):
-            assert "#" in indicator, f"Missing '#' in indicator! '{indicator}'"
-
-            # Complete dataset URI
-            if "/" in indicator:
-                return indicator
-            # table#indicator format
-            else:
-                indicator_split = indicator.split("#")
-
-                # Check format is actually table#indicator
-                assert (len(indicator_split) == 2) & (
-                    indicator_split[0] != ""
-                ), f"Expected 'table#indicator' format. Instead found {indicator}"
-
-                # Check table is in any of the datasets!
-                assert (
-                    indicator_split[0] in tables_by_name
-                ), f"Table name `{indicator_split[0]}` not found in dependency tables! Available tables are: {', '.join(tables_by_name.keys())}"
-
-                # Check table name to table mapping is unique
-                assert (
-                    len(tables_by_name[indicator_split[0]]) == 1
-                ), f"There are multiple dependencies (datasets) with a table named {indicator_split[0]}. Please use the complete dataset URI in this case."
-
-                # Check dataset in table metadata is not None
-                tb = tables_by_name[indicator_split[0]][0]
-                assert tb.m.dataset is not None, f"Dataset not found for table {indicator_split[0]}"
-
-                # Build URI
-                return tb.m.dataset.uri + "/" + indicator
-
-        # Expand catalog path if it's a string
-        if isinstance(indicator, str):
-            return _expand(indicator)
-        # Expand catalog path if it's a dictionary
-        elif isinstance(indicator, dict):
-            assert "catalogPath" in indicator, "Expected 'catalogPath' key in indicator dictionary"
-            indicator["catalogPath"] = _expand(indicator["catalogPath"])
-            return indicator
-
-    # Update indicators for each dimension
-    for dim in DIMENSIONS:
-        if dim in view["indicators"]:
-            if isinstance(view["indicators"][dim], list):
-                view["indicators"][dim] = [_expand_catalog_path(dim) for dim in view["indicators"][dim]]
-            else:
-                view["indicators"][dim] = _expand_catalog_path(view["indicators"][dim])
-
-    # Update indicators from sortColumnSlug
-    if "config" in view:
-        if "sortColumnSlug" in view["config"]:
-            view["config"]["sortColumnSlug"] = _expand_catalog_path(view["config"]["sortColumnSlug"])
-
-    # Update indicators from map.columnSlug
-    if "config" in view:
-        if "map" in view["config"]:
-            if "columnSlug" in view["config"]["map"]:
-                view["config"]["map"]["columnSlug"] = _expand_catalog_path(view["config"]["map"]["columnSlug"])
-
-    return view
-
-
-def extract_catalog_path(indicator_raw):
-    "Indicator spec can come either as a plain string, or a dictionary."
-    if isinstance(indicator_raw, str):
-        return indicator_raw
-    elif isinstance(indicator_raw, dict):
-        assert "catalogPath" in indicator_raw
-        return indicator_raw["catalogPath"]
-    else:
-        raise ValueError(f"Unexpected indicator property type: {indicator_raw}")
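The surviving helper, `get_tables_by_name_mapping`, produces the mapping that drives path expansion in the model. Its shape, with an illustrative dependency:

    tables_by_name = get_tables_by_name_mapping({"data://grapher/covid/latest/cases_deaths"})
    # -> {"cases_deaths": [<Table 'cases_deaths'>]}
    # Values are lists because two different datasets may each contain a table with
    # the same short name; in that case expansion requires the full dataset URI.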
diff --git a/etl/steps/export/explorers/covid/latest/covid.config.yml b/etl/steps/export/explorers/covid/latest/covid.config.yml
index 2a7c737f710..8d6d23eb5c9 100644
--- a/etl/steps/export/explorers/covid/latest/covid.config.yml
+++ b/etl/steps/export/explorers/covid/latest/covid.config.yml
@@ -112,6 +112,11 @@ dimensions:
       type: dropdown
   - slug: relative
     name: Relative to population
+    choices:
+      - slug: false
+        name: "True"
+      - slug: true
+        name: Per 100,000 people
     presentation:
       type: checkbox
 
@@ -126,10 +131,10 @@ views:
       relative: false
     indicators:
       y:
-        - excess_mortality_economist#cumulative_estimated_daily_excess_deaths
-        - excess_mortality_economist#cumulative_estimated_daily_excess_deaths_ci_95_bot
-        - excess_mortality_economist#cumulative_estimated_daily_excess_deaths_ci_95_top
-        - cases_deaths#total_deaths
+        - catalogPath: excess_mortality_economist#cumulative_estimated_daily_excess_deaths
+        - catalogPath: excess_mortality_economist#cumulative_estimated_daily_excess_deaths_ci_95_bot
+        - catalogPath: excess_mortality_economist#cumulative_estimated_daily_excess_deaths_ci_95_top
+        - catalogPath: cases_deaths#total_deaths
     config:
       title: Estimated cumulative excess deaths during COVID-19
       subtitle: For countries that have not reported all-cause mortality data for a given week, an estimate is shown, with uncertainty interval. If reported data is available, that value only is shown.
@@ -555,7 +560,7 @@ views:
   # CFR
   #######################
   - indicators:
-      y: cases_deaths/cases_deaths#cfr
+      y: cases_deaths#cfr
    dimensions:
      metric: cfr
      interval: cum
@@ -651,7 +656,7 @@ views:
      interval: cum
      relative: false
    indicators:
-      y: vaccinations_global/vaccinations_global#total_boosters
+      y: vaccinations_global#total_boosters
 
  - dimensions:
      metric: boosters
@@ -783,7 +788,7 @@ views:
      y:
        - vaccinations_global#total_vaccinations_per_hundred
        - cases_deaths#new_cases_per_million_7_day_avg_right
-        - grapher/covid/latest/hospital/hospital#daily_occupancy_icu_per_1m
+        - hospital#daily_occupancy_icu_per_1m
        - cases_deaths#new_deaths_per_million_7_day_avg_right
    dimensions:
      metric: vax_cases_icu_deaths
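With the new `choices` block above, `Explorer.display_config_names()` can now produce the checkbox widget header and labels directly from the config (sketch, assuming `explorer = Explorer.from_dict(config)` and that the boolean slugs are preserved as parsed from YAML):

    explorer.display_config_names()["relative"]
    # -> {"widget_name": "Relative to population Checkbox",
    #     "choices": {False: "True", True: "Per 100,000 people"}}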
diff --git a/etl/steps/export/explorers/covid/latest/covid.py b/etl/steps/export/explorers/covid/latest/covid.py
index c7fab442fc6..662d33e51f5 100644
--- a/etl/steps/export/explorers/covid/latest/covid.py
+++ b/etl/steps/export/explorers/covid/latest/covid.py
@@ -4,21 +4,15 @@
 
 import pandas as pd
 
+from etl.collections.model import Explorer
 from etl.collections.utils import (
-    expand_catalog_paths,
-    get_indicators_in_view,
     get_tables_by_name_mapping,
-    records_to_dictionary,
 )
 from etl.helpers import PathFinder, create_explorer
 
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
 
-OPTION_TYPES = {
-    "dropdown": "Dropdown",
-    "checkbox": "Checkbox",
-}
 RELATED = {
     "deaths": {
         "text": "Since 8 March, we rely on data from the WHO for confirmed cases and deaths",
@@ -70,9 +64,9 @@ def run(dest_dir: str) -> None:
 
     # Load grapher config from YAML
     config = paths.load_explorer_config()
-    header = config["config"]
-    grapher_views = config["views"]
-    grapher_dimensions = config["dimensions"]
+    explorer = Explorer.from_dict(config)
+
+    header = explorer.config
 
     # Load necessary tables
     # ds = paths.load_dataset("cases_deaths")
@@ -89,38 +83,23 @@ def run(dest_dir: str) -> None:
     # 3. Obtain `df_grapher`: This is the final DataFrame that will be saved as the Explorer dataset. It is basically a different presentation of the config
 
     # 1. Prepare Dimension display dictionary
-    dimensions_display = records_to_dictionary(grapher_dimensions, key="slug")
-    for slug, values in dimensions_display.items():
-        # Sanity checks
-        assert "name" in values, f"name not found for dimension: {slug}!"
-        assert "presentation" in values, f"presentation not found for dimension: {slug}!"
-        assert "type" in values["presentation"], f"type not found for dimension: {slug}!"
-
-        # Index choices
-        if "choices" not in values:
-            assert values["presentation"]["type"] == "checkbox", f"Choices not found for dimension: {slug}!"
-        else:
-            values["choices"] = records_to_dictionary(values["choices"], key="slug")
-
-        # Widget name
-        values["widget_name"] = f"{values['name']} {values['presentation']['type'].title()}"
+    dimensions_display = explorer.display_config_names()
 
     # 2. Get table information by table name, and table URI
     tables_by_name = get_tables_by_name_mapping(paths.dependencies)
 
     # 3. Remix configuration to generate explorer-friendly graphers table.
     records = []
-    for view in grapher_views:
-        # Expand catalog paths
-        expand_catalog_paths(view, tables_by_name)
+    for view in explorer.views:
+        view.expand_paths(tables_by_name)
 
         # Build dimensions dictionary for a view
         dimensions = bake_dimensions_view(
             dimensions_display=dimensions_display,
             view=view,
         )
 
         # Get options and variable IDs
-        indicator_paths = get_indicators_in_view(view)
+        indicator_paths = view.indicators.to_records()
 
         # Build record
         record = {
@@ -213,7 +192,7 @@ def bake_dimensions_view(dimensions_display, view):
 
     Given is dimension_slug: choice_slug. We need to convert it to dimension_name: choice_name (using dimensions_display).
     """
     view_dimensions = {}
-    for slug_dim, slug_choice in view["dimensions"].items():
+    for slug_dim, slug_choice in view.dimensions.items():
         if "choices" in dimensions_display[slug_dim]:
             view_dimensions[dimensions_display[slug_dim]["widget_name"]] = dimensions_display[slug_dim]["choices"][
                 slug_choice
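Note that `bake_dimensions_view` now receives the `View` object itself and iterates `view.dimensions`, translating slug pairs into the display names coming from `display_config_names()`. Roughly (slugs and names illustrative):

    # view.dimensions == {"metric": "cases", "interval": "weekly"}
    # -> {"Metric Dropdown": "Confirmed cases", "Interval Dropdown": "Weekly"}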
""" view_dimensions = {} - for slug_dim, slug_choice in view["dimensions"].items(): + for slug_dim, slug_choice in view.dimensions.items(): + # dim_name = f"{}" + # choice_name = "" + if "choices" in dimensions_display[slug_dim]: view_dimensions[dimensions_display[slug_dim]["widget_name"]] = dimensions_display[slug_dim]["choices"][ slug_choice diff --git a/etl/steps/export/multidim/covid/latest/covid.population.yml b/etl/steps/export/multidim/covid/latest/covid.population.yml deleted file mode 100644 index c0e89f2740d..00000000000 --- a/etl/steps/export/multidim/covid/latest/covid.population.yml +++ /dev/null @@ -1,7 +0,0 @@ -title: - title: Population - titleVariant: by sex, age, variant -defaultSelection: - - United Kingdom -topicTags: - - COVID-19 diff --git a/etl/steps/export/multidim/covid/latest/covid.py b/etl/steps/export/multidim/covid/latest/covid.py index a249a6aa07d..84732201e1f 100644 --- a/etl/steps/export/multidim/covid/latest/covid.py +++ b/etl/steps/export/multidim/covid/latest/covid.py @@ -15,6 +15,8 @@ "addCountryMode": "change-country", } +print(2) + def run(dest_dir: str) -> None: # PART 1: MDIMs entirely from YAML files @@ -41,7 +43,6 @@ def run(dest_dir: str) -> None: paths=paths, mdim_name=fname_to_mdim_name(fname), ) - # PART 2: MDIMs hybridly generated (mix of YAML file + data) ds = paths.load_dataset("google_mobility") tb = ds.read("google_mobility") diff --git a/lib/catalog/owid/catalog/utils.py b/lib/catalog/owid/catalog/utils.py index adeb1822eb1..2e65f22d1e2 100644 --- a/lib/catalog/owid/catalog/utils.py +++ b/lib/catalog/owid/catalog/utils.py @@ -324,7 +324,7 @@ def dataclass_from_dict(cls: Optional[Type[T]], d: Dict[str, Any]) -> T: key_type, value_type = args init_args[field_name] = {k: dataclass_from_dict(value_type, item) for k, item in v.items()} elif dataclasses.is_dataclass(field_type): - init_args[field_name] = dataclass_from_dict(field_type, v) # type: ignore + init_args[field_name] = field_type.from_dict(v) # type: ignore elif isinstance(field_type, type) and field_type not in (Any,): try: init_args[field_name] = field_type(v)