From 5f6a1352aef1b34834f28a916f6ce668dece3e56 Mon Sep 17 00:00:00 2001 From: Lars Yencken Date: Tue, 7 May 2024 21:19:20 +0100 Subject: [PATCH] :tada: Add owid.catalog.charts module Adds a draft API for fetching the data for a chart. ``` from owid.catalog.charts import Chart df = Chart('life-expectancy').get_data() ``` --- lib/catalog/Makefile | 5 + lib/catalog/owid/catalog/charts.py | 67 +++++++++++ lib/catalog/owid/catalog/internal.py | 171 +++++++++++++++++++++++++++ lib/catalog/tests/test_charts.py | 57 +++++++++ 4 files changed, 300 insertions(+) create mode 100644 lib/catalog/owid/catalog/charts.py create mode 100644 lib/catalog/owid/catalog/internal.py create mode 100644 lib/catalog/tests/test_charts.py diff --git a/lib/catalog/Makefile b/lib/catalog/Makefile index dc1976c27cdd..6bdfc9fc17fc 100644 --- a/lib/catalog/Makefile +++ b/lib/catalog/Makefile @@ -10,6 +10,11 @@ SRC = owid tests # watch: # poetry run watchmedo shell-command -c 'clear; make unittest' --recursive --drop . +.venv: poetry.toml pyproject.toml poetry.lock + @echo '==> Installing packages' + poetry install + touch .venv + check-typing: .venv @echo '==> Checking types' poetry run pyright $(SRC) diff --git a/lib/catalog/owid/catalog/charts.py b/lib/catalog/owid/catalog/charts.py new file mode 100644 index 000000000000..622c56e404a7 --- /dev/null +++ b/lib/catalog/owid/catalog/charts.py @@ -0,0 +1,67 @@ +# +# owid.catalog.charts +# +# +# Access to data in OWID charts. +# + +from dataclasses import dataclass +from typing import List, Optional + +import pandas as pd + +from .internal import ( + ChartNotFoundError, # noqa + LicenseError, # noqa + _fetch_bundle, + _GrapherBundle, + _list_charts, +) + + +@dataclass +class Chart: + """ + A chart published on Our World in Data, for example: + + https://ourworldindata.org/grapher/life-expectancy + """ + + slug: str + + _bundle: Optional[_GrapherBundle] = None + + @property + def bundle(self) -> _GrapherBundle: + # LARS: give a nice error if the chart does not exist + if self._bundle is None: + self._bundle = _fetch_bundle(self.slug) + + return self._bundle + + @property + def config(self) -> dict: + return self.bundle.config # type: ignore + + def get_data(self) -> pd.DataFrame: + return self.bundle.to_frame() + + def __lt__(self, other): + return self.slug < other.slug + + def __eq__(self, value: object) -> bool: + return isinstance(value, Chart) and value.slug == self.slug + + +def list_charts() -> List[str]: + """ + List all available charts published on Our World in Data. + """ + return sorted(_list_charts()) + + +def get_data(slug: str) -> pd.DataFrame: + """ + Fetch the data for a chart by its slug. + """ + return Chart(slug).get_data() diff --git a/lib/catalog/owid/catalog/internal.py b/lib/catalog/owid/catalog/internal.py new file mode 100644 index 000000000000..b6651c34616c --- /dev/null +++ b/lib/catalog/owid/catalog/internal.py @@ -0,0 +1,171 @@ +# +# internal.py +# +# Internal APIs subject to change at any time. +# + +import datetime as dt +import json +import re +from dataclasses import dataclass +from typing import Dict, List, Literal + +import pandas as pd +import requests +from dateutil.parser import parse as date_parse + + +class LicenseError(Exception): + pass + + +class ChartNotFoundError(Exception): + pass + + +@dataclass +class _Indicator: + data: dict + metadata: dict + + def to_dict(self): + return {"data": self.data, "metadata": self.metadata} + + def to_frame(self): + if self.metadata.get("nonRedistributable"): + raise LicenseError( + "API download is disallowed for this indicator due to license restrictions from the data provider" + ) + + # getting a data frame is easy + df = pd.DataFrame.from_dict(self.data) + + # turning entity ids into entity names + entities = pd.DataFrame.from_records(self.metadata["dimensions"]["entities"]["values"]) + id_to_name = entities.set_index("id").name.to_dict() + df["entities"] = df.entities.apply(id_to_name.__getitem__) + + # make the "values" column more interestingly named + short_name = self.metadata.get("shortName", f'_{self.metadata["id"]}') + df = df.rename(columns={"values": short_name}) + + time_col = self._detect_time_col_type() + if time_col == "dates": + df["years"] = self._convert_years_to_dates(df["years"]) + + # order the columns better + cols = ["entities", "years"] + sorted(df.columns.difference(["entities", "years"])) + df = df[cols] + + return df + + def _detect_time_col_type(self) -> Literal["dates", "years"]: + if self.metadata.get("display", {}).get("yearIsDay"): + return "dates" + + return "years" + + def _convert_years_to_dates(self, years): + base_date = date_parse(self.metadata["display"]["zeroDay"]) + return years.apply(lambda y: base_date + dt.timedelta(days=y)) + + +@dataclass +class _GrapherBundle: + config: dict + dimensions: Dict[int, _Indicator] + origins: List[dict] + + def to_json(self): + return json.dumps( + { + "config": self.config, + "dimensions": {k: i.to_dict() for k, i in self.dimensions.items()}, + "origins": self.origins, + } + ) + + def size(self): + return len(self.to_json()) + + @property + def indicators(self) -> List[_Indicator]: + return list(self.dimensions.values()) + + def to_frame(self): + # combine all the indicators into a single data frame and one metadata dict + metadata = {} + df = None + for i in self.indicators: + to_merge = i.to_frame() + (value_col,) = to_merge.columns.difference(["entities", "years"]) + metadata[value_col] = i.metadata.copy() + + if df is None: + df = to_merge + else: + df = pd.merge(df, to_merge, how="outer", on=["entities", "years"]) + + assert df is not None + + # save some useful metadata onto the frame + assert self.config + slug = self.config["slug"] + df.attrs["slug"] = slug + df.attrs["url"] = f"https://ourworldindata.org/grapher/{slug}" + df.attrs["metadata"] = metadata + + # if there is only one indicator, we can use the slug as the column name + if len(df.columns) == 3: + assert self.config + (value_col,) = df.columns.difference(["entities", "years"]) + short_name = slug.replace("-", "_") + df = df.rename(columns={value_col: short_name}) + df.attrs["metadata"][short_name] = df.attrs["metadata"].pop(value_col) + + df.attrs["value_col"] = short_name + + # we kept using "years" until now to keep the code paths the same, but they could + # be dates + if df["years"].astype(str).str.match(r"^\d{4}-\d{2}-\d{2}$").all(): + df = df.rename(columns={"years": "dates"}) + + return df + + def __repr__(self): + return f"GrapherBundle(config={self.config}, dimensions=..., origins=...)" + + +def _fetch_grapher_config(slug): + resp = requests.get(f"https://ourworldindata.org/grapher/{slug}") + if resp.status_code == 404: + raise ChartNotFoundError(slug) + + resp.raise_for_status() + return json.loads(resp.content.decode("utf-8").split("//EMBEDDED_JSON")[1]) + + +def _fetch_dimension(id: int) -> _Indicator: + data = requests.get(f"https://api.ourworldindata.org/v1/indicators/{id}.data.json").json() + metadata = requests.get(f"https://api.ourworldindata.org/v1/indicators/{id}.metadata.json").json() + return _Indicator(data, metadata) + + +def _fetch_bundle(slug: str) -> _GrapherBundle: + config = _fetch_grapher_config(slug) + indicator_ids = [d["variableId"] for d in config["dimensions"]] + + dimensions = {indicator_id: _fetch_dimension(indicator_id) for indicator_id in indicator_ids} + + origins = [] + for d in dimensions.values(): + if d.metadata.get("origins"): + origins.append(d.metadata.pop("origins")) + return _GrapherBundle(config, dimensions, origins) + + +def _list_charts() -> List[str]: + content = requests.get("https://ourworldindata.org/charts").content.decode("utf-8") + links = re.findall('"(/grapher/[^"]+)"', content) + slugs = [link.strip('"').split("/")[-1] for link in links] + return sorted(set(slugs)) diff --git a/lib/catalog/tests/test_charts.py b/lib/catalog/tests/test_charts.py new file mode 100644 index 000000000000..69ebc20a9301 --- /dev/null +++ b/lib/catalog/tests/test_charts.py @@ -0,0 +1,57 @@ +import pytest + +from owid.catalog import charts +from owid.catalog.internal import LicenseError + +# NOTE: the tests below make multiple network requests per check, we could consider +# mocking them out if they cause problems + + +def test_fetch_chart_data_with_slug_as_column(): + chart = charts.Chart("life-expectancy") + df = chart.get_data() + assert df is not None + assert len(df) > 0 + assert "entities" in df.columns + assert "years" in df.columns + assert "life_expectancy" in df.columns + + assert "metadata" in df.attrs + assert "life_expectancy" in df.attrs["metadata"] + + +def test_fetch_chart_data_with_multiple_indicators(): + df = charts.Chart("eat-lancet-diet-comparison").get_data() + value_cols = df.columns.difference(["entities", "years"]) + assert len(value_cols) > 1 + + assert "metadata" in df.attrs + assert all(c in df.attrs["metadata"] for c in value_cols) + + +def test_fetch_non_redistributable_chart(): + # a chart where nonRedistributable is true in the indicator's metadata; see also + # the dataset at https://admin.owid.io/admin/datasets/6457 + chart = charts.Chart("test-scores-ai-capabilities-relative-human-performance") + with pytest.raises(LicenseError): + chart.get_data() + + +def test_list_charts(): + slugs = charts.list_charts() + assert len(slugs) > 0 + assert "life-expectancy" in slugs + + # all unique + assert len(slugs) == len(set(slugs)) + + +def test_fetch_missing_chart(): + with pytest.raises(charts.ChartNotFoundError): + charts.Chart("this-chart-does-not-exist").bundle + + +def test_fetch_chart_with_dates(): + df = charts.Chart("sea-level").get_data() + assert "dates" in df.columns + assert "years" not in df.columns