From 8c0834b81387f8c345b32cc81cdfaebf91760875 Mon Sep 17 00:00:00 2001
From: Lars Yencken
Date: Tue, 7 May 2024 21:19:20 +0100
Subject: [PATCH] :tada: Add owid.catalog.charts module

Adds a draft API for fetching the data for a chart.

```
from owid.catalog.charts import Chart

df = Chart('life-expectancy').get_data()
```
---
 lib/catalog/Makefile                 |   5 +
 lib/catalog/owid/catalog/charts.py   |  53 +++++++++
 lib/catalog/owid/catalog/internal.py | 164 +++++++++++++++++++++++++++
 lib/catalog/tests/test_charts.py     |  18 +++
 4 files changed, 240 insertions(+)
 create mode 100644 lib/catalog/owid/catalog/charts.py
 create mode 100644 lib/catalog/owid/catalog/internal.py
 create mode 100644 lib/catalog/tests/test_charts.py

diff --git a/lib/catalog/Makefile b/lib/catalog/Makefile
index dc1976c27cdd..6bdfc9fc17fc 100644
--- a/lib/catalog/Makefile
+++ b/lib/catalog/Makefile
@@ -10,6 +10,11 @@ SRC = owid tests
 # watch:
 # poetry run watchmedo shell-command -c 'clear; make unittest' --recursive --drop .
 
+.venv: poetry.toml pyproject.toml poetry.lock
+	@echo '==> Installing packages'
+	poetry install
+	touch .venv
+
 check-typing: .venv
 	@echo '==> Checking types'
 	poetry run pyright $(SRC)
diff --git a/lib/catalog/owid/catalog/charts.py b/lib/catalog/owid/catalog/charts.py
new file mode 100644
index 000000000000..f8b67be40058
--- /dev/null
+++ b/lib/catalog/owid/catalog/charts.py
@@ -0,0 +1,53 @@
+#
+# owid.catalog.charts
+#
+#
+# Access to data in OWID charts.
+#
+
+from dataclasses import dataclass
+from typing import List, Optional
+
+import pandas as pd
+
+from .internal import _fetch_bundle, _GrapherBundle, _list_charts
+
+
+@dataclass
+class Chart:
+    """
+    A chart published on Our World in Data, for example:
+
+    https://ourworldindata.org/grapher/life-expectancy
+    """
+
+    slug: str
+
+    # fetched on first use by `config` / `get_data()` and then reused
+    _bundle: Optional[_GrapherBundle] = None
+
+    @property
+    def config(self) -> dict:
+        """
+        The chart's config, fetched on first access and cached on the instance.
+        """
+        if self._bundle is None:
+            self._bundle = _fetch_bundle(self.slug)
+
+        return self._bundle.config  # type: ignore
+
+    def get_data(self) -> pd.DataFrame:
+        """
+        Return the chart's data as a data frame, fetching it on first use.
+        """
+        if self._bundle is None:
+            self._bundle = _fetch_bundle(self.slug)
+
+        return self._bundle.to_frame()  # type: ignore
+
+
+def list_charts() -> List[Chart]:
+    """
+    List all available charts published on Our World in Data.
+    """
+    return [Chart(slug) for slug in _list_charts()]
diff --git a/lib/catalog/owid/catalog/internal.py b/lib/catalog/owid/catalog/internal.py
new file mode 100644
index 000000000000..779dbe7b402d
--- /dev/null
+++ b/lib/catalog/owid/catalog/internal.py
@@ -0,0 +1,164 @@
+#
+# internal.py
+#
+# Internal APIs subject to change at any time.
+#
+
+import json
+import re
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+import pandas as pd
+import requests
+
+
+@dataclass
+class _Indicator:
+    """
+    The data and metadata documents fetched for a single indicator.
+    """
+
+    data: dict
+    metadata: dict
+
+    def to_dict(self):
+        return {"data": self.data, "metadata": self.metadata}
+
+    def to_frame(self):
+        """
+        Return the data as a frame, with entity ids replaced by entity names and
+        the "values" column renamed after the indicator.
+        """
+        # getting a data frame is easy
+        df = pd.DataFrame.from_dict(self.data)
+
+        # turning entity ids into entity names
+        entities = pd.DataFrame.from_records(self.metadata["dimensions"]["entities"]["values"])
+        id_to_name = entities.set_index("id").name.to_dict()
+        df["entities"] = df.entities.apply(id_to_name.__getitem__)
+
+        # make the "values" column more interestingly named
+        short_name = self.metadata.get("shortName", f'_{self.metadata["id"]}')
+        df = df.rename(columns={"values": short_name})
+
+        # order the columns better
+        cols = ["entities", "years"] + sorted([c for c in df.columns if c not in ["entities", "years"]])
+        df = df[cols]
+
+        return df
+
+
+@dataclass
+class _GrapherBundle:
+    """
+    A chart config (if any) together with the indicators it uses and their origins.
+    """
+
+    config: Optional[dict]
+    dimensions: Dict[int, _Indicator]
+    origins: List[dict]
+
+    def to_json(self):
+        return json.dumps(
+            {
+                "config": self.config,
+                "dimensions": {k: i.to_dict() for k, i in self.dimensions.items()},
+                "origins": self.origins,
+            }
+        )
+
+    def size(self):
+        return len(self.to_json())
+
+    @property
+    def indicators(self) -> List[_Indicator]:
+        return list(self.dimensions.values())
+
+    def to_frame(self):
+        """
+        Outer-join the frames of all indicators on "entities" and "years".
+        """
+        df = None
+        for i in self.indicators:
+            to_merge = i.to_frame()
+            if df is None:
+                df = to_merge
+            else:
+                df = pd.merge(df, to_merge, how="outer", on=["entities", "years"])
+
+        assert df is not None
+
+        if len(df.columns) == 3:
+            # use the slug as the column name for values
+            assert self.config
+            (value_col,) = [c for c in df.columns if c not in ["entities", "years"]]
+            slug = self.config["slug"].replace("-", "_")
+            df = df.rename(columns={value_col: slug})
+
+        return df
+
+    def __repr__(self):
+        return f"GrapherBundle(config={self.config}, dimensions=..., origins=...)"
+
+
+def _fetch_grapher_config(slug):
+    """
+    Fetch a chart's page and parse the JSON segment after the first //EMBEDDED_JSON marker.
+    """
+    resp = requests.get(f"https://ourworldindata.org/grapher/{slug}")
+    resp.raise_for_status()
+    return json.loads(resp.content.decode("utf-8").split("//EMBEDDED_JSON")[1])
+
+
+def _fetch_json(url: str):
+    """
+    GET a URL and decode the JSON body, raising on an HTTP error status.
+    """
+    resp = requests.get(url)
+    resp.raise_for_status()
+    return resp.json()
+
+
+def _fetch_dimension(id: int) -> _Indicator:
+    """
+    Fetch the data and metadata documents for one indicator id.
+    """
+    data = _fetch_json(f"https://api.ourworldindata.org/v1/indicators/{id}.data.json")
+    metadata = _fetch_json(f"https://api.ourworldindata.org/v1/indicators/{id}.metadata.json")
+    return _Indicator(data, metadata)
+
+
+def _fetch_bundle(slug: Optional[str] = None, indicator_id: Optional[int] = None) -> _GrapherBundle:
+    """
+    Fetch the bundle for a chart slug, or for a single indicator id when no slug is given.
+    """
+    indicator_ids: List[int]
+    if slug:
+        config = _fetch_grapher_config(slug)
+        indicator_ids = [d["variableId"] for d in config["dimensions"]]
+    else:
+        assert indicator_id is not None
+        print(f"Fetching indicator {indicator_id}")
+        config = None
+        indicator_ids = [indicator_id]
+
+    dimensions = {indicator_id: _fetch_dimension(indicator_id) for indicator_id in indicator_ids}
+
+    # origins are moved out of each indicator's metadata into the bundle
+    origins = []
+    for d in dimensions.values():
+        if d.metadata.get("origins"):
+            origins.append(d.metadata.pop("origins"))
+    return _GrapherBundle(config, dimensions, origins)
+
+
+def _list_charts() -> List[str]:
+    """
+    Return the slug of every /grapher/ link found on the charts index page.
+    """
+    resp = requests.get("https://ourworldindata.org/charts")
+    resp.raise_for_status()
+    links = re.findall('"(/grapher/[^"]+)"', resp.content.decode("utf-8"))
+    slugs = [link.strip('"').split("/")[-1] for link in links]
+    return slugs
diff --git a/lib/catalog/tests/test_charts.py b/lib/catalog/tests/test_charts.py
new file mode 100644
index 000000000000..f3eb0a53a8dd
--- /dev/null
+++ b/lib/catalog/tests/test_charts.py
@@ -0,0 +1,18 @@
+from owid.catalog import charts
+
+
+def test_fetch_chart_data():
+    chart = charts.Chart("life-expectancy")
+    df = chart.get_data()
+    assert df is not None
+    assert len(df) > 0
+    assert "entities" in df.columns
+    assert "years" in df.columns
+    assert "life_expectancy" in df.columns
+
+
+def test_list_charts():
+    cs = charts.list_charts()
+    assert len(cs) > 0
+    assert all(isinstance(c, charts.Chart) for c in cs)
+    assert "life-expectancy" in [c.slug for c in cs]