From 5f6a1352aef1b34834f28a916f6ce668dece3e56 Mon Sep 17 00:00:00 2001
From: Lars Yencken <lars@yencken.org>
Date: Tue, 7 May 2024 21:19:20 +0100
Subject: [PATCH] :tada: Add owid.catalog.charts module

Adds a draft API for fetching the data for a chart.

```
from owid.catalog.charts import Chart

df = Chart('life-expectancy').get_data()
```
---
 lib/catalog/Makefile                 |   5 +
 lib/catalog/owid/catalog/charts.py   |  67 +++++++++++
 lib/catalog/owid/catalog/internal.py | 171 +++++++++++++++++++++++++++
 lib/catalog/tests/test_charts.py     |  57 +++++++++
 4 files changed, 300 insertions(+)
 create mode 100644 lib/catalog/owid/catalog/charts.py
 create mode 100644 lib/catalog/owid/catalog/internal.py
 create mode 100644 lib/catalog/tests/test_charts.py

diff --git a/lib/catalog/Makefile b/lib/catalog/Makefile
index dc1976c27cdd..6bdfc9fc17fc 100644
--- a/lib/catalog/Makefile
+++ b/lib/catalog/Makefile
@@ -10,6 +10,11 @@ SRC = owid tests
 # watch:
 # 	poetry run watchmedo shell-command -c 'clear; make unittest' --recursive --drop .
 
+.venv: poetry.toml pyproject.toml poetry.lock
+	@echo '==> Installing packages'
+	poetry install
+	touch .venv
+
 check-typing: .venv
 	@echo '==> Checking types'
 	poetry run pyright $(SRC)
diff --git a/lib/catalog/owid/catalog/charts.py b/lib/catalog/owid/catalog/charts.py
new file mode 100644
index 000000000000..622c56e404a7
--- /dev/null
+++ b/lib/catalog/owid/catalog/charts.py
@@ -0,0 +1,67 @@
+#
+#  owid.catalog.charts
+#
+#
+#  Access to data in OWID charts.
+#
+
+from dataclasses import dataclass
+from typing import List, Optional
+
+import pandas as pd
+
+from .internal import (
+    ChartNotFoundError,  # noqa
+    LicenseError,  # noqa
+    _fetch_bundle,
+    _GrapherBundle,
+    _list_charts,
+)
+
+
+@dataclass
+class Chart:
+    """
+    A chart published on Our World in Data, for example:
+
+    https://ourworldindata.org/grapher/life-expectancy
+    """
+
+    slug: str  # last path segment of the chart's grapher URL
+
+    _bundle: Optional[_GrapherBundle] = None  # cache filled by the first access to `bundle`
+
+    @property
+    def bundle(self) -> _GrapherBundle:
+        # fetched lazily and cached; raises ChartNotFoundError if the grapher page returns 404
+        if self._bundle is None:
+            self._bundle = _fetch_bundle(self.slug)
+
+        return self._bundle
+
+    @property
+    def config(self) -> dict:
+        return self.bundle.config  # type: ignore
+
+    def get_data(self) -> pd.DataFrame:
+        return self.bundle.to_frame()
+
+    def __lt__(self, other):
+        return self.slug < other.slug if isinstance(other, Chart) else NotImplemented
+
+    def __eq__(self, value: object) -> bool:
+        return isinstance(value, Chart) and value.slug == self.slug
+
+
+def list_charts() -> List[str]:
+    """
+    Return the slugs of all charts listed on https://ourworldindata.org/charts, in sorted order.
+    """
+    return sorted(_list_charts())
+
+
+def get_data(slug: str) -> pd.DataFrame:
+    """
+    Fetch the data for a chart by its slug, as a single data frame.
+    """
+    return Chart(slug).get_data()
diff --git a/lib/catalog/owid/catalog/internal.py b/lib/catalog/owid/catalog/internal.py
new file mode 100644
index 000000000000..b6651c34616c
--- /dev/null
+++ b/lib/catalog/owid/catalog/internal.py
@@ -0,0 +1,171 @@
+#
+#  internal.py
+#
+#  Internal APIs subject to change at any time.
+#
+
+import datetime as dt
+import json
+import re
+from dataclasses import dataclass
+from typing import Dict, List, Literal
+
+import pandas as pd
+import requests
+from dateutil.parser import parse as date_parse
+
+
+class LicenseError(Exception):
+    """Raised when an indicator is marked nonRedistributable and its data may not be downloaded."""
+
+
+class ChartNotFoundError(Exception):
+    """Raised when the grapher page for a chart slug returns HTTP 404."""
+
+
+@dataclass
+class _Indicator:
+    data: dict  # raw payload of the indicator's .data.json endpoint
+    metadata: dict  # raw payload of the indicator's .metadata.json endpoint
+
+    def to_dict(self):
+        return {"data": self.data, "metadata": self.metadata}
+
+    def to_frame(self):
+        if self.metadata.get("nonRedistributable"):
+            raise LicenseError(
+                "API download is disallowed for this indicator due to license restrictions from the data provider"
+            )
+
+        # getting a data frame is easy
+        df = pd.DataFrame.from_dict(self.data)
+
+        # turning entity ids into entity names
+        entities = pd.DataFrame.from_records(self.metadata["dimensions"]["entities"]["values"])
+        id_to_name = entities.set_index("id").name.to_dict()
+        df["entities"] = df.entities.apply(id_to_name.__getitem__)
+
+        # name the "values" column after the indicator; fall back to its id when shortName is missing or empty
+        short_name = self.metadata.get("shortName") or f'_{self.metadata["id"]}'
+        df = df.rename(columns={"values": short_name})
+
+        time_col = self._detect_time_col_type()  # with yearIsDay, "years" holds day offsets from zeroDay
+        if time_col == "dates":
+            df["years"] = self._convert_years_to_dates(df["years"])
+
+        # order the columns better
+        cols = ["entities", "years"] + sorted(df.columns.difference(["entities", "years"]))
+        df = df[cols]
+
+        return df
+
+    def _detect_time_col_type(self) -> Literal["dates", "years"]:
+        if self.metadata.get("display", {}).get("yearIsDay"):
+            return "dates"
+
+        return "years"
+
+    def _convert_years_to_dates(self, years):
+        base_date = date_parse(self.metadata["display"]["zeroDay"])
+        return years.apply(lambda y: base_date + dt.timedelta(days=y))
+
+
+@dataclass
+class _GrapherBundle:
+    config: dict  # the chart's grapher config
+    dimensions: Dict[int, _Indicator]  # indicators keyed by variable id
+    origins: List[dict]  # "origins" entries collected from the indicators' metadata
+
+    def to_json(self):
+        return json.dumps(
+            {
+                "config": self.config,
+                "dimensions": {k: i.to_dict() for k, i in self.dimensions.items()},
+                "origins": self.origins,
+            }
+        )
+
+    def size(self):
+        return len(self.to_json())
+
+    @property
+    def indicators(self) -> List[_Indicator]:
+        return list(self.dimensions.values())
+
+    def to_frame(self):
+        # combine all the indicators into a single data frame and one metadata dict
+        metadata = {}
+        df = None
+        for i in self.indicators:
+            to_merge = i.to_frame()
+            (value_col,) = to_merge.columns.difference(["entities", "years"])
+            metadata[value_col] = i.metadata.copy()
+
+            if df is None:
+                df = to_merge
+            else:
+                df = pd.merge(df, to_merge, how="outer", on=["entities", "years"])
+
+        if df is None:
+            raise ValueError("cannot build a data frame for a chart with no indicators")
+
+        # save some useful metadata onto the frame
+        assert self.config
+        slug = self.config["slug"]
+        df.attrs["slug"] = slug
+        df.attrs["url"] = f"https://ourworldindata.org/grapher/{slug}"
+        df.attrs["metadata"] = metadata
+
+        # if there is only one indicator, we can use the slug as the column name
+        if len(df.columns) == 3:
+            (value_col,) = df.columns.difference(["entities", "years"])
+            short_name = slug.replace("-", "_")
+            df = df.rename(columns={value_col: short_name})
+            df.attrs["metadata"][short_name] = df.attrs["metadata"].pop(value_col)
+
+            df.attrs["value_col"] = short_name
+
+        # we kept using "years" until now to keep the code paths the same, but they could
+        # be dates
+        if df["years"].astype(str).str.match(r"^\d{4}-\d{2}-\d{2}$").all():
+            df = df.rename(columns={"years": "dates"})
+
+        return df
+
+    def __repr__(self):
+        return f"GrapherBundle(config={self.config}, dimensions=..., origins=...)"
+
+
+def _fetch_grapher_config(slug):
+    resp = requests.get(f"https://ourworldindata.org/grapher/{slug}", timeout=30)  # never hang forever
+    if resp.status_code == 404:
+        raise ChartNotFoundError(slug)
+
+    resp.raise_for_status()  # any other HTTP error surfaces as requests.HTTPError
+    return json.loads(resp.content.decode("utf-8").split("//EMBEDDED_JSON")[1])
+
+
+def _fetch_dimension(id: int) -> _Indicator:
+    data = requests.get(f"https://api.ourworldindata.org/v1/indicators/{id}.data.json", timeout=30).json()
+    metadata = requests.get(f"https://api.ourworldindata.org/v1/indicators/{id}.metadata.json", timeout=30).json()
+    return _Indicator(data, metadata)
+
+
+def _fetch_bundle(slug: str) -> _GrapherBundle:
+    config = _fetch_grapher_config(slug)
+    indicator_ids = [d["variableId"] for d in config["dimensions"]]  # one variable id per chart dimension
+
+    dimensions = {indicator_id: _fetch_dimension(indicator_id) for indicator_id in indicator_ids}
+
+    origins = []  # NOTE(review): one entry per indicator; a list of lists if "origins" is itself a list -- confirm
+    for d in dimensions.values():
+        if d.metadata.get("origins"):
+            origins.append(d.metadata.pop("origins"))  # pop: origins are moved out of the indicator metadata
+    return _GrapherBundle(config, dimensions, origins)
+
+
+def _list_charts() -> List[str]:
+    resp = requests.get("https://ourworldindata.org/charts", timeout=30)
+    resp.raise_for_status()  # don't silently return an empty list when the page fails to load
+    links = re.findall('"(/grapher/[^"]+)"', resp.content.decode("utf-8"))
+    return sorted({link.split("/")[-1] for link in links})  # de-duplicated slugs
diff --git a/lib/catalog/tests/test_charts.py b/lib/catalog/tests/test_charts.py
new file mode 100644
index 000000000000..69ebc20a9301
--- /dev/null
+++ b/lib/catalog/tests/test_charts.py
@@ -0,0 +1,57 @@
+import pytest
+
+from owid.catalog import charts
+from owid.catalog.internal import LicenseError
+
+# NOTE: the tests below make multiple network requests per check, we could consider
+#       mocking them out if they cause problems
+
+
+def test_fetch_chart_data_with_slug_as_column():
+    chart = charts.Chart("life-expectancy")
+    df = chart.get_data()
+    assert df is not None
+    assert len(df) > 0
+    assert "entities" in df.columns
+    assert "years" in df.columns
+    assert "life_expectancy" in df.columns
+
+    assert "metadata" in df.attrs
+    assert "life_expectancy" in df.attrs["metadata"]
+
+
+def test_fetch_chart_data_with_multiple_indicators():
+    df = charts.Chart("eat-lancet-diet-comparison").get_data()
+    value_cols = df.columns.difference(["entities", "years"])
+    assert len(value_cols) > 1
+
+    assert "metadata" in df.attrs
+    assert all(c in df.attrs["metadata"] for c in value_cols)
+
+
+def test_fetch_non_redistributable_chart():
+    # a chart where nonRedistributable is true in the indicator's metadata; see also
+    # the dataset at https://admin.owid.io/admin/datasets/6457
+    chart = charts.Chart("test-scores-ai-capabilities-relative-human-performance")
+    with pytest.raises(LicenseError):
+        chart.get_data()
+
+
+def test_list_charts():
+    slugs = charts.list_charts()
+    assert len(slugs) > 0
+    assert "life-expectancy" in slugs
+
+    # all unique
+    assert len(slugs) == len(set(slugs))
+
+
+def test_fetch_missing_chart():
+    with pytest.raises(charts.ChartNotFoundError):
+        charts.Chart("this-chart-does-not-exist").bundle  # accessing the property triggers the fetch
+
+
+def test_fetch_chart_with_dates():
+    df = charts.Chart("sea-level").get_data()
+    assert "dates" in df.columns
+    assert "years" not in df.columns