
Commit

Merge branch 'master' into data/undp
lucasrodes committed Apr 17, 2024
2 parents 17eac8f + 05a63cf commit 46010e8
Showing 225 changed files with 12,922 additions and 1,252 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -49,3 +49,5 @@ site/
.wizardcfg/*
.streamlit/*
.ipynb_lock
+.execution_time.json

9 changes: 9 additions & 0 deletions Makefile
@@ -23,6 +23,7 @@ help:
@echo ' make format-all Format code (including modules in lib/)'
@echo ' make full Fetch all data and run full transformations'
@echo ' make grapher Publish supported datasets to Grapher'
+@echo ' make sync.catalog Sync catalog from R2 into local data/ folder'
@echo ' make lab Start a Jupyter Lab server'
@echo ' make publish Publish the generated catalog to S3'
@echo ' make api Start the ETL API on port 8081'
@@ -118,6 +119,14 @@ prune: .venv
@echo '==> Prune datasets with no recipe from catalog'
poetry run etl d prune

+# Syncing the catalog is useful if you want to avoid rebuilding it locally from
+# scratch, which could take a few hours. This will download ~10 GB from the main
+# channels (meadow, garden, open_numbers) and is especially useful when we
+# increase ETL_EPOCH or update regions.
+sync.catalog: .venv
+@echo '==> Sync catalog from R2 into local data/ folder (~10gb)'
+rclone copy owid-r2:owid-catalog/ data/ --verbose --fast-list --transfers=64 --checkers=64 --include "/meadow/**" --include "/garden/**" --include "/open_numbers/**"

grapher: .venv
@echo '==> Running full etl with grapher upsert'
poetry run etl run --grapher
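The new `sync.catalog` target assumes an rclone remote named `owid-r2` pointing at the Cloudflare R2 bucket. For readers setting this up locally, an illustrative `rclone.conf` entry might look like the following (the remote name comes from the Makefile above; all key values are placeholders):

```
[owid-r2]
type = s3
provider = Cloudflare
access_key_id = <your-access-key>
secret_access_key = <your-secret-key>
endpoint = https://<account-id>.r2.cloudflarestorage.com
```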
6 changes: 5 additions & 1 deletion api/v1/__init__.py
@@ -179,8 +179,12 @@ def _indicator_metadata_dict(indicator: Indicator, db_indicator: gm.Variable) ->
indicator_update_dict = indicator.to_meta_dict()
update_period_days = indicator_update_dict.pop("update_period_days", None)

+# if indicator has dimensions, use its original name
+original_short_name = (db_indicator.dimensions or {}).get("originalShortName")
+short_name = original_short_name or db_indicator.shortName

# create dictionary for metadata
-meta_dict = {"tables": {db_indicator.table_name: {"variables": {db_indicator.shortName: indicator_update_dict}}}}
+meta_dict = {"tables": {db_indicator.table_name: {"variables": {short_name: indicator_update_dict}}}}

if update_period_days:
meta_dict["dataset"] = {"update_period_days": update_period_days}
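A toy illustration of the new fallback above (values are hypothetical; the real `dimensions` dict comes from the grapher database):

```python
# Hypothetical values, mirroring the fallback logic in _indicator_metadata_dict.
dimensions = {"originalShortName": "deaths"}  # present for indicators with dimensions
short_name = (dimensions or {}).get("originalShortName") or "deaths__age_group_15_19"
assert short_name == "deaths"  # falls back to shortName when dimensions is empty
```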
4 changes: 2 additions & 2 deletions apps/backport/backport.py
@@ -20,7 +20,7 @@
from etl import config, paths
from etl import grapher_model as gm
from etl.backport_helpers import GrapherConfig
-from etl.db import get_engine
+from etl.db import get_engine, read_sql
from etl.files import checksum_str
from etl.snapshot import Snapshot, SnapshotMeta

@@ -346,7 +346,7 @@ def _load_values(engine: Engine, variable_ids: list[int]) -> pd.DataFrame:
"entityCode": "entity_code",
}
)
-vf: pd.DataFrame = pd.read_sql(q, engine, params={"variable_ids": variable_ids})
+vf = read_sql(q, engine, params={"variable_ids": variable_ids})
df = df.merge(vf, on="variable_id")

# try converting values to float if possible, this can make the data 50% smaller
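This commit repeatedly swaps `pd.read_sql` for a `read_sql` helper imported from `etl.db`. The helper's implementation is not part of the rendered hunks; a minimal sketch of what such a wrapper could look like, purely as an assumption:

```python
# Assumed sketch -- the real helper lives in etl/db.py and is not shown in this diff.
import pandas as pd
from sqlalchemy.engine import Engine


def read_sql(sql: str, engine: Engine, **kwargs) -> pd.DataFrame:
    """Run a query and always return a DataFrame, centralizing connection handling."""
    with engine.connect() as con:
        return pd.read_sql(sql, con, **kwargs)
```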
4 changes: 2 additions & 2 deletions apps/backport/bulk_backport.py
@@ -9,7 +9,7 @@
from sqlalchemy.engine import Engine

from etl import config
-from etl.db import get_engine
+from etl.db import get_engine, read_sql
from etl.snapshot import snapshot_catalog
from etl.steps import load_dag

@@ -195,7 +195,7 @@ def _active_datasets(
limit %(limit)s
"""

-df = pd.read_sql(
+df = read_sql(
q,
engine,
params={
8 changes: 7 additions & 1 deletion apps/backport/datasync/data_metadata.py
@@ -83,7 +83,13 @@ def add_entity_code_and_name(session: Session, df: pd.DataFrame) -> pd.DataFrame
df["entityCode"] = []
return df

-entities = _fetch_entities(session, list(df["entityId"].unique()))
+unique_entities = df["entityId"].unique()

+entities = _fetch_entities(session, list(unique_entities))

+if set(unique_entities) - set(entities.entityId):
+    missing_entities = set(unique_entities) - set(entities.entityId)
+    raise ValueError(f"Missing entities in the database: {missing_entities}")

return pd.merge(df, entities, on="entityId")

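Why the new guard matters: `pd.merge` defaults to an inner join, so rows whose `entityId` is absent from the entities table would previously vanish silently. A toy example with hypothetical IDs:

```python
import pandas as pd

df = pd.DataFrame({"entityId": [1, 9999], "value": [10.0, 20.0]})
entities = pd.DataFrame({"entityId": [1], "entityName": ["France"], "entityCode": ["FRA"]})

# The inner join silently drops the row with entityId 9999...
assert len(pd.merge(df, entities, on="entityId")) == 1

# ...which is exactly the situation the new check turns into a loud error.
missing = set(df["entityId"].unique()) - set(entities.entityId)
assert missing == {9999}
```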
7 changes: 3 additions & 4 deletions apps/metadata_migrate/cli.py
@@ -3,7 +3,6 @@
from typing import Any, Dict, List, Optional

import click
-import pandas as pd
import structlog
from owid.catalog import Dataset, DatasetMeta, License, Origin, Source, Table
from rich import print
@@ -16,7 +15,7 @@
from etl import config
from etl import grapher_model as gm
from etl.command import main as etl_main
-from etl.db import get_engine
+from etl.db import get_engine, read_sql
from etl.metadata_export import merge_or_create_yaml, reorder_fields
from etl.paths import BASE_DIR, DAG_FILE, DATA_DIR, STEP_DIR

@@ -108,7 +107,7 @@ def cli(
select config from charts
where slug = '{chart_slug}'
"""
-df = pd.read_sql(q, engine)
+df = read_sql(q, engine)
if df.empty:
raise ValueError(f"no chart found for slug {chart_slug}")

@@ -359,7 +358,7 @@ def _load_grapher_config(engine: Engine, col: str, ds_meta: DatasetMeta) -> Dict
d.version = '{ds_meta.version}' and
d.shortName = '{ds_meta.short_name}'
"""
-cf = pd.read_sql(q, engine)
+cf = read_sql(q, engine)
if len(cf) == 0:
log.warning(f"no chart found for variable {col}")
return {}
Empty file added apps/owidbot/__init__.py
195 changes: 195 additions & 0 deletions apps/owidbot/etldiff.py
@@ -0,0 +1,195 @@
import datetime as dt
import subprocess
import time
from typing import Tuple

import click
import structlog
from github import Auth, Github
from rich import print
from rich.ansi import AnsiDecoder
from rich_click.rich_command import RichCommand

from apps.staging_sync.cli import _get_container_name
from etl import config
from etl.paths import BASE_DIR

log = structlog.get_logger()


EXCLUDE_DATASETS = "weekly_wildfires|excess_mortality|covid|fluid|flunet|country_profile"


@click.command(name="owidbot-etl-diff", cls=RichCommand, help=__doc__)
@click.option(
    "--branch",
    type=str,
)
@click.option(
    "--include",
    type=str,
    default="garden",
    help="Include datasets matching this regex.",
)
@click.option(
    "--dry-run/--no-dry-run",
    default=False,
    type=bool,
    help="Print to console, do not post to Github.",
)
def cli(
    branch: str,
    include: str,
    dry_run: bool,
) -> None:
    """Post result of `etl diff` to Github PR.

    Example:
    ```
    $ python apps/owidbot/etldiff.py --branch my-branch
    ```
    """
    t = time.time()

    lines = call_etl_diff(include)
    diff, result = format_etl_diff(lines)

    container_name = _get_container_name(branch) if branch else "dry-run"

    # TODO: only include site-screenshots if the PR is from owid-grapher. Similarly, don't
    # run etl diff if the PR is from the etl repo.
    # - **Site-screenshots**: https://github.com/owid/site-screenshots/compare/{nbranch}

    body = f"""
<details>
<summary><b>Staging server</b>: </summary>
- **Admin**: http://{container_name}/admin/login
- **Site**: http://{container_name}/
- **Login**: `ssh owid@{container_name}`
</details>
<details>
<summary><b>etl diff</b>: {result}</summary>
```diff
{diff}
```
Automatically updated datasets matching _{EXCLUDE_DATASETS}_ are not included
</details>
_Edited: {dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")} UTC_
_Execution time: {time.time() - t:.2f} seconds_
""".strip()

    if dry_run:
        print(body)
    else:
        post_comment_to_pr(branch, body)


def post_comment_to_pr(branch_name: str, body: str) -> None:
    assert config.OWIDBOT_ACCESS_TOKEN
    auth = Auth.Token(config.OWIDBOT_ACCESS_TOKEN)
    g = Github(auth=auth)

    repo = g.get_repo("owid/etl")

    # Find pull requests for the branch (assuming you're looking for open PRs)
    pulls = repo.get_pulls(state="open", sort="created", head=f"{repo.owner.login}:{branch_name}")
    pulls = list(pulls)

    if len(pulls) == 0:
        raise AssertionError(f"No open PR found for branch {branch_name}")
    elif len(pulls) > 1:
        raise AssertionError(f"More than one open PR found for branch {branch_name}")

    pr = pulls[0]

    comments = pr.get_issue_comments()

    owidbot_comments = [comment for comment in comments if comment.user.login == "owidbot"]

    if len(owidbot_comments) == 0:
        pr.create_issue_comment(body=body)
    elif len(owidbot_comments) == 1:
        owidbot_comment = owidbot_comments[0]
        owidbot_comment.edit(body=body)
    else:
        raise AssertionError("More than one owidbot comment found.")


def format_etl_diff(lines: list[str]) -> Tuple[str, str]:
    new_lines = []
    result = ""
    for line in lines:
        # Extract the result line. Use startswith rather than line[0] so that
        # multi-codepoint emojis such as "⚠️" are matched correctly.
        if line.startswith(("✅", "❌", "⚠️", "❓")):
            result = line
            continue

        # skip some lines
        if "this may get slow" in line or "comparison with compare" in line:
            continue

        if line.strip().startswith("-"):
            line = "-" + line[1:]
        if line.strip().startswith("+"):
            line = "+" + line[1:]

        new_lines.append(line)

    diff = "\n".join(new_lines)

    # NOTE: we don't need this anymore, we now have consistent checksums on local and remote.
    # Some datasets might have a different checksum but be the same (this is caused by the
    # checksum_input and checksum_output problem). Hotfix this by removing matching datasets
    # from the output. Example:
    #   = Dataset meadow/agriculture/2024-03-26/attainable_yields
    #     = Table attainable_yields
    #   = Dataset garden/agriculture/2024-03-26/attainable_yields
    #     = Table attainable_yields
    #       ~ Column A
    #   = Dataset grapher/agriculture/2024-03-26/attainable_yields
    #     = Table attainable_yields
    # pattern = r"(= Dataset.*(?:\n\s+=.*)+)\n(?=. Dataset|\n)"
    # diff = re.sub(pattern, "", diff)

    return diff, result


def call_etl_diff(include: str) -> list[str]:
    cmd = [
        "poetry",
        "run",
        "etl",
        "diff",
        "REMOTE",
        "data/",
        "--include",
        include,
        "--exclude",
        EXCLUDE_DATASETS,
        "--verbose",
        "--workers",
        "3",
    ]

    result = subprocess.Popen(cmd, cwd=BASE_DIR, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = result.communicate()

    stdout = stdout.decode()
    stderr = stderr.decode()

    if stderr:
        raise Exception(f"Error: {stderr}")

    return [str(line) for line in AnsiDecoder().decode(stdout)]


if __name__ == "__main__":
    cli()
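For local testing, the `--dry-run` flag defined above prints the comment body to the console instead of posting to GitHub:

```
$ python apps/owidbot/etldiff.py --branch my-branch --dry-run
```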
4 changes: 2 additions & 2 deletions apps/staging_sync/cli.py
@@ -18,7 +18,7 @@
from etl import grapher_model as gm
from etl.config import GRAPHER_USER_ID
from etl.datadiff import _dict_diff
-from etl.db import Engine, get_engine
+from etl.db import Engine, get_engine, read_sql

from .admin_api import AdminAPI

@@ -404,7 +404,7 @@ def _modified_chart_ids_by_admin(session: Session) -> Set[int]:
select id from charts where publishedAt is not null
)
"""
-return set(pd.read_sql(q, session.bind).chartId.tolist())
+return set(read_sql(q, session.bind).chartId.tolist())  # type: ignore


def _get_git_branch_creation_date(branch_name: str) -> dt.datetime:
@@ -23,7 +23,7 @@ def run(dest_dir: str) -> None:
tb = geo.harmonize_countries(
df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path
)
-tb = tb.set_index(["country", "year"], verify_integrity=True)
+tb = tb.format(["country", "year"])

#
# Save outputs.
@@ -20,7 +20,7 @@ def run(dest_dir: str) -> None:
# Process data.
#
# Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
-tb = tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index()
+tb = tb.format(["country", "year"])

#
# Save outputs.
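Both of the steps above trade an explicit index-setting chain for the newer `Table.format` helper. Assuming `format` bundles the operations it replaces (snake-casing columns where needed, setting a verified index, and sorting), the two forms are roughly equivalent:

```python
# Assumed equivalence -- Table.format comes from owid-catalog and is not shown here.
tb = tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index()
# ...is replaced by:
tb = tb.format(["country", "year"])
```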
(diff truncated; 225 changed files in total)
