
Commit

Merge branch 'master' into data/undp
lucasrodes committed Apr 17, 2024
2 parents 17eac8f + 05a63cf commit 46010e8
Showing 225 changed files with 12,922 additions and 1,252 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -49,3 +49,5 @@ site/
.wizardcfg/*
.streamlit/*
.ipynb_lock
+.execution_time.json

9 changes: 9 additions & 0 deletions Makefile
@@ -23,6 +23,7 @@ help:
@echo ' make format-all Format code (including modules in lib/)'
@echo ' make full Fetch all data and run full transformations'
@echo ' make grapher Publish supported datasets to Grapher'
+@echo ' make sync.catalog Sync catalog from R2 into local data/ folder'
@echo ' make lab Start a Jupyter Lab server'
@echo ' make publish Publish the generated catalog to S3'
@echo ' make api Start the ETL API on port 8081'
@@ -118,6 +119,14 @@ prune: .venv
@echo '==> Prune datasets with no recipe from catalog'
poetry run etl d prune

+# Syncing the catalog is useful if you want to avoid rebuilding it locally from
+# scratch, which could take a few hours. This will download ~10 GB from the main
+# channels (meadow, garden, open_numbers) and is especially useful when we
+# increase ETL_EPOCH or update regions.
+sync.catalog: .venv
+@echo '==> Sync catalog from R2 into local data/ folder (~10gb)'
+rclone copy owid-r2:owid-catalog/ data/ --verbose --fast-list --transfers=64 --checkers=64 --include "/meadow/**" --include "/garden/**" --include "/open_numbers/**"

grapher: .venv
@echo '==> Running full etl with grapher upsert'
poetry run etl run --grapher
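The new `sync.catalog` target assumes an rclone remote named `owid-r2` pointing at the Cloudflare R2 bucket. For readers setting this up locally, an illustrative `rclone.conf` entry might look like the following (the remote name comes from the Makefile above; all key values are placeholders):

```
[owid-r2]
type = s3
provider = Cloudflare
access_key_id = <your-access-key>
secret_access_key = <your-secret-key>
endpoint = https://<account-id>.r2.cloudflarestorage.com
```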
6 changes: 5 additions & 1 deletion api/v1/__init__.py
@@ -179,8 +179,12 @@ def _indicator_metadata_dict(indicator: Indicator, db_indicator: gm.Variable) ->
indicator_update_dict = indicator.to_meta_dict()
update_period_days = indicator_update_dict.pop("update_period_days", None)

+# if indicator has dimensions, use its original name
+original_short_name = (db_indicator.dimensions or {}).get("originalShortName")
+short_name = original_short_name or db_indicator.shortName

# create dictionary for metadata
-meta_dict = {"tables": {db_indicator.table_name: {"variables": {db_indicator.shortName: indicator_update_dict}}}}
+meta_dict = {"tables": {db_indicator.table_name: {"variables": {short_name: indicator_update_dict}}}}

if update_period_days:
meta_dict["dataset"] = {"update_period_days": update_period_days}
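A toy illustration of the new fallback above (values are hypothetical; the real `dimensions` dict comes from the grapher database):

```python
# Hypothetical values, mirroring the fallback logic in _indicator_metadata_dict.
dimensions = {"originalShortName": "deaths"}  # present for indicators with dimensions
short_name = (dimensions or {}).get("originalShortName") or "deaths__age_group_15_19"
assert short_name == "deaths"  # falls back to shortName when dimensions is empty
```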
4 changes: 2 additions & 2 deletions apps/backport/backport.py
@@ -20,7 +20,7 @@
from etl import config, paths
from etl import grapher_model as gm
from etl.backport_helpers import GrapherConfig
-from etl.db import get_engine
+from etl.db import get_engine, read_sql
from etl.files import checksum_str
from etl.snapshot import Snapshot, SnapshotMeta

@@ -346,7 +346,7 @@ def _load_values(engine: Engine, variable_ids: list[int]) -> pd.DataFrame:
"entityCode": "entity_code",
}
)
-vf: pd.DataFrame = pd.read_sql(q, engine, params={"variable_ids": variable_ids})
+vf = read_sql(q, engine, params={"variable_ids": variable_ids})
df = df.merge(vf, on="variable_id")

# try converting values to float if possible, this can make the data 50% smaller
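This commit repeatedly swaps `pd.read_sql` for a `read_sql` helper imported from `etl.db`. The helper's implementation is not part of the rendered hunks; a minimal sketch of what such a wrapper could look like, purely as an assumption:

```python
# Assumed sketch -- the real helper lives in etl/db.py and is not shown in this diff.
import pandas as pd
from sqlalchemy.engine import Engine


def read_sql(sql: str, engine: Engine, **kwargs) -> pd.DataFrame:
    """Run a query and always return a DataFrame, centralizing connection handling."""
    with engine.connect() as con:
        return pd.read_sql(sql, con, **kwargs)
```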
4 changes: 2 additions & 2 deletions apps/backport/bulk_backport.py
@@ -9,7 +9,7 @@
from sqlalchemy.engine import Engine

from etl import config
-from etl.db import get_engine
+from etl.db import get_engine, read_sql
from etl.snapshot import snapshot_catalog
from etl.steps import load_dag

@@ -195,7 +195,7 @@ def _active_datasets(
limit %(limit)s
"""

-df = pd.read_sql(
+df = read_sql(
q,
engine,
params={
8 changes: 7 additions & 1 deletion apps/backport/datasync/data_metadata.py
@@ -83,7 +83,13 @@ def add_entity_code_and_name(session: Session, df: pd.DataFrame) -> pd.DataFrame
df["entityCode"] = []
return df

-entities = _fetch_entities(session, list(df["entityId"].unique()))
+unique_entities = df["entityId"].unique()

+entities = _fetch_entities(session, list(unique_entities))

+if set(unique_entities) - set(entities.entityId):
+    missing_entities = set(unique_entities) - set(entities.entityId)
+    raise ValueError(f"Missing entities in the database: {missing_entities}")

return pd.merge(df, entities, on="entityId")

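Why the new guard matters: `pd.merge` defaults to an inner join, so rows whose `entityId` is absent from the entities table would previously vanish silently. A toy example with hypothetical IDs:

```python
import pandas as pd

df = pd.DataFrame({"entityId": [1, 9999], "value": [10.0, 20.0]})
entities = pd.DataFrame({"entityId": [1], "entityName": ["France"], "entityCode": ["FRA"]})

# The inner join silently drops the row with entityId 9999...
assert len(pd.merge(df, entities, on="entityId")) == 1

# ...which is exactly the situation the new check turns into a loud error.
missing = set(df["entityId"].unique()) - set(entities.entityId)
assert missing == {9999}
```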
7 changes: 3 additions & 4 deletions apps/metadata_migrate/cli.py
@@ -3,7 +3,6 @@
from typing import Any, Dict, List, Optional

import click
-import pandas as pd
import structlog
from owid.catalog import Dataset, DatasetMeta, License, Origin, Source, Table
from rich import print
@@ -16,7 +15,7 @@
from etl import config
from etl import grapher_model as gm
from etl.command import main as etl_main
-from etl.db import get_engine
+from etl.db import get_engine, read_sql
from etl.metadata_export import merge_or_create_yaml, reorder_fields
from etl.paths import BASE_DIR, DAG_FILE, DATA_DIR, STEP_DIR

@@ -108,7 +107,7 @@ def cli(
select config from charts
where slug = '{chart_slug}'
"""
-df = pd.read_sql(q, engine)
+df = read_sql(q, engine)
if df.empty:
raise ValueError(f"no chart found for slug {chart_slug}")

@@ -359,7 +358,7 @@ def _load_grapher_config(engine: Engine, col: str, ds_meta: DatasetMeta) -> Dict
d.version = '{ds_meta.version}' and
d.shortName = '{ds_meta.short_name}'
"""
-cf = pd.read_sql(q, engine)
+cf = read_sql(q, engine)
if len(cf) == 0:
log.warning(f"no chart found for variable {col}")
return {}
Empty file added apps/owidbot/__init__.py
195 changes: 195 additions & 0 deletions apps/owidbot/etldiff.py
@@ -0,0 +1,195 @@
import datetime as dt
import subprocess
import time
from typing import Tuple

import click
import structlog
from github import Auth, Github
from rich import print
from rich.ansi import AnsiDecoder
from rich_click.rich_command import RichCommand

from apps.staging_sync.cli import _get_container_name
from etl import config
from etl.paths import BASE_DIR

log = structlog.get_logger()


EXCLUDE_DATASETS = "weekly_wildfires|excess_mortality|covid|fluid|flunet|country_profile"


@click.command(name="owidbot-etl-diff", cls=RichCommand, help=__doc__)
@click.option(
    "--branch",
    type=str,
)
@click.option(
    "--include",
    type=str,
    default="garden",
    help="Include datasets matching this regex.",
)
@click.option(
    "--dry-run/--no-dry-run",
    default=False,
    type=bool,
    help="Print to console, do not post to Github.",
)
def cli(
    branch: str,
    include: str,
    dry_run: bool,
) -> None:
    """Post result of `etl diff` to Github PR.

    Example:
    ```
    $ python apps/owidbot/etldiff.py --branch my-branch
    ```
    """
    t = time.time()

    lines = call_etl_diff(include)
    diff, result = format_etl_diff(lines)

    container_name = _get_container_name(branch) if branch else "dry-run"

    # TODO: only include site-screenshots if the PR is from owid-grapher. Similarly, don't
    # run etl diff if the PR is from the etl repo.
    # - **Site-screenshots**: https://github.com/owid/site-screenshots/compare/{nbranch}

    body = f"""
<details>
<summary><b>Staging server</b>: </summary>
- **Admin**: http://{container_name}/admin/login
- **Site**: http://{container_name}/
- **Login**: `ssh owid@{container_name}`
</details>
<details>
<summary><b>etl diff</b>: {result}</summary>
```diff
{diff}
```
Automatically updated datasets matching _{EXCLUDE_DATASETS}_ are not included
</details>
_Edited: {dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")} UTC_
_Execution time: {time.time() - t:.2f} seconds_
""".strip()

    if dry_run:
        print(body)
    else:
        post_comment_to_pr(branch, body)


def post_comment_to_pr(branch_name: str, body: str) -> None:
    assert config.OWIDBOT_ACCESS_TOKEN
    auth = Auth.Token(config.OWIDBOT_ACCESS_TOKEN)
    g = Github(auth=auth)

    repo = g.get_repo("owid/etl")

    # Find pull requests for the branch (assuming you're looking for open PRs)
    pulls = repo.get_pulls(state="open", sort="created", head=f"{repo.owner.login}:{branch_name}")
    pulls = list(pulls)

    if len(pulls) == 0:
        raise AssertionError(f"No open PR found for branch {branch_name}")
    elif len(pulls) > 1:
        raise AssertionError(f"More than one open PR found for branch {branch_name}")

    pr = pulls[0]

    comments = pr.get_issue_comments()

    owidbot_comments = [comment for comment in comments if comment.user.login == "owidbot"]

    if len(owidbot_comments) == 0:
        pr.create_issue_comment(body=body)
    elif len(owidbot_comments) == 1:
        owidbot_comment = owidbot_comments[0]
        owidbot_comment.edit(body=body)
    else:
        raise AssertionError("More than one owidbot comment found.")


def format_etl_diff(lines: list[str]) -> Tuple[str, str]:
    new_lines = []
    result = ""
    for line in lines:
        # Extract the result line. Use startswith rather than line[0] so that
        # multi-codepoint emojis such as "⚠️" are matched correctly.
        if line.startswith(("✅", "❌", "⚠️", "❓")):
            result = line
            continue

        # skip some lines
        if "this may get slow" in line or "comparison with compare" in line:
            continue

        if line.strip().startswith("-"):
            line = "-" + line[1:]
        if line.strip().startswith("+"):
            line = "+" + line[1:]

        new_lines.append(line)

    diff = "\n".join(new_lines)

    # NOTE: we don't need this anymore, we now have consistent checksums on local and remote.
    # Some datasets might have a different checksum but be the same (this is caused by the
    # checksum_input and checksum_output problem). Hotfix this by removing matching datasets
    # from the output. Example:
    #   = Dataset meadow/agriculture/2024-03-26/attainable_yields
    #     = Table attainable_yields
    #   = Dataset garden/agriculture/2024-03-26/attainable_yields
    #     = Table attainable_yields
    #       ~ Column A
    #   = Dataset grapher/agriculture/2024-03-26/attainable_yields
    #     = Table attainable_yields
    # pattern = r"(= Dataset.*(?:\n\s+=.*)+)\n(?=. Dataset|\n)"
    # diff = re.sub(pattern, "", diff)

    return diff, result


def call_etl_diff(include: str) -> list[str]:
    cmd = [
        "poetry",
        "run",
        "etl",
        "diff",
        "REMOTE",
        "data/",
        "--include",
        include,
        "--exclude",
        EXCLUDE_DATASETS,
        "--verbose",
        "--workers",
        "3",
    ]

    result = subprocess.Popen(cmd, cwd=BASE_DIR, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = result.communicate()

    stdout = stdout.decode()
    stderr = stderr.decode()

    if stderr:
        raise Exception(f"Error: {stderr}")

    return [str(line) for line in AnsiDecoder().decode(stdout)]


if __name__ == "__main__":
    cli()
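For local testing, the `--dry-run` flag defined above prints the comment body to the console instead of posting to GitHub:

```
$ python apps/owidbot/etldiff.py --branch my-branch --dry-run
```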
4 changes: 2 additions & 2 deletions apps/staging_sync/cli.py
@@ -18,7 +18,7 @@
from etl import grapher_model as gm
from etl.config import GRAPHER_USER_ID
from etl.datadiff import _dict_diff
-from etl.db import Engine, get_engine
+from etl.db import Engine, get_engine, read_sql

from .admin_api import AdminAPI

@@ -404,7 +404,7 @@ def _modified_chart_ids_by_admin(session: Session) -> Set[int]:
select id from charts where publishedAt is not null
)
"""
-return set(pd.read_sql(q, session.bind).chartId.tolist())
+return set(read_sql(q, session.bind).chartId.tolist())  # type: ignore


def _get_git_branch_creation_date(branch_name: str) -> dt.datetime:
@@ -23,7 +23,7 @@ def run(dest_dir: str) -> None:
tb = geo.harmonize_countries(
df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path
)
-tb = tb.set_index(["country", "year"], verify_integrity=True)
+tb = tb.format(["country", "year"])

#
# Save outputs.
@@ -20,7 +20,7 @@ def run(dest_dir: str) -> None:
# Process data.
#
# Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
-tb = tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index()
+tb = tb.format(["country", "year"])

#
# Save outputs.
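Both of the steps above trade an explicit index-setting chain for the newer `Table.format` helper. Assuming `format` bundles the operations it replaces (snake-casing columns where needed, setting a verified index, and sorting), the two forms are roughly equivalent:

```python
# Assumed equivalence -- Table.format comes from owid-catalog and is not shown here.
tb = tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index()
# ...is replaced by:
tb = tb.format(["country", "year"])
```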
(diff truncated; 225 changed files in total)
