Skip to content

Commit

Permalink
Merge branch 'update-natural-disasters-data-reference' of github.com:…
Browse files Browse the repository at this point in the history
…owid/etl into update-natural-disasters-data
  • Loading branch information
pabloarosado committed Apr 17, 2024
2 parents de74844 + 0cbbab5 commit af6393d
Show file tree
Hide file tree
Showing 134 changed files with 8,910 additions and 555 deletions.
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ help:
@echo ' make format-all Format code (including modules in lib/)'
@echo ' make full Fetch all data and run full transformations'
@echo ' make grapher Publish supported datasets to Grapher'
@echo ' make sync.catalog Sync catalog from R2 into local data/ folder'
@echo ' make lab Start a Jupyter Lab server'
@echo ' make publish Publish the generated catalog to S3'
@echo ' make api Start the ETL API on port 8081'
Expand Down Expand Up @@ -118,6 +119,14 @@ prune: .venv
@echo '==> Prune datasets with no recipe from catalog'
poetry run etl d prune

# Syncing the catalog is useful if you want to avoid rebuilding it locally from scratch,
# which could take a few hours. This will download ~10 GB from the main channels
# (meadow, garden, open_numbers) and is especially useful when we increase ETL_EPOCH
# or update regions.
sync.catalog: .venv
@echo '==> Sync catalog from R2 into local data/ folder (~10gb)'
rclone sync owid-r2:owid-catalog/ data/ --verbose --fast-list --transfers=64 --checkers=64 --include "/meadow/**" --include "/garden/**" --include "/open_numbers/**"

grapher: .venv
@echo '==> Running full etl with grapher upsert'
poetry run etl run --grapher
Expand Down
6 changes: 5 additions & 1 deletion api/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,12 @@ def _indicator_metadata_dict(indicator: Indicator, db_indicator: gm.Variable) ->
indicator_update_dict = indicator.to_meta_dict()
update_period_days = indicator_update_dict.pop("update_period_days", None)

# if indicator has dimensions, use its original name
original_short_name = (db_indicator.dimensions or {}).get("originalShortName")
short_name = original_short_name or db_indicator.shortName

# create dictionary for metadata
meta_dict = {"tables": {db_indicator.table_name: {"variables": {db_indicator.shortName: indicator_update_dict}}}}
meta_dict = {"tables": {db_indicator.table_name: {"variables": {short_name: indicator_update_dict}}}}

if update_period_days:
meta_dict["dataset"] = {"update_period_days": update_period_days}
Expand Down
8 changes: 7 additions & 1 deletion apps/backport/datasync/data_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,13 @@ def add_entity_code_and_name(session: Session, df: pd.DataFrame) -> pd.DataFrame
df["entityCode"] = []
return df

entities = _fetch_entities(session, list(df["entityId"].unique()))
unique_entities = df["entityId"].unique()

entities = _fetch_entities(session, list(unique_entities))

if set(unique_entities) - set(entities.entityId):
missing_entities = set(unique_entities) - set(entities.entityId)
raise ValueError(f"Missing entities in the database: {missing_entities}")

return pd.merge(df, entities, on="entityId")

Expand Down
50 changes: 46 additions & 4 deletions apps/owidbot/etldiff.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime as dt
import subprocess
import time
from typing import Tuple

import click
Expand All @@ -9,20 +10,27 @@
from rich.ansi import AnsiDecoder
from rich_click.rich_command import RichCommand

from apps.staging_sync.cli import _get_container_name
from etl import config
from etl.paths import BASE_DIR

log = structlog.get_logger()


EXCLUDE_DATASETS = "weekly_wildfires|excess_mortality|covid|fluid|flunet"
EXCLUDE_DATASETS = "weekly_wildfires|excess_mortality|covid|fluid|flunet|country_profile"


@click.command(name="owidbot-etl-diff", cls=RichCommand, help=__doc__)
@click.option(
"--branch",
type=str,
)
@click.option(
"--include",
type=str,
default="garden",
help="Include datasets matching this regex.",
)
@click.option(
"--dry-run/--no-dry-run",
default=False,
Expand All @@ -31,6 +39,7 @@
)
def cli(
branch: str,
include: str,
dry_run: bool,
) -> None:
"""Post result of `etl diff` to Github PR.
Expand All @@ -41,12 +50,29 @@ def cli(
$ python apps/owidbot/etldiff.py --branch my-branch
```
"""
lines = call_etl_diff()
t = time.time()

lines = call_etl_diff(include)
diff, result = format_etl_diff(lines)

container_name = _get_container_name(branch) if branch else "dry-run"

# TODO: only include site-screenshots if the PR is from owid-grapher. Similarly, don't
# run etl diff if the PR is from etl repo.
# - **Site-screenshots**: https://github.com/owid/site-screenshots/compare/{nbranch}

body = f"""
<details>
<summary><b>Staging server</b>: </summary>
- **Admin**: http://{container_name}/admin/login
- **Site**: http://{container_name}/
- **Login**: `ssh owid@{container_name}`
</details>
<details>
<summary><b>etl diff</b>: {result}</summary>
```diff
Expand All @@ -57,6 +83,7 @@ def cli(
</details>
_Edited: {dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")} UTC_
_Execution time: {time.time() - t:.2f} seconds_
""".strip()

if dry_run:
Expand Down Expand Up @@ -117,10 +144,25 @@ def format_etl_diff(lines: list[str]) -> Tuple[str, str]:
new_lines.append(line)

diff = "\n".join(new_lines)

# NOTE: We don't need this anymore; we now have consistent checksums on local and remote.
# Some datasets might have different checksums but still be identical (this is caused by the checksum_input and
# checksum_output problem). Hotfix this by removing matching datasets from the output.
# Example:
# = Dataset meadow/agriculture/2024-03-26/attainable_yields
# = Table attainable_yields
# = Dataset garden/agriculture/2024-03-26/attainable_yields
# = Table attainable_yields
# ~ Column A
# = Dataset grapher/agriculture/2024-03-26/attainable_yields
# = Table attainable_yields
# pattern = r"(= Dataset.*(?:\n\s+=.*)+)\n(?=. Dataset|\n)"
# diff = re.sub(pattern, "", diff)

return diff, result


def call_etl_diff() -> list[str]:
def call_etl_diff(include: str) -> list[str]:
cmd = [
"poetry",
"run",
Expand All @@ -129,7 +171,7 @@ def call_etl_diff() -> list[str]:
"REMOTE",
"data/",
"--include",
"garden",
include,
"--exclude",
EXCLUDE_DATASETS,
"--verbose",
Expand Down
Loading

0 comments on commit af6393d

Please sign in to comment.