Skip to content

Commit

Permalink
Merge branch 'master' into lucasrodes-data-hmd-update-beabe3-private
Browse files Browse the repository at this point in the history
  • Loading branch information
lucasrodes committed Dec 2, 2024
2 parents a9f2ddf + bdf71c9 commit 41ee5c1
Show file tree
Hide file tree
Showing 52 changed files with 1,273 additions and 334 deletions.
12 changes: 9 additions & 3 deletions apps/pr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ def bake_branch_name(repo, pr_title, no_llm, remote_branches):
category = pr_title.category

# Get input title (without emoji, scope, etc.)
title = _extract_relevant_title_for_branch_name(pr_title.title, not no_llm)
title = _extract_relevant_title_for_branch_name(pr_title.title, category, not no_llm)

# Bake complete PR branch name
# name = f"{user}-{category}-{title}"
Expand All @@ -383,13 +383,14 @@ def bake_branch_name(repo, pr_title, no_llm, remote_branches):
# if name in remote_branches:
# log.info("Generating a hash for this branch name to prevent name collisions.")
# name = f"{name}-{user}"
if name in remote_branches:
local_branches = [branch.name for branch in repo.branches]
if (name in remote_branches) or (name in local_branches):
log.info("Generating a hash for this branch name to prevent name collisions.")
name = f"{name}-{generate_short_hash()}"
return name


def _extract_relevant_title_for_branch_name(text_in: str, use_llm) -> str:
def _extract_relevant_title_for_branch_name(text_in: str, category: str, use_llm) -> str:
"""
Process the input string by:
1. Removing all symbols, keeping only letters and numbers.
Expand All @@ -411,8 +412,13 @@ def _extract_relevant_title_for_branch_name(text_in: str, use_llm) -> str:

# Split into tokens/words
tokens = cleaned_text.split()

# Clean if there is word included in category
tokens = [t for t in tokens if t.lower() != category]

# Keep only the first 3 tokens
tokens = tokens[:3]

# Combine tokens with '-'
name = "-".join(tokens).lower()

Expand Down
3 changes: 3 additions & 0 deletions apps/wizard/app_pages/anomalist/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,9 @@ def show_anomaly_compact(index, df):
config = bake_chart_config(variable_id=indicator_id, selected_entities=entities)
config["hideAnnotationFieldsInTitle"]["time"] = True
config["hideFacetControl"] = False
config["hideShareButton"] = True
config["hideExploreTheDataButton"] = True
# config["isSocialMediaExport"] = False

# Actually plot
grapher_chart(chart_config=config, owid_env=OWID_ENV)
Expand Down
2 changes: 0 additions & 2 deletions dag/archive/fasttrack.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
steps:
data://grapher/fasttrack/2023-06-19/world_population_comparison:
- snapshot://fasttrack/2023-06-19/world_population_comparison.csv
data://grapher/fasttrack/latest/democracy_lexical_index:
- snapshot://fasttrack/latest/democracy_lexical_index.csv
2 changes: 2 additions & 0 deletions dag/fasttrack.yml
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,5 @@ steps:
- snapshot://fasttrack/latest/mineral_prices_usgs.csv
data://grapher/fasttrack/latest/useful_energy_cost_way:
- snapshot://fasttrack/latest/useful_energy_cost_way.csv
data://grapher/fasttrack/2023-06-19/world_population_comparison:
- snapshot://fasttrack/2023-06-19/world_population_comparison.csv
63 changes: 35 additions & 28 deletions dag/health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ steps:
data://grapher/who/2023-03-09/gho_suicides:
- data://garden/who/2023-03-09/gho_suicides


# IHME Global Burden of Disease - Risk factors
data://meadow/ihme_gbd/2019/gbd_risk:
- walden://ihme_gbd/2019/gbd_risk
Expand All @@ -48,8 +47,6 @@ steps:
data://grapher/ihme_gbd/2019/gbd_risk:
- data://garden/ihme_gbd/2019/gbd_risk



# IHME GBD Leading cause of deaths - update
data-private://meadow/ihme_gbd/2024-06-10/cause_hierarchy:
- snapshot-private://ihme_gbd/2024-06-10/cause_hierarchy.csv
Expand Down Expand Up @@ -192,7 +189,6 @@ steps:
data://grapher/un/2023-08-10/comtrade_pandemics:
- data://garden/un/2023-08-02/comtrade_pandemics


# UNAIDS
data://meadow/health/2023-08-09/unaids:
- snapshot://health/2023-08-09/unaids.csv
Expand Down Expand Up @@ -232,7 +228,6 @@ steps:
data://grapher/oecd/2024-07-01/road_accidents:
- data://garden/oecd/2024-07-01/road_accidents


# Kucharski
data://meadow/health/2023-08-14/avian_influenza_h5n1_kucharski:
- snapshot://health/2023-08-14/avian_influenza_h5n1_kucharski.xlsx
Expand Down Expand Up @@ -577,7 +572,6 @@ steps:
data-private://grapher/ihme_gbd/2024-05-20/gbd_mental_health_burden_dalys:
- data-private://garden/ihme_gbd/2024-05-20/gbd_mental_health_burden


# GBD 2021 - GBD Risk Factors
data-private://meadow/ihme_gbd/2024-05-20/gbd_risk:
- snapshot-private://ihme_gbd/2024-05-20/gbd_risk.feather
Expand All @@ -598,7 +592,6 @@ steps:
data-private://grapher/ihme_gbd/2024-05-20/gbd_drug_risk:
- data-private://garden/ihme_gbd/2024-05-20/gbd_drug_risk


# GBD 2021 - GBD Child Mortality
data-private://meadow/ihme_gbd/2024-05-20/gbd_child_mortality:
- snapshot-private://ihme_gbd/2024-05-20/gbd_child_mortality.feather
Expand All @@ -609,7 +602,6 @@ steps:
data-private://grapher/ihme_gbd/2024-05-20/gbd_child_mortality:
- data-private://garden/ihme_gbd/2024-05-20/gbd_child_mortality


# GBD 2021 - GBD Health-adjusted Life Expectancy and Life Expectancy
data-private://meadow/ihme_gbd/2024-07-02/gbd_life_expectancy:
- snapshot-private://ihme_gbd/2024-07-02/gbd_life_expectancy.zip
Expand Down Expand Up @@ -715,7 +707,6 @@ steps:
data://grapher/who/2024-08-06/mortality_database_cancer_most_common:
- data://garden/who/2024-08-06/mortality_database_cancer_most_common


data://meadow/who/latest/monkeypox:
- snapshot://who/latest/monkeypox.csv
data://garden/who/latest/monkeypox:
Expand All @@ -729,28 +720,26 @@ steps:
- data://garden/who/latest/monkeypox
export://github/who/latest/monkeypox:
- data://garden/who/latest/monkeypox
# Mpox - Global.health
# Mpox - Global.health
data://meadow/health/latest/global_health_mpox:
- snapshot://health/latest/global_health_mpox.csv
data://garden/health/latest/global_health_mpox:
- data://meadow/health/latest/global_health_mpox

# Eurostat cancer

# Eurostat Cancer Screening
# Eurostat Cancer Screening
data://meadow/health/2024-08-23/eurostat_cancer:
- snapshot://health/2024-08-23/eurostat_cancer.csv
data://garden/health/2024-08-23/eurostat_cancer:
- data://meadow/health/2024-08-23/eurostat_cancer
data://grapher/health/2024-08-23/eurostat_cancer:
- data://garden/health/2024-08-23/eurostat_cancer


# Multi-dim indicators
export://multidim/health/latest/causes_of_death:
- grapher://grapher/ihme_gbd/2024-05-20/gbd_cause


# GBD 2021 - GBD Risk Factors cancer specific
data-private://meadow/ihme_gbd/2024-08-26/gbd_risk_cancer:
- snapshot-private://ihme_gbd/2024-08-26/gbd_risk_cancer.feather
Expand All @@ -775,7 +764,6 @@ steps:
data://grapher/health/2024-09-05/seattle_pathogens:
- data://garden/health/2024-09-05/seattle_pathogens


# International Agency for Research on Cancer
data://meadow/cancer/2024-08-30/gco_alcohol:
- snapshot://cancer/2024-08-30/gco_alcohol.csv
Expand All @@ -791,13 +779,13 @@ steps:
data://grapher/cancer/2024-09-06/gco_infections:
- data://garden/cancer/2024-09-06/gco_infections

# Flu testing data
# Flu testing data
data://garden/who/2024-09-09/flu_test:
- data://meadow/who/latest/flunet
data://grapher/who/2024-09-09/flu_test:
- data://garden/who/2024-09-09/flu_test

# Cancer diagnosis routes and survival rates
# Cancer diagnosis routes and survival rates
data://meadow/cancer/2024-09-13/diagnosis_routes_by_route:
- snapshot://cancer/2024-09-13/diagnosis_routes_by_route.csv
data://garden/cancer/2024-09-13/diagnosis_routes_by_route:
Expand Down Expand Up @@ -847,7 +835,6 @@ steps:
data://grapher/antibiotics/2024-10-09/gram_children:
- data://garden/antibiotics/2024-10-09/gram_children


# Cervical cancer incidence rates GCO - Cancer Over Time
data://meadow/cancer/2024-10-13/gco_cancer_over_time_cervical:
- snapshot://cancer/2024-10-13/gco_cancer_over_time_cervical.csv
Expand Down Expand Up @@ -888,7 +875,6 @@ steps:
data://grapher/antibiotics/2024-10-23/animuse_year:
- data://garden/antibiotics/2024-10-23/animuse_year


# ESVAC antimicrobial use in animals
data://meadow/antibiotics/2024-10-25/esvac_sales:
- snapshot://antibiotics/2024-10-25/esvac_sales.zip
Expand Down Expand Up @@ -941,17 +927,38 @@ steps:
- data-private://meadow/antibiotics/2024-11-20/pathogen_bloodstream
data-private://grapher/antibiotics/2024-11-20/pathogen_bloodstream:
- data-private://garden/antibiotics/2024-11-20/pathogen_bloodstream
# IHME Neonatal bloodstream infections by resistance
data-private://meadow/antibiotics/2024-11-20/bloodstream_amr:
- snapshot-private://antibiotics/2024-11-20/bloodstream_amr.csv
data-private://garden/antibiotics/2024-11-20/bloodstream_amr:
- data-private://meadow/antibiotics/2024-11-20/bloodstream_amr
data-private://grapher/antibiotics/2024-11-20/bloodstream_amr:
- data-private://garden/antibiotics/2024-11-20/bloodstream_amr
# IHME Neonatal infections by syndrome
data-private://meadow/antibiotics/2024-11-24/total_syndrome:
- snapshot-private://antibiotics/2024-11-24/total_syndrome.csv
data-private://garden/antibiotics/2024-11-24/total_syndrome:
# IHME Neonatal infections by syndrome and amr resistance
data-private://meadow/antibiotics/2024-12-02/microbe_amr:
- snapshot-private://antibiotics/2024-12-02/microbe_amr.csv
data-private://garden/antibiotics/2024-12-02/microbe_amr:
- data-private://meadow/antibiotics/2024-12-02/microbe_amr
- data-private://meadow/antibiotics/2024-11-24/total_syndrome
data-private://grapher/antibiotics/2024-11-24/total_syndrome:
- data-private://garden/antibiotics/2024-11-24/total_syndrome
data-private://grapher/antibiotics/2024-12-02/microbe_amr:
- data-private://garden/antibiotics/2024-12-02/microbe_amr
# IHME Neonatal infections and amr resistance
data-private://meadow/antibiotics/2024-12-02/microbe_neonatal_amr:
- snapshot-private://antibiotics/2024-12-02/microbe_neonatal_amr.csv
data-private://garden/antibiotics/2024-12-02/microbe_neonatal_amr:
- data-private://meadow/antibiotics/2024-12-02/microbe_neonatal_amr
- data-private://meadow/antibiotics/2024-11-20/microbe
data-private://grapher/antibiotics/2024-12-02/microbe_neonatal_amr:
- data-private://garden/antibiotics/2024-12-02/microbe_neonatal_amr

# MICROBE - total deaths by pathogen
data-private://meadow/antibiotics/2024-12-02/total_pathogen_bloodstream:
- snapshot-private://antibiotics/2024-12-02/total_pathogen_bloodstream.csv
data-private://garden/antibiotics/2024-12-02/total_pathogen_bloodstream:
- data-private://meadow/antibiotics/2024-12-02/total_pathogen_bloodstream
data-private://grapher/antibiotics/2024-12-02/total_pathogen_bloodstream:
- data-private://garden/antibiotics/2024-12-02/total_pathogen_bloodstream
# MICROBE - total deaths by pathogen and amr resistance
data-private://meadow/antibiotics/2024-12-02/total_pathogen_bloodstream_amr:
- snapshot-private://antibiotics/2024-12-02/total_pathogen_bloodstream_amr.csv
data-private://garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr:
- data-private://garden/antibiotics/2024-12-02/total_pathogen_bloodstream
- data-private://meadow/antibiotics/2024-12-02/total_pathogen_bloodstream_amr
data-private://grapher/antibiotics/2024-12-02/total_pathogen_bloodstream_amr:
- data-private://garden/antibiotics/2024-12-02/total_pathogen_bloodstream_amr
39 changes: 38 additions & 1 deletion docs/architecture/metadata/structuring-yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -233,4 +233,41 @@ tables:
{definitions.conflict_type_estimate}
```

Be cautious with line breaks and trailing whitespace when utilizing templates. Despite using good defaults, you might end up experimenting a lot to get the desired result.
Line breaks and whitespace can be tricky when using Jinja templates. We use reasonable defaults and strip whitespace, so in most cases you should be fine using `<%` and `%>`. In more complex cases, you might have to experiment with
more fine-grained [whitespace control](https://jinja.palletsprojects.com/en/stable/templates/#whitespace-control) using the tags `<%-` and `-%>`. This is most often needed in if-else blocks like this:

```yaml
age: |-
<% if age_group == "ALLAges" %>
...
<%- elif age_group == "Age-standardized" %>
...
<%- else %>
...
<%- endif %>
```

The most straightforward way to check your metadata is in Admin, although that means waiting for your step to finish. There's a faster way to check your YAML file directly. Create a `playground.ipynb` notebook in the same folder as your YAML file and copy this to the first cell:

```python
from etl import grapher_helpers as gh
dim_dict = {
"age_group": "YEARS0-4", "sex": "Male", "cause": "Drug use disorders"
}
d = gh.render_yaml_file("ghe.meta.yml", dim_dict=dim_dict)
d["tables"]["ghe"]["variables"]["death_count"]
```

An alternative is to render the `VariableMeta` of a variable directly:

```python
from etl import grapher_helpers as gh
from etl import paths
tb = Dataset(paths.DATA_DIR / "garden/who/2024-07-30/ghe")['ghe']
# Sample a random row to get the dimension values
dim_dict = dict(zip(tb.index.names, tb.sample(1).index[0]))
gh.render_variable_meta(tb.death_count.m, dim_dict=dim_dict)
```
51 changes: 45 additions & 6 deletions etl/grapher_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,17 @@
from dataclasses import dataclass, field, is_dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Iterable, List, Literal, Optional, Set, cast
from typing import Any, Dict, Iterable, List, Literal, Optional, Set, Union, cast

import jinja2
import numpy as np
import pandas as pd
import pymysql
import sqlalchemy
import structlog
from jinja2 import Environment
from owid import catalog
from owid.catalog import warnings
from owid.catalog.utils import underscore
from owid.catalog.utils import dynamic_yaml_load, dynamic_yaml_to_dict, underscore
from sqlalchemy import text
from sqlalchemy.engine import Engine
from sqlalchemy.orm import Session
Expand All @@ -23,7 +22,7 @@

log = structlog.get_logger()

jinja_env = Environment(
jinja_env = jinja2.Environment(
block_start_string="<%",
block_end_string="%>",
variable_start_string="<<",
Expand All @@ -32,8 +31,17 @@
comment_end_string="#>",
trim_blocks=True,
lstrip_blocks=True,
undefined=jinja2.StrictUndefined,
)


# Helper function to raise an error with << raise("uh oh...") >>
def raise_helper(msg):
    """Raise a generic ``Exception`` whose message is ``msg``.

    Jinja templates have no statement for raising errors, so this function is
    exposed to them as a callable; a template can then abort rendering with an
    explicit message, e.g. ``<< raise("uh oh...") >>``.
    """
    error = Exception(msg)
    raise error


jinja_env.globals["raise"] = raise_helper

# this might work too pd.api.types.is_integer_dtype(col)
INT_TYPES = tuple(
{f"{n}{b}{p}" for n in ("int", "Int", "uint", "UInt") for b in ("8", "16", "32", "64") for p in ("", "[pyarrow]")}
Expand Down Expand Up @@ -209,14 +217,18 @@ def _expand_jinja_text(text: str, dim_dict: Dict[str, str]) -> str:
return text

try:
return _cached_jinja_template(text).render(dim_dict)
# NOTE: we're stripping the result to avoid trailing newlines
return _cached_jinja_template(text).render(dim_dict).strip()
except jinja2.exceptions.TemplateSyntaxError as e:
new_message = f"{e.message}\n\nDimensions:\n{dim_dict}\n\nTemplate:\n{text}\n"
raise e.__class__(new_message, e.lineno, e.name, e.filename) from e
except jinja2.exceptions.UndefinedError as e:
new_message = f"{e.message}\n\nDimensions:\n{dim_dict}\n\nTemplate:\n{text}\n"
raise e.__class__(new_message) from e


def _expand_jinja(obj: Any, dim_dict: Dict[str, str]) -> Any:
"""Expand Jinja in all metadata fields."""
"""Expand Jinja in all metadata fields. This modifies the original object in place."""
if obj is None:
return None
elif isinstance(obj, str):
Expand All @@ -233,6 +245,33 @@ def _expand_jinja(obj: Any, dim_dict: Dict[str, str]) -> Any:
return obj


def render_yaml_file(path: Union[str, Path], dim_dict: Dict[str, str]) -> Dict[str, Any]:
    """Load a YAML file and render Jinja in all of its fields.

    Args:
        path: Location of the YAML file to load.
        dim_dict: Values made available to the Jinja templates while rendering
            (e.g. dimension name -> dimension value).

    Returns:
        The content of the YAML file as a dictionary, with Jinja expanded.

    Usage:
        # Create a playground.ipynb next to YAML file and run this in notebook
        from etl import grapher_helpers as gh
        m = gh.render_yaml_file("ghe.meta.yml", dim_dict={"sex": "male"})
        m['tables']['ghe']['variables']['death_count']
    """
    meta = dynamic_yaml_to_dict(dynamic_yaml_load(path))
    return _expand_jinja(meta, dim_dict)


def render_variable_meta(meta: catalog.VariableMeta, dim_dict: Dict[str, str]) -> catalog.VariableMeta:
    """Render Jinja in all fields of VariableMeta. Return a new VariableMeta object.

    The Jinja expansion is applied to ``meta.copy()``, not to ``meta`` itself.
    NOTE(review): whether nested fields of the input stay untouched depends on
    ``VariableMeta.copy()`` being a deep copy -- confirm.

    Args:
        meta: Variable metadata whose fields may contain Jinja templates.
        dim_dict: Values made available to the Jinja templates while rendering
            (e.g. dimension name -> dimension value).

    Returns:
        The copied metadata with Jinja expanded.

    Usage:
        from owid.catalog import Dataset
        from etl import grapher_helpers as gh
        from etl import paths
        tb = Dataset(paths.DATA_DIR / "garden/who/2024-07-30/ghe")['ghe']
        gh.render_variable_meta(tb.my_col.m, dim_dict={"sex": "male"})
    """
    # TODO: move this as a method to VariableMeta class
    return _expand_jinja(meta.copy(), dim_dict)


def _title_column_and_dimensions(title: str, dim_dict: Dict[str, Any]) -> str:
"""Create new title from column title and dimensions.
For instance `Deaths`, ["age", "sex"], ["10-18", "male"] will be converted into
Expand Down
Loading

0 comments on commit 41ee5c1

Please sign in to comment.