Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
Marigold committed Dec 13, 2024
1 parent 8a838cc commit 35165aa
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 15 deletions.
2 changes: 1 addition & 1 deletion etl/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def variable_metadata_url(variable_id):
MAX_VIRTUAL_MEMORY_LINUX = 32 * 2**30 # 32 GB

# increment this to force a full rebuild of all datasets
ETL_EPOCH = 6
ETL_EPOCH = 5

# any garden or grapher dataset after this date will have strict mode enabled
STRICT_AFTER = "2023-06-25"
Expand Down
19 changes: 7 additions & 12 deletions etl/grapher_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def _yield_wide_table(
table_to_yield = table_to_yield[[c for c in table_to_yield.columns if c not in dim_names]]

# Filter NaN values from dimensions and return dictionary
dim_dict: Dict[str, Any] = {n: v for n, v in zip(dim_names, dim_values) if pd.notnull(v)}
dim_dict = _create_dim_dict(dim_names, dim_values) # type: ignore

# Now iterate over every column in the original dataset and export the
# subset of data that we prepared above
Expand Down Expand Up @@ -207,18 +207,18 @@ def _metadata_for_dimensions(meta: catalog.VariableMeta, dim_dict: Dict[str, Any
return meta


def _create_dim_dict(dim_names: List[str], dim_values: List[Any]) -> Dict[str, Any]:
# Filter NaN values from dimensions and return dictionary
return {n: v for n, v in zip(dim_names, dim_values) if pd.notnull(v)}


def long_to_wide(long_tb: catalog.Table) -> catalog.Table:
"""Convert a long table to a wide table by unstacking dimensions. This function mimics the process that occurs
when a long table is upserted to the database. With this function, you can explicitly perform this transformation
in the grapher step and store a flattened dataset in the catalog."""

dim_names = [k for k in long_tb.primary_key if k not in ("year", "country", "date")]

# Check for null values in dimensions
# for dim_name in dim_names:
# if long_tb.index.get_level_values(dim_name).isnull().any():
# raise ValueError(f"NaN values in dimension: {dim_name}")

# Unstack dimensions to a wide format
wide_tb = cast(catalog.Table, long_tb.unstack(level=dim_names)) # type: ignore

Expand All @@ -231,13 +231,8 @@ def long_to_wide(long_tb: catalog.Table) -> catalog.Table:
for dims in wide_tb.columns:
column = dims[0]

# TODO: DRY this with function above
# Filter NaN values from dimensions and return dictionary
dim_dict: Dict[str, Any] = {n: v for n, v in zip(dim_names, dims[1:]) if pd.notnull(v)}

# Check for NaN values in dimensions
# if any(pd.isnull(v) for v in dim_dict.values()):
# raise ValueError(f"NaN values in dimensions: {dim_dict}")
dim_dict = _create_dim_dict(dim_names, dims[1:])

# Create a short name from dimension values
short_name = _underscore_column_and_dimensions(column, dim_dict)
Expand Down
4 changes: 2 additions & 2 deletions etl/steps/data/grapher/cancer/2024-08-30/gco_alcohol.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ def run(dest_dir: str) -> None:
#
# Save outputs.
#
# Create a new grapher dataset with the same metadata as the garden dataset
# Create a new grapher dataset with the same metadata as the garden dataset.
ds_grapher = create_dataset(
dest_dir, tables=[tb], long_to_wide=True, check_variables_metadata=True, default_metadata=ds_garden.metadata
dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
)

# Save changes in the new grapher dataset.
Expand Down
23 changes: 23 additions & 0 deletions tests/test_grapher_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,3 +245,26 @@ def test_underscore_column_and_dimensions():
def test_title_column_and_dimensions():
    """Check the title built from a column name and its dimension dictionary."""
    # Each case: (dimension dictionary, expected title for column "A").
    cases = [
        ({"age": "1"}, "A - Age: 1"),
        ({"age_group": "15-18"}, "A - Age group: 15-18"),
    ]
    for dim_dict, expected_title in cases:
        assert gh._title_column_and_dimensions("A", dim_dict) == expected_title


def test_long_to_wide():
    """Convert a long table with an ``age`` dimension to wide and check columns and titles."""
    # The last row has a NaN age; the expected output below includes a plain
    # ``deaths`` column without any age suffix.
    long_df = pd.DataFrame(
        {
            "year": [2019] * 4,
            "country": ["France"] * 4,
            "age": ["10-18", "19-25", "26-30", np.nan],
            "deaths": [1, 2, 3, 4],
        }
    )
    long_tb = Table(long_df.set_index(["country", "year", "age"]))
    long_tb.deaths.metadata.unit = "people"
    long_tb.deaths.metadata.title = "Deaths"

    wide = gh.long_to_wide(long_tb)

    # Expected wide columns, in order, mapped to their expected titles.
    expected_titles = {
        "deaths": "Deaths",
        "deaths__age_10_18": "Deaths - Age: 10-18",
        "deaths__age_19_25": "Deaths - Age: 19-25",
        "deaths__age_26_30": "Deaths - Age: 26-30",
    }

    assert list(wide.columns) == list(expected_titles)

    for column, title in expected_titles.items():
        assert wide[column].m.title == title

0 comments on commit 35165aa

Please sign in to comment.