wip: extract _create_dim_dict helper and add test for long_to_wide

owid · Dec 13, 2024 · 35165aa · 35165aa
1 parent 8a838cc
commit 35165aa
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 15 deletions.
diff --git a/etl/config.py b/etl/config.py
@@ -180,7 +180,7 @@ def variable_metadata_url(variable_id):
 MAX_VIRTUAL_MEMORY_LINUX = 32 * 2**30  # 32 GB
 
 # increment this to force a full rebuild of all datasets
-ETL_EPOCH = 6
+ETL_EPOCH = 5
 
 # any garden or grapher dataset after this date will have strict mode enabled
 STRICT_AFTER = "2023-06-25"

diff --git a/etl/grapher_helpers.py b/etl/grapher_helpers.py
@@ -134,7 +134,7 @@ def _yield_wide_table(
         table_to_yield = table_to_yield[[c for c in table_to_yield.columns if c not in dim_names]]
 
         # Filter NaN values from dimensions and return dictionary
-        dim_dict: Dict[str, Any] = {n: v for n, v in zip(dim_names, dim_values) if pd.notnull(v)}
+        dim_dict = _create_dim_dict(dim_names, dim_values)  # type: ignore
 
         # Now iterate over every column in the original dataset and export the
         # subset of data that we prepared above
@@ -207,18 +207,18 @@ def _metadata_for_dimensions(meta: catalog.VariableMeta, dim_dict: Dict[str, Any
     return meta
 
 
+def _create_dim_dict(dim_names: List[str], dim_values: List[Any]) -> Dict[str, Any]:
+    """Pair dimension names with values positionally, leaving out pairs whose value fails ``pd.notnull``."""
+    return {n: v for n, v in zip(dim_names, dim_values) if pd.notnull(v)}
+
+
 def long_to_wide(long_tb: catalog.Table) -> catalog.Table:
     """Convert a long table to a wide table by unstacking dimensions. This function mimics the process that occurs
     when a long table is upserted to the database. With this function, you can explicitly perform this transformation
     in the grapher step and store a flattened dataset in the catalog."""
 
     dim_names = [k for k in long_tb.primary_key if k not in ("year", "country", "date")]
 
-    # Check for null values in dimensions
-    # for dim_name in dim_names:
-    #     if long_tb.index.get_level_values(dim_name).isnull().any():
-    #         raise ValueError(f"NaN values in dimension: {dim_name}")
-
     # Unstack dimensions to a wide format
     wide_tb = cast(catalog.Table, long_tb.unstack(level=dim_names))  # type: ignore
 
@@ -231,13 +231,8 @@ def long_to_wide(long_tb: catalog.Table) -> catalog.Table:
     for dims in wide_tb.columns:
         column = dims[0]
 
-        # TODO: DRY this with function above
         # Filter NaN values from dimensions and return dictionary
-        dim_dict: Dict[str, Any] = {n: v for n, v in zip(dim_names, dims[1:]) if pd.notnull(v)}
-
-        # Check for NaN values in dimensions
-        # if any(pd.isnull(v) for v in dim_dict.values()):
-        #     raise ValueError(f"NaN values in dimensions: {dim_dict}")
+        dim_dict = _create_dim_dict(dim_names, dims[1:])
 
         # Create a short name from dimension values
         short_name = _underscore_column_and_dimensions(column, dim_dict)

diff --git a/etl/steps/data/grapher/cancer/2024-08-30/gco_alcohol.py b/etl/steps/data/grapher/cancer/2024-08-30/gco_alcohol.py
@@ -23,9 +23,9 @@ def run(dest_dir: str) -> None:
     #
     # Save outputs.
     #
-    # Create a new grapher dataset with the same metadata as the garden dataset
+    # Create a new grapher dataset with the same metadata as the garden dataset.
     ds_grapher = create_dataset(
-        dest_dir, tables=[tb], long_to_wide=True, check_variables_metadata=True, default_metadata=ds_garden.metadata
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
     )
 
     # Save changes in the new grapher dataset.

diff --git a/tests/test_grapher_helpers.py b/tests/test_grapher_helpers.py
@@ -245,3 +245,26 @@ def test_underscore_column_and_dimensions():
 def test_title_column_and_dimensions():
     assert gh._title_column_and_dimensions("A", {"age": "1"}) == "A - Age: 1"
     assert gh._title_column_and_dimensions("A", {"age_group": "15-18"}) == "A - Age group: 15-18"
+
+
+def test_long_to_wide():
+    df = pd.DataFrame(
+        {
+            "year": [2019, 2019, 2019, 2019],
+            "country": ["France", "France", "France", "France"],
+            "age": ["10-18", "19-25", "26-30", np.nan],  # last row has a null dimension value
+            "deaths": [1, 2, 3, 4],
+        }
+    )
+    table = Table(df.set_index(["country", "year", "age"]))  # `age` is the only non country/year index level
+    table.deaths.metadata.unit = "people"
+    table.deaths.metadata.title = "Deaths"
+
+    wide = gh.long_to_wide(table)
+
+    assert list(wide.columns) == ["deaths", "deaths__age_10_18", "deaths__age_19_25", "deaths__age_26_30"]
+
+    assert wide["deaths"].m.title == "Deaths"  # expected: the null-age column keeps the bare name and title
+    assert wide["deaths__age_10_18"].m.title == "Deaths - Age: 10-18"
+    assert wide["deaths__age_19_25"].m.title == "Deaths - Age: 19-25"
+    assert wide["deaths__age_26_30"].m.title == "Deaths - Age: 26-30"