Skip to content

Commit

Permalink
Merge pull request #122 from Urban-Analytics-Technology-Platform/120-filepaths
Browse files Browse the repository at this point in the history

Revise metadata filepaths and module names (#120)
  • Loading branch information
sgreenbury authored Jul 1, 2024
2 parents d0d94fe + 7921e90 commit 86ab0ed
Show file tree
Hide file tree
Showing 14 changed files with 72 additions and 64 deletions.
4 changes: 2 additions & 2 deletions python/popgetter/assets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from . import be, ni, uk, us
from . import bel, gb_nir, uk, us

countries = [(mod, mod.__name__.split(".")[-1]) for mod in [be, ni, uk, us]]
countries = [(mod, mod.__name__.split(".")[-1]) for mod in [bel, gb_nir, uk, us]]

__all__ = ["countries"]
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
)

WORKING_DIR = Path("belgium")
asset_prefix = "be"
asset_prefix = "bel"
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,10 @@ def derived_metrics_by_partition(
derived_metrics: list[pd.DataFrame] = []
derived_mmd: list[MetricMetadata] = []

parquet_file_name = "".join(c for c in node if c.isalnum()) + ".parquet"
parquet_file_name = (
f"{asset_prefix}/metrics/"
f"{''.join(c for c in node if c.isalnum()) + '.parquet'}"
)

for metric_spec in metric_specs:
new_table = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
)
from popgetter.utils import markdown_from_plot

from .belgium import asset_prefix
from .belgium import asset_prefix, country
from .census_tables import publisher


Expand Down Expand Up @@ -115,6 +115,7 @@ def geometry(context, sector_geometries) -> list[GeometryOutput]:

for level_details in BELGIUM_GEOMETRY_LEVELS.values():
geometry_metadata = GeometryMetadata(
country_metadata=country,
validity_period_start=date(2023, 1, 1),
validity_period_end=date(2023, 12, 31),
level=level_details.level,
Expand Down
File renamed without changes.
9 changes: 5 additions & 4 deletions python/popgetter/assets/country.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ class Country(ABC):
"""

key_prefix: ClassVar[str]
country_metadata: ClassVar[CountryMetadata]
key_prefix: str
partition_name: str
dataset_node_partition: DynamicPartitionsDefinition

def __init__(self):
self.key_prefix = self.country_metadata.id
self.partition_name = f"{self.key_prefix}_nodes"
self.dataset_node_partition = DynamicPartitionsDefinition(
name=self.partition_name
Expand Down Expand Up @@ -85,9 +87,8 @@ def country_metadata(context):

return country_metadata

@abstractmethod
def _country_metadata(self, context) -> CountryMetadata:
...
def _country_metadata(self, _context) -> CountryMetadata:
return self.country_metadata

def create_data_publisher(self):
"""Creates an asset providing the data publisher metadata."""
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -241,19 +241,16 @@ def census_table_metadata(


class NorthernIreland(Country):
key_prefix: ClassVar[str] = "uk-ni"
country_metadata: CountryMetadata = CountryMetadata(
name_short_en="Northern Ireland",
name_official="Northern Ireland",
iso3="GBR",
iso2="GB",
iso3166_2="GB-NIR",
)
geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys())
tables_to_process: list[str] | None = TABLES_TO_PROCESS

def _country_metadata(self, _context) -> CountryMetadata:
return CountryMetadata(
name_short_en="Northern Ireland",
name_official="Northern Ireland",
iso3="GBR",
iso2="GB",
iso3166_2="GB-NIR",
)

def _data_publisher(
self, _context, country_metadata: CountryMetadata
) -> DataPublisher:
Expand Down Expand Up @@ -394,6 +391,7 @@ def _geometry(self, context) -> list[GeometryOutput]:
for level_details in NI_GEO_LEVELS.values():
# TODO: get correct values
geometry_metadata = GeometryMetadata(
country_metadata=self.country_metadata,
validity_period_start=CENSUS_COLLECTION_DATE,
validity_period_end=CENSUS_COLLECTION_DATE,
level=level_details.level,
Expand Down Expand Up @@ -538,7 +536,8 @@ def _derived_metrics(
)

parquet_file_name = (
"".join(c for c in partition_key if c.isalnum()) + ".parquet"
f"{self.key_prefix}/metrics/"
f"{''.join(c for c in partition_key if c.isalnum()) + '.parquet'}"
)
derived_metrics: list[pd.DataFrame] = []
derived_mmd: list[MetricMetadata] = []
Expand Down
30 changes: 8 additions & 22 deletions python/popgetter/io_managers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,27 +132,15 @@ class GeometryOutputPaths:

def get_full_paths_geoms(
self,
context: OutputContext,
geo_metadata: GeometryMetadata,
) -> GeometryOutputPaths:
filename_stem = geo_metadata.filename_stem
asset_prefix = list(context.partition_key.split("/"))[:-1] # e.g. ['be']
filepath_stem = geo_metadata.filename_stem
base_path = self.get_base_path()
return self.GeometryOutputPaths(
flatgeobuf=base_path
/ UPath("/".join([*asset_prefix, "geometries", f"{filename_stem}.fgb"])),
pmtiles=base_path
/ UPath(
"/".join([*asset_prefix, "geometries", f"TODO_{filename_stem}.pmtiles"])
),
geojsonseq=base_path
/ UPath(
"/".join([*asset_prefix, "geometries", f"{filename_stem}.geojsonseq"])
),
names=base_path
/ UPath(
"/".join([*asset_prefix, "geometries", f"{filename_stem}.parquet"])
),
flatgeobuf=base_path / UPath(f"{filepath_stem}.fgb"),
pmtiles=base_path / UPath(f"TODO_{filepath_stem}.pmtiles"),
geojsonseq=base_path / UPath(f"{filepath_stem}.geojsonseq"),
names=base_path / UPath(f"{filepath_stem}.parquet"),
)

def get_full_path_metadata(
Expand Down Expand Up @@ -217,7 +205,7 @@ def handle_output(
output.gdf["GEO_ID"] = output.gdf["GEO_ID"].astype("string")
output.names_df = output.names_df.astype("string")

full_paths = self.get_full_paths_geoms(context, output.metadata)
full_paths = self.get_full_paths_geoms(output.metadata)

self.handle_flatgeobuf(context, output.gdf, full_paths.flatgeobuf)
self.handle_geojsonseq(context, output.gdf, full_paths.geojsonseq)
Expand Down Expand Up @@ -253,12 +241,10 @@ def get_full_path_metadata(

def get_full_path_metrics(
self,
context: OutputContext,
parquet_path: str,
) -> UPath:
base_path = self.get_base_path()
asset_prefix = list(context.partition_key.split("/"))[:-1]
return base_path / UPath("/".join([*asset_prefix, "metrics", parquet_path]))
return base_path / UPath(parquet_path)

def handle_output(
self,
Expand Down Expand Up @@ -329,7 +315,7 @@ def handle_output(
# of the tuple
for metrics_output in obj:
rel_path = metrics_output.metadata[0].metric_parquet_path
full_path = self.get_full_path_metrics(context, rel_path)
full_path = self.get_full_path_metrics(rel_path)
self.handle_df(context, metrics_output.metrics, full_path)

# Add metadata
Expand Down
38 changes: 28 additions & 10 deletions python/popgetter/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,28 @@ def hash_class_vars(self):
Note that `vars()` does not include properties, so the IDs themselves are
not part of the hash, which avoids self-reference issues.
"""

# Must copy the dict to avoid overriding the actual instance attributes!
# Because we're only modifying dates -> strings, we don't need to perform a
# deepcopy
variables = dict(**vars(self))
# Python doesn't serialise dates to JSON, have to convert to ISO 8601 first
for key, val in variables.items():
if isinstance(val, date):
variables[key] = val.isoformat()
return sha256(jcs.canonicalize(variables)).hexdigest()
# deepcopy but all variables must be serializable
def serializable_vars(obj: object) -> dict:
variables = {}
# Check if variables are serializable
for key, val in vars(obj).items():
try:
jcs.canonicalize(val)
variables[key] = val
except Exception:
pass

# Python doesn't serialise dates to JSON, have to convert to ISO 8601 first
for key, val in variables.items():
if isinstance(val, date):
variables[key] = val.isoformat()

return variables

return sha256(jcs.canonicalize(serializable_vars(self))).hexdigest()

@classmethod
def fix_types(cls, df: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -67,8 +80,8 @@ class CountryMetadata(MetadataBaseModel):
@property
def id(self) -> str:
if self.iso3166_2 is not None:
return self.iso3166_2.lower()
return self.iso3.lower()
return self.iso3166_2.lower().replace("-", "_")
return self.iso3.lower().replace("-", "_")

name_short_en: str = Field(
description="The short name of the country in English (for example 'Belgium')."
Expand Down Expand Up @@ -119,10 +132,15 @@ def id(self) -> str:

@computed_field
@property
# TODO: update metadata field name to `filepath_stem` (https://github.com/Urban-Analytics-Technology-Platform/popgetter/issues/129)
def filename_stem(self) -> str:
level = "_".join(self.level.lower().split())
year = self.validity_period_start.year
return f"{level}_{year}"
return f"{self.country_metadata.id}/geometries/{level}_{year}"

country_metadata: CountryMetadata = Field(
"The `CountryMetadata` associated with the geometry.", exclude=True
)

validity_period_start: date = Field(
description="The start of the range of time for which the regions are valid (inclusive)"
Expand Down
20 changes: 10 additions & 10 deletions tests/test_be.py → tests/test_bel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from rdflib import Graph
from rdflib.namespace import DCAT

from popgetter.assets import be
from popgetter.assets import bel


@pytest.fixture(scope="module")
Expand All @@ -36,7 +36,7 @@ def demo_catalog() -> Graph:
@pytest.fixture(scope="module")
def demo_catalog_df(demo_catalog) -> pd.DataFrame:
context = build_asset_context()
return be.census_tables.catalog_as_dataframe(context, demo_catalog)
return bel.census_tables.catalog_as_dataframe(context, demo_catalog)


@pytest.mark.skip(
Expand All @@ -46,7 +46,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors):
# Test the that the row count is correctly added to the metadata
context = build_asset_context()

actual_municipalities = be.census_geometry.aggregate_sectors_to_municipalities(
actual_municipalities = bel.census_geometry.aggregate_sectors_to_municipalities(
context, demo_sectors
)

Expand All @@ -62,7 +62,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors):
@pytest.mark.skip(reason="Fix test_get_population_details_per_municipality first")
def test_get_population_details_per_municipality():
with build_asset_context() as muni_context:
stat_muni = be.census_tables.get_population_details_per_municipality(
stat_muni = bel.census_tables.get_population_details_per_municipality(
muni_context
)

Expand All @@ -87,7 +87,7 @@ def test_pivot_population():
)

# Get the geometries
stat_muni = be.census_tables.get_population_details_per_municipality(
stat_muni = bel.census_tables.get_population_details_per_municipality(
muni_context
)

Expand All @@ -99,7 +99,7 @@ def test_pivot_population():

with build_asset_context() as pivot_context:
# Pivot the population
pivoted = be.pivot_population(pivot_context, stat_muni)
pivoted = bel.pivot_population(pivot_context, stat_muni)

expected_number_of_municipalities = 581

Expand All @@ -115,7 +115,7 @@ def test_demo_catalog(demo_catalog):
actual_length = len(
list(
demo_catalog.objects(
subject=be.census_tables.opendata_catalog_root,
subject=bel.census_tables.opendata_catalog_root,
predicate=DCAT.dataset,
unique=False,
)
Expand All @@ -128,7 +128,7 @@ def test_demo_catalog(demo_catalog):
def test_catalog_metadata_details(demo_catalog_df):
# Get the metadata for a specific dataset in the demo catalogue:
# https://statbel.fgov.be/node/4151 "Population by Statistical sector"
# mmd = be.census_tables.get_mmd_from_dataset_node(
# mmd = bel.census_tables.get_mmd_from_dataset_node(
# demo_catalog, dataset_node=URIRef("https://statbel.fgov.be/node/4151")
# )

Expand Down Expand Up @@ -179,7 +179,7 @@ def test_catalog_as_dataframe(demo_catalog_df):

# # Convert the demo catalog to a DataFrame
# with build_asset_context() as context:
# catalog_df = be.census_tables.catalog_as_dataframe(context, demo_catalog_df)
# catalog_df = bel.census_tables.catalog_as_dataframe(context, demo_catalog_df)

# # Check that the catalog has been converted to a DataFrame
# assert isinstance(catalog_df, pd.DataFrame)
Expand Down Expand Up @@ -228,7 +228,7 @@ def test_filter_known_failing_datasets():
"2676",
]

actual_list = be.census_tables.filter_known_failing_datasets(mock_catalog)
actual_list = bel.census_tables.filter_known_failing_datasets(mock_catalog)

assert mock_catalog != expected_list
assert actual_list != mock_catalog
Expand Down
2 changes: 1 addition & 1 deletion tests/test_cloud_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# )
# generate_pmtiles,
# TODO, Move this to a fixture to somewhere more universal
from .test_be import demo_sectors # noqa: F401
from .test_bel import demo_sectors # noqa: F401

# Commented out test as part of #92 as functions no longer importable
# @pytest.mark.skip(
Expand Down
2 changes: 1 addition & 1 deletion tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def test_source_data_release_hash():
)
assert (
source_data_release.id
== "9ec7e234d73664339e4c1f04bfa485dbb17e204dd72dc3ffbb9cab6870475597"
== "4d61bfe401ba17becd02d6b3912152c135daa9ecaebc9bd45a589dc831a85217"
)

source_data_release2 = SourceDataRelease(
Expand Down

0 comments on commit 86ab0ed

Please sign in to comment.