Skip to content

Commit

Permalink
Merge pull request #122 from Urban-Analytics-Technology-Platform/120-filepaths
Browse files Browse the repository at this point in the history

Revise metadata filepaths and module names (#120)
  • Loading branch information
sgreenbury authored Jul 1, 2024
2 parents d0d94fe + 7921e90 commit 86ab0ed
Show file tree
Hide file tree
Showing 14 changed files with 72 additions and 64 deletions.
4 changes: 2 additions & 2 deletions python/popgetter/assets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from . import be, ni, uk, us
from . import bel, gb_nir, uk, us

countries = [(mod, mod.__name__.split(".")[-1]) for mod in [be, ni, uk, us]]
countries = [(mod, mod.__name__.split(".")[-1]) for mod in [bel, gb_nir, uk, us]]

__all__ = ["countries"]
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
)

WORKING_DIR = Path("belgium")
asset_prefix = "be"
asset_prefix = "bel"
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,10 @@ def derived_metrics_by_partition(
derived_metrics: list[pd.DataFrame] = []
derived_mmd: list[MetricMetadata] = []

parquet_file_name = "".join(c for c in node if c.isalnum()) + ".parquet"
parquet_file_name = (
f"{asset_prefix}/metrics/"
f"{''.join(c for c in node if c.isalnum()) + '.parquet'}"
)

for metric_spec in metric_specs:
new_table = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
)
from popgetter.utils import markdown_from_plot

from .belgium import asset_prefix
from .belgium import asset_prefix, country
from .census_tables import publisher


Expand Down Expand Up @@ -115,6 +115,7 @@ def geometry(context, sector_geometries) -> list[GeometryOutput]:

for level_details in BELGIUM_GEOMETRY_LEVELS.values():
geometry_metadata = GeometryMetadata(
country_metadata=country,
validity_period_start=date(2023, 1, 1),
validity_period_end=date(2023, 12, 31),
level=level_details.level,
Expand Down
File renamed without changes.
9 changes: 5 additions & 4 deletions python/popgetter/assets/country.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ class Country(ABC):
"""

key_prefix: ClassVar[str]
country_metadata: ClassVar[CountryMetadata]
key_prefix: str
partition_name: str
dataset_node_partition: DynamicPartitionsDefinition

def __init__(self):
self.key_prefix = self.country_metadata.id
self.partition_name = f"{self.key_prefix}_nodes"
self.dataset_node_partition = DynamicPartitionsDefinition(
name=self.partition_name
Expand Down Expand Up @@ -85,9 +87,8 @@ def country_metadata(context):

return country_metadata

@abstractmethod
def _country_metadata(self, context) -> CountryMetadata:
...
def _country_metadata(self, _context) -> CountryMetadata:
return self.country_metadata

def create_data_publisher(self):
"""Creates an asset providing the data publisher metadata."""
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -241,19 +241,16 @@ def census_table_metadata(


class NorthernIreland(Country):
key_prefix: ClassVar[str] = "uk-ni"
country_metadata: CountryMetadata = CountryMetadata(
name_short_en="Northern Ireland",
name_official="Northern Ireland",
iso3="GBR",
iso2="GB",
iso3166_2="GB-NIR",
)
geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys())
tables_to_process: list[str] | None = TABLES_TO_PROCESS

def _country_metadata(self, _context) -> CountryMetadata:
return CountryMetadata(
name_short_en="Northern Ireland",
name_official="Northern Ireland",
iso3="GBR",
iso2="GB",
iso3166_2="GB-NIR",
)

def _data_publisher(
self, _context, country_metadata: CountryMetadata
) -> DataPublisher:
Expand Down Expand Up @@ -394,6 +391,7 @@ def _geometry(self, context) -> list[GeometryOutput]:
for level_details in NI_GEO_LEVELS.values():
# TODO: get correct values
geometry_metadata = GeometryMetadata(
country_metadata=self.country_metadata,
validity_period_start=CENSUS_COLLECTION_DATE,
validity_period_end=CENSUS_COLLECTION_DATE,
level=level_details.level,
Expand Down Expand Up @@ -538,7 +536,8 @@ def _derived_metrics(
)

parquet_file_name = (
"".join(c for c in partition_key if c.isalnum()) + ".parquet"
f"{self.key_prefix}/metrics/"
f"{''.join(c for c in partition_key if c.isalnum()) + '.parquet'}"
)
derived_metrics: list[pd.DataFrame] = []
derived_mmd: list[MetricMetadata] = []
Expand Down
30 changes: 8 additions & 22 deletions python/popgetter/io_managers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,27 +132,15 @@ class GeometryOutputPaths:

def get_full_paths_geoms(
self,
context: OutputContext,
geo_metadata: GeometryMetadata,
) -> GeometryOutputPaths:
filename_stem = geo_metadata.filename_stem
asset_prefix = list(context.partition_key.split("/"))[:-1] # e.g. ['be']
filepath_stem = geo_metadata.filename_stem
base_path = self.get_base_path()
return self.GeometryOutputPaths(
flatgeobuf=base_path
/ UPath("/".join([*asset_prefix, "geometries", f"{filename_stem}.fgb"])),
pmtiles=base_path
/ UPath(
"/".join([*asset_prefix, "geometries", f"TODO_{filename_stem}.pmtiles"])
),
geojsonseq=base_path
/ UPath(
"/".join([*asset_prefix, "geometries", f"{filename_stem}.geojsonseq"])
),
names=base_path
/ UPath(
"/".join([*asset_prefix, "geometries", f"{filename_stem}.parquet"])
),
flatgeobuf=base_path / UPath(f"{filepath_stem}.fgb"),
pmtiles=base_path / UPath(f"TODO_{filepath_stem}.pmtiles"),
geojsonseq=base_path / UPath(f"{filepath_stem}.geojsonseq"),
names=base_path / UPath(f"{filepath_stem}.parquet"),
)

def get_full_path_metadata(
Expand Down Expand Up @@ -217,7 +205,7 @@ def handle_output(
output.gdf["GEO_ID"] = output.gdf["GEO_ID"].astype("string")
output.names_df = output.names_df.astype("string")

full_paths = self.get_full_paths_geoms(context, output.metadata)
full_paths = self.get_full_paths_geoms(output.metadata)

self.handle_flatgeobuf(context, output.gdf, full_paths.flatgeobuf)
self.handle_geojsonseq(context, output.gdf, full_paths.geojsonseq)
Expand Down Expand Up @@ -253,12 +241,10 @@ def get_full_path_metadata(

def get_full_path_metrics(
self,
context: OutputContext,
parquet_path: str,
) -> UPath:
base_path = self.get_base_path()
asset_prefix = list(context.partition_key.split("/"))[:-1]
return base_path / UPath("/".join([*asset_prefix, "metrics", parquet_path]))
return base_path / UPath(parquet_path)

def handle_output(
self,
Expand Down Expand Up @@ -329,7 +315,7 @@ def handle_output(
# of the tuple
for metrics_output in obj:
rel_path = metrics_output.metadata[0].metric_parquet_path
full_path = self.get_full_path_metrics(context, rel_path)
full_path = self.get_full_path_metrics(rel_path)
self.handle_df(context, metrics_output.metrics, full_path)

# Add metadata
Expand Down
38 changes: 28 additions & 10 deletions python/popgetter/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,28 @@ def hash_class_vars(self):
Note that `vars()` does not include properties, so the IDs themselves are
not part of the hash, which avoids self-reference issues.
"""

# Must copy the dict to avoid overriding the actual instance attributes!
# Because we're only modifying dates -> strings, we don't need to perform a
# deepcopy
variables = dict(**vars(self))
# Python doesn't serialise dates to JSON, have to convert to ISO 8601 first
for key, val in variables.items():
if isinstance(val, date):
variables[key] = val.isoformat()
return sha256(jcs.canonicalize(variables)).hexdigest()
# deepcopy but all variables must be serializable
def serializable_vars(obj: object) -> dict:
variables = {}
# Check if variables are serializable
for key, val in vars(obj).items():
try:
jcs.canonicalize(val)
variables[key] = val
except Exception:
pass

# Python doesn't serialise dates to JSON, have to convert to ISO 8601 first
for key, val in variables.items():
if isinstance(val, date):
variables[key] = val.isoformat()

return variables

return sha256(jcs.canonicalize(serializable_vars(self))).hexdigest()

@classmethod
def fix_types(cls, df: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -67,8 +80,8 @@ class CountryMetadata(MetadataBaseModel):
@property
def id(self) -> str:
if self.iso3166_2 is not None:
return self.iso3166_2.lower()
return self.iso3.lower()
return self.iso3166_2.lower().replace("-", "_")
return self.iso3.lower().replace("-", "_")

name_short_en: str = Field(
description="The short name of the country in English (for example 'Belgium')."
Expand Down Expand Up @@ -119,10 +132,15 @@ def id(self) -> str:

@computed_field
@property
# TODO: update metadata field name to `filepath_stem` (https://github.com/Urban-Analytics-Technology-Platform/popgetter/issues/129)
def filename_stem(self) -> str:
level = "_".join(self.level.lower().split())
year = self.validity_period_start.year
return f"{level}_{year}"
return f"{self.country_metadata.id}/geometries/{level}_{year}"

country_metadata: CountryMetadata = Field(
"The `CountryMetadata` associated with the geometry.", exclude=True
)

validity_period_start: date = Field(
description="The start of the range of time for which the regions are valid (inclusive)"
Expand Down
20 changes: 10 additions & 10 deletions tests/test_be.py → tests/test_bel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from rdflib import Graph
from rdflib.namespace import DCAT

from popgetter.assets import be
from popgetter.assets import bel


@pytest.fixture(scope="module")
Expand All @@ -36,7 +36,7 @@ def demo_catalog() -> Graph:
@pytest.fixture(scope="module")
def demo_catalog_df(demo_catalog) -> pd.DataFrame:
context = build_asset_context()
return be.census_tables.catalog_as_dataframe(context, demo_catalog)
return bel.census_tables.catalog_as_dataframe(context, demo_catalog)


@pytest.mark.skip(
Expand All @@ -46,7 +46,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors):
# Test the that the row count is correctly added to the metadata
context = build_asset_context()

actual_municipalities = be.census_geometry.aggregate_sectors_to_municipalities(
actual_municipalities = bel.census_geometry.aggregate_sectors_to_municipalities(
context, demo_sectors
)

Expand All @@ -62,7 +62,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors):
@pytest.mark.skip(reason="Fix test_get_population_details_per_municipality first")
def test_get_population_details_per_municipality():
with build_asset_context() as muni_context:
stat_muni = be.census_tables.get_population_details_per_municipality(
stat_muni = bel.census_tables.get_population_details_per_municipality(
muni_context
)

Expand All @@ -87,7 +87,7 @@ def test_pivot_population():
)

# Get the geometries
stat_muni = be.census_tables.get_population_details_per_municipality(
stat_muni = bel.census_tables.get_population_details_per_municipality(
muni_context
)

Expand All @@ -99,7 +99,7 @@ def test_pivot_population():

with build_asset_context() as pivot_context:
# Pivot the population
pivoted = be.pivot_population(pivot_context, stat_muni)
pivoted = bel.pivot_population(pivot_context, stat_muni)

expected_number_of_municipalities = 581

Expand All @@ -115,7 +115,7 @@ def test_demo_catalog(demo_catalog):
actual_length = len(
list(
demo_catalog.objects(
subject=be.census_tables.opendata_catalog_root,
subject=bel.census_tables.opendata_catalog_root,
predicate=DCAT.dataset,
unique=False,
)
Expand All @@ -128,7 +128,7 @@ def test_demo_catalog(demo_catalog):
def test_catalog_metadata_details(demo_catalog_df):
# Get the metadata for a specific dataset in the demo catalogue:
# https://statbel.fgov.be/node/4151 "Population by Statistical sector"
# mmd = be.census_tables.get_mmd_from_dataset_node(
# mmd = bel.census_tables.get_mmd_from_dataset_node(
# demo_catalog, dataset_node=URIRef("https://statbel.fgov.be/node/4151")
# )

Expand Down Expand Up @@ -179,7 +179,7 @@ def test_catalog_as_dataframe(demo_catalog_df):

# # Convert the demo catalog to a DataFrame
# with build_asset_context() as context:
# catalog_df = be.census_tables.catalog_as_dataframe(context, demo_catalog_df)
# catalog_df = bel.census_tables.catalog_as_dataframe(context, demo_catalog_df)

# # Check that the catalog has been converted to a DataFrame
# assert isinstance(catalog_df, pd.DataFrame)
Expand Down Expand Up @@ -228,7 +228,7 @@ def test_filter_known_failing_datasets():
"2676",
]

actual_list = be.census_tables.filter_known_failing_datasets(mock_catalog)
actual_list = bel.census_tables.filter_known_failing_datasets(mock_catalog)

assert mock_catalog != expected_list
assert actual_list != mock_catalog
Expand Down
2 changes: 1 addition & 1 deletion tests/test_cloud_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# )
# generate_pmtiles,
# TODO, Move this to a fixture to somewhere more universal
from .test_be import demo_sectors # noqa: F401
from .test_bel import demo_sectors # noqa: F401

# Commented out test as part of #92 as functions no longer importable
# @pytest.mark.skip(
Expand Down
2 changes: 1 addition & 1 deletion tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def test_source_data_release_hash():
)
assert (
source_data_release.id
== "9ec7e234d73664339e4c1f04bfa485dbb17e204dd72dc3ffbb9cab6870475597"
== "4d61bfe401ba17becd02d6b3912152c135daa9ecaebc9bd45a589dc831a85217"
)

source_data_release2 = SourceDataRelease(
Expand Down

0 comments on commit 86ab0ed

Please sign in to comment.