From 502c3cb117e2d15197cef379b3b3f5ee30ec1b70 Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Mon, 13 May 2024 16:59:41 +0200 Subject: [PATCH] Fixes python script to match the table schema --- .gitignore | 4 +- .../gadm36_levels0-2_simp.sha256 | 3 +- .../data_checksums/geo_region.zip.sha256 | 2 +- data/preprocessing/gadm/Makefile | 43 +++++++------------ data/preprocessing/gadm/gadm_h3.py | 30 ++++++++----- 5 files changed, 40 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index ecc592beb..0f397983f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,9 +5,9 @@ node_modules /api/coverage .env -## Data +## Data folders (except for root "data" module which contains all the data engineering bits /data/datasets/** -/data/h3_data_importer/data +/data/**/data infrastructure/base/vars/* infrastructure/kubernetes/vars/* diff --git a/data/gadm_importer/data_checksums/gadm36_levels0-2_simp.sha256 b/data/gadm_importer/data_checksums/gadm36_levels0-2_simp.sha256 index 4563ca64a..6a03f24bd 100644 --- a/data/gadm_importer/data_checksums/gadm36_levels0-2_simp.sha256 +++ b/data/gadm_importer/data_checksums/gadm36_levels0-2_simp.sha256 @@ -1,5 +1,4 @@ -abeadbd726250dc419ba2e58c3b3cc8ddc23b5fd673e669882b55cf1d8a937d6 gadm36_levels0-2_simp.dbf +cd8a311f163688d4252712d0bb0b20f1f22278e84266b18565f97e65523540fe gadm36_levels0-2_simp.dbf 98aaf3d1c0ecadf1a424a4536de261c3daf4e373697cb86c40c43b989daf52eb gadm36_levels0-2_simp.prj -95554d6002422bca7118ced401c05a5869808343612155a4d128cf775c739fdb gadm36_levels0-2_simp.sha256 5578ded00390bba9e8c332e8219e17dcd9e9a94a1c2d516ae78708b76c652b1a gadm36_levels0-2_simp.shp 7c7e4b7cf827eefcd534bdcdf946cb6409255f06e8a12211f1f0311269d626ad gadm36_levels0-2_simp.shx diff --git a/data/gadm_importer/data_checksums/geo_region.zip.sha256 b/data/gadm_importer/data_checksums/geo_region.zip.sha256 index ab542af3c..f3e19072a 100644 --- a/data/gadm_importer/data_checksums/geo_region.zip.sha256 +++ 
b/data/gadm_importer/data_checksums/geo_region.zip.sha256 @@ -1 +1 @@ -14df4a3134734b9a520610c538595fe750f04554babd9faf2836d532f8c6095e geo_region.zip +b7d5c357d0f0dd521c3896ba28b87ac77e3a99202e7c64f3f311575666f23a0e geo_region.zip diff --git a/data/preprocessing/gadm/Makefile b/data/preprocessing/gadm/Makefile index bc75e6795..a9b904090 100644 --- a/data/preprocessing/gadm/Makefile +++ b/data/preprocessing/gadm/Makefile @@ -1,43 +1,32 @@ -.PHONY: geo_region_table combine-gadm-file +.PHONY: upload_results +SHELL := /bin/bash -# results of data processing -DATADIR=data -# intermediate files -WORKDIR=data/tmp +WORKDIR=data all: upload_results make clean-workdir upload_results: checksum - aws s3 cp $(DATADIR)/geo_region.zip $(AWS_S3_BUCKET_URL)/processed/geo_region/ - rm $(DATADIR)/geo_region.zip $(DATADIR)/geo_region.csv - aws s3 sync $(DATADIR) $(AWS_S3_BUCKET_URL)/processed/gadm + aws s3 cp $(WORKDIR)/geo_region.zip $(AWS_S3_BUCKET_URL)/processed/geo_region/ + #rm $(WORKDIR)/geo_region.zip $(WORKDIR)/geo_region.csv + aws s3 sync --exclude="*" --include="gadm36_levels0-2_simp.*" $(WORKDIR) $(AWS_S3_BUCKET_URL)/processed/gadm -checksum: +checksum: compress-geo_region @echo "Generating checksums..." - cd $(DATADIR) && sha256sum geo_region.zip > ../../../gadm_importer/data_checksums/geo_region.zip.sha256 - cd $(DATADIR) && sha256sum gadm36_levels0-2_simp.* > ../../../gadm_importer/data_checksums/gadm36_levels0-2_simp.sha256 + cd $(WORKDIR) && sha256sum geo_region.zip > ../../../gadm_importer/data_checksums/geo_region.zip.sha256 + cd $(WORKDIR) && sha256sum gadm36_levels0-2_simp.* > ../../../gadm_importer/data_checksums/gadm36_levels0-2_simp.sha256 compress-geo_region: geo_region_table @echo "Compressing geo_region.csv..." - zip -j $(DATADIR)/geo_region.zip $(DATADIR)/geo_region.csv + zip -j $(WORKDIR)/geo_region.zip $(WORKDIR)/geo_region.csv geo_region_table: combine-gadm-file - @echo "Importing GADM data to database..." 
- # load shapefile into `gadm_levels0_2 table - ogr2ogr -makevalid -update -append \ - -nln gadm_levels0_2 -nlt PROMOTE_TO_MULTI -geomfield the_geom \ - -t_srs EPSG:4326 -a_srs EPSG:4326 --config OGR_TRUNCATE YES \ - -f PostgreSQL PG:"dbname=$$API_POSTGRES_DATABASE host=$$API_POSTGRES_HOST port=$$API_POSTGRES_PORT user=$$API_POSTGRES_USERNAME password=$$API_POSTGRES_PASSWORD" \ - $(DATADIR)/gadm36_levels0-2_simp.shp + python gadm_h3.py data/gadm36_levels0-2_simp.shp data/geo_region.csv - # create or update `geo region` entity table - PGPASSWORD=$$API_POSTGRES_PASSWORD && \ - psql -d $$API_POSTGRES_DATABASE -h $$API_POSTGRES_HOST -p $$API_POSTGRES_PORT -U $$API_POSTGRES_USERNAME -f make_geo_region_table.sql -$(DATADIR)/gadm36_levels0-2_simp.shp: gadm36_0_simp.shp gadm36_1_simp.shp gadm36_2_simp.shp +combine-gadm-file: gadm36_0_simp.shp gadm36_1_simp.shp gadm36_2_simp.shp @echo "Combining GADM files..." mapshaper -i $(WORKDIR)/gadm36_0_simp.shp $(WORKDIR)/gadm36_1_simp.shp $(WORKDIR)/gadm36_2_simp.shp snap combine-files \ -each 'level = this.layer_name == "gadm36_0_simp" ? 0 \ @@ -54,21 +43,21 @@ $(DATADIR)/gadm36_levels0-2_simp.shp: gadm36_0_simp.shp gadm36_1_simp.shp gadm36 : null' \ -each 'gid_0 = GID_0' \ -filter-fields mpath,name,level,gid_0 \ - -merge-layers force -o $(DATADIR)/gadm36_levels0-2_simp.shp + -merge-layers force -o $(WORKDIR)/gadm36_levels0-2_simp.shp gadm36_%_simp.shp: decompress-gadm @echo "Simplifying $@ ..." mapshaper $(WORKDIR)/gadm36_$*.shp -simplify 10% -filter-islands min-vertices=3 -filter-slivers -clean -o $(WORKDIR)/$@ force -decompress-gadm: $(WORKDIR)/gadm36_levels_shp.zip +decompress-gadm: download-gadm @echo "Decompressing GADM file..." unzip -u $(WORKDIR)/gadm36_levels_shp.zip gadm36_0* gadm36_1* gadm36_2* -d $(WORKDIR) rm $(WORKDIR)/gadm36_levels_shp.zip -$(WORKDIR)/gadm36_levels_shp.zip: +download-gadm: @echo "Downloading GADM file..." 
mkdir -p $(WORKDIR) cd $(WORKDIR) && curl -O https://data.biogeo.ucdavis.edu/data/gadm3.6/gadm36_levels_shp.zip clean-workdir: - rm -rf $(WORKDIR) + rm -rf $(WORKDIR)/* diff --git a/data/preprocessing/gadm/gadm_h3.py b/data/preprocessing/gadm/gadm_h3.py index 211895ec6..723aadd99 100644 --- a/data/preprocessing/gadm/gadm_h3.py +++ b/data/preprocessing/gadm/gadm_h3.py @@ -1,3 +1,8 @@ +"""Converts the GADM Shapefile to a csv with h3 cells for ingesting +into geo_region table. +""" + +import uuid from pathlib import Path import click @@ -13,21 +18,26 @@ @click.argument("output", type=click.Path(path_type=Path)) def main(filename: Path, output) -> None: """Convert gadm shapefile to csv with h3 columns""" - print("Reading file...") gdf = gpd.read_file(filename) print("Making h3 cells...") - gdf["h3flat"] = geoseries_to_cells(gdf["geometry"], resolution=H3_RESOLUTION, compact=False) - print('Compacting h3 cells...') - gdf["h3Compact"] = [list(compact(x)) for x in gdf["h3flat"]] - print('Converting to cell indexes to hexadecimal...') + gdf["h3Flat"] = geoseries_to_cells(gdf["geometry"], resolution=H3_RESOLUTION, compact=False) + gdf["h3Compact"] = [list(compact(x)) for x in gdf["h3Flat"]] gdf["h3Compact"] = gdf["h3Compact"].apply(lambda arr: [hex(x)[2:] for x in arr]) - gdf["h3flat"] = gdf["h3flat"].apply(lambda arr: [hex(x)[2:] for x in arr]) + gdf["h3Flat"] = gdf["h3Flat"].apply(lambda arr: [hex(x)[2:] for x in arr]) + gdf["h3FlatLength"] = gdf["h3Flat"].apply(len)  # count cells while h3Flat is still a list; below it becomes a single string + + # convert h3 lists to sql literal arrays + gdf["h3Compact"] = gdf["h3Compact"].apply(lambda x: f"{{{','.join(e for e in x)}}}") + gdf["h3Flat"] = gdf["h3Flat"].apply(lambda x: f"{{{','.join(e for e in x)}}}") + gdf = gdf.to_wkb(hex=True) - gdf = gdf.rename({"geometry": "theGeom"}) + gdf = gdf.drop(["name"], axis=1) + gdf = gdf.rename(columns={"geometry": "theGeom", "mpath": "name"}) - gdf["h3FlatLength"] = gdf["h3flat"].apply(lambda x: len(x)) - print(f"writing to {output}...") - gdf.to_csv(output, index=False)
+ gdf["id"] = [str(uuid.uuid4()) for _ in range(len(gdf))] + print(f"Writing to {output}...") + gdf.to_csv(output, index=False, columns=["id", "h3Compact", "h3Flat", "h3FlatLength", "name", "theGeom"]) + # gdf.to_parquet(output, index=False, columns=["id", "h3Compact", "h3Flat", "h3FlatLength", "name", "theGeom"]) if __name__ == "__main__":