Skip to content

Commit

Permalink
Fixes python script to match the table schema
Browse files Browse the repository at this point in the history
  • Loading branch information
BielStela committed May 13, 2024
1 parent a9e0d56 commit 502c3cb
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 42 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ node_modules
/api/coverage
.env

## Data
## Data folders (except for the root "data" module, which contains all the data engineering bits)
/data/datasets/**
/data/h3_data_importer/data
/data/**/data

infrastructure/base/vars/*
infrastructure/kubernetes/vars/*
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
abeadbd726250dc419ba2e58c3b3cc8ddc23b5fd673e669882b55cf1d8a937d6 gadm36_levels0-2_simp.dbf
cd8a311f163688d4252712d0bb0b20f1f22278e84266b18565f97e65523540fe gadm36_levels0-2_simp.dbf
98aaf3d1c0ecadf1a424a4536de261c3daf4e373697cb86c40c43b989daf52eb gadm36_levels0-2_simp.prj
95554d6002422bca7118ced401c05a5869808343612155a4d128cf775c739fdb gadm36_levels0-2_simp.sha256
5578ded00390bba9e8c332e8219e17dcd9e9a94a1c2d516ae78708b76c652b1a gadm36_levels0-2_simp.shp
7c7e4b7cf827eefcd534bdcdf946cb6409255f06e8a12211f1f0311269d626ad gadm36_levels0-2_simp.shx
2 changes: 1 addition & 1 deletion data/gadm_importer/data_checksums/geo_region.zip.sha256
Original file line number Diff line number Diff line change
@@ -1 +1 @@
14df4a3134734b9a520610c538595fe750f04554babd9faf2836d532f8c6095e geo_region.zip
b7d5c357d0f0dd521c3896ba28b87ac77e3a99202e7c64f3f311575666f23a0e geo_region.zip
43 changes: 16 additions & 27 deletions data/preprocessing/gadm/Makefile
Original file line number Diff line number Diff line change
@@ -1,43 +1,32 @@
.PHONY: geo_region_table combine-gadm-file
.PHONY: upload_results
SHELL := /bin/bash

# results of data processing
DATADIR=data
# intermediate files
WORKDIR=data/tmp
WORKDIR=data

all: upload_results
make clean-workdir

upload_results: checksum
aws s3 cp $(DATADIR)/geo_region.zip $(AWS_S3_BUCKET_URL)/processed/geo_region/
rm $(DATADIR)/geo_region.zip $(DATADIR)/geo_region.csv
aws s3 sync $(DATADIR) $(AWS_S3_BUCKET_URL)/processed/gadm
aws s3 cp $(WORKDIR)/geo_region.zip $(AWS_S3_BUCKET_URL)/processed/geo_region/
#rm $(WORKDIR)/geo_region.zip $(WORKDIR)/geo_region.csv
aws s3 sync --exclude="*" --include="gadm36_levels0-2_simp.*" $(WORKDIR) $(AWS_S3_BUCKET_URL)/processed/gadm


checksum:
checksum: compress-geo_region
@echo "Generating checksums..."
cd $(DATADIR) && sha256sum geo_region.zip > ../../../gadm_importer/data_checksums/geo_region.zip.sha256
cd $(DATADIR) && sha256sum gadm36_levels0-2_simp.* > ../../../gadm_importer/data_checksums/gadm36_levels0-2_simp.sha256
cd $(WORKDIR) && sha256sum geo_region.zip > ../../../gadm_importer/data_checksums/geo_region.zip.sha256
cd $(WORKDIR) && sha256sum gadm36_levels0-2_simp.* > ../../../gadm_importer/data_checksums/gadm36_levels0-2_simp.sha256


compress-geo_region: geo_region_table
@echo "Compressing geo_region.csv..."
zip -j $(DATADIR)/geo_region.zip $(DATADIR)/geo_region.csv
zip -j $(WORKDIR)/geo_region.zip $(WORKDIR)/geo_region.csv

geo_region_table: combine-gadm-file
@echo "Importing GADM data to database..."
	# load shapefile into the `gadm_levels0_2` table
ogr2ogr -makevalid -update -append \
-nln gadm_levels0_2 -nlt PROMOTE_TO_MULTI -geomfield the_geom \
-t_srs EPSG:4326 -a_srs EPSG:4326 --config OGR_TRUNCATE YES \
-f PostgreSQL PG:"dbname=$$API_POSTGRES_DATABASE host=$$API_POSTGRES_HOST port=$$API_POSTGRES_PORT user=$$API_POSTGRES_USERNAME password=$$API_POSTGRES_PASSWORD" \
$(DATADIR)/gadm36_levels0-2_simp.shp
python gadm_h3.py data/gadm36_levels0-2_simp.shp data/geo_region.csv

# create or update `geo region` entity table
PGPASSWORD=$$API_POSTGRES_PASSWORD && \
psql -d $$API_POSTGRES_DATABASE -h $$API_POSTGRES_HOST -p $$API_POSTGRES_PORT -U $$API_POSTGRES_USERNAME -f make_geo_region_table.sql

$(DATADIR)/gadm36_levels0-2_simp.shp: gadm36_0_simp.shp gadm36_1_simp.shp gadm36_2_simp.shp
combine-gadm-file: gadm36_0_simp.shp gadm36_1_simp.shp gadm36_2_simp.shp
@echo "Combining GADM files..."
mapshaper -i $(WORKDIR)/gadm36_0_simp.shp $(WORKDIR)/gadm36_1_simp.shp $(WORKDIR)/gadm36_2_simp.shp snap combine-files \
-each 'level = this.layer_name == "gadm36_0_simp" ? 0 \
Expand All @@ -54,21 +43,21 @@ $(DATADIR)/gadm36_levels0-2_simp.shp: gadm36_0_simp.shp gadm36_1_simp.shp gadm36
: null' \
-each 'gid_0 = GID_0' \
-filter-fields mpath,name,level,gid_0 \
-merge-layers force -o $(DATADIR)/gadm36_levels0-2_simp.shp
-merge-layers force -o $(WORKDIR)/gadm36_levels0-2_simp.shp

gadm36_%_simp.shp: decompress-gadm
@echo "Simplifying $@ ..."
mapshaper $(WORKDIR)/gadm36_$*.shp -simplify 10% -filter-islands min-vertices=3 -filter-slivers -clean -o $(WORKDIR)/$@ force

decompress-gadm: $(WORKDIR)/gadm36_levels_shp.zip
decompress-gadm: download-gadm
@echo "Decompressing GADM file..."
unzip -u $(WORKDIR)/gadm36_levels_shp.zip gadm36_0* gadm36_1* gadm36_2* -d $(WORKDIR)
rm $(WORKDIR)/gadm36_levels_shp.zip

$(WORKDIR)/gadm36_levels_shp.zip:
download-gadm:
@echo "Downloading GADM file..."
mkdir -p $(WORKDIR)
cd $(WORKDIR) && curl -O https://data.biogeo.ucdavis.edu/data/gadm3.6/gadm36_levels_shp.zip

clean-workdir:
rm -rf $(WORKDIR)
rm -rf $(WORKDIR)/*
30 changes: 20 additions & 10 deletions data/preprocessing/gadm/gadm_h3.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""Converts the GADM Shapefile to a csv with h3 cells for ingesting
into geo_region table.
"""

import uuid
from pathlib import Path

import click
Expand All @@ -13,21 +18,26 @@
@click.argument("output", type=click.Path(path_type=Path))
def main(filename: Path, output) -> None:
"""Convert gadm shapefile to csv with h3 columns"""
print("Reading file...")
gdf = gpd.read_file(filename)
print("Making h3 cells...")
gdf["h3flat"] = geoseries_to_cells(gdf["geometry"], resolution=H3_RESOLUTION, compact=False)
print('Compacting h3 cells...')
gdf["h3Compact"] = [list(compact(x)) for x in gdf["h3flat"]]
print('Converting to cell indexes to hexadecimal...')
gdf["h3Flat"] = geoseries_to_cells(gdf["geometry"], resolution=H3_RESOLUTION, compact=False)
gdf["h3Compact"] = [list(compact(x)) for x in gdf["h3Flat"]]
gdf["h3Compact"] = gdf["h3Compact"].apply(lambda arr: [hex(x)[2:] for x in arr])
gdf["h3flat"] = gdf["h3flat"].apply(lambda arr: [hex(x)[2:] for x in arr])
gdf["h3Flat"] = gdf["h3Flat"].apply(lambda arr: [hex(x)[2:] for x in arr])

# convert h3 lists to sql literal arrays
gdf["h3Compact"] = gdf["h3Compact"].apply(lambda x: f"{{{','.join(e for e in x)}}}")
gdf["h3Flat"] = gdf["h3Flat"].apply(lambda x: f"{{{','.join(e for e in x)}}}")

gdf = gdf.to_wkb(hex=True)
gdf = gdf.rename({"geometry": "theGeom"})
gdf = gdf.drop(["name"], axis=1)
gdf = gdf.rename(columns={"geometry": "theGeom", "mpath": "name"})

gdf["h3FlatLength"] = gdf["h3flat"].apply(lambda x: len(x))
print(f"writing to {output}...")
gdf.to_csv(output, index=False)
gdf["h3FlatLength"] = gdf["h3Flat"].apply(lambda x: len(x))
gdf["id"] = [str(uuid.uuid4()) for _ in range(len(gdf))]
print(f"Writing to {output}...")
gdf.to_csv(output, index=False, columns=["id", "h3Compact", "h3Flat", "h3FlatLength", "name", "theGeom"])
# gdf.to_parquet(output, index=False, columns=["id", "h3Compact", "h3Flat", "h3FlatLength", "name", "theGeom"])


if __name__ == "__main__":
Expand Down

0 comments on commit 502c3cb

Please sign in to comment.