From 28e5865727a95c202097279d289e59e20895d271 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Thu, 31 Oct 2024 16:52:30 -0300 Subject: [PATCH 01/14] + GeoJSON stats --- API/api_worker.py | 29 ++++++++++++++++++------ src/app.py | 57 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 75 insertions(+), 11 deletions(-) diff --git a/API/api_worker.py b/API/api_worker.py index 096f2f0f..1f4e559f 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -15,7 +15,7 @@ from celery import Celery # Reader imports -from src.app import CustomExport, PolygonStats, RawData, S3FileTransfer +from src.app import CustomExport, PolygonStats, GeoJSONStats, RawData, S3FileTransfer from src.config import ALLOW_BIND_ZIP_FILTER from src.config import CELERY_BROKER_URL as celery_broker_uri from src.config import CELERY_RESULT_BACKEND as celery_backend @@ -75,7 +75,7 @@ def create_readme_content(default_readme, polygon_stats): def zip_binding( - working_dir, exportname_parts, geom_dump, polygon_stats, default_readme + working_dir, exportname_parts, geom_dump, polygon_stats, geojson_stats, default_readme ): logging.debug("Zip Binding Started!") upload_file_path = os.path.join( @@ -88,6 +88,9 @@ def zip_binding( ), } + if geojson_stats: + additional_files["stats.json"] = geojson_stats + for name, content in additional_files.items(): temp_path = os.path.join(working_dir, name) with open(temp_path, "w") as f: @@ -165,7 +168,6 @@ def on_failure(self, exc, task_id, args, kwargs, einfo): if os.path.exists(clean_dir): shutil.rmtree(clean_dir) - @celery.task( bind=True, name="process_raw_data", @@ -209,11 +211,22 @@ def process_raw_data(self, params, user=None): file_parts, ) - geom_area, geom_dump, working_dir = RawData( - params, str(self.request.id) - ).extract_current_data(file_parts) - inside_file_size = 0 polygon_stats = None + geojson_stats = None + + if "include_stats" in params.dict(): + if params.include_stats: + geoJSONStats = GeoJSONStats(params.filters) + geom_area, 
geom_dump, working_dir = RawData( + params, str(self.request.id) + ).extract_current_data(file_parts, geoJSONStats.raw_data_line_stats) + geojson_stats = geoJSONStats.json() + else: + geom_area, geom_dump, working_dir = RawData( + params, str(self.request.id) + ).extract_current_data(file_parts) + + inside_file_size = 0 if "include_stats" in params.dict(): if params.include_stats: feature = { @@ -222,12 +235,14 @@ def process_raw_data(self, params, user=None): "properties": {}, } polygon_stats = PolygonStats(feature).get_summary_stats() + if bind_zip: upload_file_path, inside_file_size = zip_binding( working_dir=working_dir, exportname_parts=exportname_parts, geom_dump=geom_dump, polygon_stats=polygon_stats, + geojson_stats=geojson_stats, default_readme=DEFAULT_README_TEXT, ) diff --git a/src/app.py b/src/app.py index fb5e82d5..c477fb2e 100644 --- a/src/app.py +++ b/src/app.py @@ -47,6 +47,7 @@ from psycopg2.extras import DictCursor from slugify import slugify from tqdm import tqdm +from geojson_stats.stats import Stats # Reader imports from src.config import ( @@ -640,7 +641,7 @@ def ogr_export(query, outputtype, working_dir, dump_temp_path, params): os.remove(query_path) @staticmethod - def query2geojson(con, extraction_query, dump_temp_file_path): + def query2geojson(con, extraction_query, dump_temp_file_path, plugin_fn = None): """Function written from scratch without being dependent on any library, Provides better performance for geojson binding""" # creating geojson file pre_geojson = """{"type": "FeatureCollection","features": [""" @@ -660,10 +661,12 @@ def query2geojson(con, extraction_query, dump_temp_file_path): for row in cursor: if first: first = False - f.write(row[0]) else: f.write(",") - f.write(row[0]) + if plugin_fn: + f.write(plugin_fn(row[0])) + else: + f.write((row[0])) cursor.close() # closing connection to avoid memory issues # close the writing geojson with last part f.write(post_geojson) @@ -711,7 +714,7 @@ def get_grid_id(geom, cur): 
country_export, ) - def extract_current_data(self, exportname): + def extract_current_data(self, exportname, plugin_fn = None): """Responsible for Extracting rawdata current snapshot, Initially it creates a geojson file , Generates query , run it with 1000 chunk size and writes it directly to the geojson file and closes the file after dump Args: exportname: takes filename as argument to create geojson file passed from routers @@ -777,6 +780,7 @@ def extract_current_data(self, exportname): country_export=country_export, ), dump_temp_file_path, + plugin_fn ) # uses own conversion class if output_type == RawDataOutputType.SHAPEFILE.value: ( @@ -2255,3 +2259,48 @@ def get_summary_stats(self, start_date, end_date, group_by): result = self.cur.fetchall() self.d_b.close_conn() return [dict(item) for item in result] + +class GeoJSONStats(Stats): + """Used for collecting stats while processing GeoJSON files line by line""" + + def __init__(self, filters, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.config.clean = True + self.config.properties_prop = "properties.tags" + + if filters and filters.tags: + config_area = ["building"] + config_length = ["highway", "waterway"] + + for tag in config_area: + if self.check_filter(filters.tags, tag): + self.config.keys.append(tag) + self.config.value_keys.append(tag) + self.config.area = True + for tag in config_length: + if self.check_filter(filters.tags, tag): + self.config.keys.append(tag) + self.config.value_keys.append(tag) + self.config.length = True + + def check_filter(self, tags, tag): + if tags.all_geometry: + if tags.all_geometry.join_or and tag in tags.all_geometry.join_or: + return True + if tags.all_geometry.join_and and tag in tags.all_geometry.join_and: + return True + if tags.polygon: + if tags.polygon.join_or and tag in tags.polygon.join_or: + return True + if tags.polygon.join_and and tag in tags.polygon.join_and: + return True + if tags.line: + if tags.line.join_or and tag in tags.line.join_or: + 
return True + if tags.line.join_and and tag in tags.line.join_and: + return True + + def raw_data_line_stats(self, line: str): + self.process_file_line(line) + return line From b18936291dd456d68acf0c42a921c335638d75cb Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Mon, 4 Nov 2024 09:16:26 -0300 Subject: [PATCH 02/14] + Code docs, requirement --- requirements.txt | 4 ++++ src/app.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/requirements.txt b/requirements.txt index c295c164..73fcb7e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -54,3 +54,7 @@ psutil==5.9.8 ## logging tqdm==4.66.2 + +# stats for geojson data +geojson-stats @ git+https://github.com/emi420/geojson-stats@v0.0.1-alpha + diff --git a/src/app.py b/src/app.py index c477fb2e..102754a4 100644 --- a/src/app.py +++ b/src/app.py @@ -2285,6 +2285,10 @@ def __init__(self, filters, *args, **kwargs): self.config.length = True def check_filter(self, tags, tag): + """ + Check if a tag is present in tag filters + """ + if tags.all_geometry: if tags.all_geometry.join_or and tag in tags.all_geometry.join_or: return True From 5ed27a600f5e8b27296ea3ec4b8acd3e970d6cc0 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Mon, 4 Nov 2024 09:52:12 -0300 Subject: [PATCH 03/14] Change requirement (geojson-stats --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 73fcb7e6..6e87b47b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,5 +56,5 @@ psutil==5.9.8 tqdm==4.66.2 # stats for geojson data -geojson-stats @ git+https://github.com/emi420/geojson-stats@v0.0.1-alpha +geojson-stats==0.1.0 From 5eaee149629aad2eae3714b82a6c6f86022834d4 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Mon, 4 Nov 2024 09:59:48 -0300 Subject: [PATCH 04/14] + Black code linting --- API/api_worker.py | 8 ++++- src/app.py | 11 +++++-- src/validation/models.py | 64 ++++++++++++++++++++-------------------- 3 files changed, 47 
insertions(+), 36 deletions(-) diff --git a/API/api_worker.py b/API/api_worker.py index 1f4e559f..0a90ce21 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -75,7 +75,12 @@ def create_readme_content(default_readme, polygon_stats): def zip_binding( - working_dir, exportname_parts, geom_dump, polygon_stats, geojson_stats, default_readme + working_dir, + exportname_parts, + geom_dump, + polygon_stats, + geojson_stats, + default_readme, ): logging.debug("Zip Binding Started!") upload_file_path = os.path.join( @@ -168,6 +173,7 @@ def on_failure(self, exc, task_id, args, kwargs, einfo): if os.path.exists(clean_dir): shutil.rmtree(clean_dir) + @celery.task( bind=True, name="process_raw_data", diff --git a/src/app.py b/src/app.py index 102754a4..b675001a 100644 --- a/src/app.py +++ b/src/app.py @@ -641,7 +641,7 @@ def ogr_export(query, outputtype, working_dir, dump_temp_path, params): os.remove(query_path) @staticmethod - def query2geojson(con, extraction_query, dump_temp_file_path, plugin_fn = None): + def query2geojson(con, extraction_query, dump_temp_file_path, plugin_fn=None): """Function written from scratch without being dependent on any library, Provides better performance for geojson binding""" # creating geojson file pre_geojson = """{"type": "FeatureCollection","features": [""" @@ -714,7 +714,7 @@ def get_grid_id(geom, cur): country_export, ) - def extract_current_data(self, exportname, plugin_fn = None): + def extract_current_data(self, exportname, plugin_fn=None): """Responsible for Extracting rawdata current snapshot, Initially it creates a geojson file , Generates query , run it with 1000 chunk size and writes it directly to the geojson file and closes the file after dump Args: exportname: takes filename as argument to create geojson file passed from routers @@ -780,7 +780,7 @@ def extract_current_data(self, exportname, plugin_fn = None): country_export=country_export, ), dump_temp_file_path, - plugin_fn + plugin_fn, ) # uses own conversion class if 
output_type == RawDataOutputType.SHAPEFILE.value: ( @@ -2260,6 +2260,7 @@ def get_summary_stats(self, start_date, end_date, group_by): self.d_b.close_conn() return [dict(item) for item in result] + class GeoJSONStats(Stats): """Used for collecting stats while processing GeoJSON files line by line""" @@ -2305,6 +2306,10 @@ def check_filter(self, tags, tag): if tags.line.join_and and tag in tags.line.join_and: return True + """ + Process a GeoJSON line (for getting stats) and return that line + """ + def raw_data_line_stats(self, line: str): self.process_file_line(line) return line diff --git a/src/validation/models.py b/src/validation/models.py index 1b1b0e92..4b757447 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -303,22 +303,22 @@ class StatsRequestParams(BaseModel, GeometryValidatorMixin): max_length=3, example="NPL", ) - geometry: Optional[ - Union[Polygon, MultiPolygon, Feature, FeatureCollection] - ] = Field( - default=None, - example={ - "type": "Polygon", - "coordinates": [ - [ - [83.96919250488281, 28.194446860487773], - [83.99751663208006, 28.194446860487773], - [83.99751663208006, 28.214869548073377], - [83.96919250488281, 28.214869548073377], - [83.96919250488281, 28.194446860487773], - ] - ], - }, + geometry: Optional[Union[Polygon, MultiPolygon, Feature, FeatureCollection]] = ( + Field( + default=None, + example={ + "type": "Polygon", + "coordinates": [ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + }, + ) ) @validator("geometry", pre=True, always=True) @@ -624,22 +624,22 @@ class DynamicCategoriesModel(CategoriesBase, GeometryValidatorMixin): max_length=3, example="USA", ) - geometry: Optional[ - Union[Polygon, MultiPolygon, Feature, FeatureCollection] - ] = Field( - default=None, - example={ - "type": "Polygon", - "coordinates": [ - [ - 
[83.96919250488281, 28.194446860487773], - [83.99751663208006, 28.194446860487773], - [83.99751663208006, 28.214869548073377], - [83.96919250488281, 28.214869548073377], - [83.96919250488281, 28.194446860487773], - ] - ], - }, + geometry: Optional[Union[Polygon, MultiPolygon, Feature, FeatureCollection]] = ( + Field( + default=None, + example={ + "type": "Polygon", + "coordinates": [ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + }, + ) ) @validator("geometry", pre=True, always=True) From d2aca194a207a4f287f6aa9cb13326cb9cea526b Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Mon, 4 Nov 2024 10:08:43 -0300 Subject: [PATCH 05/14] Format doc --- src/app.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/app.py b/src/app.py index b675001a..60d8fcdf 100644 --- a/src/app.py +++ b/src/app.py @@ -2306,10 +2306,9 @@ def check_filter(self, tags, tag): if tags.line.join_and and tag in tags.line.join_and: return True - """ - Process a GeoJSON line (for getting stats) and return that line - """ - def raw_data_line_stats(self, line: str): + """ + Process a GeoJSON line (for getting stats) and return that line + """ self.process_file_line(line) return line From 0e2e37863f26945cb1500818de13d7791af6ec64 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Mon, 4 Nov 2024 10:22:51 -0300 Subject: [PATCH 06/14] Code reformatting using Black v23 --- src/validation/models.py | 64 ++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/src/validation/models.py b/src/validation/models.py index 4b757447..1b1b0e92 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -303,22 +303,22 @@ class StatsRequestParams(BaseModel, GeometryValidatorMixin): max_length=3, example="NPL", ) - geometry: Optional[Union[Polygon, 
MultiPolygon, Feature, FeatureCollection]] = ( - Field( - default=None, - example={ - "type": "Polygon", - "coordinates": [ - [ - [83.96919250488281, 28.194446860487773], - [83.99751663208006, 28.194446860487773], - [83.99751663208006, 28.214869548073377], - [83.96919250488281, 28.214869548073377], - [83.96919250488281, 28.194446860487773], - ] - ], - }, - ) + geometry: Optional[ + Union[Polygon, MultiPolygon, Feature, FeatureCollection] + ] = Field( + default=None, + example={ + "type": "Polygon", + "coordinates": [ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + }, ) @validator("geometry", pre=True, always=True) @@ -624,22 +624,22 @@ class DynamicCategoriesModel(CategoriesBase, GeometryValidatorMixin): max_length=3, example="USA", ) - geometry: Optional[Union[Polygon, MultiPolygon, Feature, FeatureCollection]] = ( - Field( - default=None, - example={ - "type": "Polygon", - "coordinates": [ - [ - [83.96919250488281, 28.194446860487773], - [83.99751663208006, 28.194446860487773], - [83.99751663208006, 28.214869548073377], - [83.96919250488281, 28.214869548073377], - [83.96919250488281, 28.194446860487773], - ] - ], - }, - ) + geometry: Optional[ + Union[Polygon, MultiPolygon, Feature, FeatureCollection] + ] = Field( + default=None, + example={ + "type": "Polygon", + "coordinates": [ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + }, ) @validator("geometry", pre=True, always=True) From 655f3181e30d1ccdf9305583e58f8ec11c3d775b Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Thu, 14 Nov 2024 22:06:02 -0300 Subject: [PATCH 07/14] + Post processing, + Transliterations, HTML stats, upgrade geojson-stats --- 
API/api_worker.py | 57 +++++-- requirements.txt | 5 +- src/app.py | 52 ------ src/post_processing/__init__.py | 0 src/post_processing/geojson_stats.py | 58 +++++++ src/post_processing/processor.py | 39 +++++ src/post_processing/stats_building_tpl.html | 165 ++++++++++++++++++++ src/post_processing/stats_highway_tpl.html | 164 +++++++++++++++++++ src/post_processing/stats_tpl.html | 161 +++++++++++++++++++ src/post_processing/stats_waterway_tpl.html | 165 ++++++++++++++++++++ src/post_processing/transliterator.py | 20 +++ src/validation/models.py | 8 + 12 files changed, 827 insertions(+), 67 deletions(-) create mode 100644 src/post_processing/__init__.py create mode 100644 src/post_processing/geojson_stats.py create mode 100644 src/post_processing/processor.py create mode 100644 src/post_processing/stats_building_tpl.html create mode 100644 src/post_processing/stats_highway_tpl.html create mode 100644 src/post_processing/stats_tpl.html create mode 100644 src/post_processing/stats_waterway_tpl.html create mode 100644 src/post_processing/transliterator.py diff --git a/API/api_worker.py b/API/api_worker.py index 0a90ce21..a7085178 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -15,7 +15,8 @@ from celery import Celery # Reader imports -from src.app import CustomExport, PolygonStats, GeoJSONStats, RawData, S3FileTransfer +from src.app import CustomExport, PolygonStats, RawData, S3FileTransfer +from src.post_processing.processor import PostProcessor from src.config import ALLOW_BIND_ZIP_FILTER from src.config import CELERY_BROKER_URL as celery_broker_uri from src.config import CELERY_RESULT_BACKEND as celery_backend @@ -39,6 +40,7 @@ RawDataCurrentParams, RawDataOutputType, ) +from src.post_processing.processor import PostProcessor if ENABLE_SOZIP: # Third party imports @@ -218,19 +220,46 @@ def process_raw_data(self, params, user=None): ) polygon_stats = None - geojson_stats = None + geojson_stats_html = None + + if "include_stats" or "include_translit" in 
params.dict(): + post_processor = PostProcessor({ + "include_stats": params.include_stats, + "include_translit": params.include_translit + }) - if "include_stats" in params.dict(): if params.include_stats: - geoJSONStats = GeoJSONStats(params.filters) - geom_area, geom_dump, working_dir = RawData( - params, str(self.request.id) - ).extract_current_data(file_parts, geoJSONStats.raw_data_line_stats) - geojson_stats = geoJSONStats.json() - else: - geom_area, geom_dump, working_dir = RawData( - params, str(self.request.id) - ).extract_current_data(file_parts) + post_processor.filters = params.filters + + post_processor.init() + + geom_area, geom_dump, working_dir = RawData( + params, str(self.request.id) + ).extract_current_data(file_parts, post_processor.post_process_line) + + if params.include_stats: + geojson_stats_json = json.dumps(post_processor.geoJSONStats.dict()) + + # Create a HTML summary of stats + if params.include_stats_html: + tpl = "stats" + if 'waterway' in post_processor.geoJSONStats.config.keys: + tpl = "stats_waterway" + if 'highway' in post_processor.geoJSONStats.config.keys: + tpl = "stats_highway" + if 'building' in post_processor.geoJSONStats.config.keys: + tpl = "stats_building" + project_root = pathlib.Path(__file__).resolve().parent + tpl_path = os.path.join(project_root, "../src/post_processing/{tpl}_tpl.html".format(tpl=tpl)) + geojson_stats_html = post_processor.geoJSONStats.html(tpl_path).build() + upload_html_path = os.path.join(working_dir, os.pardir, f"{exportname_parts[-1]}.html") + with open(upload_html_path, "w") as f: + f.write(geojson_stats_html) + + else: + geom_area, geom_dump, working_dir = RawData( + params, str(self.request.id) + ).extract_current_data(file_parts) inside_file_size = 0 if "include_stats" in params.dict(): @@ -248,7 +277,7 @@ def process_raw_data(self, params, user=None): exportname_parts=exportname_parts, geom_dump=geom_dump, polygon_stats=polygon_stats, - geojson_stats=geojson_stats, + 
geojson_stats=geojson_stats_json, default_readme=DEFAULT_README_TEXT, ) @@ -261,6 +290,7 @@ def process_raw_data(self, params, user=None): upload_file_path = file_path inside_file_size += os.path.getsize(file_path) break # only take one file inside dir , if contains many it should be inside zip + # check if download url will be generated from s3 or not from config if use_s3_to_upload: file_transfer_obj = S3FileTransfer() @@ -274,7 +304,6 @@ def process_raw_data(self, params, user=None): pattern = r"(hotosm_project_)(\d+)" match = re.match(pattern, exportname) if match: - prefix = match.group(1) project_number = match.group(2) if project_number: upload_name = f"TM/{project_number}/{exportname}" diff --git a/requirements.txt b/requirements.txt index 6e87b47b..9e7b5a68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,5 +56,8 @@ psutil==5.9.8 tqdm==4.66.2 # stats for geojson data -geojson-stats==0.1.0 +geojson-stats==0.2.2 + +# transliterations +transliterate==1.10.2 diff --git a/src/app.py b/src/app.py index 60d8fcdf..580530a5 100644 --- a/src/app.py +++ b/src/app.py @@ -47,7 +47,6 @@ from psycopg2.extras import DictCursor from slugify import slugify from tqdm import tqdm -from geojson_stats.stats import Stats # Reader imports from src.config import ( @@ -2261,54 +2260,3 @@ def get_summary_stats(self, start_date, end_date, group_by): return [dict(item) for item in result] -class GeoJSONStats(Stats): - """Used for collecting stats while processing GeoJSON files line by line""" - - def __init__(self, filters, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.config.clean = True - self.config.properties_prop = "properties.tags" - - if filters and filters.tags: - config_area = ["building"] - config_length = ["highway", "waterway"] - - for tag in config_area: - if self.check_filter(filters.tags, tag): - self.config.keys.append(tag) - self.config.value_keys.append(tag) - self.config.area = True - for tag in config_length: - if 
self.check_filter(filters.tags, tag): - self.config.keys.append(tag) - self.config.value_keys.append(tag) - self.config.length = True - - def check_filter(self, tags, tag): - """ - Check if a tag is present in tag filters - """ - - if tags.all_geometry: - if tags.all_geometry.join_or and tag in tags.all_geometry.join_or: - return True - if tags.all_geometry.join_and and tag in tags.all_geometry.join_and: - return True - if tags.polygon: - if tags.polygon.join_or and tag in tags.polygon.join_or: - return True - if tags.polygon.join_and and tag in tags.polygon.join_and: - return True - if tags.line: - if tags.line.join_or and tag in tags.line.join_or: - return True - if tags.line.join_and and tag in tags.line.join_and: - return True - - def raw_data_line_stats(self, line: str): - """ - Process a GeoJSON line (for getting stats) and return that line - """ - self.process_file_line(line) - return line diff --git a/src/post_processing/__init__.py b/src/post_processing/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/post_processing/geojson_stats.py b/src/post_processing/geojson_stats.py new file mode 100644 index 00000000..edf6bdf4 --- /dev/null +++ b/src/post_processing/geojson_stats.py @@ -0,0 +1,58 @@ +from geojson_stats.stats import Stats +from geojson_stats.html import Html + +CONFIG_AREA = ["building"] +CONFIG_LENGTH = ["highway", "waterway"] + +class GeoJSONStats(Stats): + """Used for collecting stats while processing GeoJSON files line by line""" + + def __init__(self, filters, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.config.clean = True + self.config.properties_prop = "properties.tags" + + if filters and filters.tags: + + for tag in CONFIG_AREA: + if self.check_filter(filters.tags, tag): + self.config.keys.append(tag) + self.config.value_keys.append(tag) + self.config.area = True + + for tag in CONFIG_LENGTH: + if self.check_filter(filters.tags, tag): + self.config.keys.append(tag) + 
self.config.value_keys.append(tag) + self.config.length = True + + def check_filter(self, tags, tag): + """ + Check if a tag is present in tag filters + """ + + if tags.all_geometry: + if tags.all_geometry.join_or and tag in tags.all_geometry.join_or: + return True + if tags.all_geometry.join_and and tag in tags.all_geometry.join_and: + return True + if tags.polygon: + if tags.polygon.join_or and tag in tags.polygon.join_or: + return True + if tags.polygon.join_and and tag in tags.polygon.join_and: + return True + if tags.line: + if tags.line.join_or and tag in tags.line.join_or: + return True + if tags.line.join_and and tag in tags.line.join_and: + return True + + def raw_data_line_stats(self, json_object: dict): + """ + Process a GeoJSON line (for getting stats) and return that line + """ + self.get_object_stats(json_object) + + def html(self, tpl): + return Html(tpl, self) diff --git a/src/post_processing/processor.py b/src/post_processing/processor.py new file mode 100644 index 00000000..25ed726f --- /dev/null +++ b/src/post_processing/processor.py @@ -0,0 +1,39 @@ + +import json +from .transliterator import Transliterator +from .geojson_stats import GeoJSONStats + +class PostProcessor(): + """Used for posst-process data while processing GeoJSON files line by line""" + + options = {} + filters = {} + functions = [] + + def __init__(self, options, *args, **kwargs): + self.options = options + + def post_process_line(self, line: str): + """ + Parses line, run functions over it and returns it + """ + + line_object = json.loads(line) + + for fn in self.functions: + fn(line_object) + + return json.dumps(line_object) + + def init(self): + """ + Initialize post-processor + """ + + if self.options["include_stats"]: + self.geoJSONStats = GeoJSONStats(self.filters) + self.functions.append(self.geoJSONStats.raw_data_line_stats) + + if self.options["include_translit"]: + self.transliterator = Transliterator() + self.functions.append(self.transliterator.translit) \ No 
newline at end of file diff --git a/src/post_processing/stats_building_tpl.html b/src/post_processing/stats_building_tpl.html new file mode 100644 index 00000000..e6cdfdd6 --- /dev/null +++ b/src/post_processing/stats_building_tpl.html @@ -0,0 +1,165 @@ + + + + + + + + + + + HOT Export Stats + + + +
+
+ +
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and english

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Hot Key CountsCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}%
${key_1}${key_1_count}${key_1_percent}%
${key_2}${key_2_count}${key_2_percent}%
${key_3}${key_3_count}${key_3_percent}%
${key_4}${key_4_count}${key_4_percent}%
${key_5}${key_5_count}${key_5_percent}%
+
+
+ + diff --git a/src/post_processing/stats_highway_tpl.html b/src/post_processing/stats_highway_tpl.html new file mode 100644 index 00000000..d80f3188 --- /dev/null +++ b/src/post_processing/stats_highway_tpl.html @@ -0,0 +1,164 @@ + + + + + + + + + + HOT Export Stats + + + +
+
+ +
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and english

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Hot Key CountsCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}%
${key_1}${key_1_count}${key_1_percent}%
${key_2}${key_2_count}${key_2_percent}%
${key_3}${key_3_count}${key_3_percent}%
${key_4}${key_4_count}${key_4_percent}%
${key_5}${key_5_count}${key_5_percent}%
+
+
+ + diff --git a/src/post_processing/stats_tpl.html b/src/post_processing/stats_tpl.html new file mode 100644 index 00000000..e6777dc2 --- /dev/null +++ b/src/post_processing/stats_tpl.html @@ -0,0 +1,161 @@ + + + + + + + + + + + HOT Export Stats + + + +
+
+
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and english

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Hot Key CountsCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}%
${key_1}${key_1_count}${key_1_percent}%
${key_2}${key_2_count}${key_2_percent}%
${key_3}${key_3_count}${key_3_percent}%
${key_4}${key_4_count}${key_4_percent}%
${key_5}${key_5_count}${key_5_percent}%
+
+
+ + diff --git a/src/post_processing/stats_waterway_tpl.html b/src/post_processing/stats_waterway_tpl.html new file mode 100644 index 00000000..c59050d0 --- /dev/null +++ b/src/post_processing/stats_waterway_tpl.html @@ -0,0 +1,165 @@ + + + + + + + + + + + HOT Export Stats + + + +
+
+ +
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and english

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Hot Key CountsCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}%
${key_1}${key_1_count}${key_1_percent}%
${key_2}${key_2_count}${key_2_percent}%
${key_3}${key_3_count}${key_3_percent}%
${key_4}${key_4_count}${key_4_percent}%
${key_5}${key_5_count}${key_5_percent}%
+
+
+ + diff --git a/src/post_processing/transliterator.py b/src/post_processing/transliterator.py new file mode 100644 index 00000000..1644c241 --- /dev/null +++ b/src/post_processing/transliterator.py @@ -0,0 +1,20 @@ +from transliterate import translit, get_available_language_codes + +class Transliterator(): + """Used for transliterate names while processing GeoJSON files line by line""" + + def __init__(self): + self.available_language_codes = get_available_language_codes() + self.name_tags = [f"name:{x}" for x in self.available_language_codes] + + def translit(self, line): + """ + Transliterate names and add a new tag suffixed with -translit + """ + for code in self.available_language_codes: + tag = "name:{code}".format(code=code) + if tag in line["properties"]["tags"]: + translit_tag = "{tag}-translit".format(tag=tag) + if not translit_tag in line["properties"]["tags"]: + line["properties"]["tags"][translit_tag] = \ + translit(line["properties"]["tags"][tag], code, reversed=True) diff --git a/src/validation/models.py b/src/validation/models.py index 1b1b0e92..14cf5b6f 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -167,6 +167,14 @@ class RawDataCurrentParamsBase(BaseModel, GeometryValidatorMixin): default=False, description="Includes detailed stats about the polygon passed such as buildings count , road count along with summary about data completeness in the area", ) + include_stats_html: Optional[bool] = Field( + default=False, + description="Includes detailed stats about the polygon passed such as buildings count , road count along with summary about data completeness in the area", + ) + include_translit: Optional[bool] = Field( + default=False, + description="Includes transliterations", + ) filters: Optional[Filters] = Field( default=None, example={ From 7acb43134096d313b6cb87adda96a6cd6e0fcde9 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Mon, 18 Nov 2024 11:39:20 -0300 Subject: [PATCH 08/14] Upload HTML file to S3. Fix code. 
Add a few comments. Ignore system MacOS files --- .gitignore | 2 ++ API/api_worker.py | 19 ++++++++++++++++--- requirements.txt | 2 +- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index c74b2f26..143eb173 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ Pipfile.lock #backend backend/data backend/.env + +.DS_Store diff --git a/API/api_worker.py b/API/api_worker.py index a7085178..9a1262cc 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -219,9 +219,10 @@ def process_raw_data(self, params, user=None): file_parts, ) + # Post-proccessing: Generate GeoJSON/HTML stats and transliterations polygon_stats = None geojson_stats_html = None - + download_html_url = None if "include_stats" or "include_translit" in params.dict(): post_processor = PostProcessor({ "include_stats": params.include_stats, @@ -245,9 +246,9 @@ def process_raw_data(self, params, user=None): tpl = "stats" if 'waterway' in post_processor.geoJSONStats.config.keys: tpl = "stats_waterway" - if 'highway' in post_processor.geoJSONStats.config.keys: + elif 'highway' in post_processor.geoJSONStats.config.keys: tpl = "stats_highway" - if 'building' in post_processor.geoJSONStats.config.keys: + elif 'building' in post_processor.geoJSONStats.config.keys: tpl = "stats_building" project_root = pathlib.Path(__file__).resolve().parent tpl_path = os.path.join(project_root, "../src/post_processing/{tpl}_tpl.html".format(tpl=tpl)) @@ -322,6 +323,15 @@ def process_raw_data(self, params, user=None): upload_name, file_suffix="zip" if bind_zip else params.output_type.lower(), ) + + # If there's an HTML file, upload it too + if geojson_stats_html: + download_html_url = file_transfer_obj.upload( + upload_html_path, + upload_name, + file_suffix="html", + ) + else: # give the static file download url back to user served from fastapi static export path download_url = str(upload_file_path) @@ -347,6 +357,9 @@ def process_raw_data(self, params, user=None): } if 
polygon_stats: final_response["stats"] = polygon_stats + if download_html_url: + final_response["download_html_url"] = download_html_url + return final_response except Exception as ex: diff --git a/requirements.txt b/requirements.txt index 9e7b5a68..1364e704 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,7 +56,7 @@ psutil==5.9.8 tqdm==4.66.2 # stats for geojson data -geojson-stats==0.2.2 +geojson-stats==0.2.3 # transliterations transliterate==1.10.2 From 32a1866cbc60c82cb4940d7c2d06825661c82413 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Mon, 18 Nov 2024 14:13:33 -0300 Subject: [PATCH 09/14] Black tests --- API/api_worker.py | 33 +++++++++----- src/app.py | 2 - src/post_processing/geojson_stats.py | 1 + src/post_processing/processor.py | 10 ++--- src/post_processing/transliterator.py | 10 +++-- src/validation/models.py | 64 +++++++++++++-------------- 6 files changed, 65 insertions(+), 55 deletions(-) diff --git a/API/api_worker.py b/API/api_worker.py index 9a1262cc..92756c7b 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -224,16 +224,18 @@ def process_raw_data(self, params, user=None): geojson_stats_html = None download_html_url = None if "include_stats" or "include_translit" in params.dict(): - post_processor = PostProcessor({ - "include_stats": params.include_stats, - "include_translit": params.include_translit - }) + post_processor = PostProcessor( + { + "include_stats": params.include_stats, + "include_translit": params.include_translit, + } + ) if params.include_stats: post_processor.filters = params.filters post_processor.init() - + geom_area, geom_dump, working_dir = RawData( params, str(self.request.id) ).extract_current_data(file_parts, post_processor.post_process_line) @@ -244,16 +246,23 @@ def process_raw_data(self, params, user=None): # Create a HTML summary of stats if params.include_stats_html: tpl = "stats" - if 'waterway' in post_processor.geoJSONStats.config.keys: + if "waterway" in 
post_processor.geoJSONStats.config.keys: tpl = "stats_waterway" - elif 'highway' in post_processor.geoJSONStats.config.keys: + elif "highway" in post_processor.geoJSONStats.config.keys: tpl = "stats_highway" - elif 'building' in post_processor.geoJSONStats.config.keys: + elif "building" in post_processor.geoJSONStats.config.keys: tpl = "stats_building" project_root = pathlib.Path(__file__).resolve().parent - tpl_path = os.path.join(project_root, "../src/post_processing/{tpl}_tpl.html".format(tpl=tpl)) - geojson_stats_html = post_processor.geoJSONStats.html(tpl_path).build() - upload_html_path = os.path.join(working_dir, os.pardir, f"{exportname_parts[-1]}.html") + tpl_path = os.path.join( + project_root, + "../src/post_processing/{tpl}_tpl.html".format(tpl=tpl), + ) + geojson_stats_html = post_processor.geoJSONStats.html( + tpl_path + ).build() + upload_html_path = os.path.join( + working_dir, os.pardir, f"{exportname_parts[-1]}.html" + ) with open(upload_html_path, "w") as f: f.write(geojson_stats_html) @@ -291,7 +300,7 @@ def process_raw_data(self, params, user=None): upload_file_path = file_path inside_file_size += os.path.getsize(file_path) break # only take one file inside dir , if contains many it should be inside zip - + # check if download url will be generated from s3 or not from config if use_s3_to_upload: file_transfer_obj = S3FileTransfer() diff --git a/src/app.py b/src/app.py index 580530a5..293f5a81 100644 --- a/src/app.py +++ b/src/app.py @@ -2258,5 +2258,3 @@ def get_summary_stats(self, start_date, end_date, group_by): result = self.cur.fetchall() self.d_b.close_conn() return [dict(item) for item in result] - - diff --git a/src/post_processing/geojson_stats.py b/src/post_processing/geojson_stats.py index edf6bdf4..3935634a 100644 --- a/src/post_processing/geojson_stats.py +++ b/src/post_processing/geojson_stats.py @@ -4,6 +4,7 @@ CONFIG_AREA = ["building"] CONFIG_LENGTH = ["highway", "waterway"] + class GeoJSONStats(Stats): """Used for collecting 
stats while processing GeoJSON files line by line""" diff --git a/src/post_processing/processor.py b/src/post_processing/processor.py index 25ed726f..c7806d70 100644 --- a/src/post_processing/processor.py +++ b/src/post_processing/processor.py @@ -1,9 +1,9 @@ - import json from .transliterator import Transliterator from .geojson_stats import GeoJSONStats -class PostProcessor(): + +class PostProcessor: """Used for posst-process data while processing GeoJSON files line by line""" options = {} @@ -21,10 +21,10 @@ def post_process_line(self, line: str): line_object = json.loads(line) for fn in self.functions: - fn(line_object) + fn(line_object) return json.dumps(line_object) - + def init(self): """ Initialize post-processor @@ -36,4 +36,4 @@ def init(self): if self.options["include_translit"]: self.transliterator = Transliterator() - self.functions.append(self.transliterator.translit) \ No newline at end of file + self.functions.append(self.transliterator.translit) diff --git a/src/post_processing/transliterator.py b/src/post_processing/transliterator.py index 1644c241..e2ed695c 100644 --- a/src/post_processing/transliterator.py +++ b/src/post_processing/transliterator.py @@ -1,8 +1,9 @@ from transliterate import translit, get_available_language_codes -class Transliterator(): + +class Transliterator: """Used for transliterate names while processing GeoJSON files line by line""" - + def __init__(self): self.available_language_codes = get_available_language_codes() self.name_tags = [f"name:{x}" for x in self.available_language_codes] @@ -16,5 +17,6 @@ def translit(self, line): if tag in line["properties"]["tags"]: translit_tag = "{tag}-translit".format(tag=tag) if not translit_tag in line["properties"]["tags"]: - line["properties"]["tags"][translit_tag] = \ - translit(line["properties"]["tags"][tag], code, reversed=True) + line["properties"]["tags"][translit_tag] = translit( + line["properties"]["tags"][tag], code, reversed=True + ) diff --git a/src/validation/models.py 
b/src/validation/models.py index 14cf5b6f..f97f9100 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -311,22 +311,22 @@ class StatsRequestParams(BaseModel, GeometryValidatorMixin): max_length=3, example="NPL", ) - geometry: Optional[ - Union[Polygon, MultiPolygon, Feature, FeatureCollection] - ] = Field( - default=None, - example={ - "type": "Polygon", - "coordinates": [ - [ - [83.96919250488281, 28.194446860487773], - [83.99751663208006, 28.194446860487773], - [83.99751663208006, 28.214869548073377], - [83.96919250488281, 28.214869548073377], - [83.96919250488281, 28.194446860487773], - ] - ], - }, + geometry: Optional[Union[Polygon, MultiPolygon, Feature, FeatureCollection]] = ( + Field( + default=None, + example={ + "type": "Polygon", + "coordinates": [ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + }, + ) ) @validator("geometry", pre=True, always=True) @@ -632,22 +632,22 @@ class DynamicCategoriesModel(CategoriesBase, GeometryValidatorMixin): max_length=3, example="USA", ) - geometry: Optional[ - Union[Polygon, MultiPolygon, Feature, FeatureCollection] - ] = Field( - default=None, - example={ - "type": "Polygon", - "coordinates": [ - [ - [83.96919250488281, 28.194446860487773], - [83.99751663208006, 28.194446860487773], - [83.99751663208006, 28.214869548073377], - [83.96919250488281, 28.214869548073377], - [83.96919250488281, 28.194446860487773], - ] - ], - }, + geometry: Optional[Union[Polygon, MultiPolygon, Feature, FeatureCollection]] = ( + Field( + default=None, + example={ + "type": "Polygon", + "coordinates": [ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + }, + ) ) 
@validator("geometry", pre=True, always=True) From 95904d0aa32e33931ae1d43df77962508d7fac31 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Mon, 18 Nov 2024 14:16:20 -0300 Subject: [PATCH 10/14] Linting code: add docstring --- src/post_processing/geojson_stats.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/post_processing/geojson_stats.py b/src/post_processing/geojson_stats.py index 3935634a..8f9d3072 100644 --- a/src/post_processing/geojson_stats.py +++ b/src/post_processing/geojson_stats.py @@ -56,4 +56,7 @@ def raw_data_line_stats(self, json_object: dict): self.get_object_stats(json_object) def html(self, tpl): + """ + Returns stats Html object, generated from stats data using a template + """ return Html(tpl, self) From 0b634cc0161be77d4f192e628e0f5bd25cec1a0c Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Mon, 18 Nov 2024 14:24:41 -0300 Subject: [PATCH 11/14] Black code --- src/post_processing/geojson_stats.py | 1 - src/validation/models.py | 64 ++++++++++++++-------------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/src/post_processing/geojson_stats.py b/src/post_processing/geojson_stats.py index 8f9d3072..9ca74d68 100644 --- a/src/post_processing/geojson_stats.py +++ b/src/post_processing/geojson_stats.py @@ -15,7 +15,6 @@ def __init__(self, filters, *args, **kwargs): self.config.properties_prop = "properties.tags" if filters and filters.tags: - for tag in CONFIG_AREA: if self.check_filter(filters.tags, tag): self.config.keys.append(tag) diff --git a/src/validation/models.py b/src/validation/models.py index f97f9100..14cf5b6f 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -311,22 +311,22 @@ class StatsRequestParams(BaseModel, GeometryValidatorMixin): max_length=3, example="NPL", ) - geometry: Optional[Union[Polygon, MultiPolygon, Feature, FeatureCollection]] = ( - Field( - default=None, - example={ - "type": "Polygon", - "coordinates": [ - [ - [83.96919250488281, 28.194446860487773], - 
[83.99751663208006, 28.194446860487773], - [83.99751663208006, 28.214869548073377], - [83.96919250488281, 28.214869548073377], - [83.96919250488281, 28.194446860487773], - ] - ], - }, - ) + geometry: Optional[ + Union[Polygon, MultiPolygon, Feature, FeatureCollection] + ] = Field( + default=None, + example={ + "type": "Polygon", + "coordinates": [ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + }, ) @validator("geometry", pre=True, always=True) @@ -632,22 +632,22 @@ class DynamicCategoriesModel(CategoriesBase, GeometryValidatorMixin): max_length=3, example="USA", ) - geometry: Optional[Union[Polygon, MultiPolygon, Feature, FeatureCollection]] = ( - Field( - default=None, - example={ - "type": "Polygon", - "coordinates": [ - [ - [83.96919250488281, 28.194446860487773], - [83.99751663208006, 28.194446860487773], - [83.99751663208006, 28.214869548073377], - [83.96919250488281, 28.214869548073377], - [83.96919250488281, 28.194446860487773], - ] - ], - }, - ) + geometry: Optional[ + Union[Polygon, MultiPolygon, Feature, FeatureCollection] + ] = Field( + default=None, + example={ + "type": "Polygon", + "coordinates": [ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + }, ) @validator("geometry", pre=True, always=True) From 3f44f5b0d6ff75e0fab4305d3e8ff0b92c1ac196 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Mon, 18 Nov 2024 14:45:55 -0300 Subject: [PATCH 12/14] Fix undeclared variable --- API/api_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/API/api_worker.py b/API/api_worker.py index 92756c7b..c538b42c 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -222,6 +222,7 @@ def 
process_raw_data(self, params, user=None): # Post-proccessing: Generate GeoJSON/HTML stats and transliterations polygon_stats = None geojson_stats_html = None + geojson_stats_json = None download_html_url = None if "include_stats" or "include_translit" in params.dict(): post_processor = PostProcessor( From 56b67f704107f7e01d1ff96cfb66df2e5b02c40f Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Wed, 20 Nov 2024 14:14:00 -0300 Subject: [PATCH 13/14] + Post-processing for custom exports --- backend/field_update | 1 - backend/raw_backend | 2 +- requirements.txt | 2 +- src/app.py | 21 +++++ src/config.py | 1 + src/post_processing/processor.py | 87 ++++++++++++++++++++- src/post_processing/stats_building_tpl.html | 14 ++-- src/post_processing/stats_highway_tpl.html | 14 ++-- src/post_processing/stats_tpl.html | 14 ++-- src/post_processing/stats_waterway_tpl.html | 14 ++-- src/post_processing/transliterator.py | 22 ++++-- src/validation/models.py | 15 ++++ tests/test_API.py | 60 +++++++------- 13 files changed, 202 insertions(+), 65 deletions(-) diff --git a/backend/field_update b/backend/field_update index b663498b..0c55c83b 100644 --- a/backend/field_update +++ b/backend/field_update @@ -114,7 +114,6 @@ class Database: try: self.cursor.execute(query) self.conn.commit() - # print(query) try: result = self.cursor.fetchall() diff --git a/backend/raw_backend b/backend/raw_backend index 5ddd55c8..4a852eb3 100644 --- a/backend/raw_backend +++ b/backend/raw_backend @@ -256,7 +256,7 @@ if __name__ == "__main__": if not args.replication: osm2pgsql.append("--drop") - print(osm2pgsql) + run_subprocess_cmd(osm2pgsql) basic_index_cmd = [ diff --git a/requirements.txt b/requirements.txt index 1364e704..46eb24da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,7 +56,7 @@ psutil==5.9.8 tqdm==4.66.2 # stats for geojson data -geojson-stats==0.2.3 +geojson-stats==0.2.4 # transliterations transliterate==1.10.2 diff --git a/src/app.py b/src/app.py index 293f5a81..73ac60e7 
100644 --- a/src/app.py +++ b/src/app.py @@ -47,6 +47,7 @@ from psycopg2.extras import DictCursor from slugify import slugify from tqdm import tqdm +from .post_processing.processor import PostProcessor # Reader imports from src.config import ( @@ -1491,8 +1492,28 @@ def process_export_format(export_format): layer_creation_options=layer_creation_options_str, query_dump_path=export_format_path, ) + run_ogr2ogr_cmd(ogr2ogr_cmd) + # Post-processing GeoJSON files + # Adds: stats, HTML stats summary and transliterations + if export_format.driver_name == "GeoJSON" and ( + self.params.include_stats or self.params.include_translit + ): + post_processor = PostProcessor( + { + "include_stats": self.params.include_stats, + "include_translit": self.params.include_translit, + "include_stats_html": self.params.include_stats_html, + } + ) + post_processor.init() + post_processor.custom( + categories=self.params.categories, + export_format_path=export_format_path, + export_filename=export_filename, + ) + zip_file_path = os.path.join(file_export_path, f"{export_filename}.zip") zip_path = self.file_to_zip(export_format_path, zip_file_path) diff --git a/src/config.py b/src/config.py index 3f6ab649..cc889318 100644 --- a/src/config.py +++ b/src/config.py @@ -334,6 +334,7 @@ def not_raises(func, *args, **kwargs): logging.error( "Error creating HDX configuration: %s, Disabling the hdx exports feature", e ) + ENABLE_HDX_EXPORTS = False if ENABLE_HDX_EXPORTS: diff --git a/src/post_processing/processor.py b/src/post_processing/processor.py index c7806d70..53a18dbe 100644 --- a/src/post_processing/processor.py +++ b/src/post_processing/processor.py @@ -1,6 +1,8 @@ import json from .transliterator import Transliterator from .geojson_stats import GeoJSONStats +import os +import pathlib class PostProcessor: @@ -25,15 +27,96 @@ def post_process_line(self, line: str): return json.dumps(line_object) + def custom(self, categories, export_format_path, export_filename): + """ + Post-process custom 
exports + """ + self.geoJSONStats.config.properties_prop = "properties" + + category_tag = "" + if any("Roads" in element for element in categories): + category_tag = "highway" + self.geoJSONStats.config.length = True + elif any("Buildings" in element for element in categories): + category_tag = "building" + self.geoJSONStats.config.area = True + elif any("Waterways" in element for element in categories): + category_tag = "waterway" + self.geoJSONStats.config.length = True + + if self.options["include_stats"]: + if category_tag: + self.geoJSONStats.config.keys.append(category_tag) + self.geoJSONStats.config.value_keys.append(category_tag) + + path_input = os.path.join( + export_format_path, f"{export_filename}.geojson" + ) + path_output = os.path.join( + export_format_path, f"{export_filename}-post.geojson" + ) + + with open(path_input, "r") as input_file, open( + path_output, "w" + ) as output_file: + for line in input_file: + comma = False + if line.startswith('{ "type": "Feature"'): + json_string = "" + if line[-2:-1] == ",": + json_string = line[:-2] + comma = True + else: + json_string = line + line = self.post_process_line(json_string) + if self.options["include_translit"]: + if comma: + output_file.write(line + ",") + else: + output_file.write(line) + + if self.options["include_translit"]: + os.remove(path_input) + os.rename(path_output, path_input) + else: + os.remove(path_output) + + geojson_stats_json = json.dumps(self.geoJSONStats.dict()) + with open( + os.path.join( + export_format_path, f"{export_filename}-stats.json" + ), + "w", + ) as f: + f.write(geojson_stats_json) + + if self.options["include_stats_html"]: + tpl = ( + "stats_{category_tag}".format(category_tag=category_tag) + if category_tag + else "stats" + ) + project_root = pathlib.Path(__file__).resolve().parent + tpl_path = os.path.join( + project_root, + "{tpl}_tpl.html".format(tpl=tpl), + ) + geojson_stats_html = self.geoJSONStats.html(tpl_path).build() + upload_html_path = os.path.join( +
export_format_path, f"{export_filename}-stats.html" + ) + with open(upload_html_path, "w") as f: + f.write(geojson_stats_html) + def init(self): """ Initialize post-processor """ - if self.options["include_stats"]: + if "include_stats" in self.options and self.options["include_stats"]: self.geoJSONStats = GeoJSONStats(self.filters) self.functions.append(self.geoJSONStats.raw_data_line_stats) - if self.options["include_translit"]: + if "include_translit" in self.options and self.options["include_translit"]: self.transliterator = Transliterator() self.functions.append(self.transliterator.translit) diff --git a/src/post_processing/stats_building_tpl.html b/src/post_processing/stats_building_tpl.html index e6cdfdd6..97c3f659 100644 --- a/src/post_processing/stats_building_tpl.html +++ b/src/post_processing/stats_building_tpl.html @@ -116,7 +116,7 @@

Languages Available

- + @@ -130,32 +130,32 @@

Languages Available

- + - + - + - + - + - +
Hot Key CountsKey Count %
${key_0} ${key_0_count}${key_0_percent}%${key_0_percent}
${key_1} ${key_1_count}${key_1_percent}%${key_1_percent}
${key_2} ${key_2_count}${key_2_percent}%${key_2_percent}
${key_3} ${key_3_count}${key_3_percent}%${key_3_percent}
${key_4} ${key_4_count}${key_4_percent}%${key_4_percent}
${key_5} ${key_5_count}${key_5_percent}%${key_5_percent}
diff --git a/src/post_processing/stats_highway_tpl.html b/src/post_processing/stats_highway_tpl.html index d80f3188..be09c355 100644 --- a/src/post_processing/stats_highway_tpl.html +++ b/src/post_processing/stats_highway_tpl.html @@ -115,7 +115,7 @@

Languages Available

- + @@ -129,32 +129,32 @@

Languages Available

- + - + - + - + - + - +
Hot Key CountsKey Count %
${key_0} ${key_0_count}${key_0_percent}%${key_0_percent}
${key_1} ${key_1_count}${key_1_percent}%${key_1_percent}
${key_2} ${key_2_count}${key_2_percent}%${key_2_percent}
${key_3} ${key_3_count}${key_3_percent}%${key_3_percent}
${key_4} ${key_4_count}${key_4_percent}%${key_4_percent}
${key_5} ${key_5_count}${key_5_percent}%${key_5_percent}
diff --git a/src/post_processing/stats_tpl.html b/src/post_processing/stats_tpl.html index e6777dc2..7359ad13 100644 --- a/src/post_processing/stats_tpl.html +++ b/src/post_processing/stats_tpl.html @@ -112,7 +112,7 @@

Languages Available

- + @@ -126,32 +126,32 @@

Languages Available

- + - + - + - + - + - +
Hot Key CountsKey Count %
${key_0} ${key_0_count}${key_0_percent}%${key_0_percent}
${key_1} ${key_1_count}${key_1_percent}%${key_1_percent}
${key_2} ${key_2_count}${key_2_percent}%${key_2_percent}
${key_3} ${key_3_count}${key_3_percent}%${key_3_percent}
${key_4} ${key_4_count}${key_4_percent}%${key_4_percent}
${key_5} ${key_5_count}${key_5_percent}%${key_5_percent}
diff --git a/src/post_processing/stats_waterway_tpl.html b/src/post_processing/stats_waterway_tpl.html index c59050d0..af11d163 100644 --- a/src/post_processing/stats_waterway_tpl.html +++ b/src/post_processing/stats_waterway_tpl.html @@ -116,7 +116,7 @@

Languages Available

- + @@ -130,32 +130,32 @@

Languages Available

- + - + - + - + - + - +
Hot Key CountsKey Count %
${key_0} ${key_0_count}${key_0_percent}%${key_0_percent}
${key_1} ${key_1_count}${key_1_percent}%${key_1_percent}
${key_2} ${key_2_count}${key_2_percent}%${key_2_percent}
${key_3} ${key_3_count}${key_3_percent}%${key_3_percent}
${key_4} ${key_4_count}${key_4_percent}%${key_4_percent}
${key_5} ${key_5_count}${key_5_percent}%${key_5_percent}
diff --git a/src/post_processing/transliterator.py b/src/post_processing/transliterator.py index e2ed695c..ddc16bb7 100644 --- a/src/post_processing/transliterator.py +++ b/src/post_processing/transliterator.py @@ -4,6 +4,8 @@ class Transliterator: """Used for transliterate names while processing GeoJSON files line by line""" + props = "properties" + def __init__(self): self.available_language_codes = get_available_language_codes() self.name_tags = [f"name:{x}" for x in self.available_language_codes] @@ -14,9 +16,19 @@ def translit(self, line): """ for code in self.available_language_codes: tag = "name:{code}".format(code=code) - if tag in line["properties"]["tags"]: + prop = ( + line["properties"]["tags"] + if self.props == "properties.tags" + else line["properties"] + ) + if tag in prop: translit_tag = "{tag}-translit".format(tag=tag) - if not translit_tag in line["properties"]["tags"]: - line["properties"]["tags"][translit_tag] = translit( - line["properties"]["tags"][tag], code, reversed=True - ) + if not translit_tag in prop: + if self.props == "properties.tags": + line["properties"]["tags"][translit_tag] = translit( + prop[tag], code, reversed=True + ) + else: + line["properties"][translit_tag] = translit( + prop[tag], code, reversed=True + ) diff --git a/src/validation/models.py b/src/validation/models.py index 14cf5b6f..52677470 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -618,6 +618,9 @@ class DynamicCategoriesModel(CategoriesBase, GeometryValidatorMixin): Fields: - iso3 (Optional[str]): ISO3 Country Code. + - include_stats (bool): Include a JSON file with stats. Available for GeoJSON exports only. + - include_stats_html (bool): Include a HTML file with a stats summary. Available for GeoJSON exports only. + - include_translit (bool): Add transliterations. Available for GeoJSON exports only. - dataset (Optional[DatasetConfig]): Dataset Configurations for HDX Upload. 
- meta (bool): Dumps Meta db in parquet format & HDX config JSON to S3. - hdx_upload (bool): Enable/Disable uploading the dataset to HDX. @@ -632,6 +635,18 @@ class DynamicCategoriesModel(CategoriesBase, GeometryValidatorMixin): max_length=3, example="USA", ) + include_stats: Optional[bool] = Field( + default=False, + description="Include a JSON file with stats. Available for GeoJSON exports only.", + ) + include_stats_html: Optional[bool] = Field( + default=False, + description="Include a HTML file with a stats summary. Available for GeoJSON exports only.", + ) + include_translit: Optional[bool] = Field( + default=False, + description="Add transliterations. Available for GeoJSON exports only.", + ) geometry: Optional[ Union[Polygon, MultiPolygon, Feature, FeatureCollection] ] = Field( diff --git a/tests/test_API.py b/tests/test_API.py index 5651dffa..433b133c 100644 --- a/tests/test_API.py +++ b/tests/test_API.py @@ -9,6 +9,7 @@ from API.main import app client = TestClient(app) +client.base_url = "http://127.0.0.1:8000" access_token = os.environ.get("ACCESS_TOKEN") @@ -740,33 +741,6 @@ def test_snapshot_authentication_uuid(): }, "uuid": False, } - - response = client.post("/v1/snapshot/", json=payload, headers=headers) - - assert response.status_code == 200 - res = response.json() - track_link = res["track_link"] - wait_for_task_completion(track_link) - - -def test_snapshot_bind_zip(): - headers = {"access-token": access_token} - payload = { - "geometry": { - "type": "Polygon", - "coordinates": [ - [ - [83.96919250488281, 28.194446860487773], - [83.99751663208006, 28.194446860487773], - [83.99751663208006, 28.214869548073377], - [83.96919250488281, 28.214869548073377], - [83.96919250488281, 28.194446860487773], - ] - ], - }, - "bindZip": False, - } - response = client.post("/v1/snapshot/", json=payload, headers=headers) assert response.status_code == 200 @@ -955,6 +929,38 @@ def test_hdx_submit_normal_iso3(): wait_for_task_completion(track_link) +def 
test_hdx_submit_normal_iso3_with_stats(): + headers = {"access-token": access_token} + payload = { + "iso3": "NPL", + "hdx_upload": False, + "include_stats": True, + "include_translit": True, + "include_stats_html": True, + "categories": [ + { + "Roads": { + "hdx": { + "tags": ["roads", "transportation", "geodata"], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["lines"], + "select": ["name", "highway"], + "where": "tags['highway'] IS NOT NULL", + "formats": ["geojson"], + } + } + ], + } + + response = client.post("/v1/custom/snapshot/", json=payload, headers=headers) + + assert response.status_code == 200 + res = response.json() + track_link = res["track_link"] + wait_for_task_completion(track_link) + + def test_hdx_submit_normal_iso3_multiple_format(): headers = {"access-token": access_token} payload = { From a63553728460baad04f839f6200fbe09ed3959d8 Mon Sep 17 00:00:00 2001 From: Emilio Mariscal Date: Wed, 20 Nov 2024 15:56:52 -0300 Subject: [PATCH 14/14] + Post-processing for custom exports --- src/app.py | 1 + src/post_processing/processor.py | 18 ++++++++---------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/app.py b/src/app.py index 73ac60e7..3ca2ad6c 100644 --- a/src/app.py +++ b/src/app.py @@ -1512,6 +1512,7 @@ def process_export_format(export_format): categories=self.params.categories, export_format_path=export_format_path, export_filename=export_filename, + file_export_path=file_export_path, ) zip_file_path = os.path.join(file_export_path, f"{export_filename}.zip") diff --git a/src/post_processing/processor.py b/src/post_processing/processor.py index 53a18dbe..44feccff 100644 --- a/src/post_processing/processor.py +++ b/src/post_processing/processor.py @@ -27,7 +27,7 @@ def post_process_line(self, line: str): return json.dumps(line_object) - def custom(self, categories, export_format_path, export_filename): + def custom(self, categories, export_format_path, 
export_filename, file_export_path): """ Post-process custom exports """ @@ -81,14 +81,12 @@ def custom(self, categories, export_format_path, export_filename): else: os.remove(path_output) - geojson_stats_json = json.dumps(self.geoJSONStats.dict()) - with open( - os.path.join( - export_format_path, f"{export_filename}-stats.json" - ), - "w", - ) as f: - f.write(geojson_stats_json) + geojson_stats_json = json.dumps(self.geoJSONStats.dict()) + with open( + os.path.join(file_export_path, "stats.json"), + "w", + ) as f: + f.write(geojson_stats_json) if self.options["include_stats_html"]: tpl = ( @@ -103,7 +101,7 @@ def custom(self, categories, export_format_path, export_filename): ) geojson_stats_html = self.geoJSONStats.html(tpl_path).build() upload_html_path = os.path.join( - export_format_path, f"{export_filename}-stats.html" + file_export_path, "stats-summary.html" ) with open(upload_html_path, "w") as f: f.write(geojson_stats_html)