From c6e9baae10baf4604d65b75c09ad5541be917b9a Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Mon, 9 Sep 2024 10:46:46 +0200 Subject: [PATCH 01/12] update dataset to use hexadecimal representation of cell id --- science/notebooks/merge_entrega_roberto.ipynb | 163 ++++++++++++++---- 1 file changed, 131 insertions(+), 32 deletions(-) diff --git a/science/notebooks/merge_entrega_roberto.ipynb b/science/notebooks/merge_entrega_roberto.ipynb index e7913753..cc3cf618 100644 --- a/science/notebooks/merge_entrega_roberto.ipynb +++ b/science/notebooks/merge_entrega_roberto.ipynb @@ -2,68 +2,162 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-09T08:30:08.332078Z", + "start_time": "2024-09-09T08:30:08.112470Z" + } + }, "source": [ "import polars as pl\n", "from pathlib import Path\n", "import h3ronpy.polars" - ] + ], + "outputs": [], + "execution_count": 1 }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-09T08:37:46.700129Z", + "start_time": "2024-09-09T08:37:46.697392Z" + } + }, + "source": "csvs = list(Path(\"../data/raw/ENTREGA UNO MUESTRAS HEXA CSV 18072024\").glob(\"*.CSV\"))", "outputs": [], - "source": [ - "csvs = list(Path(\"../raw/ENTREGA UNO MUESTRAS HEXA CSV 18072024/\").glob(\"*.CSV\"))" - ] + "execution_count": 39 }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-09T08:37:50.987739Z", + "start_time": "2024-09-09T08:37:50.852450Z" + } + }, "source": [ "dfs = [pl.read_csv(f, separator=\";\", decimal_comma=True) for f in csvs]\n", "df = pl.concat(dfs, how=\"align\", rechunk=True)\n", "df.head()" - ] + ], + "outputs": [ + { + "data": { + "text/plain": [ + "shape: (5, 9)\n", + "┌──────────────┬───────┬────────────┬────────────┬───┬───────────┬──────────┬───────────┬──────────┐\n", + "│ GRID_ID ┆ FRECF ┆ AMIN ┆ AMAX ┆ … ┆ TREEPERCT ┆ PMIN ┆ PMAX ┆ PMEAN │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ i64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════════╪═══════╪════════════╪════════════╪═══╪═══════════╪══════════╪═══════════╪══════════╡\n", + "│ 865f00007fff ┆ null ┆ 114.678246 ┆ 209.731842 ┆ … ┆ 100.0 ┆ 0.058348 ┆ 7.531753 ┆ 1.69093 │\n", + "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 865f0000ffff ┆ null ┆ 127.660339 ┆ 705.040772 ┆ … ┆ 99.985832 ┆ 0.148311 ┆ 31.043549 ┆ 6.346733 │\n", + "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 865f00017fff ┆ null ┆ 117.937508 ┆ 175.799759 ┆ … ┆ 100.0 ┆ 0.028819 ┆ 2.731335 ┆ 1.063382 │\n", + "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 865f0001ffff ┆ null ┆ 123.765045 ┆ 193.208282 ┆ … ┆ 100.0 ┆ 0.047981 ┆ 4.67722 ┆ 1.557258 │\n", + "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 865f00027fff ┆ null ┆ 111.118088 ┆ 277.398895 ┆ … ┆ 100.0 ┆ 0.144035 ┆ 12.342467 ┆ 2.193349 │\n", + "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└──────────────┴───────┴────────────┴────────────┴───┴───────────┴──────────┴───────────┴──────────┘" + ], + "text/html": [ + "
\n", + "shape: (5, 9)
GRID_IDFRECFAMINAMAXAMEANTREEPERCTPMINPMAXPMEAN
stri64f64f64f64f64f64f64f64
"865f00007ffffff"null114.678246209.731842149.513126100.00.0583487.5317531.69093
"865f0000fffffff"null127.660339705.040772245.46101399.9858320.14831131.0435496.346733
"865f00017ffffff"null117.937508175.799759145.636984100.00.0288192.7313351.063382
"865f0001fffffff"null123.765045193.208282156.474098100.00.0479814.677221.557258
"865f00027ffffff"null111.118088277.398895146.417323100.00.14403512.3424672.193349
" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 42 }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-09T08:37:51.695475Z", + "start_time": "2024-09-09T08:37:51.673626Z" + } + }, "source": [ "df = df.with_columns(pl.col(\"GRID_ID\").h3.cells_parse())\n", "df = df.drop(\"GRID_ID\")" - ] + ], + "outputs": [], + "execution_count": 43 }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-09T08:37:52.296769Z", + "start_time": "2024-09-09T08:37:52.286054Z" + } + }, "source": [ "df.select(pl.col(\"cell\").h3.cells_resolution()).unique()" - ] + ], + "outputs": [ + { + "data": { + "text/plain": [ + "shape: (1, 1)\n", + "┌────────────┐\n", + "│ resolution │\n", + "│ --- │\n", + "│ u8 │\n", + "╞════════════╡\n", + "│ 6 │\n", + "└────────────┘" + ], + "text/html": [ + "
\n", + "shape: (1, 1)
resolution
u8
6
" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 44 }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-09T08:37:53.457159Z", + "start_time": "2024-09-09T08:37:53.382329Z" + } + }, "source": [ "CELLS_RES = 6\n", "OVERVIEW_LEVEL = CELLS_RES - 5\n", "\n", "df = df.with_columns(\n", - " pl.col(\"cell\").h3.change_resolution(OVERVIEW_LEVEL).alias(\"tile_id\") # type: ignore[attr-defined]\n", + " pl.col(\"cell\").h3.change_resolution(OVERVIEW_LEVEL).h3.cells_to_string().alias(\"tile_id\"), # type: ignore[attr-defined]\n", + " pl.col(\"cell\").h3.cells_to_string()\n", ")\n", "partition_dfs = df.partition_by([\"tile_id\"], as_dict=True, include_key=False)" - ] + ], + "outputs": [], + "execution_count": 45 }, { "cell_type": "markdown", @@ -74,9 +168,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-09T08:40:16.950037Z", + "start_time": "2024-09-09T08:40:16.897734Z" + } + }, "source": [ "seen_tiles = set()\n", "n_cells = 0\n", @@ -84,8 +181,8 @@ "for tile_group, tile_df in partition_dfs.items():\n", " if tile_df.shape[0] == 0: # todo: skip empty tiles ?\n", " continue\n", - " tile_id = hex(tile_group[0])[2:]\n", - " filename = Path(\"grid/1\") / (tile_id + \".arrow\")\n", + " tile_id = tile_group[0]\n", + " filename = Path(\"../data/processed/grid/1\") / (tile_id + \".arrow\")\n", " if tile_id in seen_tiles:\n", " tile_df = pl.concat(\n", " [pl.read_ipc(filename), tile_df], how=\"vertical_relaxed\"\n", @@ -96,7 +193,9 @@ " seen_tiles.add(tile_id)\n", " tile_df.write_ipc(filename)\n", " n_cells += len(tile_df)" - ] + ], + "outputs": [], + "execution_count": 48 }, { "cell_type": "markdown", From d43fff9a2d59d7c94072fa263939cd1016d100ed Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Tue, 10 Sep 2024 14:00:19 +0200 Subject: [PATCH 02/12] Add enpoint to get tiles filtered by geojson --- api/app/config/config.py | 1 + api/app/routers/grid.py | 76 +++++++++++++++++++++++++++++++++------- api/requirements.in | 1 + api/requirements.txt | 22 +++++++++++- 4 files changed, 86 insertions(+), 14 deletions(-) diff --git a/api/app/config/config.py b/api/app/config/config.py index a8882021..c57f6501 100644 --- a/api/app/config/config.py +++ b/api/app/config/config.py @@ -11,6 +11,7 @@ class Settings(BaseSettings): auth_token: str tiff_path: str grid_tiles_path: str + tile_to_cell_resolution_diff: int = 5 @lru_cache diff --git a/api/app/routers/grid.py b/api/app/routers/grid.py index 5394220a..45a41f14 100644 --- a/api/app/routers/grid.py +++ b/api/app/routers/grid.py @@ -1,13 +1,19 @@ import logging import os import pathlib +from functools import lru_cache from typing import Annotated import h3 +import h3ronpy.polars # noqa: F401 import polars as pl +import shapely from fastapi import APIRouter, Depends, HTTPException, Path, Query +from fastapi.params import Body from fastapi.responses import ORJSONResponse +from geojson_pydantic import Feature from h3 import H3CellError +from h3ronpy.polars.vector import geometry_to_cells from pydantic import ValidationError from starlette.responses import Response @@ -19,17 +25,8 @@ grid_router = APIRouter() -@grid_router.get( - "/tile/{tile_index}", - summary="Get a grid tile", -) -async def grid_tile( - tile_index: Annotated[str, Path(description="The `h3` index of the tile")], - columns: list[str] = Query( - [], 
description="Colum/s to include in the tile. If empty, it returns only cell indexes." - ), -) -> Response: - """Get a tile of h3 cells with specified data columns""" +def tile_from_fs(columns, tile_index) -> tuple[pl.DataFrame, int]: + """Get the tile from filesystem filtered by column and the resolution of the tile index""" try: z = h3.api.basic_str.h3_get_resolution(tile_index) except H3CellError: @@ -38,10 +35,63 @@ async def grid_tile( if not os.path.exists(tile_path): raise HTTPException(status_code=404, detail=f"Tile {tile_path} not found") try: - tile_file = pl.read_ipc(tile_path, columns=["cell", *columns]).write_ipc(None) + tile = pl.read_ipc(tile_path, columns=["cell", *columns]) except pl.exceptions.ColumnNotFoundError: raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None - return Response(tile_file.getvalue(), media_type="application/octet-stream") + return tile, z + + +@grid_router.get( + "/tile/{tile_index}", + summary="Get a grid tile", +) +def get_grid_tile( + tile_index: Annotated[str, Path(description="The `h3` index of the tile")], + columns: list[str] = Query( + [], description="Colum/s to include in the tile. If empty, it returns only cell indexes." + ), +) -> Response: + """Get a tile of h3 cells with specified data columns""" + tile, _ = tile_from_fs(columns, tile_index) + tile_buffer = tile.write_ipc(None) + return Response(tile_buffer.getvalue(), media_type="application/octet-stream") + + +# @lru_cache +# def cells_in_geojson(geometry, cell_resolution: int) -> pl.Series: +# """Return the cells that fill the polygon area in the geojson""" +# cells = polyfill_geojson(geojson, cell_resolution) +# return pl.Series("shape_cells", cells, dtype=pl.UInt64) + + +@lru_cache +def cells_in_geojson(geometry, cell_resolution: int) -> pl.Series: + """Return the cells that fill the polygon area in the geojson""" + cells = geometry_to_cells(geometry, cell_resolution) + return pl.Series("shape_cells", cells, dtype=pl.UInt64) + + +@grid_router.post("/tile/{tile_index}", summary="Get a grid tile with cells contained inside the GeoJSON") +def post_grid_tile( + tile_index: Annotated[str, Path(description="The `h3` index of the tile")], + geojson: Annotated[Feature, Body(description="GeoJSON Feature.")], + columns: list[str] = Query( + [], description="Colum/s to include in the tile. If empty, it returns only cell indexes." 
+ ), +) -> Response: + tile, tile_index_res = tile_from_fs(columns, tile_index) + cell_res = tile_index_res + get_settings().tile_to_cell_resolution_diff + geom = shapely.from_geojson(geojson.model_dump_json()) + cells = cells_in_geojson(geom, cell_res) + tile = ( + tile.with_columns(pl.col("cell").h3.cells_parse()) + .filter(pl.col("cell").is_in(cells)) + .with_columns(pl.col("cell").h3.cells_to_string()) + ) + if tile.is_empty(): + raise HTTPException(status_code=404, detail="No data in region") + tile_buffer = tile.write_ipc(None) + return Response(tile_buffer.getvalue(), media_type="application/octet-stream") @grid_router.get( diff --git a/api/requirements.in b/api/requirements.in index a2bf54c0..2d6a81c2 100644 --- a/api/requirements.in +++ b/api/requirements.in @@ -8,3 +8,4 @@ h3 pydantic-extra-types polars sqlalchemy +h3ronpy \ No newline at end of file diff --git a/api/requirements.txt b/api/requirements.txt index 50c7a96b..135b3189 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -34,8 +34,11 @@ cligj==0.7.2 color-operations==0.1.3 # via rio-tiler exactextract==0.2.0.dev0 + # via -r requirements.in fastapi==0.110.1 - # via titiler-core + # via + # -r requirements.in + # titiler-core geojson-pydantic==1.0.2 # via titiler-core greenlet==3.0.3 @@ -45,6 +48,9 @@ h11==0.14.0 # httpcore # uvicorn h3==3.7.7 + # via -r requirements.in +h3ronpy==0.21.0 + # via -r requirements.in httpcore==1.0.5 # via httpx httpx==0.27.0 @@ -66,13 +72,20 @@ numexpr==2.10.0 numpy==1.26.4 # via # color-operations + # h3ronpy # numexpr + # pyarrow # rasterio # rio-tiler + # shapely # snuggs # titiler-core orjson==3.10.0 + # via -r requirements.in polars==1.1.0 + # via -r requirements.in +pyarrow==17.0.0 + # via h3ronpy pydantic==2.6.4 # via # fastapi @@ -85,7 +98,9 @@ pydantic==2.6.4 pydantic-core==2.16.3 # via pydantic pydantic-extra-types==2.9.0 + # via -r requirements.in pydantic-settings==2.2.1 + # via -r requirements.in pyparsing==3.1.2 # via snuggs pyproj==3.6.1 @@ -104,6 +119,8 @@ rio-tiler==6.4.5 # via titiler-core setuptools==69.2.0 # via rasterio +shapely==2.0.6 + # via h3ronpy simplejson==3.19.2 # via titiler-core six==1.16.0 @@ -115,9 +132,11 @@ sniffio==1.3.1 snuggs==1.4.7 # via rasterio sqlalchemy==2.0.31 + # via -r requirements.in starlette==0.37.2 # via fastapi titiler-core==0.18.0 + # via -r requirements.in typing-extensions==4.11.0 # via # fastapi @@ -126,3 +145,4 @@ typing-extensions==4.11.0 # sqlalchemy # titiler-core uvicorn==0.29.0 + # via -r requirements.in From d57ff556d9eb4a7b92f3da011d35477e874f73e2 Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Wed, 11 Sep 2024 15:26:11 +0200 Subject: [PATCH 03/12] Speedup request by caching geojson filling result and improve queries --- api/app/routers/grid.py | 48 +++++++------- api/tests/benchmark_grid_post.lua | 40 ++++++++++++ ...mark_post.lua => benchmark_table_post.lua} | 0 api/tests/conftest.py | 35 ++++++++-- api/tests/test_grid.py | 64 +++++++++++++++---- 5 files changed, 145 insertions(+), 42 deletions(-) create mode 100644 api/tests/benchmark_grid_post.lua rename api/tests/{benchmark_post.lua => benchmark_table_post.lua} (100%) diff --git a/api/app/routers/grid.py b/api/app/routers/grid.py index 45a41f14..2e229e33 100644 --- a/api/app/routers/grid.py +++ b/api/app/routers/grid.py @@ -13,6 +13,7 @@ from fastapi.responses import ORJSONResponse from geojson_pydantic import Feature from h3 import H3CellError +from h3ronpy.polars import cells_to_string from h3ronpy.polars.vector import geometry_to_cells from pydantic import 
ValidationError from starlette.responses import Response @@ -25,7 +26,7 @@ grid_router = APIRouter() -def tile_from_fs(columns, tile_index) -> tuple[pl.DataFrame, int]: +def tile_from_fs(columns, tile_index) -> tuple[pl.LazyFrame, int]: """Get the tile from filesystem filtered by column and the resolution of the tile index""" try: z = h3.api.basic_str.h3_get_resolution(tile_index) @@ -34,13 +35,21 @@ def tile_from_fs(columns, tile_index) -> tuple[pl.DataFrame, int]: tile_path = os.path.join(get_settings().grid_tiles_path, f"{z}/{tile_index}.arrow") if not os.path.exists(tile_path): raise HTTPException(status_code=404, detail=f"Tile {tile_path} not found") - try: - tile = pl.read_ipc(tile_path, columns=["cell", *columns]) - except pl.exceptions.ColumnNotFoundError: - raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None + tile = pl.scan_ipc(tile_path).select(["cell", *columns]) return tile, z +@lru_cache +def cells_in_geojson(geometry: str, cell_resolution: int) -> pl.LazyFrame: + """Return the cells that fill the polygon area in the geojson + + Geometry must be a shapely geometry, a wkt or wkb so the lru cache + can hash the parameter. + """ + cells = cells_to_string(geometry_to_cells(geometry, cell_resolution)) + return pl.LazyFrame({"cell": cells}) + + @grid_router.get( "/tile/{tile_index}", summary="Get a grid tile", @@ -53,24 +62,13 @@ def get_grid_tile( ) -> Response: """Get a tile of h3 cells with specified data columns""" tile, _ = tile_from_fs(columns, tile_index) - tile_buffer = tile.write_ipc(None) + try: + tile_buffer = tile.collect().write_ipc(None) + except pl.exceptions.ColumnNotFoundError: + raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None return Response(tile_buffer.getvalue(), media_type="application/octet-stream") -# @lru_cache -# def cells_in_geojson(geometry, cell_resolution: int) -> pl.Series: -# """Return the cells that fill the polygon area in the geojson""" -# cells = polyfill_geojson(geojson, cell_resolution) -# return pl.Series("shape_cells", cells, dtype=pl.UInt64) - - -@lru_cache -def cells_in_geojson(geometry, cell_resolution: int) -> pl.Series: - """Return the cells that fill the polygon area in the geojson""" - cells = geometry_to_cells(geometry, cell_resolution) - return pl.Series("shape_cells", cells, dtype=pl.UInt64) - - @grid_router.post("/tile/{tile_index}", summary="Get a grid tile with cells contained inside the GeoJSON") def post_grid_tile( tile_index: Annotated[str, Path(description="The `h3` index of the tile")], @@ -79,15 +77,15 @@ def post_grid_tile( [], description="Colum/s to include in the tile. If empty, it returns only cell indexes." 
), ) -> Response: + """Get a tile of h3 cells that are inside the polygon""" tile, tile_index_res = tile_from_fs(columns, tile_index) cell_res = tile_index_res + get_settings().tile_to_cell_resolution_diff geom = shapely.from_geojson(geojson.model_dump_json()) cells = cells_in_geojson(geom, cell_res) - tile = ( - tile.with_columns(pl.col("cell").h3.cells_parse()) - .filter(pl.col("cell").is_in(cells)) - .with_columns(pl.col("cell").h3.cells_to_string()) - ) + try: + tile = tile.join(cells, on="cell").collect() + except pl.exceptions.ColumnNotFoundError: + raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None if tile.is_empty(): raise HTTPException(status_code=404, detail="No data in region") tile_buffer = tile.write_ipc(None) diff --git a/api/tests/benchmark_grid_post.lua b/api/tests/benchmark_grid_post.lua new file mode 100644 index 00000000..7eab4065 --- /dev/null +++ b/api/tests/benchmark_grid_post.lua @@ -0,0 +1,40 @@ +-- command: +-- wrk -c 100 -t 10 -d 10s -s benchmark_grid_post.lua 'http://localhost:8000/grid/tile/815f7ffffffffff?columns=AMIN' + +wrk.method = "POST" +wrk.body = [[ +{ + "type": "Feature", + "properties": {}, + "geometry": { + "coordinates": [ + [ + [ + -61.113268179996055, + 8.666717320892204 + ], + [ + -61.113268179996055, + 8.505177617822142 + ], + [ + -60.86538798013957, + 8.505177617822142 + ], + [ + -60.86538798013957, + 8.666717320892204 + ], + [ + -61.113268179996055, + 8.666717320892204 + ] + ] + ], + "type": "Polygon" + } + } +]] +wrk.headers["Content-Type"] = "application/json" +wrk.headers["accept"] = "application/json" +wrk.headers["Authorization"] = "Bearer 1234" diff --git a/api/tests/benchmark_post.lua b/api/tests/benchmark_table_post.lua similarity index 100% rename from api/tests/benchmark_post.lua rename to api/tests/benchmark_table_post.lua diff --git a/api/tests/conftest.py b/api/tests/conftest.py index 2fce5818..8910dd20 100644 --- a/api/tests/conftest.py +++ b/api/tests/conftest.py @@ -1,3 +1,4 @@ +import json import os from pathlib import Path @@ -19,6 +20,30 @@ HEADERS = {"Authorization": f"Bearer {get_settings().auth_token}"} +@pytest.fixture() +def geojson() -> str: + """This geojson contains the cell 895f4261e03ffff in `grid_dataset`""" + s = json.dumps( + { + "type": "Feature", + "properties": {}, + "geometry": { + "coordinates": [ + [ + [-61.11, 8.66], + [-61.11, 8.50], + [-60.86, 8.50], + [-60.86, 8.66], + [-61.11, 8.66], + ] + ], + "type": "Polygon", + }, + } + ) + return s + + @pytest.fixture() def grid_dataset(setup_data_folder) -> str: """Create an empty binary file to be used as grid dataset stub @@ -40,11 +65,11 @@ def grid_dataset(setup_data_folder) -> str: df = pl.DataFrame( { "cell": [ - 618668968382824400, - 619428375900454900, - 619428407452893200, - 619428407943888900, - 619428407676764200, + "895f4261e03ffff", + "865f00007ffffff", + "865f0000fffffff", + "865f00017ffffff", + "865f0001fffffff", ], "landcover": [1, 4, 3, 3, 4], "population": [100, 200, 1, 900, 900], diff --git a/api/tests/test_grid.py b/api/tests/test_grid.py index b00f7337..6a6fe5e5 100644 --- a/api/tests/test_grid.py +++ b/api/tests/test_grid.py @@ -15,11 +15,11 @@ def test_grid_tile(grid_dataset): assert response.status_code == 200 assert pl.read_ipc(response.read()).to_dict(as_series=False) == { "cell": [ - 618668968382824400, - 619428375900454900, - 619428407452893200, - 619428407943888900, - 619428407676764200, + "895f4261e03ffff", + "865f00007ffffff", + "865f0000fffffff", + "865f00017ffffff", + 
"865f0001fffffff", ], "landcover": [1, 4, 3, 3, 4], "population": [100, 200, 1, 900, 900], @@ -32,11 +32,11 @@ def test_grid_tile_empty_column_param(grid_dataset): assert response.status_code == 200 assert pl.read_ipc(response.read()).to_dict(as_series=False) == { "cell": [ - 618668968382824400, - 619428375900454900, - 619428407452893200, - 619428407943888900, - 619428407676764200, + "895f4261e03ffff", + "865f00007ffffff", + "865f0000fffffff", + "865f00017ffffff", + "865f0001fffffff", ], } @@ -222,9 +222,49 @@ def test_grid_table(grid_dataset): assert response.status_code == 200 assert json.loads(response.read()) == { "cell": [ - 619428375900454900, - 618668968382824400, + "865f00007ffffff", + "895f4261e03ffff", ], "landcover": [4, 1], "population": [200, 100], } + + +def test_grid_tile_post_geojson(grid_dataset, geojson): + response = test_client.post( + f"/grid/tile/{grid_dataset}", + params={"columns": ["landcover", "population"]}, + headers=HEADERS, + content=geojson, + ) + assert response.status_code == 200 + assert pl.read_ipc(response.read()).to_dict(as_series=False) == { + "cell": [ + "895f4261e03ffff", + ], + "landcover": [1], + "population": [100], + } + + +def test_grid_tile_post_geojson_404(grid_dataset, geojson): + response = test_client.post( + "/grid/tile/8439181ffffffff", + params={"columns": ["landcover", "population"]}, + headers=HEADERS, + content=geojson, + ) + + assert response.status_code == 404 + + +def test_grid_tile_post_wrong_column(grid_dataset, geojson): + response = test_client.post( + f"/grid/tile/{grid_dataset}", + params={"columns": ["I DO NOT EXIST"]}, + headers=HEADERS, + content=geojson, + ) + + assert response.status_code == 400 + assert response.json() == {"detail": "One or more of the specified columns is not valid"} From 906e3fac71488c6457448058d562c76d1e73a5ae Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Wed, 11 Sep 2024 15:30:31 +0200 Subject: [PATCH 04/12] removes all outputs from notebooks --- science/notebooks/merge_entrega_roberto.ipynb | 168 ++++-------------- 1 file changed, 35 insertions(+), 133 deletions(-) diff --git a/science/notebooks/merge_entrega_roberto.ipynb b/science/notebooks/merge_entrega_roberto.ipynb index cc3cf618..f1ea486d 100644 --- a/science/notebooks/merge_entrega_roberto.ipynb +++ b/science/notebooks/merge_entrega_roberto.ipynb @@ -2,162 +2,69 @@ "cells": [ { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:30:08.332078Z", - "start_time": "2024-09-09T08:30:08.112470Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "import polars as pl\n", "from pathlib import Path\n", - "import h3ronpy.polars" - ], - "outputs": [], - "execution_count": 1 + "\n", + "import polars as pl" + ] }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:37:46.700129Z", - "start_time": "2024-09-09T08:37:46.697392Z" - } - }, - "source": "csvs = list(Path(\"../data/raw/ENTREGA UNO MUESTRAS HEXA CSV 18072024\").glob(\"*.CSV\"))", + "execution_count": null, + "metadata": {}, "outputs": [], - "execution_count": 39 + "source": [ + "csvs = list(Path(\"../data/raw/ENTREGA UNO MUESTRAS HEXA CSV 18072024\").glob(\"*.CSV\"))" + ] }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:37:50.987739Z", - "start_time": "2024-09-09T08:37:50.852450Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dfs = [pl.read_csv(f, separator=\";\", decimal_comma=True) for f in csvs]\n", "df = 
pl.concat(dfs, how=\"align\", rechunk=True)\n", "df.head()" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "shape: (5, 9)\n", - "┌──────────────┬───────┬────────────┬────────────┬───┬───────────┬──────────┬───────────┬──────────┐\n", - "│ GRID_ID ┆ FRECF ┆ AMIN ┆ AMAX ┆ … ┆ TREEPERCT ┆ PMIN ┆ PMAX ┆ PMEAN │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ i64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞══════════════╪═══════╪════════════╪════════════╪═══╪═══════════╪══════════╪═══════════╪══════════╡\n", - "│ 865f00007fff ┆ null ┆ 114.678246 ┆ 209.731842 ┆ … ┆ 100.0 ┆ 0.058348 ┆ 7.531753 ┆ 1.69093 │\n", - "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 865f0000ffff ┆ null ┆ 127.660339 ┆ 705.040772 ┆ … ┆ 99.985832 ┆ 0.148311 ┆ 31.043549 ┆ 6.346733 │\n", - "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 865f00017fff ┆ null ┆ 117.937508 ┆ 175.799759 ┆ … ┆ 100.0 ┆ 0.028819 ┆ 2.731335 ┆ 1.063382 │\n", - "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 865f0001ffff ┆ null ┆ 123.765045 ┆ 193.208282 ┆ … ┆ 100.0 ┆ 0.047981 ┆ 4.67722 ┆ 1.557258 │\n", - "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 865f00027fff ┆ null ┆ 111.118088 ┆ 277.398895 ┆ … ┆ 100.0 ┆ 0.144035 ┆ 12.342467 ┆ 2.193349 │\n", - "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "└──────────────┴───────┴────────────┴────────────┴───┴───────────┴──────────┴───────────┴──────────┘" - ], - "text/html": [ - "
\n", - "shape: (5, 9)
GRID_IDFRECFAMINAMAXAMEANTREEPERCTPMINPMAXPMEAN
stri64f64f64f64f64f64f64f64
"865f00007ffffff"null114.678246209.731842149.513126100.00.0583487.5317531.69093
"865f0000fffffff"null127.660339705.040772245.46101399.9858320.14831131.0435496.346733
"865f00017ffffff"null117.937508175.799759145.636984100.00.0288192.7313351.063382
"865f0001fffffff"null123.765045193.208282156.474098100.00.0479814.677221.557258
"865f00027ffffff"null111.118088277.398895146.417323100.00.14403512.3424672.193349
" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 42 + ] }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:37:51.695475Z", - "start_time": "2024-09-09T08:37:51.673626Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "df = df.with_columns(pl.col(\"GRID_ID\").h3.cells_parse())\n", "df = df.drop(\"GRID_ID\")" - ], - "outputs": [], - "execution_count": 43 + ] }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:37:52.296769Z", - "start_time": "2024-09-09T08:37:52.286054Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "df.select(pl.col(\"cell\").h3.cells_resolution()).unique()" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "shape: (1, 1)\n", - "┌────────────┐\n", - "│ resolution │\n", - "│ --- │\n", - "│ u8 │\n", - "╞════════════╡\n", - "│ 6 │\n", - "└────────────┘" - ], - "text/html": [ - "
\n", - "shape: (1, 1)
resolution
u8
6
" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 44 + ] }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:37:53.457159Z", - "start_time": "2024-09-09T08:37:53.382329Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "CELLS_RES = 6\n", "OVERVIEW_LEVEL = CELLS_RES - 5\n", "\n", "df = df.with_columns(\n", " pl.col(\"cell\").h3.change_resolution(OVERVIEW_LEVEL).h3.cells_to_string().alias(\"tile_id\"), # type: ignore[attr-defined]\n", - " pl.col(\"cell\").h3.cells_to_string()\n", + " pl.col(\"cell\").h3.cells_to_string(),\n", ")\n", "partition_dfs = df.partition_by([\"tile_id\"], as_dict=True, include_key=False)" - ], - "outputs": [], - "execution_count": 45 + ] }, { "cell_type": "markdown", @@ -168,12 +75,9 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:40:16.950037Z", - "start_time": "2024-09-09T08:40:16.897734Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "seen_tiles = set()\n", "n_cells = 0\n", @@ -184,18 +88,16 @@ " tile_id = tile_group[0]\n", " filename = Path(\"../data/processed/grid/1\") / (tile_id + \".arrow\")\n", " if tile_id in seen_tiles:\n", - " tile_df = pl.concat(\n", - " [pl.read_ipc(filename), tile_df], how=\"vertical_relaxed\"\n", - " ).unique(subset=[\"cell\"])\n", + " tile_df = pl.concat([pl.read_ipc(filename), tile_df], how=\"vertical_relaxed\").unique(\n", + " subset=[\"cell\"]\n", + " )\n", " tile_df.write_parquet(filename)\n", " n_cells += len(tile_df)\n", " else:\n", " seen_tiles.add(tile_id)\n", " tile_df.write_ipc(filename)\n", " n_cells += len(tile_df)" - ], - "outputs": [], - "execution_count": 48 + ] }, { "cell_type": "markdown", From 6b9bd425a7e750c46854d8f4ffbaf409edba98d1 Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Wed, 11 Sep 2024 15:39:56 +0200 Subject: [PATCH 05/12] Adds nbstrip to pre-commit --- .pre-commit-config.yaml | 7 ++-- science/notebooks/check_combine_results.ipynb | 36 +++++++++---------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9d983358..06290094 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,8 +16,7 @@ repos: - id: ruff-format types_or: [ python, pyi, jupyter ] - # check for private keys and passwords! 
- - repo: https://github.com/gitleaks/gitleaks - rev: v8.17.0 + - repo: https://github.com/kynan/nbstripout + rev: 0.7.1 hooks: - - id: gitleaks-docker + - id: nbstripout diff --git a/science/notebooks/check_combine_results.ipynb b/science/notebooks/check_combine_results.ipynb index 3fe61ffa..415aadc9 100644 --- a/science/notebooks/check_combine_results.ipynb +++ b/science/notebooks/check_combine_results.ipynb @@ -3,18 +3,17 @@ { "cell_type": "code", "execution_count": null, - "id": "e84373c7-a5e7-47c8-95a3-d2db7ade2e29", + "id": "0", "metadata": {}, "outputs": [], "source": [ - "import polars as pl\n", - "import polars.selectors as cs" + "import polars as pl" ] }, { "cell_type": "code", "execution_count": null, - "id": "750642ad-f9fc-434e-86f8-783cc41d533c", + "id": "1", "metadata": {}, "outputs": [], "source": [ @@ -25,16 +24,16 @@ { "cell_type": "code", "execution_count": null, - "id": "d5012a5d-ea51-4b01-8ccf-055db2feb3ec", + "id": "2", "metadata": {}, "outputs": [], "source": [ - "schema = {\"cell\":pl.UInt64, \"b\":pl.Float32, \"c\":pl.String}\n", + "schema = {\"cell\": pl.UInt64, \"b\": pl.Float32, \"c\": pl.String}\n", "df = pl.DataFrame(schema=schema)\n", "\n", - "join1 = pl.DataFrame({\"cell\": [1,2,3], \"b\": [9., 9., 9.]}, schema_overrides=schema)\n", - "join2 = pl.DataFrame({\"cell\": [1,2,3], \"c\": [\"a\", \"b\", \"c\"]}, schema_overrides=schema)\n", - "join3 = pl.DataFrame({\"cell\": [4,5,6], \"c\": [\"a\", \"b\", \"c\"]}, schema_overrides=schema)\n", + "join1 = pl.DataFrame({\"cell\": [1, 2, 3], \"b\": [9.0, 9.0, 9.0]}, schema_overrides=schema)\n", + "join2 = pl.DataFrame({\"cell\": [1, 2, 3], \"c\": [\"a\", \"b\", \"c\"]}, schema_overrides=schema)\n", + "join3 = pl.DataFrame({\"cell\": [4, 5, 6], \"c\": [\"a\", \"b\", \"c\"]}, schema_overrides=schema)\n", "\n", "tojoin = [join1, join2, join3]\n", "\n", @@ -46,24 +45,23 @@ { "cell_type": "code", "execution_count": null, - "id": "af677cc8-34c5-4259-84c2-1d90a5bf3040", + "id": "3", "metadata": {}, "outputs": [], "source": [ "import polars as pl\n", "\n", "# Define the initial DataFrame\n", - "df = pl.DataFrame({\n", - " \"cell\": [1, 2, 3, 1, 2, 3, 4, 5, 6],\n", - " \"b\": [9.0, 9.0, 9.0, None, None, None, None, None, None],\n", - " \"c\": [None, None, None, \"a\", \"b\", \"c\", \"a\", \"b\", \"c\"]\n", - "})\n", + "df = pl.DataFrame(\n", + " {\n", + " \"cell\": [1, 2, 3, 1, 2, 3, 4, 5, 6],\n", + " \"b\": [9.0, 9.0, 9.0, None, None, None, None, None, None],\n", + " \"c\": [None, None, None, \"a\", \"b\", \"c\", \"a\", \"b\", \"c\"],\n", + " }\n", + ")\n", "\n", "# Perform the group by and aggregation\n", - "agg_df = df.groupby(\"cell\").agg([\n", - " pl.col(\"b\").max().alias(\"b\"),\n", - " pl.col(\"c\").max().alias(\"c\")\n", - "])\n", + "agg_df = df.groupby(\"cell\").agg([pl.col(\"b\").max().alias(\"b\"), pl.col(\"c\").max().alias(\"c\")])\n", "\n", "# Sort the resulting DataFrame by the 'cell' column\n", "result_df = agg_df.sort(\"cell\")\n", From 2744fb8e686e8c6b7831d03bce410b164b5ebbe3 Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Thu, 12 Sep 2024 10:41:05 +0200 Subject: [PATCH 06/12] update deps and filter table by geojson --- api/app/routers/grid.py | 34 ++++++++++++------- api/requirements.txt | 62 +++++++++++++++++------------------ api/tests/test_zonal_stats.py | 1 - 3 files changed, 53 insertions(+), 44 deletions(-) diff --git a/api/app/routers/grid.py b/api/app/routers/grid.py index 2e229e33..3012283e 100644 --- a/api/app/routers/grid.py +++ b/api/app/routers/grid.py @@ -26,7 +26,7 @@ grid_router = APIRouter() 
-def tile_from_fs(columns, tile_index) -> tuple[pl.LazyFrame, int]: +def get_tile(tile_index: str, columns: list[str]) -> tuple[pl.LazyFrame, int]: """Get the tile from filesystem filtered by column and the resolution of the tile index""" try: z = h3.api.basic_str.h3_get_resolution(tile_index) @@ -54,36 +54,38 @@ def cells_in_geojson(geometry: str, cell_resolution: int) -> pl.LazyFrame: "/tile/{tile_index}", summary="Get a grid tile", ) -def get_grid_tile( +def grid_tile( tile_index: Annotated[str, Path(description="The `h3` index of the tile")], columns: list[str] = Query( [], description="Colum/s to include in the tile. If empty, it returns only cell indexes." ), ) -> Response: """Get a tile of h3 cells with specified data columns""" - tile, _ = tile_from_fs(columns, tile_index) + tile, _ = get_tile(tile_index, columns) try: tile_buffer = tile.collect().write_ipc(None) + # we don't know if the column requested are correct until we call .collect() except pl.exceptions.ColumnNotFoundError: raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None return Response(tile_buffer.getvalue(), media_type="application/octet-stream") @grid_router.post("/tile/{tile_index}", summary="Get a grid tile with cells contained inside the GeoJSON") -def post_grid_tile( +def grid_tile_in_area( tile_index: Annotated[str, Path(description="The `h3` index of the tile")], - geojson: Annotated[Feature, Body(description="GeoJSON Feature.")], + geojson: Annotated[Feature, Body(description="GeoJSON feature used to filter the cells.")], columns: list[str] = Query( [], description="Colum/s to include in the tile. If empty, it returns only cell indexes." ), ) -> Response: """Get a tile of h3 cells that are inside the polygon""" - tile, tile_index_res = tile_from_fs(columns, tile_index) + tile, tile_index_res = get_tile(tile_index, columns) cell_res = tile_index_res + get_settings().tile_to_cell_resolution_diff geom = shapely.from_geojson(geojson.model_dump_json()) cells = cells_in_geojson(geom, cell_res) try: tile = tile.join(cells, on="cell").collect() + # we don't know if the column requested are correct until we call .collect() except pl.exceptions.ColumnNotFoundError: raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None if tile.is_empty(): @@ -115,23 +117,31 @@ async def grid_dataset_metadata() -> MultiDatasetMeta: def read_table( level: Annotated[int, Query(..., description="Tile level at which the query will be computed")], filters: TableFilters = Depends(), + geojson: Annotated[Feature | None, Body(description="GeoJSON feature used to filter the cells.")] = None, ) -> ORJSONResponse: """Query tile dataset and return table data""" files_path = pathlib.Path(get_settings().grid_tiles_path) / str(level) if not files_path.exists(): raise HTTPException(404, detail=f"Level {level} does not exist") from None - lf = pl.scan_ipc(files_path.glob("*.arrow")) + + lf = pl.scan_ipc(list(files_path.glob("*.arrow"))) + + if geojson is not None: + cell_res = level + get_settings().tile_to_cell_resolution_diff + geom = shapely.from_geojson(geojson.model_dump_json()) + cells = cells_in_geojson(geom, cell_res) + lf = lf.join(cells, on="cell") + query = filters.to_sql_query("frame") log.debug(query) + try: res = pl.SQLContext(frame=lf).execute(query).collect() - except pl.exceptions.ColumnNotFoundError as e: - # bad column in order by clause + except pl.exceptions.ColumnNotFoundError as e: # bad column in order by clause 
log.exception(e) raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None - - except pl.exceptions.ComputeError as e: - # possibly raise if wrong type in compare. I'm not aware of other sources of ComputeError + except pl.exceptions.ComputeError as e: # raised if wrong type in compare. log.exception(e) raise HTTPException(status_code=422, detail=str(e)) from None + return ORJSONResponse(res.to_dict(as_series=False)) diff --git a/api/requirements.txt b/api/requirements.txt index 135b3189..471937ae 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -2,20 +2,20 @@ # uv pip compile requirements.in -o requirements.txt affine==2.4.0 # via rasterio -annotated-types==0.6.0 +annotated-types==0.7.0 # via pydantic -anyio==4.3.0 +anyio==4.4.0 # via # httpx # starlette -attrs==23.2.0 +attrs==24.2.0 # via # morecantile # rasterio # rio-tiler -cachetools==5.3.3 +cachetools==5.5.0 # via rio-tiler -certifi==2024.2.2 +certifi==2024.8.30 # via # httpcore # httpx @@ -31,17 +31,17 @@ click-plugins==1.1.1 # via rasterio cligj==0.7.2 # via rasterio -color-operations==0.1.3 +color-operations==0.1.5 # via rio-tiler -exactextract==0.2.0.dev0 +exactextract==0.2.0 # via -r requirements.in -fastapi==0.110.1 +fastapi==0.114.1 # via # -r requirements.in # titiler-core -geojson-pydantic==1.0.2 +geojson-pydantic==1.1.1 # via titiler-core -greenlet==3.0.3 +greenlet==3.1.0 # via sqlalchemy h11==0.14.0 # via @@ -53,21 +53,21 @@ h3ronpy==0.21.0 # via -r requirements.in httpcore==1.0.5 # via httpx -httpx==0.27.0 +httpx==0.27.2 # via rio-tiler -idna==3.6 +idna==3.8 # via # anyio # httpx -jinja2==3.1.3 +jinja2==3.1.4 # via titiler-core markupsafe==2.1.5 # via jinja2 -morecantile==5.3.0 +morecantile==5.4.2 # via # rio-tiler # titiler-core -numexpr==2.10.0 +numexpr==2.10.1 # via rio-tiler numpy==1.26.4 # via @@ -80,13 +80,13 @@ numpy==1.26.4 # shapely # snuggs # titiler-core -orjson==3.10.0 +orjson==3.10.7 # via -r requirements.in -polars==1.1.0 +polars==1.7.0 # via -r requirements.in pyarrow==17.0.0 # via h3ronpy -pydantic==2.6.4 +pydantic==2.9.1 # via # fastapi # geojson-pydantic @@ -95,33 +95,33 @@ pydantic==2.6.4 # pydantic-settings # rio-tiler # titiler-core -pydantic-core==2.16.3 +pydantic-core==2.23.3 # via pydantic pydantic-extra-types==2.9.0 # via -r requirements.in -pydantic-settings==2.2.1 +pydantic-settings==2.5.2 # via -r requirements.in -pyparsing==3.1.2 +pyparsing==3.1.4 # via snuggs pyproj==3.6.1 # via morecantile -pystac==1.10.0 +pystac==1.10.1 # via rio-tiler python-dateutil==2.9.0.post0 # via pystac python-dotenv==1.0.1 # via pydantic-settings -rasterio==1.3.9 +rasterio==1.3.11 # via # rio-tiler # titiler-core -rio-tiler==6.4.5 +rio-tiler==6.7.0 # via titiler-core -setuptools==69.2.0 +setuptools==74.1.2 # via rasterio shapely==2.0.6 # via h3ronpy -simplejson==3.19.2 +simplejson==3.19.3 # via titiler-core six==1.16.0 # via python-dateutil @@ -131,18 +131,18 @@ sniffio==1.3.1 # httpx snuggs==1.4.7 # via rasterio -sqlalchemy==2.0.31 +sqlalchemy==2.0.34 # via -r requirements.in -starlette==0.37.2 +starlette==0.38.5 # via fastapi -titiler-core==0.18.0 +titiler-core==0.18.6 # via -r requirements.in -typing-extensions==4.11.0 +typing-extensions==4.12.2 # via # fastapi # pydantic # pydantic-core # sqlalchemy # titiler-core -uvicorn==0.29.0 +uvicorn==0.30.6 # via -r requirements.in diff --git a/api/tests/test_zonal_stats.py b/api/tests/test_zonal_stats.py index 13236dca..ef3078b1 100644 --- a/api/tests/test_zonal_stats.py +++ b/api/tests/test_zonal_stats.py 
@@ -42,7 +42,6 @@ def test_no_geojson_raises_422(tif_file): "loc": ["body"], "msg": "Field required", "type": "missing", - "url": "https://errors.pydantic.dev/2.6/v/missing", } ] } From 8fb0e896f033a1f4bbc1103b9c0243f16423448f Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Thu, 12 Sep 2024 10:47:55 +0200 Subject: [PATCH 07/12] Adds test for table results with geojson filter --- api/tests/test_grid.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/api/tests/test_grid.py b/api/tests/test_grid.py index 6a6fe5e5..fb7726d4 100644 --- a/api/tests/test_grid.py +++ b/api/tests/test_grid.py @@ -211,14 +211,14 @@ def test_table_filters_multiple_filters(): def test_grid_table(grid_dataset): - filters = [ - {"filter_type": "numerical", "column_name": "population", "operation": "lte", "value": 200}, - {"filter_type": "numerical", "column_name": "population", "operation": "gt", "value": 1}, - ] + body = { + "filters": [ + {"filter_type": "numerical", "column_name": "population", "operation": "lte", "value": 200}, + {"filter_type": "numerical", "column_name": "population", "operation": "gt", "value": 1}, + ] + } - response = test_client.post( - "/grid/table?level=4&order_by=-population", headers=HEADERS, content=json.dumps(filters) - ) + response = test_client.post("/grid/table?level=4&order_by=-population", headers=HEADERS, content=json.dumps(body)) assert response.status_code == 200 assert json.loads(response.read()) == { "cell": [ @@ -230,6 +230,25 @@ def test_grid_table(grid_dataset): } +def test_grid_table_geojson(grid_dataset, geojson): + body = { + "filters": [ + {"filter_type": "numerical", "column_name": "population", "operation": "lte", "value": 200}, + {"filter_type": "numerical", "column_name": "population", "operation": "gt", "value": 1}, + ], + "geojson": json.loads(geojson), + } + response = test_client.post("/grid/table?level=4&order_by=-population", headers=HEADERS, content=json.dumps(body)) + assert response.status_code == 200 + assert json.loads(response.read()) == { + "cell": [ + "895f4261e03ffff", + ], + "landcover": [1], + "population": [100], + } + + def test_grid_tile_post_geojson(grid_dataset, geojson): response = test_client.post( f"/grid/tile/{grid_dataset}", From 5b543e10b3a142564c2f8772d6d1d73cdd296b99 Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Thu, 12 Sep 2024 12:20:08 +0200 Subject: [PATCH 08/12] Tidy up the table response model --- api/app/models/grid.py | 9 +++++++++ api/app/routers/grid.py | 7 +++---- api/tests/test_grid.py | 25 +++++++++++-------------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/api/app/models/grid.py b/api/app/models/grid.py index f1ea4b3f..17ce102d 100644 --- a/api/app/models/grid.py +++ b/api/app/models/grid.py @@ -136,3 +136,12 @@ def to_sql_query(self, table_name: str) -> str: ) ) return str(query.compile(compile_kwargs={"literal_binds": True})) + + +class TableResultColumn(BaseModel): + column: Annotated[str, Field(title="column", description="Column name")] + values: Annotated[list, Field(description="Check dataset metadata for type info")] + + +class TableResults(BaseModel): + table: list[TableResultColumn] diff --git a/api/app/routers/grid.py b/api/app/routers/grid.py index 3012283e..c4e84e40 100644 --- a/api/app/routers/grid.py +++ b/api/app/routers/grid.py @@ -10,7 +10,6 @@ import shapely from fastapi import APIRouter, Depends, HTTPException, Path, Query from fastapi.params import Body -from fastapi.responses import ORJSONResponse from geojson_pydantic import 
Feature from h3 import H3CellError from h3ronpy.polars import cells_to_string @@ -19,7 +18,7 @@ from starlette.responses import Response from app.config.config import get_settings -from app.models.grid import MultiDatasetMeta, TableFilters +from app.models.grid import MultiDatasetMeta, TableFilters, TableResults log = logging.getLogger("uvicorn.error") @@ -118,7 +117,7 @@ def read_table( level: Annotated[int, Query(..., description="Tile level at which the query will be computed")], filters: TableFilters = Depends(), geojson: Annotated[Feature | None, Body(description="GeoJSON feature used to filter the cells.")] = None, -) -> ORJSONResponse: +) -> TableResults: """Query tile dataset and return table data""" files_path = pathlib.Path(get_settings().grid_tiles_path) / str(level) if not files_path.exists(): @@ -144,4 +143,4 @@ def read_table( log.exception(e) raise HTTPException(status_code=422, detail=str(e)) from None - return ORJSONResponse(res.to_dict(as_series=False)) + return TableResults(table=[{"column": k, "values": v} for k, v in res.to_dict(as_series=False).items()]) diff --git a/api/tests/test_grid.py b/api/tests/test_grid.py index fb7726d4..b75c254c 100644 --- a/api/tests/test_grid.py +++ b/api/tests/test_grid.py @@ -221,12 +221,11 @@ def test_grid_table(grid_dataset): response = test_client.post("/grid/table?level=4&order_by=-population", headers=HEADERS, content=json.dumps(body)) assert response.status_code == 200 assert json.loads(response.read()) == { - "cell": [ - "865f00007ffffff", - "895f4261e03ffff", - ], - "landcover": [4, 1], - "population": [200, 100], + "table": [ + {"column": "cell", "values": ["865f00007ffffff", "895f4261e03ffff"]}, + {"column": "landcover", "values": [4, 1]}, + {"column": "population", "values": [200, 100]}, + ] } @@ -241,11 +240,11 @@ def test_grid_table_geojson(grid_dataset, geojson): response = test_client.post("/grid/table?level=4&order_by=-population", headers=HEADERS, content=json.dumps(body)) assert response.status_code == 200 assert json.loads(response.read()) == { - "cell": [ - "895f4261e03ffff", - ], - "landcover": [1], - "population": [100], + "table": [ + {"column": "cell", "values": ["895f4261e03ffff"]}, + {"column": "landcover", "values": [1]}, + {"column": "population", "values": [100]}, + ] } @@ -258,9 +257,7 @@ def test_grid_tile_post_geojson(grid_dataset, geojson): ) assert response.status_code == 200 assert pl.read_ipc(response.read()).to_dict(as_series=False) == { - "cell": [ - "895f4261e03ffff", - ], + "cell": ["895f4261e03ffff"], "landcover": [1], "population": [100], } From c3b2d0e379f53225c3ab55278fd6c9a4f433e039 Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Thu, 12 Sep 2024 15:52:54 +0200 Subject: [PATCH 09/12] Improve response documentation in OpenAPI --- api/app/routers/grid.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/api/app/routers/grid.py b/api/app/routers/grid.py index c4e84e40..26fd4459 100644 --- a/api/app/routers/grid.py +++ b/api/app/routers/grid.py @@ -10,12 +10,12 @@ import shapely from fastapi import APIRouter, Depends, HTTPException, Path, Query from fastapi.params import Body +from fastapi.responses import Response from geojson_pydantic import Feature from h3 import H3CellError from h3ronpy.polars import cells_to_string from h3ronpy.polars.vector import geometry_to_cells from pydantic import ValidationError -from starlette.responses import Response from app.config.config import get_settings from app.models.grid import MultiDatasetMeta, 
TableFilters, TableResults @@ -24,12 +24,21 @@ grid_router = APIRouter() +tile_exception_responses = { + 400: {"description": "Column does not exist or tile_index is not valid h3 index."}, + 404: {"description": "Tile does not exist or is empty"}, +} + + +class ArrowIPCResponse(Response): # noqa: D101 + media_type = "application/octet-stream" + def get_tile(tile_index: str, columns: list[str]) -> tuple[pl.LazyFrame, int]: """Get the tile from filesystem filtered by column and the resolution of the tile index""" try: z = h3.api.basic_str.h3_get_resolution(tile_index) - except H3CellError: + except (H3CellError, ValueError): raise HTTPException(status_code=400, detail="Tile index is not a valid H3 cell") from None tile_path = os.path.join(get_settings().grid_tiles_path, f"{z}/{tile_index}.arrow") if not os.path.exists(tile_path): @@ -52,13 +61,16 @@ def cells_in_geojson(geometry: str, cell_resolution: int) -> pl.LazyFrame: @grid_router.get( "/tile/{tile_index}", summary="Get a grid tile", + response_class=ArrowIPCResponse, + response_description="Arrow IPC table", + responses=tile_exception_responses, ) def grid_tile( tile_index: Annotated[str, Path(description="The `h3` index of the tile")], columns: list[str] = Query( [], description="Colum/s to include in the tile. If empty, it returns only cell indexes." ), -) -> Response: +) -> ArrowIPCResponse: """Get a tile of h3 cells with specified data columns""" tile, _ = get_tile(tile_index, columns) try: @@ -66,17 +78,23 @@ def grid_tile( # we don't know if the column requested are correct until we call .collect() except pl.exceptions.ColumnNotFoundError: raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None - return Response(tile_buffer.getvalue(), media_type="application/octet-stream") + return ArrowIPCResponse(tile_buffer.getvalue()) -@grid_router.post("/tile/{tile_index}", summary="Get a grid tile with cells contained inside the GeoJSON") +@grid_router.post( + "/tile/{tile_index}", + summary="Get a grid tile with cells contained inside the GeoJSON", + response_class=ArrowIPCResponse, + response_description="Arrow IPC table", + responses=tile_exception_responses, +) def grid_tile_in_area( tile_index: Annotated[str, Path(description="The `h3` index of the tile")], geojson: Annotated[Feature, Body(description="GeoJSON feature used to filter the cells.")], columns: list[str] = Query( [], description="Colum/s to include in the tile. If empty, it returns only cell indexes." 
), -) -> Response: +) -> ArrowIPCResponse: """Get a tile of h3 cells that are inside the polygon""" tile, tile_index_res = get_tile(tile_index, columns) cell_res = tile_index_res + get_settings().tile_to_cell_resolution_diff @@ -90,7 +108,7 @@ def grid_tile_in_area( if tile.is_empty(): raise HTTPException(status_code=404, detail="No data in region") tile_buffer = tile.write_ipc(None) - return Response(tile_buffer.getvalue(), media_type="application/octet-stream") + return ArrowIPCResponse(tile_buffer.getvalue()) @grid_router.get( @@ -116,7 +134,7 @@ async def grid_dataset_metadata() -> MultiDatasetMeta: def read_table( level: Annotated[int, Query(..., description="Tile level at which the query will be computed")], filters: TableFilters = Depends(), - geojson: Annotated[Feature | None, Body(description="GeoJSON feature used to filter the cells.")] = None, + geojson: Feature | None = None, ) -> TableResults: """Query tile dataset and return table data""" files_path = pathlib.Path(get_settings().grid_tiles_path) / str(level) From 50c8d32da9c6454f591cbe24b1c9fb7786aac212 Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Fri, 13 Sep 2024 09:04:44 +0200 Subject: [PATCH 10/12] Update compose call to use the compose subcommand in the ci runner --- .github/workflows/cicd.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index d48f841a..fd3bfd10 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -20,10 +20,10 @@ jobs: uses: docker/setup-buildx-action@v1 - name: Build and run tests - run: docker-compose up --build --exit-code-from test test + run: docker compose up --build --exit-code-from test test - name: Clean up - run: docker-compose down + run: docker compose down deploy: name: Deploy @@ -40,8 +40,8 @@ jobs: script: | cd amazonia-360 git pull --rebase - sudo docker-compose down - sudo docker-compose up -d api --build + sudo docker compose down + sudo docker compose up -d api --build health-check: name: Health Check From c6b316a37a70b2753444d27ac9eabceee1a94ae1 Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Fri, 13 Sep 2024 16:05:47 +0200 Subject: [PATCH 11/12] small comment --- api/app/routers/grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/app/routers/grid.py b/api/app/routers/grid.py index 26fd4459..0d1f1d13 100644 --- a/api/app/routers/grid.py +++ b/api/app/routers/grid.py @@ -20,7 +20,7 @@ from app.config.config import get_settings from app.models.grid import MultiDatasetMeta, TableFilters, TableResults -log = logging.getLogger("uvicorn.error") +log = logging.getLogger("uvicorn.error") # Show the logs in the uvicorn runner logs grid_router = APIRouter() From d5ae18bbe01f3a4813db9cd43c0201fdb17ca599 Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Wed, 25 Sep 2024 10:50:15 +0200 Subject: [PATCH 12/12] improve benchmark by using multiple geojson so we can check caching --- api/tests/benchmark_grid_post.lua | 121 ++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 7 deletions(-) diff --git a/api/tests/benchmark_grid_post.lua b/api/tests/benchmark_grid_post.lua index 7eab4065..ea2083cf 100644 --- a/api/tests/benchmark_grid_post.lua +++ b/api/tests/benchmark_grid_post.lua @@ -1,9 +1,9 @@ -- command: -- wrk -c 100 -t 10 -d 10s -s benchmark_grid_post.lua 'http://localhost:8000/grid/tile/815f7ffffffffff?columns=AMIN' -wrk.method = "POST" -wrk.body = [[ -{ +local geojsons = { + [[ + { "type": "Feature", "properties": {}, "geometry": { @@ -34,7 +34,114 
@@ wrk.body = [[ "type": "Polygon" } } -]] -wrk.headers["Content-Type"] = "application/json" -wrk.headers["accept"] = "application/json" -wrk.headers["Authorization"] = "Bearer 1234" + ]], + [[ + { + "type": "Feature", + "properties": {}, + "geometry": { + "coordinates": [ + [ + [ + -66.98965634041855, + -2.552105344245007 + ], + [ + -66.98965634041855, + -6.931424712822178 + ], + [ + -60.673596725229004, + -6.931424712822178 + ], + [ + -60.673596725229004, + -2.552105344245007 + ], + [ + -66.98965634041855, + -2.552105344245007 + ] + ] + ], + "type": "Polygon" + } + } + ]], + [[ + { + "type": "Feature", + "properties": {}, + "geometry": { + "coordinates": [ + [ + [ + -59.40141593993765, + -0.8180702598489091 + ], + [ + -59.40141593993765, + -3.8038880006152453 + ], + [ + -56.08276971246181, + -3.8038880006152453 + ], + [ + -56.08276971246181, + -0.8180702598489091 + ], + [ + -59.40141593993765, + -0.8180702598489091 + ] + ] + ], + "type": "Polygon" + } + } + ]], + [[ + { + "type": "Feature", + "properties": {}, + "geometry": { + "coordinates": [ + [ + [ + -68.36016539573357, + -3.4797077655746023 + ], + [ + -68.36016539573357, + -10.328634044400019 + ], + [ + -60.34168576692953, + -10.328634044400019 + ], + [ + -60.34168576692953, + -3.4797077655746023 + ], + [ + -68.36016539573357, + -3.4797077655746023 + ] + ] + ], + "type": "Polygon" + } + } + ]] +} + + +request = function() + wrk.method = "POST" + wrk.body = geojsons[math.random(1, #geojsons)] + wrk.headers["Content-Type"] = "application/json" + wrk.headers["accept"] = "application/json" + wrk.headers["Authorization"] = "Bearer 1234" + return wrk.format() +end
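
For reference, the POST tile endpoint exercised by this benchmark can also be driven from
Python. The sketch below is illustrative only: it mirrors the wrk setup above, assuming a
server running on localhost:8000 with "1234" as a valid bearer token, and it reuses the tile
index (815f7ffffffffff), the AMIN column, and a GeoJSON polygon from the patches above. It
also assumes httpx and polars are available on the client side.

    import json

    import httpx
    import polars as pl

    # GeoJSON feature used to filter the tile cells (same polygon as the test fixture).
    geojson = {
        "type": "Feature",
        "properties": {},
        "geometry": {
            "type": "Polygon",
            "coordinates": [
                [[-61.11, 8.66], [-61.11, 8.50], [-60.86, 8.50], [-60.86, 8.66], [-61.11, 8.66]]
            ],
        },
    }

    # POST /grid/tile/{tile_index} returns the cells of the tile that fall inside the
    # geometry, serialized as an Arrow IPC table.
    resp = httpx.post(
        "http://localhost:8000/grid/tile/815f7ffffffffff",
        params={"columns": ["AMIN"]},
        headers={"Authorization": "Bearer 1234", "Content-Type": "application/json"},
        content=json.dumps(geojson),
    )
    resp.raise_for_status()

    # The Arrow IPC body loads straight into a polars DataFrame, the same way the tests
    # in api/tests/test_grid.py read the responses.
    df = pl.read_ipc(resp.content)
    print(df.head())

Returning Arrow IPC instead of JSON keeps the tile payload columnar end to end, which is why
both the tests and this sketch can hand the raw response bytes directly to pl.read_ipc().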