Skip to content

Commit

Permalink
Speed up requests by caching the GeoJSON fill result and improving queries
Browse files Browse the repository at this point in the history
  • Loading branch information
BielStela committed Sep 11, 2024
1 parent d0e7a69 commit e1f7c41
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 42 deletions.
48 changes: 23 additions & 25 deletions api/app/routers/grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from fastapi.responses import ORJSONResponse
from geojson_pydantic import Feature
from h3 import H3CellError
from h3ronpy.polars import cells_to_string
from h3ronpy.polars.vector import geometry_to_cells
from pydantic import ValidationError
from starlette.responses import Response
Expand All @@ -25,7 +26,7 @@
grid_router = APIRouter()


def tile_from_fs(columns, tile_index) -> tuple[pl.DataFrame, int]:
def tile_from_fs(columns, tile_index) -> tuple[pl.LazyFrame, int]:
"""Get the tile from filesystem filtered by column and the resolution of the tile index"""
try:
z = h3.api.basic_str.h3_get_resolution(tile_index)
Expand All @@ -34,13 +35,21 @@ def tile_from_fs(columns, tile_index) -> tuple[pl.DataFrame, int]:
tile_path = os.path.join(get_settings().grid_tiles_path, f"{z}/{tile_index}.arrow")
if not os.path.exists(tile_path):
raise HTTPException(status_code=404, detail=f"Tile {tile_path} not found")
try:
tile = pl.read_ipc(tile_path, columns=["cell", *columns])
except pl.exceptions.ColumnNotFoundError:
raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None
tile = pl.scan_ipc(tile_path).select(["cell", *columns])
return tile, z


@lru_cache
def cells_in_geojson(geometry: str, cell_resolution: int) -> pl.LazyFrame:
    """Compute the cells that fill ``geometry`` at ``cell_resolution``.

    The call is memoised with ``lru_cache``, so ``geometry`` has to be
    hashable; the accepted inputs are documented as a shapely geometry,
    a wkt or a wkb.

    NOTE(review): the parameter is annotated as ``str`` although a shapely
    geometry is listed as a valid input -- confirm the intended type.

    Returns:
        A lazy frame with a single ``cell`` column holding the cells after
        conversion by ``cells_to_string``.
    """
    filling_cells = geometry_to_cells(geometry, cell_resolution)
    cell_names = cells_to_string(filling_cells)
    return pl.LazyFrame({"cell": cell_names})


@grid_router.get(
"/tile/{tile_index}",
summary="Get a grid tile",
Expand All @@ -53,24 +62,13 @@ def get_grid_tile(
) -> Response:
"""Get a tile of h3 cells with specified data columns"""
tile, _ = tile_from_fs(columns, tile_index)
tile_buffer = tile.write_ipc(None)
try:
tile_buffer = tile.collect().write_ipc(None)
except pl.exceptions.ColumnNotFoundError:
raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None
return Response(tile_buffer.getvalue(), media_type="application/octet-stream")


# @lru_cache
# def cells_in_geojson(geometry, cell_resolution: int) -> pl.Series:
# """Return the cells that fill the polygon area in the geojson"""
# cells = polyfill_geojson(geojson, cell_resolution)
# return pl.Series("shape_cells", cells, dtype=pl.UInt64)


@lru_cache
def cells_in_geojson(geometry, cell_resolution: int) -> pl.Series:
    """Return the cells filling ``geometry`` as a ``UInt64`` series named ``shape_cells``.

    Results are memoised by ``lru_cache``, keyed on the two arguments, so
    ``geometry`` must be hashable.
    """
    filling_cells = geometry_to_cells(geometry, cell_resolution)
    return pl.Series(name="shape_cells", values=filling_cells, dtype=pl.UInt64)


@grid_router.post("/tile/{tile_index}", summary="Get a grid tile with cells contained inside the GeoJSON")
def post_grid_tile(
tile_index: Annotated[str, Path(description="The `h3` index of the tile")],
Expand All @@ -79,15 +77,15 @@ def post_grid_tile(
[], description="Colum/s to include in the tile. If empty, it returns only cell indexes."
),
) -> Response:
"""Get a tile of h3 cells that are inside the polygon"""
tile, tile_index_res = tile_from_fs(columns, tile_index)
cell_res = tile_index_res + get_settings().tile_to_cell_resolution_diff
geom = shapely.from_geojson(geojson.model_dump_json())
cells = cells_in_geojson(geom, cell_res)
tile = (
tile.with_columns(pl.col("cell").h3.cells_parse())
.filter(pl.col("cell").is_in(cells))
.with_columns(pl.col("cell").h3.cells_to_string())
)
try:
tile = tile.join(cells, on="cell").collect()
except pl.exceptions.ColumnNotFoundError:
raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None
if tile.is_empty():
raise HTTPException(status_code=404, detail="No data in region")
tile_buffer = tile.write_ipc(None)
Expand Down
40 changes: 40 additions & 0 deletions api/tests/benchmark_grid_post.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
-- wrk script: configures every request of the benchmark as a POST that
-- carries a GeoJSON Feature (a single Polygon) as its body.
--
-- command:
-- wrk -c 100 -t 10 -d 10s -s benchmark_grid_post.lua 'http://localhost:8000/grid/tile/815f7ffffffffff?columns=AMIN'

-- HTTP method used for all requests.
wrk.method = "POST"
-- Request body: a GeoJSON Feature whose geometry is one closed polygon ring
-- (the first and last coordinate pairs are identical).
wrk.body = [[
{
"type": "Feature",
"properties": {},
"geometry": {
"coordinates": [
[
[
-61.113268179996055,
8.666717320892204
],
[
-61.113268179996055,
8.505177617822142
],
[
-60.86538798013957,
8.505177617822142
],
[
-60.86538798013957,
8.666717320892204
],
[
-61.113268179996055,
8.666717320892204
]
]
],
"type": "Polygon"
}
}
]]
-- Headers sent with every request: JSON payload, JSON accepted, and a bearer token.
-- NOTE(review): "1234" presumably has to match the API's configured auth token -- confirm.
wrk.headers["Content-Type"] = "application/json"
wrk.headers["accept"] = "application/json"
wrk.headers["Authorization"] = "Bearer 1234"
File renamed without changes.
35 changes: 30 additions & 5 deletions api/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import os
from pathlib import Path

Expand All @@ -19,6 +20,30 @@
HEADERS = {"Authorization": f"Bearer {get_settings().auth_token}"}


@pytest.fixture()
def geojson() -> str:
    """Serialized GeoJSON feature whose polygon contains the cell 895f4261e03ffff in `grid_dataset`"""
    # Closed ring: the last vertex repeats the first one.
    ring = [
        [-61.11, 8.66],
        [-61.11, 8.50],
        [-60.86, 8.50],
        [-60.86, 8.66],
        [-61.11, 8.66],
    ]
    feature = {
        "type": "Feature",
        "properties": {},
        "geometry": {
            "coordinates": [ring],
            "type": "Polygon",
        },
    }
    return json.dumps(feature)


@pytest.fixture()
def grid_dataset(setup_data_folder) -> str:
"""Create an empty binary file to be used as grid dataset stub
Expand All @@ -40,11 +65,11 @@ def grid_dataset(setup_data_folder) -> str:
df = pl.DataFrame(
{
"cell": [
618668968382824400,
619428375900454900,
619428407452893200,
619428407943888900,
619428407676764200,
"895f4261e03ffff",
"865f00007ffffff",
"865f0000fffffff",
"865f00017ffffff",
"865f0001fffffff",
],
"landcover": [1, 4, 3, 3, 4],
"population": [100, 200, 1, 900, 900],
Expand Down
64 changes: 52 additions & 12 deletions api/tests/test_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ def test_grid_tile(grid_dataset):
assert response.status_code == 200
assert pl.read_ipc(response.read()).to_dict(as_series=False) == {
"cell": [
618668968382824400,
619428375900454900,
619428407452893200,
619428407943888900,
619428407676764200,
"895f4261e03ffff",
"865f00007ffffff",
"865f0000fffffff",
"865f00017ffffff",
"865f0001fffffff",
],
"landcover": [1, 4, 3, 3, 4],
"population": [100, 200, 1, 900, 900],
Expand All @@ -32,11 +32,11 @@ def test_grid_tile_empty_column_param(grid_dataset):
assert response.status_code == 200
assert pl.read_ipc(response.read()).to_dict(as_series=False) == {
"cell": [
618668968382824400,
619428375900454900,
619428407452893200,
619428407943888900,
619428407676764200,
"895f4261e03ffff",
"865f00007ffffff",
"865f0000fffffff",
"865f00017ffffff",
"865f0001fffffff",
],
}

Expand Down Expand Up @@ -222,9 +222,49 @@ def test_grid_table(grid_dataset):
assert response.status_code == 200
assert json.loads(response.read()) == {
"cell": [
619428375900454900,
618668968382824400,
"865f00007ffffff",
"895f4261e03ffff",
],
"landcover": [4, 1],
"population": [200, 100],
}


def test_grid_tile_post_geojson(grid_dataset, geojson):
    """POSTing the geojson to the tile returns only the matching cell and the requested columns."""
    expected = {
        "cell": [
            "895f4261e03ffff",
        ],
        "landcover": [1],
        "population": [100],
    }
    url = f"/grid/tile/{grid_dataset}"
    query = {"columns": ["landcover", "population"]}

    response = test_client.post(url, params=query, headers=HEADERS, content=geojson)

    assert response.status_code == 200
    received = pl.read_ipc(response.read())
    assert received.to_dict(as_series=False) == expected


def test_grid_tile_post_geojson_404(grid_dataset, geojson):
    """A POST against tile 8439181ffffffff is answered with a 404."""
    query = {"columns": ["landcover", "population"]}
    response = test_client.post(
        "/grid/tile/8439181ffffffff",
        params=query,
        headers=HEADERS,
        content=geojson,
    )
    assert response.status_code == 404


def test_grid_tile_post_wrong_column(grid_dataset, geojson):
    """Asking for a column that is not in the tile yields a 400 with an explanatory detail."""
    url = f"/grid/tile/{grid_dataset}"
    query = {"columns": ["I DO NOT EXIST"]}
    expected_body = {"detail": "One or more of the specified columns is not valid"}

    response = test_client.post(url, params=query, headers=HEADERS, content=geojson)

    assert response.status_code == 400
    assert response.json() == expected_body

0 comments on commit e1f7c41

Please sign in to comment.