Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter cells by region #66

Merged
merged 12 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/cicd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ jobs:
uses: docker/setup-buildx-action@v1

- name: Build and run tests
run: docker-compose up --build --exit-code-from test test
run: docker compose up --build --exit-code-from test test

- name: Clean up
run: docker-compose down
run: docker compose down

deploy:
name: Deploy
Expand All @@ -40,8 +40,8 @@ jobs:
script: |
cd amazonia-360
git pull --rebase
sudo docker-compose down
sudo docker-compose up -d api --build
sudo docker compose down
sudo docker compose up -d api --build

health-check:
name: Health Check
Expand Down
7 changes: 3 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ repos:
- id: ruff-format
types_or: [ python, pyi, jupyter ]

# check for private keys and passwords!
- repo: https://github.com/gitleaks/gitleaks
rev: v8.17.0
- repo: https://github.com/kynan/nbstripout
rev: 0.7.1
hooks:
- id: gitleaks-docker
- id: nbstripout
1 change: 1 addition & 0 deletions api/app/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class Settings(BaseSettings):
auth_token: str
tiff_path: str
grid_tiles_path: str
tile_to_cell_resolution_diff: int = 5


@lru_cache
Expand Down
9 changes: 9 additions & 0 deletions api/app/models/grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,12 @@ def to_sql_query(self, table_name: str) -> str:
)
)
return str(query.compile(compile_kwargs={"literal_binds": True}))


class TableResultColumn(BaseModel):
    # One column of a table result: the column name plus its values.
    # Documented with comments (not a docstring) so the generated schema
    # description of the model is left exactly as it was.

    # Name of the column.
    column: Annotated[
        str,
        Field(title="column", description="Column name"),
    ]
    # Values of the column; element type is not constrained here.
    values: Annotated[
        list,
        Field(description="Check dataset metadata for type info"),
    ]


class TableResults(BaseModel):
    # Table payload expressed column-wise: one TableResultColumn per column.
    # Documented with comments (not a docstring) so the generated schema
    # description of the model is left exactly as it was.

    table: list[TableResultColumn]
119 changes: 97 additions & 22 deletions api/app/routers/grid.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,114 @@
import logging
import os
import pathlib
from functools import lru_cache
from typing import Annotated

import h3
import h3ronpy.polars # noqa: F401
import polars as pl
import shapely
from fastapi import APIRouter, Depends, HTTPException, Path, Query
from fastapi.responses import ORJSONResponse
from fastapi.params import Body
from fastapi.responses import Response
from geojson_pydantic import Feature
from h3 import H3CellError
from h3ronpy.polars import cells_to_string
from h3ronpy.polars.vector import geometry_to_cells
from pydantic import ValidationError
from starlette.responses import Response

from app.config.config import get_settings
from app.models.grid import MultiDatasetMeta, TableFilters
from app.models.grid import MultiDatasetMeta, TableFilters, TableResults

log = logging.getLogger("uvicorn.error")
log = logging.getLogger("uvicorn.error") # Show the logs in the uvicorn runner logs

grid_router = APIRouter()

# Extra error responses advertised by the tile endpoints, keyed by HTTP
# status code; passed as ``responses=`` to the route decorators.
tile_exception_responses = {
    400: {
        "description": "Column does not exist or tile_index is not valid h3 index.",
    },
    404: {
        "description": "Tile does not exist or is empty",
    },
}


class ArrowIPCResponse(Response):
    """Response subclass that serves its body as ``application/octet-stream``.

    Used as the response class of the tile endpoints, whose payload is
    described there as an Arrow IPC table.
    """

    media_type = "application/octet-stream"


def get_tile(tile_index: str, columns: list[str]) -> tuple[pl.LazyFrame, int]:
    """Open a tile file lazily and report the resolution of its index.

    The tile is looked up at ``<grid_tiles_path>/<resolution>/<tile_index>.arrow``
    where ``grid_tiles_path`` comes from the application settings.

    Args:
        tile_index: H3 index (string form) identifying the tile.
        columns: Data columns to select in addition to ``cell``.

    Returns:
        A tuple ``(tile, resolution)``: a lazy frame selecting ``cell`` plus
        the requested columns, and the H3 resolution of ``tile_index``.
        Because the frame is lazy, an unknown column is only reported when
        the caller collects it.

    Raises:
        HTTPException: 400 if ``tile_index`` is not a valid H3 cell,
            404 if no tile file exists for it.
    """
    try:
        resolution = h3.api.basic_str.h3_get_resolution(tile_index)
    except (H3CellError, ValueError):
        raise HTTPException(status_code=400, detail="Tile index is not a valid H3 cell") from None

    tiles_root = get_settings().grid_tiles_path
    tile_path = os.path.join(tiles_root, f"{resolution}/{tile_index}.arrow")
    if not os.path.exists(tile_path):
        # NOTE(review): the detail exposes the server-side file path to the
        # client -- confirm this is intended before relying on it.
        raise HTTPException(status_code=404, detail=f"Tile {tile_path} not found")

    selected = ["cell", *columns]
    lazy_tile = pl.scan_ipc(tile_path).select(selected)
    return lazy_tile, resolution


@lru_cache
def cells_in_geojson(geometry: shapely.Geometry, cell_resolution: int) -> pl.LazyFrame:
    """Return the H3 cells that fill the area of ``geometry``.

    The result is memoized with ``lru_cache``, so both arguments must be
    hashable. That is why this function takes an already-parsed geometry
    instead of a GeoJSON model or dict.

    Args:
        geometry: Area to fill with cells. The call sites in this module
            pass the shapely geometry produced by ``shapely.from_geojson``.
            NOTE(review): the previous docstring also mentioned WKT/WKB
            input; whether ``geometry_to_cells`` accepts those is not
            verified here.
        cell_resolution: H3 resolution of the cells to generate.

    Returns:
        A lazy frame with a single ``cell`` column holding the cell indexes
        as strings, suitable for joining against a tile on ``cell``.
    """
    cell_indexes = geometry_to_cells(geometry, cell_resolution)
    return pl.LazyFrame({"cell": cells_to_string(cell_indexes)})


@grid_router.get(
"/tile/{tile_index}",
summary="Get a grid tile",
response_class=ArrowIPCResponse,
response_description="Arrow IPC table",
responses=tile_exception_responses,
)
async def grid_tile(
def grid_tile(
tile_index: Annotated[str, Path(description="The `h3` index of the tile")],
columns: list[str] = Query(
[], description="Colum/s to include in the tile. If empty, it returns only cell indexes."
),
) -> Response:
) -> ArrowIPCResponse:
"""Get a tile of h3 cells with specified data columns"""
tile, _ = get_tile(tile_index, columns)
try:
z = h3.api.basic_str.h3_get_resolution(tile_index)
except H3CellError:
raise HTTPException(status_code=400, detail="Tile index is not a valid H3 cell") from None
tile_path = os.path.join(get_settings().grid_tiles_path, f"{z}/{tile_index}.arrow")
if not os.path.exists(tile_path):
raise HTTPException(status_code=404, detail=f"Tile {tile_path} not found")
tile_buffer = tile.collect().write_ipc(None)
# we don't know if the column requested are correct until we call .collect()
except pl.exceptions.ColumnNotFoundError:
raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None
return ArrowIPCResponse(tile_buffer.getvalue())


@grid_router.post(
"/tile/{tile_index}",
summary="Get a grid tile with cells contained inside the GeoJSON",
response_class=ArrowIPCResponse,
response_description="Arrow IPC table",
responses=tile_exception_responses,
)
def grid_tile_in_area(
tile_index: Annotated[str, Path(description="The `h3` index of the tile")],
geojson: Annotated[Feature, Body(description="GeoJSON feature used to filter the cells.")],
columns: list[str] = Query(
[], description="Colum/s to include in the tile. If empty, it returns only cell indexes."
),
) -> ArrowIPCResponse:
"""Get a tile of h3 cells that are inside the polygon"""
tile, tile_index_res = get_tile(tile_index, columns)
cell_res = tile_index_res + get_settings().tile_to_cell_resolution_diff
geom = shapely.from_geojson(geojson.model_dump_json())
cells = cells_in_geojson(geom, cell_res)
try:
tile_file = pl.read_ipc(tile_path, columns=["cell", *columns]).write_ipc(None)
tile = tile.join(cells, on="cell").collect()
# we don't know if the column requested are correct until we call .collect()
except pl.exceptions.ColumnNotFoundError:
raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None
return Response(tile_file.getvalue(), media_type="application/octet-stream")
if tile.is_empty():
raise HTTPException(status_code=404, detail="No data in region")
tile_buffer = tile.write_ipc(None)
return ArrowIPCResponse(tile_buffer.getvalue())


@grid_router.get(
Expand All @@ -67,23 +134,31 @@ async def grid_dataset_metadata() -> MultiDatasetMeta:
def read_table(
level: Annotated[int, Query(..., description="Tile level at which the query will be computed")],
filters: TableFilters = Depends(),
) -> ORJSONResponse:
geojson: Feature | None = None,
) -> TableResults:
"""Query tile dataset and return table data"""
files_path = pathlib.Path(get_settings().grid_tiles_path) / str(level)
if not files_path.exists():
raise HTTPException(404, detail=f"Level {level} does not exist") from None
lf = pl.scan_ipc(files_path.glob("*.arrow"))

lf = pl.scan_ipc(list(files_path.glob("*.arrow")))

if geojson is not None:
cell_res = level + get_settings().tile_to_cell_resolution_diff
geom = shapely.from_geojson(geojson.model_dump_json())
cells = cells_in_geojson(geom, cell_res)
lf = lf.join(cells, on="cell")

query = filters.to_sql_query("frame")
log.debug(query)

try:
res = pl.SQLContext(frame=lf).execute(query).collect()
except pl.exceptions.ColumnNotFoundError as e:
# bad column in order by clause
except pl.exceptions.ColumnNotFoundError as e: # bad column in order by clause
log.exception(e)
raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None

except pl.exceptions.ComputeError as e:
# possibly raise if wrong type in compare. I'm not aware of other sources of ComputeError
except pl.exceptions.ComputeError as e: # raised if wrong type in compare.
log.exception(e)
raise HTTPException(status_code=422, detail=str(e)) from None
return ORJSONResponse(res.to_dict(as_series=False))

return TableResults(table=[{"column": k, "values": v} for k, v in res.to_dict(as_series=False).items()])
1 change: 1 addition & 0 deletions api/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ h3
pydantic-extra-types
polars
sqlalchemy
h3ronpy
Loading
Loading