Skip to content

Commit

Permalink
Set h3 index type to string in IPC table (#69)
Browse files Browse the repository at this point in the history
* Notebook to convert grid csv to arrow files

* Cast IPC table to string

* Fix: the Python Docker image no longer ships with libexpat, a requirement for fiona and gdal

* remove test router
  • Loading branch information
BielStela authored Oct 29, 2024
1 parent 798e29a commit db01738
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 8 deletions.
7 changes: 7 additions & 0 deletions api/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
FROM python:3.11-slim as base
LABEL maintainer="[email protected]"

# libexpat1 is a runtime requirement of fiona and gdal. It is installed
# explicitly because the python base image no longer ships with it.
# The apt package lists are deleted at the end of the same RUN step.
RUN set -eux; \
apt-get update; \
apt-get install -y --no-install-recommends libexpat1; \
rm -rf /var/lib/apt/lists/*


FROM base as production
ENV NAME api
ENV APP_HOME /opt/$NAME
Expand Down
26 changes: 20 additions & 6 deletions api/app/routers/grid.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import io
import logging
import os
import pathlib
Expand All @@ -7,6 +8,7 @@
import h3
import h3ronpy.polars # noqa: F401
import polars as pl
import pyarrow as pa
import shapely
from fastapi import APIRouter, Depends, HTTPException, Path, Query
from fastapi.params import Body
Expand Down Expand Up @@ -58,6 +60,21 @@ def cells_in_geojson(geometry: str, cell_resolution: int) -> pl.LazyFrame:
return pl.LazyFrame({"cell": cells})


def polars_to_string_ipc(df: pl.DataFrame) -> bytes:
    """Serialize a polars dataframe to Arrow IPC bytes with a `string`-typed cell column.

    For performance reasons polars treats every string as `large_string`.
    As of today the frontend library @loaders.gl/arrow only supports the
    plain `string` type, so the `cell` column is downcast with pyarrow
    before the table is written.

    Args:
        df: Dataframe to serialize; expected to contain a `cell` column.

    Returns:
        The bytes produced by the pyarrow IPC file writer for the cast table.
    """
    arrow_table: pa.Table = df.to_arrow()

    # Replace only the `cell` field of the schema; every other field is kept as is.
    cell_position = arrow_table.schema.get_field_index("cell")
    string_cell_schema = arrow_table.schema.set(cell_position, pa.field("cell", pa.string()))
    arrow_table = arrow_table.cast(string_cell_schema)

    buffer = io.BytesIO()
    with pa.ipc.new_file(buffer, arrow_table.schema) as ipc_writer:
        ipc_writer.write_table(arrow_table)
    # Read the buffer only after the writer context has exited.
    return buffer.getvalue()

@grid_router.get(
"/tile/{tile_index}",
summary="Get a grid tile",
Expand All @@ -74,13 +91,11 @@ def grid_tile(
"""Get a tile of h3 cells with specified data columns"""
tile, _ = get_tile(tile_index, columns)
try:
# Need to use the _undocumented_ old compatibility method so the string columns are readable by
# the apache arrow implementation in JS ( as of today 26/9/2024 it is not without this flag ).
tile_buffer = tile.collect().write_ipc(None, compat_level=pl.interchange.CompatLevel.oldest())
tile = tile.collect()
# We don't know whether the requested columns are valid until we call .collect()
except pl.exceptions.ColumnNotFoundError:
raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None
return ArrowIPCResponse(tile_buffer.getvalue())
return ArrowIPCResponse(polars_to_string_ipc(tile))


@grid_router.post(
Expand Down Expand Up @@ -109,8 +124,7 @@ def grid_tile_in_area(
raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None
if tile.is_empty():
raise HTTPException(status_code=404, detail="No data in region")
tile_buffer = tile.write_ipc(None)
return ArrowIPCResponse(tile_buffer.getvalue())
return ArrowIPCResponse(polars_to_string_ipc(tile))


@grid_router.get(
Expand Down
2 changes: 0 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ services:
- GRID_TILES_PATH=${GRID_TILES_PATH}
networks:
- amazonia360-network
restart:
always

test:
build:
Expand Down
70 changes: 70 additions & 0 deletions science/notebooks/grid_table_geometry.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"from h3ronpy.polars.vector import cells_to_wkb_points\n",
"from shapely import wkb"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "1",
 "metadata": {},
 "outputs": [],
 "source": [
  "df = pl.read_csv(\"../data/processed/grid_table.csv\")\n",
  "\n",
  "df = df.with_columns(\n",
  "    pl.col(\"cell\").cast(pl.UInt64).h3.cells_to_string(),\n",
  "    pl.col(\"tile_id\").cast(pl.UInt64).h3.cells_to_string(),\n",
  "    point=cells_to_wkb_points(df.select(pl.col(\"cell\").cast(pl.UInt64)).to_series()),\n",
  ")\n",
  "# Point geometries are (x, y) = (longitude, latitude), so lat comes from .y and lon from .x.\n",
  "df = df.with_columns(\n",
  "    lat=pl.col(\"point\").map_elements(lambda p: wkb.loads(p).y, return_dtype=pl.Float64),\n",
  "    lon=pl.col(\"point\").map_elements(lambda p: wkb.loads(p).x, return_dtype=pl.Float64),\n",
  ")\n",
  "\n",
  "df = df.drop(\"point\")\n",
  "df.head()"
 ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2",
"metadata": {},
"outputs": [],
"source": [
"df.write_csv(\"../data/processed/grid_table_geom.csv\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit db01738

Please sign in to comment.