From db01738f7e4cbe5a35a9875a4eff7d6d22681c29 Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Tue, 29 Oct 2024 11:31:07 +0100 Subject: [PATCH] Set h3 index type to `string` in IPC table (#69) * Notebook to convert grid csv to arrow files * Cast IPC table to string * Fix python docker image no longer ships with libexpat a requirement for fiona and gdal * remove test router --- api/Dockerfile | 7 +++ api/app/routers/grid.py | 26 ++++++-- docker-compose.yml | 2 - science/notebooks/grid_table_geometry.ipynb | 70 +++++++++++++++++++++ 4 files changed, 97 insertions(+), 8 deletions(-) create mode 100644 science/notebooks/grid_table_geometry.ipynb diff --git a/api/Dockerfile b/api/Dockerfile index 31bd2524..b95b4b07 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -1,6 +1,13 @@ FROM python:3.11-slim as base LABEL maintainer="hello@vizzuality.com" +# Requirement of fiona and gdal. +RUN set -eux; \ + apt-get update; \ + apt-get install -y --no-install-recommends libexpat1; \ + rm -rf /var/lib/apt/lists/* + + FROM base as production ENV NAME api ENV APP_HOME /opt/$NAME diff --git a/api/app/routers/grid.py b/api/app/routers/grid.py index cd6767bf..4ced36de 100644 --- a/api/app/routers/grid.py +++ b/api/app/routers/grid.py @@ -1,3 +1,4 @@ +import io import logging import os import pathlib @@ -7,6 +8,7 @@ import h3 import h3ronpy.polars # noqa: F401 import polars as pl +import pyarrow as pa import shapely from fastapi import APIRouter, Depends, HTTPException, Path, Query from fastapi.params import Body @@ -58,6 +60,21 @@ def cells_in_geojson(geometry: str, cell_resolution: int) -> pl.LazyFrame: return pl.LazyFrame({"cell": cells}) +def polars_to_string_ipc(df: pl.DataFrame) -> bytes: + """Cast cell column of polars dataframe to arrow type `string` and return the ipc bytes.""" + # For performance reasons all the strings in polars are treated as `large_string`, + # a custom string type. 
As of today, the frontend library @loaders.gl/arrow only supports + # `string` type so we need to downcast with pyarrow + table: pa.Table = df.to_arrow() + + schema = table.schema + schema = schema.set(schema.get_field_index("cell"), pa.field("cell", pa.string())) + table = table.cast(schema) + sink = io.BytesIO() + with pa.ipc.new_file(sink, table.schema) as writer: + writer.write_table(table) + return sink.getvalue() + @grid_router.get( "/tile/{tile_index}", summary="Get a grid tile", @@ -74,13 +91,11 @@ def grid_tile( """Get a tile of h3 cells with specified data columns""" tile, _ = get_tile(tile_index, columns) try: - # Need to use the _undocumented_ old compatibility method so the string columns are readable by - # the apache arrow implementation in JS ( as of today 26/9/2024 it is not without this flag ). - tile_buffer = tile.collect().write_ipc(None, compat_level=pl.interchange.CompatLevel.oldest()) + tile = tile.collect() # we don't know if the column requested are correct until we call .collect() except pl.exceptions.ColumnNotFoundError: raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None - return ArrowIPCResponse(tile_buffer.getvalue()) + return ArrowIPCResponse(polars_to_string_ipc(tile)) @grid_router.post( @@ -109,8 +124,7 @@ def grid_tile_in_area( raise HTTPException(status_code=400, detail="One or more of the specified columns is not valid") from None if tile.is_empty(): raise HTTPException(status_code=404, detail="No data in region") - tile_buffer = tile.write_ipc(None) - return ArrowIPCResponse(tile_buffer.getvalue()) + return ArrowIPCResponse(polars_to_string_ipc(tile)) @grid_router.get( diff --git a/docker-compose.yml b/docker-compose.yml index a2a36408..d0905bee 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,8 +13,6 @@ services: - GRID_TILES_PATH=${GRID_TILES_PATH} networks: - amazonia360-network - restart: - always test: build: diff --git 
a/science/notebooks/grid_table_geometry.ipynb b/science/notebooks/grid_table_geometry.ipynb new file mode 100644 index 00000000..85c60c6d --- /dev/null +++ b/science/notebooks/grid_table_geometry.ipynb @@ -0,0 +1,70 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "from h3ronpy.polars.vector import cells_to_wkb_points\n", + "from shapely import wkb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "df = pl.read_csv(\"../data/processed/grid_table.csv\")\n", + "\n", + "df = df.with_columns(\n", + " pl.col(\"cell\").cast(pl.UInt64).h3.cells_to_string(),\n", + " pl.col(\"tile_id\").cast(pl.UInt64).h3.cells_to_string(),\n", + " point=cells_to_wkb_points(df.select(pl.col(\"cell\").cast(pl.UInt64)).to_series()),\n", + ")\n", + "df = df.with_columns(\n", + " lat=pl.col(\"point\").map_elements(lambda p: wkb.loads(p).y, return_dtype=pl.Float64),\n", + " lon=pl.col(\"point\").map_elements(lambda p: wkb.loads(p).x, return_dtype=pl.Float64),\n", + ")\n", + "\n", + "df = df.drop(\"point\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "df.write_csv(\"../data/processed/grid_table_geom.csv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}