From 604e5da3aa057694ec948e74f35588768ebd29c1 Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Fri, 26 Jul 2024 19:27:08 +0200 Subject: [PATCH] updates metamodel --- api/app/models/grid.py | 7 +- science/notebooks/check_combine_results.ipynb | 97 ++++++++++ science/notebooks/merge_entrega_roberto.ipynb | 166 ++++++++++++++++++ 3 files changed, 267 insertions(+), 3 deletions(-) create mode 100644 science/notebooks/check_combine_results.ipynb create mode 100644 science/notebooks/merge_entrega_roberto.ipynb diff --git a/api/app/models/grid.py b/api/app/models/grid.py index 1dee3260..f1ea4b3f 100644 --- a/api/app/models/grid.py +++ b/api/app/models/grid.py @@ -46,9 +46,10 @@ class CategoricalLegend(BaseModel): class DatasetMeta(BaseModel): var_name: str = Field(description="Column name.") var_dtype: str = Field(description="Column dtype.") - nodata: str - description: str - aggregation_method: str = Field(description="Aggregation method used to compute the overview levels.") + label: str = Field(description="Human readable name.") + nodata: str | None = Field(default=None, description="Nodata value used in grid") + description: str = Field(description="Human readable indicator description.") + unit: str | None = Field(description="Unit of the measurement") lineage: list[str] | None = Field(default=None, description="Source data used to compute this dataset.") legend: CategoricalLegend | NumericalLegend = Field(discriminator="legend_type") diff --git a/science/notebooks/check_combine_results.ipynb b/science/notebooks/check_combine_results.ipynb new file mode 100644 index 00000000..3fe61ffa --- /dev/null +++ b/science/notebooks/check_combine_results.ipynb @@ -0,0 +1,97 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e84373c7-a5e7-47c8-95a3-d2db7ade2e29", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "import polars.selectors as cs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"750642ad-f9fc-434e-86f8-783cc41d533c", + "metadata": {}, + "outputs": [], + "source": [ + "df = pl.read_ipc(\"../data/processed/grid/0/8057fffffffffff.arrow\")\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5012a5d-ea51-4b01-8ccf-055db2feb3ec", + "metadata": {}, + "outputs": [], + "source": [ + "schema = {\"cell\":pl.UInt64, \"b\":pl.Float32, \"c\":pl.String}\n", + "df = pl.DataFrame(schema=schema)\n", + "\n", + "join1 = pl.DataFrame({\"cell\": [1,2,3], \"b\": [9., 9., 9.]}, schema_overrides=schema)\n", + "join2 = pl.DataFrame({\"cell\": [1,2,3], \"c\": [\"a\", \"b\", \"c\"]}, schema_overrides=schema)\n", + "join3 = pl.DataFrame({\"cell\": [4,5,6], \"c\": [\"a\", \"b\", \"c\"]}, schema_overrides=schema)\n", + "\n", + "tojoin = [join1, join2, join3]\n", + "\n", + "res = pl.concat(tojoin, how=\"diagonal\", rechunk=True)\n", + "res = res.group_by(\"cell\", maintain_order=True).agg(pl.all().max())\n", + "print(res)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af677cc8-34c5-4259-84c2-1d90a5bf3040", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "# Define the initial DataFrame\n", + "df = pl.DataFrame({\n", + " \"cell\": [1, 2, 3, 1, 2, 3, 4, 5, 6],\n", + " \"b\": [9.0, 9.0, 9.0, None, None, None, None, None, None],\n", + " \"c\": [None, None, None, \"a\", \"b\", \"c\", \"a\", \"b\", \"c\"]\n", + "})\n", + "\n", + "# Perform the group by and aggregation\n", + "agg_df = df.group_by(\"cell\").agg([\n", + " pl.col(\"b\").max().alias(\"b\"),\n", + " pl.col(\"c\").max().alias(\"c\")\n", + "])\n", + "\n", + "# Sort the resulting DataFrame by the 'cell' column\n", + "result_df = agg_df.sort(\"cell\")\n", + "\n", + "# Print the resulting DataFrame\n", + "print(result_df)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/science/notebooks/merge_entrega_roberto.ipynb b/science/notebooks/merge_entrega_roberto.ipynb new file mode 100644 index 00000000..e7913753 --- /dev/null +++ b/science/notebooks/merge_entrega_roberto.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "from pathlib import Path\n", + "import h3ronpy.polars" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "csvs = list(Path(\"../raw/ENTREGA UNO MUESTRAS HEXA CSV 18072024/\").glob(\"*.CSV\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dfs = [pl.read_csv(f, separator=\";\", decimal_comma=True) for f in csvs]\n", + "df = pl.concat(dfs, how=\"align\", rechunk=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.with_columns(pl.col(\"GRID_ID\").h3.cells_parse())\n", + "df = df.drop(\"GRID_ID\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.select(pl.col(\"cell\").h3.cells_resolution()).unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CELLS_RES = 6\n", + "OVERVIEW_LEVEL = CELLS_RES - 5\n", + "\n", + "df = df.with_columns(\n", + " pl.col(\"cell\").h3.change_resolution(OVERVIEW_LEVEL).alias(\"tile_id\") # type: ignore[attr-defined]\n", + ")\n", + "partition_dfs = df.partition_by([\"tile_id\"], as_dict=True, include_key=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "## Write tiles to IPC files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seen_tiles = set()\n", + "n_cells = 0\n", + "\n", + "for tile_group, tile_df in partition_dfs.items():\n", + " if tile_df.shape[0] == 0: # todo: skip empty tiles ?\n", + " continue\n", + " tile_id = hex(tile_group[0])[2:]\n", + " filename = Path(\"grid/1\") / (tile_id + \".arrow\")\n", + " if tile_id in seen_tiles:\n", + " tile_df = pl.concat(\n", + " [pl.read_ipc(filename), tile_df], how=\"vertical_relaxed\"\n", + " ).unique(subset=[\"cell\"])\n", + " tile_df.write_ipc(filename) # IPC, not Parquet: read_ipc() reopens this path\n", + " n_cells += len(tile_df)\n", + " else:\n", + " seen_tiles.add(tile_id)\n", + " tile_df.write_ipc(filename)\n", + " n_cells += len(tile_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make the metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.select(pl.all().min())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.select(pl.all().max())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.dtypes" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}