updates metamodel

Vizzuality · Jul 26, 2024 · 604e5da · 604e5da
1 parent 40a0b80
commit 604e5da
Show file tree

Hide file tree

Showing 3 changed files with 267 additions and 3 deletions.
diff --git a/api/app/models/grid.py b/api/app/models/grid.py
@@ -46,9 +46,10 @@ class CategoricalLegend(BaseModel):
 class DatasetMeta(BaseModel):
     var_name: str = Field(description="Column name.")
     var_dtype: str = Field(description="Column dtype.")
-    nodata: str
-    description: str
-    aggregation_method: str = Field(description="Aggregation method used to compute the overview levels.")
+    label: str = Field(description="Human readable name.")
+    nodata: str | None = Field(default=None, description="Nodata value used in grid")
+    description: str = Field(description="Human readable indicator description.")
+    unit: str | None = Field(default=None, description="Unit of the measurement")
     lineage: list[str] | None = Field(default=None, description="Source data used to compute this dataset.")
     legend: CategoricalLegend | NumericalLegend = Field(discriminator="legend_type")
 

diff --git a/science/notebooks/check_combine_results.ipynb b/science/notebooks/check_combine_results.ipynb
@@ -0,0 +1,97 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e84373c7-a5e7-47c8-95a3-d2db7ade2e29",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars as pl\n",
+    "import polars.selectors as cs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "750642ad-f9fc-434e-86f8-783cc41d533c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pl.read_ipc(\"../data/processed/grid/0/8057fffffffffff.arrow\")\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5012a5d-ea51-4b01-8ccf-055db2feb3ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "schema = {\"cell\":pl.UInt64, \"b\":pl.Float32, \"c\":pl.String}\n",
+    "df = pl.DataFrame(schema=schema)\n",
+    "\n",
+    "join1 = pl.DataFrame({\"cell\": [1,2,3], \"b\": [9., 9., 9.]}, schema_overrides=schema)\n",
+    "join2 = pl.DataFrame({\"cell\": [1,2,3], \"c\": [\"a\", \"b\", \"c\"]}, schema_overrides=schema)\n",
+    "join3 = pl.DataFrame({\"cell\": [4,5,6], \"c\": [\"a\", \"b\", \"c\"]}, schema_overrides=schema)\n",
+    "\n",
+    "tojoin = [join1, join2, join3]\n",
+    "\n",
+    "res = pl.concat(tojoin, how=\"diagonal\", rechunk=True)\n",
+    "res = res.group_by(\"cell\", maintain_order=True).agg(pl.all().max())\n",
+    "print(res)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "af677cc8-34c5-4259-84c2-1d90a5bf3040",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars as pl\n",
+    "\n",
+    "# Define the initial DataFrame\n",
+    "df = pl.DataFrame({\n",
+    "    \"cell\": [1, 2, 3, 1, 2, 3, 4, 5, 6],\n",
+    "    \"b\": [9.0, 9.0, 9.0, None, None, None, None, None, None],\n",
+    "    \"c\": [None, None, None, \"a\", \"b\", \"c\", \"a\", \"b\", \"c\"]\n",
+    "})\n",
+    "\n",
+    "# Perform the group by and aggregation\n",
+    "agg_df = df.group_by(\"cell\").agg([\n",
+    "    pl.col(\"b\").max().alias(\"b\"),\n",
+    "    pl.col(\"c\").max().alias(\"c\")\n",
+    "])\n",
+    "\n",
+    "# Sort the resulting DataFrame by the 'cell' column\n",
+    "result_df = agg_df.sort(\"cell\")\n",
+    "\n",
+    "# Print the resulting DataFrame\n",
+    "print(result_df)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/science/notebooks/merge_entrega_roberto.ipynb b/science/notebooks/merge_entrega_roberto.ipynb
@@ -0,0 +1,166 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars as pl\n",
+    "from pathlib import Path\n",
+    "import h3ronpy.polars"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "csvs = list(Path(\"../raw/ENTREGA UNO MUESTRAS HEXA CSV 18072024/\").glob(\"*.CSV\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs = [pl.read_csv(f, separator=\";\", decimal_comma=True) for f in csvs]\n",
+    "df = pl.concat(dfs, how=\"align\", rechunk=True)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.with_columns(pl.col(\"GRID_ID\").h3.cells_parse())\n",
+    "df = df.drop(\"GRID_ID\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.select(pl.col(\"cell\").h3.cells_resolution()).unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "CELLS_RES = 6\n",
+    "OVERVIEW_LEVEL = CELLS_RES - 5\n",
+    "\n",
+    "df = df.with_columns(\n",
+    "    pl.col(\"cell\").h3.change_resolution(OVERVIEW_LEVEL).alias(\"tile_id\")  # type: ignore[attr-defined]\n",
+    ")\n",
+    "partition_dfs = df.partition_by([\"tile_id\"], as_dict=True, include_key=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Write tiles to IPC files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "seen_tiles = set()\n",
+    "n_cells = 0\n",
+    "\n",
+    "for tile_group, tile_df in partition_dfs.items():\n",
+    "    if tile_df.shape[0] == 0:  # TODO: confirm that empty tiles should be skipped\n",
+    "        continue\n",
+    "    tile_id = hex(tile_group[0])[2:]\n",
+    "    filename = Path(\"grid/1\") / (tile_id + \".arrow\")\n",
+    "    if tile_id in seen_tiles:\n",
+    "        tile_df = pl.concat(\n",
+    "            [pl.read_ipc(filename), tile_df], how=\"vertical_relaxed\"\n",
+    "        ).unique(subset=[\"cell\"])\n",
+    "        tile_df.write_ipc(filename)\n",
+    "        n_cells += len(tile_df)\n",
+    "    else:\n",
+    "        seen_tiles.add(tile_id)\n",
+    "        tile_df.write_ipc(filename)\n",
+    "        n_cells += len(tile_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Make the metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.select(pl.all().min())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.select(pl.all().max())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dtypes"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}