Skip to content

Commit

Permalink
updates metamodel
Browse files Browse the repository at this point in the history
  • Loading branch information
BielStela committed Jul 26, 2024
1 parent 40a0b80 commit 604e5da
Show file tree
Hide file tree
Showing 3 changed files with 267 additions and 3 deletions.
7 changes: 4 additions & 3 deletions api/app/models/grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,10 @@ class CategoricalLegend(BaseModel):
class DatasetMeta(BaseModel):
var_name: str = Field(description="Column name.")
var_dtype: str = Field(description="Column dtype.")
nodata: str
description: str
aggregation_method: str = Field(description="Aggregation method used to compute the overview levels.")
label: str = Field(description="Human readable name.")
nodata: str | None = Field(default=None, description="Nodata value used in grid")
description: str = Field(description="Human readable indicator description.")
unit: str | None = Field(description="Unit of the measurement")
lineage: list[str] | None = Field(default=None, description="Source data used to compute this dataset.")
legend: CategoricalLegend | NumericalLegend = Field(discriminator="legend_type")

Expand Down
97 changes: 97 additions & 0 deletions science/notebooks/check_combine_results.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e84373c7-a5e7-47c8-95a3-d2db7ade2e29",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"import polars.selectors as cs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "750642ad-f9fc-434e-86f8-783cc41d533c",
"metadata": {},
"outputs": [],
"source": [
"df = pl.read_ipc(\"../data/processed/grid/0/8057fffffffffff.arrow\")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5012a5d-ea51-4b01-8ccf-055db2feb3ec",
"metadata": {},
"outputs": [],
"source": [
"schema = {\"cell\":pl.UInt64, \"b\":pl.Float32, \"c\":pl.String}\n",
"df = pl.DataFrame(schema=schema)\n",
"\n",
"join1 = pl.DataFrame({\"cell\": [1,2,3], \"b\": [9., 9., 9.]}, schema_overrides=schema)\n",
"join2 = pl.DataFrame({\"cell\": [1,2,3], \"c\": [\"a\", \"b\", \"c\"]}, schema_overrides=schema)\n",
"join3 = pl.DataFrame({\"cell\": [4,5,6], \"c\": [\"a\", \"b\", \"c\"]}, schema_overrides=schema)\n",
"\n",
"tojoin = [join1, join2, join3]\n",
"\n",
"res = pl.concat(tojoin, how=\"diagonal\", rechunk=True)\n",
"res = res.group_by(\"cell\", maintain_order=True).agg(pl.all().max())\n",
"print(res)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af677cc8-34c5-4259-84c2-1d90a5bf3040",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"\n",
"# Define the initial DataFrame\n",
"df = pl.DataFrame({\n",
" \"cell\": [1, 2, 3, 1, 2, 3, 4, 5, 6],\n",
" \"b\": [9.0, 9.0, 9.0, None, None, None, None, None, None],\n",
" \"c\": [None, None, None, \"a\", \"b\", \"c\", \"a\", \"b\", \"c\"]\n",
"})\n",
"\n",
"# Perform the group by and aggregation\n",
"agg_df = df.groupby(\"cell\").agg([\n",
" pl.col(\"b\").max().alias(\"b\"),\n",
" pl.col(\"c\").max().alias(\"c\")\n",
"])\n",
"\n",
"# Sort the resulting DataFrame by the 'cell' column\n",
"result_df = agg_df.sort(\"cell\")\n",
"\n",
"# Print the resulting DataFrame\n",
"print(result_df)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
166 changes: 166 additions & 0 deletions science/notebooks/merge_entrega_roberto.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"from pathlib import Path\n",
"import h3ronpy.polars"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"csvs = list(Path(\"../raw/ENTREGA UNO MUESTRAS HEXA CSV 18072024/\").glob(\"*.CSV\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dfs = [pl.read_csv(f, separator=\";\", decimal_comma=True) for f in csvs]\n",
"df = pl.concat(dfs, how=\"align\", rechunk=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = df.with_columns(pl.col(\"GRID_ID\").h3.cells_parse())\n",
"df = df.drop(\"GRID_ID\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.select(pl.col(\"cell\").h3.cells_resolution()).unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CELLS_RES = 6\n",
"OVERVIEW_LEVEL = CELLS_RES - 5\n",
"\n",
"df = df.with_columns(\n",
" pl.col(\"cell\").h3.change_resolution(OVERVIEW_LEVEL).alias(\"tile_id\") # type: ignore[attr-defined]\n",
")\n",
"partition_dfs = df.partition_by([\"tile_id\"], as_dict=True, include_key=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Write tiles to IPC files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"seen_tiles = set()\n",
"n_cells = 0\n",
"\n",
"for tile_group, tile_df in partition_dfs.items():\n",
" if tile_df.shape[0] == 0: # todo: skip empty tiles ?\n",
" continue\n",
" tile_id = hex(tile_group[0])[2:]\n",
" filename = Path(\"grid/1\") / (tile_id + \".arrow\")\n",
" if tile_id in seen_tiles:\n",
" tile_df = pl.concat(\n",
" [pl.read_ipc(filename), tile_df], how=\"vertical_relaxed\"\n",
" ).unique(subset=[\"cell\"])\n",
" tile_df.write_parquet(filename)\n",
" n_cells += len(tile_df)\n",
" else:\n",
" seen_tiles.add(tile_id)\n",
" tile_df.write_ipc(filename)\n",
" n_cells += len(tile_df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Make the metadata"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.select(pl.all().min())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.select(pl.all().max())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.dtypes"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

0 comments on commit 604e5da

Please sign in to comment.