diff --git a/science/notebooks/merge_entrega_roberto.ipynb b/science/notebooks/merge_entrega_roberto.ipynb index cc3cf618..f1ea486d 100644 --- a/science/notebooks/merge_entrega_roberto.ipynb +++ b/science/notebooks/merge_entrega_roberto.ipynb @@ -2,162 +2,69 @@ "cells": [ { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:30:08.332078Z", - "start_time": "2024-09-09T08:30:08.112470Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "import polars as pl\n", "from pathlib import Path\n", - "import h3ronpy.polars" - ], - "outputs": [], - "execution_count": 1 + "\n", + "import polars as pl" + ] }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:37:46.700129Z", - "start_time": "2024-09-09T08:37:46.697392Z" - } - }, - "source": "csvs = list(Path(\"../data/raw/ENTREGA UNO MUESTRAS HEXA CSV 18072024\").glob(\"*.CSV\"))", + "execution_count": null, + "metadata": {}, "outputs": [], - "execution_count": 39 + "source": [ + "csvs = list(Path(\"../data/raw/ENTREGA UNO MUESTRAS HEXA CSV 18072024\").glob(\"*.CSV\"))" + ] }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:37:50.987739Z", - "start_time": "2024-09-09T08:37:50.852450Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dfs = [pl.read_csv(f, separator=\";\", decimal_comma=True) for f in csvs]\n", "df = pl.concat(dfs, how=\"align\", rechunk=True)\n", "df.head()" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "shape: (5, 9)\n", - "┌──────────────┬───────┬────────────┬────────────┬───┬───────────┬──────────┬───────────┬──────────┐\n", - "│ GRID_ID ┆ FRECF ┆ AMIN ┆ AMAX ┆ … ┆ TREEPERCT ┆ PMIN ┆ PMAX ┆ PMEAN │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ i64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞══════════════╪═══════╪════════════╪════════════╪═══╪═══════════╪══════════╪═══════════╪══════════╡\n", - "│ 865f00007fff ┆ null ┆ 114.678246 ┆ 209.731842 ┆ … ┆ 100.0 ┆ 0.058348 ┆ 7.531753 ┆ 1.69093 │\n", - "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 865f0000ffff ┆ null ┆ 127.660339 ┆ 705.040772 ┆ … ┆ 99.985832 ┆ 0.148311 ┆ 31.043549 ┆ 6.346733 │\n", - "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 865f00017fff ┆ null ┆ 117.937508 ┆ 175.799759 ┆ … ┆ 100.0 ┆ 0.028819 ┆ 2.731335 ┆ 1.063382 │\n", - "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 865f0001ffff ┆ null ┆ 123.765045 ┆ 193.208282 ┆ … ┆ 100.0 ┆ 0.047981 ┆ 4.67722 ┆ 1.557258 │\n", - "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 865f00027fff ┆ null ┆ 111.118088 ┆ 277.398895 ┆ … ┆ 100.0 ┆ 0.144035 ┆ 12.342467 ┆ 2.193349 │\n", - "│ fff ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "└──────────────┴───────┴────────────┴────────────┴───┴───────────┴──────────┴───────────┴──────────┘" - ], - "text/html": [ - "
\n", - "shape: (5, 9)
GRID_IDFRECFAMINAMAXAMEANTREEPERCTPMINPMAXPMEAN
stri64f64f64f64f64f64f64f64
"865f00007ffffff"null114.678246209.731842149.513126100.00.0583487.5317531.69093
"865f0000fffffff"null127.660339705.040772245.46101399.9858320.14831131.0435496.346733
"865f00017ffffff"null117.937508175.799759145.636984100.00.0288192.7313351.063382
"865f0001fffffff"null123.765045193.208282156.474098100.00.0479814.677221.557258
"865f00027ffffff"null111.118088277.398895146.417323100.00.14403512.3424672.193349
" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 42 + ] }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:37:51.695475Z", - "start_time": "2024-09-09T08:37:51.673626Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "df = df.with_columns(pl.col(\"GRID_ID\").h3.cells_parse())\n", "df = df.drop(\"GRID_ID\")" - ], - "outputs": [], - "execution_count": 43 + ] }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:37:52.296769Z", - "start_time": "2024-09-09T08:37:52.286054Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "df.select(pl.col(\"cell\").h3.cells_resolution()).unique()" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "shape: (1, 1)\n", - "┌────────────┐\n", - "│ resolution │\n", - "│ --- │\n", - "│ u8 │\n", - "╞════════════╡\n", - "│ 6 │\n", - "└────────────┘" - ], - "text/html": [ - "
\n", - "shape: (1, 1)
resolution
u8
6
" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 44 + ] }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:37:53.457159Z", - "start_time": "2024-09-09T08:37:53.382329Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "CELLS_RES = 6\n", "OVERVIEW_LEVEL = CELLS_RES - 5\n", "\n", "df = df.with_columns(\n", " pl.col(\"cell\").h3.change_resolution(OVERVIEW_LEVEL).h3.cells_to_string().alias(\"tile_id\"), # type: ignore[attr-defined]\n", - " pl.col(\"cell\").h3.cells_to_string()\n", + " pl.col(\"cell\").h3.cells_to_string(),\n", ")\n", "partition_dfs = df.partition_by([\"tile_id\"], as_dict=True, include_key=False)" - ], - "outputs": [], - "execution_count": 45 + ] }, { "cell_type": "markdown", @@ -168,12 +75,9 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-09T08:40:16.950037Z", - "start_time": "2024-09-09T08:40:16.897734Z" - } - }, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "seen_tiles = set()\n", "n_cells = 0\n", @@ -184,18 +88,16 @@ " tile_id = tile_group[0]\n", " filename = Path(\"../data/processed/grid/1\") / (tile_id + \".arrow\")\n", " if tile_id in seen_tiles:\n", - " tile_df = pl.concat(\n", - " [pl.read_ipc(filename), tile_df], how=\"vertical_relaxed\"\n", - " ).unique(subset=[\"cell\"])\n", + " tile_df = pl.concat([pl.read_ipc(filename), tile_df], how=\"vertical_relaxed\").unique(\n", + " subset=[\"cell\"]\n", + " )\n", " tile_df.write_parquet(filename)\n", " n_cells += len(tile_df)\n", " else:\n", " seen_tiles.add(tile_id)\n", " tile_df.write_ipc(filename)\n", " n_cells += len(tile_df)" - ], - "outputs": [], - "execution_count": 48 + ] }, { "cell_type": "markdown",