diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8449e86 --- /dev/null +++ b/.gitignore @@ -0,0 +1,92 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# DotEnv configuration +.env + +# Database +*.db +*.rdb + +# Pycharm +.idea + +# VS Code +.vscode/ + +# Spyder +.spyproject/ + +# Jupyter NB Checkpoints +.ipynb_checkpoints/ + +# exclude data from source control by default +/data/ + +# Mac OS-specific storage files +.DS_Store + +# vim +*.swp +*.swo + +# Mypy cache +.mypy_cache/ + +# dask +dask-worker-space \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..0e2b9a4 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "LocalizeSL"] + path = LocalizeSL + url = git@github.com:bobkopp/LocalizeSL.git diff --git a/HISTORY.rst b/HISTORY.rst new file mode 100644 index 0000000..c7a92b1 --- /dev/null +++ b/HISTORY.rst @@ -0,0 +1,11 @@ +History +======= + +.. v1.0.0: + +v1.0.0 +------ + +Released on April 8, 2022 + +Initial commit. Version associated with initial manuscript submission. 
\ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..5a03fdd --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Climate Impact Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..ad3c0c2 --- /dev/null +++ b/README.md @@ -0,0 +1,92 @@ +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6010452.svg)](https://doi.org/10.5281/zenodo.6010452) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6012027.svg)](https://doi.org/10.5281/zenodo.6012027) + +# Sea Level Impacts Input Dataset by Elevation, Region, and Scenario (SLIIDERS) + +This repository hosts the code used to create the [SLIIDERS-ECON](https://doi.org/10.5281/zenodo.6010452) and [SLIIDERS-SLR](https://doi.org/10.5281/zenodo.6012027) datasets. 
The SLIIDERS datasets contain current and forecasted physical and socioeconomic metrics from 2000-2100 - organized by coastal segment, elevation slice, and scenario - for use as inputs to global coastal climate impacts research. + +**SLIIDERS-ECON** contains socioeconomic variables, varying horizontally and vertically over space. **SLIIDERS-SLR** contains Monte Carlo projections of Local Sea Level Rise under different emissions and ice sheet dynamics assumptions, based on the outputs of [LocalizeSL](https://github.com/bobkopp/LocalizeSL). Coastal segments in SLIIDERS-ECON can be matched to gridded LSLR projections in SLIIDERS-SLR via the `SLR_site` key. + +All work utilizing this code or the resulting SLIIDERS datasets should cite Depsky, Bolliger et al. 2022 (in prep.). See [License](#license) for details. + +## Installation +Most users will want to just use the datasets directly, accessible at the DOIs linked above. If you wish to recreate and/or modify the datasets, which we encourage, you will need to run the Jupyter notebooks in this repository. A collection of helper functions, organized into a Python package, is necessary to run the notebooks and can be found within the `sliiders` directory. A simple pip install will install this package + +```bash +pip install -e sliiders +``` + +In addition, you will need to have [Dask Gateway](https://gateway.dask.org) installed and configured to execute the parallel, Dask-backed workflows contained in this repo. Advanced users can use other Dask Cluster backends (including simply running [Dask Distributed](https://distributed.dask.org) locally), but doing so will require modifying the cluster setup portion of notebooks that employ dask. + +A Conda environment file better specifying a full environment needed to execute all of the workflows in this repo is in development and will be posted when complete. + +## Filepaths and other settings +All filepaths and settings for the notebooks can be found within `settings.py`. 
Before moving onto executing different parts of this repository, please adjust these settings to match your directory structure and desired values. Most values will not need to be updated unless you change a particular dataset. However, at minimum you should: + +1. Update the `DIR_DATA` filepath within this file to point to the root directory within which all of the data consumed and generated by this workflow will live. +2. Update `DASK_IMAGE` to point to a Docker Image that you will use for Dask workers (advanced users not using Dask Gateway may not need this parameter). + +## Package Structure +* `sliiders`: Contains `.py` files with essential settings and functions for the SLIIDERS workflow + - `settings.py`: Contains essential settings, including various parameters and data storage directories + - `gcs.py`: Contains functions related to the use of Google Cloud Storage (GCS). Users running workflows locally or on a different cloud provider are encouraged to contribute similar modules for other contexts. + - `io.py`: Contains various I/O-related functions + - `spatial.py`: Contains functions for executing spatial and geographic operations including those related to shapefiles, grid-cell level operations, and more. + - `dask.py`: Contains utility functions for working with dask clusters + - `country_level_ypk.py`: Contains functions for cleaning and working with country-level socioeconomic data, especially for the workflow in `notebooks/country_level_ypk` + +* `notebooks`: contains the workflows to create SLIIDERS-ECON and SLIIDERS-SLR. + +## Instructions (Generating SLIIDERS-ECON and SLIIDERS-SLR) + +To generate **SLIIDERS-ECON** and **SLIIDERS-SLR**, please follow the directions in `notebooks/README.md` and other readme files in the subdirectories within `notebooks` to learn about how to execute the workflows. +\ +The list and order of notebooks to run is reproduced in full here, along with any necessary manual steps. 
Click the `docs` link for each workflow to navigate to the relevant directory's page. + +1. `create-SLIIDERS-SLR` ([docs](notebooks/create-SLIIDERS-SLR)): Workflow to generate **SLIIDERS-SLR** + 1. `download-ifile-to-gcs.ipynb` + 2. `convert-mat-version.ipynb` + 3. `generate-projected-lsl.ipynb` + 4. `retrieve-num-gcms.ipynb` + 5. `process-localizesl-output.ipynb` +2. `create-SLIIDERS-ECON` ([docs](notebooks/create-SLIIDERS-ECON)): Workflow to generate **SLIIDERS-ECON** + 1. `download-sliiders-econ-input-data.ipynb` + 2. `country_level_ypk` ([docs](notebooks/create-SLIIDERS-ECON/country_level_ypk)): Workflow for organizing and projecting GDP (Y), population (P), capital stock (K), and related variables for historical (1950-2020) and future (2010-2100) timelines. + 1. `ypk1_prep_clean.ipynb` + 2. `ypk2_reorg_and_impute_ypk.ipynb` + 3. `ypk3_demo_ratios_historical_reg.ipynb` + 4. `ypk4_impute_hist_capital.ipynb` + 5. `ypk5_projected_yp.ipynb` + 6. `ypk6_projected_capital.ipynb` + 3. `exposure` ([docs](notebooks/create-SLIIDERS-ECON/exposure)): Workflow to generate present-day exposure grid. + 1. `1-create-coastline-segments.ipynb` + 2. `2-create-segment-regions.ipynb` + 3. `3-fill_missing_litpop_with_geg.ipynb` + 4. `4-vectorize-wetlands.ipynb` + 5. `5-get_positive_elev_tiles.ipynb` + 6. `6-generate_datum_conversion_grid.ipynb` + 7. `7-create_dem_mss.ipynb` + 8. `8-generate_protected_areas.ipynb` + 9. `9-generate_exposure_tiles.ipynb` + 10. `10-combine_exposure_tiles.ipynb` + 4. `create-SLIIDERS-ECON.ipynb` + +The resulting datasets can be found at these paths, defined in `settings.py`: +**SLIIDERS-ECON**: `PATH_SLIIDERS_ECON` +**SLIIDERS-SLR**: `PATH_SLIIDERS_SLR` + +## Support +Please file an issue for any problems you encounter. + +## Contributing +We encourage community contributions. At the moment, we have no contribution template. Please fork the project and file a Merge Request to propose your addition. 
Clearly define the contribution that the Merge Request is making and, when any issues have been resolved, we will merge the new code. + +## Authors +The original authors of this code include: +- Daniel Allen +- Ian Bolliger +- Junho Choi +- Nicholas Depsky + +## License +This code is licensed under the [MIT License](./LICENSE). However, we request that the underlying manuscript (Depsky et al. 2022) be cited wherever this code or the SLIIDERS datasets are used. A citation guide will be posted once the manuscript preprint is available. \ No newline at end of file diff --git a/notebooks/README.md b/notebooks/README.md new file mode 100644 index 0000000..9daafb6 --- /dev/null +++ b/notebooks/README.md @@ -0,0 +1,6 @@ +# Instructions + +This directory contains sub-directories to produce final SLIIDERS outputs. The order of execution is as follows. For further instructions, we refer to the `README.md` files in the respective sub-directories. + +1. `create-SLIIDERS-SLR`: Workflow to generate **SLIIDERS-SLR**, a dataset of gridded local sea-level Monte Carlo samples for each RCP scenario, year (decadal), and site ID (defined by LocalizeSL). +2. `create-SLIIDERS-ECON`: Workflow to generate **SLIIDERS-ECON**, a dataset containing socioeconomic variables by coastal segment, elevation, and Shared Socioeconomic Pathway scenario. Note that this workflow uses the SLIIDERS-SLR dataset to find nearest grid cells to match to coastal segments. diff --git a/notebooks/create-SLIIDERS-ECON/README.md b/notebooks/create-SLIIDERS-ECON/README.md new file mode 100644 index 0000000..a782e3d --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/README.md @@ -0,0 +1,10 @@ +# Workflow for generating the SLIIDERS-ECON dataset + +This directory contains notebooks to generate the **SLIIDERS-ECON** dataset. The final output for future projections is a Zarr store containing socioeconomic variables binned by coastal segment, elevation slice, and Shared Socioeconomic Pathway. 
+ +The steps to produce the final output are as follows. + +1. Use `download-sliiders-econ-input-data.ipynb` to download necessary datasets, including various country-level datasets such as the World Bank Intercomparison Project 2017 and the construction cost index by Lincke and Hinkel (2021, *Earth's Future*). +2. Go to the directory `country_level_ypk` and follow the instructions in the `README.md` in that directory. The workflow in `country_level_ypk` cleans (and when necessary, imputes) various country-level socioeconomic variables. +3. Go to the directory `exposure` and follow the instructions in the `README.md` in that directory. The workflow in `exposure` generates current-day global exposure data by coastal segment, elevation, and other variables. +4. Use `create-SLIIDERS-ECON.ipynb` to combine disparate data sources to generate the final output. diff --git a/notebooks/create-SLIIDERS-ECON/country_level_ypk/README.md b/notebooks/create-SLIIDERS-ECON/country_level_ypk/README.md new file mode 100644 index 0000000..d966523 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/country_level_ypk/README.md @@ -0,0 +1,67 @@ +# Workflow for organizing and projecting GDP (`Y`), population (`P`), capital stock (`K`), and related variables for historical (1950-2020) and future (2010-2100) timelines + +**This version: last updated on March 30, 2022** + +## 1. Overview + +This directory contains the data acquisition, clean-up, and projection notebook files to organize and project variables including GDP, GDP per capita (GDPpc), population, and capital stock for both historical (1950-2020) and future or projected (2010-2100) timelines. Many of the data sources used to generate historical and future panels have missing data, and therefore efforts were made to impute these missing data through either some form of extrapolation or other established methods. 
Also, efforts were made to keep the PPP and USD units consistent (e.g., constant 2019 PPP USD) across different sources having different vintages of PPP and USD units. + +Below is a quick summary of what each file seeks to accomplish (where the prefix `ypk` stands for "GDP, population, and capital stock"). +1. `ypk1_prep_clean.ipynb`: cleans up selected raw datasets requiring more attention than others to be consistent and workable with other datasets. +2. `ypk2_reorg_and_impute_ypk.ipynb`: reorganizes the raw and previously-cleaned historical datasets so that each variable considered has a single, consistent stream of values for each country. After this process, imputes GDPpc, GDP, and population values that might still be missing from the cleaned historical dataset. +3. `ypk3_demo_ratios_historical_reg.ipynb`: contains code to clean and extrapolate demographic (age-group) ratios and create the "demographic variables" necessary to conduct the "historical regression" (following Higgins, 1998) of finding the relationship between investment-to-GDP ratio (I/Y ratio) and demographic variables, (relative) GDPpc, and GDPpc growth rate. Furthermore, this historical regression is used to obtain estimates of investment-to-GDP ratios for missing country-years. +4. `ypk4_impute_hist_capital.ipynb`: contains code to use the historical and estimated investment-to-GDP ratios to create current-PPP investment values. These are used to replicate the initial-year capital stock estimation (country-by-country) as described in Inklaar, Woltjer, and Albarrán (2019). Also, the investment values are used in conjunction with GEG-15 and LitPop data sources to fill in missing values for the latter parts of the historical capital stock data. The end product is a gap-filled (1950-2020) capital stock dataset for all relevant countries. +5. 
`ypk5_projected_yp.ipynb`: contains code to clean up GDP, GDPpc, and population for the future timeline, with some basic extrapolation conducted for countries with missing projections. +6. `ypk6_projected_capital.ipynb`: generates projections of capital stocks based on the Dellink et al. (2017) methodology. + +For running these files, note that they have to be **run consecutively** (i.e., from `ypk1~` to `ypk6~`). Each notebook file contains basic descriptions of what each step does; in all cases, the cells must be run consecutively from top to bottom. + +## 2. Basic description of key variables + +We describe below some key variables produced by the above process. Note that our naming conventions largely follow Penn World Table 10.0. +- `cgdpo_19`: Current PPP (purchasing power parity) GDP in millions of 2017 and 2019 USD +- `cgdpo_pc_19`: Current PPP GDP per capita in ones of 2017 and 2019 USD +- `rgdpna_19`: (National account-based) GDP in millions of constant 2019 PPP USD +- `rgdpna_pc_19`: (National account-based) GDP per capita in ones of constant 2019 PPP USD +- `cn_19`: Current PPP capital stock in millions of 2019 USD +- `rnna_19`: Capital stock in millions of constant PPP 2019 USD +- `pop`: Population in millions of people +- `k_movable_ratio`: ratio of movable capital to total physical capital (values in $[0, 1]$) +- `iy_ratio`: Investment-to-GDP ratio +- `delta`: Physical capital depreciation rate + +Note that for GDP, GDP per capita, and capital stock variables, there are also versions with `_17` at the end instead of `_19`. For current PPP variables, this means using 2017 USD; for constant PPP variables, this means using constant 2017 PPP USD (i.e., constant PPP of 2017 and 2017 USD). + +## 3. Output storage + +We import the SLIIDERS `settings.py` as `sset`, which can be done as follows: +``` +from sliiders import settings as sset +``` +For the aggregate long-panel format historical and future timeline variables, you may refer to the following: +1. 
Historical: `sset.DIR_YPK_FINAL / "gdp_gdppc_pop_capital_1950_2020.parquet"` +2. Future: `sset.DIR_YPK_FINAL / "gdp_gdppc_pop_capital_proj_2010_2100.parquet"` + +where the metadata (e.g., units and sources) are also attached to the respective files. + +## 4. Regression results for imputing missing historical investment-to-GDP ratios + +We elaborate on the regression involving investment-to-GDP ratios mentioned in Section A3.2 in the notebook `ypk3_demo_ratios_historical_reg.ipynb`. The said notebook also contains information on how to derive each variable involved. We present the results below, where the dependent variable is investment-to-GDP ratio (I/Y ratio, using the same notation as in the notebook). + +| Variables | (1) | (2) | (3) | (4) | +| ------ | :------: | :------: | :------: | :------: | +| | 0.405<br>(0.161) | 0.346<br>(0.076) | 0.502<br>(0.201) | 0.480<br>(0.129) | +| | 0.864<br>(0.742) | 0.515<br>(0.611) | 0.506<br>(0.879) | 0.493<br>(0.915) | +| | -0.021<br>(0.052) | -0.027<br>(0.052) | 0.076<br>(0.022) | 0.108<br>(0.016) | +| | 0.004<br>(0.007) | 0.003<br>(0.006) | -0.011<br>(0.005) | -0.015<br>(0.005) | +| | 0.184<br>(0.186) | | 0.348<br>(0.190) | | +| | -0.008<br>(0.035) | | -0.038<br>(0.030) | | +| | -0.000<br>(0.002) | | 0.001<br>(0.001) | | +| | 3.988<br>(3.149) | | 1.784<br>(3.945) | | +| | -0.797<br>(0.570) | | -0.465<br>(0.597) | | +| | 0.040<br>(0.028) | | 0.026<br>(0.026) | | +| $N$ | 11145 | 11145 | 11145 | 11145 | +| Country fixed effects | Yes | Yes | No | No | +| Adjusted $R^2$ | 0.325 | 0.315 | 0.068 | 0.054 | +| AIC | -12712 | -12557 | -9317 | -9157 | +| BIC | -11153 | -11042 | -9236 | -9120 | diff --git a/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk1_prep_clean.ipynb b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk1_prep_clean.ipynb new file mode 100644 index 0000000..0894701 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk1_prep_clean.ipynb @@ -0,0 +1,1011 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e08e3612", + "metadata": {}, + "source": [ + "## Preparing and cleaning files necessary for (country-level) capital stock projection workflow\n", + "\n", + "## Importing necessary modules and functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf9622a5", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c7606bf", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import shutil\n", + "from operator import itemgetter\n", + "\n", + "import dask.dataframe as ddf\n", + "import dask.delayed as delayed\n", + "import fiona\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pycountry as pyctry\n", + "from dask_gateway import Gateway\n", + "from py7zr import unpack_7zarchive\n", + "from tqdm.auto import tqdm\n", + "\n", + "from sliiders import country_level_ypk as ypk_fn\n", + "from sliiders import settings as sset\n", + "from sliiders import spatial\n", + "\n", + "# dask gateway setup\n", + "gateway = Gateway()\n", + "image_name = sset.DASK_IMAGE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8695ee9-d515-4bd2-a8ac-1bb25656cbcc", + "metadata": {}, + "outputs": [], + "source": [ + "# creating necessary directory\n", + "sset.DIR_YPK_INT.mkdir(parents=True, exist_ok=True)" + ] + }, + 
{ + "cell_type": "markdown", + "id": "4d76358a", + "metadata": {}, + "source": [ + "## Maddison Project: scale change" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0acd5006", + "metadata": {}, + "outputs": [], + "source": [ + "# original file format was excel spreadsheet, so we will read this as is\n", + "madd = pd.read_excel(sset.PATH_MPD_RAW)\n", + "\n", + "# population is in 1000s of people; we will save it to be in millions of people\n", + "madd[\"pop\"] = madd[\"pop\"] / 1000 ## divide by a thousand to get things in millions\n", + "\n", + "# GDPpc is currently in ones of USD; we want gdp to be in millions of USD\n", + "# one USD per million people = 1 million USD per person\n", + "madd[\"gdp\"] = madd[\"gdppc\"] * madd[\"pop\"]\n", + "\n", + "# indexing and exporting\n", + "madd.rename(columns={\"countrycode\": \"ccode\"}, inplace=True)\n", + "madd[\"gdppc_unit\"] = \"ones of USD (constant 2011 PPP USD)\"\n", + "madd[\"gdp_unit\"] = \"millions of USD (constant 2011 PPP USD)\"\n", + "madd[\"pop_unit\"] = \"millions of people\"\n", + "madd.set_index([\"ccode\", \"year\"], inplace=True)\n", + "madd.to_parquet(sset.DIR_YPK_INT / \"maddison_project.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "647b195b", + "metadata": {}, + "source": [ + "## UN WPP: overall populations data\n", + "\n", + "### Assign country (ISO) codes: initial try with obvious cases" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9ebc1d0", + "metadata": {}, + "outputs": [], + "source": [ + "# importing data\n", + "un_df = pd.read_csv(sset.DIR_UN_WPP_RAW / \"UN_WPP2019_TotalPopulation.csv\")\n", + "\n", + "# let's check whether there are any with \"dependencies\" considered together with sov.s\n", + "for i in set(un_df.Location):\n", + " if \"ependenc\" in i:\n", + " print(i)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3dab957b", + "metadata": {}, + "outputs": [], + "source": [ + "# very minor clean-up 
for iso country codes; initial\n", + "countryname_to_iso = dict([])\n", + "\n", + "for i in list(set(un_df.Location)):\n", + " name = pyctry.countries.get(name=i)\n", + " oname = pyctry.countries.get(official_name=i)\n", + "\n", + " if name is not None or oname is not None:\n", + " to_use = name\n", + " if name is None:\n", + " to_use = oname\n", + " countryname_to_iso[i] = to_use.alpha_3\n", + " else:\n", + " countryname_to_iso[i] = None\n", + "\n", + "# some mandotory clean-ups required\n", + "# Will not print them as there are too many, but can be checked via print command\n", + "# print(no_isos)\n", + "no_isos = [k for k, v in countryname_to_iso.items() if v is None]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3a6bde1", + "metadata": {}, + "outputs": [], + "source": [ + "# after examining the no_isos list, I conduct the following (manual) clean-up\n", + "to_update = {\n", + " \"Micronesia (Fed. States of)\": \"FSM\",\n", + " \"State of Palestine\": \"PSE\",\n", + " \"China (and dependencies)\": \"CHN+D\",\n", + " \"China, Macao SAR\": \"MAC\",\n", + " \"China, Hong Kong SAR\": \"HKG\",\n", + " \"Bolivia (Plurinational State of)\": \"BOL\",\n", + " \"Saint Helena\": \"SHN\",\n", + " \"Holy See\": \"VAT\",\n", + " \"Venezuela (Bolivarian Republic of)\": \"VEN\",\n", + " \"Iran (Islamic Republic of)\": \"IRN\",\n", + " \"United Kingdom (and dependencies)\": \"GBR+D\",\n", + " \"New Zealand (and dependencies)\": \"NZL+D\",\n", + " \"Dem. 
People's Republic of Korea\": \"PRK\",\n", + " \"China, Taiwan Province of China\": \"TWN\",\n", + " \"Democratic Republic of the Congo\": \"COD\",\n", + " \"Republic of Korea\": \"KOR\",\n", + " \"United States Virgin Islands\": \"VIR\",\n", + " \"Denmark (and dependencies)\": \"DNK+D\",\n", + " \"France (and dependencies)\": \"FRA+D\",\n", + " \"United States of America (and dependencies)\": \"USA+D\",\n", + " \"Wallis and Futuna Islands\": \"WLF\",\n", + " \"Channel Islands\": \"GGY+JEY\",\n", + " \"Netherlands (and dependencies)\": \"NLD+D\",\n", + "}\n", + "\n", + "# updating the ISO codes\n", + "countryname_to_iso.update(to_update)" + ] + }, + { + "cell_type": "markdown", + "id": "8e8189d5", + "metadata": {}, + "source": [ + "### Detecting cases spanning multiple regions\n", + "\n", + "We do not want to account for cases like \"Europe\" where there are multiple countries / territories / sovereignties associated with it. Therefore, we will assign, to these multiple-region cases, the code `WIDE`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2944f67", + "metadata": {}, + "outputs": [], + "source": [ + "# re-checking for clean-ups; again, this is too long a list to print\n", + "no_isos_2 = [k for k, v in countryname_to_iso.items() if v is None]\n", + "\n", + "# the whole of no_isos_2 is \"WIDE\"\n", + "for i, ctry in enumerate(no_isos_2):\n", + " countryname_to_iso[ctry] = \"WIDE\"\n", + "\n", + "# applying the dictionary to get country codes (ISO)\n", + "un_df[\"ccode\"] = un_df.Location.map(countryname_to_iso)" + ] + }, + { + "cell_type": "markdown", + "id": "9644b04b", + "metadata": {}, + "source": [ + "### Exporting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b78d2c7", + "metadata": {}, + "outputs": [], + "source": [ + "un_df.rename(columns={\"Time\": \"year\"}, inplace=True)\n", + "un_df.set_index([\"ccode\", \"year\"], inplace=True)\n", + "un_df.to_parquet(sset.DIR_YPK_INT / \"un_population.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "5b4a6c04", + "metadata": {}, + "source": [ + "## UN WPP: population-by-age-group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22db8719", + "metadata": {}, + "outputs": [], + "source": [ + "# attaching country codes; first import un_pop information\n", + "by_age = pd.read_csv(sset.DIR_UN_WPP_RAW / \"UN_WPP2019_Population_by_Age.csv\")\n", + "\n", + "# attaching the country codes\n", + "un_df_dic = dict(zip(un_df.Location, un_df.index.get_level_values(\"ccode\")))\n", + "by_age[\"ccode\"] = by_age.Location.map(un_df_dic)\n", + "\n", + "# double checking if any are missing country codes\n", + "print(\"The missing-ccode rows are:\", by_age[pd.isnull(by_age.ccode)].shape[0])\n", + "\n", + "# saving the ccodes as indices\n", + "by_age.set_index([\"ccode\"], inplace=True)\n", + "\n", + "# exporting\n", + "by_age.to_parquet(sset.DIR_YPK_INT / \"un_population_by_age.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"2f3eb43d", + "metadata": {}, + "source": [ + "## GEG-15" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e18b799", + "metadata": {}, + "outputs": [], + "source": [ + "# cluster setup\n", + "N_CLUSTER = 20\n", + "cluster = gateway.new_cluster(worker_image=image_name, profile=\"micro\")\n", + "client = cluster.get_client()\n", + "cluster.scale(N_CLUSTER)\n", + "cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2b94412", + "metadata": {}, + "outputs": [], + "source": [ + "@delayed\n", + "def clean_chunk(start, num, shp_path):\n", + " with fiona.open(shp_path, \"r\") as shp:\n", + " chunk = shp[start : (start + num)]\n", + " properties = pd.DataFrame((map(itemgetter(\"properties\"), chunk)))\n", + " geometries = list(map(itemgetter(\"geometry\"), chunk))\n", + " coordinates = pd.DataFrame(\n", + " map(itemgetter(\"coordinates\"), geometries), columns=[\"lon\", \"lat\"]\n", + " )\n", + " df = coordinates.merge(properties, left_index=True, right_index=True)\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b81ef7a2", + "metadata": {}, + "outputs": [], + "source": [ + "DIR_GAR = sset.DIR_GEG15_RAW / \"gar-exp\"\n", + "with fiona.open(DIR_GAR / \"gar_exp.shp\") as shp:\n", + " num_geoms = len(shp)\n", + "\n", + "data_chunked = []\n", + "for ii in range(0, num_geoms, 1000):\n", + " data_chunked.append(clean_chunk(ii, 1000, str(DIR_GAR / \"gar_exp.shp\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa4c6a58", + "metadata": {}, + "outputs": [], + "source": [ + "df = ddf.from_delayed(data_chunked)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fe52819", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.repartition(npartitions=16).persist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04cbccaf", + "metadata": {}, + "outputs": [], + "source": [ + "sset.DIR_GEG15_INT.mkdir(parents=True, 
exist_ok=True)\n", + "df.to_parquet(sset.DIR_GEG15_INT / \"gar_exp.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b56024a", + "metadata": {}, + "outputs": [], + "source": [ + "cluster.scale(0)\n", + "client.close()\n", + "cluster.close()\n", + "cluster.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "d8d10ce3-91a6-45d6-89b0-7b13985f200d", + "metadata": {}, + "source": [ + "## Unzip and process Landscan" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e1a3d46-ca5a-4ddc-9545-66420177cf53", + "metadata": {}, + "outputs": [], + "source": [ + "spatial.process_landscan(\n", + " landscan_zip=sset.DIR_LANDSCAN_RAW / f\"{sset.LANDSCAN_VERS}.zip\",\n", + " dir_landscan_raw=sset.DIR_LANDSCAN_RAW / sset.LANDSCAN_VERS,\n", + " dir_landscan_int=sset.DIR_LANDSCAN_INT,\n", + " landscan_year=sset.LANDSCAN_YEAR,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "32e2ac64", + "metadata": {}, + "source": [ + "## CIA World Factbook: gathering GDP PPP terms\n", + "\n", + "The information gathered through sources such as PWT, World Bank WDI, and OECD Regional data often lack GDP information about many of the smaller or disputed countries and territories. In order to account for these countries, we incorporate data from CIA World Factbook dataset which has not much year-to-year information but has more countries covered.\n", + "\n", + "### Unzipping and organizing the files\n", + "\n", + "Note that the cell directly below needs to be run **only once** since it is basically unzipping the `.7z` zip file and may take a long time to repeat over again." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c92fa2c", + "metadata": {}, + "outputs": [], + "source": [ + "# unzipping: this may take a long time\n", + "CIA_DIR, zip_file_name = sset.DIR_YPK_RAW, \"weekly_json.7z\"\n", + "shutil.register_unpack_format(\"7zip\", [\".7z\"], unpack_7zarchive)\n", + "shutil.unpack_archive(CIA_DIR / zip_file_name, CIA_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20797d4d", + "metadata": {}, + "outputs": [], + "source": [ + "# ordering them by time (Earlier entries first)\n", + "CIA_DIR_week = sset.DIR_YPK_RAW / \"weekly_json\"\n", + "file_lst = np.sort(list(CIA_DIR_week.glob(\"*\")))" + ] + }, + { + "cell_type": "markdown", + "id": "2df988e2", + "metadata": {}, + "source": [ + "### Fetch necessary information from the individual `.json` files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4132539f", + "metadata": {}, + "outputs": [], + "source": [ + "def file_gdp_fetcher(filename):\n", + " \"\"\"From weekly-scraped CIA World Factbook data (in json format), gather relevant GDP\n", + " information and save as a dictionary.\n", + "\n", + " Parameters\n", + " ----------\n", + " filename : Path-like or str\n", + " individual weekly-scraped CIA World Factbook data file path\n", + "\n", + " overall_dict : dict\n", + " information (in dictionary format) containing the countries' GDP information\n", + " (in purchasing power parity) and for which year(s) those information is provided\n", + "\n", + " \"\"\"\n", + "\n", + " with open(filename) as fp:\n", + " data = json.load(fp)\n", + " ctries = list(data[\"countries\"].keys())\n", + " ctries.sort()\n", + "\n", + " note_phrase_1 = \"data are in \"\n", + " note_phrase_2 = \" dollars\"\n", + " note_phrase_3 = \" us dollars\"\n", + "\n", + " overall_dict = dict([])\n", + " for c in ctries:\n", + "\n", + " try:\n", + " info = data[\"countries\"][c][\"data\"][\"economy\"][\"gdp\"]\n", + " info = 
info[\"purchasing_power_parity\"]\n", + " note = info.get(\"note\")\n", + "\n", + " base_yr = None\n", + " if note is not None:\n", + " note = note.lower()\n", + " if (note_phrase_1 in note) and (note_phrase_3 in note):\n", + " note_ = note.split(note_phrase_1)[1]\n", + " note_ = note_.split(note_phrase_3)[0]\n", + " base_yr = int(note_[0:4])\n", + " elif (note_phrase_1 in note) and (note_phrase_2 in note):\n", + " note_ = note.split(note_phrase_1)[1]\n", + " note_ = note_.split(note_phrase_2)[0]\n", + " base_yr = int(note_[0:4])\n", + " info_values = info.get(\"annual_values\")\n", + " if (info_values is not None) and (type(info_values) in [tuple, list]):\n", + " keys = []\n", + " values = []\n", + " for i in info_values:\n", + " keys.append(int(i[\"date\"]))\n", + " values.append((i[\"value\"], int(i[\"date\"])))\n", + " if base_yr is not None:\n", + " values = [(x[0], base_yr) for x in values]\n", + " yr_dict = dict(zip(keys, values))\n", + " overall_dict[c] = yr_dict\n", + "\n", + " except KeyError:\n", + " continue\n", + "\n", + " return overall_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e8e681f", + "metadata": {}, + "outputs": [], + "source": [ + "# individual results of the file_gdp_fetcher function stored in a list\n", + "lst_results = []\n", + "for f in tqdm(file_lst):\n", + " lst_results.append(file_gdp_fetcher(f))" + ] + }, + { + "cell_type": "markdown", + "id": "c5d2d1e9", + "metadata": {}, + "source": [ + "### Updating the individual dictionaries with the most recent information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9248d602", + "metadata": {}, + "outputs": [], + "source": [ + "def update_one_with_two(dict1, dict2):\n", + " \"\"\"For simple updating of dictionaries, from `dict2` onto `dict1` in order to make\n", + " sure that all relevant CIA World Factbook data are gathered\n", + "\n", + " Parameters\n", + " ----------\n", + " dict1 : dict\n", + " dictionary to implement the 
updates onto\n", + " dict2 : dict\n", + " dictionary to gather new information from\n", + "\n", + " Returns\n", + " -------\n", + " dict_ : dict\n", + " updated dictionary containing the information of both dictionaries\n", + "\n", + " \"\"\"\n", + "\n", + " dict_ = dict(dict1)\n", + " lst1 = list(dict1.keys())\n", + "\n", + " for key in dict2.keys():\n", + " if key not in lst1:\n", + " dict_[key] = dict2[key]\n", + " continue\n", + "\n", + " subdict = dict2[key]\n", + " subkeys = list(subdict.keys())\n", + " for subkey in subkeys:\n", + " dict_[key][subkey] = subdict[subkey]\n", + "\n", + " return dict_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c6aa475", + "metadata": {}, + "outputs": [], + "source": [ + "i = 0\n", + "for res in tqdm(lst_results[1:]):\n", + " if i == 0:\n", + " midres = update_one_with_two(lst_results[0], res)\n", + " else:\n", + " midres = update_one_with_two(midres, res)\n", + " i += 1" + ] + }, + { + "cell_type": "markdown", + "id": "fe86819a", + "metadata": {}, + "source": [ + "### Saving into a long-panel format dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23ddf2f1", + "metadata": {}, + "outputs": [], + "source": [ + "ctry_dfs = []\n", + "for i in midres.keys():\n", + " info = midres[i]\n", + " i_k = list(info.keys())\n", + " i_v = [info[i_k_] for i_k_ in i_k]\n", + " ctry_info = [[i, i_k[l]] + list(i_v[l]) for l in range(len(i_k))]\n", + " ctry_df = pd.DataFrame(ctry_info, columns=[\"country\", \"year\", \"gdp\", \"ppp_year\"])\n", + " ctry_dfs.append(ctry_df)\n", + "ctry_agg_df = pd.concat(ctry_dfs, axis=0)\n", + "ctry_agg_df[\"country\"] = [x.replace(\"_\", \" \") for x in ctry_agg_df[\"country\"]]\n", + "ctry_agg_df.set_index([\"country\", \"year\"], inplace=True)\n", + "ctry_agg_df.sort_index(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "dabdcdca", + "metadata": {}, + "source": [ + "### Assigning countrycodes" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "56b9a0fb", + "metadata": {}, + "outputs": [], + "source": [ + "# let's use the UN populations data, since it should have the most countries\n", + "# to match names with values\n", + "un_loc = sset.DIR_YPK_INT\n", + "unpop = pd.read_parquet(un_loc / \"un_population.parquet\").reset_index()\n", + "\n", + "unpop[\"Location_lower\"] = [x.lower() for x in unpop.Location]\n", + "initial_cleanup = dict(zip(unpop.Location_lower, unpop.ccode))\n", + "\n", + "## attaching the cleaned countrycodes\n", + "initial_df = [list(initial_cleanup.keys()), list(initial_cleanup.values())]\n", + "initial_df = pd.DataFrame(\n", + " np.array(initial_df).T, columns=[\"country\", \"ccode\"]\n", + ").set_index([\"country\"])\n", + "ctry_agg_df = ctry_agg_df.merge(\n", + " initial_df, left_index=True, right_index=True, how=\"left\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c2eeaea", + "metadata": {}, + "outputs": [], + "source": [ + "## checking which didn't get country codes\n", + "cia_ccodes_only = ctry_agg_df.reset_index()[[\"country\", \"ccode\"]].drop_duplicates()\n", + "unknown_case = []\n", + "for i, case in enumerate(cia_ccodes_only[\"ccode\"]):\n", + " if pd.isnull(case):\n", + " unknown_case.append(cia_ccodes_only[\"country\"].values[i])\n", + "unknown_case = np.sort(np.unique(unknown_case))\n", + "print(unknown_case)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7ab6757", + "metadata": {}, + "outputs": [], + "source": [ + "# manual cleanup\n", + "unknown_case_ccodes = [\"BHS\", \"BOL\", \"BRN\", \"MMR\", \"CPV\", \"COD\", \"COG\", \"CIV\", \"CUW\"]\n", + "unknown_case_ccodes += [\"CZE\", \"TLS\", \"-\", \"FLK\", \"GMB\", \"-\", \"GGY\", \"GNB\", \"HKG\"]\n", + "unknown_case_ccodes += [\"IRN\", \"JEY\", \"PRK\", \"KOR\", \"KO-\", \"LAO\", \"MAC\", \"MKD\", \"FSM\"]\n", + "unknown_case_ccodes += [\"MDA\", \"-\", \"RUS\", \"SHN\", \"MAF\", \"SXM\", \"SWZ\", \"SYR\", \"TWN\"]\n", + 
"unknown_case_ccodes += [\"TZA\", \"TLS\", \"USA\", \"VEN\", \"VNM\", \"VIR\", \"WLF\", \"-\"]\n", + "\n", + "# double-checking the names' lengths\n", + "print(len(unknown_case) == len(unknown_case_ccodes))\n", + "\n", + "# getting a dataframe\n", + "update_df = pd.DataFrame(data={\"country\": unknown_case, \"ccode2\": unknown_case_ccodes})\n", + "update_df.set_index([\"country\"], inplace=True)\n", + "ctry_agg_df = ctry_agg_df.merge(\n", + " update_df, left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "ctry_agg_df.loc[pd.isnull(ctry_agg_df.ccode), \"ccode\"] = ctry_agg_df.loc[\n", + " pd.isnull(ctry_agg_df.ccode), \"ccode2\"\n", + "].values" + ] + }, + { + "cell_type": "markdown", + "id": "6631b61c", + "metadata": {}, + "source": [ + "### Fetching the PPP conversion rates (to constant 2017 PPP USD), and applying the conversion rates\n", + "\n", + "Also, turn it into millions of USD (currently in ones of USD)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a08c1ebf", + "metadata": {}, + "outputs": [], + "source": [ + "ppp_to_17 = ypk_fn.ppp_conversion_specific_year(2017, to=True, extrap_sim=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf2edb1f", + "metadata": {}, + "outputs": [], + "source": [ + "# neutral assumption when conversion rates are missing\n", + "ctry_agg_df = (\n", + " ctry_agg_df.reset_index()\n", + " .set_index([\"ccode\", \"year\"])\n", + " .drop([\"ccode2\"], axis=1)\n", + " .merge(ppp_to_17, left_index=True, right_index=True, how=\"left\")\n", + ")\n", + "ctry_agg_df.loc[pd.isnull(ctry_agg_df.conv), \"conv\"] = 1\n", + "\n", + "# first, divide by 1000000\n", + "ctry_agg_df[\"gdp\"] = ctry_agg_df[\"gdp\"] / 1000000\n", + "\n", + "# applying the conversion by multiplying\n", + "ctry_agg_df[\"gdp_ppp2017_currUSD\"] = ctry_agg_df[\"gdp\"] * ctry_agg_df[\"conv\"]" + ] + }, + { + "cell_type": "markdown", + "id": "6ba4de36", + "metadata": {}, + "source": [ + "### Attaching the US 
deflators and generating constant 2017 PPP USD values\n", + "\n", + "Note that while they are now in PPP of 2017, they are yet to be turned into constant 2017 PPP (since they are in current USD, for many). Therefore, we will need to fetch the US deflators (using `pl_gdpo` from PWT)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79e9dc22", + "metadata": {}, + "outputs": [], + "source": [ + "pwt = pd.read_excel(sset.PATH_PWT_RAW).rename(columns={\"countrycode\": \"ccode\"})\n", + "pwt.set_index([\"ccode\", \"year\"], inplace=True)\n", + "\n", + "us_defla = (\n", + " pwt.loc[\"USA\", [\"pl_gdpo\"]]\n", + " .reset_index()\n", + " .rename(columns={\"pl_gdpo\": \"pl_usa\", \"year\": \"ppp_year\"})\n", + ")\n", + "ctry_agg_df = (\n", + " ctry_agg_df.reset_index()\n", + " .merge(us_defla, on=[\"ppp_year\"], how=\"left\")\n", + " .set_index([\"ccode\", \"year\"])\n", + ")\n", + "\n", + "# generating constant 2017 ppp\n", + "ctry_agg_df[\"gdp_constant2017ppp\"] = (\n", + " ctry_agg_df[\"gdp_ppp2017_currUSD\"] / ctry_agg_df[\"pl_usa\"]\n", + ")\n", + "\n", + "ctry_agg_df_reorg = ctry_agg_df[[\"gdp_constant2017ppp\", \"country\"]].sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "81fed947", + "metadata": {}, + "source": [ + "### Checking for redundancies in country (ISO) codes\n", + "\n", + "Except when there aren't any country-codes attached, these redundancies are occurring because there have been changes to the countries' names over the years or there are multiple names for one country. 
We will use the following rule to remove some of the overlaps:\n", + "- SHN: Take only `saint helena ascension and tristan da cunha`\n", + "- CZE: For 2006-2012, use `czech republic` information; for 2013 and onwards, use `czechia` information.\n", + "- MKD: For 2006-2014, use `macedonia` information; for 2015 and onwards, use `north macedonia` information.\n", + "- SWZ: For 2006-2014, use `swaziland` information; for 2015 and onwards, use `eswatini` information.\n", + "- CPV: For 2006-2011, use `cape verde` information; for 2012 and onwards, use `cabo verde` information.\n", + "- TLS: Take only `timor leste`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37ee6478", + "metadata": {}, + "outputs": [], + "source": [ + "reorg_ccodes = ctry_agg_df_reorg.reset_index()[[\"ccode\", \"country\"]].drop_duplicates()\n", + "reorg_ccodes.set_index([\"ccode\"], inplace=True)\n", + "for i, ccode in enumerate(np.unique(reorg_ccodes.index.values)):\n", + " countrycases = reorg_ccodes.loc[ccode, \"country\"]\n", + " if (ccode != \"-\") and (type(countrycases) != str):\n", + " print(ccode, countrycases.values)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab54b382", + "metadata": {}, + "outputs": [], + "source": [ + "redundant_ones = [\"SHN\", \"CZE\", \"MKD\", \"SWZ\", \"CPV\", \"TLS\"]\n", + "ctry_agg_df_redun = ctry_agg_df_reorg.reset_index()\n", + "\n", + "ctry_shn = ctry_agg_df_redun.loc[\n", + " ctry_agg_df_redun.country == \"saint helena ascension and tristan da cunha\"\n", + "].set_index([\"ccode\", \"year\"])\n", + "\n", + "ctry_cze = ctry_agg_df_redun.loc[\n", + " ((ctry_agg_df_redun.country == \"czechia\") & (ctry_agg_df_redun.year >= 2013))\n", + " | (\n", + " (ctry_agg_df_redun.country == \"czech republic\")\n", + " & (ctry_agg_df_redun.year <= 2012)\n", + " )\n", + "].set_index([\"ccode\", \"year\"])\n", + "\n", + "ctry_mkd = ctry_agg_df_redun[\n", + " ((ctry_agg_df_redun.country == \"macedonia\") & 
(ctry_agg_df_redun.year <= 2014))\n", + " | (\n", + " (ctry_agg_df_redun.country == \"north macedonia\")\n", + " & (ctry_agg_df_redun.year >= 2015)\n", + " )\n", + "].set_index([\"ccode\", \"year\"])\n", + "\n", + "ctry_swz = ctry_agg_df_redun[\n", + " ((ctry_agg_df_redun.country == \"swaziland\") & (ctry_agg_df_redun.year <= 2014))\n", + " | ((ctry_agg_df_redun.country == \"eswatini\") & (ctry_agg_df_redun.year >= 2015))\n", + "].set_index([\"ccode\", \"year\"])\n", + "\n", + "ctry_cpv = ctry_agg_df_redun[\n", + " ((ctry_agg_df_redun.country == \"cape verde\") & (ctry_agg_df_redun.year <= 2011))\n", + " | ((ctry_agg_df_redun.country == \"cabo verde\") & (ctry_agg_df_redun.year >= 2012))\n", + "].set_index([\"ccode\", \"year\"])\n", + "\n", + "ctry_tls = ctry_agg_df_redun.loc[\n", + " ctry_agg_df_redun.country == \"timor leste\", :\n", + "].set_index([\"ccode\", \"year\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7ca3e2f", + "metadata": {}, + "outputs": [], + "source": [ + "ctry_agg_df_final = ctry_agg_df_reorg[\n", + " ~ctry_agg_df_reorg.index.get_level_values(\"ccode\").isin(\n", + " [\"-\", \"WIDE\"] + redundant_ones\n", + " )\n", + "].copy()\n", + "\n", + "ctry_agg_df_final = pd.concat(\n", + " [ctry_agg_df_final, ctry_shn, ctry_cze, ctry_mkd, ctry_swz, ctry_cpv, ctry_tls],\n", + " axis=0,\n", + ").sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "633df76b", + "metadata": {}, + "source": [ + "### Adding those that are not in the files\n", + "\n", + "**Tokelau `TKL`**\n", + "\n", + "According to Tokelau government (link [here](https://www.tokelau.org.nz/Bulletin/April+2017/GDP+first.html)), its PPP USD was 10 million (in 2017). So we will fill this in." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1c760d2", + "metadata": {}, + "outputs": [], + "source": [ + "tkl = pd.DataFrame(\n", + " [\n", + " [\"TKL\", 2017, 10, \"tokelau\"],\n", + " ],\n", + " columns=[\"ccode\", \"year\", \"gdp_constant2017ppp\", \"country\"],\n", + ").set_index([\"ccode\", \"year\"])\n", + "ctry_agg_df_final = pd.concat([ctry_agg_df_final, tkl], axis=0)" + ] + }, + { + "cell_type": "markdown", + "id": "e7b2ec2f", + "metadata": {}, + "source": [ + "**Saint Helena (`SHN`)**\n", + "\n", + "I update the latest values using the CIA World Factbook's January 7, 2021 vintage (link [here](https://www.cia.gov/the-world-factbook/)). For `SHN`, it is said that the 2009 value of GDP (in constant 2009 PPP USD) is 31.1 million, but we do not have the explicit PPP conversion for `SHN`. Since `SHN` is a British territory, `GBR` PPP rates are used." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab2c1cbd", + "metadata": {}, + "outputs": [], + "source": [ + "shn_rate = ppp_to_17.loc[(\"SHN\", 2009), \"conv\"]\n", + "us_def09 = pwt.loc[(\"USA\", 2009), \"pl_gdpo\"]\n", + "shn = pd.DataFrame(\n", + " [\n", + " [\"SHN\", 2009, shn_rate / us_def09 * 31.1, \"saint helena\"],\n", + " ],\n", + " columns=[\"ccode\", \"year\", \"gdp_constant2017ppp\", \"country\"],\n", + ").set_index([\"ccode\", \"year\"])\n", + "\n", + "ctry_agg_df_final = pd.concat([ctry_agg_df_final, shn], axis=0)" + ] + }, + { + "cell_type": "markdown", + "id": "207fa442", + "metadata": {}, + "source": [ + "**Vatican (`VAT`)**\n", + "\n", + "While not in the latest CIA World Factbook, the 2000 version has some information about Vatican city (archived [here](https://www.encyclopedia.com/places/spain-portugal-italy-greece-and-balkans/italian-political-geography/vatican-city)) which we will be able to use. It says that the 1999 estimate of the Vatican GDP (assuming it's constant 1999 PPP) was 21 million USD. 
Let us use the PPP conversion rates of Italy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554deb3e", + "metadata": {}, + "outputs": [], + "source": [ + "vat_rate = ppp_to_17.loc[(\"VAT\", 1999), \"conv\"]\n", + "us_def99 = pwt.loc[(\"USA\", 1999), \"pl_gdpo\"]\n", + "vat = pd.DataFrame(\n", + " [\n", + " [\"VAT\", 1999, vat_rate / us_def99 * 21, \"vatican\"],\n", + " ],\n", + " columns=[\"ccode\", \"year\", \"gdp_constant2017ppp\", \"country\"],\n", + ").set_index([\"ccode\", \"year\"])\n", + "\n", + "ctry_agg_df_final = pd.concat([ctry_agg_df_final, vat], axis=0)" + ] + }, + { + "cell_type": "markdown", + "id": "483b6c8c", + "metadata": {}, + "source": [ + "### Exporting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aad9b65", + "metadata": {}, + "outputs": [], + "source": [ + "ctry_agg_df_final.sort_index(inplace=True)\n", + "ctry_agg_df_final.rename(columns={\"gdp_constant2017ppp\": \"cia_rgdpna\"}, inplace=True)\n", + "ctry_agg_df_final.to_parquet(\n", + " sset.DIR_YPK_INT / \"cia_wf_gdp_constant_2017_ppp_usd_ver.parquet\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk2_reorg_and_impute_ypk.ipynb b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk2_reorg_and_impute_ypk.ipynb new file mode 100644 index 0000000..c281a92 --- /dev/null +++ 
b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk2_reorg_and_impute_ypk.ipynb @@ -0,0 +1,3790 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2bbfaf73-96c1-46d7-bafe-92c30ced3b76", + "metadata": {}, + "source": [ + "## Reorganizing raw data (GDP, GDPpc, and population) in long-panel format, converting to current and constant PPP terms, taking care of missing data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1c69bfd-3b4c-4bb4-8d00-6f8e1bdf7122", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85f3f8c1-947b-4170-acc0-366f75565a86", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xarray as xr\n", + "from tqdm.auto import tqdm\n", + "\n", + "from sliiders import country_level_ypk as ypk_fn\n", + "from sliiders import settings as sset" + ] + }, + { + "cell_type": "markdown", + "id": "7444df03-7e87-43d1-bf13-bf7f7cd9885e", + "metadata": {}, + "source": [ + "## Importing all raw data, and creating a merged, long-panel version\n", + "\n", + "### PWT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "996df854-1843-45fb-ad57-74173b99fcf5", + "metadata": {}, + "outputs": [], + "source": [ + "pwt100 = pd.read_excel(sset.PATH_PWT_RAW)\n", + "pwt100.rename(columns={\"countrycode\": \"ccode\"}, inplace=True)\n", + "pwt_gdp_pop = [\"ccode\", \"year\", \"pop\", \"rgdpo\", \"rgdpna\", \"cgdpo\"]\n", + "gdp_pop_df = pwt100[pwt_gdp_pop].copy().set_index([\"ccode\", \"year\"])" + ] + }, + { + "cell_type": "markdown", + "id": "f7c1b3b0-fe61-4825-85db-596c639eff72", + "metadata": {}, + "source": [ + "### WB WDI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e45a649d-dc43-41fc-a96d-9dc1bd1adda7", + "metadata": {}, + "outputs": [], + "source": [ + "# WB WDI\n", + 
"wdi_rename_dict = {\n", + " \"SP.POP.TOTL\": \"wb_pop\",\n", + " \"NY.GDP.MKTP.PP.KD\": \"wb_rgdpna\",\n", + " \"NY.GDP.PCAP.PP.KD\": \"wb_rgdpna_pc\",\n", + " \"NY.GDP.MKTP.KD\": \"wb_gdp_nom\",\n", + " \"NY.GDP.PCAP.KD\": \"wb_gdp_nom_pc\",\n", + "}\n", + "wb_wdi = pd.read_parquet(sset.DIR_WB_WDI_RAW / \"wdi_pop_iy_gdp.parquet\").rename(\n", + " columns=wdi_rename_dict\n", + ")\n", + "wb_wdi = wb_wdi.loc[\n", + " wb_wdi.index.get_level_values(\"ccode\").isin(sset.ALL_ISOS_EXTENDED), :\n", + "].reset_index()\n", + "\n", + "# Unifying the country code conventions for Kosovo and Channel Islands\n", + "wb_wdi.loc[wb_wdi.ccode == \"XKX\", \"ccode\"] = \"KO-\"\n", + "wb_wdi.loc[wb_wdi.ccode == \"CHI\", \"ccode\"] = \"GGY+JEY\"\n", + "wb_wdi.set_index([\"ccode\", \"year\"], inplace=True)\n", + "\n", + "# re-scaling; currently in ones, but to have them in PWT scales\n", + "wb_wdi[[\"wb_rgdpna\", \"wb_gdp_nom\", \"wb_pop\"]] /= 1000000\n", + "\n", + "# merging\n", + "gdp_pop_df = gdp_pop_df.join(wb_wdi, how=\"outer\")" + ] + }, + { + "cell_type": "markdown", + "id": "b8696d23-122e-4a1a-bfa5-24485a623f20", + "metadata": {}, + "source": [ + "### IMF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cf1d2eb-3fb8-4057-b1ac-7e7e56cd751c", + "metadata": {}, + "outputs": [], + "source": [ + "# the code below prevents the \"ArrowInvalid\" error\n", + "imf = pd.read_excel(sset.PATH_IMF_WEO_RAW, na_values=[\"n/a\", \"--\"]).rename(\n", + " columns={\"ISO\": \"ccode\", \"Subject Descriptor\": \"subject\"}\n", + ")\n", + "imf = imf.loc[imf.ccode.isin(sset.ALL_ISOS_EXTENDED), :]\n", + "\n", + "# renaming the subjects\n", + "imf_rename = {\n", + " \"Gross domestic product per capita, constant prices\": \"imf_rgdpna_pc\",\n", + " \"Gross domestic product per capita, current prices\": \"imf_gdppc_nom\",\n", + " \"Gross domestic product, current prices\": \"imf_gdp_nom\",\n", + " \"Population\": \"imf_pop\",\n", + "}\n", + "for key, nam in imf_rename.items():\n", 
+ " imf.loc[imf.subject == key, \"subject\"] = nam\n", + "imf = imf.loc[imf.subject.isin(list(imf_rename.values())), :].copy()\n", + "v_names = dict(zip(list(range(1980, 2021)), [\"v_\" + str(x) for x in range(1980, 2021)]))\n", + "imf.rename(columns=v_names, inplace=True)\n", + "\n", + "# organizing this in vertical format\n", + "first = 0\n", + "for nam in imf_rename.values():\n", + " imf_sub = imf.loc[imf.subject == nam, [\"ccode\"] + list(v_names.values())].set_index(\n", + " [\"ccode\"]\n", + " )\n", + " imf_sub = ypk_fn.organize_hor_to_ver(\n", + " imf_sub, \"ccode\", None, nam, \"v_\", range(1980, 2021)\n", + " )\n", + " imf_sub[nam] = imf_sub[nam].astype(\"float64\")\n", + " if first == 0:\n", + " first += 1\n", + " imf_reorg = imf_sub.copy()\n", + " else:\n", + " imf_reorg = imf_reorg.merge(\n", + " imf_sub, how=\"outer\", left_index=True, right_index=True\n", + " )\n", + "\n", + "gdp_pop_df = gdp_pop_df.merge(imf_reorg, how=\"outer\", left_index=True, right_index=True)" + ] + }, + { + "cell_type": "markdown", + "id": "a5b448af-0fbc-40ef-a3e6-191c278ca955", + "metadata": {}, + "source": [ + "### Maddison Project Database (MPD)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "507434c7-071f-40a3-b205-33668913aabd", + "metadata": {}, + "outputs": [], + "source": [ + "# MPD\n", + "mpd = pd.read_parquet(sset.DIR_YPK_INT / \"maddison_project.parquet\")\n", + "mpd = mpd.loc[\n", + " mpd.index.get_level_values(\"ccode\").isin(sset.ALL_ISOS_EXTENDED), [\"gdppc\", \"pop\"]\n", + "].rename(columns=dict(zip([\"gdppc\", \"pop\"], [\"mpd_rgdpna_pc\", \"mpd_pop\"])))\n", + "\n", + "# separating North Korea (PRK) and non-PRK\n", + "mpd_no_prk = mpd.loc[mpd.index.get_level_values(\"ccode\") != \"PRK\", :].sort_index()\n", + "mpd_prk = mpd.loc[[\"PRK\"], :].sort_index()\n", + "\n", + "# some minor interpolation for the case of North Korea\n", + "mpd_prk_interped = pd.DataFrame(data={\"ccode\": [\"PRK\"] * 71, \"year\": range(1950, 2021)})\n", + 
"mpd_prk_interped.set_index([\"ccode\", \"year\"], inplace=True)\n", + "for i in [\"mpd_rgdpna_pc\", \"mpd_pop\"]:\n", + " i_yrs = (\n", + " mpd_prk.loc[~pd.isnull(mpd_prk[i]), :].index.get_level_values(\"year\").unique()\n", + " )\n", + " vals = mpd_prk.loc[(\"PRK\", i_yrs), i].values\n", + " interp_yrs = list(range(1950, i_yrs.max() + 1))\n", + " vals_interp = np.exp(np.interp(interp_yrs, i_yrs, np.log(vals)))\n", + " mpd_prk_interped[i] = np.nan\n", + " mpd_prk_interped.loc[(\"PRK\", interp_yrs), i] = vals_interp\n", + "\n", + "# merge\n", + "mpd = pd.concat([mpd_no_prk, mpd_prk_interped], axis=0)\n", + "gdp_pop_df = gdp_pop_df.join(mpd, how=\"outer\")" + ] + }, + { + "cell_type": "markdown", + "id": "8a8f4546-cc1a-467a-9cb4-1b50d631785c", + "metadata": {}, + "source": [ + "### UN population data (UN WPP)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a7da224-0438-419c-b6c3-7a5e60a8db5e", + "metadata": {}, + "outputs": [], + "source": [ + "unpop = pd.read_parquet(\n", + " sset.DIR_YPK_INT / \"un_population.parquet\",\n", + " filters=[\n", + " (\"Variant\", \"==\", \"Medium\"),\n", + " ],\n", + ")\n", + "unpop = unpop.loc[\n", + " unpop.index.get_level_values(\"ccode\").isin(sset.ALL_ISOS_EXTENDED)\n", + " & (unpop.index.get_level_values(\"year\") <= 2020),\n", + " [\"PopTotal\"],\n", + "].rename(columns={\"PopTotal\": \"un_pop\"})\n", + "\n", + "# re-organizing from thousands to millions\n", + "unpop[\"un_pop\"] /= 1000\n", + "\n", + "# merging\n", + "gdp_pop_df = gdp_pop_df.join(unpop.un_pop, how=\"outer\").sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "d5520b05-0d39-42fc-8f53-6194f7bd317b", + "metadata": {}, + "source": [ + "### OECD regional data\n", + "\n", + "Among the relevant countries and regions we want to observe, only the five French overseas departments (Martinique, Mayotte, Guadeloupe, French Guiana, and La Réunion) are available in OECD regional data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5f03eb9-c89e-4299-8507-90d15851b5b4", + "metadata": {}, + "outputs": [], + "source": [ + "# mapping region names and ISO codes\n", + "fra_regions = [\"Martinique\", \"Mayotte\", \"Guadeloupe\", \"French Guiana\", \"La Réunion\"]\n", + "fra_isos = [\"MTQ\", \"MYT\", \"GLP\", \"GUF\", \"REU\"]\n", + "fra_map = pd.DataFrame(data={\"Region\": fra_regions, \"ccode\": fra_isos})\n", + "\n", + "# reading in the OECD data for population and gdp\n", + "regpop = pd.read_csv(sset.DIR_OECD_REGIONS_RAW / \"REGION_DEMOGR.csv\").rename(\n", + " columns={\"Territory Level and Typology\": \"terrtype\", \"TIME\": \"year\"}\n", + ")\n", + "regpop = (\n", + " regpop.loc[(regpop.terrtype != \"Country\") & ~pd.isnull(regpop.Value), :]\n", + " .merge(fra_map, on=[\"Region\"], how=\"left\")\n", + " .sort_values([\"ccode\", \"year\"])\n", + " .rename(columns={\"Value\": \"oecd_pop\"})\n", + ")\n", + "\n", + "regecon = pd.read_csv(sset.DIR_OECD_REGIONS_RAW / \"REGION_ECONOM.csv\").rename(\n", + " columns={\"Territory Level and Typology\": \"terrtype\", \"TIME\": \"year\"}\n", + ")\n", + "regecon = (\n", + " regecon.loc[(regecon.terrtype != \"Country\") & ~pd.isnull(regecon.Value), :]\n", + " .merge(fra_map, on=[\"Region\"], how=\"left\")\n", + " .sort_values([\"ccode\", \"year\"])\n", + " .rename(columns={\"Value\": \"oecd_rgdpna\"})\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f222495-f877-4c98-b5d8-13da2b9aab93", + "metadata": {}, + "outputs": [], + "source": [ + "# subsetting for the total population information\n", + "fra_pop_detect = regpop.loc[\n", + " ~pd.isnull(regpop.ccode) & (regpop.VAR == \"T\") & (regpop.Gender == \"Total\"), :\n", + "].set_index([\"ccode\", \"year\"])\n", + "fra_pop_detect[\"oecd_pop\"] /= 1000000\n", + "\n", + "# subsetting for the total GDP information\n", + "fra_Y_detect = regecon.loc[\n", + " ~pd.isnull(regecon.ccode) & (regecon.MEAS == 
\"USD_PPP\") & (regecon.year <= 2020), :\n", + "].set_index([\"ccode\", \"year\"])\n", + "\n", + "# merging with the original dataset\n", + "gdp_pop_df = gdp_pop_df.join(\n", + " [fra_pop_detect.oecd_pop, fra_Y_detect.oecd_rgdpna], how=\"outer\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "41c7aa03-2b63-4e5a-ab00-98ebafaf68b3", + "metadata": {}, + "source": [ + "### CIA World Factbook\n", + "\n", + "CIA information has been pre-cleaned to be in 2017 PPP USD, part of which has used extrapolation for PPP conversion rates." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2442891d-5540-40b4-bf69-2d5ddd8a3c79", + "metadata": {}, + "outputs": [], + "source": [ + "cia = pd.read_parquet(sset.DIR_YPK_INT / \"cia_wf_gdp_constant_2017_ppp_usd_ver.parquet\")\n", + "gdp_pop_df = gdp_pop_df.merge(\n", + " cia[[\"cia_rgdpna\"]], left_index=True, right_index=True, how=\"outer\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cfe7cdaf-068b-462c-9941-4dd265630f25", + "metadata": {}, + "source": [ + "### UN SNA AMA information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b83a9e04-fe4c-42d5-a8e0-f1d5d348e107", + "metadata": {}, + "outputs": [], + "source": [ + "# matching country/region names and country/region codes\n", + "wb_country_matching = (\n", + " wb_wdi[[\"country\"]].reset_index()[[\"country\", \"ccode\"]].drop_duplicates()\n", + ")\n", + "\n", + "un_country_dict_additional = [\n", + " [\"Anguilla\", \"AIA\"],\n", + " [\"Bolivia (Plurinational State of)\", \"BOL\"],\n", + " [\"China, Hong Kong SAR\", \"HKG\"],\n", + " [\"China, Macao Special Administrative Region\", \"MAC\"],\n", + " [\"China, People's Republic of\", \"CHN\"],\n", + " [\"Congo\", \"COG\"],\n", + " [\"Cook Islands\", \"COK\"],\n", + " [\"Curaçao\", \"CUW\"],\n", + " [\"Czechia\", \"CZE\"],\n", + " [\"Côte d'Ivoire\", \"CIV\"],\n", + " [\"Democratic People's Republic of Korea\", \"PRK\"],\n", + " [\"Democratic Republic of the Congo\", 
\"COD\"],\n", + " [\"Egypt\", \"EGY\"],\n", + " [\"Gambia\", \"GMB\"],\n", + " [\"Iran, Islamic Republic of\", \"IRN\"],\n", + " [\"Kingdom of Eswatini\", \"SWZ\"],\n", + " [\"Kyrgyzstan\", \"KGZ\"],\n", + " [\"Lao People's Democratic Republic\", \"LAO\"],\n", + " [\"Micronesia (Federated States of)\", \"FSM\"],\n", + " [\"Montserrat\", \"MSR\"],\n", + " [\"Republic of Korea\", \"KOR\"],\n", + " [\"Republic of Moldova\", \"MDA\"],\n", + " [\"Republic of North Macedonia\", \"MKD\"],\n", + " [\"Saint Kitts and Nevis\", \"KNA\"],\n", + " [\"Saint Lucia\", \"LCA\"],\n", + " [\"Saint Vincent and the Grenadines\", \"VCT\"],\n", + " [\"Slovakia\", \"SVK\"],\n", + " [\"State of Palestine\", \"PSE\"],\n", + " [\"Venezuela (Bolivarian Republic of)\", \"VEN\"],\n", + " [\"Viet Nam\", \"VNM\"],\n", + " [\"Yemen\", \"YEM\"],\n", + " [\"United Kingdom of Great Britain and Northern Ireland\", \"GBR\"],\n", + "]\n", + "\n", + "country_matching_additional = pd.concat(\n", + " [\n", + " wb_country_matching,\n", + " pd.DataFrame(un_country_dict_additional, columns=[\"country\", \"ccode\"]),\n", + " ],\n", + " axis=0,\n", + ").reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f695bc5-e894-4241-9dc0-361c0916be78", + "metadata": {}, + "outputs": [], + "source": [ + "UN_NOM_y = pd.read_csv(sset.DIR_UN_AMA_RAW / \"un_snaama_nom_gdppc.csv\").rename(\n", + " columns={\n", + " \"Country/Area\": \"country\",\n", + " \"Year\": \"year\",\n", + " \"GDP, Per Capita GDP - US Dollars\": \"un_nom_gdppc\",\n", + " }\n", + ")\n", + "UN_NOM_y.drop([\"Unit\"], axis=1, inplace=True)\n", + "UN_NOM_y = UN_NOM_y.merge(wb_country_matching, on=[\"country\"], how=\"left\")\n", + "UN_NOM_y = UN_NOM_y.loc[~pd.isnull(UN_NOM_y.ccode), :]\n", + "UN_NOM_y = UN_NOM_y.set_index([\"ccode\", \"year\"]).drop([\"country\"], axis=1).un_nom_gdppc\n", + "gdp_pop_df = gdp_pop_df.join(UN_NOM_y, how=\"outer\")\n", + "gdp_pop_df.sort_index(inplace=True)" + ] + }, + { + "cell_type": 
"markdown", + "id": "d005a3ed-31cf-48a1-92a8-43cecdb03a16", + "metadata": { + "tags": [] + }, + "source": [ + "### Information from various disaggregated sources, for smaller regions, territories and countries\n", + "\n", + "This includes national account reports and approximations from organizational reports or academic papers.\n", + "\n", + "#### Åland Islands (`ALA`; GDP per capita and population)\n", + "\n", + "- GDP per capita: information available from Statistics and Research Åland (link [here](https://www.asub.ax/en/statistics/national-accounts/gross-domestic-product), see the link \"GDP per capita 1995-2018 in current prices, PPS euro\"). Since this is in PPP Euro (or Finnish Purchasing Power Standard exchange rate), we will use the nominal Euro-to-USD converision to clean this assuming that these are current PPP.\n", + "- Population: information available from Statistics and Research Åland (link [here](https://www.asub.ax/en/statistics/population/size-and-structure-population), see the link \"Åland, the Faroe Islands and Greenland\". This is in ones of people, so we divide by 1 million to keep the population in millions of people." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7359dad2-96f4-4d8a-93f1-38b798b3f7df", + "metadata": {}, + "outputs": [], + "source": [ + "# aland islands information, GDP\n", + "ala_gdp = (\n", + " pd.read_excel(sset.DIR_ALAND_STATISTICS_RAW / \"aland_gdp.xlsx\")\n", + " .rename(columns={\"Unnamed: 0\": \"country\"})\n", + " .set_index([\"country\"])\n", + ")\n", + "ala_years = ala_gdp.columns.values\n", + "ala_95_18_cgdpo_pc = ala_gdp.loc[\"Åland\", :]\n", + "\n", + "# exchange rate; EMU only has down to 1999, so for convenience's sake\n", + "# for 1995-1998, we will use 1999 rates\n", + "wdi_xrate = (\n", + " pd.read_parquet(sset.DIR_WB_WDI_RAW / \"wdi_xr.parquet\")\n", + " .loc[(\"EMU\", list(range(1999, ala_years.max() + 1))), \"xrate\"]\n", + " .values\n", + ")\n", + "ala_xrate = np.hstack([[wdi_xrate[0]] * (1999 - ala_years.min()), wdi_xrate])\n", + "ala_95_18_cgdpo_pc = ala_95_18_cgdpo_pc * ala_xrate\n", + "\n", + "# creating the Aland islands column for cgdpo_pc\n", + "ala = (\n", + " (gdp_pop_df.loc[pd.IndexSlice[\"FIN\", 1950:], \"cgdpo\"].copy() * np.nan)\n", + " .rename(\"ala_cgdpo_pc\")\n", + " .reset_index()\n", + ")\n", + "ala[\"ccode\"] = \"ALA\"\n", + "ala = ala.set_index([\"ccode\", \"year\"]).ala_cgdpo_pc\n", + "ala.loc[\"ALA\", list(ala_years)] = ala_95_18_cgdpo_pc.values\n", + "\n", + "# aland islands information, population\n", + "ala_pop_link = (\n", + " \"https://www.asub.ax/sites/www.asub.ax/files/attachments/page/alv01_aland_faroe\"\n", + " \"_islands_and_greenland_-_an_overview_with_comparable_data.xlsx\"\n", + ")\n", + "ala_pop = pd.read_excel(sset.DIR_ALAND_STATISTICS_RAW / \"aland_pop.xlsx\").rename(\n", + " columns={\"Unnamed: 0\": \"category\"}\n", + ")\n", + "ala_pop_00_20 = ala_pop.iloc[1].values[1:-1]\n", + "ala_pop = (\n", + " (gdp_pop_df.loc[pd.IndexSlice[\"FIN\", 1950:], \"pop\"].copy() * np.nan)\n", + " .rename(\"ala_pop\")\n", + " .reset_index()\n", + ")\n", + "ala_pop[\"ccode\"] = \"ALA\"\n", + 
"ala_pop = ala_pop.set_index([\"ccode\", \"year\"]).ala_pop\n", + "ala_pop.loc[\"ALA\", list(range(2000, 2021))] = ala_pop_00_20 / 1000000\n", + "\n", + "# merging all\n", + "gdp_pop_df = gdp_pop_df.join(ala, how=\"outer\")\n", + "gdp_pop_df = gdp_pop_df.join(ala_pop, how=\"outer\")\n", + "gdp_pop_df.sort_index(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "f7470549-9075-4b17-a110-789c60f88336", + "metadata": { + "tags": [] + }, + "source": [ + "#### Norfolk Island (`NFK`)\n", + "\n", + "- GDP: GDPpc as a percentage of the Australian level for the years 1951-52 are shown in [Treadgold (Asia Pacific Viewpoint, 1999)](https://doi.org/10.1111/1467-8373.00095) and similar percentage for 1995-96 are shown in [Treadgold (Pacific Economic Bulletin, 1998)](https://openresearch-repository.anu.edu.au/handle/1885/157535).\n", + "- Population: The Australian Census has information for Norfolk in the years 2001, 2011, and 2016: www.infrastructure.gov.au/territories-regions-cities/territories/norfolk-island" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a142f948-0112-411a-8a0f-fb155117734e", + "metadata": {}, + "outputs": [], + "source": [ + "# population\n", + "nfk = (\n", + " (gdp_pop_df.loc[pd.IndexSlice[\"AUS\", 1950:], \"un_pop\"].copy() * np.nan)\n", + " .rename(\"aus_census_pop\")\n", + " .reset_index()\n", + ")\n", + "nfk[\"ccode\"] = \"NFK\"\n", + "nfk = nfk.set_index([\"ccode\", \"year\"]).aus_census_pop\n", + "nfk.loc[\"NFK\", [2001, 2011, 2016]] = np.array([2601, 1796, 1748]) / 1000000\n", + "gdp_pop_df = gdp_pop_df.join(nfk, how=\"outer\")\n", + "\n", + "# GDP\n", + "nfk_gdp = (\n", + " (gdp_pop_df.loc[pd.IndexSlice[\"AUS\", 1950:], \"rgdpna\"].copy() * np.nan)\n", + " .rename(\"treadgold_rgdpna_pc\")\n", + " .reset_index()\n", + ")\n", + "nfk_gdp[\"ccode\"] = \"NFK\"\n", + "nfk_cgdpo = nfk_gdp.copy().rename(columns={\"treadgold_rgdpna_pc\": \"treadgold_cgdpo_pc\"})\n", + "\n", + "# getting the Australian GDPpc\n", + 
"nfk_yrs = [1951, 1952, 1995, 1996]\n", + "nfk_ratios = np.array([0.39, 0.39, 1.12, 1.12])\n", + "nfk_rgdpna_pc = (\n", + " gdp_pop_df.loc[pd.IndexSlice[\"AUS\", nfk_yrs], \"rgdpna\"].values\n", + " / gdp_pop_df.loc[pd.IndexSlice[\"AUS\", nfk_yrs], \"pop\"].values\n", + " * nfk_ratios\n", + ")\n", + "nfk_gdp = nfk_gdp.set_index([\"ccode\", \"year\"]).treadgold_rgdpna_pc\n", + "nfk_gdp.loc[\"NFK\", nfk_yrs] = nfk_rgdpna_pc\n", + "gdp_pop_df = gdp_pop_df.join(nfk_gdp, how=\"outer\")\n", + "\n", + "nfk_cgdpo_pc = (\n", + " gdp_pop_df.loc[pd.IndexSlice[\"AUS\", nfk_yrs], \"cgdpo\"].values\n", + " / gdp_pop_df.loc[pd.IndexSlice[\"AUS\", nfk_yrs], \"pop\"].values\n", + " * nfk_ratios\n", + ")\n", + "nfk_cgdpo = nfk_cgdpo.set_index([\"ccode\", \"year\"]).treadgold_cgdpo_pc\n", + "nfk_cgdpo.loc[\"NFK\", nfk_yrs] = nfk_cgdpo_pc\n", + "gdp_pop_df = gdp_pop_df.join(nfk_cgdpo, how=\"outer\")" + ] + }, + { + "cell_type": "markdown", + "id": "d988bec2-a804-4cf8-a84c-8c53df4c39cd", + "metadata": { + "tags": [] + }, + "source": [ + "#### Cocos (Keeling) Islands (`CCK`)\n", + "\n", + "- GDP: Information available from the House of Representative Committees of Parliament of Australia (link [here](https://www.aph.gov.au/parliamentary_business/committees/House_of_Representatives_Committees?url=ncet/economicenvironment/report/index.htm)). 
Please see **Chapter 3 The economic environment of the Indian Ocean Territories** page 23; the units are in Australian dollars (nominal), and the value corresponds to 2010's GDP.\n", + "- Population: from the Australian Bureau of Statistics (ABS); information for [2016](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/90102), [2011](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2011/quickstat/90102?opendocument), [2006](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2006/quickstat/910053009?opendocument), and [2001](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2001/quickstat/910053009?opendocument) available from the ABS Quickstat pages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3eb5aa9-cf97-4dbe-9fec-eb4760f74f32", + "metadata": {}, + "outputs": [], + "source": [ + "# GDP\n", + "cck_gdp = (\n", + " (gdp_pop_df.loc[pd.IndexSlice[\"AUS\", 1950:], \"rgdpna\"].copy() * np.nan)\n", + " .rename(\"aus_census_nom_gdp\")\n", + " .reset_index()\n", + ")\n", + "cck_gdp[\"ccode\"] = \"CCK\"\n", + "cck_gdp = cck_gdp.set_index([\"ccode\", \"year\"]).aus_census_nom_gdp\n", + "cck_gdp.loc[\"CCK\", [2010]] = (15000000 / 1000000) * pd.read_parquet(\n", + " sset.DIR_WB_WDI_RAW / \"wdi_xr.parquet\"\n", + ").loc[(\"AUS\", [2010]), \"xrate\"].values\n", + "\n", + "gdp_pop_df = gdp_pop_df.join(cck_gdp, how=\"outer\")\n", + "\n", + "# population\n", + "gdp_pop_df.loc[(\"CCK\", [2001, 2006, 2011, 2016]), \"aus_census_pop\"] = (\n", + " np.array([621, 572, 550, 544]) / 1000000\n", + ")\n", + "gdp_pop_df.sort_index(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "55f78a73-023c-48ce-81df-7510ecf88d98", + "metadata": {}, + "source": [ + "#### Christmas Island (`CXR`)\n", + "- GDP: Information available from the House of Representative Committees of Parliament of Australia (link 
[here](https://www.aph.gov.au/parliamentary_business/committees/House_of_Representatives_Committees?url=ncet/economicenvironment/report/index.htm)). Please see **Chapter 3 The economic environment of the Indian Ocean Territories** page 23; the units are in Australian dollars (nominal), and the value corresponds to 2010's GDP.\n", + "- Population: from the Australian Bureau of Statistics (ABS); information for [2016](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/90101?opendocument), [2011](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2011/quickstat/910052009?opendocument), [2006](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2006/quickstat/910052009?opendocument&navpos=220), and [2001](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2001/quickstat/910052009?opendocument&navpos=220) available from the ABS Quickstat pages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c93edf40-a139-4ccb-9a6d-35d65f77af45", + "metadata": {}, + "outputs": [], + "source": [ + "# population\n", + "cxr_gdp_pop = gdp_pop_df.loc[[\"CCK\"]].reset_index()\n", + "cxr_gdp_pop[\"ccode\"] = \"CXR\"\n", + "cxr_gdp_pop.set_index([\"ccode\", \"year\"], inplace=True)\n", + "cxr_gdp_pop.loc[(\"CXR\", [2001, 2006, 2011, 2016]), \"aus_census_pop\"] = (\n", + " np.array([1446, 1349, 2072, 1843]) / 1000000\n", + ")\n", + "\n", + "# GDP\n", + "xrate_val = (\n", + " pd.read_parquet(sset.DIR_WB_WDI_RAW / \"wdi_xr.parquet\")\n", + " .loc[(\"AUS\", 2010), \"xrate\"]\n", + " .values[0]\n", + ")\n", + "cxr_gdp_pop.loc[(\"CXR\", 2010), \"aus_census_nom_gdp\"] = xrate_val * 71000000 / 1000000\n", + "\n", + "# merging\n", + "gdp_pop_df = pd.concat([gdp_pop_df, cxr_gdp_pop], axis=0).sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "20aa0c69-7d20-4bd8-aa68-2e31d7ee8234", + "metadata": {}, + "source": [ + "#### Pitcairn Island (`PCN`)\n", + "\n", + "- 
GDP: estimate of approximately 217,000 New Zealand dollars (from [this link](https://web.archive.org/web/20150705134639/http://www.government.pn/policies/Pitcairn%20Island%20SDP%202012-2016.pdf#page=4) for a WayBackMachine Archive of the Government of Pitcairn's \"Pitcairn Islands Strategic Development Plan\")\n", + "- Population: from the Pitcairn Island [government website](http://www.immigration.gov.pn/community/the_people/index.html), 233 people in 1937 and 49 people in 2017. Also, according to CIA World Factbook ([link here](https://www.cia.gov/the-world-factbook/countries/pitcairn-islands/#people-and-society)), its population was 50 in 2021 (will interpolate and round up to whole numbers)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4948806-11d6-4237-a9f7-e5698707d467", + "metadata": {}, + "outputs": [], + "source": [ + "# Pitcairn islands population\n", + "pcn_50_20 = np.round(\n", + " np.exp(\n", + " np.interp(list(range(1950, 2021)), [1937, 2017, 2021], np.log([233, 49, 50]))\n", + " ),\n", + " 0,\n", + ")\n", + "pcn_gdp = (\n", + " (gdp_pop_df.loc[pd.IndexSlice[\"NZL\", 1950:], \"pop\"].copy() * np.nan)\n", + " .rename(\"pcn_pop\")\n", + " .reset_index()\n", + ")\n", + "pcn_gdp[\"ccode\"] = \"PCN\"\n", + "pcn_pop = pcn_gdp.copy()\n", + "pcn_pop = pcn_pop.set_index([\"ccode\", \"year\"]).pcn_pop\n", + "pcn_pop.loc[\"PCN\", list(range(1950, 2021))] = pcn_50_20 / 1000000\n", + "\n", + "# Pitcairn GDP\n", + "pcn_gdp.rename(columns={\"pcn_pop\": \"pcn_nom_gdp\"}, inplace=True)\n", + "pcn_gdp = pcn_gdp.set_index([\"ccode\", \"year\"]).pcn_nom_gdp\n", + "wdi_xrate = (\n", + " pd.read_parquet(sset.DIR_WB_WDI_RAW / \"wdi_xr.parquet\")\n", + " .loc[(\"NZL\", 2006), \"xrate\"]\n", + " .values[0]\n", + ")\n", + "pcn_gdp.loc[\"PCN\", 2006] = (217000 / 1000000) * wdi_xrate\n", + "\n", + "# merging all\n", + "gdp_pop_df = gdp_pop_df.join(pcn_pop, how=\"outer\")\n", + "gdp_pop_df = gdp_pop_df.join(pcn_gdp, how=\"outer\")\n", + 
"gdp_pop_df.sort_index(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "03150de2-40b1-4c60-9e97-3129d93a00e7", + "metadata": {}, + "source": [ + "#### Svalbard and Jan Mayen (`SJM`)\n", + "\n", + "Jan Mayen is uninhabited, but Svalbard has population and economic activity.\n", + "- GDP: Unfortunately, we will have to use the GDP per capita of Norway itself as we do not have a reliable GDP estimate of Svalbard and `SJM` is a part of Norway.\n", + "- Population: From the Statistikkbanken (Statistics Norway, link [here](https://www.ssb.no/en/statbank/table/07429)), select all half years (2009-2021)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d8a7ba3-19ab-48d2-88bd-e3c3d408f837", + "metadata": {}, + "outputs": [], + "source": [ + "## svalbard and jan mayen (no population for jan mayen)\n", + "h1 = [2085, 2052, 2017, 2115, 2158, 2100, 2185, 2152, 2145, 2214, 2258, 2428]\n", + "h2 = [2140, 2071, 2140, 2195, 2195, 2118, 2189, 2162, 2210, 2310, 2379, 2417]\n", + "sjm_pop_09_20 = np.round((np.array(h1) + np.array(h2)) / 2, 0) / 1000000\n", + "sjm_pop = (\n", + " (gdp_pop_df.loc[pd.IndexSlice[\"NOR\", 1950:], \"pop\"].copy() * np.nan)\n", + " .rename(\"nor_census_pop\")\n", + " .reset_index()\n", + ")\n", + "sjm_pop[\"ccode\"] = \"SJM\"\n", + "sjm_pop = sjm_pop.set_index([\"ccode\", \"year\"]).nor_census_pop\n", + "sjm_pop.loc[\"SJM\", list(range(2009, 2021))] = sjm_pop_09_20\n", + "\n", + "# merging all\n", + "gdp_pop_df = gdp_pop_df.join(sjm_pop, how=\"outer\")\n", + "gdp_pop_df.sort_index(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "34144b4b-a40c-450b-b77a-5b4fcfd5817d", + "metadata": {}, + "source": [ + "#### Saint Helena, Ascension and Tristan da Cunha (`SHN`)\n", + "\n", + "We have data from the [St. Helena Government](https://www.sainthelena.gov.sh/wp-content/uploads/2020/07/SEDP-EOY-Progress-Report-Final-160720.pdf) about the estimated GDP per capita (in 2019 prices, non-PPP) of 2018 and 2019. 
We will combine this later (by comparing ratios with the UK GDP) to get the approximate GDP per capita values in PPP.\n", + "- Note that this is *not* the average GDPpc of Saint Helena, Ascension and Tristan da Cunha but rather just Saint Helena (so we are using Saint Helena to proxy for the three areas, which are represented by the country-code `SHN`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27600ef8-3547-4478-be9e-0fcca7c7538a", + "metadata": {}, + "outputs": [], + "source": [ + "wdi_xrate = (\n", + " pd.read_parquet(sset.DIR_WB_WDI_RAW / \"wdi_xr.parquet\")\n", + " .loc[(\"GBR\", [2018, 2019]), \"xrate\"]\n", + " .values\n", + ")\n", + "shn = (\n", + " (gdp_pop_df.loc[pd.IndexSlice[\"GBR\", 1950:], \"rgdpna\"].copy() * np.nan)\n", + " .rename(\"shn_gov_gdppc\")\n", + " .reset_index()\n", + ")\n", + "shn[\"ccode\"] = \"SHN\"\n", + "shn = shn.set_index([\"ccode\", \"year\"]).shn_gov_gdppc\n", + "shn.loc[\"SHN\", [2018, 2019]] = np.array([8490, 8230]) * wdi_xrate\n", + "\n", + "# merging all\n", + "gdp_pop_df = gdp_pop_df.join(shn, how=\"outer\")\n", + "gdp_pop_df.sort_index(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "bad0c87a-bd6f-4b79-ae7e-a091125d96a4", + "metadata": {}, + "source": [ + "#### Saint Barthélemy (`BLM`)\n", + "\n", + "Saint Barthélemy's 2010 and 1999 (nominal) GDP per capita shown in the CEROM document: [link here](https://www.cerom-outremer.fr/guadeloupe/publications/etudes-cerom/estimation-du-pib-par-habitant-de-st-barthelemy.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "486ae6c9-0db2-4afb-83d0-60c6b2e3064d", + "metadata": {}, + "outputs": [], + "source": [ + "blm = (\n", + " (gdp_pop_df.loc[pd.IndexSlice[\"GBR\", 1950:], \"rgdpna\"].copy() * np.nan)\n", + " .rename(\"cerom_gdppc\")\n", + " .reset_index()\n", + ")\n", + "blm[\"ccode\"] = \"BLM\"\n", + "blm = blm.set_index([\"ccode\", \"year\"]).cerom_gdppc\n", + "wdi_xrate = (\n", + " pd.read_parquet(sset.DIR_WB_WDI_RAW / \"wdi_xr.parquet\")\n", + " .loc[(\"EMU\", [1999, 2010]), \"xrate\"]\n", + " .values\n", + ")\n", + "blm.loc[\"BLM\", [1999, 2010]] = np.array([26000, 35700]) * wdi_xrate\n", + "\n", + "# merging all\n", + "gdp_pop_df = gdp_pop_df.join(blm, how=\"outer\")\n", + "gdp_pop_df.sort_index(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "b013433d-6321-4c6e-9146-587445eb1baa", + "metadata": {}, + "source": [ + "#### United States Minor Outlying Islands (`UMI`)\n", + "\n", + "- Population: from the U.S. Census ([link here](https://www.census.gov/history/pdf/2000-minoroutlyingislands.pdf)), years 1980, 1990, 2000\n", + "- GDP: will use GDPpc from `MNP` (Northern Mariana Islands)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79322cb4-5716-40bc-a2bd-41a8887ede64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# US Minor Outlying Islands\n", + "umi_gdp = (\n", + " (gdp_pop_df.loc[pd.IndexSlice[\"USA\", 1950:], \"pop\"].copy() * np.nan)\n", + " .rename(\"us_census_pop\")\n", + " .reset_index()\n", + ")\n", + "umi_gdp[\"ccode\"] = \"UMI\"\n", + "umi_gdp = umi_gdp.set_index([\"ccode\", \"year\"]).us_census_pop\n", + "umi_gdp.loc[\"UMI\", [1980, 1990, 2000]] = np.array([1082, 193, 316]) / 1000000\n", + "\n", + "# merging all\n", + "gdp_pop_df = gdp_pop_df.join(umi_gdp, how=\"outer\")\n", + "gdp_pop_df.sort_index(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "d6f8058b-f7f7-4ff8-8ae0-51166566dbce", + "metadata": {}, + "source": [ + 
"### Uninhabited areas (no population and no economic activity)\n", + "\n", + "These are: the French Southern and Antarctic Lands (`ATF`), Bouvet Island (`BVT`), Clipperton Island (`CL-`), Heard and McDonald Islands (`HMD`), British Indian Ocean Territory (`IOT`), South Georgia and the South Sandwich Islands (`SGS`).\n", + "- `ATF`: According to the CIA World Factbook website (link [here](https://www.cia.gov/the-world-factbook/countries/french-southern-and-antarctic-lands/)), `ATF` does not have permanent population and therefore we will record this as having no population and no economic activity.\n", + "- `BVT`: uninhabited to protect nature reserve\n", + "- `CL-`: is an atoll with no permanent inhabitants since 1945.\n", + "- `HMD`: is an Australian external territory near Antarctica.\n", + "- `IOT`: is mostly composed of U.S.-U.K. military facilities with no permanent population.\n", + "- `SGS`: is uninhabited" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c6d7863-b1bb-4a54-bd3d-23f63f21c1c3", + "metadata": {}, + "outputs": [], + "source": [ + "# ATF template rows (intended: 0 population and 0 GDP); NOTE: the 0 is written to a stray `atf_pop` column, so `noecon_pop`/`noecon_rgdpna` stay NaN here\n", + "atf_pop = (\n", + " (gdp_pop_df.loc[pd.IndexSlice[\"FRA\", 1950:], \"pop\"].copy() * np.nan)\n", + " .rename(\"noecon_pop\")\n", + " .reset_index()\n", + ")\n", + "atf_pop[\"ccode\"], atf_pop[\"atf_pop\"] = \"ATF\", 0\n", + "atf_pop.set_index([\"ccode\", \"year\"], inplace=True)\n", + "atf_gdp = atf_pop.copy().rename(columns={\"noecon_pop\": \"noecon_rgdpna\"})\n", + "atf_pop, atf_gdp = atf_pop.noecon_pop, atf_gdp.noecon_rgdpna\n", + "\n", + "# merging all\n", + "gdp_pop_df = gdp_pop_df.join(atf_pop, how=\"outer\")\n", + "gdp_pop_df = gdp_pop_df.join(atf_gdp, how=\"outer\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c01b354f-8ca5-4038-8cad-bd6773beb13a", + "metadata": {}, + "outputs": [], + "source": [ + "# Attaching the other uninhabited areas' information\n", + "for i in np.setdiff1d(sset.UNINHABITED_ISOS, [\"ATF\"]):\n", 
+ " i_df = gdp_pop_df.loc[([\"ATF\"], slice(None)), :].reset_index()\n", + " i_df[\"ccode\"] = i\n", + " i_df.set_index([\"ccode\", \"year\"], inplace=True)\n", + " gdp_pop_df = pd.concat([gdp_pop_df, i_df], axis=0)\n", + "gdp_pop_df.sort_index(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "6e03a7c0-1784-4e5c-ac8a-2ff044a4080c", + "metadata": {}, + "source": [ + "### Exporting the intermediate result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e77e0021-692c-4153-ab55-a3bc436ec854", + "metadata": {}, + "outputs": [], + "source": [ + "# NE.GDI.FTOT.ZS: Gross fixed cap. formation rate in WB WDI; will be dealt with later\n", + "col_to_drop = np.intersect1d(gdp_pop_df.columns, [\"country\", \"NE.GDI.FTOT.ZS\"])\n", + "if len(col_to_drop) > 0:\n", + " gdp_pop_df.drop(col_to_drop, axis=1, inplace=True)\n", + "\n", + "gdp_pop_df.to_parquet(sset.DIR_YPK_RAW / \"gdp_gdppc_pop_raw_multiple_sources.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "974df6fe-3f7b-469a-8c99-67e5ce35b9d5", + "metadata": {}, + "source": [ + "## Historical population (1950-2019), creating a single sequence\n", + "\n", + "Before we go on further, note that when we refer to country-level population for those with overseas territories, it excludes the population of such territories (e.g., `USA` population noted below does not include `PRI` [Puerto Rico]). One big exception is that `FRA` (France) from PWT actually includes the following territories or \"overseas region\" into its population value calculation: `GLP`, `MYT`, `MTQ`, `GUF`, and `REU`. So we will actually use the `FRA+OV` designation for PWT variables and clean up along the way so that `FRA` only represents mainland France.\n", + "\n", + "### Re-reading in the raw-organized data\n", + "\n", + "If worried about memory issues (due to reasons outside of this notebook), restart the kernel, run the first and second cells, and run the following cells." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18ace68c-6493-4b15-8570-ce7b74b5cf0f", + "metadata": {}, + "outputs": [], + "source": [ + "gp_df = pd.read_parquet(sset.DIR_YPK_RAW / \"gdp_gdppc_pop_raw_multiple_sources.parquet\")\n", + "fraov = gp_df.loc[(\"FRA\", slice(None)), [\"pop\", \"cgdpo\", \"rgdpna\", \"rgdpo\"]]\n", + "fraov.reset_index(inplace=True)\n", + "fraov[\"ccode\"] = \"FRA+OV\"\n", + "gp_df = pd.concat([gp_df, fraov.set_index([\"ccode\", \"year\"])], axis=0)\n", + "\n", + "# just setting apart the population data\n", + "popraw_df = gp_df.loc[:, [x for x in gp_df.columns if \"pop\" in x]]" + ] + }, + { + "cell_type": "markdown", + "id": "43ad56e9-0fea-413e-a48d-34ef9331cf41", + "metadata": {}, + "source": [ + "### Cleaning for the uninhabited areas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1250231e-200e-485c-8050-6d3d3bcce349", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# `inh` for inhabited\n", + "pop_uninh = popraw_df.loc[sset.UNINHABITED_ISOS, :].sort_index()\n", + "pop_uninh[\"pop\"] = 0\n", + "pop_uninh[\"pop_source\"] = \"uninhabited\"\n", + "pop_uninh[\"pop_unit\"] = \"millions (of people)\"\n", + "pop_uninh = pop_uninh[[\"pop_unit\", \"pop_source\", \"pop\"]]\n", + "pop_inh = popraw_df.loc[\n", + " ~popraw_df.index.get_level_values(\"ccode\").isin(sset.UNINHABITED_ISOS), :\n", + "].sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "5f834e8e-eda8-46d4-bb7e-6d4b86ee9498", + "metadata": { + "tags": [] + }, + "source": [ + "### Cleaning for Serbia and Kosovo\n", + "\n", + "It has been noted that Kosovo's population was included in the Serbian population from UN (for all times) and that from PWT (before 1999). We do have the Kosovan population from World Bank (1960-2020) so we will subtract this from the Serbian population, and for 1950-1959, we will use the mean ratio between Kosovo and Serbia." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9db3eb7b-03f8-4473-884c-ce7f3ce094f4", + "metadata": {}, + "outputs": [], + "source": [ + "# 1960-2020\n", + "kosovo_pop = pop_inh.loc[\"KO-\", \"wb_pop\"]\n", + "kosovo_yrs = kosovo_pop.index.values\n", + "pop_inh.loc[(\"SRB\", kosovo_yrs), \"pop\"] = (\n", + " pop_inh.loc[(\"SRB\", kosovo_yrs), \"un_pop\"].values - kosovo_pop.values\n", + ")\n", + "srb_ko_pop = pop_inh.loc[[\"SRB\"], [\"pop\"]].sort_index()\n", + "ko_pop = srb_ko_pop.reset_index()\n", + "ko_pop[\"ccode\"], ko_pop[\"pop\"] = \"KO-\", np.nan\n", + "srb_ko_pop = pd.concat(\n", + " [srb_ko_pop, ko_pop.set_index([\"ccode\", \"year\"])], axis=0\n", + ").sort_index()\n", + "srb_ko_pop.loc[(\"KO-\", kosovo_yrs), \"pop\"] = kosovo_pop.values\n", + "\n", + "# taking care of the 1950-1959 (mean ratio of KO- to KO- + SRB)\n", + "ratio = (kosovo_pop.values / pop_inh.loc[(\"SRB\", kosovo_yrs), \"un_pop\"].values).mean()\n", + "pre_kosovo_yrs = np.sort(\n", + " np.setdiff1d(srb_ko_pop.index.get_level_values(\"year\").unique(), kosovo_yrs)\n", + ")\n", + "kosovo_pre_pop = pop_inh.loc[(\"SRB\", pre_kosovo_yrs), \"un_pop\"].values * ratio\n", + "srb_ko_pop.loc[(\"SRB\", pre_kosovo_yrs), \"pop\"] = (\n", + " pop_inh.loc[(\"SRB\", pre_kosovo_yrs), \"un_pop\"].values - kosovo_pre_pop\n", + ")\n", + "srb_ko_pop.loc[(\"KO-\", pre_kosovo_yrs), \"pop\"] = kosovo_pre_pop\n", + "srb_ko_pop[\"pop_unit\"] = \"millions (of people)\"\n", + "srb_ko_pop[\"pop_source\"] = \"UN_SRB_minus_WB_XKX\"\n", + "srb_ko_pop.loc[(\"KO-\", kosovo_yrs), \"pop_source\"] = \"WB\"\n", + "srb_ko_pop.loc[(\"SRB\", pre_kosovo_yrs), \"pop_source\"] = \"UN_SRB_ratio_WB_XKX\"\n", + "srb_ko_pop.loc[(\"KO-\", pre_kosovo_yrs), \"pop_source\"] = \"UN_SRB_ratio_WB_XKX\"\n", + "\n", + "## merging with the uninhabited to create the \"clean\" dataset\n", + "pop_cleaned = pd.concat([pop_uninh, srb_ko_pop], axis=0).sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": 
"26e376cf-b78a-4794-9474-81a984c00d1b", + "metadata": {}, + "source": [ + "### Cleaning for the U.S. and U.S. Territories\n", + "\n", + "It seems that there is some arbitrariness when it comes to including territories or excluding them, when calculating the population (even within the same dataset, it is unclear whether the population for some year includes or excludes the territories). We will take the PWT10.0 population as the U.S. population without territories included, and attach population numbers of the territories appropriately (from relevant, filled data sources). For the missing years we extrapolate using the known ratio between the respective territory and the U.S. mainland." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adc5a2d4-5e3e-4aa0-9fd8-72ed5184218e", + "metadata": {}, + "outputs": [], + "source": [ + "# USA; use PWT10.0 as the base\n", + "# since it's missing 2020, use the growth rate from un_pop\n", + "us_gr = pop_inh.loc[(\"USA\", 2020), \"un_pop\"] / pop_inh.loc[(\"USA\", 2019), \"un_pop\"]\n", + "us_2020 = pop_inh.loc[(\"USA\", 2019), \"pop\"] * us_gr\n", + "us_df = pop_inh.loc[([\"USA\"], range(1950, 2021)), [\"pop\"]].sort_index()\n", + "us_df.loc[(\"USA\", 2020), \"pop\"] = us_2020\n", + "us_df[\"pop_source\"] = \"PWT\"\n", + "us_df.loc[(\"USA\", 2020), \"pop_source\"] = \"PWT_ratio_UN\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "740eb452-3447-4924-97ba-62c45707c082", + "metadata": {}, + "outputs": [], + "source": [ + "# UMI\n", + "umi_yrs = (\n", + " pop_inh.loc[~pd.isnull(pop_inh.us_census_pop), :]\n", + " .index.get_level_values(\"year\")\n", + " .values\n", + ")\n", + "umi_df = pop_inh.loc[(\"USA\", range(1950, 2021)), [\"pop\"]].reset_index()\n", + "umi_df[\"ccode\"], umi_df[\"pop\"] = \"UMI\", np.nan\n", + "umi_df.set_index([\"ccode\", \"year\"], inplace=True)\n", + "umi_vals = pop_inh.loc[(\"UMI\", umi_yrs), \"us_census_pop\"].values\n", + "umi_df.loc[(\"UMI\", umi_yrs), 
\"pop\"] = umi_vals\n", + "umi_ratio = (\n", + " umi_df.loc[(\"UMI\", umi_yrs), \"pop\"].values\n", + " / pop_inh.loc[(\"USA\", umi_yrs), \"pop\"].values\n", + ").mean()\n", + "umi_interp_yrs = list(range(umi_yrs.min(), umi_yrs.max() + 1))\n", + "umi_df.loc[(\"UMI\", umi_interp_yrs), \"pop\"] = np.exp(\n", + " np.interp(umi_interp_yrs, umi_yrs, np.log(umi_vals))\n", + ")\n", + "umi_df[\"pop_source\"] = \"interp\"\n", + "umi_df.loc[(\"UMI\", umi_yrs), \"pop_source\"] = \"US_CENSUS\"\n", + "\n", + "# ratio with the US\n", + "umi_remaining = np.setdiff1d(list(range(1950, 2021)), umi_interp_yrs)\n", + "umi_df.loc[(\"UMI\", umi_remaining), \"pop\"] = (\n", + " us_df.loc[(\"USA\", umi_remaining), \"pop\"].values * umi_ratio\n", + ")\n", + "umi_df.loc[(\"UMI\", umi_remaining), \"pop_source\"] = \"US_CENSUS_UMI_ratio_PWT_USA\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41aabd4c-6749-4568-9297-ba0ef2e03407", + "metadata": {}, + "outputs": [], + "source": [ + "# PRI, VIR, GUM, ASM, MNP: use UN population\n", + "other_us_terr_df = (\n", + " pop_inh.loc[([\"PRI\", \"VIR\", \"GUM\", \"ASM\", \"MNP\"], range(1950, 2021)), [\"un_pop\"]]\n", + " .sort_index()\n", + " .rename(columns={\"un_pop\": \"pop\"})\n", + ")\n", + "other_us_terr_df[\"pop_source\"] = \"UN\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38b82030-f5cf-4cb0-98ea-d61f00883c25", + "metadata": {}, + "outputs": [], + "source": [ + "# merging the cleaned information\n", + "pop_cleaned = pd.concat([pop_cleaned, us_df, umi_df, other_us_terr_df]).sort_index()\n", + "pop_cleaned[\"pop_unit\"] = \"millions (of people)\"" + ] + }, + { + "cell_type": "markdown", + "id": "29e710f7-6d48-4a7f-9bb7-77c4c2d2e755", + "metadata": {}, + "source": [ + "### Cleaning for the French territories\n", + "\n", + "There are (excluding the no-population ones):\n", + "- Five overseas departments: `GUF`, `GLP`, `MTQ`, `MYT`, `REU`\n", + "- Overseas collectivities: `PYF`, `BLM`, `SPM`, 
`MAF`, `WLF`\n", + "- Other: `NCL`\n", + "\n", + "Including that for France, we will simply use UN population (since its French population doesn't include the departments)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5171db20-abdd-44ad-8ad6-12948f79b173", + "metadata": {}, + "outputs": [], + "source": [ + "# overseas departments\n", + "french = [\"GUF\", \"GLP\", \"MTQ\", \"MYT\", \"REU\", \"PYF\", \"BLM\", \"SPM\", \"MAF\", \"WLF\"]\n", + "french += [\"NCL\", \"FRA\"]\n", + "french_pop = pop_inh.loc[(french, range(1950, 2021)), [\"un_pop\"]].rename(\n", + " columns={\"un_pop\": \"pop\"}\n", + ")\n", + "french_pop[\"pop_source\"] = \"UN\"\n", + "french_pop[\"pop_unit\"] = \"millions (of people)\"\n", + "\n", + "pop_cleaned = pd.concat([french_pop, pop_cleaned], axis=0).sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "2db7eebe-ae62-47e7-bdd3-3a2b43a590fa", + "metadata": {}, + "source": [ + "### Cleaning for the British territories\n", + "\n", + "These are (excluding the no-population ones):\n", + "- Overseas Territories: `AIA`, `BMU`, `VGB`, `CYM`, `FLK`, `GIB`, `MSR`, `PCN`, `SHN`, `TCA`\n", + "- Crown dependencies: `GGY`, `JEY`, `IMN`\n", + "\n", + "1. For all Overseas Territories *except `PCN`*, we take from UN (and for `PCN`, from interpolated data using CIA WF and `PCN` Government data).\n", + "\n", + "2. For `GGY` and `JEY`, their merged information is in `GGY+JEY` (for `un_pop`). Guernsey Annual Electronic Report (link [here](https://gov.gg/CHttpHandler.ashx?id=123156&p=0#:~:text=At%20the%20end%20of%20March%202019%2C%20Guernsey's%20population%20was%2062%2C792.&text=There%20was%20a%20natural%20decrease,of%20459%20people%20(0.7%25).)) shows `GGY` (Guernsey)'s population from 2009 to 2019. Take the average ratio of `GGY` population to `GGY+JEY` between these years, and use this ratio to extrapolate for the missing years (of `GGY` and `JEY`, separately).\n", + "\n", + "3. For `GBR`, take from PWT10.0. 
For `IMN`, take from UN." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ece16155-3257-408d-993c-8fc2de3e3238", + "metadata": {}, + "outputs": [], + "source": [ + "# cleaning Pitcairn\n", + "pcn_pop = pop_inh.loc[[\"PCN\"], [\"pcn_pop\"]].rename(columns={\"pcn_pop\": \"pop\"})\n", + "pcn_pop[\"pop_source\"] = \"interp\"\n", + "pcn_pop.loc[(\"PCN\", 2017), \"pop_source\"] = \"PCN_GOV\"\n", + "pcn_pop.loc[(\"PCN\", 2020), \"pop_source\"] = \"PCN_GOV_interp_CIA\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee7957e2-ed5b-4a4d-83fe-39fc817e7948", + "metadata": {}, + "outputs": [], + "source": [ + "# Guernsey and Jersey\n", + "ggy_09_19 = [62274, 62431, 62915, 63085, 62732, 62341, 62234, 62208, 62106]\n", + "ggy_09_19 += [62333, 62792]\n", + "ggy_09_19 = np.array(ggy_09_19) / 1000000\n", + "yr0919 = list(range(2009, 2020))\n", + "\n", + "ggy_jey_df = pop_inh.loc[[\"GGY+JEY\"], [\"un_pop\"]].rename(columns={\"un_pop\": \"pop\"})\n", + "ggy_df = ggy_jey_df.reset_index()\n", + "ggy_df[\"ccode\"] = \"GGY\"\n", + "ggy_df[\"pop\"] = np.nan\n", + "ggy_df.loc[ggy_df.year.isin(yr0919), \"pop\"] = ggy_09_19\n", + "ggy_df.set_index([\"ccode\", \"year\"], inplace=True)\n", + "\n", + "ggy_ratio = (ggy_09_19 / ggy_jey_df.loc[(\"GGY+JEY\", yr0919), \"pop\"].values).mean()\n", + "not_0919 = np.setdiff1d(list(range(1950, 2021)), yr0919)\n", + "ggy_df.loc[(\"GGY\", not_0919), \"pop\"] = (\n", + " ggy_jey_df.loc[(\"GGY+JEY\", not_0919), \"pop\"].values * ggy_ratio\n", + ")\n", + "jey_df = ggy_jey_df.reset_index()\n", + "jey_df[\"ccode\"], jey_df[\"pop\"] = \"JEY\", ggy_jey_df[\"pop\"].values - ggy_df[\"pop\"].values\n", + "jey_df.set_index([\"ccode\", \"year\"], inplace=True)\n", + "\n", + "# sources\n", + "ggy_df[\"pop_source\"] = \"GGY_REPORT\"\n", + "ggy_df.loc[(\"GGY\", not_0919), \"pop_source\"] = \"UN_ratio_GGY_REPORT\"\n", + "jey_df[\"pop_source\"] = \"UN_ratio_GGY_REPORT\"\n", + "jey_df.loc[(\"JEY\", yr0919), \"pop_source\"] = 
\"UN_minus_GGY_REPORT\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8e28735-9b45-4dc6-b78b-388df71eeb80", + "metadata": {}, + "outputs": [], + "source": [ + "# other GBR territories and GBR\n", + "gbr_df = pop_inh.loc[([\"GBR\"], range(1950, 2021)), [\"pop\"]]\n", + "gbr_df.loc[(\"GBR\", 2020), \"pop\"] = (\n", + " gbr_df.loc[(\"GBR\", 2019), \"pop\"]\n", + " * pop_inh.loc[(\"GBR\", 2020), \"un_pop\"]\n", + " / pop_inh.loc[(\"GBR\", 2019), \"un_pop\"]\n", + ")\n", + "gbr_df[\"pop_source\"] = \"PWT\"\n", + "gbr_df.loc[(\"GBR\", 2020), \"pop_source\"] = \"PWT_ratio_UN\"\n", + "\n", + "other_gbr_terr_df = pop_inh.loc[\n", + " [\"AIA\", \"BMU\", \"VGB\", \"CYM\", \"FLK\", \"GIB\", \"MSR\", \"SHN\", \"TCA\", \"IMN\"], [\"un_pop\"]\n", + "].rename(columns={\"un_pop\": \"pop\"})\n", + "other_gbr_terr_df[\"pop_source\"] = \"UN\"\n", + "\n", + "# gathering all GBR-related territories\n", + "gbr_rel_df = pd.concat([pcn_pop, ggy_df, jey_df, gbr_df, other_gbr_terr_df], axis=0)\n", + "gbr_rel_df[\"pop_unit\"] = \"millions (of people)\"\n", + "\n", + "# merging\n", + "pop_cleaned = pd.concat([gbr_rel_df, pop_cleaned], axis=0).sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "286a5bf0-e9b2-4901-af6f-47392fb8b6bf", + "metadata": {}, + "source": [ + "### Cleaning for the Australian territories\n", + "\n", + "These are (excluding the no-population ones):\n", + "- External Territories: `CXR`, `CCK`, `NFK`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9f148b3-96df-4d76-9525-947761d61339", + "metadata": {}, + "outputs": [], + "source": [ + "# getting the AUS 2020 value in\n", + "pop_inh.loc[(\"AUS\", 2020), \"pop\"] = (\n", + " pop_inh.loc[(\"AUS\", 2020), \"un_pop\"]\n", + " / pop_inh.loc[(\"AUS\", 2019), \"un_pop\"]\n", + " * pop_inh.loc[(\"AUS\", 2019), \"pop\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43cf297a-a494-40a2-b2f5-fea110b7596f", + "metadata": {}, + "outputs": 
[], + "source": [ + "# christmas island\n", + "cxr_df = pop_inh.loc[[\"CXR\"], [\"aus_census_pop\"]].rename(\n", + " columns={\"aus_census_pop\": \"pop\"}\n", + ")\n", + "aus_cen_yrs = [2001, 2006, 2011, 2016]\n", + "cxr_df[\"pop_source\"] = \"interp\"\n", + "cxr_df.loc[(\"CXR\", aus_cen_yrs), \"pop_source\"] = \"AUS_CENSUS\"\n", + "cxr_interp_yrs = range(np.min(aus_cen_yrs), np.max(aus_cen_yrs) + 1)\n", + "cxr_df.loc[(\"CXR\", list(cxr_interp_yrs)), \"pop\"] = np.exp(\n", + " np.interp(\n", + " cxr_interp_yrs,\n", + " aus_cen_yrs,\n", + " np.log(cxr_df.loc[(\"CXR\", aus_cen_yrs), \"pop\"].values),\n", + " )\n", + ")\n", + "\n", + "# country-territory ratios for extrapolation\n", + "cxr_ratio = (\n", + " cxr_df.loc[(\"CXR\", aus_cen_yrs), \"pop\"].values\n", + " / pop_inh.loc[(\"AUS\", aus_cen_yrs), \"pop\"].values\n", + ").mean()\n", + "non_cen_yrs = np.setdiff1d(list(range(1950, 2021)), cxr_interp_yrs)\n", + "cxr_df.loc[(\"CXR\", non_cen_yrs), \"pop\"] = (\n", + " cxr_ratio * pop_inh.loc[(\"AUS\", non_cen_yrs), \"pop\"].values\n", + ")\n", + "cxr_df.loc[(\"CXR\", non_cen_yrs), \"pop_source\"] = \"PWT_AUS_ratio_AUS_CENSUS_CXR\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ac99c46-1989-4f8a-926b-a5f27cc9e56f", + "metadata": {}, + "outputs": [], + "source": [ + "# Cocos keeling island\n", + "cck_df = pop_inh.loc[[\"CCK\"], [\"aus_census_pop\"]].rename(\n", + " columns={\"aus_census_pop\": \"pop\"}\n", + ")\n", + "cck_df[\"pop_source\"] = \"interp\"\n", + "cck_df.loc[(\"CCK\", aus_cen_yrs), \"pop_source\"] = \"AUS_CENSUS\"\n", + "cck_df.loc[(\"CCK\", list(cxr_interp_yrs)), \"pop\"] = np.exp(\n", + " np.interp(\n", + " cxr_interp_yrs,\n", + " aus_cen_yrs,\n", + " np.log(cck_df.loc[(\"CCK\", aus_cen_yrs), \"pop\"].values),\n", + " )\n", + ")\n", + "\n", + "# country-territory ratios for extrapolation\n", + "cck_ratio = (\n", + " cck_df.loc[(\"CCK\", aus_cen_yrs), \"pop\"].values\n", + " / pop_inh.loc[(\"AUS\", aus_cen_yrs), 
\"pop\"].values\n", + ").mean()\n", + "cck_df.loc[(\"CCK\", non_cen_yrs), \"pop\"] = (\n", + " cck_ratio * pop_inh.loc[(\"AUS\", non_cen_yrs), \"pop\"].values\n", + ")\n", + "cck_df.loc[(\"CCK\", non_cen_yrs), \"pop_source\"] = \"PWT_AUS_ratio_AUS_CENSUS_CCK\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f448410-1618-43ac-a14f-41bb208a80e3", + "metadata": {}, + "outputs": [], + "source": [ + "## norfolk island\n", + "nfk_df = pop_inh.loc[[\"NFK\"], [\"aus_census_pop\"]].rename(\n", + " columns={\"aus_census_pop\": \"pop\"}\n", + ")\n", + "nfk_df[\"pop_source\"] = \"interp\"\n", + "nfk_df.loc[(\"NFK\", [2001, 2011, 2016]), \"pop_source\"] = \"AUS_CENSUS\"\n", + "nfk_df.loc[(\"NFK\", list(cxr_interp_yrs)), \"pop\"] = np.exp(\n", + " np.interp(\n", + " cxr_interp_yrs,\n", + " [2001, 2011, 2016],\n", + " np.log(nfk_df.loc[(\"NFK\", [2001, 2011, 2016]), \"pop\"].values),\n", + " )\n", + ")\n", + "\n", + "# country-territory ratios for extrapolation\n", + "nfk_ratio = (\n", + " nfk_df.loc[(\"NFK\", [2001, 2011, 2016]), \"pop\"].values\n", + " / pop_inh.loc[(\"AUS\", [2001, 2011, 2016]), \"pop\"].values\n", + ").mean()\n", + "nfk_df.loc[(\"NFK\", non_cen_yrs), \"pop\"] = (\n", + " nfk_ratio * pop_inh.loc[(\"AUS\", non_cen_yrs), \"pop\"].values\n", + ")\n", + "nfk_df.loc[(\"NFK\", non_cen_yrs), \"pop_source\"] = \"PWT_AUS_ratio_AUS_CENSUS_NFK\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3c08cad-fa36-48c2-b2a4-423d37c43fc7", + "metadata": {}, + "outputs": [], + "source": [ + "aus_df = pop_inh.loc[([\"AUS\"], range(1950, 2021)), [\"pop\"]].copy()\n", + "aus_df[\"pop_source\"] = \"PWT\"\n", + "aus_df.loc[(\"AUS\", 2020), \"pop_source\"] = \"PWT_ratio_UN\"\n", + "\n", + "pop_cleaned = pd.concat([aus_df, pop_cleaned, nfk_df, cck_df, cxr_df], axis=0)\n", + "pop_cleaned.sort_index(inplace=True)\n", + "pop_cleaned[\"pop_unit\"] = \"millions (of people)\"" + ] + }, + { + "cell_type": "markdown", + "id": 
"19feeb4a-2896-4dc2-b1d0-9c565ebdc8a2", + "metadata": {}, + "source": [ + "### Cleaning for the New Zealand territories\n", + "\n", + "These are (excluding the no-population ones):\n", + "- External Territories: `TKL`, `NIU`, `COK`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34953fdb-94d7-4970-a063-a9b5b053f3bb", + "metadata": {}, + "outputs": [], + "source": [ + "# New Zealand\n", + "nzl_df = pop_inh.loc[(\"NZL\", range(1950, 2021)), [\"pop\"]].sort_index()\n", + "nzl_df.loc[(\"NZL\", 2020), \"pop\"] = (\n", + " nzl_df.loc[(\"NZL\", 2019), \"pop\"]\n", + " * pop_inh.loc[(\"NZL\", 2020), \"un_pop\"]\n", + " / pop_inh.loc[(\"NZL\", 2019), \"un_pop\"]\n", + ")\n", + "nzl_df[\"pop_source\"] = \"PWT\"\n", + "nzl_df.loc[(\"NZL\", 2020), \"pop_source\"] = \"PWT_ratio_UN\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97fa25d5-59d0-4d62-b395-5f08cb5ce22b", + "metadata": {}, + "outputs": [], + "source": [ + "nzl_terr_df = pop_inh.loc[\n", + " ([\"NIU\", \"COK\", \"TKL\"], list(range(1950, 2021))), [\"un_pop\"]\n", + "].rename(columns={\"un_pop\": \"pop\"})\n", + "nzl_terr_df[\"pop_source\"] = \"UN\"\n", + "\n", + "pop_cleaned = pd.concat([pop_cleaned, nzl_df, nzl_terr_df], axis=0).sort_index()\n", + "pop_cleaned[\"pop_unit\"] = \"millions (of people)\"" + ] + }, + { + "cell_type": "markdown", + "id": "43d46cce-2446-4cce-9854-1fa6a276825a", + "metadata": {}, + "source": [ + "### Cleaning for the Danish territories\n", + "\n", + "These are (excluding the no-population ones):\n", + "- `GRL`, `FRO`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c05b4130-681b-4806-8e35-f2f32b884d63", + "metadata": {}, + "outputs": [], + "source": [ + "# Denmark\n", + "dnk_df = pop_inh.loc[(\"DNK\", range(1950, 2021)), [\"pop\"]].sort_index()\n", + "dnk_df.loc[(\"DNK\", 2020), \"pop\"] = (\n", + " dnk_df.loc[(\"DNK\", 2019), \"pop\"]\n", + " * pop_inh.loc[(\"DNK\", 2020), \"un_pop\"]\n", + " / pop_inh.loc[(\"DNK\", 
2019), \"un_pop\"]\n", + ")\n", + "dnk_df[\"pop_source\"] = \"PWT\"\n", + "dnk_df.loc[(\"DNK\", 2020), \"pop_source\"] = \"PWT_ratio_UN\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d425dbf9-f612-4ec9-8715-d11f6212ea43", + "metadata": {}, + "outputs": [], + "source": [ + "dnk_terr_df = pop_inh.loc[([\"GRL\", \"FRO\"], list(range(1950, 2021))), [\"un_pop\"]].rename(\n", + " columns={\"un_pop\": \"pop\"}\n", + ")\n", + "dnk_terr_df[\"pop_source\"] = \"UN\"\n", + "\n", + "pop_cleaned = pd.concat([pop_cleaned, dnk_df, dnk_terr_df], axis=0).sort_index()\n", + "pop_cleaned[\"pop_unit\"] = \"millions (of people)\"" + ] + }, + { + "cell_type": "markdown", + "id": "be427ad7-4508-4dd2-b87d-1f52ebfaca90", + "metadata": {}, + "source": [ + "### Cleaning for the Finnish territories\n", + "\n", + "These are (excluding the no-population ones):\n", + "- `ALA`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f008b05a-647c-4224-9b1a-0f2272acbfdc", + "metadata": {}, + "outputs": [], + "source": [ + "# Finland\n", + "fin_df = pop_inh.loc[(\"FIN\", range(1950, 2021)), [\"pop\"]].sort_index()\n", + "fin_df.loc[(\"FIN\", 2020), \"pop\"] = (\n", + " fin_df.loc[(\"FIN\", 2019), \"pop\"]\n", + " * pop_inh.loc[(\"FIN\", 2020), \"un_pop\"]\n", + " / pop_inh.loc[(\"FIN\", 2019), \"un_pop\"]\n", + ")\n", + "fin_df[\"pop_source\"] = \"PWT\"\n", + "fin_df.loc[(\"FIN\", 2020), \"pop_source\"] = \"PWT_ratio_UN\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de1796cc-280a-445e-a7d6-36417c4e878b", + "metadata": {}, + "outputs": [], + "source": [ + "ala_df = pop_inh.loc[([\"ALA\"], range(1950, 2021)), [\"ala_pop\"]].rename(\n", + " columns={\"ala_pop\": \"pop\"}\n", + ")\n", + "ala_df[\"pop_source\"] = \"ALA_STAT\"\n", + "ala_yrs = (\n", + " ala_df.loc[~pd.isnull(ala_df[\"pop\"]), :].index.get_level_values(\"year\").unique()\n", + ")\n", + "ala_noyrs = np.setdiff1d(list(range(1950, 2021)), ala_yrs)\n", + "ala_ratio = (\n", 
+ " ala_df.loc[(\"ALA\", ala_yrs), \"pop\"].values\n", + " / fin_df.loc[(\"FIN\", ala_yrs), \"pop\"].values\n", + ").mean()\n", + "ala_df.loc[(\"ALA\", ala_noyrs), \"pop\"] = (\n", + " ala_ratio * fin_df.loc[(\"FIN\", ala_noyrs), \"pop\"].values\n", + ")\n", + "ala_df.loc[(\"ALA\", ala_noyrs), \"pop_source\"] = \"PWT_FIN_ratio_ALA_STAT_ALA\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41c6cc1f-569b-4d6d-a701-ec8ec993529e", + "metadata": {}, + "outputs": [], + "source": [ + "pop_cleaned = pd.concat([pop_cleaned, fin_df, ala_df], axis=0).sort_index()\n", + "pop_cleaned[\"pop_unit\"] = \"millions (of people)\"" + ] + }, + { + "cell_type": "markdown", + "id": "6f4fbf24-9461-46dc-b151-ca0045e39a17", + "metadata": {}, + "source": [ + "### Cleaning for the Norwegian territories\n", + "\n", + "These are (excluding the no-population ones):\n", + "- `SJM`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79358d66-c505-4d37-bcc5-7596a1d4db1f", + "metadata": {}, + "outputs": [], + "source": [ + "# Norway\n", + "nor_df = pop_inh.loc[(\"NOR\", range(1950, 2021)), [\"pop\"]].sort_index()\n", + "nor_df.loc[(\"NOR\", 2020), \"pop\"] = (\n", + " nor_df.loc[(\"NOR\", 2019), \"pop\"]\n", + " * pop_inh.loc[(\"NOR\", 2020), \"un_pop\"]\n", + " / pop_inh.loc[(\"NOR\", 2019), \"un_pop\"]\n", + ")\n", + "nor_df[\"pop_source\"] = \"PWT\"\n", + "nor_df.loc[(\"NOR\", 2020), \"pop_source\"] = \"PWT_ratio_UN\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b922925-09be-4a6b-8c5b-490825e8af78", + "metadata": {}, + "outputs": [], + "source": [ + "sjm_df = pop_inh.loc[([\"SJM\"], range(1950, 2021)), [\"nor_census_pop\"]].rename(\n", + " columns={\"nor_census_pop\": \"pop\"}\n", + ")\n", + "sjm_df[\"pop_source\"] = \"NOR_CENSUS\"\n", + "sjm_yrs = (\n", + " sjm_df.loc[~pd.isnull(sjm_df[\"pop\"]), :].index.get_level_values(\"year\").unique()\n", + ")\n", + "sjm_noyrs = np.setdiff1d(list(range(1950, 2021)), sjm_yrs)\n", +
"sjm_ratio = (\n", + " sjm_df.loc[(\"SJM\", sjm_yrs), \"pop\"].values\n", + " / nor_df.loc[(\"NOR\", sjm_yrs), \"pop\"].values\n", + ").mean()\n", + "sjm_df.loc[(\"SJM\", sjm_noyrs), \"pop\"] = (\n", + " sjm_ratio * nor_df.loc[(\"NOR\", sjm_noyrs), \"pop\"].values\n", + ")\n", + "sjm_df.loc[(\"SJM\", sjm_noyrs), \"pop_source\"] = \"PWT_NOR_ratio_NOR_CENSUS_SJM\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc842239-3d64-4e93-97d9-afbeefca9928", + "metadata": {}, + "outputs": [], + "source": [ + "pop_cleaned = pd.concat([pop_cleaned, nor_df, sjm_df], axis=0).sort_index()\n", + "pop_cleaned[\"pop_unit\"] = \"millions (of people)\"" + ] + }, + { + "cell_type": "markdown", + "id": "152dee79-0f69-4b44-aeda-352e2892dffd", + "metadata": {}, + "source": [ + "### For the rest, filling in missing population info from UN populations data to PWT data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c71cc0d3-7827-47af-aedc-06ff811f85a0", + "metadata": {}, + "outputs": [], + "source": [ + "# creating a xr.Dataset with the rest\n", + "pop_inh_remain = (\n", + " pop_inh.loc[(slice(None), range(1950, 2021)), :]\n", + " .index.get_level_values(\"ccode\")\n", + " .unique()\n", + ")\n", + "pop_inh_remain = np.setdiff1d(\n", + " pop_inh_remain, pop_cleaned.index.get_level_values(\"ccode\").unique()\n", + ")\n", + "pop_inh_remain = np.setdiff1d(pop_inh_remain, [\"CHI\", \"FRA+OV\", \"XKX\", \"GGY+JEY\"])\n", + "pop_inh_rem = pop_inh.loc[(pop_inh_remain, range(1950, 2021)), [\"pop\", \"un_pop\"]].copy()\n", + "pop_inh_rem_ds = xr.Dataset.from_dataframe(pop_inh_rem)\n", + "\n", + "# smooth_fill\n", + "pop_inh_rem_filled = ypk_fn.smooth_fill(\n", + " pop_inh_rem_ds[\"pop\"], pop_inh_rem_ds[\"un_pop\"], time_dim=\"year\", other_dim=\"ccode\"\n", + ").to_dataframe()\n", + "\n", + "pop_inh_rem_filled = pop_inh_rem_filled.merge(\n", + " pop_inh_rem[[\"pop\"]].rename(columns={\"pop\": \"pop_source\"}),\n", + " how=\"left\",\n", + " 
left_index=True,\n", + " right_index=True,\n", + ")\n", + "pop_inh_rem_filled.loc[~pd.isnull(pop_inh_rem_filled.pop_source), \"pop_source\"] = \"PWT\"\n", + "pop_inh_rem_filled.loc[pd.isnull(pop_inh_rem_filled.pop_source), \"pop_source\"] = \"UN\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7eaa3eb-a03c-4de1-b980-9f4787809d00", + "metadata": {}, + "outputs": [], + "source": [ + "# concatenating with the rest\n", + "pop_cleaned = pd.concat([pop_cleaned, pop_inh_rem_filled], axis=0).sort_index()\n", + "pop_cleaned[\"pop_unit\"] = \"millions (of people)\"" + ] + }, + { + "cell_type": "markdown", + "id": "3a08fd44-c5a8-4dd5-93d2-c67dd47a6e9c", + "metadata": {}, + "source": [ + "## Historical GDPpc (1950-2019), constant 2017 PPP USD, creating a single sequence\n", + "\n", + "We will first work with the constant 2017 PPP USD (i.e., ones with the `rgdpna` in their names), then create current PPP 2017 USD versions appropriately (in accordance with `cgdpo`). We will work with **per capita** version for extrapolation. 
But before working on the entire set of countries, I will first set aside France (due to the territory-inclusion problem mentioned above), work with the other countries (including the 5 French overseas territories), and return to the French case to subtract the sum of GDP values from the said territories to acquire the mainland French GDP (and GDPpc).\n", + "\n", + "### Setting aside the no-population cases" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a540d28c-89ce-4aae-a0ef-2d724bc04b75", + "metadata": {}, + "outputs": [], + "source": [ + "y_uninh = gp_df.loc[(sset.UNINHABITED_ISOS, range(1950, 2021)), [\"rgdpna\"]].rename(\n", + " columns={\"rgdpna\": \"rgdpna_pc\"}\n", + ")\n", + "y_uninh[\"rgdpna_pc\"] = 0\n", + "y_uninh[\"gdp_source\"] = \"uninhabited\"" + ] + }, + { + "cell_type": "markdown", + "id": "fc2393b4-f1a9-4dec-b882-5ae8dab6e837", + "metadata": {}, + "source": [ + "### Using World Bank information to fill in for PWT\n", + "\n", + "Both `rgdpna_pc` and `wb_rgdpna_pc` are in ones of constant 2017 PPP USD; we will use the `smooth_fill` function in `ypk_settings.py`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffbee04b-e8a8-47d0-8822-8dcb74f5f75b", + "metadata": {}, + "outputs": [], + "source": [ + "gp_df[\"rgdpna_pc\"] = gp_df[\"rgdpna\"] / gp_df[\"pop\"]\n", + "y_xr_pwt_wb = xr.Dataset.from_dataframe(\n", + " gp_df.loc[\n", + " ~gp_df.index.get_level_values(\"ccode\").isin(sset.UNINHABITED_ISOS),\n", + " [\"rgdpna_pc\", \"wb_rgdpna_pc\"],\n", + " ]\n", + ")\n", + "\n", + "# filling in rgdpna_pc using wb_rgdpna_pc\n", + "y_pwt_wb = ypk_fn.smooth_fill(\n", + " y_xr_pwt_wb[\"rgdpna_pc\"],\n", + " y_xr_pwt_wb[\"wb_rgdpna_pc\"],\n", + " time_dim=\"year\",\n", + " other_dim=\"ccode\",\n", + ").to_dataframe()\n", + "y_pwt_clean = y_pwt_wb.loc[\n", + " y_pwt_wb.index.get_level_values(\"year\").isin(range(1950, 2021)), :\n", + "].copy()\n", + "\n", + "# filling in the source information\n", + "y_pwt_clean = y_pwt_clean.merge(\n", + " gp_df[[\"rgdpna_pc\", \"wb_rgdpna_pc\"]].rename(columns={\"rgdpna_pc\": \"gdp_source\"}),\n", + " left_index=True,\n", + " right_index=True,\n", + " how=\"left\",\n", + ")\n", + "y_pwt_clean.loc[~pd.isnull(y_pwt_clean.gdp_source), \"gdp_source\"] = \"PWT\"\n", + "y_pwt_clean.loc[\n", + " pd.isnull(y_pwt_clean.gdp_source) & ~pd.isnull(y_pwt_clean.wb_rgdpna_pc),\n", + " \"gdp_source\",\n", + "] = \"WB\"\n", + "y_pwt_clean.drop([\"wb_rgdpna_pc\"], axis=1, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "bd2dbdaa-8363-4286-8ba8-616fe9d87038", + "metadata": {}, + "source": [ + "### Using IMF information to fill in for PWT\n", + "\n", + "According to this [link](https://www.imf.org/external/pubs/ft/weo/faq.htm#q4d) talking about the PPP used by IMF, it is said that IMF WEO's PPP rates are calculated based on ICP's 2017 report. Since the variable description says that this is constant PPP, we will interpret `imf_rgdpna_pc` as being in ones of constant 2017 PPP USD." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ad37852-86d3-4fb6-8423-aba3815dba03", + "metadata": {}, + "outputs": [], + "source": [ + "y_pwt_clean = y_pwt_clean.merge(\n", + " gp_df[[\"imf_rgdpna_pc\"]], how=\"left\", left_index=True, right_index=True\n", + ")\n", + "y_xr_pwt_imf = xr.Dataset.from_dataframe(y_pwt_clean[[\"imf_rgdpna_pc\", \"rgdpna_pc\"]])\n", + "\n", + "# smooth_fill\n", + "y_pwt_imf = ypk_fn.smooth_fill(\n", + " y_xr_pwt_imf[\"rgdpna_pc\"],\n", + " y_xr_pwt_imf[\"imf_rgdpna_pc\"],\n", + " time_dim=\"year\",\n", + " other_dim=\"ccode\",\n", + ").to_dataframe()\n", + "\n", + "# merging\n", + "prev_dict = {\"rgdpna_pc\": \"rgdpna_pc_prev\"}\n", + "y_pwt_clean = y_pwt_clean.rename(columns=prev_dict).merge(\n", + " y_pwt_imf[[\"rgdpna_pc\"]],\n", + " left_index=True,\n", + " right_index=True,\n", + " how=\"left\",\n", + ")\n", + "y_pwt_clean.loc[\n", + " ~pd.isnull(y_pwt_clean.rgdpna_pc) & pd.isnull(y_pwt_clean.rgdpna_pc_prev),\n", + " \"gdp_source\",\n", + "] = \"IMF\"\n", + "y_pwt_clean.drop([\"rgdpna_pc_prev\", \"imf_rgdpna_pc\"], inplace=True, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "a6307aa9-f8b2-4c80-b0a3-80ed15468f3b", + "metadata": {}, + "source": [ + "### Using MPD (Maddison) information to fill in for PWT\n", + "\n", + "MPD data in `mpd_rgdpna_pc` are in **constant 2011 PPP USD**, so we will have to use the PPP conversion rates to change them into constant 2017 PPP USD before using the function `smooth_fill`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d5ddf07-56b9-49f9-ad95-0bee7dd464f5", + "metadata": {}, + "outputs": [], + "source": [ + "# from constant 2011 PPP USD to constant 2017 PPP USD\n", + "ppp_to_2017 = ypk_fn.ppp_conversion_specific_year(2017, extrap_sim=True)\n", + "ppp_11_to_17 = ppp_to_2017.loc[(slice(None), 2011), :].reset_index()\n", + "ppp_11_to_17.set_index([\"ccode\"], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "be0bc766-59c2-4401-8b04-230934726056", + "metadata": {}, + "source": [ + "We will use the \"neutral assumption\" (i.e., using the conversion rate of 1) and not use the WB conversion rates (and **only use PWT conversion rates**) due to there being a big discrepancy between the two measures." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfe04dd1-d613-4940-8b7d-bb0d3a247d0f", + "metadata": {}, + "outputs": [], + "source": [ + "# applying the conversion\n", + "y_pwt_clean = y_pwt_clean.merge(\n", + " gp_df[[\"mpd_rgdpna_pc\"]].merge(\n", + " ppp_11_to_17[[\"conv\"]], left_index=True, right_index=True, how=\"left\"\n", + " ),\n", + " left_index=True,\n", + " right_index=True,\n", + " how=\"left\",\n", + ")\n", + "y_pwt_clean.loc[pd.isnull(y_pwt_clean.conv), \"conv\"] = 1\n", + "y_pwt_clean[\"mpd_rgdpna_pc\"] *= y_pwt_clean[\"conv\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70b803df-f787-4282-af14-00fa6aa4d936", + "metadata": {}, + "outputs": [], + "source": [ + "# smooth_fill\n", + "y_xr_pwt_mpd = xr.Dataset.from_dataframe(y_pwt_clean[[\"mpd_rgdpna_pc\", \"rgdpna_pc\"]])\n", + "y_pwt_mpd = ypk_fn.smooth_fill(\n", + " y_xr_pwt_mpd[\"rgdpna_pc\"],\n", + " y_xr_pwt_mpd[\"mpd_rgdpna_pc\"],\n", + " time_dim=\"year\",\n", + " other_dim=\"ccode\",\n", + ").to_dataframe()\n", + "\n", + "# merging\n", + "y_pwt_clean = y_pwt_clean.rename(columns=prev_dict).merge(\n", + " y_pwt_mpd[[\"rgdpna_pc\"]],\n", + " left_index=True,\n", + "
right_index=True,\n", + " how=\"left\",\n", + ")\n", + "y_pwt_clean.loc[\n", + " ~pd.isnull(y_pwt_clean.rgdpna_pc) & pd.isnull(y_pwt_clean.rgdpna_pc_prev),\n", + " \"gdp_source\",\n", + "] = \"MPD\"\n", + "y_pwt_clean.drop([\"rgdpna_pc_prev\", \"mpd_rgdpna_pc\", \"conv\"], inplace=True, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "6ae0dd96-d834-4bcb-be56-76718fd0276c", + "metadata": {}, + "source": [ + "### Using OECD regional data to fill in for PWT\n", + "\n", + "OECD information is in constant 2015 PPP USD, so we will change accordingly before using `smooth_fill`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa576943-47cd-4389-ac90-a87a66633331", + "metadata": {}, + "outputs": [], + "source": [ + "# from constant 2015 PPP USD to constant 2017 PPP USD\n", + "ppp_15_to_17 = ppp_to_2017.loc[(slice(None), 2015), :].reset_index()\n", + "ppp_15_to_17.set_index([\"ccode\"], inplace=True)\n", + "\n", + "# applying the conversion\n", + "y_pwt_clean = y_pwt_clean.merge(\n", + " gp_df[[\"oecd_rgdpna\"]].merge(\n", + " ppp_15_to_17[[\"conv\"]], left_index=True, right_index=True, how=\"left\"\n", + " ),\n", + " left_index=True,\n", + " right_index=True,\n", + " how=\"left\",\n", + ")\n", + "y_pwt_clean = y_pwt_clean.merge(\n", + " pop_cleaned[[\"pop\"]], left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "y_pwt_clean.loc[pd.isnull(y_pwt_clean.conv), \"conv\"] = 1\n", + "y_pwt_clean[\"oecd_rgdpna_pc\"] = (\n", + " y_pwt_clean[\"oecd_rgdpna\"] / y_pwt_clean[\"pop\"] * y_pwt_clean[\"conv\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9d4578c-5768-41b9-9c31-ba3ad022904f", + "metadata": {}, + "outputs": [], + "source": [ + "# smooth_fill\n", + "y_xr_pwt_oecd = xr.Dataset.from_dataframe(y_pwt_clean[[\"oecd_rgdpna_pc\", \"rgdpna_pc\"]])\n", + "y_pwt_oecd = ypk_fn.smooth_fill(\n", + " y_xr_pwt_oecd[\"rgdpna_pc\"],\n", + " y_xr_pwt_oecd[\"oecd_rgdpna_pc\"],\n", + " time_dim=\"year\",\n", + " 
other_dim=\"ccode\",\n", + ").to_dataframe()\n", + "\n", + "# merging\n", + "y_pwt_clean = y_pwt_clean.rename(columns=prev_dict).merge(\n", + " y_pwt_oecd[[\"rgdpna_pc\"]],\n", + " left_index=True,\n", + " right_index=True,\n", + " how=\"left\",\n", + ")\n", + "y_pwt_clean.loc[\n", + " ~pd.isnull(y_pwt_clean.rgdpna_pc) & pd.isnull(y_pwt_clean.rgdpna_pc_prev),\n", + " \"gdp_source\",\n", + "] = \"OECD\"\n", + "y_pwt_clean.drop(\n", + " [\"rgdpna_pc_prev\", \"oecd_rgdpna_pc\", \"oecd_rgdpna\", \"conv\"], inplace=True, axis=1\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e9939f3a-586a-48ae-b314-c6fa54d47f91", + "metadata": {}, + "source": [ + "### Using CIA information to fill in for PWT\n", + "\n", + "CIA information (`cia_rgdpna`) is in constant 2017 PPP USD, so we will use this as is." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbb4e74b-0e07-4c55-9e74-801215545b04", + "metadata": {}, + "outputs": [], + "source": [ + "# CIA rgdpna\n", + "y_pwt_clean = y_pwt_clean.merge(\n", + " gp_df[[\"cia_rgdpna\"]], left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "y_pwt_clean[\"cia_rgdpna\"] /= y_pwt_clean[\"pop\"]\n", + "y_pwt_clean.rename(columns={\"cia_rgdpna\": \"cia_rgdpna_pc\"}, inplace=True)\n", + "\n", + "# interpolating\n", + "cia_y_ccode = (\n", + " y_pwt_clean.loc[~pd.isnull(y_pwt_clean.cia_rgdpna_pc), :]\n", + " .index.get_level_values(\"ccode\")\n", + " .unique()\n", + ")\n", + "cc_dfs = []\n", + "for cc in tqdm(cia_y_ccode):\n", + " cc_df = y_pwt_clean.loc[cc, [\"cia_rgdpna_pc\"]]\n", + " cc_yrs = (\n", + " cc_df.loc[~pd.isnull(cc_df.cia_rgdpna_pc), :]\n", + " .index.get_level_values(\"year\")\n", + " .unique()\n", + " )\n", + " cc_filled = range(cc_yrs.min(), cc_yrs.max() + 1)\n", + " cc_filled_vals = np.exp(\n", + " np.interp(cc_filled, cc_yrs, np.log(cc_df.loc[cc_yrs, \"cia_rgdpna_pc\"].values))\n", + " )\n", + " cc_dfs.append(\n", + " pd.DataFrame(\n", + " data={\n", + " \"ccode\": [cc] * 
len(cc_filled),\n", + " \"year\": cc_filled,\n", + " \"cia_rgdpna_pc_interp\": cc_filled_vals,\n", + " }\n", + " )\n", + " )\n", + "cc_dfs = pd.concat(cc_dfs, axis=0).set_index([\"ccode\", \"year\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b376750f-faaf-46eb-a28a-0b97862f1574", + "metadata": {}, + "outputs": [], + "source": [ + "# smooth_fill\n", + "y_pwt_clean = y_pwt_clean.merge(cc_dfs, left_index=True, right_index=True, how=\"outer\")\n", + "y_xr_pwt_cia = xr.Dataset.from_dataframe(\n", + " y_pwt_clean[[\"cia_rgdpna_pc_interp\", \"rgdpna_pc\"]]\n", + ")\n", + "y_pwt_cia = ypk_fn.smooth_fill(\n", + " y_xr_pwt_cia[\"rgdpna_pc\"],\n", + " y_xr_pwt_cia[\"cia_rgdpna_pc_interp\"],\n", + " time_dim=\"year\",\n", + " other_dim=\"ccode\",\n", + ").to_dataframe()\n", + "\n", + "# merging\n", + "y_pwt_clean = y_pwt_clean.rename(columns=prev_dict).merge(\n", + " y_pwt_cia[[\"rgdpna_pc\"]],\n", + " left_index=True,\n", + " right_index=True,\n", + " how=\"left\",\n", + ")\n", + "y_pwt_clean.loc[\n", + " ~pd.isnull(y_pwt_clean.cia_rgdpna_pc) & pd.isnull(y_pwt_clean.rgdpna_pc_prev),\n", + " \"gdp_source\",\n", + "] = \"CIA\"\n", + "y_pwt_clean.loc[\n", + " ~pd.isnull(y_pwt_clean.cia_rgdpna_pc_interp)\n", + " & pd.isnull(y_pwt_clean.rgdpna_pc_prev)\n", + " & (y_pwt_clean.gdp_source != \"CIA\"),\n", + " \"gdp_source\",\n", + "] = \"CIA_interp\"\n", + "\n", + "y_pwt_clean.drop(\n", + " [\"rgdpna_pc_prev\", \"cia_rgdpna_pc\", \"cia_rgdpna_pc_interp\"], inplace=True, axis=1\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1ba71df6-0c9c-4940-a2ae-3ad96cbe7671", + "metadata": {}, + "source": [ + "### Aland Statistics (for `ALA`)\n", + "\n", + "We will extrapolate for the missing years as well, by creating ratios with Finland." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b08daef9-ddf3-446d-bb44-37f87e34daa7", + "metadata": {}, + "outputs": [], + "source": [ + "ala_cgdpo_pc = gp_df.loc[~pd.isnull(gp_df.ala_cgdpo_pc), [\"ala_cgdpo_pc\"]]\n", + "ala_yrs = ala_cgdpo_pc.index.get_level_values(\"year\").unique()\n", + "ala_ppp_val = ppp_to_2017.loc[(\"ALA\", ala_yrs), \"conv\"].values\n", + "y_pwt_clean.loc[(\"ALA\", ala_yrs), \"rgdpna_pc\"] = (\n", + " ala_ppp_val * ala_cgdpo_pc.ala_cgdpo_pc.values\n", + ")\n", + "y_pwt_clean.loc[(\"ALA\", ala_yrs), \"gdp_source\"] = \"ALAND_STAT\"\n", + "\n", + "# ratio wrt Finland\n", + "ala_fin_ratio = (\n", + " y_pwt_clean.loc[(\"ALA\", ala_yrs), \"rgdpna_pc\"].values\n", + " / y_pwt_clean.loc[(\"FIN\", ala_yrs), \"rgdpna_pc\"].values\n", + ").mean()\n", + "ala_noyrs = np.setdiff1d(list(range(1950, 2021)), ala_yrs)\n", + "y_pwt_clean.loc[(\"ALA\", ala_noyrs), \"rgdpna_pc\"] = (\n", + " ala_fin_ratio * y_pwt_clean.loc[(\"FIN\", ala_noyrs), \"rgdpna_pc\"].values\n", + ")\n", + "y_pwt_clean.loc[(\"ALA\", ala_noyrs), \"gdp_source\"] = \"FIN_extrap\"" + ] + }, + { + "cell_type": "markdown", + "id": "8a88edfa-edb6-4cb7-a3af-1c6fd35c24e0", + "metadata": {}, + "source": [ + "### Treadgold Reports on Norfolk Island (for `NFK`)\n", + "\n", + "We will extrapolate for the missing years as well, by creating ratios with Australia (`AUS`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27f97b8b-d7c5-4fa1-bd83-a655d8ab998f", + "metadata": {}, + "outputs": [], + "source": [ + "tgold_rgdpna_pc = gp_df.loc[\n", + " ~pd.isnull(gp_df.treadgold_rgdpna_pc), [\"treadgold_rgdpna_pc\"]\n", + "]\n", + "tgold_yrs = tgold_rgdpna_pc.index.get_level_values(\"year\").unique()\n", + "tgold_interp_yrs = list(range(tgold_yrs.min(), tgold_yrs.max() + 1))\n", + "tgold_interp_vals = np.exp(\n", + " np.interp(\n", + " tgold_interp_yrs, tgold_yrs, np.log(tgold_rgdpna_pc.treadgold_rgdpna_pc.values)\n", + " )\n", + ")\n", + "y_pwt_clean.loc[(\"NFK\", tgold_interp_yrs), \"rgdpna_pc\"] = tgold_interp_vals\n", + "y_pwt_clean.loc[(\"NFK\", tgold_interp_yrs), \"gdp_source\"] = \"Treadgold_ratio_PWT\"\n", + "\n", + "# calculating the ratios separately\n", + "nfk_aus_ratio_early = (\n", + " y_pwt_clean.loc[(\"NFK\", tgold_interp_yrs[0:2]), \"rgdpna_pc\"].values\n", + " / y_pwt_clean.loc[(\"AUS\", tgold_interp_yrs[0:2]), \"rgdpna_pc\"].values\n", + ").mean()\n", + "\n", + "nfk_aus_ratio_later = (\n", + " y_pwt_clean.loc[(\"NFK\", tgold_interp_yrs[-2:]), \"rgdpna_pc\"].values\n", + " / y_pwt_clean.loc[(\"AUS\", tgold_interp_yrs[-2:]), \"rgdpna_pc\"].values\n", + ").mean()\n", + "\n", + "y_pwt_clean.loc[(\"NFK\", 1950), \"rgdpna_pc\"] = (\n", + " nfk_aus_ratio_early * y_pwt_clean.loc[(\"AUS\", 1950), \"rgdpna_pc\"]\n", + ")\n", + "nfk_noyrs = np.setdiff1d(list(range(1951, 2021)), tgold_interp_yrs)\n", + "y_pwt_clean.loc[(\"NFK\", nfk_noyrs), \"rgdpna_pc\"] = (\n", + " nfk_aus_ratio_later * y_pwt_clean.loc[(\"AUS\", nfk_noyrs), \"rgdpna_pc\"].values\n", + ")\n", + "y_pwt_clean.loc[(\"NFK\", [1950] + list(nfk_noyrs)), \"gdp_source\"] = \"AUS_extrap\"" + ] + }, + { + "cell_type": "markdown", + "id": "24f26479-9598-423a-843f-628f5df94be7", + "metadata": {}, + "source": [ + "### Statistics Netherlands information on `BES`\n", + "\n", + "Nominal GDP for Bonaire, Saba, and Eustatius are shown separately (for the years 
2012 and 2017, on this [Statistics Netherlands file](https://www.cbs.nl/en-gb/publication/2020/41/trends-in-the-caribbean-netherlands-2020)). We will add them together, interpolate, and create ratios with the Netherlands information to fill in the missing pieces as well." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad2eba56-9286-4c3e-8f92-a59ba9a70e7d", + "metadata": {}, + "outputs": [], + "source": [ + "# numbers correspond to GDP for Bonaire, Saba, and Eustatius (in mil. of nominal USD)\n", + "bes_yrs = list(range(2012, 2018))\n", + "bes_2012, bes_2017 = 372 + 42 + 101, 428 + 47 + 108\n", + "bes_pc_12_17 = (\n", + " np.array([bes_2012, bes_2017])\n", + " / pop_cleaned.loc[(\"BES\", [2012, 2017]), \"pop\"].values\n", + ")\n", + "\n", + "bes_ratio = bes_pc_12_17 / gp_df.loc[(\"NLD\", [2012, 2017]), \"wb_gdp_nom_pc\"].values\n", + "bes_avg_ratio = bes_ratio.mean()\n", + "bes_ppp_pc_12_17 = (\n", + " bes_ratio * y_pwt_clean.loc[(\"NLD\", [2012, 2017]), \"rgdpna_pc\"].values\n", + ")\n", + "bes_ppp_pc_12_17_interp = np.exp(\n", + " np.interp(bes_yrs, [2012, 2017], np.log(bes_ppp_pc_12_17))\n", + ")\n", + "bes_noyrs = np.setdiff1d(list(range(1950, 2021)), bes_yrs)\n", + "bes_ppp_pc_not12_17 = (\n", + " y_pwt_clean.loc[(\"NLD\", bes_noyrs), \"rgdpna_pc\"].values * bes_avg_ratio\n", + ")\n", + "y_pwt_clean.loc[(\"BES\", bes_yrs), \"rgdpna_pc\"] = bes_ppp_pc_12_17_interp\n", + "y_pwt_clean.loc[(\"BES\", bes_yrs), \"gdp_source\"] = \"NLD_STAT\"\n", + "y_pwt_clean.loc[(\"BES\", bes_noyrs), \"rgdpna_pc\"] = bes_ppp_pc_not12_17\n", + "y_pwt_clean.loc[(\"BES\", bes_noyrs), \"gdp_source\"] = \"NLD_extrap\"" + ] + }, + { + "cell_type": "markdown", + "id": "609b4e3a-c764-467d-bc33-06004eb8e55f", + "metadata": {}, + "source": [ + "### Information from CEROM, Saint Barthelemy (for `BLM`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8ceaa0c-0553-4f36-add2-39338623428d", + "metadata": {}, + "outputs": [], + "source": [ + "# 
getting the nominal values\n", + "cerom = gp_df.loc[\"BLM\", [\"cerom_gdppc\"]]\n", + "cerom_yrs = cerom.loc[~pd.isnull(cerom.cerom_gdppc), :].index.values\n", + "cerom_vals = cerom.loc[~pd.isnull(cerom.cerom_gdppc), \"cerom_gdppc\"].values\n", + "\n", + "# ratio with the nominal, french gdppc\n", + "blm_ratio = cerom_vals / gp_df.loc[(\"FRA\", cerom_yrs), \"wb_gdp_nom_pc\"].values\n", + "blm_y_vals = blm_ratio * y_pwt_clean.loc[(\"FRA\", cerom_yrs), \"rgdpna_pc\"].values\n", + "blm_interp_yrs = list(range(cerom_yrs.min(), cerom_yrs.max() + 1))\n", + "y_pwt_clean.loc[(\"BLM\", blm_interp_yrs), \"rgdpna_pc\"] = np.exp(\n", + " np.interp(blm_interp_yrs, cerom_yrs, np.log(blm_y_vals))\n", + ")\n", + "\n", + "# rest of the years\n", + "blm_noyrs = np.setdiff1d(list(range(1950, 2021)), blm_interp_yrs)\n", + "y_pwt_clean.loc[(\"BLM\", blm_noyrs), \"rgdpna_pc\"] = (\n", + " y_pwt_clean.loc[(\"FRA\", blm_noyrs), \"rgdpna_pc\"].values * blm_ratio.mean()\n", + ")\n", + "\n", + "y_pwt_clean.loc[(\"BLM\", blm_noyrs), \"gdp_source\"] = \"FRA_extrap\"\n", + "y_pwt_clean.loc[(\"BLM\", blm_interp_yrs), \"gdp_source\"] = \"CEROM\"" + ] + }, + { + "cell_type": "markdown", + "id": "40e83fd6-f861-445d-b68a-7d782d6e46b1", + "metadata": {}, + "source": [ + "### Australian Census from information (for `CCK` and `CXR`)\n", + "\n", + "Again, we only have nominal GDP of these areas in the year 2010 as nominal terms. Therefore, we will again rely on the scale with the relevant sovereignty (being `AUS`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e5dc8b3-87d9-46ff-b0dd-96a45474319a", + "metadata": {}, + "outputs": [], + "source": [ + "# ratios in 2010\n", + "aus_2010 = gp_df.loc[(\"AUS\", 2010), \"wb_gdp_nom_pc\"]\n", + "cxr_2010 = (\n", + " gp_df.loc[(\"CXR\", 2010), \"aus_census_nom_gdp\"]\n", + " / pop_cleaned.loc[(\"CXR\", 2010), \"pop\"]\n", + ")\n", + "cck_2010 = (\n", + " gp_df.loc[(\"CCK\", 2010), \"aus_census_nom_gdp\"]\n", + " / pop_cleaned.loc[(\"CCK\", 2010), \"pop\"]\n", + ")\n", + "cxr_aus_r, cck_aus_r = cxr_2010 / aus_2010, cck_2010 / aus_2010\n", + "\n", + "# apply the ratios\n", + "cxr_vals = y_pwt_clean.loc[(\"AUS\", range(1950, 2021)), \"rgdpna_pc\"].values * cxr_aus_r\n", + "cck_vals = y_pwt_clean.loc[(\"AUS\", range(1950, 2021)), \"rgdpna_pc\"].values * cck_aus_r\n", + "y_pwt_clean.loc[(\"CXR\", list(range(1950, 2021))), \"rgdpna_pc\"] = cxr_vals\n", + "y_pwt_clean.loc[(\"CCK\", list(range(1950, 2021))), \"rgdpna_pc\"] = cck_vals\n", + "\n", + "# sources\n", + "y_pwt_clean.loc[(\"CXR\", list(range(1950, 2021))), \"gdp_source\"] = \"AUS_extrap\"\n", + "y_pwt_clean.loc[(\"CCK\", list(range(1950, 2021))), \"gdp_source\"] = \"AUS_extrap\"\n", + "y_pwt_clean.loc[(\"CCK\", 2010), \"gdp_source\"] = \"AUS_parliament\"\n", + "y_pwt_clean.loc[(\"CXR\", 2010), \"gdp_source\"] = \"AUS_parliament\"" + ] + }, + { + "cell_type": "markdown", + "id": "b4345fd2-cb31-438d-a818-74fc5bf8ff58", + "metadata": {}, + "source": [ + "### Svalbard and Jan Mayen (`SJM`)\n", + "\n", + "As mentioned above, we do not have reliable metrics for GDPpc or GDP for `SJM`; so we will copy Norway's information." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5abaecc-9a4d-457e-b422-8616589b3b7f", + "metadata": {}, + "outputs": [], + "source": [ + "y_pwt_clean.loc[(\"SJM\", list(range(1950, 2021))), \"rgdpna_pc\"] = y_pwt_clean.loc[\n", + " (\"NOR\", list(range(1950, 2021))), \"rgdpna_pc\"\n", + "].values\n", + "y_pwt_clean.loc[\"SJM\", \"gdp_source\"] = \"NOR_copy\"" + ] + }, + { + "cell_type": "markdown", + "id": "006668cf-515a-447e-8d44-674e73f50042", + "metadata": {}, + "source": [ + "### United States Minor Outlying Islands (`UMI`)\n", + "\n", + "We will use the most similar U.S. territory, which are the Northern Mariana Islands (`MNP`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b225f80a-5f87-4087-a0b4-91dbcacdf812", + "metadata": {}, + "outputs": [], + "source": [ + "mnp = y_pwt_clean.loc[\"MNP\", :].copy()\n", + "mnpyrs = np.unique(mnp.loc[~pd.isnull(mnp.rgdpna_pc), :].index.get_level_values(\"year\"))\n", + "y_pwt_clean.loc[(\"UMI\", mnpyrs), \"rgdpna_pc\"] = y_pwt_clean.loc[\n", + " (\"MNP\", mnpyrs), \"rgdpna_pc\"\n", + "].values\n", + "y_pwt_clean.loc[(\"UMI\", mnpyrs), \"gdp_source\"] = \"MNP_copy\"" + ] + }, + { + "cell_type": "markdown", + "id": "611c5d6d-207a-45be-9c67-593ea7dbdbbb", + "metadata": {}, + "source": [ + "### Pitcairn Island (`PCN`): take the ratio with `GBR`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5b0e79a-2632-4c69-b1c3-85d0b14b81be", + "metadata": {}, + "outputs": [], + "source": [ + "pcn_ratio = (\n", + " gp_df.loc[(\"PCN\", 2006), \"pcn_nom_gdp\"]\n", + " / y_pwt_clean.loc[(\"PCN\", 2006), \"pop\"]\n", + " / gp_df.loc[(\"GBR\", 2006), \"wb_gdp_nom_pc\"]\n", + ")\n", + "pcn_rgdpna_pc = (\n", + " pcn_ratio * y_pwt_clean.loc[(\"GBR\", list(range(1950, 2021))), \"rgdpna_pc\"].values\n", + ")\n", + "y_pwt_clean.loc[(\"PCN\", list(range(1950, 2021))), \"rgdpna_pc\"] = pcn_rgdpna_pc\n", + "y_pwt_clean.loc[\"PCN\", \"gdp_source\"] = \"GBR_ratio\"" + ] + }, + { + 
"cell_type": "markdown", + "id": "d4c6190b-ca86-4b13-b66b-1cf700de4077", + "metadata": {}, + "source": [ + "### Cleaning up for other territories (current and former)\n", + "\n", + "In general, we assign territory-sovereign ratios rather than ratios between similar countries because of the argument in [**Bertram (World Development, 2003)**](https://www.sciencedirect.com/science/article/abs/pii/S0305750X03002134) that territories (or island economies in the paper, to be more specific) seem to converge to trend with their metropolitan patrons more so than with similar territory (island) economies. Based on this idea, we will fill in the missing years' GDPpc data using the territory-sovereign GDPpc ratio averaged over the nearest (at most) 5 known years (e.g., if years before 2000 are missing, use the ratio averaged over 2000-2004 to extrapolate for the missing years).\n", + "\n", + "The following territory-sovereignty relationships are considered (excluding the ones dealt with above, but including *previous* territory-sovereignty relationships mentioned in Bertram (2003)).
def fill_using_simple_ratio(
    terr_code,
    sov_code,
    df,
    col="rgdpna_pc",
    source_col="gdp_source",
    all_yrs=None,
    n_avg=5,
):
    """Extrapolate leading/trailing gaps of a territory from its sovereign.

    Missing values of `col` for `terr_code` at the beginning and/or end of the
    sample are filled with ``avg_ratio * sovereign value``, where ``avg_ratio`` is
    the mean territory-to-sovereign ratio over the (at most `n_avg`) known years
    closest to the gap. This is based on the observation in Bertram (World
    Development, 2003) that GDPpc of (island) territories tend to converge to
    sovereign GDPpc.

    Gaps in the interior of the series are neither filled nor labelled, and a
    territory without a single known value is returned untouched (there is no
    ratio to anchor on); both are left for later imputation steps.

    Parameters
    ----------
    terr_code : str
        country code for country/region that belongs to or was associated with the
        country/region represented by `sov_code`
    sov_code : str
        country code for country/region that had or still has legal control over the
        country/region represented by `terr_code`
    df : pandas.DataFrame
        that contains the country/region-level information; should have the columns
        `col` and `source_col`, and be multi-indexed by `ccode` and `year` (denoting
        country code and year). It is modified in place.
    col : str
        column containing data that needs imputation/extrapolation (by using the ratio
        of values from the two countries)
    source_col : str
        column containing source data; rows that are extrapolated here are set to
        ``"<sov_code>_extrap"``
    all_yrs : iterable of int, optional
        years that make up the full sample; defaults to 1950-2020. Every year in it
        that is not missing for `terr_code` is treated as known, so `df` is expected
        to hold a row for each of these years for both countries.
    n_avg : int
        maximum number of known years used to average the ratio (default 5)

    Returns
    -------
    df : pandas.DataFrame
        the same object that was passed in, with the newly extrapolated values

    Raises
    ------
    ValueError
        if `n_avg` is smaller than 1

    """
    if n_avg < 1:
        raise ValueError("n_avg must be at least 1")
    if all_yrs is None:
        all_yrs = range(1950, 2021)
    all_yrs = np.sort(np.asarray(list(all_yrs)))

    # years for which the territory has no value in `col`
    in_terr = df.index.get_level_values("ccode") == terr_code
    msng = np.sort(
        df.index[in_terr & pd.isnull(df[col]).values]
        .get_level_values("year")
        .unique()
        .values
    )
    if msng.size == 0:
        return df

    known = np.setdiff1d(all_yrs, msng)  # sorted, unique
    if known.size == 0:
        # nothing to build a ratio from; previously this crashed on `filled[-1]`
        return df

    def _extrapolate(ref_years, target_years):
        """Scale sovereign values in `target_years` by the mean ratio over `ref_years`."""
        ref_years, target_years = list(ref_years), list(target_years)
        if not target_years:
            return
        avg_ratio = (
            df.loc[(terr_code, ref_years), col].values
            / df.loc[(sov_code, ref_years), col].values
        ).mean()
        df.loc[(terr_code, target_years), col] = (
            avg_ratio * df.loc[(sov_code, target_years), col].values
        )
        # label only the rows that were actually extrapolated
        df.loc[(terr_code, target_years), source_col] = "{}_extrap".format(sov_code)

    # trailing gap: use the last known years
    if all_yrs[-1] in msng:
        _extrapolate(known[-n_avg:], msng[msng > known[-1]])

    # leading gap: use the first known years
    if all_yrs[0] in msng:
        _extrapolate(known[:n_avg], msng[msng < known[0]])

    return df
\"GUF\",\n", + " \"GLP\",\n", + " \"MTQ\",\n", + " \"MYT\",\n", + " \"REU\",\n", + " \"SPM\",\n", + " \"MAF\",\n", + " \"BLM\",\n", + " \"PYF\",\n", + " \"WLF\",\n", + " \"NCL\",\n", + "]\n", + "former_terr_fra = [\"COM\"]\n", + "for i in former_terr_fra + current_terr_fra:\n", + " y_pwt_clean = fill_using_simple_ratio(i, \"FRA\", y_pwt_clean)" + ] + }, + { + "cell_type": "markdown", + "id": "2980b4e3-8d34-4a6f-a60a-461076286148", + "metadata": {}, + "source": [ + "#### `NLD` Territories (current)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0ef855c-84f9-4204-86d4-a0aeee03e127", + "metadata": {}, + "outputs": [], + "source": [ + "current_terr_nld = [\"SXM\", \"ABW\", \"BES\", \"CUW\"]\n", + "for i in current_terr_nld:\n", + " y_pwt_clean = fill_using_simple_ratio(i, \"NLD\", y_pwt_clean)" + ] + }, + { + "cell_type": "markdown", + "id": "7176add7-9255-44c8-9368-48eddbabd5b1", + "metadata": {}, + "source": [ + "#### `DNK` Territories (current)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb397fca-3822-49a7-9f48-9a3cab267cf9", + "metadata": {}, + "outputs": [], + "source": [ + "former_terr_prt = [\"GRL\", \"FRO\"]\n", + "for i in former_terr_prt:\n", + " y_pwt_clean = fill_using_simple_ratio(i, \"DNK\", y_pwt_clean)" + ] + }, + { + "cell_type": "markdown", + "id": "ebaf28e0-9208-45d1-bb67-b4e222955a78", + "metadata": {}, + "source": [ + "#### `NZL` Territories (current)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecbcd811-456a-4ede-9016-ec6f03e5f8aa", + "metadata": {}, + "outputs": [], + "source": [ + "current_terr_nzl = [\"NIU\", \"WSM\", \"COK\", \"TKL\"]\n", + "for i in current_terr_nzl:\n", + " y_pwt_clean = fill_using_simple_ratio(i, \"NZL\", y_pwt_clean)" + ] + }, + { + "cell_type": "markdown", + "id": "e9886239-c34c-47cd-9abb-1117cc289d0f", + "metadata": {}, + "source": [ + "#### `USA` Territories (former and current)" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "4cde65fe-66fd-4a4e-85fc-20cb104f7e42", + "metadata": {}, + "outputs": [], + "source": [ + "current_terr_usa = [\"VIR\", \"GUM\", \"UMI\", \"ASM\"]\n", + "former_terr_usa = [\"MHL\", \"FSM\", \"PLW\"]\n", + "for i in former_terr_usa + current_terr_usa:\n", + " y_pwt_clean = fill_using_simple_ratio(i, \"USA\", y_pwt_clean)" + ] + }, + { + "cell_type": "markdown", + "id": "c6299b5d-f060-4e1d-acb5-1a4ad27eb0a2", + "metadata": {}, + "source": [ + "#### `AUS` Territories (former)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21238a76-dbc0-4f4a-ade7-644b6d2539c8", + "metadata": {}, + "outputs": [], + "source": [ + "y_pwt_clean = fill_using_simple_ratio(\"NRU\", \"AUS\", y_pwt_clean)" + ] + }, + { + "cell_type": "markdown", + "id": "190387db-c884-458a-9871-fe91185b388b", + "metadata": {}, + "source": [ + "### City-states, city-territories, or microstates\n", + "\n", + "Will use a similar tactic as above. We match it as follows:\n", + "- `AND`: `FRA`\n", + "- `MCO`: `FRA`\n", + "- `MAC`: `HKG`\n", + "- `VAT`: `ITA`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40fb5aca-b224-434b-b3f3-0e49e39acc03", + "metadata": {}, + "outputs": [], + "source": [ + "micro_to_extrap = [\"AND\", \"MCO\", \"MAC\", \"VAT\"]\n", + "micro_from_extrap = [\"FRA\", \"FRA\", \"HKG\", \"ITA\"]\n", + "for j, i in enumerate(micro_to_extrap):\n", + " y_pwt_clean = fill_using_simple_ratio(i, micro_from_extrap[j], y_pwt_clean)" + ] + }, + { + "cell_type": "markdown", + "id": "cfbf35cf-fe66-4110-8047-50f1bd4554e5", + "metadata": {}, + "source": [ + "### Extrapolating based on similar trends\n", + "\n", + "For the remaining countries, we will detect similar-trending countries and use their average trends to fill in the missing pieces. Note that I will not be using trends from those countries whose information have already been extrapolated using others' trends (e.g., using territory-sovereignty relationship)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29a3b92b-47a3-4d2e-9998-baef9409722f", + "metadata": {}, + "outputs": [], + "source": [ + "## detecting which to extrapolate and which not to\n", + "to_extrap = []\n", + "from_extrap = []\n", + "for i in y_pwt_clean.index.get_level_values(\"ccode\").unique():\n", + " i_sum = pd.isnull(y_pwt_clean.loc[i, \"rgdpna_pc\"].values).sum()\n", + " if i_sum > 0:\n", + " if i not in [\"GGY+JEY\", \"CHI\", \"FRA+OV\"]:\n", + " to_extrap.append(i)\n", + " continue\n", + "\n", + " i_source = np.unique(y_pwt_clean.loc[i, \"gdp_source\"].values)\n", + " i_no_extraped = True\n", + " for sour in i_source:\n", + " if (sour == \"GBR_ratio\") or (\"extrap\" in sour) or (\"copy\" in sour):\n", + " i_no_extraped = False\n", + " break\n", + "\n", + " if i_no_extraped:\n", + " from_extrap.append(i)\n", + "\n", + "exclude_extrap = np.setdiff1d(\n", + " y_pwt_clean.index.get_level_values(\"ccode\").unique(), to_extrap + from_extrap\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7826af8-dd49-4188-900d-1a5a30f12714", + "metadata": {}, + "outputs": [], + "source": [ + "## extrapolation process\n", + "gdppc_extrap = ypk_fn.extrap_using_closest(\n", + " to_extrap,\n", + " ypk_fn.organize_ver_to_hor(\n", + " y_pwt_clean.copy(),\n", + " \"rgdpna_pc\",\n", + " \"year\",\n", + " \"ccode\",\n", + " range(1950, 2021),\n", + " ),\n", + " begin_end=[1950, 2020],\n", + " exclude_these=list(exclude_extrap) + [\"GGY+JEY\", \"CHI\", \"FRA+OV\"],\n", + ")\n", + "gdppc_extrap = ypk_fn.organize_hor_to_ver(\n", + " gdppc_extrap,\n", + " \"ccode\",\n", + " None,\n", + " \"rgdpna_pc_extrap\",\n", + " yrs=range(1950, 2021),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f015af1c-8e0b-4e75-9366-0a4f15a1baf5", + "metadata": {}, + "outputs": [], + "source": [ + "## merging this back to the original dataframe\n", + "y_pwt_clean = y_pwt_clean.merge(\n", + " gdppc_extrap, 
how=\"left\", left_index=True, right_index=True\n", + ")\n", + "y_pwt_clean.loc[pd.isnull(y_pwt_clean.gdp_source), \"gdp_source\"] = y_pwt_clean.loc[\n", + " pd.isnull(y_pwt_clean.gdp_source), \"msng_fill\"\n", + "].values\n", + "y_pwt_clean.loc[pd.isnull(y_pwt_clean.rgdpna_pc), \"rgdpna_pc\"] = y_pwt_clean.loc[\n", + " pd.isnull(y_pwt_clean.rgdpna_pc), \"rgdpna_pc_extrap\"\n", + "].values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34186be3-64ec-4d26-bf0b-10e9ce2d0a26", + "metadata": {}, + "outputs": [], + "source": [ + "## finalizing the rgdpna_pc series\n", + "y_clean = (\n", + " y_pwt_clean.loc[\n", + " ~y_pwt_clean.index.get_level_values(\"ccode\").isin([\"FRA+OV\", \"GGY+JEY\", \"CHI\"]),\n", + " :,\n", + " ]\n", + " .sort_index()\n", + " .drop([\"rgdpna_pc_extrap\", \"msng_fill\"], axis=1)\n", + ")\n", + "y_clean = pd.concat([y_clean, y_uninh])[[\"rgdpna_pc\", \"gdp_source\"]].sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "8fd1def3-6e60-4875-9586-004e57e28621", + "metadata": {}, + "source": [ + "### Cleaning up for Mainland France\n", + "\n", + "Currently, the values that have been recorded in `rgdpna_pc` for `FRA` are in terms of overall French population (including the 5 overseas departments). Therefore, in order to keep things consistent, we will 1) multiply the overall French population and the overseas departments' populations to get `rgdpna` values, 2) subtract the five overseas departments' `rgdpna` values from overall French `rgdpna`, and 3) divide by the mainland French population to get the mainland-specific `rgdpna_pc` values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f472b53-b065-428f-b0c0-47afc44a5b10", + "metadata": {}, + "outputs": [], + "source": [ + "fra_terr = [\"GUF\", \"GLP\", \"MTQ\", \"MYT\", \"REU\"]\n", + "fra_terr_dfs = y_clean.loc[[\"FRA\"] + fra_terr, :].copy()\n", + "fra_terr_dfs = fra_terr_dfs.merge(\n", + " gp_df.loc[[\"FRA\"], [\"pop\"]], left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "fra_terr_dfs.loc[(\"FRA\", 2020), \"pop\"] = (\n", + " fra_terr_dfs.loc[(\"FRA\", 2019), \"pop\"]\n", + " * pop_cleaned.loc[(\"FRA\", 2020), \"pop\"]\n", + " / pop_cleaned.loc[(\"FRA\", 2019), \"pop\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1346a4e-4eb0-4bd6-ab68-463799a5b412", + "metadata": {}, + "outputs": [], + "source": [ + "fra_terr_dfs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63ef82ee-53ea-4264-a324-dccd6d7133ec", + "metadata": {}, + "outputs": [], + "source": [ + "fra_terr = [\"GUF\", \"GLP\", \"MTQ\", \"MYT\", \"REU\"]\n", + "fra_terr_dfs = y_clean.loc[[\"FRA\"] + fra_terr, :].copy()\n", + "fra_terr_dfs = fra_terr_dfs.merge(\n", + " gp_df.loc[[\"FRA\"], [\"pop\"]], left_index=True, right_index=True, how=\"left\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "749082fa-c1e7-4ff7-b962-d36c74d3a752", + "metadata": {}, + "outputs": [], + "source": [ + "# cleaning up the mainland france\n", + "fra_terr = [\"GUF\", \"GLP\", \"MTQ\", \"MYT\", \"REU\"]\n", + "fra_terr_dfs = y_clean.loc[[\"FRA\"] + fra_terr, :].copy()\n", + "fra_terr_dfs = fra_terr_dfs.merge(\n", + " gp_df.loc[[\"FRA\"], [\"pop\"]], left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "fra_terr_dfs.loc[(\"FRA\", 2020), \"pop\"] = (\n", + " fra_terr_dfs.loc[(\"FRA\", 2019), \"pop\"]\n", + " * pop_cleaned.loc[(\"FRA\", 2020), \"pop\"]\n", + " / pop_cleaned.loc[(\"FRA\", 2019), \"pop\"]\n", + ")\n", + "yrs_tgt = list(range(1950, 2021))\n", + "for i in 
fra_terr:\n", + " fra_terr_dfs.loc[(i, yrs_tgt), \"pop\"] = pop_cleaned.loc[(i, yrs_tgt), \"pop\"].values\n", + "\n", + "fra_terr_dfs[\"rgdpna\"] = fra_terr_dfs[\"rgdpna_pc\"] * fra_terr_dfs[\"pop\"]\n", + "fra_terr_dfs.loc[(\"FRA\", yrs_tgt), \"rgdpna\"] = fra_terr_dfs.loc[\n", + " (\"FRA\", yrs_tgt), \"rgdpna\"\n", + "].values - (\n", + " fra_terr_dfs.loc[fra_terr, [\"rgdpna\"]]\n", + " .reset_index()\n", + " .groupby([\"year\"])\n", + " .sum()\n", + " .rgdpna.values\n", + ")\n", + "\n", + "fra_terr_dfs.loc[(\"FRA\", yrs_tgt), \"pop\"] = pop_cleaned.loc[\n", + " (\"FRA\", yrs_tgt), \"pop\"\n", + "].values\n", + "fra_terr_dfs.loc[(\"FRA\", yrs_tgt), \"rgdpna_pc\"] = (\n", + " fra_terr_dfs.loc[(\"FRA\", yrs_tgt), \"rgdpna\"].values\n", + " / fra_terr_dfs.loc[(\"FRA\", yrs_tgt), \"pop\"].values\n", + ")\n", + "\n", + "# re-attaching with the cleaned GDPpc dataset\n", + "y_clean.loc[(\"FRA\", yrs_tgt), \"rgdpna_pc\"] = fra_terr_dfs.loc[\n", + " (\"FRA\", yrs_tgt), \"rgdpna_pc\"\n", + "].values" + ] + }, + { + "cell_type": "markdown", + "id": "d35137a1-1291-49da-bafe-50b2e357fe8f", + "metadata": {}, + "source": [ + "### Creating GDP (`rgdpna`) values\n", + "\n", + "This will be simpler to execute, by appending the cleaned population dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "935e1f94-d84b-4d49-998a-406411bbbee0", + "metadata": {}, + "outputs": [], + "source": [ + "y_clean = y_clean.merge(pop_cleaned, left_index=True, right_index=True, how=\"left\")\n", + "y_clean[\"gdp_unit\"] = \"millions (PPP USD)\"\n", + "y_clean[\"gdppc_unit\"] = \"ones (PPP USD)\"\n", + "y_clean[\"rgdpna\"] = y_clean[\"rgdpna_pc\"] * y_clean[\"pop\"]" + ] + }, + { + "cell_type": "markdown", + "id": "6bf4e643-5b3d-4689-910e-347675415cca", + "metadata": {}, + "source": [ + "## Filling in the missing values for the `cgdpo` (current PPP 2017 USD) series\n", + "\n", + "### Transforming the `rgdpna_pc` series to `cgdpo_pc` equivalents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "faeb6e83-bede-4409-ac23-442d90c5e3cd", + "metadata": {}, + "outputs": [], + "source": [ + "y_clean = y_clean.merge(\n", + " ppp_to_2017[[\"conv\"]],\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + ")\n", + "## neutral assumption\n", + "y_clean.loc[pd.isnull(y_clean.conv), \"conv\"] = 1\n", + "\n", + "## copying the 2019 conversion to 2020 conversion\n", + "y_clean.loc[(slice(None), 2020), \"conv\"] = y_clean.loc[\n", + " (slice(None), 2019), \"conv\"\n", + "].values\n", + "y_clean[\"cgdpo_pc_equiv\"] = y_clean[\"rgdpna_pc\"] / y_clean[\"conv\"]" + ] + }, + { + "cell_type": "markdown", + "id": "ea9c6aa3-adf5-40d3-91a5-141de81210e9", + "metadata": {}, + "source": [ + "### Attaching the actual `cgdpo` values from PWT, and creating `cgdpo_pc`\n", + "\n", + "In doing so, we will again try to clean up for the issue with French mainland." 
# Attach the raw PWT `cgdpo` and population columns under distinct names
# (`cgdpo_pwt`, `pop_pwt`) so they do not collide with the cleaned `pop` column.
col_dict = {"cgdpo": "cgdpo_pwt", "pop": "pop_pwt"}
y_clean = y_clean.merge(
    gp_df[["cgdpo", "pop"]].rename(columns=col_dict),
    left_index=True,
    right_index=True,
    how="left",
)

# Briefly separating out the French territories, and dealing with their numbers first

## gathering the French overseas department values
y_clean_fra_terr = y_clean.loc[fra_terr, :].copy()
# `cgdpo_pc_equiv` is per capita; it must be scaled by the cleaned population to
# obtain totals before being subtracted from the French total `cgdpo_pwt`
# (same logic as the `rgdpna` clean-up for mainland France).
y_clean_fra_terr["cgdpo"] = (
    y_clean_fra_terr["cgdpo_pc_equiv"] * y_clean_fra_terr["pop"]
)
# sum only the `cgdpo` column by year, then align explicitly to `yrs_tgt`
# (raises if a target year is absent instead of silently misaligning)
fra_terr_cgdpo = (
    y_clean_fra_terr.groupby(level="year")["cgdpo"].sum().loc[yrs_tgt].values
)

## subtracting this from the `cgdpo_pwt` values in `y_clean` (for FRA)
y_clean.loc[("FRA", yrs_tgt), "cgdpo_pwt"] = (
    y_clean.loc[("FRA", yrs_tgt), "cgdpo_pwt"].values - fra_terr_cgdpo
)

# Creating `cgdpo_pc` in PWT version (`cgdpo_pc_pwt`) but with the cleaned population
y_clean["cgdpo_pc_pwt"] = y_clean["cgdpo_pwt"] / y_clean["pop"]

# ### Smooth-filling the missing values
"290003a7-164e-4375-a7d1-c0d20b96184b", + "metadata": {}, + "outputs": [], + "source": [ + "## creating a xr.Dataset with the cgdpo_pc variables\n", + "cgdpo_clean_up = xr.Dataset.from_dataframe(y_clean[[\"cgdpo_pc_pwt\", \"cgdpo_pc_equiv\"]])\n", + "\n", + "## smooth_fill\n", + "cgdpo_clean_up = ypk_fn.smooth_fill(\n", + " cgdpo_clean_up[\"cgdpo_pc_pwt\"],\n", + " cgdpo_clean_up[\"cgdpo_pc_equiv\"],\n", + " time_dim=\"year\",\n", + " other_dim=\"ccode\",\n", + ").to_dataframe()\n", + "\n", + "y_clean = y_clean.merge(\n", + " cgdpo_clean_up.rename(columns={\"cgdpo_pc_pwt\": \"cgdpo_pc\"}),\n", + " left_index=True,\n", + " right_index=True,\n", + " how=\"left\",\n", + ")\n", + "\n", + "## creating cgdpo values\n", + "y_clean[\"cgdpo\"] = y_clean[\"cgdpo_pc\"] * y_clean[\"pop\"]" + ] + }, + { + "cell_type": "markdown", + "id": "e9c66764-75f7-4cfe-a869-0e9c954add61", + "metadata": {}, + "source": [ + "## Creating the current PPP-2019 USD `cgdpo` and constant PPP-2019 USD `rgdpna`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27a3f1a9-bccc-4c1d-8344-9ccab6c6ed8e", + "metadata": {}, + "outputs": [], + "source": [ + "## some minor clean-up of names\n", + "y_clean.drop(\n", + " [\"cgdpo_pwt\", \"cgdpo_pc_pwt\", \"pop_pwt\", \"cgdpo_pc_equiv\", \"conv\"],\n", + " axis=1,\n", + " inplace=True,\n", + ")\n", + "ren_names = [\"cgdpo_pc\", \"cgdpo\", \"rgdpna_pc\", \"rgdpna\"]\n", + "y_clean.rename(\n", + " columns=dict(zip(ren_names, [x + \"_17\" for x in ren_names])), inplace=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6320a549-32cd-48d0-aaad-a15291e4f7b8", + "metadata": {}, + "source": [ + "Creating `cgdpo_19` and `cgdpo_pc_19`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c33d1fd3-310a-427f-b826-8d4970faf3fd", + "metadata": {}, + "outputs": [], + "source": [ + "pwt100 = pd.read_excel(sset.PATH_PWT_RAW)\n", + "pwt100.rename(columns={\"countrycode\": \"ccode\"}, inplace=True)\n", + 
"pwt100.set_index([\"ccode\", \"year\"], inplace=True)\n", + "infla_1719 = pwt100.loc[(\"USA\", 2019), \"pl_gdpo\"] / pwt100.loc[(\"USA\", 2017), \"pl_gdpo\"]\n", + "\n", + "y_clean[\"cgdpo_19\"] = y_clean[\"cgdpo_17\"] * infla_1719\n", + "y_clean[\"cgdpo_pc_19\"] = y_clean[\"cgdpo_19\"] / y_clean[\"pop\"]" + ] + }, + { + "cell_type": "markdown", + "id": "1a4ed720-b223-4590-af99-a7a9a7c50b3b", + "metadata": {}, + "source": [ + "Creating `rgdpna_19` and `rgdpna_pc_19`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19b73abb-831d-4a7d-b17a-adec12f5dffb", + "metadata": {}, + "outputs": [], + "source": [ + "ccodes = y_clean.index.get_level_values(\"ccode\").unique()\n", + "y_clean[\"rgdpna_19\"] = 0\n", + "for cc in tqdm(ccodes):\n", + " cc_17 = y_clean.loc[(cc, 2019), \"rgdpna_17\"]\n", + " if cc_17 == 0:\n", + " continue\n", + " cc_vals = (\n", + " y_clean.loc[(cc, yrs_tgt), \"rgdpna_17\"].values\n", + " / cc_17\n", + " * y_clean.loc[(cc, 2019), \"cgdpo_19\"]\n", + " )\n", + " y_clean.loc[(cc, yrs_tgt), \"rgdpna_19\"] = cc_vals\n", + "\n", + "y_clean[\"rgdpna_pc_19\"] = y_clean[\"rgdpna_19\"] / y_clean[\"pop\"]" + ] + }, + { + "cell_type": "markdown", + "id": "30ded9f9-f1a9-430f-8fdb-181d5b8718c7", + "metadata": {}, + "source": [ + "Filling the `nan`s in with zeros (which are from zero population)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1de94649-6243-413c-b048-56a95e35cca7", + "metadata": {}, + "outputs": [], + "source": [ + "y_clean.loc[pd.isnull(y_clean.rgdpna_pc_19), \"rgdpna_pc_19\"] = 0\n", + "y_clean.loc[pd.isnull(y_clean.cgdpo_pc_19), \"cgdpo_pc_19\"] = 0" + ] + }, + { + "cell_type": "markdown", + "id": "6a574c24-0f2f-4f58-86da-3f0ddc4300ab", + "metadata": { + "tags": [] + }, + "source": [ + "## I-Y (investment to GDP) ratios and `delta` (depreciation rate) cleanup\n", + "\n", + "We do not extrapolate for all the missingness of I-Y ratio here since that will be done by notebooks to follow." 
## from PWT: `csh_i` (renamed to `iy_ratio`) and `delta`
y_clean = y_clean.merge(
    pwt100[["csh_i", "delta"]].rename(columns={"csh_i": "iy_ratio"}),
    how="left",
    left_index=True,
    right_index=True,
)

## from WB WDI
wb_wdi = pd.read_parquet(sset.DIR_WB_WDI_RAW / "wdi_pop_iy_gdp.parquet")
y_clean = y_clean.merge(
    wb_wdi[["NE.GDI.FTOT.ZS"]].rename(columns={"NE.GDI.FTOT.ZS": "wb_iy_ratio"}),
    how="left",
    left_index=True,
    right_index=True,
)

## these values are in percentages, so change accordingly
y_clean["wb_iy_ratio"] = y_clean["wb_iy_ratio"] / 100

# adding in the IMF iy ratios, while trying to avoid the ArrowInvalid error
imf = pd.read_excel(sset.PATH_IMF_WEO_RAW, na_values=["n/a", "--"]).rename(
    columns={"ISO": "ccode", "Subject Descriptor": "subject"}
)
imf = imf.loc[imf.ccode.isin(sset.ALL_ISOS_EXTENDED), :]

# renaming and organizing in vertical format
imf.loc[imf.subject == "Total investment", "subject"] = "imf_iy_ratio"
v_names = dict(zip(list(range(1980, 2021)), ["v_" + str(x) for x in range(1980, 2021)]))
imf.rename(columns=v_names, inplace=True)

imf_reorg = imf.loc[
    imf.subject == "imf_iy_ratio", ["ccode"] + list(v_names.values())
].set_index(["ccode"])
imf_reorg = ypk_fn.organize_hor_to_ver(
    imf_reorg, "ccode", None, "imf_iy_ratio", "v_", range(1980, 2021)
)
imf_reorg["imf_iy_ratio"] /= 100

# merging
# NOTE(review): `how="outer"` can append rows that exist only in the IMF table,
# whereas the PWT and WB merges above use `how="left"` -- confirm this is intended.
y_clean = y_clean.merge(imf_reorg, how="outer", left_index=True, right_index=True)

## IY ratio fill-in, not smoothly; PWT -> WB -> IMF
has_pwt = ~pd.isnull(y_clean["iy_ratio"]).values
has_wb = ~pd.isnull(y_clean["wb_iy_ratio"]).values
has_imf = ~pd.isnull(y_clean["imf_iy_ratio"]).values

# first matching source wins; "-" marks rows without any I-Y information.
# Built in one go as strings rather than writing strings into a float NaN column.
y_clean["iy_ratio_source"] = np.select(
    [has_pwt, has_wb, has_imf], ["PWT", "WB", "IMF"], default="-"
)

## filling in with WB values first, then IMF values
y_clean["iy_ratio"] = np.where(
    has_pwt,
    y_clean["iy_ratio"].values,
    np.where(has_wb, y_clean["wb_iy_ratio"].values, y_clean["imf_iy_ratio"].values),
)
y_clean.drop(["wb_iy_ratio", "imf_iy_ratio"], inplace=True, axis=1)

## filling in the delta information
y_clean["delta_source"] = "-"
y_clean.loc[~pd.isnull(y_clean.delta), "delta_source"] = "PWT"

### 2020 information is missing entirely, so just use 2019's values
# positional copy: assumes every ccode has both a 2019 and a 2020 row, in the
# same order -- TODO confirm
y_clean.loc[(slice(None), 2020), "delta"] = y_clean.loc[
    (slice(None), 2019), "delta"
].values
y_clean.loc[
    (~pd.isnull(y_clean.delta)) & (y_clean.index.get_level_values("year") == 2020),
    "delta_source",
] = "PWT_copy_2019"

### using annual global averages when values are missing;
# average only `delta`: a blanket `.mean()` over all columns fails on the string
# columns with pandas >= 2.0 (numeric_only defaults to False there)
gp_yrly_delta = y_clean.groupby(level="year")[["delta"]].mean()
y_clean = y_clean.merge(
    gp_yrly_delta.rename(columns={"delta": "delta_yr_avg"}),
    left_index=True,
    right_index=True,
    how="left",
)
y_clean.loc[pd.isnull(y_clean.delta), "delta"] = y_clean.loc[
    pd.isnull(y_clean.delta), "delta_yr_avg"
].values
y_clean.loc[y_clean.delta_source == "-", "delta_source"] = "yearly_global_avg"

# ## Capital ratio by category
#
# In the case of missing data, we will again use yearly global average and mark
# those as so.

## capital information
PWT_RAW_DIR = Path(os.path.dirname(sset.PATH_PWT_RAW))
pwt_capital = pd.read_excel(PWT_RAW_DIR / "pwt_K_detail_100.xlsx").rename(
    columns={"countrycode": "ccode"}
)
capital_vals = ["Nc_Struc", "Nc_Mach", "Nc_TraEq", "Nc_Other"]
pwt_capital = pwt_capital.set_index(["ccode", "year"])[capital_vals]
for i in capital_vals:
    pwt_capital[i] = pwt_capital[i].astype("float64")

## ratio of capital in each category to total capital
pwt_capital["total_cap"] = pwt_capital[capital_vals].sum(axis=1)
newnames = []
for i in capital_vals:
    newname = i.split("_")[-1].lower() + "_ratio_prep"
    newnames.append(newname)
    pwt_capital[newname] = pwt_capital[i] / pwt_capital["total_cap"]

y_clean = y_clean.merge(
    pwt_capital[newnames], left_index=True, right_index=True, how="left"
)

"y_clean[\"k_ratio_source\"] = \"-\"\n", + "y_clean.loc[~pd.isnull(y_clean[newnames[0]]), \"k_ratio_source\"] = \"PWT\"\n", + "\n", + "## copying 2019 value into 2020 ones\n", + "for i in newnames:\n", + " y_clean.loc[(slice(None), 2020), i] = y_clean.loc[(slice(None), 2019), i].values\n", + "y_clean.loc[\n", + " (~pd.isnull(y_clean[i])) & (y_clean.index.get_level_values(\"year\") == 2020),\n", + " \"k_ratio_source\",\n", + "] = \"PWT_copy_2019\"\n", + "\n", + "k_ratio_names = []\n", + "for i in newnames:\n", + " yr_avg = i[0:-5] + \"_yr_avg\"\n", + " y_clean = y_clean.merge(\n", + " y_clean.reset_index().groupby(\"year\").mean()[[i]].rename(columns={i: yr_avg}),\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + " )\n", + " name = \"k_\" + i[0:-5]\n", + " k_ratio_names.append(name)\n", + " y_clean[name] = y_clean[i]\n", + " y_clean.loc[pd.isnull(y_clean[i]), name] = y_clean.loc[\n", + " pd.isnull(y_clean[i]), yr_avg\n", + " ]\n", + " if i == newnames[0]:\n", + " y_clean.loc[pd.isnull(y_clean[i]), \"k_ratio_source\"] = \"yearly_global_avg\"" + ] + }, + { + "cell_type": "markdown", + "id": "f1567d69-fc76-4133-af74-1471cebc8f2c", + "metadata": {}, + "source": [ + "We also add the \"movable capital ratio\" (`k_movable_ratio`), which is the sum of `k_mach_ratio`, `k_traeq_ratio`, and `k_other_ratio`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c1c1cb7-8092-42b5-bb3d-cd1be647f922", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "y_clean[\"k_movable_ratio\"] = (\n", + " y_clean[\"k_mach_ratio\"] + y_clean[\"k_traeq_ratio\"] + y_clean[\"k_other_ratio\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e78308d8-249b-457c-9a77-70472c8b7563", + "metadata": {}, + "source": [ + "## Exporting\n", + "\n", + "Let us also clarify the current PPP, USD 2017 and constant 2017 PPP USD variables' names (by marking them with `_17`) to signal what units they are in." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9754405a-04f2-4c1f-abe9-5bf1f8ab1688", + "metadata": {}, + "outputs": [], + "source": [ + "y_clean_cop = y_clean.copy() ## due to pandas error, I will make a copy\n", + "y_clean_cop[\"pop_unit\"] = \"millions (of people)\"\n", + "y_clean_cop[\"gdp_unit\"] = \"millions (of USD)\"\n", + "y_clean_cop[\"gdppc_unit\"] = \"ones (of USD)\"\n", + "\n", + "## reorganizing the columns\n", + "gp_columns = [\n", + " \"pop_unit\",\n", + " \"gdppc_unit\",\n", + " \"gdp_unit\",\n", + " \"pop_source\",\n", + " \"gdp_source\",\n", + " \"iy_ratio_source\",\n", + " \"k_ratio_source\",\n", + " \"delta_source\",\n", + " \"pop\",\n", + " \"rgdpna_pc_17\",\n", + " \"rgdpna_17\",\n", + " \"rgdpna_pc_19\",\n", + " \"rgdpna_19\",\n", + " \"cgdpo_pc_17\",\n", + " \"cgdpo_17\",\n", + " \"cgdpo_pc_19\",\n", + " \"cgdpo_19\",\n", + " \"iy_ratio\",\n", + "]\n", + "gp_columns += k_ratio_names + [\"k_movable_ratio\", \"delta\"]\n", + "y_clean_cop = y_clean_cop[gp_columns].copy()\n", + "\n", + "## Exporting\n", + "y_clean_cop.to_parquet(\n", + " sset.DIR_YPK_INT / \"gdp_gdppc_pop_capital_1950_2020_post_ypk3.parquet\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "0b9dcb8e7e194842ae99b1a9baea9759": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_ee7a58fc46dd4b4d9da4f91588307bb2", + "style": "IPY_MODEL_561a97a4b45448a9a8eae441f8e5d0da", + "value": "100%" + } + }, + "1c73fdc454c6491fb61f02cb7ed9fb60": { 
+ "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "1c9166276ee347d39c95faf165bc9450": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_53f1e8cca07a4d38aee57777f84c46e9", + "style": "IPY_MODEL_fdee249ae0a74cd5883c86469c39aab9", + "value": " 58/128 [00:03<00:03, 18.74it/s]" + } + }, + "2219a9c4f88340f88bdac773d3ff4b81": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_0b9dcb8e7e194842ae99b1a9baea9759", + "IPY_MODEL_bfb81f9cf73640fa97dd0610729aff52", + "IPY_MODEL_7898ad926b0d4883bcba893ae7fc4767" + ], + "layout": "IPY_MODEL_fd84190ae61144e09e5218bfe17d1fc7" + } + }, + "28f49eb672ba4d298a6053625cd08683": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_93f37f94c1c743138ac2c6ee01c067a8", + "max": 41, + "style": "IPY_MODEL_7f3aaa711aac4b12852fd21f7e39945d", + "value": 41 + } + }, + "2ca5cda929ec4f598887d57499131c4e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "2cf7257329bc449a8f7f2cea82f102e2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "2f2ef046bcef46b8977cc0ec176e5e3d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "3e4ddebc6d8146c5aa104043051a7d9c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + 
"state": { + "layout": "IPY_MODEL_7080a43f19074187898a40c3a20ee40d", + "style": "IPY_MODEL_ca9b28f1e0f14fc8b47d9c8744aed724", + "value": "100%" + } + }, + "3f5b1a2098e44766aa993c6b82496274": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_404dbe7264f44e20bfcf76a2442bc307", + "IPY_MODEL_d472be28fc3f41e9a88461090353c6e7", + "IPY_MODEL_1c9166276ee347d39c95faf165bc9450" + ], + "layout": "IPY_MODEL_9c15bafa92dc45289705416df944586c" + } + }, + "404dbe7264f44e20bfcf76a2442bc307": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_d70e0b24d77f4ae8a376588cb6ed1950", + "style": "IPY_MODEL_7552bca25c9a43a2905ae9fde45c5edb", + "value": " 45%" + } + }, + "45da96f00ed548cfa6d134939acbadd5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_c21431b349d84789982ea663823146b3", + "style": "IPY_MODEL_8cb6432b2f084c3cb409d6c7beba6632", + "value": "100%" + } + }, + "4d24dc58e4b84e11a720130e59a5cd77": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_45da96f00ed548cfa6d134939acbadd5", + "IPY_MODEL_28f49eb672ba4d298a6053625cd08683", + "IPY_MODEL_f0842f58af714d499f34aedec233b7c9" + ], + "layout": "IPY_MODEL_1c73fdc454c6491fb61f02cb7ed9fb60" + } + }, + "50b4c5c87b804e0f9214977cd06fef37": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_f1fd6be11513474aa87063932f6eac2b", + "max": 128, + "style": "IPY_MODEL_729bb13470b447e79b3caea1241f1e93", + "value": 128 + } + }, + "53f1e8cca07a4d38aee57777f84c46e9": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "55b9a6a30f8b416cbba6c9d62e2087fd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_3e4ddebc6d8146c5aa104043051a7d9c", + "IPY_MODEL_50b4c5c87b804e0f9214977cd06fef37", + "IPY_MODEL_5c2902f5fee74ae3aa47520fef604943" + ], + "layout": "IPY_MODEL_63385f1808b547e6a8d3d4ed7b96b70a" + } + }, + "561a97a4b45448a9a8eae441f8e5d0da": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "5c2902f5fee74ae3aa47520fef604943": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_f81dbdbbbbf54f7ca9164896bc1bff6e", + "style": "IPY_MODEL_2ca5cda929ec4f598887d57499131c4e", + "value": " 128/128 [00:06<00:00, 18.85it/s]" + } + }, + "63385f1808b547e6a8d3d4ed7b96b70a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "7080a43f19074187898a40c3a20ee40d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "729bb13470b447e79b3caea1241f1e93": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "7552bca25c9a43a2905ae9fde45c5edb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "7898ad926b0d4883bcba893ae7fc4767": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_a21a9092bda140fdafae94f953e21434", 
+ "style": "IPY_MODEL_2cf7257329bc449a8f7f2cea82f102e2", + "value": " 229/229 [00:00<00:00, 517.42it/s]" + } + }, + "7f3aaa711aac4b12852fd21f7e39945d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "8b21dc66894e4f06aaf34585481aae6e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "8cb6432b2f084c3cb409d6c7beba6632": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "93f37f94c1c743138ac2c6ee01c067a8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "9c15bafa92dc45289705416df944586c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "a21a9092bda140fdafae94f953e21434": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "a2f1698ea33a4075b704c93fbfe2a56d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "ab97341f3436466aa6f4964db63be7f6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "bfb81f9cf73640fa97dd0610729aff52": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_8b21dc66894e4f06aaf34585481aae6e", + "max": 229, + "style": "IPY_MODEL_d578d2e01b464318a2a7aa63c44b45a2", + "value": 229 + } + }, + "c21431b349d84789982ea663823146b3": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "ca9b28f1e0f14fc8b47d9c8744aed724": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "d472be28fc3f41e9a88461090353c6e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "danger", + "layout": "IPY_MODEL_f300e0de6459495c925d508aa567cc10", + "max": 128, + "style": "IPY_MODEL_ab97341f3436466aa6f4964db63be7f6", + "value": 58 + } + }, + "d578d2e01b464318a2a7aa63c44b45a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "d70e0b24d77f4ae8a376588cb6ed1950": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "ee7a58fc46dd4b4d9da4f91588307bb2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "f0842f58af714d499f34aedec233b7c9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_a2f1698ea33a4075b704c93fbfe2a56d", + "style": "IPY_MODEL_2f2ef046bcef46b8977cc0ec176e5e3d", + "value": " 41/41 [00:02<00:00, 17.28it/s]" + } + }, + "f1fd6be11513474aa87063932f6eac2b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "f300e0de6459495c925d508aa567cc10": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "f81dbdbbbbf54f7ca9164896bc1bff6e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": 
"1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "fd84190ae61144e09e5218bfe17d1fc7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "fdee249ae0a74cd5883c86469c39aab9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk3_demo_ratios_historical_reg.ipynb b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk3_demo_ratios_historical_reg.ipynb new file mode 100644 index 0000000..c1f73d2 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk3_demo_ratios_historical_reg.ipynb @@ -0,0 +1,1135 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "68fee01c-c1cb-4120-8c8f-319624f439b1", + "metadata": {}, + "source": [ + "## Code for creating \"demographic ratios,\" \"demographic variables,\" other variables needed for the \"historical regression\" (based on Higgins, 1998, International Economic Review), and executing the said regression\n", + "\n", + "This notebook was written to execute the following tasks:\n", + "- Create \"demographic ratios,\" which are the shares of specific age-groups (0-4, 5-9, ..., 65-69, and 70+) for each country; these will be made in five-year moving averages, for the \"historical regression\" (based on Higgins, 1998, Int. Econ. 
Rev.)\n", + "- Create \"demographic variables,\" which are created from the demographic ratios and are used in the historical regression\n", + "- Create other variables derived from GDPpc and its growth rate to be used in the historical regression\n", + "- Conduct the historical regression and project missing values for the I-Y ratios\n", + "\n", + "## Setting\n", + "\n", + "### Importing necessary modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27c2bff0-2778-4158-b2e1-129d1aabdd17", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "432995b6-2134-40d4-8704-8f14f0cea373", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import statsmodels.api as sm\n", + "from tqdm.auto import tqdm\n", + "\n", + "from sliiders import country_level_ypk as ypk_fn\n", + "from sliiders import settings as sset" + ] + }, + { + "cell_type": "markdown", + "id": "9f59da9e-de72-4e2e-a0d7-83b16fe6dd1b", + "metadata": { + "tags": [] + }, + "source": [ + "## Creating (five-year averages of) demographic ratios\n", + "\n", + "### Importing necessary datasets\n", + "\n", + "We note that the population values in the dataset we cleaned up are in millions of people whereas the UN data is in thousands of people. Therefore, we will divide the columns `PopMale`, `PopFemale`, and `PopTotal` by 1000 to keep all values in millions of people."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca5b745b-4771-4099-8fec-8aa559dcb0b5", + "metadata": {}, + "outputs": [], + "source": [ + "# overall population stuff\n", + "pop = pd.read_parquet(\n", + " sset.DIR_YPK_INT / \"gdp_gdppc_pop_capital_1950_2020_post_ypk3.parquet\"\n", + ")\n", + "\n", + "# let us set aside the uninhabited areas\n", + "pop = pop.loc[\n", + " ~pop.index.get_level_values(\"ccode\").isin(sset.UNINHABITED_ISOS), :\n", + "].sort_index()\n", + "ccodes = pop.index.get_level_values(\"ccode\").unique()\n", + "\n", + "# by-age population\n", + "by_age = pd.read_parquet(sset.DIR_YPK_INT / \"un_population_by_age.parquet\")\n", + "for i in [\"PopMale\", \"PopFemale\", \"PopTotal\"]:\n", + " by_age[i] = by_age[i] / 1000" + ] + }, + { + "cell_type": "markdown", + "id": "97afbfd6-7d25-4472-95b3-4540d5b883fb", + "metadata": { + "tags": [] + }, + "source": [ + "### Gathering the age groups\n", + "\n", + "#### For all countries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3852080c-ecc9-499f-80d9-46df97bd168b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "## generating groups\n", + "yrs = np.arange(0, 70, 5)\n", + "groups = [\"-\".join([str(x), str(x + 4)]) for x in yrs]\n", + "\n", + "## easier mapping from group names to group numbering\n", + "dic = dict(zip(groups, range(1, len(groups) + 1)))\n", + "case_df = pd.DataFrame(\n", + " np.vstack([list(dic.keys()), list(dic.values())]).T, columns=[\"AgeGrp\", \"group_num\"]\n", + ")\n", + "\n", + "## group numbering attach\n", + "by_age_cleaning = by_age.reset_index().merge(case_df, on=\"AgeGrp\", how=\"left\")\n", + "\n", + "## if no group number (i.e., the highest age group), just create a new one\n", + "by_age_cleaning.loc[pd.isnull(by_age_cleaning.group_num), \"group_num\"] = (\n", + " max(list(dic.values())) + 1\n", + ")\n", + "\n", + "## gathering by the generated group numberings\n", + "by_age_cleaning = 
by_age_cleaning.groupby([\"ccode\", \"Time\", \"group_num\"]).sum()\n", + "by_age_cleaning.reset_index(inplace=True)\n", + "\n", + "## reorganizing, data type-setting\n", + "by_age_cleaning = by_age_cleaning.astype({\"Time\": \"int64\", \"group_num\": \"int64\"})\n", + "by_age_cleaning.rename(columns={\"Time\": \"year\"}, inplace=True)\n", + "by_age_cleaning.set_index([\"ccode\", \"year\", \"group_num\"], inplace=True)\n", + "by_age_cleaning.sort_index(inplace=True)\n", + "\n", + "## Only want 2020 and before\n", + "by_age_cleaning = by_age_cleaning.loc[\n", + " by_age_cleaning.index.get_level_values(\"year\") <= 2020, [\"PopTotal\"]\n", + "].copy()\n", + "\n", + "## again, cleaning and merging in the group names\n", + "case_df = case_df.astype({\"group_num\": \"int64\"}).set_index([\"group_num\"])\n", + "by_age_cleaning = by_age_cleaning.merge(\n", + " case_df, left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "\n", + "## if missing group name, this should be for over 70+\n", + "by_age_cleaning.loc[pd.isnull(by_age_cleaning.AgeGrp), \"AgeGrp\"] = \"70+\"" + ] + }, + { + "cell_type": "markdown", + "id": "dcfeaadf-08e1-4792-9528-50c8c2735552", + "metadata": {}, + "source": [ + "#### Taking care of the Channel Islands, and setting aside their information\n", + "\n", + "We find that the `grps_df` actually includes the information for the Channel Islands, which we can separate out to Guernsey and Jersey (`GGY` and `JEY`). In the said separation effort, what we will do is to use the average ratio of `GGY` population and `JEY` population between the years 2009-2019 as noted in the previous file (`ypk3_reorg_and_impute_ypk.ipynb`). The reason for using the years 2009-2019 is because we have actual data for `GGY` for those years in Guernsey Annual Electronic Report." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0eb9ac1c-b1d9-45f5-b216-0d7ac4564a51", + "metadata": {}, + "outputs": [], + "source": [ + "## GGY and JEY ratios\n", + "ggy_0919 = pop.loc[(\"GGY\", list(range(2009, 2020))), \"pop\"].values\n", + "jey_0919 = pop.loc[(\"JEY\", list(range(2009, 2020))), \"pop\"].values\n", + "ggy_ratio = (ggy_0919 / (ggy_0919 + jey_0919)).mean()\n", + "jey_ratio = (jey_0919 / (ggy_0919 + jey_0919)).mean()\n", + "\n", + "## separating out JEY and GGY\n", + "channel = by_age_cleaning.loc[(\"GGY+JEY\", slice(None), slice(None)), :].copy()\n", + "ggy, jey = channel.reset_index(), channel.reset_index()\n", + "ggy[\"PopTotal\"] = ggy[\"PopTotal\"].values * ggy_ratio\n", + "jey[\"PopTotal\"] = jey[\"PopTotal\"].values * jey_ratio\n", + "ggy[\"ccode\"], jey[\"ccode\"] = \"GGY\", \"JEY\"\n", + "ggy.set_index([\"ccode\", \"year\", \"group_num\"], inplace=True)\n", + "jey.set_index([\"ccode\", \"year\", \"group_num\"], inplace=True)\n", + "\n", + "## merging stuff together\n", + "by_age_cleaning = pd.concat(\n", + " [\n", + " ggy,\n", + " jey,\n", + " by_age_cleaning.loc[\n", + " by_age_cleaning.index.get_level_values(\"ccode\") != \"GGY+JEY\", :\n", + " ],\n", + " ],\n", + " axis=0,\n", + ").sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "f68b301c-2c4c-4d94-a0e8-68becace7c4a", + "metadata": {}, + "source": [ + "### Creating demographic ratios of age-group population 5-year-averages, and extrapolating for missing countries\n", + "\n", + "#### 5-year-averages of age-group population\n", + "\n", + "To be exact, this would be the 5 previous years' averages. If there are less than 5 previous years available (for 1950-1954), we use whatever previous years we have, with the exception of year 1950 where the 1950 values are copied due to lack of previous-years data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1864ce4b-d0c0-4e78-b34b-9b992cf537ee", + "metadata": {}, + "outputs": [], + "source": [ + "by_age_cleaning[\"avg_5_yrs\"] = np.nan\n", + "\n", + "for i in tqdm(range(1950, 2021)):\n", + " if i in [1950, 1951]:\n", + " by_age_cleaning.loc[(slice(None), i), \"avg_5_yrs\"] = by_age_cleaning.loc[\n", + " (slice(None), 1950), \"PopTotal\"\n", + " ].values\n", + " continue\n", + " elif i in [1952, 1953, 1954]:\n", + " prev_yrs = list(range(1950, i))\n", + " else:\n", + " prev_yrs = list(range(i - 5, i))\n", + "\n", + " for j, yr in enumerate(prev_yrs):\n", + " name = \"pop_{}\".format(yr)\n", + " yr_df = by_age_cleaning.loc[(slice(None), yr, slice(None)), [\"PopTotal\"]]\n", + " yr_df.reset_index(inplace=True)\n", + " yr_df.drop([\"year\"], inplace=True, axis=1)\n", + " yr_df.rename(columns={\"PopTotal\": name}, inplace=True)\n", + " yr_df.set_index([\"ccode\", \"group_num\"], inplace=True)\n", + "\n", + " if j == 0:\n", + " prev_df = yr_df.copy()\n", + " else:\n", + " prev_df = prev_df.merge(\n", + " yr_df, left_index=True, right_index=True, how=\"left\"\n", + " )\n", + "\n", + " prev_df[\"prev_mean\"] = prev_df[[\"pop_{}\".format(x) for x in prev_yrs]].mean(axis=1)\n", + " prev_df.reset_index(inplace=True)\n", + " prev_df[\"year\"] = i\n", + " prev_df.set_index([\"ccode\", \"year\", \"group_num\"], inplace=True)\n", + " by_age_cleaning = by_age_cleaning.merge(\n", + " prev_df[[\"prev_mean\"]], left_index=True, right_index=True, how=\"left\"\n", + " )\n", + " by_age_cleaning.loc[(slice(None), i), \"avg_5_yrs\"] = by_age_cleaning.loc[\n", + " (slice(None), i), \"prev_mean\"\n", + " ].values\n", + " by_age_cleaning.drop([\"prev_mean\"], inplace=True, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "a1859a37-ee90-4950-a63b-97b82c79e6eb", + "metadata": {}, + "source": [ + "#### Creating the demographic ratios" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"934b555b-99b3-461a-8be4-e1cc93ae0759", + "metadata": {}, + "outputs": [], + "source": [ + "total = by_age_cleaning.reset_index().groupby([\"ccode\", \"year\"]).sum()[[\"avg_5_yrs\"]]\n", + "total.rename(columns={\"avg_5_yrs\": \"total\"}, inplace=True)\n", + "by_age_cleaning = by_age_cleaning.merge(\n", + " total, left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "by_age_cleaning[\"demo_ratio\"] = by_age_cleaning[\"avg_5_yrs\"] / by_age_cleaning[\"total\"]\n", + "\n", + "## setting aside the countries that we will actually use\n", + "by_age_cleaning = by_age_cleaning.loc[\n", + " by_age_cleaning.index.get_level_values(\"ccode\").isin(ccodes), :\n", + "].sort_index()\n", + "\n", + "by_age_cleaning.drop([\"total\"], inplace=True, axis=1)\n", + "by_age_cleaning[\"demo_ratio_source\"] = \"UN\"" + ] + }, + { + "cell_type": "markdown", + "id": "52eb1d2d-b4c9-41f5-8af3-7253ba1c8d97", + "metadata": {}, + "source": [ + "#### Finding the \"similar\" countries (in terms of population growth trajectory) creating demographic ratios by way of weighted averaging of similar countries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad208aef-6584-4064-b6cb-2f11022697d3", + "metadata": {}, + "outputs": [], + "source": [ + "## creating growth rates of population\n", + "pop_hor = ypk_fn.organize_ver_to_hor(\n", + " pop[[\"pop\"]], \"pop\", \"year\", \"ccode\", total_yrs=range(1950, 2021)\n", + ")\n", + "for i in range(1951, 2021):\n", + " v_, v_prev = \"v_{}\".format(i), \"v_{}\".format(i - 1)\n", + " newvar = \"r_{}\".format(i)\n", + " pop_hor[newvar] = pop_hor[v_] / pop_hor[v_prev] - 1\n", + "\n", + "## finding which countries we should extrapolate for, and extrapolate from\n", + "msng_ccodes = np.setdiff1d(\n", + " ccodes, by_age_cleaning.index.get_level_values(\"ccode\").unique()\n", + ")\n", + "valid_ccodes = np.intersect1d(\n", + " ccodes, by_age_cleaning.index.get_level_values(\"ccode\").unique()\n", + ")" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "65fecae9-adfe-436d-9579-6e181a0dccdf", + "metadata": {}, + "outputs": [], + "source": [ + "def find_similars_extrap_demog_ratio(\n", + " ctry, sim_df, demoratio_df, valid_ccodes, header=\"r_\", n_det=5\n", + "):\n", + " \"\"\"Find `n_det` most similar countries (in terms of population trajectories)\n", + " for a certain country, among those that are listed in valid_ccodes. Then,\n", + " take a weighted average of those countries' to extrapolate demographic ratio.\n", + "\n", + " Parameters\n", + " ----------\n", + " ctry : str\n", + " name of the country to find \"similar\"-trend countries for\n", + " sim_df : pandas DataFrame\n", + " wide-panel-format data to calculate trend-similarities for the `ctry` and other\n", + " countries. Should have yearly data that has column names starting with the\n", + " `header`\n", + " demoratio_df : pandas DataFrame\n", + " DataFrame containing information for the demographic ratio; should have the\n", + " column `demo_ratio` contained\n", + " valid_ccodes : array-like of str\n", + " DataFrame containing country codes which are \"valid\" ones to create extrapolated\n", + " demographic ratios from.\n", + " header : str\n", + " header of the columns in `sim_df` which are yearly variables to detect\n", + " similarities (in growth rate) from\n", + " n_det : int\n", + " number of country codes in `valid_ccodes` to created extrapolated demographic\n", + " ratios from; top `n_det` countries in terms of similaries are selected\n", + "\n", + " Returns\n", + " -------\n", + " extrap_df : pandas DataFrame\n", + " contains information about the extrapolated demographic ratios, specific for the\n", + " country code defined by `ctry`; has the variables `demo_ratio` for the\n", + " extrapolated demographic ratio, and `demo_ratio_source` for the countries whose\n", + " information was utilized to create the `demo_ratio` extrapolations. 
Has indices\n", + " `ccode`, `year`, and `group_num` for countrycode, year, and group numbering.\n", + "\n", + " \"\"\"\n", + " goodcols = [x for x in sim_df.columns if header in x]\n", + " df_sse = (\n", + " sim_df.loc[valid_ccodes, goodcols].sub(sim_df.loc[ctry, goodcols].values).copy()\n", + " )\n", + " df_sse[\"sse\"] = (df_sse[goodcols].values ** 2).sum(axis=1)\n", + " df_sse[\"sse_rank\"] = df_sse[\"sse\"].rank()\n", + " df_sse.sort_values([\"sse_rank\"], inplace=True)\n", + " df_sse = df_sse.loc[df_sse.sse_rank <= n_det, [\"sse\", \"sse_rank\"]].copy()\n", + "\n", + " extrap_df = demoratio_df.loc[\n", + " (df_sse.index.values, slice(None), slice(None)), [\"demo_ratio\"]\n", + " ].merge(df_sse[[\"sse\"]], left_index=True, right_index=True, how=\"left\")\n", + " denom = np.sum(1 / df_sse.sse)\n", + " extrap_df[\"numer\"] = extrap_df[\"demo_ratio\"] / extrap_df[\"sse\"]\n", + " extrap_df = extrap_df.reset_index().groupby([\"year\", \"group_num\"]).sum()[[\"numer\"]]\n", + " extrap_df[\"numer\"] = extrap_df[\"numer\"] / denom\n", + "\n", + " extrap_df = extrap_df.reset_index().rename(columns={\"numer\": \"demo_ratio\"})\n", + " extrap_df[\"ccode\"] = ctry\n", + " extrap_df.set_index([\"ccode\", \"year\", \"group_num\"], inplace=True)\n", + " extrap_df[\"demo_ratio_source\"] = \",\".join(df_sse.index.values)\n", + "\n", + " return extrap_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c8575a3-a863-4ff3-ad72-0f0b593fee00", + "metadata": {}, + "outputs": [], + "source": [ + "msng_extrap_dfs = []\n", + "for i in tqdm(msng_ccodes):\n", + " msng_extrap_dfs.append(\n", + " find_similars_extrap_demog_ratio(i, pop_hor, by_age_cleaning, valid_ccodes)\n", + " )\n", + "msng_extrap_dfs = pd.concat(msng_extrap_dfs, axis=0)\n", + "\n", + "## creating a finalized version of the demographic ratio dataset\n", + "demo_ratio_df = pd.concat(\n", + " [by_age_cleaning[[\"demo_ratio\", \"demo_ratio_source\"]], msng_extrap_dfs], axis=0\n", + 
").sort_index()\n", + "\n", + "## attaching the Age Group indicators\n", + "demo_ratio_df = demo_ratio_df.merge(\n", + " case_df, how=\"left\", right_index=True, left_index=True\n", + ")\n", + "demo_ratio_df.loc[pd.isnull(demo_ratio_df.AgeGrp), \"AgeGrp\"] = \"70+\"" + ] + }, + { + "cell_type": "markdown", + "id": "dcfebd10-bcae-4878-87a5-ea73d8dea1a2", + "metadata": {}, + "source": [ + "### Exporting the demographic ratios results\n", + "\n", + "Note that the uninhabited areas are, again, left out from this dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27fdf070-0b31-401b-82a7-7a9472a9aa75", + "metadata": {}, + "outputs": [], + "source": [ + "demo_ratio_df.to_parquet(sset.DIR_YPK_INT / \"demo_ratio_1950_2020.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "7fee9545-7ecd-4f4e-bc8c-9688dffefa9b", + "metadata": {}, + "source": [ + "## Creating the \"demographic variables\"\n", + "\n", + "Demographic variables are a succinct way of representing different age group (demographic) ratios into just a few variables by assuming a functional (polynomial) form for how they enter the regression. If we assume a $l$th order polynomial form, there will be $k=1, 2, \\cdots, l$-order variables (so $l$ demographic variables to represent 15 demographic groups we have).\n", + "\n", + "For the $k$th order, we have\n", + "\n", + "$D_k = \\sum_{j=1}^J j^k p_j - \\frac{1}{J}\\sum_{j=1}^J j^k$\n", + "\n", + "where $J=15$ in our case (for the number of demographic groups considered) and $p_j$ is the demographic ratio of the $j$th group. Derivation of this form (and why we choose $l=3$ in this instance) is further explained in [Higgins (1995, International Economic Review, pp. 366-367)](https://www.jstor.org/stable/2527297)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e72d132b-c139-4274-8547-d59b89804a30", + "metadata": {}, + "outputs": [], + "source": [ + "demo = pd.read_parquet(sset.DIR_YPK_INT / \"demo_ratio_1950_2020.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6ffc47e-ea37-4240-abb8-6fa85158a75c", + "metadata": {}, + "outputs": [], + "source": [ + "def D_k_generator(df, k=1, v_name=\"demo_ratio\"):\n", + " \"\"\"Function to generate the k-order demographic variable for each country.\n", + "\n", + " Parameters\n", + " ----------\n", + " df : pandas DataFrame\n", + " DataFrame containing column to generate the k-order demographic variable. Should\n", + " have indices `ccode`, `year`, and `group_num` to indicate country-code, year,\n", + " and group numbering for the country-year ratio variable (by groups)\n", + " k : int\n", + " order of the demographic variable to create\n", + " v_name : str\n", + " column name of the ratio variable, contained in the DataFrame `df`\n", + "\n", + " Returns\n", + " -------\n", + " demovar_df : pandas DataFrame\n", + " DataFrame with the same `ccode`, `year`, and `group_num` index information but\n", + " containing a single variable, which is the calculated `k`-th order demographic\n", + " variable with the name `D{k}`.\n", + "\n", + " \"\"\"\n", + "\n", + " astmsg = \"Assign the indices ccode, year, group_num correctly\"\n", + " assert [\"ccode\", \"year\", \"group_num\"] == demo.index.names, astmsg\n", + "\n", + " groups = np.sort(df.index.get_level_values(\"group_num\").unique())\n", + " N_groups = len(groups)\n", + " groups_pwr_k = groups**k\n", + " to_subtract = np.mean(groups_pwr_k)\n", + "\n", + " group_df = pd.DataFrame(data={\"grp_k\": groups_pwr_k, \"group_num\": groups})\n", + " group_df.set_index([\"group_num\"], inplace=True)\n", + "\n", + " demovar_df = df.merge(group_df, left_index=True, right_index=True, how=\"left\")\n", + " demovar_df[\"grp_k_prod_ratio\"] = 
demovar_df[v_name] * demovar_df[\"grp_k\"]\n", + " demovar_df = (\n", + " demovar_df.reset_index().groupby([\"ccode\", \"year\"]).sum()[[\"grp_k_prod_ratio\"]]\n", + " )\n", + " demovar_df[\"D{}\".format(k)] = demovar_df[\"grp_k_prod_ratio\"] - to_subtract\n", + "\n", + " return demovar_df[[\"D{}\".format(k)]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "168c6ac4-ca0a-49c2-a7b5-1046ae17338e", + "metadata": {}, + "outputs": [], + "source": [ + "## demographic variables (orders 1, 2, 3) and source\n", + "demovar_df = D_k_generator(demo).merge(\n", + " D_k_generator(demo, k=2), how=\"left\", right_index=True, left_index=True\n", + ")\n", + "demovar_df = demovar_df.merge(\n", + " D_k_generator(demo, k=3), how=\"left\", right_index=True, left_index=True\n", + ")\n", + "demo_source = demo.loc[(slice(None), 1950, 1), [\"demo_ratio_source\"]].reset_index()\n", + "demo_source.drop([\"year\", \"group_num\"], inplace=True, axis=1)\n", + "demo_source.set_index([\"ccode\"], inplace=True)\n", + "demovar_df = demovar_df.merge(\n", + " demo_source.rename(columns={\"demo_ratio_source\": \"demo_var_source\"}),\n", + " left_index=True,\n", + " right_index=True,\n", + " how=\"left\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f953a55e-13de-4c11-a504-f7b4fcc92f02", + "metadata": {}, + "source": [ + "## Creating other variables to be used in the \"historical regression\" for I-Y ratios (based on Higgins, 1995, Int. Econ. Rev.)\n", + "\n", + "### Creating \"yhat\"\n", + "\n", + "By \"yhat\" or $\\hat{y}_t$, we mean the following:\n", + "\n", + "$$ \\hat{y}_t = \\frac{1}{5}\\sum_{i=1}^{5} y_{t-i} \\quad t \\geq 1955 $$\n", + "\n", + "where $y_{t}$ is year $t$'s value of GDP per capita (in ones of constant 2017 PPP USD). So essentially, it would be the 5-year average of GDPpc of a certain country for the 5 previous year to the year $t$. 
However, notice that for $t \\leq 1954$, since our data only go back to 1950, we cannot take a full 5-year average. Therefore, instead, we will use the following $n \\in \\{1, 2, 3, 4\\}$-year averages:\n", + "\n", + "$$ \\hat{y}_t = \\begin{cases}\n", + " y_{1950} & \\text{ if }t = 1950, 1951 \\\\\n", + " \\frac{1}{n} \\sum_{j=1950}^{1949+n} y_{j} & \\text{ if }t \\in \\{1952, 1953, 1954\\} \\text{ (and $n = t - 1950 $)}\n", + "\\end{cases} $$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef7493c1-d408-44e2-918a-218da2eea5f1", + "metadata": {}, + "outputs": [], + "source": [ + "gdppc = pd.read_parquet(\n", + " sset.DIR_YPK_INT / \"gdp_gdppc_pop_capital_1950_2020_post_ypk3.parquet\"\n", + ")\n", + "yvar = \"rgdpna_pc_17\"\n", + "gdppc[\"yhat\"] = np.nan\n", + "\n", + "for i in tqdm(range(1950, 2021)):\n", + " if i in [1950, 1951]:\n", + " gdppc.loc[(slice(None), i), \"yhat\"] = gdppc.loc[\n", + " (slice(None), 1950), yvar\n", + " ].values\n", + " continue\n", + " elif i in [1952, 1953, 1954]:\n", + " prev_range = range(1950, i)\n", + " else:\n", + " prev_range = range(i - 5, i)\n", + "\n", + " for yr in prev_range:\n", + " yr_df = gdppc.loc[(slice(None), yr), [yvar]].reset_index()\n", + " yr_df = yr_df.drop([\"year\"], axis=1).rename(columns={yvar: \"y_{}\".format(yr)})\n", + " yr_df.set_index([\"ccode\"], inplace=True)\n", + " if yr == prev_range[0]:\n", + " prev_df = yr_df.copy()\n", + " else:\n", + " prev_df = prev_df.merge(\n", + " yr_df, left_index=True, right_index=True, how=\"left\"\n", + " )\n", + " prev_df[\"yhat_i\"] = prev_df[[\"y_{}\".format(x) for x in prev_range]].mean(axis=1)\n", + " gdppc = gdppc.merge(\n", + " prev_df[[\"yhat_i\"]], left_index=True, right_index=True, how=\"left\"\n", + " )\n", + " gdppc.loc[(slice(None), i), \"yhat\"] = gdppc.loc[(slice(None), i), \"yhat_i\"].values\n", + " gdppc.drop([\"yhat_i\"], inplace=True, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": 
"ebdeb42b-f966-4685-8c21-1425256c3b42", + "metadata": {}, + "source": [ + "### Creating \"ghat\"\n", + "\n", + "By \"ghat\" or $\\hat{g}_t$, we mean the following:\n", + "\n", + "$$ \\hat{g}_t = \\begin{cases} \\frac{\\hat{y}_{t}}{\\hat{y}_{t-1}} - 1 & \\text{ if $\\hat{y}_{t-1} > 0$} \n", + "\\\\ 1 & \\text{ if $\\hat{y}_{t-1} = 0$ and $\\hat{y}_{t} > 0$}\n", + "\\\\ 0 & \\text{ otherwise}\\end{cases} $$\n", + "\n", + "In other words, it would be the previous year's growth rate (of GDPpc) with respect to the 5-year average of the previous year's 5 previous years. Since we do not have information for the year 1949, I will set $\\hat{g}_{1950} = \\hat{g}_{1951}$. Note that \"ghat\" will be actually used as a regressor in the historical regression." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cc81ee2-7590-4e10-b46e-e22b103c326e", + "metadata": {}, + "outputs": [], + "source": [ + "prev_yhat = gdppc[[\"yhat\"]].reset_index().rename(columns={\"yhat\": \"prev_yhat\"})\n", + "prev_yhat[\"year\"] = prev_yhat[\"year\"] + 1\n", + "prev_yhat.set_index([\"ccode\", \"year\"], inplace=True)\n", + "gdppc = gdppc.merge(prev_yhat, left_index=True, right_index=True, how=\"left\")\n", + "\n", + "gdppc[\"ghat\"] = gdppc[\"yhat\"] / gdppc[\"prev_yhat\"] - 1\n", + "gdppc.loc[(slice(None), 1950), \"ghat\"] = gdppc.loc[(slice(None), 1951), \"ghat\"].values" + ] + }, + { + "cell_type": "markdown", + "id": "d97df27d-31cf-416d-8b53-aeaae0f9ada5", + "metadata": {}, + "source": [ + "### Creating \"yhat rate\"\n", + "\n", + "By \"yhat rate\" or $\\hat{yr}_t$, I will mean the following:\n", + "\n", + "$$ \\hat{yr}_{i,t} = \\frac{\\hat{y}_{i,t}}{\\hat{y}_{US, t}}$$\n", + "\n", + "where $i$ is a country. Note that the \"yhat rate\" will be used as a regressor in the historical regression." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b81eb2ab-c411-4bee-b257-f32674ad1fef", + "metadata": {}, + "outputs": [], + "source": [ + "yhat_usa = gdppc.loc[(\"USA\", slice(None)), [\"yhat\"]].reset_index()\n", + "yhat_usa = (\n", + " yhat_usa.drop([\"ccode\"], axis=1)\n", + " .set_index([\"year\"])\n", + " .rename(columns={\"yhat\": \"yhat_us\"})\n", + ")\n", + "\n", + "gdppc = gdppc.merge(yhat_usa, left_index=True, right_index=True, how=\"left\")\n", + "gdppc[\"yhat_rate\"] = gdppc[\"yhat\"] / gdppc[\"yhat_us\"]" + ] + }, + { + "cell_type": "markdown", + "id": "65eb34fb-f955-4ca3-b88c-f799d33907aa", + "metadata": {}, + "source": [ + "### Organizing the dataframe for the historical regression and exporting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cc72259-451b-4ce9-9db7-06e6e2436705", + "metadata": {}, + "outputs": [], + "source": [ + "hist_reg_df = gdppc[[\"iy_ratio\", \"yhat_rate\", \"ghat\", \"gdp_source\"]].copy()\n", + "hist_reg_df[\"yhr_sq\"] = hist_reg_df[\"yhat_rate\"] ** 2\n", + "hist_reg_df[\"ghat_sq\"] = hist_reg_df[\"ghat\"] ** 2\n", + "\n", + "hist_reg_df = hist_reg_df.merge(\n", + " demovar_df, left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "for i in [\"D1\", \"D2\", \"D3\"]:\n", + " hist_reg_df[\"{}_x_ghat\".format(i)] = hist_reg_df[i] * hist_reg_df[\"ghat\"]\n", + "\n", + "hist_reg_df.to_parquet(sset.DIR_YPK_INT / \"hist_reg_prep.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "9b1dbf13-f2cc-4ba5-8dcd-067de4d1033d", + "metadata": {}, + "source": [ + "## Historical regression\n", + "\n", + "### Conducting the fit\n", + "\n", + "To summarize, the regression **with** demographic variables can be written as:\n", + "\n", + "$$ \\left(\\frac{I}{Y}\\right)_{c, t} = \\alpha_c + \\beta_1 \\hat{yhr}_{c, t} + \\beta_2 (\\hat{yhr}_{c, t})^2 + \\beta_3 \\hat{g}_{c, t} + \\beta_4 (\\hat{g}_{c, t})^2 + \\sum_{k=1}^3 \\left(\\gamma_k D_{k, c, t} + \\zeta_k [D_{k, c, t} \\times 
\\hat{g}_{c, t}]\\right) + \\varepsilon_{c, t}$$\n", + "\n", + "where the demographic terms are the ones involving $D_{k,c, t}$ and should be removed in the version where demographic variables are *not* used.\n", + "\n", + "A brief comparison of AIC, BIC, and adjusted $R^2$ shows that including the demographic variables gives the better model fit." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "456e81d9-dc61-46cb-8c63-d30469bcc1e6", + "metadata": {}, + "outputs": [], + "source": [ + "## DF to fit for the historical regression\n", + "hist_fit_df = hist_reg_df.loc[\n", + " ~pd.isnull(hist_reg_df[hist_reg_df.columns]).any(axis=1), :\n", + "].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73fc9b82-a8f8-4cd3-ad4f-c0ab1c8bceaf", + "metadata": {}, + "outputs": [], + "source": [ + "def ols_with_selected(\n", + " df_orig, lhs, fe=True, exclude_rhs=[\"gdp_source\", \"demo_var_source\"]\n", + "):\n", + " \"\"\"Running a OLS model with clustered standard errors (clustered at the country\n", + " level) and returning the fitting result.\n", + "\n", + " Parameters\n", + " ----------\n", + " df_orig : pandas DataFrame\n", + " containing information about the `lhs` variable as well as other variables to be\n", + " used as regressors in the OLS model. 
Should have `ccode` as one of the index\n", + " columns.\n", + " lhs : str\n", + " information in `df_orig` to be used as the regressand\n", + " fe : boolean\n", + " if True, creates and applies country-level fixed effects to the OLS; if False,\n", + " does not create the said fixed effects\n", + " exclude_rhs : array-like of str\n", + " contains column names in `df_orig` that should not be used as regressors\n", + "\n", + " Returns\n", + " -------\n", + " ols_results : statsmodels.regression.linear_model.RegressionResults\n", + " containing OLS results\n", + "\n", + " \"\"\"\n", + "\n", + " c = np.sort(df_orig.index.get_level_values(\"ccode\").unique())\n", + " rhs = [x for x in df_orig.columns if (x not in list(exclude_rhs) + [lhs])]\n", + " df = df_orig.reset_index()\n", + "\n", + " if fe:\n", + " df = pd.concat([df, pd.get_dummies(df[\"ccode\"])], axis=1)\n", + " ols_setup = sm.OLS(df[lhs], sm.add_constant(df[rhs + list(c[1:])]))\n", + " ols_results = ols_setup.fit(cov_kwds={\"groups\": df.ccode}, cov_type=\"cluster\")\n", + " else:\n", + " ols_setup = sm.OLS(df[lhs], sm.add_constant(df[rhs]))\n", + " ols_results = ols_setup.fit(cov_kwds={\"groups\": df.ccode}, cov_type=\"cluster\")\n", + "\n", + " return ols_results" + ] + }, + { + "cell_type": "markdown", + "id": "46ee6add-c3ff-4e93-ae12-a743cc979445", + "metadata": {}, + "source": [ + "Based on better adjusted $R^2$, AIC, and BIC values, we will stick with the regression with demographic variables included (as opposed to that without)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ef79fec-182e-4963-94f9-116d24d0853c", + "metadata": {}, + "outputs": [], + "source": [ + "histreg_demog = ols_with_selected(hist_fit_df, \"iy_ratio\")\n", + "demog_vars = [\n", + " x for x in hist_fit_df.columns if (\"D1\" in x) or (\"D2\" in x) or (\"D3\" in x)\n", + "]\n", + "demog_vars += [\"gdp_source\", \"demo_var_source\"]\n", + "histreg_nodemog = ols_with_selected(hist_fit_df, \"iy_ratio\", exclude_rhs=demog_vars)\n", + "\n", + "d_ar2 = round(histreg_demog.rsquared_adj, 4)\n", + "n_ar2 = round(histreg_nodemog.rsquared_adj, 4)\n", + "print(\"With demog. adj R2: {}, no demog. adj R2: {}\".format(d_ar2, n_ar2))\n", + "print(\"With demog. has better AIC:\", histreg_nodemog.aic > histreg_demog.aic)\n", + "print(\"With demog. has better BIC:\", histreg_nodemog.bic > histreg_demog.bic)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc9d1487-2e36-4a3c-a916-cad2ff7641e9", + "metadata": {}, + "outputs": [], + "source": [ + "res_cols = [\"ghat\", \"ghat_sq\", \"yhat_rate\", \"yhr_sq\", \"D1\", \"D2\", \"D3\"]\n", + "res_cols += [\"D1_x_ghat\", \"D2_x_ghat\", \"D3_x_ghat\"]\n", + "res_df = pd.DataFrame(histreg_demog.params, columns=[\"d_beta\"]).loc[res_cols, :]\n", + "d_bse, d_p = histreg_demog.bse, histreg_demog.pvalues\n", + "d_bse.name, d_p.name = \"d_se\", \"d_p\"\n", + "\n", + "nd_param = histreg_nodemog.params\n", + "nd_param.name = \"nd_beta\"\n", + "nd_bse, nd_p = histreg_nodemog.bse, histreg_nodemog.pvalues\n", + "nd_bse.name, nd_p.name = \"nd_se\", \"nd_p\"\n", + "\n", + "for l in [d_bse, d_p, nd_param, nd_bse, nd_p]:\n", + " res_df = res_df.join(l, how=\"left\")\n", + "\n", + "print(res_df)\n", + "\n", + "ctries = [x for x in d_p.index.values if (len(x) == 3) or (x not in res_cols)]\n", + "d_fes, nd_fes = d_p.loc[ctries].values, nd_p.loc[ctries].values\n", + "print()\n", + "print(\n", + " \"FEs significant (%), D:\",\n", + " round(len(d_fes[d_fes < 0.05]) / 
len(ctries), 4) * 100,\n", + " \"; ND:\",\n", + " round(len(nd_fes[nd_fes < 0.05]) / len(ctries), 4) * 100,\n", + ")\n", + "print()\n", + "print(histreg_nodemog.aic, histreg_demog.aic)\n", + "print(histreg_nodemog.bic, histreg_demog.bic)" + ] + }, + { + "cell_type": "markdown", + "id": "e3db9662-f4f2-45aa-bb87-e46c86af024e", + "metadata": {}, + "source": [ + "We will also estimate the model without fixed effects due to some countries having no I-Y information whatsoever (but we still need to estimate their information). Based on better adjusted $R^2$, AIC, and BIC values, we will again stick with the regression with demographic variables included. Unfortunately, the $R^2$ is not too high without the fixed effects involved." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f811708b-4614-4e11-b2d7-72f832729cf1", + "metadata": {}, + "outputs": [], + "source": [ + "histreg_demog_nofe = ols_with_selected(hist_fit_df, \"iy_ratio\", False)\n", + "histreg_nodemog_nofe = ols_with_selected(hist_fit_df, \"iy_ratio\", False, demog_vars)\n", + "\n", + "d_ar2 = round(histreg_demog_nofe.rsquared_adj, 4)\n", + "n_ar2 = round(histreg_nodemog_nofe.rsquared_adj, 4)\n", + "print(\"With demog. adj R2: {}, no demog. adj R2: {}\".format(d_ar2, n_ar2))\n", + "print(\"With demog. has better AIC:\", histreg_nodemog_nofe.aic > histreg_demog_nofe.aic)\n", + "print(\"With demog. 
has better BIC:\", histreg_nodemog_nofe.bic > histreg_demog_nofe.bic)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aedb551-c95d-443f-bf8d-af5570cf5409", + "metadata": {}, + "outputs": [], + "source": [ + "res_df2 = pd.DataFrame(histreg_demog_nofe.params, columns=[\"d_beta\"]).loc[res_cols, :]\n", + "d_bse, d_p = histreg_demog_nofe.bse, histreg_demog_nofe.pvalues\n", + "d_bse.name, d_p.name = \"d_se\", \"d_p\"\n", + "\n", + "nd_param = histreg_nodemog_nofe.params\n", + "nd_param.name = \"nd_beta\"\n", + "nd_bse, nd_p = histreg_nodemog_nofe.bse, histreg_nodemog_nofe.pvalues\n", + "nd_bse.name, nd_p.name = \"nd_se\", \"nd_p\"\n", + "\n", + "for l in [d_bse, d_p, nd_param, nd_bse, nd_p]:\n", + " res_df2 = res_df2.join(l, how=\"left\")\n", + "\n", + "print(res_df2)\n", + "print()\n", + "print(histreg_nodemog_nofe.aic, histreg_demog_nofe.aic)\n", + "print(histreg_nodemog_nofe.bic, histreg_demog_nofe.bic)" + ] + }, + { + "cell_type": "markdown", + "id": "9adec3a9-3c54-462b-ae50-689fd17c35d8", + "metadata": {}, + "source": [ + "### Projections for missing I-Y ratios\n", + "\n", + "#### Projections for those with partial I-Y information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4dadb3bd-daef-4173-8d33-5b02663f37ce", + "metadata": {}, + "outputs": [], + "source": [ + "yes_fe = np.sort(hist_fit_df.index.get_level_values(\"ccode\").unique())\n", + "yes_fe_fit = hist_reg_df.loc[(yes_fe, slice(None)), :].reset_index()\n", + "yes_fe_fit = pd.concat([yes_fe_fit, pd.get_dummies(yes_fe_fit.ccode)], axis=1)\n", + "yes_fe_fit[\"iy_ratio_pred\"] = histreg_demog.predict(\n", + " sm.add_constant(yes_fe_fit)[histreg_demog.params.index.values]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5209502c-5ea1-4675-9277-e3f743f8b7cf", + "metadata": {}, + "source": [ + "#### Projections for those with no I-Y information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"f326334a-9668-43cd-a22a-8e0b444fbdd1", + "metadata": {}, + "outputs": [], + "source": [ + "no_fe_fit = hist_reg_df.loc[\n", + " ~hist_reg_df.index.get_level_values(\"ccode\").isin(yes_fe), :\n", + "].reset_index()\n", + "no_fe_fit[\"iy_ratio_pred\"] = histreg_demog_nofe.predict(\n", + " sm.add_constant(no_fe_fit)[histreg_demog_nofe.params.index.values]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f1465492-f5a8-48f5-82af-bf65bcfb0aba", + "metadata": {}, + "source": [ + "#### Merging the two cases" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f17dbb06-7024-49a5-920b-295a7f4b02cb", + "metadata": {}, + "outputs": [], + "source": [ + "cols = [\"ccode\", \"year\", \"iy_ratio\", \"iy_ratio_pred\"]\n", + "fitted_iy = pd.concat([yes_fe_fit[cols], no_fe_fit[cols]], axis=0).set_index(cols[0:2])\n", + "fitted_iy[\"iy_ratio_fit\"] = fitted_iy[\"iy_ratio\"]\n", + "fitted_iy.loc[pd.isnull(fitted_iy.iy_ratio), \"iy_ratio_fit\"] = fitted_iy.loc[\n", + " pd.isnull(fitted_iy.iy_ratio), \"iy_ratio_pred\"\n", + "].values" + ] + }, + { + "cell_type": "markdown", + "id": "594ff4c5-5d0b-4775-b287-0a7df19db6be", + "metadata": {}, + "source": [ + "#### Some diagnostics (graphing, checking whether any values are above 1 or below 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cbd71a8-080e-4eef-9184-6962d0d0aafb", + "metadata": {}, + "outputs": [], + "source": [ + "def graph_trajectory(ctry, df=fitted_iy):\n", + " \"\"\"Creates a simple graph drawing the actual investment-to-GDP ratio in the data\n", + " and the fitted (or predicted) investment-to-GDP ratio using the OLS model, for a\n", + " single country specified by `ctry`\n", + "\n", + " Parameters\n", + " ----------\n", + " ctry : str\n", + " name of the country\n", + " df : pandas DataFrame\n", + " DataFrame containing the variables `iy_ratio` (for the actual investment-to-GDP\n", + " ratios) and `iy_ratio_pred` (for the predicted investment-to-GDP ratios)\n", + " 
with indices `ccode` (for country-codes) and `year`, in that order\n", + "\n", + " Returns\n", + " -------\n", + " None, but produces the aforementioned graphs for `ctry`\n", + "\n", + " \"\"\"\n", + " years = np.sort(df.index.get_level_values(\"year\").unique())\n", + " actual = df.loc[(ctry, slice(None)), \"iy_ratio\"].values\n", + " fitted = df.loc[(ctry, slice(None)), \"iy_ratio_pred\"].values\n", + "\n", + " plt.figure(figsize=(9, 6))\n", + " if not pd.isnull(actual).all():\n", + " plt.plot(years, actual, label=\"Actual I-Y\", color=\"black\")\n", + " plt.plot(years, fitted, label=\"Fitted I-Y\", color=\"orange\")\n", + " plt.xlabel(\"Year\")\n", + " plt.ylabel(\"Investment-to-GDP ratio\")\n", + " plt.title(\"Investment-to-GDP ratio for {}\".format(ctry))\n", + " plt.legend()\n", + " plt.show()\n", + "\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b931dbf-7758-4b10-91b1-2ee7aaa36435", + "metadata": {}, + "outputs": [], + "source": [ + "graph_trajectory(\"ARG\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bf582af-629b-465e-a63d-260a7d1e4522", + "metadata": {}, + "outputs": [], + "source": [ + "graph_trajectory(\"USA\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efb40f26-58bd-4d46-9f9f-138e7f161e6b", + "metadata": {}, + "outputs": [], + "source": [ + "## not-so-good case with Libya\n", + "graph_trajectory(\"LBY\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27af47ff-8d0f-4f9b-938b-effac035dcbf", + "metadata": {}, + "outputs": [], + "source": [ + "## not-so-good case with China\n", + "graph_trajectory(\"CHN\")" + ] + }, + { + "cell_type": "markdown", + "id": "0b2b0105-ec81-4ca7-82f1-a43a5ba81c78", + "metadata": {}, + "source": [ + "We can also see that there are some cases where the I-Y ratios are lesser than 0 or larger than 1; however, these are all from the original sources (mostly from PWT10.0's `csh_i` variable, except for 
`LBY` where the values are from IMF). Therefore, we will keep things as is and accept these values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a23feb54-3a99-4fb5-8c67-864a4e3c0aff", + "metadata": {}, + "outputs": [], + "source": [ + "fitted_iy.loc[(fitted_iy.iy_ratio_fit < 0) | (fitted_iy.iy_ratio_fit > 1)]" + ] + }, + { + "cell_type": "markdown", + "id": "ffbb21fc-509a-421e-b8c9-cefd8e27e89f", + "metadata": {}, + "source": [ + "## Updating the predicted I-Y ratios and re-exporting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe99bc37-ea40-4dda-97fb-7432e4e373ed", + "metadata": {}, + "outputs": [], + "source": [ + "iyinfo = pd.read_parquet(\n", + " sset.DIR_YPK_INT / \"gdp_gdppc_pop_capital_1950_2020_post_ypk3.parquet\"\n", + ")\n", + "iyinfo = iyinfo.merge(\n", + " fitted_iy[[\"iy_ratio_fit\"]],\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + ")\n", + "\n", + "iyinfo.loc[\n", + " ~pd.isnull(iyinfo.iy_ratio_fit) & pd.isnull(iyinfo.iy_ratio), \"iy_ratio_source\"\n", + "] = \"hist_reg_project\"\n", + "\n", + "iyinfo.loc[(\"SHN\", list(range(1950, 2014))), \"iy_ratio_source\"] = \"hist_reg_project_avg\"\n", + "iyinfo.to_parquet(\n", + " sset.DIR_YPK_INT / \"gdp_gdppc_pop_capital_1950_2020_post_ypk4.parquet\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk4_impute_hist_capital.ipynb 
b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk4_impute_hist_capital.ipynb new file mode 100644 index 0000000..bcfe038 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk4_impute_hist_capital.ipynb @@ -0,0 +1,1304 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1319c0da-f541-486b-bb36-b17b422b3551", + "metadata": { + "tags": [] + }, + "source": [ + "## Code for output of historical (1950-2020) country-level capital stock projections (and actual values whenever possible)\n", + "\n", + "Using the I-Y (investment-to-GDP) ratio projections and GDP projections in the previous notebooks, we project the capital stock values (at the country-level).\n", + "\n", + "## Setting\n", + "\n", + "### Importing necessary modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8317fc9e-85cb-4ba2-a70c-385af99ef86f", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "420b64af-3878-402a-91ae-84d17c6ee67c", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import statsmodels.api as sm\n", + "from sklearn.cluster import KMeans\n", + "from tqdm.auto import tqdm\n", + "\n", + "from sliiders import country_level_ypk as ypk_fn\n", + "from sliiders import settings as sset\n", + "\n", + "## variables header\n", + "v_ = [\"v_\" + str(x) for x in range(1950, 2020)]" + ] + }, + { + "cell_type": "markdown", + "id": "e82a028b-9a84-40bc-8245-a008be123d41", + "metadata": {}, + "source": [ + "## Getting the investment values\n", + "\n", + "The investment values can be found by multiplying the GDP values ($Y_{c, t}$) with investment-to-GDP ratios (i.e., I-Y ratios and denoted $\\left(\\frac{I}{Y}\\right)_{c, t}$). 
This is needed as we would like to project missing capital stock values in the years 1950-2020.\n", + "\n", + "We will use the current-PPP GDP series (`cgdpo`) together with the actual and predicted I-Y ratios, because current PPP values are what PWT's method uses to find the \"initial capital stock value\" (at 1950 or a later year that is as early as possible).\n", + "\n", + "### Preparations and creating investment values (current PPP, 2017 USD)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd1ab6ef-1e0a-48db-9669-94f63fd4fc97", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "## importing GDP (cgdpo, current PPP, 2017 USD) and I-Y ratio and create investment\n", + "histinfo = pd.read_parquet(\n", + " sset.DIR_YPK_INT / \"gdp_gdppc_pop_capital_1950_2020_post_ypk4.parquet\"\n", + ")\n", + "histinfo.loc[pd.isnull(histinfo.iy_ratio_fit), \"iy_ratio_fit\"] = 0\n", + "histinfo[\"curr_ppp_invest\"] = histinfo[\"cgdpo_17\"] * histinfo[\"iy_ratio_fit\"]" + ] + }, + { + "cell_type": "markdown", + "id": "baecbd82-925c-45e0-aa39-e400c910a473", + "metadata": {}, + "source": [ + "### Fetching the PPP conversion table for year-to-year conversion\n", + "\n", + "For the PWT method of finding the initial capital, what we want is to add year-$t$ current PPP investment (generated above) to the year-$t$ current PPP capital stock, take care of capital depreciation, and get the year-$t+1$ capital stock. However, the said year-$t+1$ value will be in year-$t$ PPP, so we need to get year-$t$-to-year-$t+1$ PPP conversion rates." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00755f23-899f-43e6-9aa2-4066e7b2d586", + "metadata": {}, + "outputs": [], + "source": [ + "## ppp table for capital stock\n", + "ppp_to_2017_K = ypk_fn.ppp_conversion_specific_year(2017, True, True, pwtvar=\"pl_n\")\n", + "ppp_to_2017_K.loc[pd.isnull(ppp_to_2017_K.conv), \"conv\"] = 1\n", + "\n", + "ppp_K_yr_to_yr = ppp_to_2017_K[[\"conv\"]].rename(columns={\"conv\": \"conv_curr_yr\"})\n", + "ppp_K_next_yr = ppp_to_2017_K[[\"conv\"]].rename(columns={\"conv\": \"conv_next_yr\"})\n", + "ppp_K_next_yr.reset_index(inplace=True)\n", + "ppp_K_next_yr[\"year\"] = ppp_K_next_yr[\"year\"] - 1\n", + "ppp_K_yr_to_yr = ppp_K_yr_to_yr.merge(\n", + " ppp_K_next_yr.set_index([\"ccode\", \"year\"]),\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + ")\n", + "ppp_K_yr_to_yr[\"conv\"] = ppp_K_yr_to_yr[\"conv_curr_yr\"] / ppp_K_yr_to_yr[\"conv_next_yr\"]\n", + "\n", + "## we don't have 2019-to-2020 rates, so we will assume that there is no PPP rate change\n", + "ppp_K_yr_to_yr.loc[(slice(None), 2019), \"conv\"] = 1" + ] + }, + { + "cell_type": "markdown", + "id": "d2cda573-4c56-4dc6-a3c2-2c1faf55bb4f", + "metadata": {}, + "source": [ + "## Projecting missing values of capital stock\n", + "\n", + "The overall methodology for projection of missing capital stock values can be summarized as follows:\n", + "1) For countries whose information exist only in LitPop, organize 2014 data and use (estimated) investment and depreciation rate values to project 2015-2019 data.\n", + "2) For countries whose information exist only in GEG-15, organize 2005 data and use (estimated) investment and depreciation rate values to project 2006-2019 data.\n", + "3) After Steps 1 and 2, 2014-2019 capital stock values will be available for all countries. 
Turn those capital stock values into current PPP terms (for PWT10.0 ones, just use `cn`) and calculate the ratios of current-PPP capital stock to current-PPP GDP (`cgdpo`, current PPP, 2017 USD in particular).\n", + "4) Use `k`-means clustering to make unsupervised classifications of countries based on the above-calculated capital-stock-to-GDP ratios.\n", + "\n", + "### Preparations (importing data, cleaning to current PPP, 2017 USD)\n", + "\n", + "We import the `cn` (current PPP, 2017 USD) and `rnna` (constant 2017 PPP USD) capital stock series from PWT. \n", + "\n", + "Also, we import LitPop data which is assumed to be in constant 2014 PPP USD (and are in ones of USD). LitPop's original source data for capital is from World Bank (link [here](https://datacatalog.worldbank.org/dataset/wealth-accounting)), which multiplies their values by 1.24 to also account for land values. We want only the capital values, so land values must be removed; LitPop already has taken care of this multiplier, so it can be used as is (can be confirmed by comparing numbers in the link above).\n", + "\n", + "Finally, we import GEG-15 data which is assumed to be in constant 2005 PPP USD (and are in millions of USD). This data also includes land values, so we divide the values by 1.24 to acquire only the capital stock values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "000ab181-d963-4d78-93aa-f1928e027da8", + "metadata": {}, + "outputs": [], + "source": [ + "# skip this cell if GEG has been already cleaned up at the country level\n", + "geg_coord = pd.read_parquet(sset.PATH_GEG15_INT).rename(columns={\"iso3\": \"ccode\"})\n", + "geg = geg_coord.groupby(\"ccode\")[\"tot_val\"].sum()\n", + "geg = pd.DataFrame(data={\"ccode\": geg.index, \"value\": geg.values})\n", + "\n", + "# country-level information\n", + "geg.to_parquet(sset.DIR_YPK_INT / \"geg-15_ctry_lv.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4235387-e433-4dac-aaa6-7c99b17772fc", + "metadata": {}, + "outputs": [], + "source": [ + "# PWT10.0\n", + "pwt100 = (\n", + " pd.read_excel(sset.PATH_PWT_RAW)\n", + " .rename(columns={\"countrycode\": \"ccode\"})\n", + " .set_index([\"ccode\", \"year\"])\n", + ")\n", + "capdata = histinfo[[\"cgdpo_17\", \"rgdpna_17\", \"curr_ppp_invest\", \"delta\"]].merge(\n", + " pwt100[[\"cn\", \"rnna\"]],\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + ")\n", + "\n", + "# for litpop and geg-15, we retain current PPP but adjust from current USD to\n", + "# constant USD.\n", + "\n", + "# litpop\n", + "litpop_meta = pd.read_csv(sset.DIR_LITPOP_RAW / \"_metadata_countries_v1_2.csv\").rename(\n", + " columns={\"iso3\": \"ccode\", \"total_value [USD]\": \"litpop_cn\"}\n", + ")\n", + "litpop_meta = litpop_meta[~pd.isnull(litpop_meta.litpop_cn)]\n", + "litpop_meta[\"year\"] = 2014\n", + "usd_14_to_17 = pwt100.loc[(\"USA\", 2017), \"pl_n\"] / pwt100.loc[(\"USA\", 2014), \"pl_n\"]\n", + "litpop_meta[\"litpop_cn\"] = litpop_meta[\"litpop_cn\"] / 1000000 * usd_14_to_17\n", + "litpop_meta.set_index([\"ccode\", \"year\"], inplace=True)\n", + "\n", + "### geg-15\n", + "ctry_lv_geg = pd.read_parquet(sset.DIR_YPK_INT / \"geg-15_ctry_lv.parquet\").reset_index()\n", + "ctry_lv_geg[\"year\"] = 2005\n", + "ctry_lv_geg = 
ctry_lv_geg.astype({\"value\": \"float64\"}).set_index([\"ccode\", \"year\"])\n", + "usd_05_to_17 = pwt100.loc[(\"USA\", 2017), \"pl_n\"] / pwt100.loc[(\"USA\", 2005), \"pl_n\"]\n", + "ctry_lv_geg[\"value\"] = ctry_lv_geg[\"value\"] / 1.24 * usd_05_to_17\n", + "ctry_lv_geg.rename(columns={\"value\": \"geg_cn\"}, inplace=True)\n", + "\n", + "## merging all\n", + "capdata = capdata.merge(ctry_lv_geg, left_index=True, right_index=True, how=\"left\")\n", + "capdata = capdata.merge(\n", + " litpop_meta[[\"litpop_cn\"]], left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "\n", + "## we also merge the year-to-year PPP conversion rates\n", + "capdata = capdata.merge(\n", + " ppp_K_yr_to_yr[[\"conv\"]], left_index=True, right_index=True, how=\"left\"\n", + ").drop([\"index\"], axis=1)\n", + "capdata.loc[pd.isnull(capdata.conv), \"conv\"] = 1" + ] + }, + { + "cell_type": "markdown", + "id": "8eeae094-d0d7-49b7-8ee5-433796862f75", + "metadata": {}, + "source": [ + "Let us fill in the capital values of uninhabited areas to be 0." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a56796f-ee84-4f67-81ac-f28bcf011162", + "metadata": {}, + "outputs": [], + "source": [ + "## uninhabited areas\n", + "for i in sset.UNINHABITED_ISOS:\n", + " capdata.loc[i, [\"cn\"]] = 0\n", + " capdata.loc[i, [\"rnna\"]] = 0" + ] + }, + { + "cell_type": "markdown", + "id": "b5d8656a-c466-4edc-887a-a9e7f14026ac", + "metadata": {}, + "source": [ + "### 2014-2020 projection for LitPop values, 2005-2020 projection for GEG-15 values, and 2020 projection for PWT10.0 (all in current PPP)\n", + "\n", + "#### Log-linear interpolation for LitPop and GEG-15\n", + "\n", + "In the case where 2014 value exists in LitPop and 2005 value exists in GEG-15, we will not try to extrapolate the 2005-2014 values via perpetual inventory method (PIM) but rather by log-linear interpolation. Note that this will only be done for countries *not* having PWT10.0 capital stock information." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85e4b9de-3858-42ff-b704-09340228e7e1", + "metadata": {}, + "outputs": [], + "source": [ + "## getting the relevant ccodes\n", + "ccodes = capdata.index.get_level_values(\"ccode\").unique()\n", + "pwt_cc = capdata.loc[~pd.isnull(capdata.cn), :].index.get_level_values(\"ccode\").unique()\n", + "lp_cc = (\n", + " capdata.loc[~pd.isnull(capdata.litpop_cn), :]\n", + " .index.get_level_values(\"ccode\")\n", + " .unique()\n", + ")\n", + "geg_cc = (\n", + " capdata.loc[~pd.isnull(capdata.geg_cn), :].index.get_level_values(\"ccode\").unique()\n", + ")\n", + "lp_cc = np.setdiff1d(lp_cc, pwt_cc)\n", + "geg_cc = np.setdiff1d(geg_cc, pwt_cc)" + ] + }, + { + "cell_type": "markdown", + "id": "f4b7d077-6e17-4a2b-8d2f-06406cc1ead3", + "metadata": {}, + "source": [ + "We notice, however, that there are some additional *inhabited* countries or regions that are absolutely missing all (1950-2020) capital stock information. In this case, we follow LitPop and assume that the 2014 value of capital stock for these countries is **1.247240** times the 2014 value of GDP (`cgdpo_17`, in this case). We will include these in the column `litpop_cn`. We will include these in the set of LitPop countries for now." 
def capital_perp_inven(
    currK_var="litpop_cn",
    currI_var="curr_ppp_invest",
    depre_var="delta",
    ppp_conv_var="conv",
    begin_end=(2014, 2020),
    df=None,
):
    """Project capital stock forward with the perpetual inventory method (PIM).

    Starting from the capital stock values in `currK_var`, each year's value is
    rolled forward as
    ``K[t+1] = (K[t] + I[t]) * (1 - delta[t]) * conv[t]``
    using investment in `currI_var`, depreciation rates in `depre_var` and
    year-to-next-year PPP conversion rates in `ppp_conv_var`. Every step is carried
    out in current-PPP terms.

    Parameters
    ----------
    currK_var : str
        variable name in `df` to contain known current-PPP capital stock values
    currI_var : str
        variable name in `df` to contain current-PPP investment values
    depre_var : str
        variable name in `df` to contain depreciation rate values
    ppp_conv_var : str
        variable name in `df` to contain the year-to-next-year conversion rates in PPP
    begin_end : array-like of int
        contains the year to begin the perpetual inventory method on and to end the said
        method on; only the first and the last elements are used
    df : pandas DataFrame or None
        containing the necessary variables (`currK_var`, `currI_var`, `depre_var`, and
        `ppp_conv_var`) with index levels named `ccode` and `year`. If None, the
        module-level `capdata` is looked up at call time.

    Returns
    -------
    pandas DataFrame
        copy of `df` with the additional column ``currK_var + "_proj"``. Years up to
        and including ``begin_end[0]`` carry the values of `currK_var`; every year
        after that, up to ``begin_end[-1]``, is overwritten with the PIM projection
        (NaN when the previous year's inputs are missing). The input `df` is not
        modified.

    """
    if df is None:
        # Resolve the notebook-level frame at call time rather than freezing
        # whatever object `capdata` pointed to when the function was defined.
        df = capdata

    first_yr, last_yr = begin_end[0], begin_end[-1]
    newvar = currK_var + "_proj"

    # Work on a copy so that the caller's frame does not silently gain a column.
    out = df.copy()
    out[newvar] = out[currK_var].values

    ccodes = out.index.get_level_values("ccode")
    years = out.index.get_level_values("year")
    pim_inputs = [newvar, currI_var, depre_var, ppp_conv_var]

    for yr in range(first_yr, last_yr):
        is_cur = years == yr
        is_nxt = years == yr + 1

        cur = out.loc[is_cur, pim_inputs]
        next_k = (
            (cur[newvar] + cur[currI_var])
            * (1 - cur[depre_var])
            * cur[ppp_conv_var]
        )

        # Align this year's result to next year's rows by country code; countries
        # without a row for `yr` receive NaN, as a left merge would give.
        next_k.index = ccodes[is_cur]
        out.loc[is_nxt, newvar] = next_k.reindex(ccodes[is_nxt]).to_numpy()

    return out
"capdata.loc[~pd.isnull(capdata.litpop_cn) & (capdata.cs == \"-\"), \"cs\"] = \"LitPop\"\n", + "capdata.loc[~pd.isnull(capdata.geg_cn) & (capdata.cs == \"-\"), \"cs\"] = \"GEG-15\"\n", + "\n", + "capdata.loc[\n", + " ~pd.isnull(capdata.litpop_geg_cn) & (capdata.cs == \"-\"), \"cs\"\n", + "] = \"LitPop_GEG-15_interp\"\n", + "capdata.loc[\n", + " ~pd.isnull(capdata.litpop_geg_cn_proj) & (capdata.cs == \"-\"),\n", + " \"cs\",\n", + "] = \"LitPop_perp_inven\"\n", + "capdata.loc[\n", + " ~pd.isnull(capdata.litpop_cn_proj) & (capdata.cs == \"-\"),\n", + " \"cs\",\n", + "] = \"LitPop_perp_inven\"\n", + "capdata.loc[\n", + " ~pd.isnull(capdata.geg_cn_proj) & (capdata.cs == \"-\"), \"cs\"\n", + "] = \"GEG-15_perp_inven\"\n", + "\n", + "capdata.loc[(no_k_cc, [2014]), \"cs\"] = \"mult_LitPop_ratio\"\n", + "capdata.loc[(no_k_cc, list(range(2015, 2021))), \"cs\"] = \"mult_LitPop_perp_inven\"" + ] + }, + { + "cell_type": "markdown", + "id": "48f3ba4b-9631-4b67-b3b3-2d518036b63d", + "metadata": { + "tags": [] + }, + "source": [ + "### Finding the initial capital stock (at the year 1950)\n", + "\n", + "#### Grouping the countries (via $k$-means) to find the optimal rate of change of capital intensity (capital to GDP ratio) and the range of initial capital\n", + "\n", + "The methodology implemented in PWT (as documented in [this PWT9.1 appendix](https://www.rug.nl/ggdc/docs/pwt91_capitalservices_ipmrevision.pdf)) to estimate the initial capital stock is as follows:\n", + "1. Set a lower bound and an upper bound of capital intensity at the initial available year ($t_0$), and multiple the year-$t_0$ value of current PPP GDP to get lower and upper bounds of year-$t_0$ capital stock estimates. In PWT9.1, the values of lower and upper bounds of year-$t_0$ capital intensities are 0.5 and 4.0.\n", + "2. Add on investment values and account for depreciation via the perpetual inventory method (PIM) and \"grow\" the upper and lower capital stocks.\n", + "3. 
Due to depreciation, there will be a year (call this $t^*$) at which the two (upper and lower) tracks of current-PPP capital become close; PWT9.1 sets the \"closeness\" as 10% (so upper-bound capital is less than 1.1 times lower-bound capital).\n", + "4. Calculate the upper and lower capital intensities at $t^*$, and calculate the simple mean of year-$t^*$ capital intensity (denote by $\\kappa_{t^*}$).\n", + "5. Decrease this year-$t^*$ capital intensity by the per-annum capital intensity growth rate (set as $g_\\kappa=0.02$), until it reaches the initial year. So the new initial capital intensity at $t_0$ is $\\kappa_{t_0} = \\kappa_{t^*} - g_\\kappa (t^* - t_0)$.\n", + "6. Multiply this value with the GDP at year-$t_0$ to acquire the year-$t_0$ capital stock value.\n", + "\n", + "While we will follow this methodology, the problem is that the capital intensity growth rate seems to vary a lot country-by-country. Further, the initial lower and upper bounds of capital intensity being 0.5 and 4.0 each does not seem to fit the PWT10.0 update. Therefore, what we will do is the following:\n", + "\n", + "1. Group the countries via $k$-means using their available capital intensities.\n", + "2. For each group, find the earliest-year (preferably 1950) upper and lower bounds of capital intensity.\n", + "3. Also for each group, find the per-annum capital intensity growth rates.\n", + "4. Apply the above-mentioned methodology for each group, using the updated lower / upper bounds of capital intensity at the initial year (1950) and capital intensity growth rates.\n", + "\n", + "For grouping, we try regular $k$-means with only the years that are available currently for all countries (2014-2020) or by filling in the missing pieces using the EM algorithm. The former, in terms of balanced classification, seems to work better, so we go with the regular $k$-means methodology with $k=3$. 
def kmeans_missing(X, n_clusters, max_iter=10, rand_state=60607):
    """Perform K-Means clustering on data with missing values.

    Missing (non-finite) entries are first filled with their column means. K-means
    is then fitted repeatedly; after each fit the missing entries are replaced by
    the matching coordinate of the centroid their row was assigned to. Iteration
    stops when the labels no longer change or after `max_iter` rounds.

    Parameters
    ----------
    X : array-like
        wide-format array (with each row being different countries) to conduct the
        EM algorithm and k-means clustering on; anything convertible to a 2-D float
        array (e.g., a pandas DataFrame) is accepted
    n_clusters : int
        number of clusters to form
    max_iter : int
        maximum number of EM iterations to perform; must be at least 1
    rand_state : int
        random state, for replicability

    Returns
    -------
    labels : array-like of int
        containing integer labels, based on the EM-augmented k-means algorithm, for each
        row in the array-like `X`
    centroids : array-like
        containing the centroid for each of the k-means label
    X_hat : array-like
        copy of `X` with the missing values filled in using the EM algorithm

    Raises
    ------
    ValueError
        if `max_iter` is smaller than 1

    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Coerce once to a float ndarray so the boolean-mask assignment below behaves
    # the same for DataFrame and ndarray inputs.
    X = np.asarray(X, dtype="float64")

    # Initialize missing values to their column means
    missing = ~np.isfinite(X)
    mu = np.nanmean(X, axis=0, keepdims=True)
    X_hat = np.where(missing, mu, X)

    prev_labels, prev_centroids = None, None
    for _ in range(max_iter):
        if prev_centroids is None:
            # first round: let KMeans choose its own initialization
            clus = KMeans(n_clusters, random_state=rand_state)
        else:
            # Warm-start from the previous centroids: faster, and labels are not
            # permuted between rounds (so convergence is easy to check), but it may
            # be more prone to getting stuck in local minima. Only one run is
            # meaningful with explicit initial centers, hence n_init=1.
            clus = KMeans(
                n_clusters, init=prev_centroids, n_init=1, random_state=rand_state
            )

        # perform clustering on the filled-in data
        labels = clus.fit_predict(X_hat)
        centroids = clus.cluster_centers_

        # fill in the missing values based on their cluster centroids
        X_hat[missing] = centroids[labels][missing]

        # when the labels have stopped changing then we have converged
        if prev_labels is not None and np.array_equal(labels, prev_labels):
            break

        prev_labels, prev_centroids = labels, centroids

    return labels, centroids, X_hat
"comp_ky_s_filled = [\"v_\" + str(X) for X in range(2014, 2021)]\n", + "cap_intensity[\"cl3\"] = cluster_3.fit(cap_intensity[comp_ky_s_filled].values).labels_\n", + "\n", + "## based on balanced classification, 3 seems to be the most optimal\n", + "## with EM algorithm as well\n", + "em_kmeans = kmeans_missing(cap_intensity[all_kys], 3)\n", + "cap_intensity[\"cl3_em\"] = em_kmeans[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2848acfb-dde1-4016-9f38-97c2d9a06700", + "metadata": {}, + "outputs": [], + "source": [ + "## trying to see the balancedness between regular k-means and EM-augmented version\n", + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))\n", + "ax1.hist(cap_intensity[\"cl3\"].astype(\"int64\"))\n", + "ax1.set_xticks([0, 1, 2])\n", + "\n", + "ax2.hist(cap_intensity[\"cl3_em\"].astype(\"int64\"))\n", + "ax2.set_xticks([0, 1, 2])\n", + "\n", + "ax1.set_ylim([0, 160]), ax2.set_ylim([0, 160])\n", + "ax1.set_yticks([0, 40, 80, 120, 160]), ax2.set_yticks([0, 40, 80, 120, 160])\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39c97da5-5b83-41fe-b6c5-805e48b16717", + "metadata": {}, + "outputs": [], + "source": [ + "# em grouping gives only 1 country assigned to the final group\n", + "cap_intensity.reset_index().groupby([\"cl3\"]).count()[[\"ccode\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2fb5792-53ab-4b42-8f40-5dfc38fd6c89", + "metadata": {}, + "outputs": [], + "source": [ + "# replication of Table 1 in Inklaar et al. 
def calculate_min_max_growthrate_by_group(
    df=None, group="cl3", ratio="cap_intensity"
):
    """By specified `group` designation, calculate the lower and upper bounds of the
    variable `ratio` contained in DataFrame `df`, as well as the said variable's average
    annual growth rate.

    Parameters
    ----------
    df : pandas DataFrame or None
        containing information about the `group` and `ratio`. Must carry an index
        level named `year`, as growth rate values are calculated yearly. If None,
        the module-level `capdata` is looked up at call time.
    group : str
        column name in `df` that represents the grouping (by k-means clustering or
        other methods)
    ratio : str
        column name in `df` that represents the variable for calculating the lower,
        upper bounds and annual growth rates

    Returns
    -------
    growth_rate_df : pandas DataFrame
        containing, by group, the information about lower bound of `ratio` (`ky_lower`,
        the 0.1 quantile of the non-missing values), upper bound of `ratio`
        (`ky_upper`, the 0.9 quantile), and growth rate per annum of `ratio`
        (`ky_growth`, the slope of a least-squares line of `ratio` on `year`). Also
        stores the grouping information in the variable `cl`. Rows with a missing
        group label are ignored; a group without usable observations gets NaN.

    """
    if df is None:
        # Resolve the notebook-level frame at call time rather than freezing
        # whatever object `capdata` pointed to when the function was defined.
        df = capdata

    columns = ["cl", "ky_lower", "ky_upper", "ky_growth"]
    rows = []
    # NaN labels never compare equal to themselves, so they would only produce an
    # empty (and crashing) group; drop them up front.
    for cl in np.sort(df[group].dropna().unique()):
        obs = df.loc[df[group] == cl, ratio]
        obs = obs[obs.notna()]
        values = obs.to_numpy(dtype="float64")
        years = obs.index.get_level_values("year").to_numpy(dtype="float64")

        if values.size == 0:
            rows.append([cl, np.nan, np.nan, np.nan])
            continue

        cl_lower, cl_upper = np.quantile(values, [0.1, 0.9])

        # The slope of a degree-1 least-squares fit equals the OLS coefficient on
        # `year` in a regression with a constant; it is undefined with one year.
        if np.unique(years).size < 2:
            cl_growth = np.nan
        else:
            cl_growth = np.polyfit(years, values, 1)[0]

        rows.append([cl, cl_lower, cl_upper, cl_growth])

    if not rows:
        return pd.DataFrame(columns=columns, dtype="float64")

    return pd.DataFrame(np.vstack(rows), columns=columns)
def find_init_k(
    df=None,
    begin_end=(1950, 2020),
    lb="ky_lower",
    ub="ky_upper",
    gr="ky_growth",
    currK_var="cn",
    currY_var="cgdpo_17",
    currI_var="curr_ppp_invest",
    depre_var="delta",
    ytoy_ppp="conv",
    cluster="cl3",
    ub_lb_thresh=0.1,
):
    """Finding the initial value of capital (at the year specified by `begin_end`)
    based on the methodology of PWT 9.1.

    Lower- and upper-bound capital tracks are started at ``begin_end[0]`` as
    GDP times `lb` / `ub` and rolled forward with the perpetual inventory method.
    For each country, `tstar` is the first year in which the upper track exceeds
    the lower one by less than `ub_lb_thresh` (or ``begin_end[-1]`` if that never
    happens). The mean capital intensity at `tstar` is walked back to the initial
    year with the growth rate `gr`, clipped to ``[lb, ub]``, and multiplied by the
    initial-year GDP.

    Parameters
    ----------
    df : pandas.DataFrame or None
        DataFrame to contain all necessary information (current PPP GDP, investment,
        depreciation rates, growth rate, lower bound and upper bound for the capital
        intensity), with index levels named `ccode` and `year`. If None, the
        module-level `capdata` is looked up at call time.
    begin_end : array-like of ints
        array-like containing two elements - initial year and the final year to be
        considered by the process
    lb : str
        column name in `df` to indicate the lower bound of capital intensity
    ub : str
        column name in `df` to indicate the upper bound of capital intensity
    gr : str
        column name in `df` to indicate the average yearly growth of capital intensity
    currK_var : str
        column name in `df` for current-PPP capital
    currY_var : str
        column name in `df` for current-PPP GDP
    currI_var : str
        column name in `df` for current-PPP investment
    depre_var : str
        column name in `df` for depreciation rate
    ytoy_ppp : str
        column name in `df` for year-to-next-year PPP conversion rate
    cluster : str
        column name in `df` for cluster (based on capital intensity values)
    ub_lb_thresh : float
        relative difference between upper- and lower-bound capital stock values
        below which the two tracks count as converged (defines year `tstar`)

    Returns
    -------
    estimated : pandas.DataFrame
        DataFrame with `ccode` (country code) as the index and a float column
        `cn_init_estim` containing initial-year capital stock estimations (based on
        the PWT 9.1 method); only contains countries that were actually missing the
        initial-year capital stock, and is empty when there are none

    """
    if df is None:
        # Resolve the notebook-level frame at call time rather than freezing
        # whatever object `capdata` pointed to when the function was defined.
        df = capdata

    t0, t_end = begin_end[0], begin_end[-1]

    cl_df = df[[cluster, lb, ub, gr, currY_var, currI_var, ytoy_ppp, depre_var]].copy()
    cl_df["low_k"], cl_df["high_k"] = np.nan, np.nan
    ccodes = cl_df.index.get_level_values("ccode")
    years = cl_df.index.get_level_values("year")

    ## setting the initial year's lower and upper bound capital
    ## (row-wise product/sum below skip NaN entries, as pandas does by default)
    is_t0 = years == t0
    cl_df.loc[is_t0, "low_k"] = (
        cl_df.loc[is_t0, [currY_var, lb]].product(axis=1).to_numpy()
    )
    cl_df.loc[is_t0, "high_k"] = (
        cl_df.loc[is_t0, [currY_var, ub]].product(axis=1).to_numpy()
    )

    ## growing both tracks: K[t+1] = (K[t] + I[t]) * (1 - delta[t]) * conv[t]
    for yr in range(t0, t_end):
        is_cur = years == yr
        is_nxt = years == yr + 1
        carry = (1 - cl_df.loc[is_cur, depre_var]) * cl_df.loc[is_cur, ytoy_ppp]
        for k_col in ("low_k", "high_k"):
            next_k = cl_df.loc[is_cur, [k_col, currI_var]].sum(axis=1) * carry
            # align on country code instead of relying on identical row order
            next_k.index = ccodes[is_cur]
            cl_df.loc[is_nxt, k_col] = next_k.reindex(ccodes[is_nxt]).to_numpy()

    ## finding t-star, the first year in which the gap between the high- and
    ## low-trajectories is smaller than the threshold set by `ub_lb_thresh`
    hi_lo_ratio = cl_df["high_k"] / cl_df["low_k"] - 1
    is_close = (hi_lo_ratio < ub_lb_thresh).to_numpy()
    tstar_by_cc = (
        pd.Series(years[is_close], index=ccodes[is_close]).groupby(level=0).min()
    )

    ## country-by-country calculation of initial capital for those missing them
    init_missing = df.loc[is_t0, currK_var].isna().to_numpy()
    msng_ccodes = ccodes[is_t0][init_missing].unique()

    estimates = {}
    for cc in msng_ccodes:
        ## if tstar is not acquired, get the latest year to be the tstar
        tstar = int(tstar_by_cc.get(cc, t_end))

        ## initial-year capital-to-GDP ratio: mean intensity at tstar, walked
        ## back to the initial year with the cluster's growth rate
        mean_k = cl_df.loc[(cc, tstar), ["high_k", "low_k"]].mean()
        init_ky = mean_k / cl_df.loc[(cc, tstar), currY_var] - (
            (tstar - t0) * cl_df.loc[(cc, t0), gr]
        )

        ## keep the ratio within the cluster's bounds
        lower = cl_df.loc[(cc, tstar), lb]
        upper = cl_df.loc[(cc, tstar), ub]
        if init_ky < lower:
            init_ky = lower
        elif init_ky > upper:
            init_ky = upper

        ## initial-year capital value
        estimates[cc] = init_ky * cl_df.loc[(cc, t0), currY_var]

    # Build the frame from a float Series: stacking [ccode, value] pairs in one
    # array would turn the estimates (including NaN) into strings.
    estimated = pd.DataFrame(
        {"cn_init_estim": pd.Series(estimates, dtype="float64")}
    )
    estimated.index.name = "ccode"

    return estimated
}, + { + "cell_type": "code", + "execution_count": null, + "id": "5912a76a-c9d0-437b-8e27-79a68494a3a3", + "metadata": {}, + "outputs": [], + "source": [ + "## estimating the \"cn_init_estim\" (missing initial-year capital)\n", + "cn_init_estim = find_init_k(capdata)\n", + "\n", + "## merging with the rest\n", + "capdata = capdata.merge(cn_init_estim, left_index=True, right_index=True, how=\"left\")\n", + "capdata.loc[\n", + " (capdata.cs == \"-\")\n", + " & (~pd.isnull(capdata.cn_init_estim))\n", + " & (capdata.index.get_level_values(\"year\") == 1950),\n", + " \"cs\",\n", + "] = \"init_K_estim\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edc9a75d-a2aa-4ab9-9dd0-3af69deb1b98", + "metadata": {}, + "outputs": [], + "source": [ + "## interpolating the rest, and filling the said values to cn_extrap\n", + "msng_ccodes = (\n", + " capdata.loc[(~pd.isnull(capdata.cn_init_estim)), :]\n", + " .index.get_level_values(\"ccode\")\n", + " .unique()\n", + ")\n", + "capdata[\"cn_init_estim\"] = capdata[\"cn_init_estim\"].astype(\"float64\")\n", + "capdata[\"cn_extrap\"] = capdata[\"cn_extrap\"].astype(\"float64\")\n", + "for i in msng_ccodes:\n", + " ## initial capital that was estimated\n", + " init_K = capdata.loc[(i, 1950), \"cn_init_estim\"]\n", + "\n", + " filled_K = capdata.loc[\n", + " (capdata.index.get_level_values(\"ccode\") == i)\n", + " & (~pd.isnull(capdata.cn_extrap)),\n", + " [\"cn_extrap\"],\n", + " ]\n", + " filled_yr_min = filled_K.index.get_level_values(\"year\").min()\n", + " filled_yr_min_K = capdata.loc[(i, filled_yr_min), \"cn_extrap\"]\n", + "\n", + " interp_K = np.interp(\n", + " range(1950, filled_yr_min + 1),\n", + " [1950, filled_yr_min],\n", + " np.log([init_K, filled_yr_min_K]),\n", + " )\n", + " interp_K = np.exp(interp_K)\n", + " i_yrs = list(range(1950, filled_yr_min + 1))\n", + " capdata.loc[(i, i_yrs), \"cn_extrap\"] = interp_K\n", + " capdata.loc[(i, i_yrs[1:-1]), \"cs\"] = \"init_K_estim_interp\"\n", + "\n", + 
"## filling in information for ratio-extrapolated\n", + "capdata.loc[(no_k_cc, 2014), \"cs\"] = \"LitPop_ratio_extrap\"\n", + "\n", + "capdata = pd.concat([capdata, uninh_capdata], axis=0).sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "b6f3711e-a0ce-49b3-8a0c-c896fd20251e", + "metadata": {}, + "source": [ + "We will merge the acquired result for current-PPP capital stock (and their sources) with the other historical data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2e8b379-9a7d-41c6-9f3e-b212e5c036ba", + "metadata": {}, + "outputs": [], + "source": [ + "histinfo = histinfo.merge(\n", + " capdata[[\"cn_extrap\", \"cs\"]].rename(columns={\"cs\": \"capital_source\"}),\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3e6d8f66-e11c-408c-9e79-deb8d9ba0972", + "metadata": {}, + "source": [ + "## Filling in the missing `rnna` values, generating current PPP, 2019 USD capital values (`cn_19`) and constant 2019 PPP USD capital values (`rnna_19`)\n", + "\n", + "### For the missing `rnna` values (current PPP, 2017 USD)\n", + "\n", + "For these ones, we need to make sure that $rnna_{c, 2017} = cn_{c, 2017}$ for any country $c$. For the countries whose `rnna` information is missing entirely, we will use the (extrapolated) conversion rates to turn the `cgdpo` to `rnna` values. But for the countries whose `rnna` information does exist partially, we first apply the conversion rates, get `rnna` equivalents, get the growth rates of `rnna`-equivalents for the missing years, and apply them to the pre-existing `rnna` values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acc538be-0b1b-4c8a-ad81-a97bf5bff40e", + "metadata": {}, + "outputs": [], + "source": [ + "## conversion rates (PPP) attached (from current to 2017 PPP)\n", + "histinfo = histinfo.merge(\n", + " ppp_to_2017_K[[\"conv\"]].rename(columns={\"conv\": \"curr_to_cnst\"}),\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + ")\n", + "histinfo.loc[(slice(None), 2020), \"curr_to_cnst\"] = histinfo.loc[\n", + " (slice(None), 2019), \"curr_to_cnst\"\n", + "].values\n", + "histinfo.loc[pd.isnull(histinfo.curr_to_cnst), \"curr_to_cnst\"] = 1\n", + "\n", + "## creating `rnna equivalents`\n", + "histinfo[\"rnna_equiv\"] = histinfo[\"cn_extrap\"] * histinfo[\"curr_to_cnst\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "750e3be2-7035-4d5c-b24a-7120c80b02c5", + "metadata": {}, + "outputs": [], + "source": [ + "## merging the actual rnna values from PWT10.0, and detecting which are missing\n", + "## rnna values completely\n", + "histinfo = histinfo.merge(\n", + " pwt100[[\"rnna\"]], left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "\n", + "## detecting those that have some rnna information vs. 
don't\n", + "count_rnna = histinfo.reset_index().groupby(\"ccode\").count()[[\"rnna\"]]\n", + "no_rnna = count_rnna.loc[count_rnna.rnna == 0, :].index.values\n", + "some_rnna = count_rnna.loc[count_rnna.rnna > 0, :].index.values\n", + "\n", + "## filling in the information for those that absolutely do not have rnna information\n", + "histinfo[\"rnna_extrap\"] = np.nan\n", + "histinfo.loc[(no_rnna, slice(None)), \"rnna_extrap\"] = histinfo.loc[\n", + " (no_rnna, slice(None)), \"rnna_equiv\"\n", + "].values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "759dfd1b-35c7-401e-9141-a1ceb9eb7b41", + "metadata": {}, + "outputs": [], + "source": [ + "## for the partially-filled countries, fill in by using growth rates\n", + "for cc in tqdm(some_rnna):\n", + " nona_yrs = histinfo.loc[\n", + " (histinfo.index.get_level_values(\"ccode\") == cc) & (~pd.isnull(histinfo.rnna)),\n", + " :,\n", + " ]\n", + " nona_yrs = nona_yrs.index.get_level_values(\"year\")\n", + " nona_maxyr, nona_minyr = nona_yrs.max(), nona_yrs.min()\n", + "\n", + " ## copying information into the rnna_extrap column\n", + " histinfo.loc[(cc, nona_yrs), \"rnna_extrap\"] = histinfo.loc[\n", + " (cc, nona_yrs), \"rnna\"\n", + " ].values\n", + "\n", + " ## using growth rates for extrapolation\n", + " rnna_1950, rnna_2020 = histinfo.loc[(cc, [1950, 2020]), \"rnna\"].values\n", + " if pd.isnull(rnna_1950):\n", + " fill_yrs = list(range(1950, nona_minyr + 1))\n", + " equiv = histinfo.loc[(cc, fill_yrs), \"rnna_equiv\"].values\n", + " actual_extrap = (equiv / equiv[-1]) * histinfo.loc[(cc, nona_minyr), \"rnna\"]\n", + " histinfo.loc[(cc, fill_yrs), \"rnna_extrap\"] = actual_extrap\n", + "\n", + " if pd.isnull(rnna_2020):\n", + " fill_yrs = list(range(nona_maxyr, 2021))\n", + " equiv = histinfo.loc[(cc, fill_yrs), \"rnna_equiv\"].values\n", + " actual_extrap = (equiv / equiv[0]) * histinfo.loc[(cc, nona_maxyr), \"rnna\"]\n", + " histinfo.loc[(cc, fill_yrs), \"rnna_extrap\"] = actual_extrap" + 
] + }, + { + "cell_type": "markdown", + "id": "c06d4c89-9483-4efe-8484-4c9f7c7a597a", + "metadata": {}, + "source": [ + "### Creating `cn_19` and `rnna_19`\n", + "\n", + "Again, for these, it must be that $cn\\_19_{c, 2019} = rnna\\_19_{c, 2019}$ for all countries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6643385e-48c8-4f7d-91fc-e425429999ad", + "metadata": {}, + "outputs": [], + "source": [ + "## cn_19 is created simply by changing from USD of 2017 to USD of 2019\n", + "usd_17_19 = pwt100.loc[(\"USA\", 2019), \"pl_n\"] / pwt100.loc[(\"USA\", 2017), \"pl_n\"]\n", + "histinfo[\"cn_19\"] = histinfo[\"cn_extrap\"] * usd_17_19\n", + "\n", + "## creating rnna_19; first creating scale factors with 2019 values being 1\n", + "rnna_17_2019_vals = (\n", + " histinfo.loc[(slice(None), 2019), [\"rnna_extrap\"]]\n", + " .reset_index()\n", + " .drop([\"year\"], axis=1)\n", + " .set_index([\"ccode\"])\n", + " .rename(columns={\"rnna_extrap\": \"rnna_2019_vals\"})\n", + ")\n", + "histinfo = histinfo.merge(\n", + " rnna_17_2019_vals, left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "histinfo[\"rnna_2019_scale\"] = histinfo[\"rnna_extrap\"] / histinfo[\"rnna_2019_vals\"]\n", + "\n", + "## multiplying the cn_19 values of 2019\n", + "cn_19_2019_vals = (\n", + " histinfo.loc[(slice(None), 2019), [\"cn_19\"]]\n", + " .reset_index()\n", + " .drop([\"year\"], axis=1)\n", + " .set_index([\"ccode\"])\n", + " .rename(columns={\"cn_19\": \"cn_19_2019\"})\n", + ")\n", + "histinfo = histinfo.merge(\n", + " cn_19_2019_vals, left_index=True, right_index=True, how=\"left\"\n", + ")\n", + "histinfo[\"rnna_19\"] = histinfo[\"rnna_2019_scale\"] * histinfo[\"cn_19_2019\"]" + ] + }, + { + "cell_type": "markdown", + "id": "552db7d5-3e75-46f2-88e5-587bfb2ded18", + "metadata": {}, + "source": [ + "## Creating capital and population scales, organizing the variable names, and exporting\n", + "\n", + "### Creating capital scale (with respect to `cn_19` of 2019) 
and population scale (with respect to `pop` of 2019)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7938bf65-53cf-45cb-9cc6-c82b3f9ff3aa", + "metadata": {}, + "outputs": [], + "source": [ + "## pop scale: each year's population relative to the same country's 2019 population\n", + "pop2019 = (\n", + " histinfo.loc[(slice(None), 2019), [\"pop\"]]\n", + " .reset_index()\n", + " .drop([\"year\"], axis=1)\n", + " .set_index([\"ccode\"])\n", + " .rename(columns={\"pop\": \"pop_2019\"})\n", + ")\n", + "histinfo = histinfo.merge(pop2019, left_index=True, right_index=True, how=\"left\")\n", + "histinfo[\"pop_scale\"] = histinfo[\"pop\"] / histinfo[\"pop_2019\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca274a42-929b-4e16-8919-6de744263559", + "metadata": {}, + "outputs": [], + "source": [ + "## capital scale: `rnna_19` and `cn_19` relative to the same country's 2019 `cn_19` value\n", + "cn2019 = (\n", + " histinfo.loc[(slice(None), 2019), [\"cn_19\"]]\n", + " .reset_index()\n", + " .drop([\"year\"], axis=1)\n", + " .set_index([\"ccode\"])\n", + " .rename(columns={\"cn_19\": \"cn_2019\"})\n", + ")\n", + "histinfo = histinfo.merge(cn2019, left_index=True, right_index=True, how=\"left\")\n", + "histinfo[\"rnna_19_scale\"] = histinfo[\"rnna_19\"] / histinfo[\"cn_2019\"]\n", + "histinfo[\"cn_19_scale\"] = histinfo[\"cn_19\"] / histinfo[\"cn_2019\"]" + ] + }, + { + "cell_type": "markdown", + "id": "b945ef5d-a024-4f29-856b-704a7f1d12f1", + "metadata": {}, + "source": [ + "### Variable name cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1074ed0c-efbe-45ae-915b-62e8942caf82", + "metadata": {}, + "outputs": [], + "source": [ + "histinfo_columns = [\n", + " \"pop_unit\",\n", + " \"gdppc_unit\",\n", + " \"gdp_capital_unit\",\n", + " \"pop_source\",\n", + " \"gdp_source\",\n", + " \"iy_ratio_source\",\n", + " \"k_ratio_source\",\n", + " \"delta_source\",\n", + " \"capital_source\",\n", + " \"pop\",\n", + " \"pop_scale\",\n", + " \"rgdpna_pc_17\",\n", + " \"rgdpna_17\",\n", + " \"rgdpna_pc_19\",\n", + " \"rgdpna_19\",\n", + " 
\"cgdpo_pc_17\",\n", + " \"cgdpo_17\",\n", + " \"cgdpo_pc_19\",\n", + " \"cgdpo_19\",\n", + " \"iy_ratio\",\n", + " \"iy_ratio_fit\",\n", + " \"k_movable_ratio\",\n", + " \"k_struc_ratio\",\n", + " \"k_mach_ratio\",\n", + " \"k_traeq_ratio\",\n", + " \"k_other_ratio\",\n", + " \"delta\",\n", + " \"rnna_17\",\n", + " \"rnna_19\",\n", + " \"rnna_19_scale\",\n", + " \"cn_17\",\n", + " \"cn_19\",\n", + " \"cn_19_scale\",\n", + "]\n", + "histinfo_final = histinfo.copy()\n", + "histinfo_final.rename(\n", + " columns={\n", + " \"rnna_extrap\": \"rnna_17\",\n", + " \"cn_extrap\": \"cn_17\",\n", + " \"gdp_unit\": \"gdp_capital_unit\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "\n", + "## filling in the nan's with 0s\n", + "fill0 = [\n", + " \"rnna_17\",\n", + " \"rnna_19\",\n", + " \"rnna_19_scale\",\n", + " \"cn_17\",\n", + " \"cn_19\",\n", + " \"cn_19_scale\",\n", + " \"pop_scale\",\n", + "]\n", + "for i in fill0:\n", + " histinfo_final.loc[pd.isnull(histinfo_final[i]), i] = 0\n", + "\n", + "histinfo_final = histinfo_final[histinfo_columns].copy()" + ] + }, + { + "cell_type": "markdown", + "id": "e9174477-b7d0-4553-b33c-a4cabb0321ff", + "metadata": {}, + "source": [ + "### Exporting the data" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "ae83a2c3-fc71-497b-b676-3bdca885a3f8", + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(sset.DIR_YPK_FINAL, exist_ok=True)\n", + "histinfo_final.to_parquet(\n", + " sset.DIR_YPK_FINAL / \"gdp_gdppc_pop_capital_1950_2020.parquet\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + 
"state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk5_projected_yp.ipynb b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk5_projected_yp.ipynb new file mode 100644 index 0000000..d67acde --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk5_projected_yp.ipynb @@ -0,0 +1,745 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "abb30260", + "metadata": {}, + "source": [ + "## Clean up and impute missing projected (2010-2100) GDPpc, GDP, and population values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8be17532", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a56987e", + "metadata": {}, + "outputs": [], + "source": [ + "import dask.dataframe as ddf\n", + "import geopandas as gpd\n", + "import numpy as np\n", + "import pandas as pd\n", + "from dask_gateway import Gateway\n", + "from shapely.geometry import MultiPolygon, Point, Polygon\n", + "from tqdm.auto import tqdm\n", + "\n", + "from sliiders.spatial import iso_poly_box_getter\n", + "from sliiders import country_level_ypk as ypk_fn\n", + "from sliiders import settings as sset\n", + "\n", + "# dask gateway setup\n", + "gateway = Gateway()\n", + "image_name = sset.DASK_IMAGE" + ] + }, + { + "cell_type": "markdown", + "id": "1cececc9", + "metadata": {}, + "source": [ + "## Importing and cleaning SSP-IAM projections\n", + "\n", + "### Raw data re-formatting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a94cef05", + "metadata": {}, + "outputs": [], + "source": [ + "iiasa_raw_df = pd.read_csv(\n", + " sset.DIR_IIASA_PROJECTIONS / \"SspDb_country_data_2013-06-12.csv\"\n", + ")\n", + "iiasa_pop = iiasa_raw_df.loc[iiasa_raw_df.VARIABLE == \"Population\", :].sort_values(\n", + " [\"SCENARIO\", 
\"MODEL\", \"REGION\"]\n", + ")\n", + "iiasa_gdp = iiasa_raw_df.loc[iiasa_raw_df.VARIABLE == \"GDP|PPP\", :].sort_values(\n", + " [\"SCENARIO\", \"MODEL\", \"REGION\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cf013e35", + "metadata": {}, + "source": [ + "### Population\n", + "\n", + "We will only take IIASA projections, with the exception of countries whose information are in OECD projections but not in IIASA.\n", + "\n", + "#### Basic cleaning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1736cc0f", + "metadata": {}, + "outputs": [], + "source": [ + "# Cleaning the projections\n", + "ii_pop_clean = ypk_fn.ssp_and_model_simplify(\"SCENARIO\", \"MODEL\", iiasa_pop)\n", + "ii_pop_clean.sort_values([\"ccode\", \"ssp\", \"iam\"], inplace=True)\n", + "\n", + "# double-checking if IIASA and IIASA-WiC values are same\n", + "v_ = [str(y) for y in np.arange(2010, 2105, 5)]\n", + "for i in set(ii_pop_clean.ccode):\n", + " row = ii_pop_clean[ii_pop_clean.ccode == i]\n", + " iams = set(row.iam)\n", + " if (\"IIASA\" in iams) and (\"IIASA-WiC\" in iams):\n", + " w1 = row.loc[row.iam == \"IIASA\", v_].values\n", + " w2 = row.loc[row.iam == \"IIASA-WiC\", v_].values\n", + " if not (w1 == w2).all():\n", + " print(i)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f53976a4", + "metadata": {}, + "outputs": [], + "source": [ + "# cleaning up by gathering only two population IAMs per country:\n", + "# IIASA (or equivalently, IIASA-WiC), and OECD\n", + "ii_pop = pd.DataFrame(ii_pop_clean[[\"ccode\", \"ssp\", \"iam\"] + v_])\n", + "new_v_ = [\"v_\" + str(y) for y in v_]\n", + "\n", + "for i, ccode in enumerate(list(set(ii_pop_clean.ccode))):\n", + " j = 0\n", + " indiv_df = []\n", + " case = ii_pop[ii_pop.ccode == ccode]\n", + " get_these = []\n", + " ## add oecd if existing\n", + " if \"OECD\" in set(case.iam):\n", + " indiv_df.append(case[case.iam == \"OECD\"].values)\n", + " j += 1\n", + " ## add only one of IIASA 
OR IIASA-WiC\n", + " if \"IIASA\" in set(case.iam):\n", + " indiv_df.append(case[case.iam == \"IIASA\"].values)\n", + " j += 1\n", + " elif \"IIASA-WiC\" in set(case.iam):\n", + " indiv_df.append(case[case.iam == \"IIASA-WiC\"].values)\n", + " j += 1\n", + "\n", + " indiv_df = pd.DataFrame(\n", + " np.vstack(indiv_df), columns=[\"ccode\", \"ssp\", \"iam\"] + new_v_\n", + " )\n", + " indiv_df[\"howmany_iam\"] = j\n", + " if i == 0:\n", + " agg_df = indiv_df.copy()\n", + " else:\n", + " agg_df = pd.concat([agg_df, indiv_df], axis=0)\n", + "\n", + "agg_df[\"unit\"] = \"millions\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e07e3967", + "metadata": {}, + "outputs": [], + "source": [ + "# brief clean-ups\n", + "ii_pop = agg_df.copy()\n", + "ii_pop[\"iam_fill\"] = \"-\"\n", + "ii_pop.loc[ii_pop.iam == \"IIASA-WiC\", \"iam\"] = \"IIASA\"\n", + "\n", + "# adding the extra rows for missing iams\n", + "for i, ccode in enumerate(set(ii_pop.ccode)):\n", + " case = ii_pop[ii_pop.ccode == ccode]\n", + " if case[\"howmany_iam\"][0] == 1:\n", + " copy_case = pd.DataFrame(case)\n", + " if set([\"OECD\"]) == set(copy_case.iam):\n", + " copy_case[\"iam\"], copy_case[\"iam_fill\"] = \"IIASA\", \"OECD\"\n", + " elif set([\"IIASA\"]) == set(copy_case.iam):\n", + " copy_case[\"iam\"], copy_case[\"iam_fill\"] = \"OECD\", \"IIASA\"\n", + " ii_pop = pd.concat([ii_pop, copy_case], axis=0)\n", + "\n", + "## further re-ordering cleanups\n", + "ii_pop.sort_values([\"ccode\", \"ssp\", \"iam\"], inplace=True)\n", + "ii_pop.set_index([\"ccode\", \"ssp\", \"iam\"], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "ee42d035", + "metadata": {}, + "source": [ + "#### Cleaning up for the case of France\n", + "\n", + "In the French case, IIASA's version has the 5 overseas departments (i.e., `MYT`, `MTQ`, `GUF`, `GLP`, and `REU`) **excluded** when it calculates the French populations. 
This is different in the OECD's version of the French population since it seems to **include** the said overseas departments. This can be confirmed below as the values for the sum of IIASA's populations for `MYT`, `MTQ`, `GUF`, `GLP`, `REU` and `FRA` is approximately the same as the values for OECD's French population.\n", + "\n", + "From here on, the French case for both IIASA and OECD will **exclude** the five overseas departments and keep them separately logged." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "744b2830", + "metadata": {}, + "outputs": [], + "source": [ + "## checking\n", + "fra_dept = [\"FRA\", \"MYT\", \"MTQ\", \"GUF\", \"GLP\", \"REU\"]\n", + "v_fut_5 = [x for x in ii_pop.columns if \"v_\" in x]\n", + "for ssp in [\"SSP{}\".format(i) for i in range(1, 6)]:\n", + " ## OECD case\n", + " oecd_val = ii_pop.loc[(\"FRA\", ssp, \"OECD\"), v_fut_5].values\n", + "\n", + " ## IIASA case\n", + " iiasa_val = ii_pop.loc[(fra_dept, ssp, \"IIASA\"), v_fut_5].values\n", + " iiasa_val = np.sum(iiasa_val, axis=0)\n", + "\n", + " jointhese = [ssp, str(round(np.sum((oecd_val - iiasa_val) ** 2), 4))]\n", + " print(\": \".join(jointhese))" + ] + }, + { + "cell_type": "markdown", + "id": "b482929d", + "metadata": {}, + "source": [ + "The above confirms that OECD cases do include all of the five overseas departments when calculating their population. So we will subtract these values to get the \"mainland France\" population values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcb3ce56", + "metadata": {}, + "outputs": [], + "source": [ + "ii_pop_fra = ii_pop.copy()\n", + "csi = [\"ccode\", \"ssp\", \"iam\"]\n", + "for ssp in [\"SSP{}\".format(i) for i in range(1, 6)]:\n", + " fra_dept_oecd = ii_pop.loc[(fra_dept[1:], ssp, \"OECD\"), v_fut_5].values\n", + " fra_dept_oecd = np.sum(fra_dept_oecd, axis=0)\n", + " fra_overall_oecd = ii_pop.loc[(\"FRA\", ssp, \"OECD\"), v_fut_5].values\n", + "\n", + " ii_pop_fra.loc[(\"FRA\", ssp, \"OECD\"), v_fut_5] = fra_overall_oecd - fra_dept_oecd" + ] + }, + { + "cell_type": "markdown", + "id": "54be959d", + "metadata": {}, + "source": [ + "#### Interpolating, turning into a long-panel format, and taking only the IIASA cases\n", + "\n", + "Projections are given every five years, so we will use interpolation to fill in the missing years' information. We will assume that, between any two adjacent known years (e.g., 2015 and 2020), the values grow log-linearly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26dff56c", + "metadata": {}, + "outputs": [], + "source": [ + "# interpolate log-linearly and turning into a long-panel format\n", + "ii_pop = ypk_fn.organize_hor_to_ver(\n", + " ypk_fn.log_lin_interpolate(ii_pop_fra),\n", + " \"ccode\",\n", + " [\"ssp\", \"iam\"],\n", + " \"pop\",\n", + " yrs=list(range(2010, 2101)),\n", + ")\n", + "\n", + "# selecting only the IIASA cases\n", + "ii_pop = (\n", + " ii_pop.loc[(slice(None), slice(None), slice(None), \"IIASA\"), :]\n", + " .reset_index()\n", + " .drop([\"howmany_iam\", \"iam_fill\", \"iam\"], axis=1)\n", + " .set_index([\"ccode\", \"year\", \"ssp\"])\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "fcb7a6cd", + "metadata": {}, + "source": [ + "#### Detecting those ISOs that are missing, and getting the country-level population estimates for these ISOs (from LandScan 2019)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfe54e94", + "metadata": {}, + "outputs": [], + "source": [ + "# cluster setup\n", + "cluster = gateway.new_cluster(worker_image=image_name, profile=\"micro\")\n", + "client = cluster.get_client()\n", + "cluster.scale(20)\n", + "cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9452c7d4", + "metadata": {}, + "outputs": [], + "source": [ + "# detecting which ISOs are missing\n", + "isos_pop_wproj = ii_pop.index.get_level_values(\"ccode\").unique()\n", + "need_landscan = np.sort(np.setdiff1d(sset.ALL_ISOS, isos_pop_wproj))\n", + "\n", + "# landscan and (raw) coordinates\n", + "ls19 = ddf.read_parquet(\n", + " sset.DIR_LANDSCAN_INT / \"population_with_xy.parquet\"\n", + ").repartition(npartitions=20)\n", + "ls19 = ls19.persist()\n", + "\n", + "# shapefiles requiring information from LandScan\n", + "ctries_shp = gpd.read_parquet(sset.PATH_GADM_ADM1)\n", + "\n", + "# fixing ISO codes to be consistent with our convention\n", + "ctries_shp.loc[ctries_shp.GID_0 == \"XKO\", 
\"GID_0\"] = \"KO-\"\n", + "ctries_shp.loc[ctries_shp.GID_0 == \"XCL\", \"GID_0\"] = \"CL-\"\n", + "\n", + "# subsetting the shapefiles for those missing projections\n", + "ctries_shp = ctries_shp.set_index([\"GID_0\"]).sort_index().loc[need_landscan]" + ] + }, + { + "cell_type": "markdown", + "id": "f37fe493", + "metadata": {}, + "source": [ + "Note that the current shapefile information we are using often has more than one MultiPolygon per ISO code, so we will create a shapefile dataset with one MultiPolygon per ISO code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8939a774", + "metadata": {}, + "outputs": [], + "source": [ + "ctries_shp_lst = []\n", + "for iso in tqdm(need_landscan):\n", + " iso_lst = []\n", + " for i in ctries_shp.loc[[iso], \"geometry\"].values:\n", + " if type(i) == MultiPolygon:\n", + " j = [x for x in i.geoms]\n", + " elif type(i) == Polygon:\n", + " j = [i]\n", + " iso_lst += j\n", + " ctries_shp_lst.append(MultiPolygon(iso_lst))\n", + "\n", + "ctries_shp_df = gpd.GeoDataFrame(\n", + " data={\"ccode\": need_landscan, \"geometry\": ctries_shp_lst}\n", + ")\n", + "ctries_shp_df.set_index([\"ccode\"], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "31adba1c", + "metadata": {}, + "source": [ + "Based on the ISO-relevant shapefiles and grid-level population in LandScan 2019, let us find the country-level population information." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa2df5cf", + "metadata": {}, + "outputs": [], + "source": [ + "def subset_grid_find_pop(iso, shp_df=ctries_shp_df, ls_df=ls19):\n", + " poly_bounds = iso_poly_box_getter(iso, shp_df)\n", + " geom = shp_df.loc[iso, \"geometry\"]\n", + "\n", + " sub_dfs = []\n", + " for bd in poly_bounds:\n", + " x_mn, x_mx, y_mn, y_mx = bd\n", + " sub_df = ls_df.loc[\n", + " (ls_df.x > x_mn) & (ls_df.y > y_mn) & (ls_df.x < x_mx) & (ls_df.y < y_mx), :\n", + " ].compute()\n", + " sub_dfs.append(sub_df)\n", + " sub_df = pd.concat(sub_dfs, axis=0).drop_duplicates([\"x_ix\", \"y_ix\"])\n", + "\n", + " if sub_df.shape[0] == 0:\n", + " return 0\n", + "\n", + " pop = 0\n", + " for l in range(sub_df.shape[0]):\n", + " pt = Point(sub_df.iloc[l, :][\"x\"], sub_df.iloc[l, :][\"y\"])\n", + " if geom.contains(pt):\n", + " pop += sub_df.iloc[l, :][\"population\"]\n", + "\n", + " return pop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ed2a60a", + "metadata": {}, + "outputs": [], + "source": [ + "# this may take a while\n", + "ls_msng_pop = []\n", + "for iso in tqdm(need_landscan):\n", + " ls_msng_pop.append(subset_grid_find_pop(iso))\n", + "\n", + "msng_from_proj_pop = pd.DataFrame(data={\"pop\": ls_msng_pop, \"ccode\": need_landscan})\n", + "msng_from_proj_pop.to_parquet(sset.DIR_YPK_INT / \"msng_from_iiasa_proj_pop.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2bebb46", + "metadata": {}, + "outputs": [], + "source": [ + "cluster.scale(0)\n", + "cluster.close()\n", + "client.close()\n", + "cluster.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "64903a47", + "metadata": {}, + "source": [ + "#### Attaching LandScan 2019 values to the overall population projections" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2859792b", + "metadata": {}, + "outputs": [], + "source": [ + "msng_from_proj_pop = pd.read_parquet(\n", + " 
sset.DIR_YPK_INT / \"msng_from_iiasa_proj_pop.parquet\"\n", + ").set_index([\"ccode\"])\n", + "\n", + "pop_from_landscan = []\n", + "for i in msng_from_proj_pop.index.get_level_values(\"ccode\"):\n", + " i_shell = ii_pop.loc[[\"USA\"], :].reset_index().copy()\n", + " i_shell[\"ccode\"] = i\n", + "\n", + " ## adjusting it to millions of people\n", + " i_shell[\"pop\"] = msng_from_proj_pop.loc[i, \"pop\"] / 1000000\n", + " i_shell.set_index(ii_pop.index.names, inplace=True)\n", + " pop_from_landscan.append(i_shell)\n", + "\n", + "ii_pop = pd.concat([ii_pop] + pop_from_landscan, axis=0).sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "edc0ff29", + "metadata": {}, + "source": [ + "### GDPpc and GDP\n", + "\n", + "We will use IAMs `IIASA` and `OECD`.\n", + "\n", + "#### Basic cleaning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "883d2312", + "metadata": {}, + "outputs": [], + "source": [ + "## cleaning the imported dataset\n", + "csi = [\"ccode\", \"ssp\", \"iam\"]\n", + "ii_gdp_clean = ypk_fn.ssp_and_model_simplify(\"SCENARIO\", \"MODEL\", iiasa_gdp)\n", + "ii_gdp_clean.set_index(csi, inplace=True)\n", + "ii_gdp_clean.sort_index(axis=0, inplace=True)\n", + "\n", + "num_v = [str(x) for x in np.arange(2010, 2105, 5)]\n", + "v_name = [\"v_\" + str(v) for v in num_v]\n", + "ii_gdp_clean.rename(columns=dict(zip(num_v, v_name)), inplace=True)\n", + "ii_gdp_clean = ii_gdp_clean[v_name]\n", + "\n", + "## changing the values from billions of dollars to millions of dollars\n", + "ii_gdp_clean[v_name] = ii_gdp_clean[v_name] * 1000\n", + "\n", + "## double-checking if IIASA and IIASA-WiC values are same\n", + "## it's verifiable that IIASA and IIASA-WiC ones are the same\n", + "for i in set(ii_gdp_clean.index.get_level_values(\"ccode\")):\n", + " row = ii_gdp_clean.loc[(i, slice(None), slice(None)), :]\n", + " iams = set(row.index.get_level_values(\"iam\"))\n", + " if (\"IIASA\" in iams) and (\"IIASA-WiC\" in iams):\n", + " w1 = 
row.loc[(slice(None), slice(None), \"IIASA\"), v_name].values\n", + " w2 = row.loc[(slice(None), slice(None), \"IIASA-WiC\"), v_name].values\n", + " if not (w1 == w2).all():\n", + " print(i)\n", + "\n", + "## getting only IIASA and OECD cases\n", + "clean_ccodes = ii_gdp_clean.index.get_level_values(\"ccode\")\n", + "for i, ccode in enumerate(list(set(clean_ccodes))):\n", + " j = 0\n", + " indiv_df = []\n", + " case = ii_gdp_clean.loc[(ccode, slice(None), slice(None)), :]\n", + " get_these = []\n", + " ## add oecd if existing\n", + " if \"OECD\" in set(case.index.get_level_values(\"iam\")):\n", + " indiv_df.append(case.loc[(slice(None), slice(None), \"OECD\"), :])\n", + " j += 1\n", + " ## add only one of IIASA OR IIASA-WiC\n", + " if \"IIASA\" in set(case.index.get_level_values(\"iam\")):\n", + " indiv_df.append(case.loc[(slice(None), slice(None), \"IIASA\"), :])\n", + " j += 1\n", + " elif \"IIASA-WiC\" in set(case.index.get_level_values(\"iam\")):\n", + " indiv_df.append(case.loc[(slice(None), slice(None), \"IIASA-WiC\"), :])\n", + " j += 1\n", + "\n", + " indiv_df = pd.concat(indiv_df, axis=0)\n", + " indiv_df[\"howmany_iam\"] = j\n", + " if i == 0:\n", + " agg_df = indiv_df.copy()\n", + " else:\n", + " agg_df = pd.concat([agg_df, indiv_df], axis=0)\n", + "\n", + "ii_gdp = agg_df.copy().reset_index()\n", + "ii_gdp[\"iam_fill\"] = \"-\"\n", + "ii_gdp.loc[ii_gdp.iam == \"IIASA-WiC\", \"iam\"] = \"IIASA\"\n", + "ii_gdp.set_index(csi, inplace=True)\n", + "\n", + "## If either OECD or IIASA track is missing, fill in using the other track\n", + "for i, ccode in enumerate(set(ii_gdp.index.get_level_values(\"ccode\"))):\n", + " case = ii_gdp.loc[(ccode, slice(None), slice(None)), :]\n", + " if case[\"howmany_iam\"][0] == 1:\n", + " copy_case = case.copy().reset_index()\n", + " if set([\"OECD\"]) == set(copy_case.iam):\n", + " copy_case[\"iam\"], copy_case[\"iam_fill\"] = \"IIASA\", \"OECD\"\n", + " elif set([\"IIASA\"]) == set(copy_case.iam):\n", + " 
copy_case[\"iam\"], copy_case[\"iam_fill\"] = \"OECD\", \"IIASA\"\n", + " ii_gdp = pd.concat([ii_gdp, copy_case.set_index(csi)], axis=0)\n", + "\n", + "ii_gdp = ypk_fn.organize_hor_to_ver(\n", + " ii_gdp.sort_index(axis=0), \"ccode\", [\"ssp\", \"iam\"], \"gdp\", yrs=range(2010, 2101)\n", + ").drop([\"howmany_iam\"], axis=1)\n", + "ii_gdp[\"unit\"] = \"millions\"" + ] + }, + { + "cell_type": "markdown", + "id": "bb2bf593", + "metadata": {}, + "source": [ + "#### Attaching the population values, creating GDPpc, and log-linearly interpolating" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd08ffc7", + "metadata": {}, + "outputs": [], + "source": [ + "ii_gdppc = ii_gdp.merge(ii_pop[[\"pop\"]], how=\"left\", left_index=True, right_index=True)\n", + "ii_gdppc[\"gdppc\"] = ii_gdppc[\"gdp\"] / ii_gdppc[\"pop\"]\n", + "scenarios = sset.SCENARIOS\n", + "scen_dfs = []\n", + "for scen in tqdm(scenarios):\n", + " ssp, iam = scen\n", + " scen_df = (\n", + " ii_gdppc.loc[(slice(None), slice(None), ssp, iam), [\"gdppc\"]]\n", + " .reset_index()\n", + " .drop([\"ssp\", \"iam\"], axis=1)\n", + " .set_index([\"ccode\", \"year\"])\n", + " )\n", + " scen_df = ypk_fn.log_lin_interpolate(\n", + " ypk_fn.organize_ver_to_hor(\n", + " scen_df, \"gdppc\", \"year\", \"ccode\", range(2010, 2101)\n", + " ),\n", + " ).reset_index()\n", + " scen_df[\"ssp\"], scen_df[\"iam\"] = ssp, iam\n", + " scen_dfs.append(scen_df.set_index([\"ccode\", \"ssp\", \"iam\"]))\n", + "ii_gdppc = ypk_fn.organize_hor_to_ver(\n", + " pd.concat(scen_dfs, axis=0), \"ccode\", [\"ssp\", \"iam\"], \"gdppc\", yrs=range(2010, 2101)\n", + ")\n", + "ii_gdppc[\"unit\"] = \"ones\"" + ] + }, + { + "cell_type": "markdown", + "id": "52f32d19", + "metadata": {}, + "source": [ + "#### Getting the by-scenario global GDPpc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0407ecf4", + "metadata": {}, + "outputs": [], + "source": [ + "ii_gdppc_w_pop = ii_gdppc.merge(\n", + " 
ii_pop[[\"pop\"]], how=\"left\", left_index=True, right_index=True\n", + ")\n", + "ii_gdppc_w_pop[\"gdp\"] = ii_gdppc_w_pop[\"pop\"] * ii_gdppc_w_pop[\"gdppc\"]\n", + "scen_agg_dfs = []\n", + "for scen in tqdm(scenarios):\n", + " ssp, iam = scen\n", + " scen_agg_df = (\n", + " ii_gdppc_w_pop.loc[(slice(None), slice(None), ssp, iam)]\n", + " .reset_index()\n", + " .groupby([\"year\"])\n", + " .sum()[[\"pop\", \"gdp\"]]\n", + " .reset_index()\n", + " )\n", + " scen_agg_df[\"ssp\"], scen_agg_df[\"iam\"] = ssp, iam\n", + " scen_agg_df.set_index([\"year\", \"ssp\", \"iam\"], inplace=True)\n", + " scen_agg_dfs.append(scen_agg_df)\n", + "global_df = pd.concat(scen_agg_dfs, axis=0).sort_index()\n", + "global_df[\"gdppc\"] = global_df[\"gdp\"] / global_df[\"pop\"]" + ] + }, + { + "cell_type": "markdown", + "id": "4b776cc4", + "metadata": {}, + "source": [ + "#### GDPpc for countries that are not in the current projections (subbing in the global GDPpc), and attaching it with the existing projections" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc910fe4", + "metadata": {}, + "outputs": [], + "source": [ + "gdppc_yesproj = np.sort(ii_gdppc.index.get_level_values(\"ccode\").unique())\n", + "gdppc_noproj = np.setdiff1d(sset.ALL_ISOS, gdppc_yesproj)\n", + "missing_gdps = []\n", + "for iso in tqdm(gdppc_noproj):\n", + " iso_df = global_df.reset_index()\n", + " iso_df[\"ccode\"], iso_df[\"unit\"] = iso, \"ones\"\n", + " iso_df.set_index([\"ccode\", \"year\", \"ssp\", \"iam\"], inplace=True)\n", + " missing_gdps.append(iso_df[[\"gdppc\", \"unit\"]])\n", + "missing_gdps = pd.concat(missing_gdps, axis=0).sort_index()\n", + "\n", + "ii_gdppc = pd.concat([ii_gdppc, missing_gdps], axis=0).sort_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d54ed5e7", + "metadata": {}, + "outputs": [], + "source": [ + "ii_yp = ii_gdppc.merge(ii_pop, left_index=True, right_index=True, how=\"left\")\n", + "ii_yp[\"pop_unit\"] = \"millions (of 
people)\"\n", + "ii_yp[\"gdppc_unit\"] = \"ones (of USD)\"\n", + "ii_yp[\"gdp_unit\"] = \"millions (of USD)\"\n", + "ii_yp.drop([\"unit_x\", \"unit_y\"], inplace=True, axis=1)\n", + "ii_yp[\"gdp\"] = ii_yp[\"gdppc\"] * ii_yp[\"pop\"]\n", + "\n", + "## if population is 0, then GDPpc and GDP should also be 0 (no economic activity)\n", + "ii_yp.loc[ii_yp[\"pop\"] == 0, \"gdppc\"] = 0" + ] + }, + { + "cell_type": "markdown", + "id": "7278d8e5", + "metadata": { + "tags": [] + }, + "source": [ + "#### Turning the GDP and GDPpc values to 2019 USD" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5497598", + "metadata": {}, + "outputs": [], + "source": [ + "## inflator from 2005 to 2019\n", + "pwt = (\n", + " pd.read_excel(sset.PATH_PWT_RAW)\n", + " .rename(columns={\"countrycode\": \"ccode\"})\n", + " .set_index([\"ccode\", \"year\"])\n", + ")\n", + "infla = pwt.loc[(\"USA\", 2019), \"pl_gdpo\"] / pwt.loc[(\"USA\", 2005), \"pl_gdpo\"]\n", + "ii_yp[\"gdp\"] *= infla\n", + "ii_yp[\"gdppc\"] *= infla" + ] + }, + { + "cell_type": "markdown", + "id": "d7213e95", + "metadata": {}, + "source": [ + "#### Organizing and exporting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "602991c8", + "metadata": {}, + "outputs": [], + "source": [ + "ii_yp = ii_yp[[\"gdp\", \"gdppc\", \"pop\", \"gdp_unit\", \"gdppc_unit\", \"pop_unit\"]].copy()\n", + "ii_yp.to_parquet(sset.DIR_YPK_INT / \"gdp_gdppc_pop_proj_2010_2100_post_ypk6.parquet\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 
+ } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk6_projected_capital.ipynb b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk6_projected_capital.ipynb new file mode 100644 index 0000000..2205036 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk6_projected_capital.ipynb @@ -0,0 +1,591 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3d62e24d", + "metadata": {}, + "source": [ + "## Projecting capital stock values (2010-2100) according to Dellink et al. (2017)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e86388c", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bf979ef", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import os\n", + "import zipfile\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import statsmodels.api as sm\n", + "import xarray as xr\n", + "from dask_gateway import Gateway\n", + "from tqdm.auto import tqdm\n", + "\n", + "## settings and utility functions for SLIIDERS\n", + "from sliiders import __file__ as slfile\n", + "from sliiders import country_level_ypk as ypk_fn\n", + "from sliiders import settings as sset\n", + "\n", + "# dask gateway setup\n", + "gateway = Gateway()\n", + "image_name = sset.DASK_IMAGE" + ] + }, + { + "cell_type": "markdown", + "id": "7de17e72", + "metadata": { + "tags": [] + }, + "source": [ + "## Capital projection\n", + "\n", + "We incorporate historical 2010 capital stock values and projected GDP, GDPpc, and population values.\n", + "\n", + "### Importing and merging capital 2010 values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3419bbb", + "metadata": {}, + "outputs": [], + "source": [ + "## historical data and projected gdp, gdppc, and 
population\n", + "hist_df = pd.read_parquet(\n", + " sset.DIR_YPK_FINAL / \"gdp_gdppc_pop_capital_1950_2020.parquet\"\n", + ")\n", + "proj_yp_df = pd.read_parquet(\n", + " sset.DIR_YPK_INT / \"gdp_gdppc_pop_proj_2010_2100_post_ypk6.parquet\"\n", + ")\n", + "\n", + "## merging 2010 capital values\n", + "proj_ypk_df = proj_yp_df.merge(\n", + " (\n", + " hist_df.loc[(slice(None), 2010), [\"rnna_19\"]].rename(\n", + " columns={\"rnna_19\": \"capital\"}\n", + " )\n", + " ),\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + ")\n", + "\n", + "# readjusting the values to ones (of dollars and people)\n", + "for i in [\"gdp\", \"pop\", \"capital\"]:\n", + " unitname = f\"{i}_unit\"\n", + " proj_ypk_df[i] *= 1000000\n", + " proj_ypk_df[unitname] = \"ones (of USD)\"\n", + " if i == \"pop\":\n", + " proj_ypk_df[unitname] = \"ones (of people)\"" + ] + }, + { + "cell_type": "markdown", + "id": "782f31fb", + "metadata": {}, + "source": [ + "### Getting the overall GDP elasticity with respect to capital\n", + "\n", + "We first need to calculate the overall GDP elasticity w.r.t. capital, and here we assume a simple Cobb-Douglas production function with population being an approximation of the labor force. Alternatively, we may use IIASA approximation (from Crespo Cuaresma, 2017) of the said elasticity being approximately 0.326." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f378d31f", + "metadata": {}, + "outputs": [], + "source": [ + "## let us subset values for 2010\n", + "k2010 = proj_ypk_df.loc[(slice(None), 2010), :]\n", + "\n", + "# since 2010 values are same across all SSP (but could be different across iams)\n", + "# we subset SSP2 here and calculate the GDP elasticity wrt capital\n", + "k2010_pos_y = k2010.loc[\n", + " (k2010.gdp > 0) & (k2010.index.get_level_values(\"ssp\") == \"SSP2\"), :\n", + "].sort_index()\n", + "overall_elas_ols = sm.OLS(\n", + " np.log(k2010_pos_y[\"gdp\"]), sm.add_constant(np.log(k2010_pos_y[[\"pop\", \"capital\"]]))\n", + ")\n", + "overall_elas_ols = overall_elas_ols.fit()\n", + "\n", + "OVERALL_E = overall_elas_ols.params[\"capital\"]\n", + "OVERALL_E_IIASA = 0.326" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eca8cb4c", + "metadata": {}, + "outputs": [], + "source": [ + "# for seeing the regression summary\n", + "overall_elas_ols.summary()" + ] + }, + { + "cell_type": "markdown", + "id": "27f22a84", + "metadata": {}, + "source": [ + "#### Calculating the initial marginal product of capital (${MPK}_{r, t_0}$, with $t_0 = 2010$) and appending other necessary information\n", + "\n", + "**Four options of calculating MPK**\n", + "\n", + "If we assume a simple, Cobb-Douglas form for the production function (i.e., $Y = AK^\\alpha L^{1-\\alpha}$), the marginal product of capital (MPK) can be written as:\n", + "$$ \\frac{\\partial Y}{\\partial K} = \\alpha \\cdot \\underbrace{A{K}^\\alpha{L}^{1-\\alpha}}_{=Y}\\cdot \\frac{1}{K} = \\alpha \\frac{Y}{K} = \\alpha \\frac{Y/L}{K/L} $$\n", + "and similarly if we are going to assume some form like $Y = AK^\\alpha$, we can write:\n", + "$$ \\frac{\\partial Y}{\\partial K} = \\alpha \\cdot \\underbrace{AK^{\\alpha}}_{=Y} \\cdot \\frac{1}{K} = \\alpha \\frac{Y}{K} $$\n", + "so essentially the MPK can be written as the ratio of GDP ($Y$) and capital ($K$) multiplied by 
the GDP elasticity w.r.t. capital ($\\alpha$).\n", + "\n", + "We have acquired two different estimates (one ours, one IIASA's) of $\\alpha$ from above, but we can further look at calculating $\\alpha$ for each country by fitting either a Cobb-Douglas function or a capital-only function. So there are four options for calculating a country's MPK:\n", + "1. Use $\\alpha$ from IIASA\n", + "2. Use $\\alpha$ from our estimation\n", + "3. Use $\\alpha$ from fitting a Cobb-Douglas function\n", + "4. Use $\\alpha$ from fitting a capital-only function\n", + "\n", + "and we can multiply the value of $\\frac{Y}{K}$ (in the year 2010) afterwards." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57a27e17", + "metadata": {}, + "outputs": [], + "source": [ + "MPK_init_calc_lamb = lambda x: ypk_fn.MPK_init_calc(\n", + " x, hist_df, k2010, [OVERALL_E, OVERALL_E_IIASA]\n", + ")\n", + "\n", + "# for the inhabited areas\n", + "mpk_calc = []\n", + "inhabited_isos = np.setdiff1d(sset.ALL_ISOS, sset.UNINHABITED_ISOS)\n", + "for i in tqdm(inhabited_isos):\n", + " lst_mpks = MPK_init_calc_lamb(i)\n", + " mpk_calc.append(lst_mpks)\n", + "mpk_calc = pd.concat(mpk_calc, axis=0)\n", + "\n", + "# there are some cases in which the TPK and elasticities per country are not\n", + "# found via optimization (minimization); in this case, we will clip it with\n", + "# the minimum MPK garnered either from `mpk_our`, `mpk_iiasa`, or itself (whichever\n", + "# is lesser yet above 0)\n", + "for i in [\"mpk_ctry_cd\", \"mpk_ctry_co\"]:\n", + " cd_values = mpk_calc[[\"mpk_our\", \"mpk_iiasa\", i]].values\n", + " mpk_calc.loc[mpk_calc[i] == 0, i] = cd_values[cd_values > 0].min()\n", + "\n", + "# attaching the uninhabited areas; by default, their Y/K ratios and MPK values will\n", + "# be set to 0 (doesn't matter too much, since their projected capitals will be 0)\n", + "mpk_calc_uninhabited = k2010.reset_index().set_index([\"ccode\", \"ssp\", \"iam\"])\n", + "mpk_calc_uninhabited = 
mpk_calc_uninhabited.loc[\n", + " (sset.UNINHABITED_ISOS, slice(None), slice(None)), [\"gdp\", \"capital\", \"pop\"]\n", + "]\n", + "for i in [\"yk\", \"mpk_our\", \"mpk_iiasa\", \"mpk_ctry_cd\", \"mpk_ctry_co\"]:\n", + " mpk_calc_uninhabited[i] = 0\n", + "mpk_calc = pd.concat([mpk_calc, mpk_calc_uninhabited], axis=0).sort_index()" + ] + }, + { + "cell_type": "markdown", + "id": "4217f3b9", + "metadata": {}, + "source": [ + "### Using the perpetual inventory method (PIM) with the dynamic parameter equations specified in Dellink et al. (2017)\n", + "\n", + "The method in Dellink et al. (2017) is basically a PIM, but its parameters are dynamic (and evolving on their own) so that they converge to specific long-term values. Below is (with `dask` parallelization) an application of the Dellink et al. (2017) methodology using the MPKs (in 4 different methods) we have calculated above for each country.\n", + "\n", + "First, we load the 2010 historical values (some estimated) of capital stock into our projection dataset. Also, we calculate the by-country average depreciation rates (from PWT 10.0) and the overall average rate (the average of the by-country rates, also from PWT 10.0), which are used in the PIM process. If a country is missing from the PWT 10.0 dataset, we will simply use the overall average depreciation rate for the country-specific values."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca9057b2", + "metadata": {}, + "outputs": [], + "source": [ + "## importing the (initial) iy ratios in 2010\n", + "iy_org = hist_df.loc[(slice(None), [2010]), [\"iy_ratio_fit\", \"delta\"]]\n", + "iy_org = (\n", + " iy_org.reset_index()\n", + " .rename(columns={\"delta\": \"delta_c\", \"iy_ratio_fit\": \"iy_ratio\"})\n", + " .drop([\"year\"], axis=1)\n", + " .set_index([\"ccode\"])\n", + ")\n", + "\n", + "## AFG has the average delta value\n", + "delta_overall = iy_org.loc[\"AFG\", \"delta_c\"]\n", + "\n", + "## merge this with the 2010 (starting point) dataset\n", + "mpk_calc = mpk_calc.merge(iy_org, left_index=True, right_index=True, how=\"left\")\n", + "mpk_calc[\"delta\"] = delta_overall" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c5f73ef", + "metadata": {}, + "outputs": [], + "source": [ + "## cluster setup\n", + "N_CLUSTER = 20\n", + "cluster = gateway.new_cluster(worker_image=image_name, profile=\"micro\")\n", + "client = cluster.get_client()\n", + "cluster.scale(N_CLUSTER)\n", + "cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93355c2f", + "metadata": {}, + "outputs": [], + "source": [ + "## getting the ccodes and ccode-specific DFs necessary\n", + "ccodes_pos_y = (\n", + " proj_ypk_df.loc[proj_ypk_df.gdp > 0, :].index.get_level_values(\"ccode\").unique()\n", + ")\n", + "ccodes_dfs = [proj_ypk_df.loc[[cc], :].copy() for cc in ccodes_pos_y]\n", + "\n", + "## uninhabited ones set aside\n", + "cc_dfs_uninh = proj_ypk_df.loc[\n", + " ~proj_ypk_df.index.get_level_values(\"ccode\").isin(ccodes_pos_y), :\n", + "].sort_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb94a190", + "metadata": {}, + "outputs": [], + "source": [ + "# making sure SLIIDERS functions are compatible with Dask workflow\n", + "# run this when all the workers are available\n", + "sliiders_dir = Path(slfile).parent\n", + "zipf = 
zipfile.ZipFile(\"sliiders.zip\", \"w\", zipfile.ZIP_DEFLATED)\n", + "for root, dirs, files in os.walk(sliiders_dir):\n", + " for file in files:\n", + " zipf.write(\n", + " os.path.join(root, file),\n", + " os.path.relpath(os.path.join(root, file), os.path.join(sliiders_dir, \"..\")),\n", + " )\n", + "zipf.close()\n", + "client.upload_file(\"sliiders.zip\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa952c3c", + "metadata": {}, + "outputs": [], + "source": [ + "MPK_var_cases = [\"mpk_our\", \"mpk_ctry_cd\", \"mpk_ctry_co\"] * 2\n", + "MPK_case_len = len(MPK_var_cases)\n", + "all_cases = []\n", + "for i, case in enumerate(MPK_var_cases):\n", + " if i < (MPK_case_len // 2):\n", + " pim_lamb = lambda x: ypk_fn.pim_single_ctry(x, mpk_calc, OVERALL_E, case)\n", + " else:\n", + " pim_lamb = lambda x: ypk_fn.pim_single_ctry(x, mpk_calc, OVERALL_E_IIASA, case)\n", + " pim_dfs = client.map(pim_lamb, ccodes_dfs)\n", + " pim_dfs = client.gather(pim_dfs)\n", + " pim_dfs = pd.concat(pim_dfs, axis=0)\n", + " all_cases.append(pim_dfs)\n", + " j = i + 1\n", + " print(f\"Step {j}/{MPK_case_len} done\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dab4482", + "metadata": {}, + "outputs": [], + "source": [ + "# shutting down cluster\n", + "cluster.scale(0)\n", + "client.close()\n", + "cluster.close()\n", + "cluster.shutdown()\n", + "\n", + "# removing the .zip file that's been uploaded to Dask\n", + "os.remove(\"sliiders.zip\")" + ] + }, + { + "cell_type": "markdown", + "id": "84ea369b", + "metadata": {}, + "source": [ + "### Checking against the Dellink et al. (2017)'s Figure 6 (capital intensity plots)" + ] + }, + { + "cell_type": "markdown", + "id": "7b66b707", + "metadata": {}, + "source": [ + "We examine our 6 options as below. 
After examination with the graph as well as the SSE values, it seems that the case utilizing by-country MPK, the **capital-only** production function, and the IIASA overall MPK is the one that performs the best, at least for the four countries for which information is available.\n", + "\n", + "However, since the SSEs for the numbers are very similar between the two cases (varying only by **capital-and-labor** production versus **capital-only** production) and because the capital-only one has been used previously to produce capital stock estimates, we will use estimates from `all_cases[-1]` as our main capital stock estimates and those from `all_cases[-2]` as alternative estimates." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c1d3944", + "metadata": {}, + "outputs": [], + "source": [ + "all_cases_sse = []\n", + "for i in all_cases:\n", + " all_cases_sse.append(ypk_fn.examine_against_fig6(i))" + ] + }, + { + "cell_type": "markdown", + "id": "418aedca", + "metadata": {}, + "source": [ + "As a sanity check, we will also graph the top ten and bottom ten cases of capital stock (in natural logarithm) for some specified SSP (SSP3 below) and some year (2100 below)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de9cdc57", + "metadata": {}, + "outputs": [], + "source": [ + "ypk_fn.top_bottom_10(all_cases[-1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46819cfa", + "metadata": {}, + "outputs": [], + "source": [ + "ypk_fn.top_bottom_10(all_cases[-2])" + ] + }, + { + "cell_type": "markdown", + "id": "8e240b14", + "metadata": {}, + "source": [ + "## Re-organizing the dataset and exporting\n", + "\n", + "### Data re-organization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "042d6543", + "metadata": {}, + "outputs": [], + "source": [ + "# capital stock estimates\n", + "pim_dfs_iiasa_co = all_cases[-1].copy()\n", + "pim_dfs_iiasa_cd = all_cases[-2].copy()\n", + "\n", + "# creating gdppc, unit changes, and changing the name to be matched\n", + "output_df = proj_ypk_df.rename(\n", + " columns={\"gdp\": \"rgdpna_19\", \"gdppc\": \"rgdpna_pc_19\"}\n", + ").drop([\"capital\"], axis=1)\n", + "output_df[\"pop\"] /= 1000000\n", + "output_df[\"rgdpna_19\"] /= 1000000\n", + "\n", + "## attaching the capital stock estimates\n", + "necess_cols = [\"capital_estim\", \"MPK\", \"IY\", \"KY\"]\n", + "output_df = output_df.merge(\n", + " pim_dfs_iiasa_co[necess_cols].rename(columns={\"capital_estim\": \"rnna_19\"}),\n", + " left_index=True,\n", + " right_index=True,\n", + " how=\"left\",\n", + ")\n", + "output_df[\"rnna_19\"] /= 1000000\n", + "\n", + "alt_name = \"rnna_19_alternative\"\n", + "output_df = output_df.merge(\n", + " pim_dfs_iiasa_cd[[\"capital_estim\"]].rename(columns={\"capital_estim\": alt_name}),\n", + " left_index=True,\n", + " right_index=True,\n", + " how=\"left\",\n", + ")\n", + "output_df[alt_name] /= 1000000\n", + "\n", + "for i in necess_cols[1:] + [alt_name, \"rnna_19\"]:\n", + " output_df.loc[pd.isnull(output_df[i]), i] = 0\n", + "\n", + "## adding the unit information and reordering\n", + "output_df[\"gdp_capital_unit\"] = \"millions (of 
USD)\"\n", + "output_df[\"gdppc_unit\"] = \"ones (of USD)\"\n", + "output_df[\"pop_unit\"] = \"millions (of people)\"\n", + "output_df.sort_index(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "0132fe16", + "metadata": {}, + "source": [ + "### Scale creation with respect to historical 2019 values of population and current-PPP (2019 USD) capital stock" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "636201d0", + "metadata": {}, + "outputs": [], + "source": [ + "## fetching the 2019 historical values\n", + "hist_gp = (\n", + " hist_df.loc[(slice(None), 2019), [\"cn_19\", \"pop\"]]\n", + " .reset_index()\n", + " .drop([\"year\"], axis=1)\n", + " .set_index([\"ccode\"])\n", + " .rename(columns={\"pop\": \"pop_2019\", \"cn_19\": \"cn_19_2019\"})\n", + ")\n", + "\n", + "## merge and create scales\n", + "output_df = output_df.merge(hist_gp, left_index=True, right_index=True, how=\"left\")\n", + "output_df[\"pop_scale\"] = output_df[\"pop\"] / output_df[\"pop_2019\"]\n", + "output_df[\"rnna_19_scale\"] = output_df[\"rnna_19\"] / output_df[\"cn_19_2019\"]\n", + "output_df[\"rnna_19_alternative_scale\"] = (\n", + " output_df[\"rnna_19_alternative\"] / output_df[\"cn_19_2019\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8298c07a", + "metadata": {}, + "source": [ + "### Exporting: historical 2019 values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3156eb87", + "metadata": {}, + "outputs": [], + "source": [ + "hist2019 = hist_df.loc[\n", + " (slice(None), 2019),\n", + " [\n", + " \"gdp_capital_unit\",\n", + " \"gdppc_unit\",\n", + " \"pop_unit\",\n", + " \"cgdpo_19\",\n", + " \"cgdpo_pc_19\",\n", + " \"pop\",\n", + " \"cn_19\",\n", + " ],\n", + "].reset_index()\n", + "hist2019 = hist2019.drop([\"year\"], axis=1).set_index([\"ccode\"])\n", + "hist2019.to_parquet(sset.DIR_YPK_FINAL / \"gdp_gdppc_pop_capital_hist2019.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "282846eb", + 
"metadata": {}, + "source": [ + "### Exporting: projected values (2010-2100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8205448f", + "metadata": {}, + "outputs": [], + "source": [ + "col_ordering = [\n", + " \"gdp_capital_unit\",\n", + " \"gdppc_unit\",\n", + " \"pop_unit\",\n", + " \"rgdpna_19\",\n", + " \"rgdpna_pc_19\",\n", + " \"rnna_19\",\n", + " \"rnna_19_scale\",\n", + " \"rnna_19_alternative\",\n", + " \"rnna_19_alternative_scale\",\n", + " \"cn_19_2019\",\n", + " \"pop\",\n", + " \"pop_scale\",\n", + " \"pop_2019\",\n", + " \"MPK\",\n", + " \"IY\",\n", + " \"KY\",\n", + "]\n", + "\n", + "## filling in the nan's with 0, for uninhabited areas\n", + "output_df = output_df[col_ordering].copy().sort_index()\n", + "for i in [\"rgdpna_pc_19\", \"rnna_19_scale\", \"rnna_19_alternative_scale\", \"pop_scale\"]:\n", + " output_df.loc[pd.isnull(output_df[i]), i] = 0\n", + "\n", + "output_df.to_parquet(\n", + " sset.DIR_YPK_FINAL / \"gdp_gdppc_pop_capital_proj_2010_2100.parquet\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/create-SLIIDERS-ECON.ipynb b/notebooks/create-SLIIDERS-ECON/create-SLIIDERS-ECON.ipynb new file mode 100644 index 0000000..0aeb3b9 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/create-SLIIDERS-ECON.ipynb @@ -0,0 +1,3409 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create SLIIDERS-ECON" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook creates the SLIIDERS-ECON dataset, starting from a segment X ADM1 X elevation dataset of capital stock and population + country-level scaling factors following the SSPs." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import OrderedDict\n", + "\n", + "import geopandas as gpd\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xarray as xr\n", + "from dask_gateway import Gateway\n", + "from gcsfs import GCSFileSystem\n", + "from scipy.stats import gumbel_r\n", + "\n", + "from sliiders.settings import (\n", + " EXPOSURE_BIN_WIDTH_V,\n", + " PATH_CIAM_2016,\n", + " PATH_CIAM_ADM1_VORONOI_INTERSECTIONS,\n", + " PATH_CIAM_COASTLINES,\n", + " PATH_SEG_CENTROIDS,\n", + " PATH_COUNTRY_LEVEL_EXPOSURE,\n", + " PATH_COUNTRY_LEVEL_EXPOSURE_PROJ,\n", + " PATH_EXPOSURE_AREA_BY_CIAM_AND_ELEVATION,\n", + " PATH_EXPOSURE_BINNED_WITHELEV,\n", + " PATH_EXPOSURE_LINCKE,\n", + " PATH_EXPOSURE_WB_ICP,\n", + " PATH_GADM,\n", + " PATH_GTSM_SURGE,\n", + " PATH_PWT_RAW,\n", + " PATH_SLIIDERS_ECON,\n", + " PATH_SLIIDERS_SLR,\n", + " SVALS,\n", + ")\n", + "from sliiders.spatial import coastlen_poly, get_great_circle_nearest_index\n", + "from sliiders.utils import upload_pkg" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Start Dask Cluster" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This section can be modified for whatever computing resources you have. 
But the result must be a dask distributed `Client` object assigned to `client`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ce1123bfbc4b4d78994359ec6424e8b4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='

GatewayCluster

'), HBox(children=(HTML(value='\\n
\\n
<xarray.Dataset>\n",
+       "Dimensions:         (elev: 200, bound: 2, seg_adm: 11709, country: 204,\n",
+       "                     params: 2, return_period: 4, ssp: 5, iam: 2, year: 101)\n",
+       "Coordinates:\n",
+       "    seg             (seg_adm) object dask.array<chunksize=(500,), meta=np.ndarray>\n",
+       "    adm1            (seg_adm) object dask.array<chunksize=(500,), meta=np.ndarray>\n",
+       "  * country         (country) object 'GRC' 'ITA' 'CYP' ... 'NRU' 'TKL' 'SXM'\n",
+       "    seg_country     (seg_adm) object dask.array<chunksize=(500,), meta=np.ndarray>\n",
+       "  * seg_adm         (seg_adm) object 'seg_00001_adm1_GRC.4_1' ... 'seg_99015_...\n",
+       "  * params          (params) <U5 'loc' 'scale'\n",
+       "  * return_period   (return_period) int64 10 100 1000 10000\n",
+       "  * year            (year) int64 2000 2001 2002 2003 ... 2097 2098 2099 2100\n",
+       "  * ssp             (ssp) object 'SSP1' 'SSP2' 'SSP3' 'SSP4' 'SSP5'\n",
+       "  * iam             (iam) object 'IIASA' 'OECD'\n",
+       "  * elev            (elev) float64 0.05 0.15 0.25 0.35 ... 19.75 19.85 19.95\n",
+       "  * bound           (bound) object 'lower' 'upper'\n",
+       "Data variables: (12/17)\n",
+       "    elev_bounds     (elev, bound) float32 dask.array<chunksize=(200, 2), meta=np.ndarray>\n",
+       "    SLR_site_id     (seg_adm) object dask.array<chunksize=(500,), meta=np.ndarray>\n",
+       "    length          (seg_adm) float32 dask.array<chunksize=(500,), meta=np.ndarray>\n",
+       "    pc              (country) float32 dask.array<chunksize=(204,), meta=np.ndarray>\n",
+       "    mobcapfrac      (country) float32 dask.array<chunksize=(204,), meta=np.ndarray>\n",
+       "    gumbel_params   (seg_adm, params) float32 dask.array<chunksize=(500, 2), meta=np.ndarray>\n",
+       "    ...              ...\n",
+       "    pop_scale       (ssp, iam, year, country) float32 dask.array<chunksize=(5, 2, 101, 204), meta=np.ndarray>\n",
+       "    ypcc            (ssp, iam, year, country) float32 dask.array<chunksize=(5, 2, 101, 204), meta=np.ndarray>\n",
+       "    landarea        (seg_adm, elev) float32 dask.array<chunksize=(500, 200), meta=np.ndarray>\n",
+       "    interior        (ssp, iam, year, country) float32 dask.array<chunksize=(5, 2, 101, 204), meta=np.ndarray>\n",
+       "    wetland         (seg_adm, elev) float32 dask.array<chunksize=(500, 200), meta=np.ndarray>\n",
+       "    wetlandservice  (ssp, iam, year, country) float32 dask.array<chunksize=(5, 2, 101, 204), meta=np.ndarray>
" + ], + "text/plain": [ + "\n", + "Dimensions: (elev: 200, bound: 2, seg_adm: 11709, country: 204,\n", + " params: 2, return_period: 4, ssp: 5, iam: 2, year: 101)\n", + "Coordinates:\n", + " seg (seg_adm) object dask.array\n", + " adm1 (seg_adm) object dask.array\n", + " * country (country) object 'GRC' 'ITA' 'CYP' ... 'NRU' 'TKL' 'SXM'\n", + " seg_country (seg_adm) object dask.array\n", + " * seg_adm (seg_adm) object 'seg_00001_adm1_GRC.4_1' ... 'seg_99015_...\n", + " * params (params) \n", + " SLR_site_id (seg_adm) object dask.array\n", + " length (seg_adm) float32 dask.array\n", + " pc (country) float32 dask.array\n", + " mobcapfrac (country) float32 dask.array\n", + " gumbel_params (seg_adm, params) float32 dask.array\n", + " ... ...\n", + " pop_scale (ssp, iam, year, country) float32 dask.array\n", + " ypcc (ssp, iam, year, country) float32 dask.array\n", + " landarea (seg_adm, elev) float32 dask.array\n", + " interior (ssp, iam, year, country) float32 dask.array\n", + " wetland (seg_adm, elev) float32 dask.array\n", + " wetlandservice (ssp, iam, year, country) float32 dask.array" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dollar_units = \"2019 USD PPP\"\n", + "\n", + "# coords\n", + "out.seg_adm.attrs.update(\n", + " {\n", + " \"description\": (\n", + " \"Unique combinations of coastline segment and ADM1 unit. 
Each is treated \"\n", + " \"as an independent unit in pyCIAM.\"\n", + " )\n", + " }\n", + ")\n", + "out.params.attrs.update(\n", + " {\"description\": \"Gumbel parameters for ESL/storm surge distribution\"}\n", + ")\n", + "out.return_period.attrs.update({\"long_name\": \"Return periods\", \"units\": \"y\"})\n", + "out.elev.attrs.update(\n", + " {\n", + " \"long_name\": \"Elevation\",\n", + " \"description\": (\n", + " \"Midpoint elevation for each coastal elevation bin employed in pyCIAM\"\n", + " ),\n", + " \"units\": \"m\",\n", + " }\n", + ")\n", + "out.ssp.attrs.update(\n", + " {\n", + " \"long_name\": \"Shared Socioeconomic Pathway\",\n", + " \"description\": \"Trajectories of income, capital, and population growth\",\n", + " }\n", + ")\n", + "out.iam.attrs.update(\n", + " {\n", + " \"long_name\": \"Growth Model\",\n", + " \"description\": (\n", + " \"Independent models used to simulate income and capital growth for each \"\n", + " \"SSP\"\n", + " ),\n", + " }\n", + ")\n", + "out.country.attrs.update(\n", + " {\n", + " \"description\": (\n", + " \"Dimension used for variables that exhibit only country-level variance\"\n", + " )\n", + " }\n", + ")\n", + "\n", + "# alternate coords\n", + "out.seg.attrs.update({\"description\": \"Segment associated with each seg-ADM1\"})\n", + "out.adm1.attrs.update({\"description\": \"ADM1 unit associated with each seg-ADM1\"})\n", + "out.seg_country.attrs.update(\n", + " {\"description\": (\"Country associated with each analysis unit\")}\n", + ")\n", + "\n", + "# data_vars\n", + "out.interior.attrs.update(\n", + " {\n", + " \"long_name\": \"Value of non-coastal land\",\n", + " \"units\": f\"{dollar_units} per km2\",\n", + " }\n", + ")\n", + "out.SLR_site_id.attrs.update(\n", + " {\n", + " \"long_name\": \"SLR Site ID\",\n", + " \"description\": \"SLR Site ID for closest 2-deg LSLR projection grid cell\",\n", + " }\n", + ")\n", + "out.elev_bounds.attrs.update(\n", + " {\n", + " \"long_name\": \"Elevation bounds\",\n", + " \"description\": (\n", + "
\"description\": (\n", + " \"Lower and upper bounds for each coastal elevation bin employed in pyCIAM\"\n", + " ),\n", + " \"units\": \"m\",\n", + " }\n", + ")\n", + "out.length.attrs.update(\n", + " {\n", + " \"description\": \"Length of coastline associated with each analysis unit\",\n", + " \"units\": \"km\",\n", + " }\n", + ")\n", + "out.gumbel_params.attrs.update(\n", + " {\"description\": \"Parameters describing ESL distribution\", \"units\": \"m\"}\n", + ")\n", + "out.surge_height.attrs.update(\n", + " {\"description\": \"Estimated ESL/storm surge heights\", \"units\": \"m\"}\n", + ")\n", + "out.wetland.attrs.update(\n", + " {\n", + " \"description\": \"Estimated area for all wetland by elevation\",\n", + " \"units\": \"km^2\",\n", + " }\n", + ")\n", + "out.wetlandservice.attrs.update(\n", + " {\n", + " \"description\": \"Value of wetlands\",\n", + " \"units\": f\"{dollar_units} per km^2\",\n", + " }\n", + ")\n", + "out.K_2019.attrs.update(\n", + " {\n", + " \"long_name\": \"2019 Capital stock\",\n", + " \"description\": (\n", + " \"Estimated value of physical capital in 2019, under actual conditions \"\n", + " \"(i.e. observed, not SSP)\"\n", + " ),\n", + " \"units\": dollar_units,\n", + " }\n", + ")\n", + "out.pop_2019.attrs.update(\n", + " {\n", + " \"long_name\": \"2019 population\",\n", + " \"description\": (\n", + " \"Estimated population in 2019, under actual conditions (i.e. 
observed, not \"\n", + " \"SSP)\"\n", + " ),\n", + " \"units\": \"people\",\n", + " }\n", + ")\n", + "out.K_scale.attrs.update(\n", + " {\n", + " \"long_name\": \"Capital scaling factor\",\n", + " \"description\": (\n", + " \"Country-level change factor in capital stock specific to SSP/IAM, \"\n", + " \"relative to K_2019\"\n", + " ),\n", + " }\n", + ")\n", + "out.pop_scale.attrs.update(\n", + " {\n", + " \"long_name\": \"Population scaling factor\",\n", + " \"description\": (\n", + " \"Country-level change factor in population specific to SSP/IAM, relative \"\n", + " \"to pop_2019\"\n", + " ),\n", + " }\n", + ")\n", + "out.ypcc.attrs.update(\n", + " {\n", + " \"long_name\": \"Country-level income per capita\",\n", + " \"units\": f\"{dollar_units} per person\",\n", + " }\n", + ")\n", + "out.landarea.attrs.update(\n", + " {\n", + " \"long_name\": \"Total Land Area\",\n", + " \"units\": \"km^2\",\n", + " }\n", + ")\n", + "out.pc.attrs.update(\n", + " {\n", + " \"description\": \"Protection cost (quadratic with height)\",\n", + " \"units\": f\"{dollar_units} per km per vert m^2\",\n", + " }\n", + ")\n", + "out.mobcapfrac.attrs.update(\n", + " {\n", + " \"description\": \"Fraction of capital that is mobile\",\n", + " }\n", + ")\n", + "out.rho.attrs.update(\n", + " {\n", + " \"description\": (\n", + " \"Resilience factor scaling depth-damage and depth-mortality functions\"\n", + " ),\n", + " }\n", + ")\n", + "out.dr.attrs.update({\"description\": \"Discount rate\"})\n", + "out.wmaxrate.attrs.update(\n", + " {\n", + " \"units\": \"m per year\",\n", + " }\n", + ")\n", + "\n", + "# values we need to save\n", + "to_keep = [\n", + " \"elev_bounds\",\n", + " \"seg\",\n", + " \"adm1\",\n", + " \"country\",\n", + " \"seg_country\",\n", + " \"SLR_site_id\",\n", + " \"length\",\n", + " \"pc\",\n", + " \"mobcapfrac\",\n", + " \"gumbel_params\",\n", + " \"surge_height\",\n", + " \"rho\",\n", + " \"K_2019\",\n", + " \"pop_2019\",\n", + " \"K_scale\",\n", + " \"pop_scale\",\n", + " 
\"ypcc\",\n", + " \"landarea\",\n", + " \"interior\",\n", + " \"wetland\",\n", + " \"wetlandservice\",\n", + "]\n", + "\n", + "# print dataset to verify it looks as expected\n", + "out[to_keep]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/srv/conda/envs/notebook/lib/python3.9/site-packages/xarray/conventions.py:205: SerializationWarning: variable None has data in the form of a dask array with dtype=object, which means it is being loaded into memory to determine a data type that can be safely stored on disk. To avoid this, coerce this variable to a fixed-size dtype with astype() before saving it.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out[to_keep].to_zarr(PATH_SLIIDERS_ECON, mode=\"w\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(None, None)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster.close(), client.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/create-SLIIDERS-ECON/download-sliiders-econ-input-data.ipynb b/notebooks/create-SLIIDERS-ECON/download-sliiders-econ-input-data.ipynb new file mode 100644 
index 0000000..f9391ae --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/download-sliiders-econ-input-data.ipynb @@ -0,0 +1,760 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9c487926-08b8-49b2-928d-c95730c36d44", + "metadata": {}, + "source": [ + "## Notebook for downloading inputs to create SLIIDERS-ECON\n", + "\n", + "This notebook contains directions for downloading various input datasets to create the final product for this directory, the **SLIIDERS-ECON** dataset.\n", + "\n", + "In general, we will keep the format, file name, and data unaltered, but apply changes when\n", + "- file name is not human-readable, is too long, or is not very informative about the dataset (assign appropriate file names)\n", + "- file format causes errors (save in a similar file format that is not error-prone)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "517784ca-badd-41fe-a88c-7b4370260e5c", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import ssl\n", + "import subprocess\n", + "import tarfile\n", + "from io import BytesIO\n", + "from pathlib import Path\n", + "from urllib import request as urequest\n", + "from zipfile import ZipFile\n", + "\n", + "import dask.distributed as dd\n", + "import pandas as pd\n", + "import requests\n", + "from dask_gateway import Gateway\n", + "from pandas_datareader import wb as dr_wb\n", + "from tqdm.auto import tqdm\n", + "\n", + "from sliiders import settings as sset\n", + "\n", + "# dask gateway setup\n", + "gateway = Gateway()\n", + "image_name = sset.DASK_IMAGE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19fc306c-cfca-44dd-9044-0d8c6bf1d2c3", + "metadata": {}, + "outputs": [], + "source": [ + "# creating select directories\n", + "PWT_DIRECTORY = sset.PATH_PWT_RAW.parent\n", + "IMF_WEO_DIRECTORY = sset.PATH_IMF_WEO_RAW.parent\n", + "MPD_DIRECTORY = sset.PATH_MPD_RAW.parent\n", + "GWDB_DIRECTORY = sset.PATH_GWDB2021_RAW.parent\n", + "SRTM15PLUS_DIRECTORY = 
sset.PATH_SRTM15_PLUS.parent\n", + "MDT_DIRECTORY = sset.PATH_GEOG_MDT_RAW.parent\n", + "\n", + "directories_to_create = [\n", + " PWT_DIRECTORY,\n", + " IMF_WEO_DIRECTORY,\n", + " MPD_DIRECTORY,\n", + " GWDB_DIRECTORY,\n", + " SRTM15PLUS_DIRECTORY,\n", + " MDT_DIRECTORY,\n", + " sset.DIR_WB_WDI_RAW,\n", + " sset.DIR_LITPOP_RAW,\n", + " sset.DIR_GEG15_RAW,\n", + " sset.DIR_CCI_RAW,\n", + " sset.DIR_UN_WPP_RAW,\n", + " sset.DIR_UN_AMA_RAW,\n", + " sset.DIR_ALAND_STATISTICS_RAW,\n", + " sset.DIR_OECD_REGIONS_RAW,\n", + " sset.DIR_LANDSCAN_RAW,\n", + " sset.DIR_IIASA_PROJECTIONS,\n", + " sset.DIR_GEOG_DATUMS_XGM2019e_WGS84,\n", + " sset.DIR_GEOG_DATUMS_EGM96_WGS84,\n", + "]\n", + "for direc in directories_to_create:\n", + " direc.mkdir(exist_ok=True, parents=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9e1d4cce-5cc8-4c4d-b3cd-1aaf56e90ce9", + "metadata": { + "tags": [] + }, + "source": [ + "## Fetching raw data from various sources" + ] + }, + { + "cell_type": "markdown", + "id": "83c92a29-bd27-4b1d-838c-7989d7561757", + "metadata": {}, + "source": [ + "### Penn World Tables 10.0 (PWT 10.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "029359fb-e17a-44c8-a536-4696f7d8c00c", + "metadata": {}, + "outputs": [], + "source": [ + "# PWT10.0\n", + "pwt100_data = pd.read_excel(\"https://www.rug.nl/ggdc/docs/pwt100.xlsx\", sheet_name=2)\n", + "\n", + "# PWT10.0 capital details\n", + "pwt100_data_K = pd.read_excel(\n", + " \"https://www.rug.nl/ggdc/docs/pwt100-capital-detail.xlsx\", sheet_name=2\n", + ")\n", + "\n", + "pwt_filenames = [\"pwt_100.xlsx\", \"pwt_K_detail_100.xlsx\"]\n", + "for i, data in enumerate([pwt100_data, pwt100_data_K]):\n", + " data.to_excel(\n", + " excel_writer=(PWT_DIRECTORY / pwt_filenames[i]),\n", + " sheet_name=\"Sheet1\",\n", + " index=False,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "d08579cb-0c3e-4722-8ae3-944b46297b68", + "metadata": {}, + "source": [ + "### Maddison Project Dataset (MPD, 
Maddison Project Database 2020)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb664e52-bdbe-4e2e-b30e-3102fdf8ae83", + "metadata": {}, + "outputs": [], + "source": [ + "madd = pd.read_excel(\n", + " \"https://www.rug.nl/ggdc/historicaldevelopment/maddison/data/mpd2020.xlsx\",\n", + " sheet_name=2,\n", + ")\n", + "madd.to_excel(\n", + " excel_writer=(sset.PATH_MPD_RAW),\n", + " index=False,\n", + " sheet_name=\"Sheet1\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8b3c486f-f4b0-4ad1-849a-542e2a875a11", + "metadata": {}, + "source": [ + "### World Bank WDI (WB WDI)\n", + "\n", + "#### Investment-to-GDP ratio, GDP and GDPpc (nominal and PPP), and Population" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b33996e7-fa36-4496-bda9-1f8a459cde1f", + "metadata": {}, + "outputs": [], + "source": [ + "# country name and iso3 country code information\n", + "country_info = dr_wb.get_countries()[[\"name\", \"iso3c\"]].rename(\n", + " columns={\"name\": \"country\", \"iso3c\": \"ccode\"}\n", + ")\n", + "\n", + "# relevant indicator information for the `dr_wb` module to fetch the variables\n", + "wbwdi_indicators = [\n", + " \"SP.POP.TOTL\", # population\n", + " \"NE.GDI.FTOT.ZS\", # investment-to-GDP ratio\n", + " \"NY.GDP.MKTP.PP.KD\", # GDP PPP\n", + " \"NY.GDP.PCAP.PP.KD\", # GDP per capita PPP\n", + " \"NY.GDP.MKTP.KD\", # GDP nominal\n", + " \"NY.GDP.PCAP.KD\", # GDP per capita nominal\n", + "]\n", + "\n", + "j = 0\n", + "for indi in wbwdi_indicators:\n", + " indi_info = (\n", + " dr_wb.download(indicator=indi, country=\"all\", start=1950, end=2020)\n", + " .reset_index()\n", + " .astype({\"year\": \"int64\"})\n", + " .merge(country_info, on=[\"country\"], how=\"left\")\n", + " .set_index([\"ccode\", \"year\"])\n", + " )\n", + "\n", + " if j == 0:\n", + " j += 1\n", + " wbwdi_info = indi_info.copy()\n", + " else:\n", + " wbwdi_info = wbwdi_info.merge(\n", + " indi_info.drop([\"country\"], axis=1),\n", + " 
left_index=True,\n", + " right_index=True,\n", + " how=\"outer\",\n", + " )\n", + "\n", + "# excluding those that have no information and saving the data\n", + "wb_info_vars = [x for x in wbwdi_info.columns if x != \"country\"]\n", + "wbwdi_info = wbwdi_info.loc[~pd.isnull(wbwdi_info[wb_info_vars]).all(axis=1), :]\n", + "wbwdi_info.to_parquet(sset.DIR_WB_WDI_RAW / \"wdi_pop_iy_gdp.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "bdec086b-c676-4690-9f2f-93585172e4d7", + "metadata": {}, + "source": [ + "#### WB WDI: exchange rate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e766dc90-ef52-4f10-ba4c-0fdae8c768c9", + "metadata": {}, + "outputs": [], + "source": [ + "# country name and iso3 country code information\n", + "country_info = dr_wb.get_countries()[[\"name\", \"iso3c\"]].rename(\n", + " columns={\"name\": \"country\", \"iso3c\": \"ccode\"}\n", + ")\n", + "\n", + "xr_code = \"PA.NUS.FCRF\"\n", + "xr_wb = dr_wb.download(indicator=xr_code, country=\"all\", start=1950, end=2019)\n", + "xr_wb = (\n", + " xr_wb.reset_index()\n", + " .astype({\"year\": \"int64\"})\n", + " .merge(country_info, on=[\"country\"], how=\"left\")\n", + ")\n", + "(\n", + " xr_wb.set_index([\"ccode\", \"year\"])\n", + " .rename(columns={xr_code: \"xrate\"})\n", + " .to_parquet(sset.DIR_WB_WDI_RAW / \"wdi_xr.parquet\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "df339e99-0571-4c23-8c7f-6e691b7c39c6", + "metadata": {}, + "source": [ + "### UN WPP populations (overall and by-population-group data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c01b3a3-63ca-4c8e-9d84-8d0ede6c7c33", + "metadata": {}, + "outputs": [], + "source": [ + "# overall information\n", + "un_df = pd.read_csv(\n", + " \"https://population.un.org/wpp/Download/Files/\"\n", + " \"1_Indicators%20(Standard)/CSV_FILES/WPP2019_TotalPopulationBySex.csv\"\n", + ")\n", + "\n", + "# by_age_group\n", + "by_age = pd.read_csv(\n", + " 
\"https://population.un.org/wpp/Download/Files/1_Indicators\"\n", + " \"%20(Standard)/CSV_FILES/WPP2019_PopulationByAgeSex_Medium.csv\"\n", + ")\n", + "\n", + "# exporting\n", + "un_df.to_csv(sset.DIR_UN_WPP_RAW / \"UN_WPP2019_TotalPopulation.csv\", index=False)\n", + "by_age.to_csv(sset.DIR_UN_WPP_RAW / \"UN_WPP2019_Population_by_Age.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "8cdb4d29-7c84-49e0-ba3e-4fb7663ecd32", + "metadata": {}, + "source": [ + "### Åland Island GDP and population (from Statistics and Research Åland or ÅSUB)\n", + "\n", + "Note when newer versions are available, old links from ÅSUB will become deprecated; the below links in `ALA_GDP_LINK` and `ALA_POP_LINK` are valid as of 2022-03-29." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "792b59d8-44fa-4d85-93db-8682b1857e24", + "metadata": {}, + "outputs": [], + "source": [ + "# links\n", + "ALA_GDP_LINK = (\n", + " \"https://www.asub.ax/sites/www.asub.ax/files/attachments/page/nr005en.xls\"\n", + ")\n", + "ALA_POP_LINK = (\n", + " \"https://www.asub.ax/sites/www.asub.ax/files/attachments/page/alv01_aland_faroe\"\n", + " \"_islands_and_greenland_-_an_overview_with_comparable_data.xlsx\"\n", + ")\n", + "\n", + "# datasets read-in\n", + "ala_gdp = pd.read_excel(ALA_GDP_LINK, header=3)\n", + "ala_pop = pd.read_excel(ALA_POP_LINK, header=2, sheet_name=\"Population development\")\n", + "\n", + "# exporting\n", + "ala_gdp.to_excel(sset.DIR_ALAND_STATISTICS_RAW / \"aland_gdp.xlsx\", index=False)\n", + "ala_pop.to_excel(sset.DIR_ALAND_STATISTICS_RAW / \"aland_pop.xlsx\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "3fe8a00c-5822-467a-80a8-b4b6722eea3c", + "metadata": { + "tags": [] + }, + "source": [ + "### Global Wealth Databook (from Credit Suisse)\n", + "\n", + "We download the 2021 vintage (latest as of 2022-03-21)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81d2a06a-52eb-4f4f-94fd-8d01e9c32042", + "metadata": {}, + "outputs": [], + "source": [ + "URL_GWDB = (\n", + " \"https://www.credit-suisse.com/media/assets/corporate/docs/about-us/research\"\n", + " \"/publications/global-wealth-databook-2021.pdf\"\n", + ")\n", + "\n", + "# context managers close the HTTP response and the output file even on error\n", + "with urequest.urlopen(URL_GWDB) as gwr_raw:\n", + "    with open(str(sset.PATH_GWDB2021_RAW), \"wb\") as file:\n", + "        file.write(gwr_raw.read())" + ] + }, + { + "cell_type": "markdown", + "id": "a9f0b8fa-7c93-4735-9caa-e07777d150a2", + "metadata": {}, + "source": [ + "### LitPop (Eberenz et al. 2020, Earth Syst. Sci. Data)\n", + "\n", + "#### Download Data from the Internet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ed65443-7818-406e-a23a-1f62aba91a5d", + "metadata": {}, + "outputs": [], + "source": [ + "# link for downloading the LitPop files\n", + "link_base = (\n", + " \"https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/331316\"\n", + ")\n", + "\n", + "# readme, data, normalized data, and metadata\n", + "links = [\n", + " link_base + \"/_readme_v1_2.txt?sequence=18&isAllowed=y\",\n", + " link_base + \"/LitPop_v1_2.tar?sequence=16&isAllowed=y\",\n", + " link_base + \"/Lit_Pop_norm_v1.tar?sequence=4&isAllowed=y\",\n", + " link_base + \"/_metadata_countries_v1_2.csv?sequence=12&isAllowed=y\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ff7328f-8976-4f0c-b981-adff491472c8", + "metadata": {}, + "outputs": [], + "source": [ + "def litpop_download(link, direc=sset.DIR_LITPOP_RAW):\n", + "    \"\"\"Given a URL link, downloads (LitPop-related) data from the web and saves it in\n", + "    the specified local directory. 
The file name is parsed so that anything after the\n", + "    string `?sequence` is dropped (e.g., `file.txt?sequence=..` to `file.txt`).\n", + "\n", + "    Parameters\n", + "    ----------\n", + "    link : str\n", + "        URL link for the file online\n", + "    direc : str or pathlib.Path\n", + "        directory to store the LitPop datasets\n", + "\n", + "    Returns\n", + "    -------\n", + "    None, but saves the file downloaded from online to `direc`.\n", + "\n", + "    \"\"\"\n", + "    # Path() accepts both str and Path, so no explicit type check is needed\n", + "    direc = Path(direc)\n", + "\n", + "    # partition() keeps the full name when ?sequence is absent (find() gives -1)\n", + "    fname = link.partition(\"?sequence\")[0].rsplit(\"/\", 1)[-1]\n", + "    urequest.urlretrieve(link, direc / fname)\n", + "\n", + "    return None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1dfe03b-cc1f-4fa8-a89e-3952b856b309", + "metadata": {}, + "outputs": [], + "source": [ + "# cluster setup\n", + "N_CLUSTER = len(links)\n", + "cluster = gateway.new_cluster(worker_image=image_name, profile=\"micro\")\n", + "client = cluster.get_client()\n", + "cluster.scale(N_CLUSTER)\n", + "cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0e059a0-cc7f-4193-b682-aeb2155967e9", + "metadata": {}, + "outputs": [], + "source": [ + "# takes approximately 20 minutes\n", + "futures = client.map(litpop_download, links)\n", + "dd.progress(futures)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a39c609-0e3b-4caa-9181-28b4570e097f", + "metadata": {}, + "outputs": [], + "source": [ + "cluster.scale(0)\n", + "client.close()\n", + "cluster.close()\n", + "cluster.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "ae5d6120-d2f8-474a-891e-5b59838d3b11", + "metadata": {}, + "source": [ + "#### Un-tar and clear storage\n", + "\n", + "We only un-tar the regular (not normalized) LitPop data here." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d685d7d9-0c7a-4c99-af5f-639cdd3c618d", + "metadata": {}, + "outputs": [], + "source": [ + "# un-tar\n", + "regular_litpop = sset.DIR_LITPOP_RAW / \"LitPop_v1_2.tar\"\n", + "with tarfile.open(regular_litpop) as file:\n", + "    file.extractall(sset.DIR_LITPOP_RAW)\n", + "\n", + "# clear storage for the existing tar file\n", + "os.remove(regular_litpop)" + ] + }, + { + "cell_type": "markdown", + "id": "4702790c-75dd-484a-b8e3-0acdf36d5c35", + "metadata": {}, + "source": [ + "### GEG-15\n", + "\n", + "We download 2'30\" GEG15 and unzip." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d3109c6-fa04-406d-9def-21b745f6d83c", + "metadata": {}, + "outputs": [], + "source": [ + "# downloading\n", + "zip_url = (\n", + " \"https://data.humdata.org/dataset/1c9cf1eb-c20a-4a06-8309-9416464af746/\"\n", + " \"resource/e321d56d-022e-4070-80ac-f7860646408d/download/gar-exp.zip\"\n", + ")\n", + "zip_path = sset.DIR_GEG15_RAW / \"gar-exp.zip\"\n", + "urequest.urlretrieve(zip_url, zip_path)\n", + "\n", + "# unzipping (run() blocks until unzip exits; check=True raises if it fails)\n", + "outpath = sset.DIR_GEG15_RAW / zip_path.stem\n", + "os.makedirs(outpath, exist_ok=True)\n", + "subprocess.run([\"unzip\", f\"{zip_path}\", \"-d\", f\"{outpath}\"], check=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e9ef63f-9a5a-40f9-8bd8-c5f33f524616", + "metadata": {}, + "outputs": [], + "source": [ + "# remove zip file (use after unzipping)\n", + "os.remove(zip_path)" + ] + }, + { + "cell_type": "markdown", + "id": "96bd9ff7-2b4f-4082-9d4c-0042c2b3ee9f", + "metadata": {}, + "source": [ + "### Country-level Construction Cost Index from [Lincke and Hinkel (2021, *Earth's Future*)](https://agupubs.onlinelibrary.wiley.com/doi/full/10.1029/2020EF001965?campaign=woletoc)\n", + "\n", + "The accompanying GitHub repository to Lincke and Hinkel (2021) is at [this 
link](https://github.com/daniellincke/DIVA_paper_migration)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cb05b93-c8cc-4f9e-9e99-ce5b7d511d16", + "metadata": {}, + "outputs": [], + "source": [ + "# raw data file from the GitHub repo\n", + "lincke_hinkel_cci_url = (\n", + " \"https://raw.githubusercontent.com/daniellincke/\"\n", + " \"DIVA_paper_migration/master/data/csv/country_input.csv\"\n", + ")\n", + "\n", + "# data read-in\n", + "lincke_hinkel_df = pd.read_csv(lincke_hinkel_cci_url)\n", + "\n", + "# saving at PATH_EXPOSURE_LINCKE\n", + "lincke_hinkel_df.to_parquet(sset.PATH_EXPOSURE_LINCKE)" + ] + }, + { + "cell_type": "markdown", + "id": "342a0f73-06b7-4ed9-8ae8-113775265c15", + "metadata": {}, + "source": [ + "### SRTM 15+\n", + "\n", + "We use version 2.3." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eb84251-5e5a-4180-b59c-793fc06dd913", + "metadata": {}, + "outputs": [], + "source": [ + "# Workaround for urllib SSL errors. NOTE: this turns off HTTPS certificate verification for all later urllib requests in this session\n", + "ssl._create_default_https_context = ssl._create_unverified_context\n", + "URL_SRTM15 = \"https://topex.ucsd.edu/pub/srtm15_plus/SRTM15_V2.3.nc\"\n", + "\n", + "urequest.urlretrieve(URL_SRTM15, SRTM15PLUS_DIRECTORY / URL_SRTM15.split(\"/\")[-1])" + ] + }, + { + "cell_type": "markdown", + "id": "d5efcb44-1f60-4377-9d01-e970b918ca5e", + "metadata": { + "tags": [] + }, + "source": [ + "## Further data requiring separate manual instructions\n", + "\n", + "In all cases below, `sset` is defined by `from sliiders import settings as sset` as above.\n", + "\n", + "### UN Statistics National Accounts (Analysis of Main Aggregates; abbreviated as UN AMA)\n", + "\n", + "#### UN AMA nominal (current prices) GDP per capita information\n", + "\n", + "1. Travel to this [link](https://unstats.un.org/unsd/snaama/Basic) to get to the UN Statistics National Accounts search page.\n", + "2. 
Select all countries and all years available, and select \"GDP, Per Capita GDP - US Dollars\".\n", + "3. 
Select \"Export to CSV\", and you will download the file `Results.csv`. Rename this file as `un_snaama_nom_gdppc.csv`. We save this in `sset.DIR_UN_AMA_RAW`.\n", + "\n", + "#### UN AMA nominal (current prices) GDP information\n", + "\n", + "1. Similar to the nominal GDP per capita information, travel to this [link](https://unstats.un.org/unsd/snaama/Basic) to get to the UN Statistics National Accounts search page.\n", + "2. Select all countries and all years available, and select \"GDP, at current prices - US Dollars\".\n", + "3. Select \"Export to CSV\", and you will download the file `Results.csv`. Rename this file as `un_snaama_nom_gdp.csv`. We save this in `sset.DIR_UN_AMA_RAW`.\n", + "\n", + "### OECD region-level information\n", + "\n", + "#### OECD: population (region-level)\n", + "1. Go to the following OECD Stat website: link [here](https://stats.oecd.org/)\n", + "2. On the left, find the header \"Regions and Cities\" and click the \"+\" button.\n", + "3. From the drop down menu, click on \"Regional Statistics\".\n", + "4. Again from the drop down menu, click on \"Regional Demography.\"\n", + "5. Finally, select \"Population by 5-year age groups, small regions TL3.\" Make sure that \"Indicator\" is selected as \"Population, All ages\".\n", + "6. Download the file by selecting \"Export,\" then \"Text File (CSV).\"\n", + "7. When a pop-up appears, select \"Default format\" then \"Download.\" Rename the file as `REGION_DEMOGR.csv` (due to it having random-ish numeric parts in the name). Note that this step may take a longer time than others.\n", + "8. Finally, move the said file to `sset.DIR_OECD_REGIONS_RAW`.\n", + "\n", + "#### OECD: GDP (region-level, in millions of constant 2015 PPP USD)\n", + "1. Similar to the population information, go to the following OECD Stat website: link [here](https://stats.oecd.org/)\n", + "2. On the left, find the header \"Regions and Cities\" and click the \"+\" button.\n", + "3. 
From the drop down menu, click on \"Regional Statistics\".\n", + "4. Again from the drop down menu, click on \"Regional Economy.\"\n", + "5. Finally, select \"Gross Domestic Product, Small regions TL3.\" Make sure that \"Measure\" is selected as \"Millions USD, constant prices, constant PPP, base year 2015\".\n", + "6. Download the file by selecting \"Export,\" then \"Text File (CSV).\"\n", + "7. When a pop-up appears, select \"Default format\" then \"Download.\" Rename the file as `REGION_ECONOM.csv` (due to it having random-ish numeric parts in the name). Note that this step may take a longer time than others.\n", + "8. Finally, move the said file to `sset.DIR_OECD_REGIONS_RAW`.\n", + "\n", + "### IMF investment-to-GDP ratio, population, and GDP\n", + "\n", + "1. Travel to this [link](https://www.imf.org/en/Publications/SPROLLs/world-economic-outlook-databases#sort=%40imfdate%20descending) to get to the World Economic Outlook Databases page.\n", + "2. Click on the latest \"World Economic Outlook Database\" link on the page; for our purposes, we have used the latest available one, which was \"World Economic Outlook Database, October 2021\" (may be updated in the future).\n", + "3. Click \"By Countries\", then click \"ALL COUNTRIES\", then click \"CONTINUE\" on the page that says \"Select Countries.\"\n", + "4. Under the \"NATIONAL ACCOUNTS\" tab, check the following categories:\n", + " - Gross domestic product, current prices (U.S. DOLLARS)\n", + " - Gross domestic product per capita, current prices (U.S. DOLLARS)\n", + " - Gross domestic product per capita, constant prices (PURCHASING POWER PARITY; 2017 INTERNATIONAL DOLLARS)\n", + " - Total investment (PERCENT OF GDP)\n", + "5. Under the \"PEOPLE\" tab, check the category \"Population,\" then click on \"CONTINUE.\"\n", + "6. Under the tab \"DATE RANGE,\" use the earliest year for \"Start Year\" (1980, in our case), and the latest non-future year for \"End Year\" (2020, in our case).\n", + "7. 
Under the tab \"ADVANCED SETTINGS\", click on \"ISO Alpha-3 Code\" for getting country codes. \n", + "8. Click on \"PREPARE REPORT.\" Then, click on \"DOWNLOAD REPORT.\" Saved data should be in Excel format and be named `WEO_Data.xls`.\n", + "9. Open the said file on Excel, and re-save it in a preferred format of choice (we chose `.xlsx`); this is because the original file formatting is incompatible with Python and causes the error `ValueError: Excel file format cannot be determined, you must specify an engine manually.`\n", + "10. In our implementation, we save this file as `sset.PATH_IMF_WEO_RAW`.\n", + "\n", + "### World Bank Intercomparison Project 2017 (WB ICP 2017): Construction Cost Index\n", + "\n", + "While most World Bank data can be downloaded by using `pandas_datareader.wb`, it seems that variables in WB ICP 2017 - including `1501200:CONSTRUCTION`, which is necessary for SLIIDERS-ECON - cannot be downloaded using the said module (despite being searchable in the module using `pandas_datareader.wb.search`). Therefore, we follow the below manual process for downloading the WB ICP 2017 dataset.\n", + "1. Use [this link](https://databank.worldbank.org/embed/ICP-2017-Cycle/id/4add74e?inf=n) to access WB ICP 2017 in table format.\n", + "2. Upon entering the webpage, look to the upper right corner and click on the icon with downward arrow with an underline. This should prompt the download.\n", + "3. When the download finishes, there should be a `.zip` file called `ICP 2017 Cycle.zip`. Access the `.csv` file whose name ends in `_Data.csv` (there should be two files in the `.zip` file, the other being a file whose name ends in `_Series - Metadata.csv`).\n", + "4. Save that `.csv` file as `sset.PATH_EXPOSURE_WB_ICP`.\n", + "\n", + "### IIASA and OECD models' GDP and population projections (2010-2100, every 5 years)\n", + "\n", + "1. 
Go to the following IIASA SSP Database website: link [here](https://tntcat.iiasa.ac.at/SspDb); you may need to register and create your log-in.\n", + "2. In the above tabs, there is a tab called \"Download\"; click on it.\n", + "3. Under \"SSP Database Version 2 Downloads (2018)\" and under the sub-header \"Basic Elements\", there is a download link for `SspDb_country_data_2013-06-12.csv.zip`. Click and download the said `.zip` file.\n", + "4. Extract and save the `SspDb_country_data_2013-06-12.csv`. Again, for our purposes, we save this in `sset.DIR_IIASA_PROJECTIONS`.\n", + "\n", + "### LandScan 2019\n", + "\n", + "1. To download this dataset, you need to first apply for an Oak Ridge National Laboratory account (link [here](https://landscan.ornl.gov/user/apply)).\n", + "2. After having gained access, go to the said website, click on \"DOWNLOAD\" -> \"LandScan Datasets\" -> \"Continue to download\" next to LandScan 2019.\n", + "3. Click on \"By downloading LandScan 2019 I agree to the above terms\" in the following webpage; this will download the file `LandScan Global 2019.zip`. We save this in `sset.DIR_LANDSCAN_RAW`.\n", + "\n", + "### Global geoids, based on select Earth Gravitational Models (EGMs)\n", + "1. Go to the following International Centre for Global Earth Models (ICGEM) website (link [here](http://icgem.gfz-potsdam.de/calcgrid)) to reach the page \"Calculation of Gravity Field Functionals on Ellipsoidal Grids\".\n", + "2. Under **Model Selection**, select `XGM2019e_2159`.\n", + "3. Under **Functional Selection**, select `geoid`.\n", + "4. Under **Grid selection**, there's a **Grid Step [°]** option. Change the value to **0.05**. Also, make sure that the **Reference System** is `WGS84`.\n", + "5. Due to download size constraints, we need to download this data in 4 chunks. 
Do the following:\n", + " - Split the full range of latitudes and longitudes in half, which yields the following 4 combinations of longitude-latitude ranges: $([-180, 0], [-90, 0]), ([-180, 0], [0, 90]), ([0, 180], [-90, 0])$, and $([0, 180], [0, 90])$.\n", + " - Under **Grid selection** again, one can select the range of longitudes and latitudes. Select one of the above combinations and press `start computation`.\n", + " - This will open up a new tab for calculations, which may take some time to complete. Once this is done, press **Download Grid**.\n", + " - Once the download is complete, go back to the previous page with **Model selection**, **Functional selection**, and more. Make sure the selections you made are intact, select another longitude-latitude combination, and repeat the process until there are no combinations left.\n", + "6. Once the above steps are done, go back to Step 2 above; but instead of selecting `XGM2019e_2159` for **Model selection**, select `EGM96`. Go through the Steps 3 to 5 again with this new selection.\n", + "7. Once the downloads for `XGM2019e_2159` and `EGM96` are complete, you should have 4 files for each model (8 in total, in `.gdf` format). Save the `XGM2019e_2159` files in `sset.DIR_GEOG_DATUMS_XGM2019e_WGS84` and `EGM96` files in `sset.DIR_GEOG_DATUMS_EGM96_WGS84`.\n", + "\n", + "### Global Mean Dynamic Ocean Topography (MDT) from AVISO+\n", + "**Note**: While this dataset has a relatively open license, you will first need to obtain a MY AVISO+ account, which requires verification from the AVISO+ team and may take several days or weeks.\n", + "1. Go to the following AVISO+ website for **MDT CNES-CLS18**: link [here](https://www.aviso.altimetry.fr/en/data/products/auxiliary-products/mdt/mdt-global-cnes-cls18.html).\n", + "2. Once on the page, download the dataset through your MY AVISO+ account (click on `access via MY AVISO+` link and follow the instructions).\n", + "3. 
After following the instructions, you will acquire the file `mdt_cnes_cls18_global.nc.gz`. Extract the file `mdt_cnes_cls18_global.nc` from the `.gz` file and save it as `sset.PATH_GEOG_MDT_RAW`.\n", + "\n", + "### CIA World Factbook (compiled by Coleman [2020])\n", + "\n", + "1. Travel to this [link](https://github.com/iancoleman/cia_world_factbook_api) (credit to Coleman [2020]), and scroll down to the `readme.md`.\n", + "2. In the **Data** section of the `readme.md` file, there should be a link labeled \"Historical\"; click on this link to travel to a `mega.nz` page hosting the `weekly_json.7z` file.\n", + "3. After checking that the filename to download is `weekly_json.7z`, download the said file by clicking on the \"Download\" button.\n", + "4. When the download is successful, move `weekly_json.7z` to the preferred directory (`sset.DIR_YPK_RAW` in this implementation).\n", + "\n", + "### HydroSHEDS\n", + "1. Go to https://hydrosheds.org/downloads\n", + "2. Download the \"standard\" level-0 HydroBASINS files for each continent (use the Dropbox link if available--this appears as \"NOTE: you may also download data from here.\" as of 2021-08-16). Download the shapefiles into the directory defined in `sset.DIR_HYDROBASINS_RAW`." + ] + }, + { + "cell_type": "markdown", + "id": "438f5274-f65c-41a3-8deb-7e62069f6138", + "metadata": {}, + "source": [ + "### Other SLIIDERS input datasets" + ] + }, + { + "cell_type": "markdown", + "id": "8b048d8c-ef4e-4119-9de5-9ec341f0d5a5", + "metadata": {}, + "source": [ + "There are three datasets that were manually constructed for use in `SLIIDERS`. They are available for download on Zenodo. Please download each file from Zenodo and copy to the paths designated for each dataset.\n", + "\n", + "#### 1. `ne_coastline_lines_CIAM_wexp_or_gtsm.shp`\n", + "Path: `sset.PATH_CIAM_COASTLINES` (Download all files with the name `ne_coastline_lines_CIAM_wexp_or_gtsm` (but different extensions) to this directory.) 
\n", + "Link: [TODO - include Zenodo link]\n", + "\n", + "Using the global coastlines derived from the Natural Earth layers, we included individual land masses formed by these coastlines only if they have either i) a non-zero value of exposure based on our exposure grid for population and capital assets, or ii) an associated coastal segment point, as derived primarily from the CoDEC GTSM station points. Association of a given land mass to nearby CoDEC point(s) was determined through manual inspection of the subset of land masses (n=636) with zero exposure in order to assess whether an intersecting or nearby station point represented that land area, resulting in the inclusion of 171 small land masses for which no population or capital is present but with which a coast point is associated.\n", + "\n", + "#### 2. `gtsm_stations_eur_tothin.shp`\n", + "Path: `sset.DIR_GTSM_STATIONS_TOTHIN` (Download all files with the name `gtsm_stations_eur_tothin` (but different extensions) to this directory.) \n", + "Link: [TODO - include Zenodo link]\n", + "\n", + "These 5,637 station points are a subset of the full CoDEC dataset (n=14,110) representing sites along European coastlines that are roughly five times more densely spaced than those in the rest of the globe, as described in Muis et al. 2020. These are the points that will be thinned by 5x to approximately match the density of CoDEC coast stations globally. Some manual inclusion criteria for this subset were applied in GIS because simply selecting dense European stations based on the “station_name” field in the dataset, which contains the substring “eur” for all European locations, results in an over-selection of desired points (n=6,132), with many North African coastal points that are not densely spaced containing this substring in their “station_name” as well. 
Therefore, European points were manually identified, with small islands, such as in the Mediterranean, included if their land mass contained 5 or more station points, which guarantees that they will be represented by at least one station point following the 5x thinning process. The resultant subset of points is used as a data input for the coastal segment construction in the preprocessing of the SLIIDERS dataset.\n", + "\n", + "#### 3. `us_manual_protected_areas.parquet`\n", + "Path: `sset.PATH_US_MANUAL_PROTECTED_AREAS` \n", + "Link: [TODO - include Zenodo link]\n", + "\n", + "The regions defined in this dataset represent a few areas in the United States that may be low-lying but are not vulnerable to flooding, either because of constructed barriers or because they are completely separated from the coastline by topographical features with much higher elevations. Areas protected by Louisiana levees were downloaded from the National Levee Database (https://levees.sec.usace.army.mil/), and regions corresponding to low-lying areas in California, Missouri, and Michigan that are not vulnerable to coastal flooding were created using spatial buffers around a central point." + ] + }, + { + "cell_type": "markdown", + "id": "c0323ad0-afc1-43bf-9e5b-82e37e5455ce", + "metadata": {}, + "source": [ + "### CoastalDEM v1.1\n", + "1. Acquire the global 1 arc-second CoastalDEM dataset from Climate Central (https://go.climatecentral.org/coastaldem/).\n", + "2. 
Save all 1-degree GeoTIFF files in `sset.DIR_COASTALDEM`" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/exposure/1-create-coastline-segments.ipynb b/notebooks/create-SLIIDERS-ECON/exposure/1-create-coastline-segments.ipynb new file mode 100644 index 0000000..57b7002 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/exposure/1-create-coastline-segments.ipynb @@ -0,0 +1,286 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0c63fafa-8600-47e7-b1b8-20e476a415a3", + "metadata": {}, + "source": [ + "# Create Coastline Segments" + ] + }, + { + "cell_type": "markdown", + "id": "27036a8b-f200-40d5-9308-77506829c0d3", + "metadata": {}, + "source": [ + "This notebook modifies the CoDEC points to generate a uniform set of coastline segment centroids." 
+ ] + }, + { + "cell_type": "markdown", + "id": "2b15c558-112e-496a-9797-371c4dd15f7c", + "metadata": {}, + "source": [ + "**TODO**\n", + "\n", + "- [ ] add back in the last section (with filepaths appropriately in settings.py) when we figure out where `ne_coastline_polys_CIAM_exposure_matches.parquet` is generated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebc3268f-bf70-421c-bd04-aa58b19d39f9", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8910b8e", + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xarray as xr\n", + "from shapely.geometry import Point\n", + "\n", + "from sliiders.settings import (\n", + " DIR_GTSM_STATIONS_TOTHIN,\n", + " PATH_SEG_CENTROIDS,\n", + " PATH_GTSM_SURGE,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f685e2db-d168-4ad5-8f42-7beba55370fe", + "metadata": {}, + "outputs": [], + "source": [ + "GTSMPTS_EUR = DIR_GTSM_STATIONS_TOTHIN / \"gtsm_stations_eur_tothin.shp\"" + ] + }, + { + "cell_type": "markdown", + "id": "d3cc347d", + "metadata": {}, + "source": [ + "### Import GTSM Station Points & Thin Europe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfa7e3c2", + "metadata": {}, + "outputs": [], + "source": [ + "# Import GTSM (CoDEC) stations present in ERA5 GTSM data from Muis et al. 
2020\n", + "gtsm0 = (\n", + " xr.open_dataset(PATH_GTSM_SURGE)\n", + " .rename(station_y_coordinate=\"lat\", station_x_coordinate=\"lon\")\n", + " .station_id.drop(\"station_name\")\n", + " .reset_coords()\n", + ")\n", + "gtsm0[\"station_id\"] = gtsm0.station_id.astype(str)\n", + "gtsm0 = gtsm0.to_dataframe()\n", + "\n", + "# Import Europe stations to be thinned (came in 5x higher res than rest of world)\n", + "# Stations to be thinned were manually ID'ed in GIS\n", + "gtsm_e0 = gpd.read_file(GTSMPTS_EUR)\n", + "gtsm_e0[\"serial_id\"] = [gtsm_e0.station_id[i][-5:] for i in range(len(gtsm_e0))]\n", + "gtsm_e0 = gtsm_e0.sort_values(\"serial_id\")\n", + "\n", + "# Filter 80% of Europe stations\n", + "gtsm_e1 = gtsm_e0.iloc[np.arange(0, len(gtsm_e0), 5)]\n", + "\n", + "# Update full GTSM layer\n", + "gtsm1 = gtsm0[\n", + " ~gtsm0.station_id.isin(gtsm_e0.station_id)\n", + "] # all stations not in the to-be-thinned orig Europe set\n", + "gtsm1 = pd.concat([gtsm1, gtsm_e1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76754f22", + "metadata": {}, + "outputs": [], + "source": [ + "# Add back in Gibraltar and Monaco\n", + "gib_id = \"id_coast_glob_eur_03452\"\n", + "mon_id = \"id_coast_glob_eur_03236\"\n", + "gtsm1 = pd.concat([gtsm1, gtsm0[gtsm0.station_id.isin([gib_id, mon_id])]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "730731f6", + "metadata": {}, + "outputs": [], + "source": [ + "# Add new manual points for 15 small country coastlines\n", + "lats = [\n", + " 18.20427252, # Anguilla\n", + " 42.92257646, # Bosnia & Herzegovina\n", + " 17.92105954, # St Barthelemy\n", + " -54.45126484, # Bouvet Island\n", + " -12.1888075, # Cocos (Keeling) Islands\n", + " 10.28952433, # Clipperton Island\n", + " 29.51144515, # Jordan\n", + " 16.69068301, # Montserrat\n", + " -29.01453206, # Norfolk Island\n", + " -0.54073669, # Nauru\n", + " -24.38660564, # Pitcairn Islands\n", + " 10.73177001, # Spratly Islands\n", + " 
-9.3415246, # Tokelau\n", + " 19.28118539, # US Minor Outlying Island (Wake Island)\n", + " 18.03885916,\n", + "] # Sint Maarten\n", + "\n", + "lons = [\n", + " -63.05668448,\n", + " 17.61671166,\n", + " -62.82955182,\n", + " 3.35020284,\n", + " 96.83802356,\n", + " -109.21026241,\n", + " 34.97905326,\n", + " -62.18841426,\n", + " 167.97463688,\n", + " 166.91406099,\n", + " -128.32974227,\n", + " 115.8022823,\n", + " -171.19264163,\n", + " 166.64951319,\n", + " -63.01482338,\n", + "]\n", + "\n", + "add_pts = {\n", + " \"station_id\": [\"id_coast_glob_990\" + str(i + 1).zfill(2) for i in range(len(lats))],\n", + " \"lat\": lats,\n", + " \"lon\": lons,\n", + " \"geometry\": [Point(lons[i], lats[i]) for i in range(len(lats))],\n", + "}\n", + "\n", + "add_pts = gpd.GeoDataFrame(add_pts, crs=\"EPSG:4326\")\n", + "\n", + "gtsm1 = pd.concat([gtsm1, add_pts])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abcb4fc8", + "metadata": {}, + "outputs": [], + "source": [ + "# Manual deletions of certain points that don't fall nearer to coastlines than other points\n", + "del_ids = [\n", + " \"eur_01019\",\n", + " \"eur_01812\",\n", + " \"eur_00979\",\n", + " \"13536\",\n", + " \"14447\",\n", + " \"15646\",\n", + " \"18265\",\n", + " \"18656\",\n", + " \"18720\",\n", + " \"18724\",\n", + "]\n", + "del_ids = [\"id_coast_glob_\" + del_ids[i] for i in range(len(del_ids))]\n", + "gtsm1 = gtsm1[~gtsm1.station_id.isin(del_ids)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88c6ce74", + "metadata": {}, + "outputs": [], + "source": [ + "# Remove buoy and obs points\n", + "gtsm1 = gtsm1[~gtsm1.station_id.str.contains(\"buoy\")]\n", + "gtsm1 = gtsm1[~gtsm1.station_id.str.contains(\"obs\")]\n", + "\n", + "# Remove duplicates\n", + "idx_unique = pd.DataFrame(gtsm1.geometry).drop_duplicates().index\n", + "gtsm1 = gtsm1[gtsm1.index.isin(idx_unique)]\n", + "\n", + "# Remove unwanted columns\n", + "gtsm1 = gtsm1.drop(columns=[\"fid\", 
\"field_1\", \"serial_id\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c04a694e", + "metadata": {}, + "outputs": [], + "source": [ + "# Export resulting layers\n", + "gtsm1.to_file(PATH_SEG_CENTROIDS) # final set of majority GTSM points for CIAM segs" + ] + }, + { + "cell_type": "markdown", + "id": "b6cb46e3", + "metadata": {}, + "source": [ + "### [ASIDE] Identify which NaturalEarth land masses have exposure (pop or capital)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d03b678", + "metadata": {}, + "outputs": [], + "source": [ + "# land = gpd.read_parquet(TMPDIR + \"ne_coastline_polys_CIAM_exposure_matches.parquet\")\n", + "# land_exp = land[land.contains_exposure == True]\n", + "# land_exp.to_file(TMPDIR + \"ne_coastline_polys_CIAM_wexp.shp\")\n", + "\n", + "# land_noexp = land[land.contains_exposure == False]\n", + "# land_noexp.to_file(TMPDIR + \"ne_coastline_polys_CIAM_noexp.shp\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/exposure/10-combine_exposure_tiles.ipynb b/notebooks/create-SLIIDERS-ECON/exposure/10-combine_exposure_tiles.ipynb new file mode 100644 index 0000000..55f4076 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/exposure/10-combine_exposure_tiles.ipynb @@ -0,0 +1,1379 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Combine exposure tiles to construct with-elevation, 
without-elevation, and area-by-elevation exposure parquets\n", + "Take all the 1-degree by 1-degree binned exposure tiles and combine them to form three global datasets:\n", + "1. \"With-elevation\" binned exposure: Includes all areas with elevations up to `sset.HIGHEST_WITHELEV_EXPOSURE_METERS`\n", + "2. \"Without-elevation\" binned exposure: Includes all global exposure\n", + "3. Area-by-seg-adm1: For each segment and adm1 region, the total area, in square kilometers, that is closer to that segment than to any other segment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "import dask.dataframe as ddf\n", + "import dask.distributed as dd\n", + "import geopandas as gpd\n", + "import numpy as np\n", + "import pandas as pd\n", + "import rhg_compute_tools.kubernetes as rhgk\n", + "import rhg_compute_tools.utils as rhgu\n", + "\n", + "from sliiders import settings as sset\n", + "from sliiders import spatial\n", + "\n", + "spatial.filter_spatial_warnings()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dir_batches = sset.DIR_EXPOSURE_BINNED_TMP / \"batches\"\n", + "dir_batches.mkdir(exist_ok=False)\n", + "\n", + "dir_seg_batches = sset.DIR_EXPOSURE_BINNED_TMP / \"segment_area_batches\"\n", + "dir_seg_batches.mkdir(exist_ok=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Define batching function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@rhgu.block_globals\n", + "def run_batch(batch_num, batch_size, in_paths, dir_batches, include_tile_name=False):\n", + " exp = []\n", + " batch_paths = in_paths[\n", + " batch_num * batch_size : min((batch_num + 1) * batch_size, len(in_paths))\n", + " ]\n", + "\n", + " for filename in batch_paths:\n", + " try:\n", + " df = pd.read_csv(filename, index_col=None, 
header=0)\n", + " if include_tile_name:\n", + " df[\"filename\"] = filename.stem\n", + " exp.append(df)\n", + " except pd.errors.EmptyDataError:\n", + " # these are the placeholder CSVs\n", + " pass\n", + "\n", + " exp = pd.concat(exp, axis=0, ignore_index=True)\n", + " if \"wetland_flag\" in exp.columns:\n", + " exp[\"wetland_flag\"] = exp[\"wetland_flag\"].astype(bool)\n", + "\n", + " exp.to_parquet(dir_batches / f\"batch_{batch_num}.parquet\")\n", + "\n", + " return 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start workers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nworkers = 32" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client, cluster = rhgk.get_micro_cluster()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.scale(nworkers)\n", + "\n", + "cluster" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Combine 1-degree tile CSVs into batches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tile_paths = list(sset.DIR_EXPOSURE_BINNED_TMP_TILES.glob(\"*.csv\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = int(len(tile_paths) / (nworkers * 2)) + 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# shuffling the paths helps assure each worker gets CSV batches of about the same total size\n", + "random.seed(1)\n", + "random.shuffle(tile_paths)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch_futures = [\n", + " client.submit(run_batch, i, batch_size, tile_paths, dir_batches)\n", + " for i in 
range(nworkers * 2)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dd.progress(batch_futures)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Combine 1-degree segment-area tile CSVs into batches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seg_tile_paths = list(sset.DIR_EXPOSURE_BINNED_TMP_TILES_SEGMENT_AREA.glob(\"*.csv\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = int(len(seg_tile_paths) / (nworkers * 2)) + 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# shuffling the paths helps assure each worker gets CSV batches of about the same total size\n", + "random.seed(1)\n", + "random.shuffle(seg_tile_paths)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch_futures = [\n", + " client.submit(run_batch, i, batch_size, seg_tile_paths, dir_seg_batches)\n", + " for i in range(nworkers * 2)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dd.progress(batch_futures)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Merge tile batches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exp_ddf = ddf.read_parquet(str(dir_batches / f\"batch_*.parquet\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exp_ddf = exp_ddf.rename(columns={\"value\": \"asset_value\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exp_ddf" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "column_dtypes = {\n", + " \"z_ix\": np.int32,\n", + " \"seg_adm\": str,\n", + " \"protection_zone\": np.int16,\n", + " \"area_km\": np.float32,\n", + " \"asset_value\": np.float32,\n", + " \"pop_landscan\": np.float32,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exp_ddf = exp_ddf.astype(column_dtypes).persist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exp_ddf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Merge segment-area tile batches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seg_area_ddf = ddf.read_parquet(str(dir_seg_batches / f\"batch_*.parquet\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "area_by_elev = seg_area_ddf.groupby(\n", + " [\"z_ix\", \"seg_adm\", \"protection_zone\", \"wetland_flag\"]\n", + ")[\"area_km\"].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "area_by_elev = area_by_elev.persist()\n", + "dd.progress(area_by_elev)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "area_by_elev = area_by_elev.reset_index(drop=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "highest_z_ix = (\n", + " int(sset.HIGHEST_WITHELEV_EXPOSURE_METERS / sset.EXPOSURE_BIN_WIDTH_H) - 1\n", + ")\n", + "area_by_elev = area_by_elev[area_by_elev[\"z_ix\"] <= highest_z_ix]\n", + "\n", + "area_by_elev" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ciam = (\n", + " area_by_elev.groupby([\"z_ix\", \"seg_adm\", 
\"protection_zone\", \"wetland_flag\"])[\n", + " \"area_km\"\n", + " ]\n", + " .sum()\n", + " .reset_index(drop=False)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "area_by_elev_dtypes = {\n", + " \"z_ix\": np.int16,\n", + " \"seg_adm\": \"category\",\n", + " \"protection_zone\": \"category\",\n", + " \"wetland_flag\": bool,\n", + " \"area_km\": np.float32,\n", + " \"land_area_km\": np.float32,\n", + " \"wetland_area_km\": np.float32,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ciam = ciam.astype({k: v for k, v in area_by_elev_dtypes.items() if k in ciam.columns})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ciam = ciam.persist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ciam_local = ciam.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def divide_area_by_elev_into_wetland_and_non_wetland(area_by_elev_local):\n", + " group_cols = [\n", + " c for c in area_by_elev_local.columns if c not in [\"wetland_flag\", \"area_km\"]\n", + " ]\n", + "\n", + " with_wetland = area_by_elev_local.loc[area_by_elev_local[\"wetland_flag\"]]\n", + " without_wetland = area_by_elev_local.loc[~area_by_elev_local[\"wetland_flag\"]]\n", + " area_by_elev_local = pd.merge(\n", + " without_wetland,\n", + " with_wetland,\n", + " left_on=group_cols,\n", + " right_on=group_cols,\n", + " suffixes=(\"_no_wetland\", \"_wetland\"),\n", + " how=\"outer\",\n", + " ).reset_index(drop=True)\n", + "\n", + " area_by_elev_local = area_by_elev_local.drop(\n", + " columns=[\"wetland_flag_no_wetland\", \"wetland_flag_wetland\"]\n", + " )\n", + "\n", + " area_by_elev_local = area_by_elev_local.rename(\n", + " columns={\n", + " 
\"area_km_no_wetland\": \"land_area_km\",\n", + " \"area_km_wetland\": \"wetland_area_km\",\n", + " }\n", + " )\n", + "\n", + " area_by_elev_local[\"land_area_km\"] = area_by_elev_local[\"land_area_km\"].fillna(0)\n", + " area_by_elev_local[\"wetland_area_km\"] = area_by_elev_local[\n", + " \"wetland_area_km\"\n", + " ].fillna(0)\n", + "\n", + " area_by_elev_local = area_by_elev_local.astype(\n", + " {\n", + " k: v\n", + " for k, v in area_by_elev_dtypes.items()\n", + " if k in area_by_elev_local.columns\n", + " }\n", + " )\n", + "\n", + " return area_by_elev_local.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ciam_local = divide_area_by_elev_into_wetland_and_non_wetland(ciam_local)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Remove any old versions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_EXPOSURE_AREA_BY_CIAM_AND_ELEVATION" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_EXPOSURE_AREA_BY_CIAM_AND_ELEVATION.exists()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_EXPOSURE_AREA_BY_CIAM_AND_ELEVATION.parent.mkdir(exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Save parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ciam_local.to_parquet(sset.PATH_EXPOSURE_AREA_BY_CIAM_AND_ELEVATION, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.cancel(area_by_elev)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create without-elevation dataframe from with-elevation tiles" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exp_ddf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev_ddf = exp_ddf.groupby(\n", + " [\"seg_adm\"],\n", + ")[[\"asset_value\", \"pop_landscan\", \"area_km\"]].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev_ddf = withoutelev_ddf.reset_index(drop=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev_ddf = withoutelev_ddf.persist()\n", + "dd.progress(withoutelev_ddf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev_ddf = withoutelev_ddf.astype(\n", + " {k: v for k, v in column_dtypes.items() if k in withoutelev_ddf.columns}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev_ddf = withoutelev_ddf.persist()\n", + "dd.progress(withoutelev_ddf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Remove any old versions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_EXPOSURE_BINNED_WITHOUTELEV.exists()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Save parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev_pq_out = withoutelev_ddf.to_parquet(\n", + " sset.PATH_EXPOSURE_BINNED_WITHOUTELEV,\n", + " engine=\"pyarrow\",\n", + " write_index=False,\n", + " compute=False,\n", + ").persist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dd.progress(withoutelev_ddf)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create with-elevation parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_ddf = exp_ddf[exp_ddf[\"z_ix\"] <= highest_z_ix]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_ddf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_ddf = withelev_ddf.groupby([\"z_ix\", \"seg_adm\", \"protection_zone\"])[\n", + " [\"area_km\", \"asset_value\", \"pop_landscan\"]\n", + "].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_ddf = withelev_ddf.reset_index(drop=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_ddf = withelev_ddf.persist()\n", + "dd.progress(withelev_ddf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Remove any old versions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_EXPOSURE_BINNED_WITHELEV" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_EXPOSURE_BINNED_WITHELEV.exists()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Save parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_pq_out = withelev_ddf.to_parquet(\n", + " sset.PATH_EXPOSURE_BINNED_WITHELEV,\n", + " engine=\"pyarrow\",\n", + " write_index=False,\n", + " compute=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_pq_out = withelev_pq_out.persist()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dd.progress(withelev_pq_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Shut down cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.close()\n", + "cluster.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make some final adjustments and checks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev = pd.read_parquet(sset.PATH_EXPOSURE_BINNED_WITHELEV)\n", + "\n", + "withoutelev = pd.read_parquet(sset.PATH_EXPOSURE_BINNED_WITHOUTELEV)\n", + "\n", + "area_ciam = pd.read_parquet(sset.PATH_EXPOSURE_AREA_BY_CIAM_AND_ELEVATION)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exp_dtypes = {\n", + " \"z_ix\": np.int32,\n", + " \"seg_adm\": \"category\",\n", + " \"protection_zone\": np.int16,\n", + " \"area_km\": np.float32,\n", + " \"asset_value\": np.float32,\n", + " \"pop_landscan\": np.float32,\n", + "}\n", + "\n", + "# Step through fields one-by-one to prevent memory explosion copying the whole dataframe\n", + "for field, field_type in exp_dtypes.items():\n", + " withelev[field] = withelev[field].astype(field_type)\n", + "\n", + "for field, field_type in exp_dtypes.items():\n", + " if field in withoutelev.columns:\n", + " withoutelev[field] = withoutelev[field].astype(field_type)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev = withelev[\n", + " (withelev[\"asset_value\"] > 0) | (withelev[\"pop_landscan\"] > 0)\n", + "].reset_index(drop=True)\n", + "\n", + "withoutelev = withoutelev[\n", + " (withoutelev[\"asset_value\"] > 0) | (withoutelev[\"pop_landscan\"] > 0)\n", + "].reset_index(drop=True)" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_adm1(df):\n", + " df[\"adm1\"] = df[\"seg_adm\"].str[15:]\n", + " df[\"ISO\"] = df[\"adm1\"].str[:3]\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "area_ciam = parse_adm1(area_ciam)\n", + "withelev = parse_adm1(withelev)\n", + "withoutelev = parse_adm1(withoutelev)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Check against PWT 10.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ktable_full = pd.read_parquet(sset.PATH_COUNTRY_LEVEL_EXPOSURE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ktable_full = ktable_full.reset_index(drop=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ktable_full = ktable_full[ktable_full[\"year\"] == 2019].set_index(\"ccode\")[\n", + " [\"cn_19\", \"pop\"]\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ktable = ktable_full[\"cn_19\"] * 1e6" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop = ktable_full[\"pop\"] * 1e6" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "replacements = {\"XAD\": \"GBR\", \"XKO\": \"KO-\", \"XNC\": \"CYP\", \"XPI\": \"CHN\"}\n", + "\n", + "area_ciam[\"ISO\"] = area_ciam[\"ISO\"].apply(\n", + " lambda c: replacements[c] if c in replacements else c\n", + ")\n", + "\n", + "withelev[\"ISO\"] = withelev[\"ISO\"].apply(\n", + " lambda c: replacements[c] if c in replacements else c\n", + ")\n", + "\n", + "withoutelev[\"ISO\"] = withoutelev[\"ISO\"].apply(\n", + " lambda 
c: replacements[c] if c in replacements else c\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "set(ktable.index) - set(withoutelev[\"ISO\"].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "set(withoutelev[\"ISO\"].unique()) - set(ktable.index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert len(set(withoutelev[\"ISO\"].unique()) - set(ktable.index)) == 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Rescale asset value if needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "country_totals = withoutelev.groupby(\"ISO\")[\"asset_value\"].sum()\n", + "country_totals.name = \"country_asset_value\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "check = pd.DataFrame(ktable).join(country_totals, on=\"ccode\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "check[\"diff\"] = check[\"cn_19\"] / check[\"country_asset_value\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If rescaling:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scaling = check[[\"diff\"]]\n", + "\n", + "scaling[\"diff\"].max(), scaling[\"diff\"].min()\n", + "\n", + "withoutelev = withoutelev.join(scaling, on=\"ISO\")\n", + "withelev = withelev.join(scaling, on=\"ISO\")\n", + "\n", + "withoutelev[\"asset_value\"] = withoutelev[\"asset_value\"] * withoutelev[\"diff\"]\n", + "withelev[\"asset_value\"] = withelev[\"asset_value\"] * withelev[\"diff\"]\n", + "\n", + "withoutelev = withoutelev.drop(columns=[\"diff\"])\n", + "withelev = 
withelev.drop(columns=[\"diff\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Rescale population if needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "country_totals_landscan = withoutelev.groupby(\"ISO\")[\"pop_landscan\"].sum()\n", + "country_totals_landscan.name = \"country_population_landscan\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "check = pd.DataFrame(pop).join(country_totals_landscan, on=\"ccode\")\n", + "check[\"diff_landscan\"] = check[\"pop\"] / check[\"country_population_landscan\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If rescaling:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scaling = check[[\"diff_landscan\"]]\n", + "\n", + "scaling[\"diff_landscan\"].max(), scaling[\"diff_landscan\"].min()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev = withoutelev.join(scaling, on=\"ISO\")\n", + "withelev = withelev.join(scaling, on=\"ISO\")\n", + "\n", + "withoutelev[\"pop_landscan\"] = withoutelev[\"pop_landscan\"] * withoutelev[\"diff_landscan\"]\n", + "withelev[\"pop_landscan\"] = withelev[\"pop_landscan\"] * withelev[\"diff_landscan\"]\n", + "\n", + "withoutelev = withoutelev.drop(columns=[\"diff_landscan\"])\n", + "withelev = withelev.drop(columns=[\"diff_landscan\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev[\"asset_value\"].sum() / 1e12" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev[\"asset_value\"].sum() / 1e12" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"withelev[\"pop_landscan\"].sum() / 1e9" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev[\"pop_landscan\"].sum() / 1e9" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev[\"ISO\"] = withoutelev[\"ISO\"].astype(\"category\")\n", + "withelev[\"ISO\"] = withelev[\"ISO\"].astype(\"category\")\n", + "\n", + "withoutelev[\"asset_value\"] = withoutelev[\"asset_value\"].astype(np.float32)\n", + "withelev[\"asset_value\"] = withelev[\"asset_value\"].astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_EXPOSURE_BINNED_WITHELEV.exists()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Delete" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_EXPOSURE_BINNED_WITHELEV" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev.to_parquet(sset.PATH_EXPOSURE_BINNED_WITHELEV, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_EXPOSURE_BINNED_WITHOUTELEV.exists()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_EXPOSURE_BINNED_WITHOUTELEV" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev.to_parquet(sset.PATH_EXPOSURE_BINNED_WITHOUTELEV, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "area_ciam.to_parquet(sset.PATH_EXPOSURE_AREA_BY_CIAM_AND_ELEVATION, index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add `lowelev` 
field to CIAM-Adm1 intersections file to indicate inclusion in elevation processing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ciam_adm1 = gpd.read_parquet(sset.PATH_CIAM_ADM1_VORONOI_INTERSECTIONS)\n", + "ciam_adm1[\"lowelev\"] = ciam_adm1[\"seg_adm\"].isin(withelev[\"seg_adm\"].unique())\n", + "ciam_adm1[\"ISO\"] = ciam_adm1[\"ISO\"].apply(\n", + " lambda c: replacements[c] if c in replacements else c\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ciam_adm1.to_parquet(\n", + " sset.PATH_CIAM_ADM1_VORONOI_INTERSECTIONS, index=False, row_group_size=500\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check that it looks good" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### withelev" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_out = pd.read_parquet(sset.PATH_EXPOSURE_BINNED_WITHELEV)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_out.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_out[\"asset_value\"].sum() / 1e12" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### withoutelev" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev_out = pd.read_parquet(sset.PATH_EXPOSURE_BINNED_WITHOUTELEV)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev_out.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev_out[\"asset_value\"].sum() / 1e12" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "#### CIAM area-by-elevation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "area = pd.read_parquet(sset.PATH_EXPOSURE_AREA_BY_CIAM_AND_ELEVATION)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "area.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/create-SLIIDERS-ECON/exposure/2-create-segment-regions.ipynb b/notebooks/create-SLIIDERS-ECON/exposure/2-create-segment-regions.ipynb new file mode 100644 index 0000000..b6210aa --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/exposure/2-create-segment-regions.ipynb @@ -0,0 +1,1033 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "735b7444-8a68-4f6e-987c-c045a53b64c6", + "metadata": { + "tags": [] + }, + "source": [ + "# Generate set of administrative regions to use in exposure grid\n", + "- The idea here is to get a globally comprehensive set of administrative boundaries at the `adm1` level or higher if the `adm1` level is not available\n", + "- Use `adm1` if available and `adm0` if not" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6e027b29-8979-440c-affa-b55d30ddd87a", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f26ced68", + 
"metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/srv/conda/envs/notebook/lib/python3.9/site-packages/dask_gateway/client.py:21: FutureWarning: format_bytes is deprecated and will be removed in a future release. Please use dask.utils.format_bytes instead.\n", + " from distributed.utils import LoopRunner, format_bytes\n" + ] + } + ], + "source": [ + "import warnings\n", + "\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "from shapely.errors import ShapelyDeprecationWarning\n", + "\n", + "from sliiders import settings as sset\n", + "from sliiders import spatial" + ] + }, + { + "cell_type": "markdown", + "id": "5ad804d0-523e-4086-a147-926e4a74147a", + "metadata": {}, + "source": [ + "#### Read full `adm0` and `adm1` sets" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "10b69ed5-7c8f-4d87-9151-036c300d673f", + "metadata": {}, + "outputs": [], + "source": [ + "adm0 = (\n", + " gpd.read_file(sset.PATH_GADM, layer=\"level0\")[[\"GID_0\", \"geometry\"]]\n", + " .set_crs(epsg=4326)\n", + " .set_index(\"GID_0\")\n", + " .geometry\n", + ")\n", + "adm1 = (\n", + " gpd.read_file(sset.PATH_GADM, layer=\"level1\")[[\"GID_0\", \"GID_1\", \"geometry\"]]\n", + " .set_crs(epsg=4326)\n", + " .set_index(\"GID_1\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8aa26393-836c-47fd-9b0d-a91ae0ffc839", + "metadata": { + "tags": [] + }, + "source": [ + "#### Exclude non-territory ISOs\n", + "- `ATA`: Antarctica\n", + "- `XCA`: Caspian Sea" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ebcc3561-631d-4648-b0c5-eaf6cc583b02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ATA', 'XCA']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sset.EXCLUDED_ISOS" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4840ab34-0646-4a2b-95fe-738df65e980f", + "metadata": {}, + "outputs": 
[], + "source": [ + "# Filter out excluded ISOs\n", + "adm0 = adm0[~adm0.index.isin(sset.EXCLUDED_ISOS)]\n", + "adm1 = adm1[~adm1[\"GID_0\"].isin(sset.EXCLUDED_ISOS)]" + ] + }, + { + "cell_type": "markdown", + "id": "b0354db1-e44f-4619-8fd9-648aa20dc0da", + "metadata": {}, + "source": [ + "#### Select only countries in `adm0` that do not appear in the `adm1` set, to be included as intermediate `adm1` regions" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "43c4eb7a-d569-4096-bda2-915b147a5c04", + "metadata": {}, + "outputs": [], + "source": [ + "adm0_as_adm1 = adm0.drop(adm1.GID_0.unique())" + ] + }, + { + "cell_type": "markdown", + "id": "36e4a88b-ec4c-43d7-bd77-294b7008c816", + "metadata": {}, + "source": [ + "#### Concatenate all `adm1` regions" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d5a6c1a0-404f-4d9b-90a6-fa472953b040", + "metadata": {}, + "outputs": [], + "source": [ + "full = pd.concat([adm0_as_adm1.to_frame(), adm1]).geometry.to_frame()\n", + "full[\"ISO\"] = full.index.str.split(\".\").str[0]\n", + "full.index.name = \"adm1\"\n", + "assert full.index.is_unique\n", + "seg_centroids = gpd.read_file(sset.PATH_SEG_CENTROIDS).set_index(\"station_id\").geometry\n", + "coastlines = gpd.read_file(sset.PATH_CIAM_COASTLINES).set_index(\"line_id\").geometry\n", + "overlay_name = \"seg_adm\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f603158c-54e5-4e2d-82c6-0676ba8b7064", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating global Voronoi shapes for regions...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5e4e49d791be4a22b480fdc8199eef6f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/362 [00:00" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": 
[ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "with warnings.catch_warnings():\n", + " warnings.filterwarnings(\"ignore\", category=ShapelyDeprecationWarning)\n", + " all_overlays = spatial.create_overlay_voronois(\n", + " full, seg_centroids, coastlines, overlay_name, plot=True\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31b77d44-3a31-42a6-a67b-4d869765c73f", + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_CIAM_ADM1_VORONOI_INTERSECTIONS.parent.mkdir(exist_ok=False)\n", + "\n", + "all_overlays.to_parquet(sset.PATH_CIAM_ADM1_VORONOI_INTERSECTIONS, index=False)\n", + "all_overlays.to_file(sset.PATH_CIAM_ADM1_VORONOI_INTERSECTIONS_SHP)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "0374c1d8757d4e1dacdcabb664a8308d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_24e5d8edc2844796b805ad8a2bd156c0", + "max": 362, + "style": "IPY_MODEL_2705843ad2a042c680fa4ac2b8c08962", + "value": 362 + } + }, + "060b8277577e40bc932193c864698485": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_2aed0f1150db476ea1159d2d6e9adb00", + "max": 1812, + "style": "IPY_MODEL_2100273581d54e53bb0558e2a16d9187", + "value": 1812 + } + }, + "0b84dd267f3442c78853a030af4efeca": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_63afc70313664099a7d62babce51c3b8", + "style": "IPY_MODEL_986f01097a00441095f9becd2c64915a", + "value": " 47/47 [00:11<00:00, 4.04it/s]" + } + }, + "13ecd51aefab4c6a83b847a43186ff2d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "170a0a1f73dc4158b0597431ec52f6ec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "1923e2932e9d41e1a0ea2e87fad2d60b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_80b44682c9664773afa0a088caecab71", + "max": 47, + "style": "IPY_MODEL_addfd0c51a094222a934127dac51a548", + "value": 47 + } + }, + "1e8c99c057bc48b894bc9fe8c221ee54": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_750c24a46eba489f88c427e6f8c4f54a", + "style": "IPY_MODEL_ded88cc42c4b47f3b20f8b2f92800535", + "value": "100%" + } + }, + "2100273581d54e53bb0558e2a16d9187": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "24e5d8edc2844796b805ad8a2bd156c0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "25d3c4c6296f4a7aa6618dfaab857296": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "2705843ad2a042c680fa4ac2b8c08962": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "29505693297245fb8a3bfd9ff64b6113": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "2aed0f1150db476ea1159d2d6e9adb00": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "30b6380bb88e4e07b086258c4d66a45b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "31116ceeae4d4f8b97ca45f9add694f9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "3e64137074c94d538e1e5ccbde13d512": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "45bd4de33c474a67b78138228c65f495": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_30b6380bb88e4e07b086258c4d66a45b", + "style": "IPY_MODEL_86f26d2b1f9440039fb4eca10a3dc988", + "value": " 1812/1812 [03:10<00:00, 30.80it/s]" + } + }, + "483ed794768f4856878163626d3db887": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "4942699b42c6475a905a906a747f5b03": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_29505693297245fb8a3bfd9ff64b6113", + "max": 8, + "style": "IPY_MODEL_25d3c4c6296f4a7aa6618dfaab857296", + "value": 8 + } + }, + "4d13eb5be5d7436b8faa3918aa989534": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + 
"528a3c212bc6409ca12174f650eee29b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "535c72cf644444bc9d586f0325495ef2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "54034804419b48919e49c872fc873b9a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "55f58f53c1f248c78726021622b278d5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_3e64137074c94d538e1e5ccbde13d512", + "style": "IPY_MODEL_54034804419b48919e49c872fc873b9a", + "value": "100%" + } + }, + "56e1f6044a60443282447e58f1f30c4a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "572046801e274f4c89e72c333a6d7504": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_528a3c212bc6409ca12174f650eee29b", + "max": 1812, + "style": "IPY_MODEL_59123326de9d4776a2991704c9aa124a", + "value": 1812 + } + }, + "59123326de9d4776a2991704c9aa124a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "5bc8b2140f4e4b5a8cd05f57fed7c058": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_4d13eb5be5d7436b8faa3918aa989534", + "style": "IPY_MODEL_95f71e0c12de4035a797f05d0571e897", + "value": " 1812/1812 [01:08<00:00, 28.06it/s]" + } + }, + "5c96125f4bee41f58a1dc8be8fdaae8d": 
{ + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_c1f4000e74804820a48a354d387aaa43", + "style": "IPY_MODEL_b048b225856047669208041317fda313", + "value": "100%" + } + }, + "5e4e49d791be4a22b480fdc8199eef6f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_55f58f53c1f248c78726021622b278d5", + "IPY_MODEL_0374c1d8757d4e1dacdcabb664a8308d", + "IPY_MODEL_cc3f8ee8b9304c19b593b8f85c218fb3" + ], + "layout": "IPY_MODEL_d67e6c0d449845ab9a35bf98d1f1c13b" + } + }, + "63afc70313664099a7d62babce51c3b8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "6586b8e007d243c6a45013e81639424a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_a420f54348b4484ba7c366b388948141", + "style": "IPY_MODEL_ac061fc1c1a848d68230326fd7948aa7", + "value": "100%" + } + }, + "67f50135b37543eb89f0e41789839a5f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_5c96125f4bee41f58a1dc8be8fdaae8d", + "IPY_MODEL_1923e2932e9d41e1a0ea2e87fad2d60b", + "IPY_MODEL_0b84dd267f3442c78853a030af4efeca" + ], + "layout": "IPY_MODEL_aa77f38bd867410facfc2a506958c73b" + } + }, + "72c7eecd602b4f9fa6f8e1533db44406": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "750c24a46eba489f88c427e6f8c4f54a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "756013cd23e743d1bf8903131753def2": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_1e8c99c057bc48b894bc9fe8c221ee54", + "IPY_MODEL_9a415ca6e85542dbb61774521f366791", + "IPY_MODEL_f970e8fac44b443dbb4535c99431dc96" + ], + "layout": "IPY_MODEL_839e18a9937d4eafb84b19bf8e2b4232" + } + }, + "773ea45822c94a62b21715ae834d5b50": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "7fbbc5f4442d4b2b89e6d9a252b5941b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_ceec393c4b3a47b386988a158c562f62", + "style": "IPY_MODEL_31116ceeae4d4f8b97ca45f9add694f9", + "value": " 207/207 [00:45<00:00, 3.55it/s]" + } + }, + "80b44682c9664773afa0a088caecab71": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "815c2326bea24297a25883220adc62e9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_6586b8e007d243c6a45013e81639424a", + "IPY_MODEL_b4b03c9a719344e780f579be1c7fdea2", + "IPY_MODEL_7fbbc5f4442d4b2b89e6d9a252b5941b" + ], + "layout": "IPY_MODEL_f0eb3584d41940dd848236cf38a8a8ce" + } + }, + "839e18a9937d4eafb84b19bf8e2b4232": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "86f26d2b1f9440039fb4eca10a3dc988": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "887da7bf7c90494eb0b24ac872ccb78e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + 
}, + "89f91f9f7db641c9a9e4d07ad54a556d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_92d2a9ccd789408f947f0f8072e46f0e", + "style": "IPY_MODEL_56e1f6044a60443282447e58f1f30c4a", + "value": "100%" + } + }, + "8a0d809dbb234822b6a916b65b2371a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_bdcb06328d9a429d9efa1460d745710d", + "style": "IPY_MODEL_95f5b61e62644d4aae27129d4e51beca", + "value": " 8/8 [00:01<00:00, 5.79it/s]" + } + }, + "8e2d37e64f4b4d0284e1c24b6c0cfff8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "92d2a9ccd789408f947f0f8072e46f0e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "94491b9204e44c48a9eec0cd69fe7098": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "94d26121e07e402fabe03ebe890b8be5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "95f5b61e62644d4aae27129d4e51beca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "95f71e0c12de4035a797f05d0571e897": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "96a2d9cadcd942a0ac8777b6708a6a8e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "986f01097a00441095f9becd2c64915a": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "999039649c5b43acba264830aa5de738": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "9a415ca6e85542dbb61774521f366791": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_13ecd51aefab4c6a83b847a43186ff2d", + "max": 8, + "style": "IPY_MODEL_daaec40b64c649cfbd8c33b122fc8231", + "value": 8 + } + }, + "a1eb2a953b8148c3ac6b7c0f18210f26": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_89f91f9f7db641c9a9e4d07ad54a556d", + "IPY_MODEL_060b8277577e40bc932193c864698485", + "IPY_MODEL_45bd4de33c474a67b78138228c65f495" + ], + "layout": "IPY_MODEL_999039649c5b43acba264830aa5de738" + } + }, + "a420f54348b4484ba7c366b388948141": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "aa77f38bd867410facfc2a506958c73b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "ac061fc1c1a848d68230326fd7948aa7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "addfd0c51a094222a934127dac51a548": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "b048b225856047669208041317fda313": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + 
"b3d2f95d407b4421be0cd760165c5425": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_170a0a1f73dc4158b0597431ec52f6ec", + "style": "IPY_MODEL_df446b59f3784f5598ee8861d2e6b838", + "value": "100%" + } + }, + "b4b03c9a719344e780f579be1c7fdea2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_8e2d37e64f4b4d0284e1c24b6c0cfff8", + "max": 207, + "style": "IPY_MODEL_72c7eecd602b4f9fa6f8e1533db44406", + "value": 207 + } + }, + "bc6e3e2ffcbb41789cd304dda1239fe6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "bdcb06328d9a429d9efa1460d745710d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "c1f4000e74804820a48a354d387aaa43": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "cc3f8ee8b9304c19b593b8f85c218fb3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_535c72cf644444bc9d586f0325495ef2", + "style": "IPY_MODEL_887da7bf7c90494eb0b24ac872ccb78e", + "value": " 362/362 [08:20<00:00, 1.10it/s]" + } + }, + "cec434b2a0ce459fb1bd21e00c4a46e1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_94d26121e07e402fabe03ebe890b8be5", + "style": "IPY_MODEL_773ea45822c94a62b21715ae834d5b50", + "value": "100%" + } + }, + "ceec393c4b3a47b386988a158c562f62": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + 
"state": {} + }, + "d3f1d09c1a7a467e9c8b2830b89f6271": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_b3d2f95d407b4421be0cd760165c5425", + "IPY_MODEL_572046801e274f4c89e72c333a6d7504", + "IPY_MODEL_5bc8b2140f4e4b5a8cd05f57fed7c058" + ], + "layout": "IPY_MODEL_96a2d9cadcd942a0ac8777b6708a6a8e" + } + }, + "d67e6c0d449845ab9a35bf98d1f1c13b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "daaec40b64c649cfbd8c33b122fc8231": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "ded88cc42c4b47f3b20f8b2f92800535": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "df446b59f3784f5598ee8861d2e6b838": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "f0eb3584d41940dd848236cf38a8a8ce": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "f5266bebb0eb49c9990e2af602fadb2f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_cec434b2a0ce459fb1bd21e00c4a46e1", + "IPY_MODEL_4942699b42c6475a905a906a747f5b03", + "IPY_MODEL_8a0d809dbb234822b6a916b65b2371a9" + ], + "layout": "IPY_MODEL_483ed794768f4856878163626d3db887" + } + }, + "f970e8fac44b443dbb4535c99431dc96": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_94491b9204e44c48a9eec0cd69fe7098", + 
"style": "IPY_MODEL_bc6e3e2ffcbb41789cd304dda1239fe6", + "value": " 8/8 [00:08<00:00, 1.22s/it]" + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/exposure/3-fill_missing_litpop_with_geg.ipynb b/notebooks/create-SLIIDERS-ECON/exposure/3-fill_missing_litpop_with_geg.ipynb new file mode 100644 index 0000000..5da4595 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/exposure/3-fill_missing_litpop_with_geg.ipynb @@ -0,0 +1,765 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Augment Missing Data in LitPop with Geg-15" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are some missing countries in LitPop. This notebook fills in those areas with Geg-15 and saves the grid as a single parquet file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import zipfile\n", + "from pathlib import Path\n", + "\n", + "import dask.dataframe as ddf\n", + "import geopandas as gpd\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import numpy_groupies as npg\n", + "import pandas as pd\n", + "import regionmask\n", + "import xarray as xr\n", + "import xesmf as xe\n", + "from cartopy import crs as ccrs\n", + "from cartopy import feature as cfeature\n", + "\n", + "import rhg_compute_tools.kubernetes as rhgk\n", + "from sliiders import __file__\n", + "from sliiders import settings as sset\n", + "from sliiders.spatial import get_iso_geometry, grid_ix_to_val, grid_val_to_ix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client, cluster = rhgk.get_micro_cluster()" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nworkers = 16\n", + "cluster.scale(nworkers)\n", + "cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sliiders_dir = Path(__file__).parent\n", + "zipf = zipfile.ZipFile(\"sliiders.zip\", \"w\", zipfile.ZIP_DEFLATED)\n", + "for root, dirs, files in os.walk(sliiders_dir):\n", + " for file in files:\n", + " zipf.write(\n", + " os.path.join(root, file),\n", + " os.path.relpath(os.path.join(root, file), os.path.join(sliiders_dir, \"..\")),\n", + " )\n", + "zipf.close()\n", + "client.upload_file(\"sliiders.zip\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "litpop = (\n", + " ddf.read_csv(\n", + " str(sset.PATH_LITPOP_RAW),\n", + " dtype={\"value\": \"float32\", \"lat\": \"float32\", \"lon\": \"float32\"},\n", + " )\n", + " .rename(columns={\"latitude\": \"lat\", \"longitude\": \"lon\"})\n", + " .repartition(npartitions=nworkers)\n", + " .persist()\n", + ")\n", + "litpop_meta = pd.read_csv(sset.DIR_LITPOP_RAW / \"_metadata_countries_v1_2.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Geodataframe for Countries with Missing LitPop Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "missing_countries = litpop_meta[litpop_meta[\"included\"] == 0].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "missing_countries = gpd.GeoDataFrame(\n", + " missing_countries,\n", + " geometry=get_iso_geometry(missing_countries[\"iso3\"].to_numpy()),\n", + ")\n", + "\n", + "missing_countries[\"iso3\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "region_id_to_iso = litpop_meta.set_index(\"region_id\")[[\"iso3\"]]\n", + "\n", + "litpop = litpop.join(region_id_to_iso, on=\"region_id\").persist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "geg15 = pd.read_parquet(sset.PATH_GEG15_INT, columns=[\"lon\", \"lat\", \"iso3\", \"tot_val\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "geg15[\"tot_val\"] = geg15[\"tot_val\"] * 1e6" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lp_iso3 = litpop[\"iso3\"].unique().compute()\n", + "geg_iso3 = geg15[\"iso3\"].unique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Helper Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# retrieve geg data to regrid\n", + "def subset_relevant_geg_data(poly, geg15, buffer=1 / 48):\n", + " # subset geg for buffered country poly bounds\n", + " geg15_sub = (\n", + " geg15[\n", + " (geg15.lon >= poly.bounds[0] - buffer)\n", + " & (geg15.lon <= poly.bounds[2] + buffer)\n", + " & (geg15.lat >= poly.bounds[1] - buffer)\n", + " & (geg15.lat <= poly.bounds[3] + buffer)\n", + " ][[\"lon\", \"lat\", \"tot_val\"]].reset_index(drop=True)\n", + " # .compute()\n", + " )\n", + "\n", + " if geg15_sub.shape[0] == 0:\n", + " return None\n", + "\n", + " subset = geg15_sub.set_index([\"lat\", \"lon\"]).to_xarray()\n", + "\n", + " subset[\"mask\"] = poly_mask(poly, subset)\n", + "\n", + " if subset.tot_val.where(subset.mask == 1).sum() <= 0:\n", + " return None\n", + "\n", + " return subset\n", + "\n", + "\n", + "def create_grid(subset, resolution, add_cell_corners=False):\n", + "\n", + " masked_lon = subset.lon.where((subset.mask > 0) & (subset.tot_val.notnull()))\n", + " masked_lat = 
subset.lat.where((subset.mask > 0) & (subset.tot_val.notnull()))\n", + "\n", + " # construct destination grid with mask holder variable\n", + " x1, y1 = np.floor((masked_lon.lon.min().item(), masked_lat.lat.min().item()))\n", + " x2, y2 = np.ceil((masked_lon.lon.max().item(), masked_lat.lat.max().item()))\n", + "\n", + " lat = np.arange(y1 + resolution / 2, y2, resolution)\n", + " lon = np.arange(x1 + resolution / 2, x2, resolution)\n", + "\n", + " ds_out = xr.Dataset(\n", + " coords={\n", + " \"lat\": lat,\n", + " \"lon\": lon,\n", + " }\n", + " )\n", + "\n", + " if add_cell_corners:\n", + " ds_out.coords[\"lat_b\"] = (ds_out.lat.min().item() - resolution / 2) + np.arange(\n", + " len(ds_out.lat) + 1\n", + " ) * resolution\n", + " ds_out.coords[\"lon_b\"] = (ds_out.lon.min().item() - resolution / 2) + np.arange(\n", + " len(ds_out.lon) + 1\n", + " ) * resolution\n", + "\n", + " return ds_out\n", + "\n", + "\n", + "def poly_mask(poly, grid):\n", + " mask_grid = grid.copy()\n", + " mask_grid[\"mask\"] = (\n", + " [\"lat\", \"lon\"],\n", + " np.full((len(mask_grid.lat), len(mask_grid.lon)), 1, np.int32),\n", + " )\n", + "\n", + " mask_grid = mask_grid.rio.set_spatial_dims(x_dim=\"lon\", y_dim=\"lat\", inplace=True)\n", + " mask_grid = mask_grid.rio.write_crs(\"epsg:4326\", inplace=True)\n", + "\n", + " clipped = mask_grid.rio.clip([poly], drop=False, all_touched=True)\n", + " clipped = (clipped == 1).astype(np.int32)\n", + "\n", + " return clipped.mask.dims, clipped.mask.values\n", + "\n", + "\n", + "def make_land_weights(subset, poly, out_resolution, in_resolution):\n", + "\n", + " print(\"Creating grids...\")\n", + " # create grid at out_resolution with grid cell edges at a whole lat and lon values\n", + " out_grid = create_grid(subset, resolution=out_resolution, add_cell_corners=True)\n", + "\n", + " # create grid at in_resolution with grid cell edges at a whole lat and lon values\n", + " in_grid = create_grid(subset, resolution=in_resolution, 
add_cell_corners=True)\n", + "\n", + " # create grid cell id for in_grid\n", + " in_grid[\"id5x\"] = (\n", + " [\"lat\", \"lon\"],\n", + " np.arange(in_grid.lat.shape[0] * in_grid.lon.shape[0]).reshape(\n", + " (in_grid.lat.shape[0], in_grid.lon.shape[0])\n", + " ),\n", + " )\n", + "\n", + " # apply in_grid grid cell id to out_grid cells\n", + " out_grid[\"idx5\"] = in_grid.reindex_like(\n", + " out_grid, method=\"nearest\", tolerance=in_resolution / 2\n", + " ).id5x\n", + "\n", + " print(\"Creating land mask...\")\n", + " out_grid[\"mask\"] = (\n", + " regionmask.Regions([poly], numbers=[1])\n", + " .mask(out_grid.lon.values, out_grid.lat.values)\n", + " .fillna(0)\n", + " )\n", + "\n", + " print(\"Constructing land weights...\")\n", + " in_grid[\"land_weights\"] = (\n", + " [\"lat\", \"lon\"],\n", + " npg.aggregate(\n", + " group_idx=out_grid.idx5.values.flatten(),\n", + " a=out_grid.mask.values.flatten(),\n", + " fill_value=0,\n", + " func=\"sum\",\n", + " ).reshape(in_grid.id5x.shape)\n", + " / ((in_resolution / out_resolution) ** 2),\n", + " )\n", + "\n", + " return in_grid\n", + "\n", + "\n", + "def prep_geg_for_regrid(\n", + " poly, geg15, geg_res=sset.GEG_GRID_WIDTH, litpop_res=sset.LITPOP_GRID_WIDTH\n", + "):\n", + "\n", + " # get relevant geg data given poly of interest\n", + " subset = subset_relevant_geg_data(poly, geg15, geg_res / 2)\n", + " if subset is None:\n", + " return None\n", + "\n", + " # construct land weights\n", + " weights = make_land_weights(subset, poly, litpop_res, geg_res)\n", + "\n", + " # add corners for conservative regrid\n", + " subset.coords[\"lat_b\"] = (subset.lat.min().item() - geg_res / 2) + np.arange(\n", + " len(subset.lat) + 1\n", + " ) * geg_res\n", + " subset.coords[\"lon_b\"] = (subset.lon.min().item() - geg_res / 2) + np.arange(\n", + " len(subset.lon) + 1\n", + " ) * geg_res\n", + "\n", + " # regrid landweights onto geg grid\n", + " regridder = xe.Regridder(weights, subset, \"conservative\")\n", + " 
land_weights_regrid = regridder(weights)\n", + "\n", + " # normalize using amount of land per cell\n", + " weights = geg_res**2\n", + " subset[\"tot_val_norm\"] = (\n", + " subset.tot_val.where(land_weights_regrid.land_weights > 0) / weights\n", + " )\n", + "\n", + " # drop out if all null data --> no asset value on relevant land\n", + " if (\n", + " subset.tot_val_norm.where((subset.mask > 0) & subset.tot_val_norm.notnull())\n", + " .notnull()\n", + " .sum()\n", + " == 0\n", + " ):\n", + " return None\n", + "\n", + " return subset\n", + "\n", + "\n", + "def regrid_geg(\n", + " poly, geg15, geg_res=sset.GEG_GRID_WIDTH, litpop_res=sset.LITPOP_GRID_WIDTH\n", + "):\n", + "\n", + " geg_sub = prep_geg_for_regrid(poly, geg15, geg_res, litpop_res)\n", + "\n", + " if geg_sub is None:\n", + " return None\n", + "\n", + " out_grid = create_grid(geg_sub, resolution=litpop_res)\n", + "\n", + " regridder = xe.Regridder(geg_sub, out_grid, \"nearest_s2d\")\n", + "\n", + " geg_regridded = regridder(geg_sub)\n", + "\n", + " mask_dims, mask = poly_mask(poly, geg_regridded[[\"lat\", \"lon\"]])\n", + " geg_regridded[\"tot_val\"] = (geg_regridded.tot_val_norm * (litpop_res**2)).where(\n", + " mask == 1\n", + " )\n", + "\n", + " return geg_regridded" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Regrid GEG for Missing Countries in LitPop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out_dict = {}\n", + "for territory in sset.ISOS_IN_GEG_NOT_LITPOP:\n", + " print(territory)\n", + " territory_shape = (\n", + " missing_countries[missing_countries[\"iso3\"] == territory].iloc[0].geometry\n", + " )\n", + " out_dict[territory] = regrid_geg(territory_shape, geg15)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check Regridding Looks Good" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def 
plot_exposure(ax, title, data, poly, vmin=None, vmax=None):\n", + " ax.set_extent(\n", + " [poly.bounds[0] - 1, poly.bounds[2] + 1, poly.bounds[1] - 1, poly.bounds[3] + 1]\n", + " )\n", + " ax.coastlines(\"10m\", linewidth=0.5, edgecolor=\"tab:orange\")\n", + "\n", + " adm0 = cfeature.NaturalEarthFeature(\n", + " category=\"cultural\",\n", + " name=\"admin_0_boundary_lines_land\",\n", + " scale=\"10m\",\n", + " facecolor=\"none\",\n", + " )\n", + "\n", + " ax.add_feature(adm0, edgecolor=\"tab:orange\", linewidth=0.1)\n", + "\n", + " data.where(data > 0.0000001).plot(\n", + " cmap=\"YlGnBu\",\n", + " norm=matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax),\n", + " ax=ax,\n", + " cbar_kwargs={\"shrink\": 0.5, \"label\": \"\"},\n", + " )\n", + "\n", + " ax.add_geometries(\n", + " [poly], ccrs.PlateCarree(), facecolor=\"none\", edgecolor=\"r\", linewidth=0.3\n", + " )\n", + " ax.set_title(title)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How does the regridding look?\n", + "%matplotlib inline\n", + "\n", + "plot_dict = out_dict\n", + "\n", + "pc_transform = ccrs.PlateCarree()\n", + "fig, axs = plt.subplots(\n", + " figsize=((3 * 3), (3 * 4)),\n", + " dpi=500,\n", + " ncols=3,\n", + " nrows=3,\n", + " subplot_kw={\"projection\": pc_transform},\n", + ")\n", + "\n", + "axs = axs.flatten()\n", + "for ax, tup in zip(axs, plot_dict.items()):\n", + " iso = tup[0]\n", + " out = tup[1]\n", + " row = missing_countries[missing_countries.iso3 == iso].iloc[0]\n", + " poly = row.geometry\n", + " plot_exposure(ax, iso, out[\"tot_val\"], poly)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add Regridded Data into LitPop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# swap from value to integer indexing\n", + "litpop[\"lat\"] = litpop.lat.map_partitions(\n", + " grid_val_to_ix, cell_size=sset.LITPOP_GRID_WIDTH\n", 
+ ")\n", + "litpop[\"lon\"] = litpop.lon.map_partitions(\n", + " grid_val_to_ix, cell_size=sset.LITPOP_GRID_WIDTH\n", + ")\n", + "litpop = litpop.persist()\n", + "\n", + "litpop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add geg data into litpop dask dataframe\n", + "for iso, _add in out_dict.items():\n", + " print(iso)\n", + " add = _add.copy()\n", + " add.coords[\"lat\"] = grid_val_to_ix(add.lat.values, sset.LITPOP_GRID_WIDTH)\n", + " add.coords[\"lon\"] = grid_val_to_ix(add.lon.values, sset.LITPOP_GRID_WIDTH)\n", + "\n", + " litpop_sub = litpop[\n", + " (litpop.lon >= add.lon.min().item())\n", + " & (litpop.lon <= add.lon.max().item())\n", + " & (litpop.lat >= add.lat.min().item())\n", + " & (litpop.lat <= add.lat.max().item())\n", + " ].compute()\n", + "\n", + " # Mask out all MAR values below the MAR-ESH border (this border is defined by its latitude)\n", + " if iso == \"ESH\":\n", + " litpop_sub = litpop_sub.loc[\n", + " ~(litpop_sub[\"iso3\"] == \"MAR\")\n", + " | ~(litpop_sub[\"lat\"] <= get_iso_geometry(\"ESH\").bounds[3])\n", + " ].copy()\n", + "\n", + " litpop_sub = litpop_sub.set_index([\"lat\", \"lon\"]).to_xarray()\n", + "\n", + " add = add.rename({\"tot_val\": \"value\"})\n", + "\n", + " add[\"iso3\"] = ([\"lat\", \"lon\"], np.where((~np.isnan(add[\"value\"])), iso, None))\n", + "\n", + " litpop_sub[\"new_iso3\"] = add[\"iso3\"]\n", + " litpop_sub[\"iso3\"] = xr.where(\n", + " litpop_sub[\"new_iso3\"].isnull(), litpop_sub[\"iso3\"], litpop_sub[\"new_iso3\"]\n", + " )\n", + " litpop_sub[\"new_value\"] = add[\"value\"]\n", + " litpop_sub[\"value\"] = xr.where(\n", + " litpop_sub[\"new_value\"].isnull(), litpop_sub[\"value\"], litpop_sub[\"new_value\"]\n", + " )\n", + "\n", + " mmed = xr.merge([litpop_sub[[\"value\", \"iso3\"]], add[[\"value\", \"iso3\"]]])\n", + "\n", + " litpop_m_sub = litpop[\n", + " ~(\n", + " (litpop.lon >= add.lon.min().item())\n", + " & (litpop.lon <= 
add.lon.max().item())\n", + " & (litpop.lat >= add.lat.min().item())\n", + " & (litpop.lat <= add.lat.max().item())\n", + " )\n", + " ]\n", + "\n", + " to_append = mmed[[\"value\", \"iso3\"]].to_dataframe().dropna().reset_index()\n", + "\n", + " # TODO figure out what's going on here--sometimes index isn't automatically named by `to_dataframe()`\n", + " to_append = to_append.rename(columns={\"level_0\": \"lat\", \"level_1\": \"lon\"})\n", + " litpop = litpop_m_sub.append(to_append).persist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# prep vars for saving\n", + "litpop[\"y_ix\"] = litpop[\"lat\"].astype(np.int16)\n", + "litpop[\"x_ix\"] = litpop[\"lon\"].astype(np.int16)\n", + "litpop[\"value\"] = litpop[\"value\"].astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "litpop = litpop.persist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out_iso3 = litpop[\"iso3\"].unique().compute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "litpop = litpop[[\"y_ix\", \"x_ix\", \"value\"]]\n", + "litpop = litpop[litpop[\"value\"] > 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_litpop = litpop.compute()\n", + "\n", + "df_litpop = df_litpop.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_litpop[\"value\"] = df_litpop[\"value\"].astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_litpop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"sset.PATH_EXPOSURE_BLENDED.parent.mkdir(exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_litpop.to_parquet(\n", + " sset.PATH_EXPOSURE_BLENDED,\n", + " index=False,\n", + " compression=None,\n", + " engine=\"fastparquet\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check To Make Sure GEG Additions Look Good" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "litpop_int = pd.read_parquet(sset.PATH_EXPOSURE_BLENDED)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "litpop_int[\"lat\"] = grid_ix_to_val(litpop_int.y_ix, cell_size=sset.LITPOP_GRID_WIDTH)\n", + "litpop_int[\"lon\"] = grid_ix_to_val(litpop_int.x_ix, cell_size=sset.LITPOP_GRID_WIDTH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How does the regridding look?\n", + "%matplotlib inline\n", + "\n", + "plot_dict = out_dict\n", + "\n", + "pc_transform = ccrs.PlateCarree()\n", + "fig, axs = plt.subplots(\n", + " figsize=((3 * 3), (3 * 4)),\n", + " dpi=500,\n", + " ncols=3,\n", + " nrows=4,\n", + " subplot_kw={\"projection\": pc_transform},\n", + ")\n", + "\n", + "axs = axs.flatten()\n", + "for ax, tup in zip(axs, plot_dict.items()):\n", + " iso = tup[0]\n", + " add = tup[1]\n", + " row = missing_countries[missing_countries.iso3 == iso].iloc[0]\n", + " poly = row.geometry\n", + "\n", + " litpop_sub = (\n", + " litpop_int[\n", + " (litpop_int.lon >= add.lon.min().item() - 1)\n", + " & (litpop_int.lon <= add.lon.max().item() + 
1)\n", + " & (litpop_int.lat >= add.lat.min().item() - 1)\n", + " & (litpop_int.lat <= add.lat.max().item() + 1)\n", + " ]\n", + " .set_index([\"lat\", \"lon\"])\n", + " .to_xarray()\n", + " )\n", + "\n", + " plot_exposure(ax, row.country_name, litpop_sub.value, poly)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/create-SLIIDERS-ECON/exposure/4-vectorize-wetlands.ipynb b/notebooks/create-SLIIDERS-ECON/exposure/4-vectorize-wetlands.ipynb new file mode 100644 index 0000000..671ca63 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/exposure/4-vectorize-wetlands.ipynb @@ -0,0 +1,440 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a5a2b329-7283-4e28-bafb-c60698866336", + "metadata": {}, + "source": [ + "# Combine wetlands from wetlands and mangroves datasets into single shapefile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f7fe3a3-32d9-4d8b-bef8-dd6a3d863622", + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "import warnings\n", + "\n", + "import geopandas as gpd\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pygeos\n", + "import xarray as xr\n", + "from rhg_compute_tools import kubernetes as rhgk\n", + "\n", + "from sliiders import settings as sset\n", + "\n", + "warnings.filterwarnings(\"ignore\", message=\".*initial implementation of Parquet.*\")\n", + "\n", + "\n", + "PATH_GLOBCOVER_NC = sset.PATH_GLOBCOVER_2009.parent / (\n", + " 
sset.PATH_GLOBCOVER_2009.stem + \".nc\"\n", + ")\n", + "PATH_GLOBCOVER_SHP = sset.PATH_GLOBCOVER_2009.parent / (\n", + " sset.PATH_GLOBCOVER_2009.stem + \".shp\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0c5f49ce-57b8-4023-9654-553d34a3efde", + "metadata": {}, + "source": [ + "## 1. Get wetland cover from `GLOBCOVER` as `.nc`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c96a5fa-05a1-4fc2-998f-2b3ee3b4133d", + "metadata": {}, + "outputs": [], + "source": [ + "client, cluster = rhgk.get_micro_cluster()\n", + "\n", + "cluster.scale(8)\n", + "cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2375d146-3eda-4ffb-86a0-a5f2dad7638f", + "metadata": {}, + "outputs": [], + "source": [ + "da = xr.open_rasterio(sset.PATH_GLOBCOVER_2009, chunks={\"x\": 32400, \"y\": 27900})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b86684e-1b9f-4c95-92c7-73c97e31deb0", + "metadata": {}, + "outputs": [], + "source": [ + "da = da.persist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59e98b1b-aa41-4cce-9848-a16fc5a86540", + "metadata": {}, + "outputs": [], + "source": [ + "da = da.squeeze(\"band\").drop(\"band\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "627b033f-6152-4c81-902b-1d6f6837689c", + "metadata": {}, + "outputs": [], + "source": [ + "da = da.isin([160, 170, 180]).persist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be81f6f8-6f32-4130-a7ba-a639dac9e8cc", + "metadata": {}, + "outputs": [], + "source": [ + "da = da.to_dataset(name=\"wetlands\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea6a0f3f-32ad-4312-8344-9f8cb72d8800", + "metadata": {}, + "outputs": [], + "source": [ + "da = da.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d95a75ac-2ac1-424d-b3c0-96e655d10d73", + "metadata": {}, + "outputs": [], + "source": [ + 
"da.to_netcdf(PATH_GLOBCOVER_NC)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "483792a6-1025-4027-98f5-0bf2ae9dfc6a", + "metadata": {}, + "outputs": [], + "source": [ + "client.close()\n", + "cluster.close()" + ] + }, + { + "cell_type": "markdown", + "id": "72278bb5-64ec-4db8-aea3-66a642c1e77a", + "metadata": {}, + "source": [ + "## 2. Vectorize\n", + "### Run `gdal_polygonize.py` in shell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e3396e0-2046-464a-af5a-7b352b36e539", + "metadata": {}, + "outputs": [], + "source": [ + "cmd = f\"gdal_polygonize.py {str(PATH_GLOBCOVER_NC)} {str(PATH_GLOBCOVER_SHP)}\"\n", + "\n", + "sp = subprocess.Popen(cmd, shell=True)\n", + "rc = sp.wait()\n", + "\n", + "print(sp)" + ] + }, + { + "cell_type": "markdown", + "id": "0cb8dc3b-542b-4625-bdf7-d36b6cc03e6e", + "metadata": {}, + "source": [ + "## 3. Clean up shapefile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df08716c-21ad-4753-8ba9-7a2852b1009e", + "metadata": {}, + "outputs": [], + "source": [ + "gdf = gpd.read_file(PATH_GLOBCOVER_SHP)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e398a721-99c6-43a5-960c-21ee3465dc1b", + "metadata": {}, + "outputs": [], + "source": [ + "gdf = gdf[gdf[\"DN\"] == 1].drop(columns=[\"DN\"])\n", + "gdf = gdf.explode().reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c287dc36-b34d-488c-a54c-72fc447fd3c3", + "metadata": {}, + "outputs": [], + "source": [ + "gdf.to_file(PATH_GLOBCOVER_SHP)" + ] + }, + { + "cell_type": "markdown", + "id": "27ffb99e-5f35-4cc3-884b-41bee68b5ac7", + "metadata": {}, + "source": [ + "## 4. 
Combine with mangroves" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "328f8af0-f3e7-4030-80b1-a905544fb7ec", + "metadata": {}, + "outputs": [], + "source": [ + "globcover_mask = gpd.read_file(PATH_GLOBCOVER_SHP)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29339179-1b73-41ec-ac91-2c7bc0162d58", + "metadata": {}, + "outputs": [], + "source": [ + "mangroves = gpd.read_file(sset.PATH_GLOBAL_MANGROVES)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "778e9038-725b-4a10-8612-eca89f4c7a09", + "metadata": {}, + "outputs": [], + "source": [ + "globcover_mask[\"geometry\"] = pygeos.to_shapely(\n", + " pygeos.make_valid(pygeos.from_shapely(globcover_mask[\"geometry\"]))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb1862d9-55c8-4d54-a3d9-2920856698ea", + "metadata": {}, + "outputs": [], + "source": [ + "combined = gpd.sjoin(globcover_mask, mangroves, how=\"left\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97d71975-6308-4c10-883c-a4df8128a2e8", + "metadata": {}, + "outputs": [], + "source": [ + "mangroves = mangroves.reset_index(drop=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad696a47-aa48-4057-ab74-4375f6753438", + "metadata": {}, + "outputs": [], + "source": [ + "mangroves[\"in_combined\"] = mangroves[\"index\"].isin(combined[\"index_right\"].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8a8e7d5-e4a9-48fe-a60a-f2953f336d28", + "metadata": {}, + "outputs": [], + "source": [ + "globcover_only = combined[combined[\"index_right\"].isnull()].copy()\n", + "combined = combined[combined[\"index_right\"].notnull()].copy()\n", + "\n", + "combined = combined.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf8e39a3-2ea5-4c43-85db-89dcd0429a65", + "metadata": {}, + "outputs": [], + "source": [ + "combined[\"index_right\"] 
= combined[\"index_right\"].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de6cad78-4671-4ecd-a9e8-a631d04c5ae8", + "metadata": {}, + "outputs": [], + "source": [ + "combined[\"geometry_right\"] = gpd.GeoSeries(\n", + " np.take(mangroves[\"geometry\"].to_numpy(), combined[\"index_right\"].to_numpy())\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19010a01-c54d-42bc-8410-2e8d1202a682", + "metadata": {}, + "outputs": [], + "source": [ + "combined = combined.set_geometry(\"geometry_right\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6640f34-4beb-49e9-b5e0-3b25651ddf6a", + "metadata": {}, + "outputs": [], + "source": [ + "combined = combined.dissolve(\"FID\").reset_index(drop=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f37b290-5556-4e19-928c-2093ffd5e0fd", + "metadata": {}, + "outputs": [], + "source": [ + "combined[\"geometry\"] = combined[\"geometry\"].difference(combined[\"geometry_right\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3eb10b51-e7a0-4bb2-9e1a-ee242ed4d1a4", + "metadata": {}, + "outputs": [], + "source": [ + "combined = combined.set_geometry(\"geometry\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ea52f0d-8d05-4733-aff6-a6df428e3a92", + "metadata": {}, + "outputs": [], + "source": [ + "combined = pd.concat([combined, mangroves, globcover_only], ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61e24048-b516-4721-b04a-e07737e6c8bc", + "metadata": {}, + "outputs": [], + "source": [ + "combined = combined[[\"geometry\"]].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24f52be2-46d1-4850-b666-a0e14f2e76c8", + "metadata": {}, + "outputs": [], + "source": [ + "combined.iloc[:20].plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"0520bb8b-c229-46ce-abe2-30f5920656ec", + "metadata": {}, + "outputs": [], + "source": [ + "combined = combined.explode().reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f949b9e-a537-4126-b740-ebcc0213ca37", + "metadata": {}, + "outputs": [], + "source": [ + "combined = combined[combined.geometry.area > 0].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66bc8acb-32e5-4624-a053-95243a33680d", + "metadata": {}, + "outputs": [], + "source": [ + "combined.to_file(sset.PATH_WETLANDS_INT)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/exposure/5-get_positive_elev_tiles.ipynb b/notebooks/create-SLIIDERS-ECON/exposure/5-get_positive_elev_tiles.ipynb new file mode 100644 index 0000000..2f09ec1 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/exposure/5-get_positive_elev_tiles.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Define lists of tiles to be processed in the gridded exposure step\n", + "- `WITHELEV`: Include information on exposure, elevation, and other boundaries.\n", + "- `WITHOUTELEV`: Include information on exposure and other boundaries, but not elevation.\n", + "- `CIAM`: Include information on elevation and other boundaries, but not exposure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"import geopandas as gpd\n", + "import numpy as np\n", + "import pandas as pd\n", + "import rhg_compute_tools.kubernetes as rhgk\n", + "import xarray as xr\n", + "from sliiders import settings as sset\n", + "from sliiders import spatial" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nworkers = 16\n", + "\n", + "client, cluster = rhgk.get_micro_cluster()\n", + "\n", + "cluster.scale(nworkers)\n", + "cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lat_size = 43200\n", + "lon_size = 86400\n", + "\n", + "lats_per_deg, lons_per_deg = int(lat_size / 180), int(lon_size / 360)\n", + "\n", + "lon_chunk = int(lon_size / nworkers)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bdem = xr.open_dataarray(sset.PATH_SRTM15_PLUS, chunks={\"lat\": lats_per_deg}).persist()\n", + "\n", + "bdem_max = (\n", + " bdem.coarsen(lat=lats_per_deg).max().coarsen(lon=lons_per_deg).max().compute()\n", + ")\n", + "\n", + "bdem_min = (\n", + " bdem.coarsen(lat=lats_per_deg).min().coarsen(lon=lons_per_deg).min().compute()\n", + ")\n", + "\n", + "bdem_max.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bdem_min.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Double-check that the grid's spacing is regular over 1-degree tiles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert len(np.unique(np.floor(bdem.lat.values[:lats_per_deg]))) == 1\n", + "assert len(np.unique(np.floor(bdem.lon.values[:lons_per_deg]))) == 1\n", + "\n", + "assert (np.floor(bdem.lat.values)[::lats_per_deg] == np.arange(-90, 90)).sum() == 180\n", + "assert (np.floor(bdem.lon.values)[::lons_per_deg] == np.arange(-180, 180)).sum() == 360" + ] + 
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Shut down cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.close()\n", + "cluster.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Organize tiles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "max_tiles = bdem_max.to_dataframe(name=\"max\").reset_index()\n", + "min_tiles = bdem_min.to_dataframe(name=\"min\").reset_index()\n", + "\n", + "tiles = pd.merge(max_tiles, min_tiles, on=[\"lat\", \"lon\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Determine whether each tile meets certain criteria, which will be used to define categories" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prepare tiles and category sets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tiles[\"tile_name\"] = spatial.get_tile_names(tiles, \"lon\", \"lat\")\n", + "\n", + "tiles = gpd.GeoDataFrame(tiles, geometry=tiles.apply(spatial.get_degree_box, axis=1))\n", + "\n", + "tiles[\"llat\"] = np.floor(tiles[\"lat\"])\n", + "tiles[\"llon\"] = np.floor(tiles[\"lon\"])\n", + "\n", + "assets = pd.read_parquet(sset.PATH_EXPOSURE_BLENDED, columns=[\"x_ix\", \"y_ix\"]).values.T\n", + "exp_tiles = spatial.get_all_exp_tiles(*assets)\n", + "\n", + "pop = pd.read_parquet(sset.PATH_LANDSCAN_INT)\n", + "pop = pop.loc[pop.population > 0, [\"x_ix\", \"y_ix\"]].values.T\n", + "pop_tiles = spatial.get_all_exp_tiles(*pop)\n", + "\n", + "coastaldem_tiles = [t.stem for t in sset.DIR_COASTALDEM.glob(\"*.tif\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Apply category logic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Tile is 
included in CoastalDEM\n", + "tiles[\"coastaldem\"] = tiles[\"tile_name\"].isin(coastaldem_tiles)\n", + "\n", + "# Tile has non-0 asset-value\n", + "tiles[\"exp\"] = tiles[\"tile_name\"].isin(exp_tiles)\n", + "\n", + "# Tile has non-0 population\n", + "tiles[\"pop\"] = tiles[\"tile_name\"].isin(pop_tiles)\n", + "\n", + "# Tile is below the 60th parallel south (governed under the Antarctic Treaty System)\n", + "tiles[\"antarctica\"] = tiles[\"lat\"] < -60\n", + "\n", + "# Tile includes elevations below 50 meters\n", + "tiles[\"below50\"] = tiles[\"min\"] <= 50\n", + "\n", + "# Tile includes elevations above -50 meters\n", + "tiles[\"above_neg50\"] = tiles[\"max\"] >= -50" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save list of low-lying tiles that are not contiguous with the ocean (\"inland\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ocean_shape = tiles[tiles[\"below50\"]].buffer(0.01).unary_union\n", + "\n", + "ocean_shape = list(ocean_shape.geoms)[np.argmax([g.area for g in ocean_shape.geoms])]\n", + "\n", + "tiles[\"contiguous_with_ocean\"] = tiles[\"geometry\"].within(ocean_shape)\n", + "\n", + "tiles[tiles[\"contiguous_with_ocean\"]].plot(figsize=(20, 20))\n", + "\n", + "inland = (\n", + " tiles[(tiles[\"coastaldem\"]) & (~tiles[\"contiguous_with_ocean\"])][[\"tile_name\"]]\n", + " .sort_values(\"tile_name\")\n", + " .reset_index(drop=True)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Categorize tiles based on whether they are relevant to each group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tiles = tiles[~tiles[\"tile_name\"].isin(inland[\"tile_name\"].to_numpy())].reset_index(\n", + " drop=True\n", + ")\n", + "\n", + "tiles[\"WITHELEV\"] = (tiles[\"below50\"] | tiles[\"coastaldem\"]) & tiles[\"exp\"]\n", + "tiles[\"WITHOUTELEV\"] = 
tiles[\"exp\"] & (~tiles[\"WITHELEV\"])\n", + "tiles[\"CIAM\"] = (\n", + " (tiles[\"above_neg50\"])\n", + " & (tiles[\"below50\"] | tiles[\"coastaldem\"])\n", + " & (~tiles[\"antarctica\"])\n", + " & (~tiles[\"exp\"])\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot tile categories" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_color(tile):\n", + " if tile[\"WITHELEV\"]:\n", + " return \"purple\"\n", + " if tile[\"WITHOUTELEV\"]:\n", + " return \"green\"\n", + " if tile[\"CIAM\"]:\n", + " return \"orange\"\n", + " return \"blue\"\n", + "\n", + "\n", + "tiles[\"color\"] = tiles.apply(get_color, axis=1)\n", + "tiles.plot(color=tiles[\"color\"], figsize=(20, 20))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transform booleans into categories" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tiles[\"PROCESSING_SET\"] = np.where(\n", + " tiles[\"WITHELEV\"],\n", + " \"WITHELEV\",\n", + " np.where(\n", + " tiles[\"WITHOUTELEV\"], \"WITHOUTELEV\", np.where(tiles[\"CIAM\"], \"CIAM\", None)\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save lists" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out = tiles[[\"tile_name\", \"PROCESSING_SET\"]]\n", + "\n", + "out = out[pd.notnull(out[\"PROCESSING_SET\"])].reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out.to_parquet(sset.PATH_EXPOSURE_TILE_LIST, index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/create-SLIIDERS-ECON/exposure/6-generate_datum_conversion_grid.ipynb b/notebooks/create-SLIIDERS-ECON/exposure/6-generate_datum_conversion_grid.ipynb new file mode 100644 index 0000000..c3653c9 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/exposure/6-generate_datum_conversion_grid.ipynb @@ -0,0 +1,320 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3bfe2898-b80a-4d0c-911a-a90c9fdb1f9c", + "metadata": {}, + "source": [ + "## Generate Datum Conversion Grid\n", + "\n", + "This notebook cleans and aggregates the downloaded XGM2019e and EGM96 geoid data, each of which is split up into quarters of the full grid. It also interpolates (or converts) the mean dynamic topography (MDT) data onto the grid structure used by the XGM2019e and EGM96 data. Finally, the geoid heights and MDT information are combined into one dataset (in `xarray.Dataset` format), and the final dataset is exported as a `.zarr` store.\n", + "\n", + "See `notebooks/create-SLIIDERS-ECON/download-sliiders-econ-input-data.ipynb` for more details related to downloading the raw data used in this notebook."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e04fc3fb-7cb0-40a2-97cc-9166263da7aa", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff84c62b-d1c6-41ca-91ef-ed5d6691f633", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xarray as xr\n", + "from cartopy import crs as ccrs\n", + "from sliiders import settings as sset\n", + "from sliiders import gcs\n", + "from pyinterp.backends.xarray import Grid2D\n", + "from pyinterp.fill import gauss_seidel\n", + "from pathlib import Path\n", + "\n", + "from sliiders.io import read_gdf\n", + "from sliiders.spatial import interpolate_da_like" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49656d02-f289-4682-8ea4-4d40a353d013", + "metadata": {}, + "outputs": [], + "source": [ + "def read_gdf_grids(in_dir):\n", + " \"\"\"Combining all `.gdf` files in the directory `in_dir` to form an\n", + " `xarray.DataArray` over a single grid structure; also, for longitude systems where\n", + " values run from 0 to 360 (instead of -180 to 180), cleans so that longitudes run\n", + " from -180 to 180.\n", + "\n", + " Parameters\n", + " ----------\n", + " in_dir : pathlib.Path-like or str\n", + " path that contains all the relevant `.gdf` files that, together, form one grid\n", + "\n", + " Returns\n", + " -------\n", + " xarray.DataArray\n", + " containing data on `z` (as specified by the helper function `read_gdf`) over\n", + " all the grid points\n", + "\n", + " \"\"\"\n", + " grid = [read_gdf(d) for d in in_dir.glob(\"*\")]\n", + "\n", + " # hack b/c as of xarray 0.18.0, combine_by_coords seems to be throwing weird error\n", + " lats = np.sort(np.unique([g.lat for g in grid]))\n", + " lons = np.sort(np.unique([g.lon for g in grid]))\n", + " out = xr.DataArray(coords={\"lat\": 
lats, \"lon\": lons}, dims=[\"lat\", \"lon\"], name=\"z\")\n", + " for g in grid:\n", + " out = out.fillna(g)\n", + "\n", + " assert np.allclose(out.sel(lon=0), out.sel(lon=360))\n", + " out = out.drop_sel(lon=360)\n", + " out[\"lon\"] = out.lon.where(out.lon <= 180, out.lon - 360)\n", + "\n", + " return out.sortby([\"lon\", \"lat\"])" + ] + }, + { + "cell_type": "markdown", + "id": "1816bc2f-01c7-4d21-8f45-1d1369d423f3", + "metadata": {}, + "source": [ + "## Load XGM2019e data and EGM96 data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a4178f9-b88d-4a86-bd92-469a23ca8eaf", + "metadata": {}, + "outputs": [], + "source": [ + "xgm_wgs = read_gdf_grids(sset.DIR_GEOG_DATUMS_XGM2019e_WGS84)\n", + "egm_wgs = read_gdf_grids(sset.DIR_GEOG_DATUMS_EGM96_WGS84)" + ] + }, + { + "cell_type": "markdown", + "id": "03d31c49-1bf0-4905-a6cc-bd387a96fd04", + "metadata": {}, + "source": [ + "## Load and interpolate mean dynamic topography (MDT) data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59514b9c-efd8-4099-a627-aae45b8d7553", + "metadata": {}, + "outputs": [], + "source": [ + "# loading raw MDT\n", + "mdt = xr.open_dataset(sset.PATH_GEOG_MDT_RAW).mdt.isel(time=0, drop=True).load()\n", + "\n", + "# cleaning the longitudes above 180 degrees; then sort, rename longitude and latitude\n", + "mdt[\"longitude\"] = mdt.longitude.where(mdt.longitude < 180, mdt.longitude - 360)\n", + "mdt = mdt.sortby([\"latitude\", \"longitude\"]).rename(longitude=\"lon\", latitude=\"lat\")\n", + "\n", + "# fill the nans by Gauss-Seidel method; make sure it is converged\n", + "converged, filled = gauss_seidel(Grid2D(mdt))\n", + "assert converged\n", + "mdt_filled = mdt.copy()\n", + "mdt_filled[:] = filled.T\n", + "\n", + "# interpolate\n", + "mdt_filled = interpolate_da_like(mdt_filled, xgm_wgs)\n", + "mdt = interpolate_da_like(mdt, xgm_wgs)" + ] + }, + { + "cell_type": "markdown", + "id": "f68b7ccd-af77-4c7f-b510-252a601263bb", + "metadata": {}, + 
"source": [ + "## Put all conversions together" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "480da8d3-fff3-4854-976e-7f844c51641d", + "metadata": {}, + "outputs": [], + "source": [ + "out = xr.Dataset(\n", + " {\n", + " \"mdt\": mdt_filled,\n", + " \"egm96_xgm2019e\": egm_wgs - xgm_wgs,\n", + " \"egm96_wgs84\": egm_wgs,\n", + " \"xgm2019e_wgs84\": xgm_wgs,\n", + " \"mdt_interpolated\": mdt.isnull(),\n", + " }\n", + ")\n", + "out = out.dropna(\"lat\", how=\"any\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32e5d306-7670-4eda-a96d-e4f02704c49c", + "metadata": {}, + "outputs": [], + "source": [ + "# variable attrs\n", + "out.mdt.attrs.update(\n", + " {\n", + " \"long_name\": \"MDT 1993-2019\",\n", + " \"description\": (\n", + " \"Mean dynamic ocean topography over the period 1993-2019, as calculated by \"\n", + " \"AVISO altimetry (see \"\n", + " \"https://www.aviso.altimetry.fr/en/data/products/auxiliary-products/mdt.html\"\n", + " \"). This data is extrapolated over land using a Gauss-Seidel algorithm, \"\n", + " \"implemented by pyinterp. It's then interpolated to a consistent grid. The \"\n", + " \"MDT value should be thought of as relative to a theoretical geoid, rather \"\n", + " \"than any one particular geoid model (using a particular geoid model other \"\n", + " \"than the native GOCO05s results in non-physical oscillations in MDT).\"\n", + " ),\n", + " \"units\": \"m\",\n", + " }\n", + ")\n", + "out.egm96_xgm2019e.attrs.update(\n", + " {\n", + " \"long_name\": \"EGM96, rel. XGM2019e_2159\",\n", + " \"description\": (\n", + " \"Height of EGM96 geoid relative to XGM2019e_2159 geoid. Useful for \"\n", + " \"converting topography data between the two datums.\"\n", + " ),\n", + " \"units\": \"m\",\n", + " }\n", + ")\n", + "out.egm96_wgs84.attrs.update(\n", + " {\n", + " \"long_name\": \"EGM96, rel. 
WGS84\",\n", + " \"description\": (\n", + " \"Geoid heights of EGM96 geoid model, relative to WGS84 ellipsoid.\"\n", + " ),\n", + " \"units\": \"m\",\n", + " }\n", + ")\n", + "out.xgm2019e_wgs84.attrs.update(\n", + " {\n", + " \"long_name\": \"XGM2019e_2159, rel. WGS84\",\n", + " \"description\": (\n", + " \"Geoid heights of XGM2019e_2159 geoid model, relative to WGS84 ellipsoid.\"\n", + " ),\n", + " \"units\": \"m\",\n", + " }\n", + ")\n", + "out.mdt_interpolated.attrs.update(\n", + " {\n", + " \"long_name\": \"Interpolation mask for MDT values\",\n", + " \"description\": (\n", + " \"Indicates where bicubically interpolated MDT values required at least one \"\n", + " \"source value to be estimated through Gauss-Seidel relaxation (implemented \"\n", + " \"by pyinterp), rather than taken directly from the AVISO MDT dataset.\"\n", + " ),\n", + " }\n", + ")\n", + "\n", + "# dataset attrs\n", + "out.attrs.update(\n", + " {\n", + " \"author\": \"Ian Bolliger\",\n", + " \"contact\": \"ian.bolliger@blackrock.com\",\n", + " \"updated\": pd.Timestamp.now(tz=\"US/Pacific\").strftime(\"%c %z\"),\n", + " \"method\": \"See individual variable attrs for specific methods.\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9c3b0628-234d-4323-920c-80ebeeb9caac", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# graphical check\n", + "this_vars = [v for v in out.variables if v not in [\"lat\", \"lon\"]]\n", + "fig, axs = plt.subplots(\n", + " len(this_vars),\n", + " 1,\n", + " subplot_kw={\"projection\": ccrs.PlateCarree()},\n", + " figsize=(10, 5 * len(this_vars)),\n", + ")\n", + "for vx, v in enumerate(this_vars):\n", + " if v not in [\"lat\", \"lon\"]:\n", + " out[v].isel(lat=slice(0, None, 10), lon=slice(0, None, 10)).plot(\n", + " x=\"lon\", y=\"lat\", ax=axs[vx]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1355dac-2adb-440c-bc23-a63ad9949a53", + "metadata": {}, + "outputs": [], + "source": [ + "# Exporting\n", + "os.makedirs(\n", + " os.path.dirname(gcs.gcsmap_to_fuse(sset.PATH_GEOG_DATUMS_GRID)), exist_ok=True\n", + ")\n", + "out.to_zarr(sset.PATH_GEOG_DATUMS_GRID, consolidated=True, mode=\"w\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/exposure/7-create_dem_mss.ipynb b/notebooks/create-SLIIDERS-ECON/exposure/7-create_dem_mss.ipynb new file mode 100644 index 0000000..f1af9e3 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/exposure/7-create_dem_mss.ipynb @@ -0,0 +1,410 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7e932cf8-3f20-4e6b-b12a-633a162da8c6", + "metadata": {}, + "source": [ + "# Create MSS coastal DEM" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "f7c00785-c628-446b-bf33-c3fede918bef", + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "\n", + "import dask.distributed as dd\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pyinterp.backends.xarray as pbx\n", + "import rhg_compute_tools.gcs as rhgcs\n", + "import rhg_compute_tools.kubernetes as rhgk\n", + "import rhg_compute_tools.utils as rhgu\n", + "import xarray as xr\n", + "from shapely.geometry import box\n", + "\n", + "from sliiders import settings as sset" + ] + }, + { + "cell_type": "markdown", + "id": "ce069aa2-f411-42b6-8010-99478f80c520", + "metadata": {}, + "source": [ + "Define elevation-processing functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6b67c13-2806-41ec-8273-10a957806602", + "metadata": {}, + "outputs": [], + "source": [ + "@rhgu.block_globals\n", + "def get_grid_at_tile(da, grid):\n", + " \"\"\"\n", + " Get interpolated datum tile in the same shape as `da` using `pbx.Grid2D`\n", + " \"\"\"\n", + " buffer = 0.2\n", + "\n", + " # Ensure tiles along the 180 meridian have coordinates defined contiguously\n", + " if da.x[-1].item() > 179:\n", + " new_lons = grid.lon.values\n", + " new_lons[new_lons < -179] = new_lons[new_lons < -179] + 360\n", + " grid = grid.assign_coords({\"lon\": new_lons})\n", + " elif da.x[0].item() < -179:\n", + " new_lons = grid.lon.values\n", + " new_lons[new_lons > 179] = new_lons[new_lons > 179] - 360\n", + " grid = grid.assign_coords({\"lon\": new_lons})\n", + "\n", + " grid = grid.isel(\n", + " lon=(grid.lon >= da.x[0] - buffer) & (grid.lon <= da.x[-1] + buffer),\n", + " lat=(grid.lat >= da.y[-1] - buffer) & (grid.lat <= da.y[0] + buffer),\n", + " ).load()\n", + "\n", + " grid = grid.sortby(\"lon\")\n", + "\n", + " grid.lon.attrs[\"units\"] = \"degrees_east\"\n", + " grid.lat.attrs[\"units\"] = \"degrees_north\"\n", + "\n", + " interpolator = pbx.Grid2D(grid, geodetic=True)\n", + 
"\n", + " mx, my = np.meshgrid(da.x.values, da.y.values, indexing=\"ij\")\n", + "\n", + " out = interpolator.bicubic(dict(lon=mx.flatten(), lat=my.flatten()))\n", + "\n", + " out = out.reshape(mx.shape)\n", + " out = xr.DataArray(out).rename({\"dim_0\": \"x\", \"dim_1\": \"y\"})\n", + " out[\"x\"] = da.x.values\n", + " out[\"y\"] = da.y.values\n", + "\n", + " return out\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def get_bbox(tile_name):\n", + " \"\"\"\n", + " Return bounding box from tile name in the string format \"VXXHYYY\" representing the southwestern corner of a 1-degree tile,\n", + " where \"V\" is \"N\" (north) or \"S\" (south), \"H\" is \"E\" (east) or \"W\" (west), \"XX\" is a two-digit zero-padded number indicating\n", + " the number of degrees north or south from 0,0, and \"YYY\" is a three-digit zero-padded number indicating the number of degrees\n", + " east or west from 0,0.\n", + " \"\"\"\n", + " lat_term, lon_term = tile_name[:3], tile_name[3:]\n", + "\n", + " lat_direction, lat_value = lat_term[0], int(lat_term[1:])\n", + " lon_direction, lon_value = lon_term[0], int(lon_term[1:])\n", + "\n", + " lat_sign = 1 if lat_direction == \"N\" else -1\n", + " lon_sign = 1 if lon_direction == \"E\" else -1\n", + "\n", + " llat = lat_sign * lat_value\n", + " llon = lon_sign * lon_value\n", + "\n", + " ulat = llat + 1\n", + " ulon = llon + 1\n", + "\n", + " return box(llon, llat, ulon, ulat)\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def get_tile_path(tile):\n", + " \"\"\"Get raw CoastalDEM tile path\"\"\"\n", + " return sset.DIR_COASTALDEM / f\"{tile}.tif\"\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def get_elev_tile(\n", + " tile_name, bbox, use_coastal_dem=True, egm96_xgm2019e=None, mdt=None, cap=None\n", + "):\n", + " \"\"\"\n", + " Get 1-arcsec elevation tile relative to MSS.\n", + " Use CoastalDEM where available, replacing null areas with SRTM15+.\n", + " Get elevations relative to MSS datum using EGM96 -> XGM2019e and Mean Dynamic 
Topography (MDT) datum transformations\n", + " (both provided by Aviso).\n", + " \"\"\"\n", + " llon, llat, ulon, ulat = bbox.bounds\n", + " if use_coastal_dem:\n", + " # load tile\n", + " tile_path = get_tile_path(tile_name)\n", + " elev_tile = xr.open_rasterio(tile_path).sel(band=1).drop(\"band\")\n", + " elev_tile.load()\n", + "\n", + " # handle tiles with inaccurately bottom-left .1-degree metadata\n", + " if elev_tile[\"y\"].values.max() - elev_tile[\"y\"].values.min() < 0.9:\n", + " elev_tile[\"y\"] = (\n", + " elev_tile[\"y\"].values.min()\n", + " + (elev_tile[\"y\"].values - elev_tile[\"y\"].values.min()) * 10\n", + " )\n", + " elev_tile[\"x\"] = (\n", + " elev_tile[\"x\"].values.min()\n", + " + (elev_tile[\"x\"].values - elev_tile[\"x\"].values.min()) * 10\n", + " )\n", + "\n", + " # open our \"main DEM\" (to fill in missing pixels in CoastalDEM)\n", + " with xr.open_dataarray(sset.PATH_SRTM15_PLUS) as srtm:\n", + "\n", + " srtm_buffer = 0.01\n", + "\n", + " # Ensure tiles along the 180 meridian have coordinates defined contiguously\n", + " if llon == 179:\n", + " new_lons = srtm.lon.values\n", + " new_lons[new_lons < -179] = new_lons[new_lons < -179] + 360\n", + " srtm = srtm.assign_coords({\"lon\": new_lons})\n", + " elif ulon == -179:\n", + " new_lons = srtm.lon.values\n", + " new_lons[new_lons > 179] = new_lons[new_lons > 179] - 360\n", + " srtm = srtm.assign_coords({\"lon\": new_lons})\n", + "\n", + " # fill NaNs with SRTM\n", + " this_srtm = srtm.isel(\n", + " lon=(srtm.lon >= llon - srtm_buffer) & (srtm.lon <= ulon + srtm_buffer),\n", + " lat=(srtm.lat >= llat - srtm_buffer) & (srtm.lat <= ulat + srtm_buffer),\n", + " )\n", + "\n", + " this_srtm = this_srtm.sortby(\"lon\")\n", + " this_srtm.load()\n", + "\n", + " if use_coastal_dem:\n", + " srtm_interp = this_srtm.rename({\"lon\": \"x\", \"lat\": \"y\"}).interp_like(\n", + " elev_tile, method=\"linear\", assume_sorted=True\n", + " )\n", + " # -32767 means SRTM input to coastalDEM was missing (we 
have previously filled this in\n", + " # our master DEM)\n", + " # -9999 means outside of a particular spatial domain for coastalDEM\n", + " elev_tile = elev_tile.where(~elev_tile.isin([-32767, -9999])).fillna(\n", + " srtm_interp\n", + " )\n", + " # 0 is where coastalDEM is \"underwater\"\n", + " elev_tile = elev_tile.where(elev_tile != 0, np.nan)\n", + " else:\n", + " grid_width = 3600\n", + " size = 1 / grid_width\n", + "\n", + " lons_small = np.arange(llon + (size / 2), ulon, size)\n", + " lats_small = np.flip(np.arange(llat + (size / 2), ulat, size))\n", + "\n", + " srtm_interp = this_srtm.rename({\"lon\": \"x\", \"lat\": \"y\"}).interp(\n", + " {\"x\": lons_small, \"y\": lats_small}, method=\"linear\", assume_sorted=True\n", + " )\n", + " elev_tile = srtm_interp\n", + "\n", + " # Datum transformations\n", + " if (egm96_xgm2019e is None) or (mdt is None):\n", + " with xr.open_zarr(sset.PATH_GEOG_DATUMS_GRID, consolidated=True) as datum_grid:\n", + " mdt = datum_grid.mdt\n", + " egm96_xgm2019e = datum_grid.egm96_xgm2019e\n", + "\n", + " egm96_xgm2019e_interp = get_grid_at_tile(elev_tile, egm96_xgm2019e)\n", + " mdt_interp = get_grid_at_tile(elev_tile, mdt)\n", + " elev_tile = elev_tile + egm96_xgm2019e_interp\n", + " elev_tile = elev_tile - mdt_interp\n", + "\n", + " # Bundle higher-than-coastal elevation values into one to simplify later data processing\n", + " if cap is not None:\n", + " elev_tile = xr.where(elev_tile > cap, cap, elev_tile)\n", + "\n", + " return elev_tile\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def process_tile(\n", + " tile_name,\n", + " egm96_xgm2019e=None,\n", + " mdt=None,\n", + "):\n", + " bbox = get_bbox(tile_name)\n", + "\n", + " # get coastalDEM tile, filled with SRTM, relative to MSS\n", + " tile_path = get_tile_path(tile_name)\n", + " elev_tile = get_elev_tile(\n", + " tile_name,\n", + " bbox,\n", + " use_coastal_dem=tile_path.exists(),\n", + " egm96_xgm2019e=egm96_xgm2019e,\n", + " mdt=mdt,\n", + " )\n", + "\n", + " 
elev_tile = elev_tile.astype(np.float32)\n", + "\n", + " path_out_tmp = sset.DIR_MSS / f\"{tile_name}_tmp.tif\"\n", + " path_out = sset.DIR_MSS / f\"{tile_name}.tif\"\n", + " elev_tile.rio.to_raster(path_out_tmp)\n", + "\n", + " cmd_cp = f\"gdal_translate {str(path_out_tmp)} {str(path_out)} -co COMPRESS=DEFLATE -co PREDICTOR=3\"\n", + " cmd_rm = f\"rm {str(path_out_tmp)}\"\n", + "\n", + " subprocess.run(cmd_cp.split(\" \"), capture_output=True)\n", + " subprocess.run(cmd_rm.split(\" \"), capture_output=True)" + ] + }, + { + "cell_type": "markdown", + "id": "a0506823-b42c-430c-beb4-9b5b6df7a7ef", + "metadata": {}, + "source": [ + "Get list of tiles to process" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bd1bd70-e2ad-4c2e-a62d-e18c855f6c76", + "metadata": {}, + "outputs": [], + "source": [ + "tile_meta = pd.read_parquet(sset.PATH_EXPOSURE_TILE_LIST)\n", + "coastal_tiles = tile_meta.loc[\n", + " tile_meta[\"PROCESSING_SET\"].isin([\"CIAM\", \"WITHELEV\"]), \"tile_name\"\n", + "].to_numpy()\n", + "\n", + "sset.DIR_MSS.mkdir(exist_ok=True)\n", + "finished_tiles = [t[:-4] for t in rhgcs.ls(sset.DIR_MSS)]\n", + "\n", + "coastal_tiles = [t for t in coastal_tiles if t not in finished_tiles]\n", + "\n", + "len(coastal_tiles)" + ] + }, + { + "cell_type": "markdown", + "id": "f02629e4-78dc-4651-91ac-3d540bfd508b", + "metadata": {}, + "source": [ + "Start up cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8192c8a-b98f-4b10-956d-80a656c6fb22", + "metadata": {}, + "outputs": [], + "source": [ + "client, cluster = rhgk.get_micro_cluster()\n", + "cluster.scale(24)\n", + "\n", + "cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81e1b9a2-717d-4436-b37e-f1d179198168", + "metadata": {}, + "outputs": [], + "source": [ + "import zipfile\n", + "from sliiders import __file__\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "sliiders_dir = Path(__file__).parent\n", + "zipf = 
zipfile.ZipFile(\"sliiders.zip\", \"w\", zipfile.ZIP_DEFLATED)\n", + "for root, dirs, files in os.walk(sliiders_dir):\n", + " for file in files:\n", + " zipf.write(\n", + " os.path.join(root, file),\n", + " os.path.relpath(os.path.join(root, file), os.path.join(sliiders_dir, \"..\")),\n", + " )\n", + "zipf.close()\n", + "client.upload_file(\"sliiders.zip\")" + ] + }, + { + "cell_type": "markdown", + "id": "afa91dcb-75ef-4f96-814c-5b4e5a97631b", + "metadata": {}, + "source": [ + "Load datum grids onto workers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2890474c-4ce3-4f87-8053-b7a2bacd2880", + "metadata": {}, + "outputs": [], + "source": [ + "with xr.open_zarr(sset.PATH_GEOG_DATUMS_GRID, consolidated=True) as datum_grid:\n", + " egm96_xgm2019e = datum_grid.egm96_xgm2019e.load()\n", + " mdt = datum_grid.mdt.load()\n", + "\n", + "egm96_xgm2019e_fut = client.scatter(egm96_xgm2019e, broadcast=True)\n", + "mdt_fut = client.scatter(mdt, broadcast=True)" + ] + }, + { + "cell_type": "markdown", + "id": "93fa0f69-a699-4713-9e87-779e6ca8cf64", + "metadata": {}, + "source": [ + "Run tiles on workers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9308220-6dcd-471c-9bdb-434852d35d8a", + "metadata": {}, + "outputs": [], + "source": [ + "fut = client.map(\n", + " process_tile, coastal_tiles, egm96_xgm2019e=egm96_xgm2019e_fut, mdt=mdt_fut\n", + ")\n", + "dd.progress(fut)" + ] + }, + { + "cell_type": "markdown", + "id": "1d71a720-c12d-4bbf-b0ed-456d6e927b24", + "metadata": {}, + "source": [ + "Close cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c433787-6e1c-45b7-9e49-9f0517d3ec3c", + "metadata": {}, + "outputs": [], + "source": [ + "cluster.close()\n", + "client.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/exposure/8-generate_protected_areas.ipynb b/notebooks/create-SLIIDERS-ECON/exposure/8-generate_protected_areas.ipynb new file mode 100644 index 0000000..ddbb462 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/exposure/8-generate_protected_areas.ipynb @@ -0,0 +1,779 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5a16ad8e-333e-46b3-9e7f-b6698457add7", + "metadata": {}, + "source": [ + "# Generate protected areas from various levee and hydrological data sources" + ] + }, + { + "cell_type": "markdown", + "id": "55a6f3c0-78d6-404e-9c20-d482e54b9afc", + "metadata": {}, + "source": [ + "### Gather basins from HydroBASINS to find endorheic basins to include in \"protected areas\" dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1642f3da-283c-4bd1-a6c0-454e44e64e8a", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "import contextily as ctx\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "from shapely.geometry import Polygon, box\n", + "from tqdm.notebook import tqdm\n", + "\n", + "from sliiders import settings as sset\n", + "from sliiders import spatial\n", + "\n", + "spatial.filter_spatial_warnings()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "795c2764-e952-4ff0-89a2-7dfc641efcf0", + "metadata": {}, + "outputs": [], + "source": [ + "all_basin_shapefiles = set((sset.DIR_HYDROBASINS_RAW.glob(\"hybas_*_lev00_v1c.shp\")))\n", + "eu_basin_shapefile = sset.DIR_HYDROBASINS_RAW / \"hybas_eu_lev00_v1c.shp\"\n", + "\n", + "all_basin_shapefiles.remove(eu_basin_shapefile)" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "21010e1f-884c-4f0d-81fb-dac94730f22c", + "metadata": {}, + "outputs": [], + "source": [ + "all_other_basins = pd.concat(\n", + " [gpd.read_file(basin_shapefile) for basin_shapefile in tqdm(all_basin_shapefiles)],\n", + " ignore_index=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8aef1e20-c8fe-4b8a-8bf6-23405757d948", + "metadata": {}, + "outputs": [], + "source": [ + "ocean_and_caspian = gpd.read_file(sset.PATH_NATURALEARTH_OCEAN)\n", + "\n", + "ocean_and_caspian = ocean_and_caspian.explode(index_parts=False)\n", + "\n", + "ocean_and_caspian[\"area\"] = ocean_and_caspian.area\n", + "\n", + "# Sort so that ocean is first, Caspian is second\n", + "ocean_and_caspian = ocean_and_caspian.sort_values(\"area\", ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5cd75f3-1a99-4da6-8fe4-ec9965c5a344", + "metadata": {}, + "outputs": [], + "source": [ + "ocean_shape = ocean_and_caspian.geometry.values[0]\n", + "caspian_shape = ocean_and_caspian.geometry.values[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45106d7b-4d62-4a9d-b122-fc772e86186e", + "metadata": {}, + "outputs": [], + "source": [ + "ocean_shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c14925b8-e41c-493a-86d5-ff9fa4b146f0", + "metadata": {}, + "outputs": [], + "source": [ + "caspian_shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b078059-ace1-41c7-937d-0cbe89548c3a", + "metadata": {}, + "outputs": [], + "source": [ + "ocean_buffer = ocean_shape.buffer(sset.ENDORHEIC_BASIN_OCEAN_BUFFER).simplify(\n", + " tolerance=0.1\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b661588d-de5e-4229-907b-82a85fe1a3a6", + "metadata": {}, + "source": [ + "### Handle Caspian Sea as a special case since it is considered \"ocean\" by HydroBASINS" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "40c70a80-1abd-496a-9fe5-39baad6ded1a", + "metadata": {}, + "outputs": [], + "source": [ + "eu_basins = gpd.read_file(eu_basin_shapefile)\n", + "\n", + "eu_basins[\"touches_caspian\"] = eu_basins[\"geometry\"].intersects(\n", + " box(*caspian_shape.bounds).buffer(0.5)\n", + ")\n", + "\n", + "eu_basins[\"feeds_into_caspian\"] = eu_basins[\"touches_caspian\"].copy()\n", + "\n", + "prev_basin_count = 0\n", + "while True:\n", + "\n", + " feeds_into_caspian = set(eu_basins.loc[eu_basins[\"feeds_into_caspian\"], \"HYBAS_ID\"])\n", + "\n", + " eu_basins[\"feeds_into_caspian\"] = (\n", + " (eu_basins[\"feeds_into_caspian\"])\n", + " | (eu_basins[\"NEXT_DOWN\"].isin(feeds_into_caspian))\n", + " | (eu_basins[\"NEXT_SINK\"].isin(feeds_into_caspian))\n", + " | (eu_basins[\"MAIN_BAS\"].isin(feeds_into_caspian))\n", + " )\n", + "\n", + " this_basin_count = eu_basins[\"feeds_into_caspian\"].sum()\n", + " if this_basin_count == prev_basin_count:\n", + " break\n", + "\n", + " prev_basin_count = this_basin_count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8f82508-a003-4ded-9932-6dc7522434ca", + "metadata": {}, + "outputs": [], + "source": [ + "eu_basins[eu_basins[\"feeds_into_caspian\"]].plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f38e19b-db3f-40ff-9493-58ac7e002e52", + "metadata": {}, + "outputs": [], + "source": [ + "eu_basins.loc[eu_basins[\"feeds_into_caspian\"], \"ENDO\"] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee1b2c99-8653-48c5-be1f-ca117d6637d8", + "metadata": {}, + "outputs": [], + "source": [ + "all_basins = pd.concat([all_other_basins, eu_basins], ignore_index=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e1d4f9a7-0b0e-49c7-9f3a-4332d0fc869f", + "metadata": {}, + "source": [ + "### Apply narrow definition of \"endorheic\" by assuming all \"virtual\" connections e.g. 
groundwater are real connections" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af868b08-7636-4ed4-8d18-67db98b633a7", + "metadata": {}, + "outputs": [], + "source": [ + "all_basins[\"not_endorheic\"] = all_basins[\"ENDO\"] == 0\n", + "\n", + "prev_non_endorheic_ct = -1\n", + "while True:\n", + " not_endorheic = set(all_basins.loc[all_basins[\"not_endorheic\"], \"HYBAS_ID\"])\n", + " all_basins[\"not_endorheic\"] = (\n", + " (all_basins[\"not_endorheic\"])\n", + " | (all_basins[\"NEXT_DOWN\"].isin(not_endorheic))\n", + " | (all_basins[\"NEXT_SINK\"].isin(not_endorheic))\n", + " | (all_basins[\"MAIN_BAS\"].isin(not_endorheic))\n", + " )\n", + " non_endorheic_ct = len(not_endorheic)\n", + "\n", + " if non_endorheic_ct == prev_non_endorheic_ct:\n", + " break\n", + " prev_non_endorheic_ct = non_endorheic_ct" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4483cc1b-64d6-4230-ab00-046453a32d02", + "metadata": {}, + "outputs": [], + "source": [ + "all_endorheic_basins = all_basins.loc[~all_basins[\"not_endorheic\"]].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "074029e5-6fae-40ed-8c3e-58708d32a26c", + "metadata": {}, + "outputs": [], + "source": [ + "ax = all_endorheic_basins.plot(figsize=(20, 20))\n", + "ctx.add_basemap(ax, crs=\"EPSG:4326\")" + ] + }, + { + "cell_type": "markdown", + "id": "d10faf31-11af-4c22-aaf2-2ac36eb2a3c4", + "metadata": {}, + "source": [ + "### Divide ocean shape into 1-degree tiles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f626fd9-e3e8-4569-aaab-f43fc9b4a052", + "metadata": {}, + "outputs": [], + "source": [ + "llats = range(-90, 91)\n", + "llons = range(-180, 181)\n", + "boxes = []\n", + "llats_list = []\n", + "llons_list = []\n", + "for llat in llats:\n", + " for llon in llons:\n", + " llats_list.append(llat)\n", + " llons_list.append(llon)\n", + " boxes.append(\n", + " box(\n", + " llon,\n", + " llat,\n", + " llon + 1,\n", + " 
llat + 1,\n", + " )\n", + " )\n", + "\n", + "ocean_boxes_gdf = gpd.GeoDataFrame(\n", + " {\"llat\": llats_list, \"llon\": llons_list}, geometry=boxes, crs=\"EPSG:4326\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad4784dc-b067-4b8f-8aaa-669bd025e075", + "metadata": {}, + "outputs": [], + "source": [ + "ocean_boxes_gdf[\"ocean_box\"] = ocean_boxes_gdf[\"geometry\"].intersection(ocean_buffer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21b9a884-41c6-4ba9-9fea-246ee9d6ee0c", + "metadata": {}, + "outputs": [], + "source": [ + "ocean_boxes_gdf = ocean_boxes_gdf.drop(columns=\"geometry\").rename(\n", + " columns={\"ocean_box\": \"geometry\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e08d48a-687c-4784-b35e-61dee6c55d26", + "metadata": {}, + "outputs": [], + "source": [ + "ocean_boxes_gdf = ocean_boxes_gdf[~ocean_boxes_gdf[\"geometry\"].is_empty]" + ] + }, + { + "cell_type": "markdown", + "id": "03a079b9-de30-4322-97e1-62cf99d31897", + "metadata": {}, + "source": [ + "### Find all endorheic basins that intersect with the ocean buffer, label them \"not_endorheic\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c0f96c7-0f87-489d-8ea9-052a0289b474", + "metadata": {}, + "outputs": [], + "source": [ + "intersections = gpd.sjoin(\n", + " all_endorheic_basins, ocean_boxes_gdf, how=\"left\", op=\"intersects\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71f67a6f-5621-47e4-8477-77fbce5da491", + "metadata": {}, + "outputs": [], + "source": [ + "no_ocean = set(\n", + " intersections[intersections[\"index_right\"].isnull()][\"HYBAS_ID\"].unique()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "952dfb67-fe0c-4441-b28d-eaff7434fa43", + "metadata": {}, + "outputs": [], + "source": [ + "all_endorheic_basins[\"not_endorheic\"] = ~all_endorheic_basins[\"HYBAS_ID\"].isin(no_ocean)" + ] + 
}, + { + "cell_type": "markdown", + "id": "50049da5-4bc6-4d7c-a4ae-6db63cc716c0", + "metadata": {}, + "source": [ + "### Once basins are labelled \"not_endorheic\" close to the ocean, we want basins flowing into those to be \"not_endorheic\" as well" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08c81528-1c09-4546-b937-6c2eda774aa7", + "metadata": {}, + "outputs": [], + "source": [ + "prev_non_endorheic_ct = -1\n", + "while True:\n", + " not_endorheic = set(\n", + " all_endorheic_basins.loc[all_endorheic_basins[\"not_endorheic\"], \"HYBAS_ID\"]\n", + " )\n", + " all_endorheic_basins[\"not_endorheic\"] = (\n", + " (all_endorheic_basins[\"not_endorheic\"])\n", + " | (all_endorheic_basins[\"NEXT_DOWN\"].isin(not_endorheic))\n", + " | (all_endorheic_basins[\"NEXT_SINK\"].isin(not_endorheic))\n", + " | (all_endorheic_basins[\"MAIN_BAS\"].isin(not_endorheic))\n", + " )\n", + " non_endorheic_ct = len(not_endorheic)\n", + "\n", + " if non_endorheic_ct == prev_non_endorheic_ct:\n", + " break\n", + " prev_non_endorheic_ct = non_endorheic_ct" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "631bd9ca-c6f9-48f1-b765-91d928a57f43", + "metadata": {}, + "outputs": [], + "source": [ + "all_endorheic_basins = all_endorheic_basins[~all_endorheic_basins[\"not_endorheic\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9754aea2-d8b3-4402-a56c-05b3fd36ed13", + "metadata": {}, + "outputs": [], + "source": [ + "ax = all_endorheic_basins.plot(figsize=(20, 20))\n", + "ctx.add_basemap(ax, crs=\"EPSG:4326\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e87a877-95e8-483e-b208-99fd0fafa824", + "metadata": {}, + "outputs": [], + "source": [ + "endorheic_basins_dissolved = all_endorheic_basins.unary_union.geoms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc6d23fb-9df0-470f-8f82-a0c8c24db588", + "metadata": {}, + "outputs": [], + "source": [ + "combined_basins = 
gpd.GeoSeries(list(endorheic_basins_dissolved))\n", + "combined_basins = gpd.GeoDataFrame(geometry=combined_basins)\n", + "\n", + "combined_basins[\"area\"] = combined_basins.geometry.area\n", + "combined_basins = combined_basins.sort_values(\"area\", ascending=False)\n", + "combined_basins = combined_basins[\n", + " combined_basins[\"area\"] > sset.MIN_BASIN_TILE_DEGREE_AREA\n", + "].copy()" + ] + }, + { + "cell_type": "markdown", + "id": "8eac7436-1049-4185-97d3-0255042bfaef", + "metadata": {}, + "source": [ + "### Label basins manually (check each basin manually)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72c98d17-62f3-4a4e-8fe8-304fa7015e16", + "metadata": {}, + "outputs": [], + "source": [ + "ax = combined_basins.plot(figsize=(20, 20))\n", + "ctx.add_basemap(ax, crs=\"EPSG:4326\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a028ba1d-a263-461e-a855-21edbade243a", + "metadata": {}, + "outputs": [], + "source": [ + "combined_basins[\"label\"] = [\n", + " \"eurasia_caspian\",\n", + " \"sahara_sahel\",\n", + " \"central_australia\",\n", + " \"arabian_peninsula_dead_sea\",\n", + " \"altiplano_and_argentina\",\n", + " \"southern_africa\",\n", + " \"great_lakes_and_horn_of_africa\",\n", + " \"great_basin\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "ea440528-8082-4381-807f-390c1d319fcb", + "metadata": {}, + "source": [ + "### Fill Eurasian-Caspian basin with the Caspian itself" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fb0f5d6-d00d-4dc8-92c1-4c58e1c8c159", + "metadata": {}, + "outputs": [], + "source": [ + "surrounding_caspian = combined_basins.loc[\n", + " combined_basins[\"label\"] == \"eurasia_caspian\", \"geometry\"\n", + "].values[0]\n", + "combined_basins.loc[\n", + " combined_basins[\"label\"] == \"eurasia_caspian\", \"geometry\"\n", + "] = Polygon(surrounding_caspian.exterior)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"6265240d-4067-443e-bd51-814331437de4", + "metadata": {}, + "outputs": [], + "source": [ + "combined_basins = combined_basins.reset_index(drop=True).drop(columns=\"area\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be77caf7-9e4a-4498-a5a7-5ee51b788f30", + "metadata": {}, + "outputs": [], + "source": [ + "combined_basins.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f918a16-bcb2-44a6-b517-82b2e2f4f907", + "metadata": {}, + "outputs": [], + "source": [ + "sset.PATH_MANUAL_PROTECTED_AREAS.parent.mkdir(exist_ok=True, parents=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5654be15-111c-4450-9e18-4a81d5394974", + "metadata": {}, + "outputs": [], + "source": [ + "combined_basins.to_parquet(sset.PATH_MANUAL_PROTECTED_AREAS)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8fc7461-efe7-4958-9171-246117d83479", + "metadata": {}, + "outputs": [], + "source": [ + "combined_basins = gpd.read_parquet(sset.PATH_MANUAL_PROTECTED_AREAS)" + ] + }, + { + "cell_type": "markdown", + "id": "27ec19f6-fb7a-4366-8b19-87aa34abbe5b", + "metadata": {}, + "source": [ + "## Combine protected areas for global processing\n", + "- US National Levee Database (NLDB) and manual areas\n", + "- Large global endorheic basins\n", + "- Manual boxes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74e5ef7e-a64d-4c3f-ae6e-2172d448a06e", + "metadata": {}, + "outputs": [], + "source": [ + "nldb_and_manual_areas = gpd.read_parquet(sset.PATH_US_MANUAL_PROTECTED_AREAS)" + ] + }, + { + "cell_type": "markdown", + "id": "fda0cdf8-c491-4efd-a97c-0503f0956517", + "metadata": {}, + "source": [ + "Netherlands (assume all of the European Netherlands is protected)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "044b423d-9634-4fee-8639-a82bc0691055", + "metadata": {}, + "outputs": [], + "source": [ + "vor_shapes = 
gpd.read_parquet(sset.PATH_GADM_ADM0_VORONOI)\n", + "protected_areas_nld = vor_shapes[vor_shapes[\"ISO\"] == \"NLD\"][[\"geometry\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "4d6035bf-c212-4dfe-a86b-f4992882c8f3", + "metadata": {}, + "source": [ + "Manual boxes (additional to those defined in `sset.PATH_US_MANUAL_PROTECTED_AREAS`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3011a0b-d436-4f15-bec3-15dc84c013f7", + "metadata": {}, + "outputs": [], + "source": [ + "manual_box_bounds = [\n", + " {\"label\": \"orinoco\", \"minx\": -70.0, \"miny\": 5.0, \"maxx\": -66.0, \"maxy\": 8.0},\n", + " {\"label\": \"heilongjiang\", \"minx\": 130.0, \"miny\": 45.0, \"maxx\": 136.0, \"maxy\": 48.0},\n", + " {\n", + " \"label\": \"southern_africa\",\n", + " \"minx\": 28.0,\n", + " \"miny\": -25.0,\n", + " \"maxx\": 33.0,\n", + " \"maxy\": -20.0,\n", + " },\n", + " {\n", + " \"label\": \"great_basin\",\n", + " \"minx\": -119.0,\n", + " \"miny\": 35.0,\n", + " \"maxx\": -115.0,\n", + " \"maxy\": 40.0,\n", + " },\n", + " {\n", + " \"label\": \"inner_australia\",\n", + " \"minx\": 135.0,\n", + " \"miny\": -32.0,\n", + " \"maxx\": 143.0,\n", + " \"maxy\": -25.0,\n", + " },\n", + " {\"label\": \"yakutsk\", \"minx\": 125.0, \"miny\": 62.0, \"maxx\": 130.0, \"maxy\": 67.0},\n", + " {\"label\": \"lake_baikal\", \"minx\": 102.0, \"miny\": 49.0, \"maxx\": 113.0, \"maxy\": 57.0},\n", + " {\"label\": \"great_lakes\", \"minx\": -95.0, \"miny\": 41.0, \"maxx\": -75.0, \"maxy\": 50.0},\n", + "]\n", + "\n", + "box_countries = {\n", + " \"orinoco\": \"Colombia, Venezuela\",\n", + " \"heilongjiang\": \"China\",\n", + " \"southern_africa\": \"Botswana, South Africa, Zimbabwe\",\n", + " \"great_basin\": \"USA\",\n", + " \"inner_australia\": \"Australia\",\n", + " \"yakutsk\": \"Russia\",\n", + " \"lake_baikal\": \"Russia\",\n", + " \"great_lakes\": \"Canada, USA\",\n", + "}\n", + "\n", + "manual_boxes = gpd.GeoDataFrame(manual_box_bounds)\n", + "\n", + 
"manual_boxes[\"geometry\"] = manual_boxes.apply(\n", + " lambda row: box(row[\"minx\"], row[\"miny\"], row[\"maxx\"], row[\"maxy\"]), axis=1\n", + ")\n", + "\n", + "manual_boxes = manual_boxes.drop(columns=[\"minx\", \"miny\", \"maxx\", \"maxy\"])\n", + "\n", + "manual_boxes.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a72ea14-a3d3-450d-954e-0e71586fffa6", + "metadata": {}, + "outputs": [], + "source": [ + "nldb_and_manual_areas[\"country\"] = \"USA\"\n", + "\n", + "protected_areas_nld[\"levee_segment_id\"] = -1\n", + "protected_areas_nld[\"protection_group\"] = \"Netherlands\"\n", + "protected_areas_nld[\"protection_type\"] = \"Netherlands\"\n", + "protected_areas_nld[\"country\"] = \"NLD\"\n", + "\n", + "combined_basins[\"levee_segment_id\"] = -1\n", + "combined_basins[\n", + " \"protection_group\"\n", + "] = \"largest endorheic basin areas, with buffer from ocean\"\n", + "combined_basins[\"protection_type\"] = \"endorheic basin\"\n", + "combined_basins[\"country\"] = \"multiple\"\n", + "\n", + "manual_boxes[\"levee_segment_id\"] = -1\n", + "manual_boxes[\"protection_group\"] = \"manual boxes\"\n", + "manual_boxes[\"protection_type\"] = \"non-coastal\"\n", + "manual_boxes[\"country\"] = manual_boxes[\"label\"].apply(lambda x: box_countries[x])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e12c4e5-11f0-439b-ab5c-3ff56b6c19d3", + "metadata": {}, + "outputs": [], + "source": [ + "protected_areas = (\n", + " pd.concat(\n", + " [\n", + " nldb_and_manual_areas,\n", + " protected_areas_nld,\n", + " combined_basins,\n", + " manual_boxes,\n", + " ],\n", + " ignore_index=True,\n", + " )\n", + " .reset_index(drop=False)\n", + " .rename(columns={\"index\": \"protection_zone_id\"})\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcacf25c-ffd4-430d-b762-e56cf2f2f3a2", + "metadata": {}, + "outputs": [], + "source": [ + "protected_areas.sample(5)" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "5b796e0e-614f-4ff1-b29f-0c825b50d443", + "metadata": {}, + "outputs": [], + "source": [ + "# plot-checking the protected areas\n", + "protected_areas.plot()" + ] + }, + { + "cell_type": "markdown", + "id": "f51062a6-3991-4d02-b7c6-4d0d03f94331", + "metadata": {}, + "source": [ + "# Save combined areas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6b299af-af5a-40d0-a650-4fed516e095b", + "metadata": {}, + "outputs": [], + "source": [ + "protected_areas.to_parquet(sset.PATH_GLOBAL_PROTECTED_AREAS)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/create-SLIIDERS-ECON/exposure/9-generate_exposure_tiles.ipynb b/notebooks/create-SLIIDERS-ECON/exposure/9-generate_exposure_tiles.ipynb new file mode 100644 index 0000000..2b4a7cc --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/exposure/9-generate_exposure_tiles.ipynb @@ -0,0 +1,737 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Combine data layers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Get elevation relative to sea level\n", + "2. Match to nearest country, impact region, protection zone (e.g. levees)\n", + "3. Uniformly distribute exposure over all surface area > 0 elevation within a 30\" pixel\n", + "4. 
Aggregate both surface area and exposure up to adm1 X coastal segment X protection zone X wetland flag X .1-meter elevation bin" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "from pathlib import Path\n", + "import dask.distributed as dd\n", + "import geopandas as gpd\n", + "import numpy as np\n", + "import pandas as pd\n", + "import regionmask\n", + "import rhg_compute_tools.gcs as rhgcs\n", + "import rhg_compute_tools.kubernetes as rhgk\n", + "import rhg_compute_tools.utils as rhgu\n", + "import xarray as xr\n", + "from shapely.geometry import box\n", + "\n", + "from sliiders import settings as sset\n", + "from sliiders import spatial as spatial\n", + "\n", + "spatial.filter_spatial_warnings()\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def load_exposure(bbox, sset):\n", + " \"\"\"Get asset value and population within the bounds defined by `bbox`\"\"\"\n", + " llon, llat, ulon, ulat = bbox.bounds\n", + "\n", + " # Get corners of `bbox` by their indices\n", + " lx_ix, ux_ix = spatial.grid_val_to_ix(\n", + " np.array([llon, ulon]),\n", + " sset.LITPOP_GRID_WIDTH,\n", + " )\n", + "\n", + " ly_ix, uy_ix = spatial.grid_val_to_ix(\n", + " np.array([llat, ulat]),\n", + " sset.LITPOP_GRID_WIDTH,\n", + " )\n", + "\n", + " # Define filters for reading parquet (saves computation and memory)\n", + " parquet_filters = [\n", + " [\n", + " (\"x_ix\", \">=\", lx_ix),\n", + " (\"x_ix\", \"<\", ux_ix),\n", + " (\"y_ix\", \">=\", ly_ix),\n", + " (\"y_ix\", \"<\", uy_ix),\n", + " ]\n", + " ]\n", + "\n", + " exp_filters = [parquet_filters[0] + [(\"value\", \">\", 0)]]\n", + " pop_filters = [parquet_filters[0] + [(\"population\", \">\", 0)]]\n", + "\n", + " # asset value\n", + " exp = pd.read_parquet(\n", + " sset.PATH_EXPOSURE_BLENDED,\n", + " columns=[\"value\", \"x_ix\", \"y_ix\"],\n", + " filters=exp_filters,\n", + " )\n", + "\n", + " pop_landscan = pd.read_parquet(\n", + " 
sset.PATH_LANDSCAN_INT,\n", + " columns=[\"population\", \"x_ix\", \"y_ix\"],\n", + " filters=pop_filters,\n", + " ).rename(columns={\"population\": \"pop_landscan\"})\n", + "\n", + " exp = pd.merge(\n", + " exp,\n", + " pop_landscan,\n", + " how=\"outer\",\n", + " left_on=[\"x_ix\", \"y_ix\"],\n", + " right_on=[\"x_ix\", \"y_ix\"],\n", + " )\n", + "\n", + " exp[\"value\"] = exp[\"value\"].fillna(0)\n", + " exp[\"pop_landscan\"] = exp[\"pop_landscan\"].fillna(0)\n", + "\n", + " return exp\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def get_protected_area_matches(elev_tile, bbox, sset):\n", + " \"\"\"\n", + " Get IDs of protected areas in `bbox`, returning a flattened array\n", + " corresponding to the flattened indices of `elev_tile`\n", + " \"\"\"\n", + " protected_areas = gpd.read_parquet(sset.PATH_GLOBAL_PROTECTED_AREAS)\n", + "\n", + " return spatial.get_partial_covering_matches(\n", + " elev_tile, bbox, protected_areas, id_name=\"protection_zone_id\"\n", + " )\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def get_wetland_matches(elev_tile, bbox, sset):\n", + " \"\"\"\n", + " Get flag indicating existence of wetlands in `bbox`, returning a flattened array\n", + " corresponding to the flattened indices of `elev_tile`\n", + " \"\"\"\n", + " wetlands = gpd.read_file(sset.PATH_WETLANDS_INT, bbox=(bbox.bounds))\n", + "\n", + " return spatial.get_partial_covering_matches(elev_tile, bbox, wetlands)\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def get_seg_adm(elev_tile, bbox, sset):\n", + " seg_adm = gpd.read_file(\n", + " sset.PATH_CIAM_ADM1_VORONOI_INTERSECTIONS_SHP,\n", + " bbox=box(*bbox.buffer(0.1).bounds),\n", + " )\n", + "\n", + " return spatial.get_vor_matches(elev_tile, bbox, seg_adm, \"seg_adm\", \"seg_adm\")\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def match_elev_pixels_to_shapes(elev_tile, bbox, sset):\n", + "\n", + " out_df = spatial.get_empty_exp_grid(elev_tile, sset.LITPOP_GRID_WIDTH)\n", + "\n", + " out_df[\"seg_adm\"] = 
get_seg_adm(elev_tile, bbox, sset)\n", + " out_df[\"seg_adm\"] = out_df[\"seg_adm\"].astype(\"category\")\n", + "\n", + " out_df[\"protection_zone\"] = get_protected_area_matches(elev_tile, bbox, sset)\n", + " out_df[\"protection_zone\"] = out_df[\"protection_zone\"].astype(\"category\")\n", + "\n", + " out_df[\"wetland_flag\"] = get_wetland_matches(elev_tile, bbox, sset)\n", + " out_df[\"wetland_flag\"] = out_df[\"wetland_flag\"].astype(bool)\n", + "\n", + " return out_df\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def get_valid_points_df(\n", + " elev_tile,\n", + " bbox,\n", + " all_points,\n", + " sset,\n", + "):\n", + " elev_array = elev_tile.values.flatten()\n", + "\n", + " all_points[\"z_ix\"] = spatial.grid_val_to_ix(\n", + " elev_array, sset.EXPOSURE_BIN_WIDTH_V, map_nans=0\n", + " )\n", + "\n", + " all_points[\"valid\"] = (~np.isnan(elev_array)) & (\n", + " (all_points[\"z_ix\"] >= 0) | (all_points[\"protection_zone\"] != -1)\n", + " )\n", + "\n", + " all_points[\"area_km\"] = spatial.get_cell_size_km(elev_tile, bbox)\n", + "\n", + " out_types = {\n", + " \"x_ix\": np.int16,\n", + " \"y_ix\": np.int16,\n", + " \"z_ix\": np.int32,\n", + " \"seg_adm\": \"category\",\n", + " \"protection_zone\": \"category\",\n", + " \"wetland_flag\": bool,\n", + " \"area_km\": np.float32,\n", + " }\n", + "\n", + " # compress\n", + " all_points = all_points.astype(\n", + " {k: v for k, v in out_types.items() if k in all_points.columns}\n", + " )\n", + "\n", + " poselev_pts = (\n", + " all_points[all_points[\"valid\"]].drop(columns=[\"valid\"]).reset_index(drop=True)\n", + " )\n", + "\n", + " negelev_pts = (\n", + " all_points[(~all_points[\"valid\"]) & (all_points[\"wetland_flag\"])]\n", + " .drop(columns=[\"valid\"])\n", + " .reset_index(drop=True)\n", + " )\n", + "\n", + " return poselev_pts, negelev_pts\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def get_agg_fields():\n", + " \"\"\"Get fields to aggregate over\"\"\"\n", + " return [\n", + " \"z_ix\",\n", + " 
\"seg_adm\",\n", + " \"protection_zone\",\n", + " ]\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def write_empty_csv(out_path):\n", + " # write CSV placeholder to indicate this tile has been processed, but doesn't have exposure\n", + " pd.DataFrame().to_csv(out_path, index=False)\n", + " return out_path\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def get_tile_out_path(tile_name, sset):\n", + " \"\"\"Get output path from the coastalDEM input path\"\"\"\n", + " return sset.DIR_EXPOSURE_BINNED_TMP_TILES / f\"{tile_name}.csv\"\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def get_exp_noland_out_path(tile_name, sset):\n", + " \"\"\"Get output path for exposure that couldn't be matched to land within its 1-degree elevation tile\"\"\"\n", + " return sset.DIR_EXPOSURE_BINNED_TMP_TILES_NOLAND / f\"{tile_name}.csv\"\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def get_seg_area_out_path(tile_name, sset):\n", + " \"\"\"Get output path for segment areas\"\"\"\n", + " return sset.DIR_EXPOSURE_BINNED_TMP_TILES_SEGMENT_AREA / f\"{tile_name}.csv\"\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def merge_exposure_to_highres_grid(this_exp, out, sset):\n", + " agg_fields = get_agg_fields()\n", + "\n", + " ix_merge = pd.merge(\n", + " this_exp[[\"x_ix\", \"y_ix\"]],\n", + " out[[\"x_ix\", \"y_ix\", \"seg_adm\"]].drop_duplicates(),\n", + " left_on=[\"x_ix\", \"y_ix\"],\n", + " right_on=[\"x_ix\", \"y_ix\"],\n", + " how=\"left\",\n", + " )\n", + "\n", + " missing_exp_tiles = ix_merge[ix_merge[\"seg_adm\"].isnull()].drop(columns=[\"seg_adm\"])\n", + " valid_exp_tiles = ix_merge[ix_merge[\"seg_adm\"].notnull()].drop(columns=[\"seg_adm\"])\n", + "\n", + " if valid_exp_tiles.shape[0] == 0:\n", + " valid_exp_tiles = out[[\"x_ix\", \"y_ix\"]].drop_duplicates()\n", + "\n", + " missing_exp_tiles[\"lon\"] = spatial.grid_ix_to_val(\n", + " missing_exp_tiles[\"x_ix\"], sset.LITPOP_GRID_WIDTH\n", + " )\n", + " missing_exp_tiles[\"lat\"] = spatial.grid_ix_to_val(\n", + " 
missing_exp_tiles[\"y_ix\"], sset.LITPOP_GRID_WIDTH\n", + " )\n", + "\n", + " valid_exp_tiles[\"lon\"] = spatial.grid_ix_to_val(\n", + " valid_exp_tiles[\"x_ix\"], sset.LITPOP_GRID_WIDTH\n", + " )\n", + " valid_exp_tiles[\"lat\"] = spatial.grid_ix_to_val(\n", + " valid_exp_tiles[\"y_ix\"], sset.LITPOP_GRID_WIDTH\n", + " )\n", + "\n", + " exp_ix_mappings = (\n", + " spatial.get_closest_valid_exp_tiles(missing_exp_tiles, valid_exp_tiles)\n", + " if missing_exp_tiles.shape[0] > 0\n", + " else None\n", + " )\n", + "\n", + " if exp_ix_mappings is not None:\n", + " this_exp = pd.merge(\n", + " this_exp,\n", + " exp_ix_mappings,\n", + " left_on=[\"x_ix\", \"y_ix\"],\n", + " right_on=[\"x_ix\", \"y_ix\"],\n", + " how=\"left\",\n", + " )\n", + "\n", + " this_exp[\"x_ix\"] = this_exp[\"valid_x_ix\"].fillna(this_exp[\"x_ix\"]).astype(int)\n", + " this_exp[\"y_ix\"] = this_exp[\"valid_y_ix\"].fillna(this_exp[\"y_ix\"]).astype(int)\n", + "\n", + " this_exp = (\n", + " this_exp.groupby([\"x_ix\", \"y_ix\"])[[\"value\", \"pop_landscan\"]]\n", + " .sum()\n", + " .reset_index(drop=False)\n", + " )\n", + "\n", + " exp_tile_areas = (\n", + " out.groupby([\"x_ix\", \"y_ix\"])[[\"area_km\"]]\n", + " .sum()\n", + " .rename(columns={\"area_km\": \"tile_area_km\"})\n", + " )\n", + "\n", + " out = out.join(exp_tile_areas, on=[\"x_ix\", \"y_ix\"])\n", + "\n", + " out = pd.merge(\n", + " out,\n", + " this_exp,\n", + " how=\"left\",\n", + " left_on=[\"x_ix\", \"y_ix\"],\n", + " right_on=[\"x_ix\", \"y_ix\"],\n", + " ).reset_index(drop=True)\n", + "\n", + " out = out.drop(columns=[\"x_ix\", \"y_ix\"])\n", + "\n", + " out[\"value\"] = out[\"value\"] * out[\"area_km\"] / out[\"tile_area_km\"]\n", + " out[\"pop_landscan\"] = out[\"pop_landscan\"] * out[\"area_km\"] / out[\"tile_area_km\"]\n", + "\n", + " out = out.drop(columns=[\"tile_area_km\"])\n", + "\n", + " out[\"value\"] = out[\"value\"].fillna(0)\n", + " out[\"pop_landscan\"] = out[\"pop_landscan\"].fillna(0)\n", + "\n", + " assert 
out.notnull().all().all()\n", + "\n", + " out = out.drop(columns=[\"lon\", \"lat\"])\n", + "\n", + " out = out.groupby(agg_fields, observed=True).sum().reset_index()\n", + "\n", + " # make sure no exposure was dropped or added from the original exposure within tile (within some margin of float error)\n", + " # include very low sums for 0 / 0 division (areas where there is no exposure, but we calculate anyway for diva areas)\n", + " assert (\n", + " this_exp[\"value\"].sum() < 0.00001\n", + " or np.abs(this_exp[\"value\"].sum() / out[\"value\"].sum() - 1) < 0.00001\n", + " )\n", + "\n", + " return out\n", + "\n", + "\n", + "@rhgu.block_globals\n", + "def process_tile(\n", + " tile_name,\n", + " sset,\n", + " calc_elev=True,\n", + " calc_exp=True,\n", + "):\n", + " warnings.filterwarnings(\"ignore\", message=\"Geometry is in a geographic CRS\")\n", + " warnings.filterwarnings(\"ignore\", message=\"CRS mismatch between the CRS\")\n", + " warnings.filterwarnings(\n", + " \"ignore\", message=\"Sequential read of iterator was interrupted\"\n", + " )\n", + "\n", + " out_path = get_tile_out_path(tile_name, sset)\n", + " bbox = spatial.get_bbox(tile_name)\n", + "\n", + " this_exp = load_exposure(bbox, sset) if calc_exp else None\n", + "\n", + " if calc_elev:\n", + " elev_tile = (\n", + " xr.open_rasterio(sset.DIR_MSS / f\"{tile_name}.tif\")\n", + " .squeeze(\"band\")\n", + " .drop(\"band\")\n", + " )\n", + " # Bundle higher-than-coastal elevation values into one to simplify later data processing\n", + " elev_tile = xr.where(elev_tile > sset.ELEV_CAP, sset.ELEV_CAP, elev_tile)\n", + " else:\n", + " elev_tile = spatial.get_granular_grid(bbox)\n", + "\n", + " # match tile points with countries, impact regions, protection zones\n", + " out = match_elev_pixels_to_shapes(elev_tile, bbox, sset)\n", + "\n", + " # get points on land, assign impact regions and countries at exposure grid level\n", + " out, negelev_pts = get_valid_points_df(elev_tile, bbox, out, sset)\n", + "\n", + " # 
if calc_elev:\n", + " seg_areas = out.groupby(\n", + " [\"seg_adm\", \"protection_zone\", \"wetland_flag\", \"z_ix\"],\n", + " as_index=False,\n", + " observed=True,\n", + " )[\"area_km\"].sum()\n", + "\n", + " negelev_areas = negelev_pts.groupby(\n", + " [\"seg_adm\", \"protection_zone\", \"wetland_flag\"],\n", + " as_index=False,\n", + " observed=True,\n", + " )[\"area_km\"].sum()\n", + " negelev_areas[\"z_ix\"] = -1\n", + "\n", + " seg_areas = pd.concat([seg_areas, negelev_areas], ignore_index=True)\n", + "\n", + " seg_areas = seg_areas[\n", + " (seg_areas[\"z_ix\"] <= 200) & (seg_areas[\"protection_zone\"] == -1)\n", + " ]\n", + "\n", + " seg_out_path = get_seg_area_out_path(tile_name, sset)\n", + " seg_areas.to_csv(seg_out_path, index=False)\n", + " if not calc_exp:\n", + " return seg_out_path\n", + "\n", + " if out.shape[0] == 0:\n", + " if calc_exp:\n", + " this_exp.to_csv(get_exp_noland_out_path(tile_name, sset), index=False)\n", + " return write_empty_csv(out_path)\n", + "\n", + " out = (\n", + " out[~out[\"wetland_flag\"]].drop(columns=[\"wetland_flag\"]).reset_index(drop=True)\n", + " )\n", + "\n", + " out = merge_exposure_to_highres_grid(this_exp, out, sset)\n", + "\n", + " out.to_csv(out_path, index=False)\n", + "\n", + " return out_path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Copy CIAM seg shapefiles if they haven't been updated for this version of the exposure grid" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_maj_min(vers_name):\n", + " major, minor = vers_name.split(\".\")\n", + " return int(major), int(minor)\n", + "\n", + "\n", + "exp_vers_maj, exp_vers_min = get_maj_min(sset.EXPOSURE_BINNED_VERS[1:])\n", + "\n", + "dir_shp = sset.DIR_CIAM_VORONOI.parent\n", + "\n", + "existing_vers = [get_maj_min(p.name[1:]) for p in list(dir_shp.glob(\"v*.*\"))]\n", + "\n", + "existing_vers.sort(key=lambda s: s[1])\n", + 
"existing_vers.sort(key=lambda f: f[0])\n", + "\n", + "latest_vers_maj, latest_vers_min = existing_vers[-1]\n", + "\n", + "if (exp_vers_maj, exp_vers_min) not in existing_vers:\n", + "\n", + " src_dir = dir_shp / (\"v\" + str(latest_vers_maj) + \".\" + str(latest_vers_min))\n", + " dst_dir = dir_shp / (\"v\" + str(exp_vers_maj) + \".\" + str(exp_vers_min))\n", + "\n", + " rhgcs.cp(src_dir, dst_dir, flags=[\"r\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Prepare output directories" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sset.DIR_EXPOSURE_BINNED.mkdir(exist_ok=True)\n", + "\n", + "sset.DIR_EXPOSURE_BINNED_TMP.mkdir(exist_ok=True)\n", + "\n", + "sset.DIR_EXPOSURE_BINNED_TMP_TILES.mkdir(exist_ok=True)\n", + "sset.DIR_EXPOSURE_BINNED_TMP_TILES_NOLAND.mkdir(exist_ok=True)\n", + "sset.DIR_EXPOSURE_BINNED_TMP_TILES_SEGMENT_AREA.mkdir(exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Get list of tiles to process" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tile_meta_path = sset.PATH_EXPOSURE_TILE_LIST\n", + "\n", + "tile_meta = pd.read_parquet(tile_meta_path)\n", + "\n", + "tile_groups = tile_meta.groupby(\"PROCESSING_SET\")[\"tile_name\"].unique().to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_tiles = np.concatenate(list(tile_groups.values()))\n", + "\n", + "finished_tiles = [t[:-4][:7] for t in rhgcs.ls(sset.DIR_EXPOSURE_BINNED_TMP_TILES)]\n", + "finished_segs = [\n", + " t[:-4][:7] for t in rhgcs.ls(sset.DIR_EXPOSURE_BINNED_TMP_TILES_SEGMENT_AREA)\n", + "]\n", + "\n", + "remaining_tiles = [\n", + " t for t in all_tiles if (t not in finished_tiles and t not in finished_segs)\n", + "]\n", + "\n", + "print(len(remaining_tiles))" + ] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client, cluster = rhgk.get_standard_cluster()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nworkers = 200\n", + "cluster.scale(nworkers)\n", + "cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import zipfile\n", + "from pathlib import Path\n", + "\n", + "from sliiders import __file__\n", + "\n", + "sliiders_dir = Path(__file__).parent\n", + "zipf = zipfile.ZipFile(\"sliiders.zip\", \"w\", zipfile.ZIP_DEFLATED)\n", + "for root, dirs, files in os.walk(sliiders_dir):\n", + " for file in files:\n", + " zipf.write(\n", + " os.path.join(root, file),\n", + " os.path.relpath(os.path.join(root, file), os.path.join(sliiders_dir, \"..\")),\n", + " )\n", + "zipf.close()\n", + "client.upload_file(\"sliiders.zip\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Without elevation\n", + "\n", + "Note: when running the three cells below, the Dask cluster may occasionally get stuck and stop making progress. We find that this occurrence is not tile-specific. In such cases, we advise the user to follow these steps:\n", + "\n", + "\n", + "1. Close the current Dask cluster and client by running `client.restart(); cluster.scale(0); client.close(); cluster.close()`\n", + "2. Once the Dask cluster and client have successfully closed, restart the notebook kernel.\n", + "3. Run all of the code up to this section, and the cell directly below. Make sure that the Dask cluster is successfully running.\n", + "4. Since only the tiles that have not yet been processed remain to be done, run `process_tile` (via `client.map`) on these remaining tiles. This can be done by running the cells below again, since the already-processed tiles will no longer be included in `remaining_tiles`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev_tiles = np.array(\n", + " [t for t in tile_groups[\"WITHOUTELEV\"] if t in remaining_tiles]\n", + ")\n", + "\n", + "withoutelev_tiles.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withoutelev_futures = client.map(\n", + " process_tile, withoutelev_tiles, sset=sset, calc_elev=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dd.progress(withoutelev_futures)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# With elevation\n", + "\n", + "Note: as in the without-elevation case, Dask may occasionally become stuck without making progress. In such cases, we advise the user to follow steps similar to those explained above, but without re-running the without-elevation workflow (i.e., after restarting the notebook, run all cells except the three under **Without elevation**, and work on the remaining tiles)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### With Exposure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_tiles = np.array([t for t in tile_groups[\"WITHELEV\"] if t in remaining_tiles])\n", + "\n", + "withelev_tiles.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "withelev_futures = client.map(\n", + " process_tile,\n", + " withelev_tiles,\n", + " sset=sset,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dd.progress(withelev_futures)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### No exposure (CIAM)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ciam_tiles = np.array([t for t in tile_groups[\"CIAM\"] if t in remaining_tiles])\n", + "ciam_tiles.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ciam_futures = client.map(\n", + " process_tile,\n", + " ciam_tiles,\n", + " sset=sset,\n", + " calc_exp=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dd.progress(ciam_futures)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Shutdown workers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.close()\n", + "cluster.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/create-SLIIDERS-ECON/exposure/README.md b/notebooks/create-SLIIDERS-ECON/exposure/README.md new file mode 100644 index 0000000..73561b5 --- /dev/null +++ b/notebooks/create-SLIIDERS-ECON/exposure/README.md @@ -0,0 +1,15 @@ +Run the notebooks in this directory in order: + +1. `create-coastline-segments`: Create segments from CoDEC points. +2. `create-segment-regions`: Divide the world up into Voronoi polygons for each segmentXregion. +3. `fill_missing_litpop_with_geg`: Fill missing regions in LitPop with data from GEG-15. +4. `vectorize-wetlands`: Transform wetlands rasters (GLOBCOVER and Global Mangrove Watch) into single shapefile. +5. `get_positive_elev_tiles`: Assign global 1-degree tiles to groups for tile processing notebook +6. `generate_datum_conversion_grid`: converts (interpolates) MDT data to match with geoid grid and combines geoid and MDT datasets +7. `create_dem_mss`: Create elevation grid relative to MSS +8. `generate_protected_areas`: Create areas that are "protected" from sea level rise. +9. `generate_exposure_tiles`: Assign population, asset value, elevation, segments, protected regions, and administrative regions to global 1-degree tiles. +10. `combine_exposure_tiles`: Combine 1-degree tiles into the following datasets: + * Exposure with elevation (coastal exposure) + * Exposure without elevation (all exposure) + * Areas by elevation diff --git a/notebooks/create-SLIIDERS-SLR/README.md b/notebooks/create-SLIIDERS-SLR/README.md new file mode 100644 index 0000000..bc08df4 --- /dev/null +++ b/notebooks/create-SLIIDERS-SLR/README.md @@ -0,0 +1,10 @@ +This directory contains notebooks to generate SLIIDERS-SLR; a dataset of gridded local sea-level Monte Carlo samples based on the LocalizeSL framework. 
+ +The final output is a Zarr store containing 10,000 Monte Carlo draws for each of the RCP scenarios and years (decadal), at each site ID (defined by LocalizeSL), for each corefile. + +The steps to produce this output are as follows: +1. `download-ifile-to-gcs.ipynb`: Define the corefiles (IFILES) that you'd like to use and download them to GCS. +2. `convert-mat-version.ipynb`: Convert the downloaded corefiles (IFILES) to the Octave-readable MATLAB v5 format. +3. `generate-projected-lsl.ipynb`: Dask workers running Octave. For any corefile, call the LocalizeSL `LocalizeStoredProjections` function, followed by `WriteTableMC`, to get outputs as TSVs. +4. `retrieve-num-gcms.ipynb`: Calculate number of GCMs for each site-year-scenario, for later use in clipping some sites due to data quality issues. +5. `process-localizesl-output.ipynb`: Combine all TSVs into a single Zarr store. Clip some sites based on data quality criteria. \ No newline at end of file diff --git a/notebooks/create-SLIIDERS-SLR/convert-mat-version.ipynb b/notebooks/create-SLIIDERS-SLR/convert-mat-version.ipynb new file mode 100644 index 0000000..294dd2d --- /dev/null +++ b/notebooks/create-SLIIDERS-SLR/convert-mat-version.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Convert MATLAB v7.3 corefiles to MATLAB v5\n", + "- Octave doesn't support v7.3 as of 2020-05-19\n", + "- We use the Mat I/O C library to convert v7.3 to v5 (Mat I/O doesn't support v7)\n", + "- Note that what is called \"v5\" in Mat I/O is the same as \"v6\" in Octave" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote MAT file: \"/gcs/rhg-data/impactlab-rhg/coastal/sliiders/int/slr/ifiles/SLRProjections170113GRIDDEDcore_v5.mat\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing MAT variable: OceanDynMean... 
succeeded.\n", + "Writing MAT variable: OceanDynN... succeeded.\n", + "Writing MAT variable: OceanDynRegions... succeeded.\n", + "Writing MAT variable: OceanDynStd... succeeded.\n", + "Writing MAT variable: OceanDynTECorr... succeeded.\n", + "Writing MAT variable: OceanDynYears... succeeded.\n", + "Writing MAT variable: ThermExpMean... succeeded.\n", + "Writing MAT variable: ThermExpN... succeeded.\n", + "Writing MAT variable: ThermExpStd... succeeded.\n", + "Writing MAT variable: ThermExpYears... succeeded.\n", + "Writing MAT variable: colAIS... succeeded.\n", + "Writing MAT variable: colGIC... succeeded.\n", + "Writing MAT variable: colGIS... succeeded.\n", + "Writing MAT variable: colLS... succeeded.\n", + "Writing MAT variable: colTE... succeeded.\n", + "Writing MAT variable: fpsite... succeeded.\n", + "Writing MAT variable: mergeZOSZOSTOGA... succeeded.\n", + "Writing MAT variable: nearestTG... succeeded.\n", + "Writing MAT variable: quantlevs... succeeded.\n", + "Writing MAT variable: rateprojs... succeeded.\n", + "Writing MAT variable: rateprojssd... succeeded.\n", + "Writing MAT variable: samps... succeeded.\n", + "Writing MAT variable: scens... succeeded.\n", + "Writing MAT variable: seeds... succeeded.\n", + "Writing MAT variable: targregionnames... succeeded.\n", + "Writing MAT variable: targregions... succeeded.\n", + "Writing MAT variable: targsitecoords... succeeded.\n", + "Writing MAT variable: targyears... succeeded.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote MAT file: \"/gcs/rhg-data/impactlab-rhg/coastal/sliiders/int/slr/ifiles/SLRProjections190726core_SEJ_full_v5.mat\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing MAT variable: corefileH... succeeded.\n", + "Writing MAT variable: corefileL... 
succeeded.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote MAT file: \"/gcs/rhg-data/impactlab-rhg/coastal/sliiders/int/slr/ifiles/SLRProjections200204GRIDDEDcore_D20_v5.mat\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing MAT variable: OceanDynMean... succeeded.\n", + "Writing MAT variable: OceanDynN... succeeded.\n", + "Writing MAT variable: OceanDynRegions... succeeded.\n", + "Writing MAT variable: OceanDynStd... succeeded.\n", + "Writing MAT variable: OceanDynTECorr... succeeded.\n", + "Writing MAT variable: OceanDynYears... succeeded.\n", + "Writing MAT variable: ThermExpMean... succeeded.\n", + "Writing MAT variable: ThermExpN... succeeded.\n", + "Writing MAT variable: ThermExpStd... succeeded.\n", + "Writing MAT variable: ThermExpYears... succeeded.\n", + "Writing MAT variable: colAIS... succeeded.\n", + "Writing MAT variable: colGIC... succeeded.\n", + "Writing MAT variable: colGIS... succeeded.\n", + "Writing MAT variable: colLS... succeeded.\n", + "Writing MAT variable: colTE... succeeded.\n", + "Writing MAT variable: fpsite... succeeded.\n", + "Writing MAT variable: mergeZOSZOSTOGA... succeeded.\n", + "Writing MAT variable: nearestTG... succeeded.\n", + "Writing MAT variable: quantlevs... succeeded.\n", + "Writing MAT variable: rateprojs... succeeded.\n", + "Writing MAT variable: rateprojssd... succeeded.\n", + "Writing MAT variable: samps... succeeded.\n", + "Writing MAT variable: scens... succeeded.\n", + "Writing MAT variable: seeds... succeeded.\n", + "Writing MAT variable: targregionnames... succeeded.\n", + "Writing MAT variable: targregions... succeeded.\n", + "Writing MAT variable: targsitecoords... succeeded.\n", + "Writing MAT variable: targyears... succeeded.\n", + "Writing MAT variable: OceanDynMean... succeeded.\n", + "Writing MAT variable: OceanDynN... succeeded.\n", + "Writing MAT variable: OceanDynRegions... succeeded.\n", + "Writing MAT variable: OceanDynStd... 
succeeded.\n", + "Writing MAT variable: OceanDynTECorr... succeeded.\n", + "Writing MAT variable: OceanDynYears... succeeded.\n", + "Writing MAT variable: ThermExpMean... succeeded.\n", + "Writing MAT variable: ThermExpN... succeeded.\n", + "Writing MAT variable: ThermExpStd... succeeded.\n", + "Writing MAT variable: ThermExpYears... succeeded.\n", + "Writing MAT variable: colAIS... succeeded.\n", + "Writing MAT variable: colGIC... succeeded.\n", + "Writing MAT variable: colGIS... succeeded.\n", + "Writing MAT variable: colLS... succeeded.\n", + "Writing MAT variable: colTE... succeeded.\n", + "Writing MAT variable: fpsite... succeeded.\n", + "Writing MAT variable: mergeZOSZOSTOGA... succeeded.\n", + "Writing MAT variable: nearestTG... succeeded.\n", + "Writing MAT variable: quantlevs... succeeded.\n", + "Writing MAT variable: rateprojs... succeeded.\n", + "Writing MAT variable: rateprojssd... succeeded.\n", + "Writing MAT variable: samps... succeeded.\n", + "Writing MAT variable: scens... succeeded.\n", + "Writing MAT variable: seeds... succeeded.\n", + "Writing MAT variable: targregionnames... succeeded.\n", + "Writing MAT variable: targregions... succeeded.\n", + "Writing MAT variable: targsitecoords... succeeded.\n", + "Writing MAT variable: targyears... 
succeeded.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote MAT file: \"/gcs/rhg-data/impactlab-rhg/coastal/sliiders/int/slr/ifiles/SLRProjections210628GRIDDEDcore_SROCC_v5.mat\"\n" + ] + } + ], + "source": [ + "import shlex\n", + "from subprocess import run\n", + "\n", + "from sliiders.settings import DIR_IFILES_INT, DIR_IFILES_RAW\n", + "\n", + "DIR_IFILES_INT.mkdir(parents=True, exist_ok=True)\n", + "\n", + "for mfile_in in DIR_IFILES_RAW.glob(\"*.mat\"):\n", + " mfile_out = DIR_IFILES_INT / (mfile_in.stem + \"_v5\" + mfile_in.suffix)\n", + "\n", + " cmd = f\"matio_73to5 {mfile_in} {mfile_out}\"\n", + " run(shlex.split(cmd))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/create-SLIIDERS-SLR/download-ifile-to-gcs.ipynb b/notebooks/create-SLIIDERS-SLR/download-ifile-to-gcs.ipynb new file mode 100644 index 0000000..7c8198f --- /dev/null +++ b/notebooks/create-SLIIDERS-SLR/download-ifile-to-gcs.ipynb @@ -0,0 +1,82 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare LocalizeSL corefiles for Octave processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Choose corefiles from LocalizeSL/IFILES to use (see readme immediately in that directory for more details on the different corefiles)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import 
Path\n", + "\n", + "import requests\n", + "\n", + "from sliiders.settings import DIR_IFILES_RAW, LOCALIZESL_COREFILES, LOCALIZESL_REV\n", + "\n", + "DIR_IFILES_RAW.mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for corefile_name in LOCALIZESL_COREFILES:\n", + " url = (\n", + " f\"https://github.com/bobkopp/LocalizeSL/raw/{LOCALIZESL_REV}/IFILES/\"\n", + " f\"{corefile_name}.mat\"\n", + " )\n", + "\n", + " filename = Path(url).name\n", + " path_out = DIR_IFILES_RAW / filename\n", + "\n", + " pathstr = str(path_out)\n", + "\n", + " r = requests.get(url)\n", + " path_out.write_bytes(r.content)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/create-SLIIDERS-SLR/generate-projected-lsl.ipynb b/notebooks/create-SLIIDERS-SLR/generate-projected-lsl.ipynb new file mode 100644 index 0000000..faab24a --- /dev/null +++ b/notebooks/create-SLIIDERS-SLR/generate-projected-lsl.ipynb @@ -0,0 +1,1426 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Generate Monte Carlo outputs for each gridded site in a corefile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "import dask.distributed as dd\n", + "import 
numpy as np\n", + "import parameterize_jobs as pj\n", + "from dask_gateway import Gateway\n", + "from gcsfs import GCSFileSystem\n", + "from IPython.display import display\n", + "from scipy.io import loadmat\n", + "\n", + "import rhg_compute_tools.gcs as rhgcs\n", + "from sliiders.settings import DIR_IFILES_INT, DIR_SLR_INT, LOCALIZESL_COREFILES\n", + "from sliiders.utils import upload_pkg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "FS = GCSFileSystem(token=\"/opt/gcsfuse_tokens/rhg-data.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "TMP_DIR = DIR_SLR_INT / \"tmp\"\n", + "\n", + "BATCH_SIZE = 32\n", + "\n", + "DIR_MFILES_SRC = Path(\"../../LocalizeSL/MFILES\")\n", + "TMP_MFILES_DIR = TMP_DIR / \"MFILES\"\n", + "TMP_MFILES_DIR.mkdir(exist_ok=True, parents=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Copy MFILES to location accessible by workers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cp_output = FS.put(\n", + " str(DIR_MFILES_SRC), TMP_MFILES_DIR.relative_to(\"/gcs\"), recursive=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gateway = Gateway()\n", + "cluster = gateway.new_cluster(\n", + " idle_timeout=3600,\n", + " profile=\"micro\",\n", + ")\n", + "client = cluster.get_client()\n", + "cluster.scale(64)\n", + "\n", + "upload_pkg(client, \"../../sliiders\")\n", + "cluster" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Octave function used to run LocalizeSL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], 
+ "source": [ + "get_lslr_func = f\"\"\"\n", + "function this_ids = get_lslr(ix_start, ix_end, corefile_name, subcorefile_choice, dir_out)\n", + " ifilesdir='{DIR_IFILES_INT}';\n", + " mfilesdir='{TMP_MFILES_DIR}'\n", + "\n", + " addpath(ifilesdir);\n", + " addpath(mfilesdir);\n", + " \n", + " pkg load statistics\n", + "\n", + " f = [corefile_name '_v5.mat'];\n", + "\n", + " corefilewrapper=load(fullfile(ifilesdir, f));\n", + "\n", + " mkdir(dir_out);\n", + "\n", + " ccclab = \"SROCC\";\n", + " if strcmp(corefile_name, 'SLRProjections170113GRIDDEDcore')\n", + " ccclab = \"170113\";\n", + " end\n", + " if strcmp(corefile_name, 'SLRProjections190726core_SEJ_full')\n", + " disp(\"Make sure to run twice. Once for corefileL and once for corefileH\");\n", + " ccclab = \"SEJ\";\n", + " if strcmp(subcorefile_choice, \"H\")\n", + " corefile = corefilewrapper.corefileH;\n", + " else\n", + " corefile = corefilewrapper.corefileL;\n", + " end\n", + " else\n", + " corefile = corefilewrapper;\n", + " end\n", + " disp([\"Corefile: \" corefile_name]);\n", + " disp([\"Corefile label: \" ccclab]);\n", + " disp([\"Corefile subgroup: \" subcorefile_choice]);\n", + " \n", + " rateproj_corefile = load(fullfile(ifilesdir, 'SLRProjections190726core_SEJ_full_v5.mat')).corefileL;\n", + "\n", + " % Take corefile.targregionnames\n", + " % Get index of each name in rateproj_corefile.targregionnames\n", + " % Take those indices from rateprojs and rateprojssd\n", + " [_, idx_this_corefile, idx_rateproj_corefile] = intersect(corefile.targregionnames, rateproj_corefile.targregionnames);\n", + " corefile.rateprojs(idx_this_corefile) = rateproj_corefile.rateprojs(idx_rateproj_corefile);\n", + " corefile.rateprojssd(idx_this_corefile) = rateproj_corefile.rateprojssd(idx_rateproj_corefile);\n", + "\n", + " rand(\"seed\", 0);\n", + " corefile.seeds = [];\n", + " for rrr=1:size(corefile.samps, 2)\n", + " seeds=linspace(0,1,size(corefile.samps, 1) + 2);\n", + " seeds=seeds(2:end-1);\n", + " 
seeds=norminv(seeds(randperm(length(seeds))));\n", + "\n", + " corefile.seeds = [corefile.seeds; seeds];\n", + " end\n", + "\n", + " siteids = int64(corefile.targregions);\n", + "\n", + " % Subset `siteids` to the really high-numbered ones (the gridded ones, rather than those indexed by PSMSL stations)\n", + " gridids = siteids(siteids > 100000000);\n", + "\n", + " disp([\"Number of sites: \" mat2str(length(gridids))]);\n", + " \n", + " if (ix_start == 0 && ix_end == 0)\n", + " this_ids = [0];\n", + " else\n", + " this_ids = gridids(ix_start:ix_end);\n", + " end\n", + "\n", + " n_ids = size(this_ids)(1);\n", + " for i=1:n_ids\n", + " [sampslocrise,sampsloccomponents,siteids,sitenames,targyears,scens,cols] = LocalizeStoredProjections(this_ids(i),corefile); \n", + " if this_ids(i) == 0\n", + " sl_str = \"GSL\";\n", + " else\n", + " sl_str = \"LSL\";\n", + " WriteTableMC(sampsloccomponents,24,siteids,sitenames,targyears,scens,[dir_out sl_str 'proj_MC_' ccclab '_baseline_']);\n", + " end\n", + " WriteTableMC(sampsloccomponents,[],siteids,sitenames,targyears,scens,[dir_out sl_str 'proj_MC_' ccclab '_']);\n", + " end\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Python wrappers needed for running Octave function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_batch(start, end, corefile, sub_corefile, dir_out):\n", + " from oct2py import octave\n", + "\n", + " octave.eval(get_lslr_func)\n", + " return octave.get_lslr(start, end, corefile, sub_corefile, str(dir_out) + \"/\")\n", + "\n", + "\n", + "def get_num_sites(corefile, sub_corefile=None):\n", + " path_corefile = DIR_IFILES_INT / (corefile + \"_v5.mat\")\n", + "\n", + " cf = loadmat(path_corefile, squeeze_me=True)\n", + "\n", + " if sub_corefile is None:\n", + " targ_regions = cf[\"targregions\"]\n", + " else:\n", + " targ_regions = cf[\"corefile\" + sub_corefile][\"targregions\"].item()\n", + 
"\n", + " return (targ_regions > 100000000).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Run jobs on workers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "futures = dict()\n", + "\n", + "n_corefiles = len(sum(LOCALIZESL_COREFILES.values(), []))\n", + "for corefile, sub_corefiles in LOCALIZESL_COREFILES.items():\n", + " futures[corefile] = dict()\n", + " for sub_corefile in sub_corefiles:\n", + "\n", + " num_sites = get_num_sites(corefile, sub_corefile)\n", + "\n", + " # get beginning and ending index for each batch\n", + " starts = np.arange(1, num_sites, BATCH_SIZE)\n", + " ends = np.arange(BATCH_SIZE, num_sites + BATCH_SIZE, BATCH_SIZE)\n", + " ends[-1] = num_sites\n", + "\n", + " # add gsl\n", + " starts = np.hstack(([0], starts))\n", + " ends = np.hstack(([0], ends))\n", + "\n", + " # get out dir\n", + " dir_out = TMP_DIR / corefile / \"mc_tsv\"\n", + " dir_out.mkdir(parents=True, exist_ok=True)\n", + "\n", + " # get jobs\n", + " jobs = pj.Constant(\n", + " corefile=corefile, sub_corefile=sub_corefile, dir_out=dir_out\n", + " ) * pj.ParallelComponentSet(start=starts, end=ends)\n", + "\n", + " # map jobs\n", + " futures[corefile][sub_corefile] = client.map(pj.expand_kwargs(run_batch), jobs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean up temporary MFILES and close cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dd.wait(futures)\n", + "cluster.close()\n", + "client.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "FS.rm(str(TMP_MFILES_DIR.relative_to(\"/gcs\")), recursive=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "nbdime-conflicts": { + "local_diff": [ + { + "key": "widgets", + "op": "add", + "value": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + } + ], + "remote_diff": [ + { + "key": "widgets", + "op": "add", + "value": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "01751e9f631f4ad29269930e933bb9a1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_9087d1288cc74b25b96bc36457cf9c86", + "IPY_MODEL_3e4f7718cfda4cd0a3f828c077674a29" + ], + "layout": "IPY_MODEL_a287cc2020e941dbb4ec8862559d8d03" + } + }, + "025b107359e8494985fcefba1c2e43ee": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "026c0c222181416ab109ca7372d2c9bf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntTextModel", + "state": { + "description": "Minimum", + "layout": "IPY_MODEL_4e51cd8a9cc8443b9106b6d0eabc234c", + "step": 1, + "style": "IPY_MODEL_3f647d95b04142309b44ae885f866b92" + } + }, + "0755c45d2beb436a9e625c55adb7a929": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "09a5e0b9bf1648eab18faf2e9cf58146": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_d87c4689c4c44600a3c6e321c19c16f3", + "style": "IPY_MODEL_0755c45d2beb436a9e625c55adb7a929", + "value": "
67 / 67
" + } + }, + "0a7542ae1e874ca489b5cbfced51b642": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "0cc92f85655f40d8a3e3ce8d85051489": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "10ef31469b124f85bde701a7a52315bf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "1a6b7b3270494781b97a90279c2261b4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_fd53353613644e2f87ace812c76885a5", + "style": "IPY_MODEL_eb249e5749ec4388abe3b2825fa0f3a5", + "value": "
Finished: 1hr 11min 7.0s
" + } + }, + "1b9cdd9c6fab4c37ab2c7b23a1595312": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "2104895dcf34414cbcc739b509bdeca2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "22e8ec715f0043ee88310eb409224faa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "24d74565e58a4ff7b9f8cc257726d629": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "26e469238501404f96542480e40a64e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": {} + }, + "2819b95095a4402da59fb723ff853010": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_33c5b71c428049ccbeb6c740b30b7ea2", + "style": "IPY_MODEL_3f62acfa3c064961910b134f92ed7ab6", + "value": "
run_batch
" + } + }, + "28f4cce95baf40bfa512dbb8936e142c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_a20af9530e4e47fe81d1c3c30e8bd436", + "style": "IPY_MODEL_834f4cc42c1546538f18bbffcde38398", + "value": "
Finished: 46min 59.5s
" + } + }, + "29932152f2d640179a8ae7580e96df08": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_93c4637d3469462b99192a248cbafd87", + "IPY_MODEL_a9ac73548240496eb11975dfcdb9c044", + "IPY_MODEL_897ffbbe561c4738b954fa0221797db7" + ], + "layout": "IPY_MODEL_67bc2a1fdda64e77b60cc2ae138d4f14" + } + }, + "33c5b71c428049ccbeb6c740b30b7ea2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "36e094751a9c4711b4b6c55f54c41fa8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "37c9c23980704317835e235be6179486": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "38bc474a0d06487180f71a71ae69c5f6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "3e4f7718cfda4cd0a3f828c077674a29": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_567f1375d858498d921e55182b8fdb5c" + ], + "layout": "IPY_MODEL_e32474e03e8f4559802b6144cd6a9aa4" + } + }, + "3f62acfa3c064961910b134f92ed7ab6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "3f647d95b04142309b44ae885f866b92": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "43a78cbc080d4d94841c8f8af8b50eb4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": 
"1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "44c9f1e0775e4318a830fd4565de82c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_4597fb498328415cb6321cdd865ea3a9", + "IPY_MODEL_6dca74c3da00424e837b4fd8fd660fd8", + "IPY_MODEL_2819b95095a4402da59fb723ff853010" + ], + "layout": "IPY_MODEL_a97a55314fee44b58172aecc6ab52ac4" + } + }, + "4597fb498328415cb6321cdd865ea3a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_2104895dcf34414cbcc739b509bdeca2", + "style": "IPY_MODEL_928181208ce2443f8278c26744d66926", + "value": "
67 / 67
" + } + }, + "4db704dc7d05472c8b2f00743ef0a6ed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "4e51cd8a9cc8443b9106b6d0eabc234c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "width": "150px" + } + }, + "4f2b0dd904ce464e8eabc3de04a6dca5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_5bdc0c82d0034781bf5d3c84129405db", + "IPY_MODEL_5861d125a6b84e2fb4df27ea013529a3" + ], + "layout": "IPY_MODEL_ef551c4113954c5aab97b743838180b4" + } + }, + "52822a8c109c4f47ad1e27b36dad4498": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "55278627ad9f4195a3401edc7f0ce7eb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_a891144765dc45e484b20ea4aa7a0906", + "IPY_MODEL_89fb4b6b807e420484a057e02a089834", + "IPY_MODEL_f6930f6fc7ee4d80b0adf733e2eced6c", + "IPY_MODEL_f953bcfe9daf4ac8ba8e4127453b36fd" + ], + "layout": "IPY_MODEL_1b9cdd9c6fab4c37ab2c7b23a1595312" + } + }, + "567f1375d858498d921e55182b8fdb5c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_68bc2ee7a74f4c2cbe40b167f4d34a22", + "IPY_MODEL_d9457500ca8945f788df02c372ef5fcd", + "IPY_MODEL_b932d9780cf9482ca4bc3eb88183c5c8" + ], + "layout": "IPY_MODEL_0a7542ae1e874ca489b5cbfced51b642" + } + }, + "56bc83bb6b5b43a587f918caa2ca64f4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "5861d125a6b84e2fb4df27ea013529a3": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "description": "Scale", + "layout": "IPY_MODEL_4e51cd8a9cc8443b9106b6d0eabc234c", + "style": "IPY_MODEL_69fc7ae4f5b4470faa449b8840a88e07" + } + }, + "58a5f49ba22c4157ab3c8bedfda73b52": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_95b0281a6dea40d9ba011de14bdbfc7e", + "style": "IPY_MODEL_36e094751a9c4711b4b6c55f54c41fa8", + "value": "
run_batch
" + } + }, + "5b2edaa051054124b317aa95de3b0a6a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_e62ab9c3134340ca832b788beeb30e12", + "style": "IPY_MODEL_9675c2267a35465ab2a9c2fe5da02f9a", + "value": "
run_batch
" + } + }, + "5bdc0c82d0034781bf5d3c84129405db": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntTextModel", + "state": { + "description": "Workers", + "layout": "IPY_MODEL_4e51cd8a9cc8443b9106b6d0eabc234c", + "step": 1, + "style": "IPY_MODEL_fabcf155c317400e92f810e06f4b44d0" + } + }, + "5c40b198f23b49d8b531ca1ad7a79e10": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "60170fc9b8f7436bb8d8ef63385553d5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "6749914566c840f6bf5c62eee6c8e740": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "67bc2a1fdda64e77b60cc2ae138d4f14": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "67c59408d5184f3a860eb11ecb43510e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "68bc2ee7a74f4c2cbe40b167f4d34a22": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_840292dcfa7f4a4d897dca4c2a18fa63", + "style": "IPY_MODEL_60170fc9b8f7436bb8d8ef63385553d5", + "value": "
67 / 67
" + } + }, + "69fc7ae4f5b4470faa449b8840a88e07": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": {} + }, + "6dca74c3da00424e837b4fd8fd660fd8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_aaa9b3dcff9a4761bf416075ee9c5371", + "max": 1, + "style": "IPY_MODEL_75e5184b73204e698bc55dd201b4245f", + "value": 1 + } + }, + "720cccf78c934a4b84f6f45f1d8a0af5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "72569d836c9e40f0909f8828cd2cdd4d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "737eb9c5b9bf4fc48d92e9bbffa1c77a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_29932152f2d640179a8ae7580e96df08" + ], + "layout": "IPY_MODEL_f29d6daccdab494ca9c05f43bd131207" + } + }, + "73c5d03be3fc4385af68f3c2ddc40e6c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "74097c600949435b84433cfca93f9dab": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "74bab6fe2d904d8fa2abe91c57f49c0c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "75e5184b73204e698bc55dd201b4245f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" 
+ } + }, + "78a6359adf57408090e190c8280787a7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "7913eda5863241f295001fd9a823f6a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_28f4cce95baf40bfa512dbb8936e142c", + "IPY_MODEL_737eb9c5b9bf4fc48d92e9bbffa1c77a" + ], + "layout": "IPY_MODEL_67c59408d5184f3a860eb11ecb43510e" + } + }, + "7a4dad3cf68b4e48b62e4fb446171ce3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "7b98d59f26354a6db4c6296ed34899c7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_09a5e0b9bf1648eab18faf2e9cf58146", + "IPY_MODEL_8873037bb2724e93b5d5b7a672cbf5e8", + "IPY_MODEL_5b2edaa051054124b317aa95de3b0a6a" + ], + "layout": "IPY_MODEL_38bc474a0d06487180f71a71ae69c5f6" + } + }, + "7fadbe16d1094cb38147abb7cb2f4925": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntTextModel", + "state": { + "description": "Maximum", + "layout": "IPY_MODEL_4e51cd8a9cc8443b9106b6d0eabc234c", + "step": 1, + "style": "IPY_MODEL_d8c67fb04dba41dc858e0da764b0e6d5" + } + }, + "81a92892f3b54601827100fdfd3bf219": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "834f4cc42c1546538f18bbffcde38398": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "840292dcfa7f4a4d897dca4c2a18fa63": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + 
}, + "8439432eab1b465a9f1196349dc4c136": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_88bf4ff86b27405d835dffb337d5b9d8", + "style": "IPY_MODEL_37c9c23980704317835e235be6179486", + "value": "
Finished: 1hr 17min 56.4s
" + } + }, + "8873037bb2724e93b5d5b7a672cbf5e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_9b96383ec86a44a4b2093ea405248a60", + "max": 1, + "style": "IPY_MODEL_6749914566c840f6bf5c62eee6c8e740", + "value": 1 + } + }, + "88bf4ff86b27405d835dffb337d5b9d8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "897ffbbe561c4738b954fa0221797db7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_74097c600949435b84433cfca93f9dab", + "style": "IPY_MODEL_9e581f60ab2f440cbb20185130d99954", + "value": "
run_batch
" + } + }, + "89fb4b6b807e420484a057e02a089834": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_fa800f253f394091b4db6bd335b3f4b8", + "IPY_MODEL_d6f815cd55c448a1a88e992d9912326c" + ], + "layout": "IPY_MODEL_52822a8c109c4f47ad1e27b36dad4498" + } + }, + "8b31180c33f44b6e8933e9cc0ec97903": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "8d0fbf56bfc140659768ed3f41c45d90": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "9087d1288cc74b25b96bc36457cf9c86": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_c9e534d150a14788a78ab42359f3dfab", + "style": "IPY_MODEL_73c5d03be3fc4385af68f3c2ddc40e6c", + "value": "
Finished: 9min 13.3s
" + } + }, + "90ee381c05984631b0607e0c4e5c834c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "description": "Adapt", + "layout": "IPY_MODEL_4e51cd8a9cc8443b9106b6d0eabc234c", + "style": "IPY_MODEL_26e469238501404f96542480e40a64e5" + } + }, + "928181208ce2443f8278c26744d66926": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "928228174ba64e6fa7e3400e55e237f7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "93c4637d3469462b99192a248cbafd87": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_025b107359e8494985fcefba1c2e43ee", + "style": "IPY_MODEL_22e8ec715f0043ee88310eb409224faa", + "value": "
67 / 67
" + } + }, + "95b0281a6dea40d9ba011de14bdbfc7e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "9675c2267a35465ab2a9c2fe5da02f9a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "96ec83ef01fa4b6e8159cc3bb051509c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_a69276475e614fb6b89a163b5735ed36" + ], + "layout": "IPY_MODEL_7a4dad3cf68b4e48b62e4fb446171ce3" + } + }, + "9b96383ec86a44a4b2093ea405248a60": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "9cb81102a0cd4a5181d3417d1236d930": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_10ef31469b124f85bde701a7a52315bf", + "max": 1, + "style": "IPY_MODEL_78a6359adf57408090e190c8280787a7", + "value": 1 + } + }, + "9e581f60ab2f440cbb20185130d99954": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "a20af9530e4e47fe81d1c3c30e8bd436": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "a287cc2020e941dbb4ec8862559d8d03": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "a69276475e614fb6b89a163b5735ed36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_fa9e12916ada4a3c8f18016054293783", + 
"IPY_MODEL_9cb81102a0cd4a5181d3417d1236d930", + "IPY_MODEL_58a5f49ba22c4157ab3c8bedfda73b52" + ], + "layout": "IPY_MODEL_56bc83bb6b5b43a587f918caa2ca64f4" + } + }, + "a7faa49336aa4d049956a262bf2e5d84": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "a891144765dc45e484b20ea4aa7a0906": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_a7faa49336aa4d049956a262bf2e5d84", + "style": "IPY_MODEL_928228174ba64e6fa7e3400e55e237f7", + "value": "

GatewayCluster

" + } + }, + "a97a55314fee44b58172aecc6ab52ac4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "a9ac73548240496eb11975dfcdb9c044": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_8b31180c33f44b6e8933e9cc0ec97903", + "max": 1, + "style": "IPY_MODEL_74bab6fe2d904d8fa2abe91c57f49c0c", + "value": 1 + } + }, + "aaa9b3dcff9a4761bf416075ee9c5371": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "aaf076c445f243f7a57f16607cc200e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_8439432eab1b465a9f1196349dc4c136", + "IPY_MODEL_96ec83ef01fa4b6e8159cc3bb051509c" + ], + "layout": "IPY_MODEL_72569d836c9e40f0909f8828cd2cdd4d" + } + }, + "ab4d0e904841425abf43a4b763e2d3e2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "ab5931b37c66495380648b6fc1c41fdd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "ac00b4f535dc4fa28df607d36261ba02": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "b745ecb0604442f4934d64d3de414140": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_026c0c222181416ab109ca7372d2c9bf", + "IPY_MODEL_7fadbe16d1094cb38147abb7cb2f4925", + "IPY_MODEL_90ee381c05984631b0607e0c4e5c834c" + ], + "layout": "IPY_MODEL_d71072ec97d24e24b1112ffe62a51af5" + } + }, + 
"b932d9780cf9482ca4bc3eb88183c5c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_5c40b198f23b49d8b531ca1ad7a79e10", + "style": "IPY_MODEL_0cc92f85655f40d8a3e3ce8d85051489", + "value": "
run_batch
" + } + }, + "bb592927448241b9ac4329b772f7bf86": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "be2d98ad206a47ed944cf4d42fb9d328": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "be7f5d3fadb44292873ce4da0d68b1b4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "c9e534d150a14788a78ab42359f3dfab": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "cdb9b031355c4441b8b31d5c95be1dc5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "min_width": "150px" + } + }, + "ce7059379a394fb2bbe0b05add67b2f7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "d1aa2ba6533b433686f4c03d30ec42bb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_e47746b5a42348688e2f045605dee7d1", + "IPY_MODEL_e4341f78b68f435ba6e6d2e9cb0de7d4" + ], + "layout": "IPY_MODEL_be7f5d3fadb44292873ce4da0d68b1b4" + } + }, + "d6f815cd55c448a1a88e992d9912326c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "AccordionModel", + "state": { + "_titles": { + "0": "Manual Scaling", + "1": "Adaptive Scaling" + }, + "children": [ + "IPY_MODEL_4f2b0dd904ce464e8eabc3de04a6dca5", + "IPY_MODEL_b745ecb0604442f4934d64d3de414140" + ], + "layout": "IPY_MODEL_f91409ee90c14b29af8e8a956990badd", + "selected_index": null + } + }, + "d70653312b8c4d49a3c44a62c7cf694c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": 
"1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_1a6b7b3270494781b97a90279c2261b4", + "IPY_MODEL_efe768b143594a3a94aaaa4e67ae67aa" + ], + "layout": "IPY_MODEL_ce7059379a394fb2bbe0b05add67b2f7" + } + }, + "d71072ec97d24e24b1112ffe62a51af5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "d87c4689c4c44600a3c6e321c19c16f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "d8c67fb04dba41dc858e0da764b0e6d5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "d9457500ca8945f788df02c372ef5fcd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_bb592927448241b9ac4329b772f7bf86", + "max": 1, + "style": "IPY_MODEL_8d0fbf56bfc140659768ed3f41c45d90", + "value": 1 + } + }, + "dfb2dc339a494ea891ca3a03a38a839f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "e2c6461db44d4fcfba77abd869b822eb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "e32474e03e8f4559802b6144cd6a9aa4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "e4341f78b68f435ba6e6d2e9cb0de7d4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_7b98d59f26354a6db4c6296ed34899c7" + ], + "layout": "IPY_MODEL_ab5931b37c66495380648b6fc1c41fdd" + } + }, + 
"e47746b5a42348688e2f045605dee7d1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_43a78cbc080d4d94841c8f8af8b50eb4", + "style": "IPY_MODEL_dfb2dc339a494ea891ca3a03a38a839f", + "value": "
Finished: 1hr 16min 39.9s
" + } + }, + "e62ab9c3134340ca832b788beeb30e12": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "eb249e5749ec4388abe3b2825fa0f3a5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "ef551c4113954c5aab97b743838180b4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "efe768b143594a3a94aaaa4e67ae67aa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_44c9f1e0775e4318a830fd4565de82c8" + ], + "layout": "IPY_MODEL_24d74565e58a4ff7b9f8cc257726d629" + } + }, + "f29d6daccdab494ca9c05f43bd131207": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "f6930f6fc7ee4d80b0adf733e2eced6c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_e2c6461db44d4fcfba77abd869b822eb", + "style": "IPY_MODEL_720cccf78c934a4b84f6f45f1d8a0af5", + "value": "

Name: daskhub-dev.7b42fe169b8647ed92c39ec0acec6d82

" + } + }, + "f91409ee90c14b29af8e8a956990badd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "min_width": "500px" + } + }, + "f953bcfe9daf4ac8ba8e4127453b36fd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_ab4d0e904841425abf43a4b763e2d3e2", + "style": "IPY_MODEL_ac00b4f535dc4fa28df607d36261ba02", + "value": "

Dashboard: /services/dask-gateway/clusters/daskhub-dev.7b42fe169b8647ed92c39ec0acec6d82/status

\n" + } + }, + "fa800f253f394091b4db6bd335b3f4b8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_cdb9b031355c4441b8b31d5c95be1dc5", + "style": "IPY_MODEL_4db704dc7d05472c8b2f00743ef0a6ed", + "value": "\n
\n\n\n \n \n \n
Workers 64
Cores 64
Memory 416.00 GiB
\n
\n" + } + }, + "fa9e12916ada4a3c8f18016054293783": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_81a92892f3b54601827100fdfd3bf219", + "style": "IPY_MODEL_be2d98ad206a47ed944cf4d42fb9d328", + "value": "
67 / 67
" + } + }, + "fabcf155c317400e92f810e06f4b44d0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "fd53353613644e2f87ace812c76885a5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + } + }, + "version_major": 2, + "version_minor": 0 + } + } + } + ] + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/create-SLIIDERS-SLR/process-localizesl-output.ipynb b/notebooks/create-SLIIDERS-SLR/process-localizesl-output.ipynb new file mode 100644 index 0000000..3a4cdb6 --- /dev/null +++ b/notebooks/create-SLIIDERS-SLR/process-localizesl-output.ipynb @@ -0,0 +1,836 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Aggregate TSV outputs of LocalizeSL into Zarr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.config\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xarray as xr\n", + "from dask_gateway import Gateway\n", + "from gcsfs import GCSFileSystem\n", + "from tqdm.notebook import tqdm\n", + "\n", + "from rhg_compute_tools import xarray as rhgx\n", + "from sliiders.settings import (\n", + " DIR_SLR_INT,\n", + " LOCALIZESL_COREFILES,\n", + " PATH_SLIIDERS_SLR,\n", + " PATH_SLR_N_GCMS,\n", + ")\n", + "from sliiders.utils import upload_pkg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# attrs for output zarr\n", + "AUTHOR = \"Ian Bolliger, Daniel 
Allen\"\n", + "CONTACT = \"ibolliger@rhg.com, dallen@berkeley.edu\"\n", + "METHOD = \"\"\"LocalizeSL was used to estimate monte carlo draws of future GMSL and LMSL relative to a vertical datum of MSL2000. Data quality adjustments:\n", + "1. RCP6 ignored due to no post-2100 projections and fewer GCMS used for pre-2100 projections.\n", + "2. Sites with <3 GCMs for an pre-2100 years dropped.\n", + "3. Sites with any null values for post-2100 projections dropped.\"\"\"\n", + "DESCRIPTION = \"LocalizeSL-based relative sea level rise projections\"\n", + "\n", + "FS = GCSFileSystem(token=\"/opt/gcsfuse_tokens/rhg-data.json\")\n", + "PATH_SLR_N_GCMS = FS.get_mapper(PATH_SLR_N_GCMS.relative_to(\"/gcs\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gateway = Gateway()\n", + "cluster = gateway.new_cluster(\n", + " idle_timeout=3600,\n", + " profile=\"micro\",\n", + ")\n", + "client = cluster.get_client()\n", + "cluster.scale(140)\n", + "\n", + "upload_pkg(client, \"../../sliiders\")\n", + "cluster" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get lists of outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Corefile: SLRProjections190726core_SEJ_full -- 4192 site scenario files found, 2 global files found\n", + "Corefile: SLRProjections170113GRIDDEDcore -- 6288 site scenario files found, 3 global files found\n", + "Corefile: SLRProjections200204GRIDDEDcore_D20 -- 6288 site scenario files found, 3 global files found\n", + "Corefile: SLRProjections210628GRIDDEDcore_SROCC -- 6288 site scenario files found, 3 global files found\n", + "From all corefiles, 2096 baseline paths found\n" + ] + } + ], + "source": [ + "lsl_baseline_paths = dict()\n", + "\n", + "corefile_paths = dict()\n", + "for corefile in LOCALIZESL_COREFILES:\n", + " corefile_paths[corefile] = 
dict()\n", + " dir_tsv = DIR_SLR_INT / \"tmp\" / corefile / \"mc_tsv\"\n", + "\n", + " all_lsl_paths = list(dir_tsv.glob(\"LSL*.tsv\"))\n", + " all_gsl_paths = list(dir_tsv.glob(\"GSL*.tsv\"))\n", + "\n", + " corefile_paths[corefile][\"lsl_scenario_paths\"] = [\n", + " p for p in all_lsl_paths if \"_rcp60\" not in p.stem and \"_baseline\" not in p.stem\n", + " ]\n", + "\n", + " all_baseline_paths = [p for p in all_lsl_paths if \"_baseline\" in p.stem]\n", + " baseline_paths = {p.stem.split(\"_\")[-2]: p for p in all_baseline_paths}\n", + "\n", + " for site_id in baseline_paths:\n", + " lsl_baseline_paths[site_id] = baseline_paths[site_id]\n", + "\n", + " corefile_paths[corefile][\"lsl_baseline_paths\"] = [\n", + " p for p in all_lsl_paths if \"_baseline\" in p.stem\n", + " ]\n", + "\n", + " corefile_paths[corefile][\"gsl_paths\"] = [\n", + " p for p in all_gsl_paths if \"_rcp60\" not in p.stem and \"_baseline\" not in p.stem\n", + " ]\n", + "\n", + " print(\n", + " f\"Corefile: {corefile} -- {len(corefile_paths[corefile]['lsl_scenario_paths'])} site scenario \"\n", + " f\"files found, \"\n", + " f\"{len(corefile_paths[corefile]['gsl_paths'])} global files found\"\n", + " )\n", + "\n", + "lsl_baseline_paths = list(lsl_baseline_paths.values())\n", + "print(f\"From all corefiles, {len(lsl_baseline_paths)} baseline paths found\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define functions to load and process TSVs" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def combine_site_tsvs_into_dataframe(paths, is_baseline=False):\n", + " \"\"\"Open TSVs that are outputs of `WriteTableMC`, a function in the LocalizeSL repository\"\"\"\n", + "\n", + " dfs = []\n", + " for path in tqdm(paths):\n", + " df = pd.read_csv(\n", + " path,\n", + " sep=\"\\t\",\n", + " skiprows=2,\n", + " header=None,\n", + " names=[\"year\"] + [i for i in range(10000)],\n", + " )\n", + " rcp = 
path.stem.split(\"_\")[-1]\n", + " corefile = path.parent.parent.name\n", + " df.insert(0, \"scenario\", f\"{corefile}_{rcp}\")\n", + " df = df.dropna(subset=[0])\n", + " dfs.append(df)\n", + "\n", + " df_info = pd.read_csv(paths[0], nrows=0).columns[0]\n", + " coords = (df_info.split(\" \")[0]).split(\"_\")[1:]\n", + " # GSL\n", + " if coords == []:\n", + " lon = np.nan\n", + " lat = np.nan\n", + " # LSL\n", + " else:\n", + " lat = float(coords[0])\n", + " lon = float(coords[1])\n", + " site_id = path.stem.split(\"_\")[-2]\n", + "\n", + " out_df = pd.concat(dfs, ignore_index=True)\n", + "\n", + " out_df = out_df.sort_values([\"scenario\", \"year\"])\n", + " out_df.columns.name = \"mc_sample_id\"\n", + "\n", + " return out_df, site_id, lon, lat\n", + "\n", + "\n", + "def df_to_da(combined, site_id, lon, lat):\n", + "\n", + " # Move columns representing sample instances to possible values of a single column\n", + " flattened = combined.melt(id_vars=[\"scenario\", \"year\"], value_name=\"msl_msl00\")\n", + "\n", + " # Some type-casting\n", + " flattened[\"year\"] = flattened[\"year\"].astype(np.uint16)\n", + "\n", + " # Convert centimeters to meters\n", + " flattened[\"msl_msl00\"] = flattened[\"msl_msl00\"] / 100\n", + " flattened[\"msl_msl00\"] = flattened[\"msl_msl00\"].astype(np.float32)\n", + "\n", + " # Set index as dimensions of destination xarray Dataset\n", + " flattened = flattened.set_index([\"scenario\", \"year\", \"mc_sample_id\"])\n", + "\n", + " # Convert to DataArray\n", + " ds = flattened.to_xarray()\n", + "\n", + " # add in coords\n", + " ds = ds.expand_dims({\"site_id\": [site_id]})\n", + " ds.coords[\"lon\"] = (\"site_id\", [lon])\n", + " ds.coords[\"lat\"] = (\"site_id\", [lat])\n", + "\n", + " # make sure longitude is -180 to 180\n", + " ds[\"lon\"] = ds.lon.where(ds.lon <= 180, -360 + ds.lon)\n", + "\n", + " # some type casting to minimize size\n", + " ds[\"year\"] = ds.year.astype(np.uint16)\n", + " ds[\"mc_sample_id\"] = 
ds.mc_sample_id.astype(np.uint16)\n", + "\n", + " # convert to DataArray\n", + " da = ds.msl_msl00\n", + "\n", + " return da\n", + "\n", + "\n", + "def process_site(paths, is_baseline=False):\n", + " combined, site_id, lon, lat = combine_site_tsvs_into_dataframe(paths)\n", + " if is_baseline:\n", + " combined[\"scenario\"] = \"baseline\"\n", + "\n", + " da_out = df_to_da(\n", + " combined,\n", + " site_id,\n", + " lon,\n", + " lat,\n", + " )\n", + " if is_baseline:\n", + " return da_out.squeeze(\"scenario\").drop(\"scenario\")\n", + " return da_out\n", + "\n", + "\n", + "def get_groups_from_paths(paths):\n", + " # group jobs by site_id\n", + " site_ids = [p.stem.split(\"_\")[-2] for p in paths]\n", + " site_ser = pd.Series(paths, index=site_ids)\n", + " return site_ser.groupby(level=0).apply(list).to_list()\n", + "\n", + "\n", + "def process_all_sites(corefile):\n", + " lsl_groups = get_groups_from_paths(corefile_paths[corefile][\"lsl_scenario_paths\"])\n", + "\n", + " # submit jobs to return futures of dataarrays\n", + " gsl_fut = client.submit(process_site, corefile_paths[corefile][\"gsl_paths\"])\n", + " lsl_fut = client.map(process_site, lsl_groups)\n", + " return gsl_fut, lsl_fut" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Process all files" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SLRProjections190726core_SEJ_full\n", + "SLRProjections170113GRIDDEDcore\n", + "SLRProjections200204GRIDDEDcore_D20\n", + "SLRProjections210628GRIDDEDcore_SROCC\n" + ] + } + ], + "source": [ + "sl_arrs = []\n", + "for ix, corefile in enumerate(LOCALIZESL_COREFILES.keys()):\n", + " print(corefile)\n", + "\n", + " # process results\n", + " gsl_fut, lsl_fut = process_all_sites(corefile)\n", + "\n", + " # gather arrays\n", + " gsl_arr = rhgx.dataarrays_from_delayed([gsl_fut], client=client)[0].squeeze(\n", + " drop=True\n", + " )\n", + 
" lsl_arr = rhgx.dataarray_from_delayed(lsl_fut, dim=\"site_id\", client=client).astype(\n", + " np.float32\n", + " )\n", + "\n", + " # merge arrays\n", + " sl_arr = xr.Dataset(\n", + " {\n", + " \"lsl_msl00\": lsl_arr,\n", + " \"gsl_msl00\": gsl_arr,\n", + " }\n", + " ).persist()\n", + "\n", + " sl_arrs.append(sl_arr)\n", + "\n", + "baseline_groups = get_groups_from_paths(lsl_baseline_paths)\n", + "lsl_baseline_fut = client.map(process_site, baseline_groups, is_baseline=True)\n", + "\n", + "lsl_baseline_arr = rhgx.dataarray_from_delayed(\n", + " lsl_baseline_fut, dim=\"site_id\", client=client\n", + ").astype(np.float32)\n", + "\n", + "lsl_baseline_arr = xr.Dataset(\n", + " {\n", + " \"lsl_ncc_msl00\": lsl_baseline_arr,\n", + " }\n", + ").persist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sl_arr = xr.merge((xr.concat(sl_arrs, \"scenario\"), lsl_baseline_arr))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# update attrs\n", + "sl_arr.lsl_msl00.attrs.update(\n", + " {\n", + " \"long_name\": \"Estimated Future LMSL, MSL00\",\n", + " \"description\": (\n", + " \"Monte Carlo estimates of local mean sea level for a given RCP scenario \"\n", + " \"and year. Estimates are made on a sparse 2-degree coastal grid, and are \"\n", + " \"relative to MSL00 vertical datum.\"\n", + " ),\n", + " \"units\": \"m\",\n", + " }\n", + ")\n", + "\n", + "sl_arr.gsl_msl00.attrs.update(\n", + " {\n", + " \"long_name\": \"Estimated Future GMSL, MSL00\",\n", + " \"description\": (\n", + " \"Monte Carlo estimates of global mean sea level for a given RCP scenario \"\n", + " \"and year. 
Estimates are relative to MSL00 vertical datum.\"\n", + " ),\n", + " \"units\": \"m\",\n", + " }\n", + ")\n", + "\n", + "sl_arr.lsl_ncc_msl00.attrs.update(\n", + " {\n", + " \"long_name\": \"Counterfactual Future LMSL (no climate change), MSL00\",\n", + " \"description\": (\n", + " \"Monte Carlo estimates of local mean sea level in the no-climate change scenario \"\n", + " \"for each year. Estimates are made on a sparse 2-degree coastal grid, and are \"\n", + " \"relative to MSL00 vertical datum.\"\n", + " ),\n", + " \"units\": \"m\",\n", + " }\n", + ")\n", + "\n", + "sl_arr.attrs.update(\n", + " {\n", + " \"author\": AUTHOR,\n", + " \"contact\": CONTACT,\n", + " \"description\": DESCRIPTION,\n", + " \"method\": METHOD,\n", + " \"updated\": pd.Timestamp.now(tz=\"US/Pacific\").strftime(\"%c\"),\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean the outputs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Clean using the following criteria:\n", + "\n", + "- Ignore RCP6 when creating groups of trajectories binned by GMSL (this occurs later in the binning notebook). 
This is missing for all sites post-2100 (since no CMIP5 models were run for the extended timeline using RCP6 and has fewer GCMs used than other scenarios for the pre-2100 years (since RCP6 was not a prioritized scenario).\n", + "- Drop any sites that have <3 GCMs for any pre-2100 years.\n", + "- Ignoring RCP6 (which has missing values for all sites after 2100), drop any sites that have null values for any post-2100 years (no sites have missing values outside of RCP6 for pre-2100 years)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "local_metadata": { + "tags": [] + }, + "remote_metadata": {} + }, + "outputs": [], + "source": [ + "n_gcms = xr.open_zarr(PATH_SLR_N_GCMS, chunks=None).numGCMs.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# filter to only sites w/ >=3 gcms\n", + "good_sites = (\n", + " n_gcms.sel(year=slice(None, 2090)).min(dim=[\"year\", \"scenario\"]) >= 3\n", + ").values\n", + "\n", + "# filter to sites that have no missing values\n", + "good_sites = (\n", + " good_sites\n", + " & sl_arr.lsl_msl00.notnull().all(dim=[\"mc_sample_id\", \"scenario\", \"year\"]).values\n", + ")\n", + "\n", + "# execute filtering\n", + "with dask.config.set(**{\"array.slicing.split_large_chunks\": False}):\n", + " sl_arr = sl_arr.isel(site_id=good_sites).persist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# re-chunk\n", + "sl_arr = sl_arr.chunk(\n", + " {\"scenario\": -1, \"site_id\": 100, \"year\": -1, \"mc_sample_id\": 100}\n", + ").persist()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sl_arr.to_zarr(FS.get_mapper(PATH_SLIIDERS_SLR.relative_to(\"/gcs\")), mode=\"w\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.close(), client.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "nbdime-conflicts": { + "local_diff": [ + { + "key": "widgets", + "op": "add", + "value": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + } + ], + "remote_diff": [ + { + "key": "widgets", + "op": "add", + "value": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "0076aa049aaa4ca3903afa05d5dcf14a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": {} + }, + "02384fe74f774f47b037d5863b7070b0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_58c2f7e60e9f44f2856ae5f917718d2e", + "style": "IPY_MODEL_4fd8d719926648e384cf0302d416a20a", + "value": "

Dashboard: /services/dask-gateway/clusters/daskhub-dev.33349d5e586245c7a57e585ee1cc92a6/status

\n" + } + }, + "072710f5a5d24788beadfe83105a204a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_d427b85661fc464c91e900374b75212d", + "IPY_MODEL_f3c6a0285b4f4efd878d83e6f7c20aa8" + ], + "layout": "IPY_MODEL_f0b4ddf4acd34d6384fd885c57c6d5cc" + } + }, + "13555bcc815840b98f8f4f080b9c08da": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "1f0a472b1af54581a3181bf915e7e902": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "225d8927d3c649cf906e04e2149828f9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "2ca364e8531042f09f61623a058bd0c3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "width": "150px" + } + }, + "37616593ee214ce68284e2dedf4f74cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "4faa28ce442d4531bb90012eb58cd01f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntTextModel", + "state": { + "description": "Workers", + "layout": "IPY_MODEL_2ca364e8531042f09f61623a058bd0c3", + "step": 1, + "style": "IPY_MODEL_225d8927d3c649cf906e04e2149828f9" + } + }, + "4fd8d719926648e384cf0302d416a20a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "50a09203816b48a7bbe5a6b4f82447cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", 
+ "state": { + "layout": "IPY_MODEL_7cb3b025ed42447f90ff87f2010fb123", + "style": "IPY_MODEL_cdaa5671ba4d4f769291c596fc333fab", + "value": "

GatewayCluster

" + } + }, + "54ac6e8841c04dcfb48d1d3f97b17697": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "5548e78809424dd6b70acaf470126ffc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "556480de37274becb4f321d0210e50e4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "5790eb0bb5a54f86b731c9e433d5be4b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_50a09203816b48a7bbe5a6b4f82447cd", + "IPY_MODEL_072710f5a5d24788beadfe83105a204a", + "IPY_MODEL_dd9f0374151f4865b7ef19850ac556aa", + "IPY_MODEL_02384fe74f774f47b037d5863b7070b0" + ], + "layout": "IPY_MODEL_1f0a472b1af54581a3181bf915e7e902" + } + }, + "589e0bff592c48389395393cf8e6b434": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntTextModel", + "state": { + "description": "Minimum", + "layout": "IPY_MODEL_2ca364e8531042f09f61623a058bd0c3", + "step": 1, + "style": "IPY_MODEL_5548e78809424dd6b70acaf470126ffc" + } + }, + "58c2f7e60e9f44f2856ae5f917718d2e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "64dc4bde5b1243729f601307cb5551e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntTextModel", + "state": { + "description": "Maximum", + "layout": "IPY_MODEL_2ca364e8531042f09f61623a058bd0c3", + "step": 1, + "style": "IPY_MODEL_556480de37274becb4f321d0210e50e4" + } + }, + "7cb3b025ed42447f90ff87f2010fb123": { + "model_module": "@jupyter-widgets/base", 
+ "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "8f09161fb4784fff940eface7043fde1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "description": "Scale", + "layout": "IPY_MODEL_2ca364e8531042f09f61623a058bd0c3", + "style": "IPY_MODEL_de3bf99ec72b4844a92d67c3761abbb7" + } + }, + "99c63563f34a41d09d2d5d98a37e6db5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "min_width": "150px" + } + }, + "b8d10e468f144cfea362b385cf79cd7c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "description": "Adapt", + "layout": "IPY_MODEL_2ca364e8531042f09f61623a058bd0c3", + "style": "IPY_MODEL_0076aa049aaa4ca3903afa05d5dcf14a" + } + }, + "bf5b337abe18402bb7b5915277677be8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "c1a7fb7e3ba04c21a20719278bb006fb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "cdaa5671ba4d4f769291c596fc333fab": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "d1710681c40241cd9755dc88200f5149": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_4faa28ce442d4531bb90012eb58cd01f", + "IPY_MODEL_8f09161fb4784fff940eface7043fde1" + ], + "layout": "IPY_MODEL_bf5b337abe18402bb7b5915277677be8" + } + }, + "d427b85661fc464c91e900374b75212d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": 
"IPY_MODEL_99c63563f34a41d09d2d5d98a37e6db5", + "style": "IPY_MODEL_54ac6e8841c04dcfb48d1d3f97b17697", + "value": "\n
\n\n\n \n \n \n
Workers 140
Cores 140
Memory 910.00 GiB
\n
\n" + } + }, + "dd9f0374151f4865b7ef19850ac556aa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_c1a7fb7e3ba04c21a20719278bb006fb", + "style": "IPY_MODEL_37616593ee214ce68284e2dedf4f74cb", + "value": "

Name: daskhub-dev.33349d5e586245c7a57e585ee1cc92a6

" + } + }, + "de3bf99ec72b4844a92d67c3761abbb7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": {} + }, + "f0b4ddf4acd34d6384fd885c57c6d5cc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "f3c6a0285b4f4efd878d83e6f7c20aa8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "AccordionModel", + "state": { + "_titles": { + "0": "Manual Scaling", + "1": "Adaptive Scaling" + }, + "children": [ + "IPY_MODEL_d1710681c40241cd9755dc88200f5149", + "IPY_MODEL_fdc9fde437bd4309b55f0fc168dce3a6" + ], + "layout": "IPY_MODEL_f53e6c2926114df0892b2318f23d62ec", + "selected_index": null + } + }, + "f53e6c2926114df0892b2318f23d62ec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "min_width": "500px" + } + }, + "fdc9fde437bd4309b55f0fc168dce3a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_589e0bff592c48389395393cf8e6b434", + "IPY_MODEL_64dc4bde5b1243729f601307cb5551e7", + "IPY_MODEL_b8d10e468f144cfea362b385cf79cd7c" + ], + "layout": "IPY_MODEL_13555bcc815840b98f8f4f080b9c08da" + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + } + ] + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/create-SLIIDERS-SLR/retrieve-num-gcms.ipynb b/notebooks/create-SLIIDERS-SLR/retrieve-num-gcms.ipynb new file mode 100644 index 0000000..a7db300 --- /dev/null +++ b/notebooks/create-SLIIDERS-SLR/retrieve-num-gcms.ipynb @@ -0,0 +1,283 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d54c06e4-f962-40a2-9d51-5d53f613fc42", + 
"metadata": {}, + "source": [ + "# Retrieve `OceanDynN` information from LocalizeSL corefiles\n", + "`OceanDynN` represents the number of GCMs used to calculate LSL projections for each year-RCP-site\n", + "\n", + "This notebook saves this information in a `.zarr` in a format similar to the projections themselves" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06b96358-6f6c-4d18-993c-548d013f662f", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "from gcsfs import GCSFileSystem\n", + "from oct2py import octave\n", + "\n", + "from sliiders.settings import (\n", + " DIR_IFILES_INT,\n", + " DIR_SLR_INT,\n", + " LOCALIZESL_COREFILES,\n", + " PATH_SLR_N_GCMS,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b1d27162-8a9b-4752-9d88-050379c29191", + "metadata": {}, + "source": [ + "#### Define paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fb517fb-cf6c-4bb7-aeb6-575c9fe082be", + "metadata": {}, + "outputs": [], + "source": [ + "FS = GCSFileSystem(token=\"/opt/gcsfuse_tokens/rhg-data.json\")\n", + "PATH_SLR_N_GCMS = FS.get_mapper(PATH_SLR_N_GCMS.relative_to(\"/gcs\"))\n", + "\n", + "DIR_OCTAVE_OUTPUTS = DIR_SLR_INT / \"ngcm_localizeSL_outputs\"\n", + "DIR_OCTAVE_OUTPUTS.mkdir(exist_ok=True)\n", + "\n", + "DIR_MFILES = Path(\"../../LocalizeSL/MFILES\")" + ] + }, + { + "cell_type": "markdown", + "id": "c1ccca44-499b-410d-b205-23582d25a2ab", + "metadata": {}, + "source": [ + "### Define Octave function to save information about `OceanDyn` parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d9e7d86-c117-4583-bddd-7a855baacdba", + "metadata": {}, + "outputs": [], + "source": [ + "load_oceandyn_func = f\"\"\"\n", + "function oceandyn_info = save_oceandyn_info(corefile_name, subcorefile_choice)\n", + " dir_out = '{DIR_OCTAVE_OUTPUTS}';\n", + "\n", + " ifilesdir='{DIR_IFILES_INT}';\n", + " 
mfilesdir='{DIR_MFILES}';\n", + "\n", + " addpath(ifilesdir);\n", + " addpath(mfilesdir);\n", + "\n", + " f = [corefile_name '_v5.mat'];\n", + "\n", + " corefilewrapper=load(fullfile(ifilesdir, f));\n", + "\n", + " mkdir(dir_out);\n", + "\n", + " if strcmp(corefile_name, 'SLRProjections190726core_SEJ_full')\n", + " if strcmp(subcorefile_choice, \"H\")\n", + " corefile = corefilewrapper.corefileH;\n", + " else\n", + " corefile = corefilewrapper.corefileL;\n", + " end\n", + " else\n", + " corefile = corefilewrapper;\n", + " end\n", + "\n", + " disp([\"Corefile: \" corefile_name]);\n", + " disp([\"Corefile subgroup: \" subcorefile_choice]);\n", + "\n", + " siteids = int64(corefile.targregions);\n", + "\n", + " for i=1:length(corefile.scens)\n", + " scen = cell2mat(corefile.scens(i));\n", + " csvwrite(strcat(dir_out, '/OceanDynN_', corefile_name, '_', scen, '.csv'), corefile.OceanDynN(:,:,i));\n", + " csvwrite(strcat(dir_out, '/OceanDynYears_', corefile_name, '_', scen, '.csv'), corefile.OceanDynYears);\n", + " dlmwrite(strcat(dir_out, '/siteid_', corefile_name, '_', scen, '.csv'), siteids, 'precision', '%i')\n", + " end\n", + "end\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "d3b7abb1-a454-4768-b0e1-4633950cb6ea", + "metadata": {}, + "source": [ + "### Save OceanDyn metadata for each corefile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e22ac5a3-7b5c-44ad-ac2b-2ef5b7802ad9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Corefile: SLRProjections190726core_SEJ_full\n", + "Corefile subgroup: L\n", + "Corefile: SLRProjections190726core_SEJ_full\n", + "Corefile subgroup: H\n", + "Corefile: SLRProjections170113GRIDDEDcore\n", + "warning: implicit conversion from numeric to char\n", + "Corefile subgroup: \u0000\n", + "Corefile: SLRProjections200204GRIDDEDcore_D20\n", + "warning: implicit conversion from numeric to char\n", + "Corefile subgroup: \u0000\n", + "Corefile: 
SLRProjections210628GRIDDEDcore_SROCC\n", + "warning: implicit conversion from numeric to char\n", + "Corefile subgroup: \u0000\n" + ] + } + ], + "source": [ + "octave.eval(load_oceandyn_func)\n", + "\n", + "for corefile_name, subcorefiles in LOCALIZESL_COREFILES.items():\n", + " for subcorefile_choice in subcorefiles:\n", + " octave.save_oceandyn_info(corefile_name, subcorefile_choice)" + ] + }, + { + "cell_type": "markdown", + "id": "78ddc154-c4e0-439a-849b-788ec59d15ea", + "metadata": {}, + "source": [ + "### Load CSV outputs into DataFrames" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83c33314-69b6-4e50-80fe-bd9462b64aae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SLRProjections190726core_SEJ_full ['2p0degree+L', 'rcp85+H']\n", + "SLRProjections170113GRIDDEDcore ['rcp26', 'rcp45', 'rcp60', 'rcp85']\n", + "SLRProjections200204GRIDDEDcore_D20 ['rcp26', 'rcp45', 'rcp60', 'rcp85']\n", + "SLRProjections210628GRIDDEDcore_SROCC ['rcp26', 'rcp45', 'rcp60', 'rcp85']\n" + ] + } + ], + "source": [ + "rcps = [\"2p0degree+L\", \"rcp85+H\", \"rcp26\", \"rcp45\", \"rcp60\", \"rcp85\"]\n", + "\n", + "dfs = []\n", + "for corefile in LOCALIZESL_COREFILES.keys():\n", + "\n", + " rcps = [\"rcp26\", \"rcp45\", \"rcp60\", \"rcp85\"]\n", + " if corefile == \"SLRProjections190726core_SEJ_full\":\n", + " rcps = [\"2p0degree+L\", \"rcp85+H\"]\n", + "\n", + " print(corefile, rcps)\n", + "\n", + " for rcp in rcps:\n", + " scenario = f\"{corefile}_{rcp}\"\n", + " sites = pd.read_csv(DIR_OCTAVE_OUTPUTS / f\"siteid_{scenario}.csv\", header=None)\n", + " sites[\"name\"] = sites[0].astype(int).astype(str)\n", + " sites = sites.drop(columns=[0])\n", + "\n", + " years = pd.read_csv(\n", + " DIR_OCTAVE_OUTPUTS / f\"OceanDynYears_{scenario}.csv\", header=None\n", + " )\n", + "\n", + " years = years.T.rename(columns={0: \"year\"})\n", + "\n", + " df = pd.read_csv(\n", + " DIR_OCTAVE_OUTPUTS / 
f\"OceanDynN_{scenario}.csv\",\n", + " header=None,\n", + " names=sites[\"name\"].values,\n", + " )\n", + " df = df.join(years).set_index(\"year\", drop=True)\n", + " df = df[\n", + " [c for c in df.columns if int(c) > 100000000]\n", + " ] # These high-valued sites are the gridded ones\n", + " df = df.loc[2000:]\n", + " df[\"scenario\"] = scenario\n", + " df = df.set_index(\"scenario\", append=True)\n", + "\n", + " df = df.stack()\n", + " df.index = df.index.set_names(\"scenario\", level=1)\n", + " df.index = df.index.set_names(\"site_id\", level=2)\n", + "\n", + " dfs.append(df)" + ] + }, + { + "cell_type": "markdown", + "id": "152a6a4b-5343-4c2a-881c-ca22e6716041", + "metadata": {}, + "source": [ + "## Merge DataFrames into xarray and save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65915940-695d-4623-b58d-89862d38a2bb", + "metadata": {}, + "outputs": [], + "source": [ + "pd.concat(dfs).to_xarray().to_dataset(name=\"numGCMs\").to_zarr(PATH_SLR_N_GCMS)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e2c614b-5a22-4b46-8f5a-59a59f81f5e1", + "metadata": {}, + "outputs": [], + "source": [ + "FS.rm(str(DIR_OCTAVE_OUTPUTS.relative_to(\"/gcs\")), recursive=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..13129ff --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[build-system] +requires = [ + "setuptools>=42", + 
"wheel" +] +build-backend = "setuptools.build_meta" + +[tool.isort] +profile = "black" \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..464f498 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,18 @@ +[metadata] +name = sliiders +description = A global coastal dataset of physical and socioeconomic metrics organized by coastal segment and elevation slice for use in global coastal risk research +long_description = file: README.md +long_description_content_type = text/markdown +author = Daniel Allen, Ian Bolliger, Nicholas Depsky, Junho Choi +author_email = ian.bolliger@blackrock.com +version = v1.0.0 +url = https://github.com/ClimateImpactLab/sliiders +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: MIT License + Operating System :: OS Independent + +[options] +packages = find: +include_package_data = True +python_requires = >=3.6 diff --git a/sliiders/__init__.py b/sliiders/__init__.py new file mode 100644 index 0000000..3b5eb16 --- /dev/null +++ b/sliiders/__init__.py @@ -0,0 +1,2 @@ +from . import country_level_ypk, gcs, settings, spatial +from .settings import * diff --git a/sliiders/country_level_ypk.py b/sliiders/country_level_ypk.py new file mode 100644 index 0000000..3d0503a --- /dev/null +++ b/sliiders/country_level_ypk.py @@ -0,0 +1,1279 @@ +# various functions used for the country-level information workflow +from itertools import product as lstprod + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from scipy.optimize import minimize as opt_min +from tqdm.auto import tqdm + +from .settings import PATH_PWT_RAW, PPP_CCODE_IF_MSNG, SCENARIOS, SSP_PROJ_ORG_SER + + +def log_lin_interpolate(df, header="v_"): + """Simple log-linear interpolation, to fit the horizontal (or wide-panel format) + dataset that we use. 
+ + Parameters + ---------- + df : pandas DataFrame + contains data that we may interpolate + header : str + header of the variable names; should be followed by year (e.g., "v_1950") + + Returns + ------- + df_rtn : pandas DataFrame + DataFrame containing interpolated data + + """ + + v_ = np.sort([x for x in df.columns if header in x]) + yrs = [int(x.replace(header, "")) for x in v_] + all_yrs = range(min(yrs), max(yrs) + 1) + all_v = np.sort([header + str(x) for x in all_yrs]) + front_v = [x for x in df.columns if header not in x] + + df_missing_v = np.setdiff1d(all_v, v_) + df_rtn = df.copy() + if len(df_missing_v) > 0: + df_rtn[df_missing_v] = np.nan + + ## re-ordering the columns, just in case + df_rtn = df_rtn[np.hstack([front_v, all_v])] + + for i in df_rtn.index: + fp = df_rtn.loc[i, :][all_v] + ## in case there is any nonpositive values or no missing values, + ## cannot log-linearly interpolate (former) and no need to interpolate (latter) + if (fp <= 0).any() or (not fp.isnull().any()): + continue + + where_nm = np.where(~pd.isnull(fp.values))[0] + fp, i_yrs = np.log(fp[where_nm].astype("float64")), np.array(all_yrs)[where_nm] + + ## we only want to INTERpolate with this function, and not EXTRApolate + want_interp_range = range(i_yrs.min(), i_yrs.max() + 1) + case = np.exp(np.interp(want_interp_range, i_yrs, fp)) + want_interp_v = ["v_" + str(x) for x in want_interp_range] + df_rtn.loc[i, want_interp_v] = case + + return df_rtn + + +def ssp_and_model_simplify(ssp_col, model_col, df, dic=SSP_PROJ_ORG_SER): + """Simplifying the SSP and model (IAM) designations. For instance, "SSP2" has many + corresponding versions of the same scenario including SSP2_v9_130219, + SSP2_v9_130325, and so forth. This function simplifies those different names. 
def yearly_growth(df, header="v_"):
    """Convert a wide-panel DataFrame of annual levels into annual growth rates.

    Growth is measured as the difference in natural logs between a year and the
    year listed immediately before it. The earliest year has no predecessor, so
    its growth rate is set to 0.

    Parameters
    ----------
    df : pandas DataFrame
        DataFrame containing annual data, where for some 4-digit year `y`, the variable
        names are as follows: `{header}{y}`; should be in wide-panel format (i.e.,
        one row for each country)
    header : str
        header for the annual variable names

    Returns
    -------
    rtn_df : pandas DataFrame
        DataFrame containing annual growth rates; non-annual columns come first,
        followed by the annual columns sorted by year

    """

    annual_yrs = sorted(int(col[-4:]) for col in df.columns if header in col)
    annual_cols = [header + str(yr) for yr in annual_yrs]
    other_cols = [col for col in df.columns if header not in col]

    rtn_df = df[other_cols + annual_cols].copy()
    if not annual_cols:
        return rtn_df

    ## no prior year to compare against, so the first growth rate is 0
    rtn_df[annual_cols[0]] = 0

    ## log-difference of each year against the one right before it
    for prev_col, cur_col in zip(annual_cols, annual_cols[1:]):
        rtn_df[cur_col] = np.log(df[cur_col]) - np.log(df[prev_col])

    return rtn_df
should be applied at + hdr : str + header for the annual variables in `tgt_df` + + Returns + ------- + extrapolated : numpy array + containing extrapolated information, using similar-trajectory countries + + """ + + if prob_ctry is None: + prob_ctry = tgt_df.index.unique()[0] + + if after: + v_s = [hdr + str(x) for x in range(avail_yr, end_yr + 1)] + else: + v_s = [hdr + str(x) for x in range(end_yr, avail_yr + 1)] + + gr_df_base_avail = sse_good_df[v_s + ["sse", "sse_rank"]].copy() + avail_v = hdr + str(avail_yr) + gr_df_base_avail[v_s] = gr_df_base_avail[v_s].div(gr_df_base_avail[avail_v], axis=0) + + ## if there's a PERFERCTLY matching set of growth rates, then just take that + ## country (or those countries') growth rates + if (gr_df_base_avail.sse == 0).any(): + idx = gr_df_base_avail.loc[gr_df_base_avail.sse == 0, :].index.unique() + if len(idx) == 1: + growth_rates = gr_df_base_avail.loc[idx[0], v_s] + else: + growth_rates = gr_df_base_avail.loc[idx, v_s].values.mean(axis=0) + else: + gr_df_base_avail["wgt_vals"] = (1 / gr_df_base_avail["sse"]).values ** wgt_power + denom_values = np.sum(gr_df_base_avail["wgt_vals"].values) + growth_rates = ( + np.sum( + gr_df_base_avail[v_s].mul(gr_df_base_avail["wgt_vals"], axis=0), axis=0 + ) + / denom_values + ) + + avail_val = tgt_df.loc[prob_ctry, avail_v] + extrapolated = np.array(growth_rates) * avail_val + + if after: + extrapolated = extrapolated[1:] + else: + extrapolated = extrapolated[0:-1] + + return extrapolated + + +def extrap_using_closest( + prob_lst, + orig_df, + n_det=5, + wgt_power=1, + begin_end=[1950, 2019], + exclude_these=["MAF", "WLF", "ESH"], + merge_orig=True, + header="v_", + fill_name="msng_fill", + ctry_col="ccode", +): + """Uses the "closest" countries (in terms of existing data's trajectory with + respect to a given year, with the metric for determining "closeness" as the + sum of squared errors [SSE]) whose data are non-missing to figure out the trajectory + of "problematic" (i.e., with 
missing data) countries. + + Parameters + ---------- + prob_lst : array-like + List of countries whose data are partially missing + n_det : int + Number of "similar countries" to use + wgt_power : float + Whether higher weights should be given to those that are "closer" or not + (higher positive number --> greater weights) + begin_end : array-like of int + The earliest and the last year that need extrapolation + exclude_these : list of str + list of countries to exclude for using as "closest" countries, or to extrapolate + in general; for instance, if a country has only one year's worth of + data (like MAF, WLF, ESH's GDP values) then it would be a good reason to + exclude these countries. + merge_orig : boolean + whether the information from non-problematic countries should be merged + when returning the data back + header : str + for the variables (e.g., "v_" for "v_1950" indicating 1950 values) + fill_name : str + column name for the missing value "fill" information (which years were filled, + using which countries) + ctry_col : str + column name for the country-code variable, default being "ccode" + + Returns + ------- + df_rtn : pandas DataFrame + DataFrame containing extrapolated and existing information countries with + missing values. 
If merge_orig is True, then it would also contain the countries + without any extrapolated (i.e., the "non-problematic") + + """ + ## indicing for operations below + ctry_msg = "Needs have the country-code column / index `{}` in the dataset" + ctry_msg = ctry_msg.format(ctry_col) + assert (ctry_col in orig_df.index.names) or (ctry_col in orig_df.columns), ctry_msg + + if ctry_col not in orig_df.index.names and ctry_col in orig_df.columns: + df_idxed = orig_df.set_index([ctry_col]) + else: + df_idxed = pd.DataFrame(orig_df) + + ## sorting the problematic (with missing-value) countries for consistency + prob, exclude_these = list(np.sort(prob_lst)), list(exclude_these) + + ## variable names and getting the dataframe of "good-to-go" country codes + v_ = np.sort( + [ + x + for x in orig_df.columns + if (header in x) + and (int(x[-4:]) <= begin_end[1]) + and (int(x[-4:]) >= begin_end[0]) + ] + ) + + ## good_ctries are only those that are absolutely filled + ## excluding those that should be excluded + good_ctries = df_idxed[(~df_idxed[v_].isnull().any(axis=1))].index.unique() + good_ctries = np.setdiff1d(good_ctries, prob + exclude_these) + good_df = df_idxed.loc[good_ctries, :] + good_gr = yearly_growth(good_df, header) + + ## running for each of the problematic countries + df_collection = [] + for i in tqdm(prob): + ## there could be missing values in between known yrs, so interpolate + tgt_df = df_idxed.loc[[i], :].copy() + row_vals = tgt_df.loc[i, v_].copy() + row_vals = np.where(row_vals < 0, np.nan, row_vals) + valid_where = np.where(~pd.isnull(row_vals))[0] + mn_valid_loc, mx_valid_loc = min(valid_where), max(valid_where) + v_valid = v_[mn_valid_loc : mx_valid_loc + 1] + if len(valid_where) != (mx_valid_loc + 1 - mn_valid_loc): + log_interp_vals = np.interp( + range(mn_valid_loc, mx_valid_loc + 1), + valid_where, + np.log( + tgt_df.loc[i, np.array(v_)[valid_where]].values.astype("float64") + ), + ) + tgt_df[v_valid] = np.exp(log_interp_vals) + row_valid = 
tgt_df.loc[i, v_valid] + + ## as yearly growth rates, with missing values filled as 0 + tgt_gr = yearly_growth(tgt_df).fillna(0) + + ## detecting which is the valid (or non-missing) growth rates + gr_row_valid = tgt_gr.loc[i, v_valid].values + + ## subtract the problematic growth rates from good-to-go growth rates, + ## and calculate the sum of squared errors to detect which is the closest + sse_df = good_gr.copy() + sse_df["sse"] = (sse_df[v_valid].sub(gr_row_valid, axis=1) ** 2).sum(axis=1) + sse_df.sort_values(["sse"], inplace=True) + sse_df["sse_rank"] = range(0, sse_df.shape[0]) + + ## top n closest in terms of trajectory + necess_sse_df = sse_df[sse_df.sse_rank < n_det][["sse", "sse_rank"]] + necess_sse_df = necess_sse_df.merge( + good_df[v_], + how="left", + left_index=True, + right_index=True, + ) + + ## if need to project backwards in time + rtn_row, past_fill, fut_fill = np.array(row_valid), None, None + if v_valid[0] != v_[0]: + avail_yr, earl_yr = int(v_valid[0][-4:]), begin_end[0] + past_vals = helper_extrap_using_closest( + i, + False, + avail_yr, + earl_yr, + tgt_df, + necess_sse_df, + wgt_power, + hdr=header, + ) + rtn_row = np.hstack([past_vals, rtn_row]) + past_fill = "{}-{}".format(earl_yr, avail_yr - 1) + + ## if need to project forward in time + if v_valid[-1] != v_[-1]: + avail_yr, late_yr = int(v_valid[-1][-4:]), begin_end[-1] + fut_vals = helper_extrap_using_closest( + i, + True, + avail_yr, + late_yr, + tgt_df, + necess_sse_df, + wgt_power, + hdr=header, + ) + rtn_row = np.hstack([rtn_row, fut_vals]) + fut_fill = "{}-{}".format(avail_yr + 1, late_yr) + + ## extrapolation information as "fill_info" + used_ccodes, fill_info = ",".join(list(necess_sse_df.index.unique())), "-" + if (past_fill is not None) and (fut_fill is not None): + fill_info = past_fill + "," + fut_fill + ":" + used_ccodes + elif past_fill is not None: + fill_info = past_fill + ":" + used_ccodes + elif fut_fill is not None: + fill_info = fut_fill + ":" + used_ccodes + + 
def organize_hor_to_ver(
    df,
    main_cat,
    sub_cats,
    new_vname,
    hdr="v_",
    yrs=range(1950, 2020),
    timename="year",
):
    """Reshape wide-format ("horizontal") panel data into long format ("vertical").

    Thin wrapper around `pandas.wide_to_long` that also renames the value column
    and sets a sorted (`main_cat`, `timename`, *`sub_cats`) index.

    Note: every row of `df` is assumed to hold a unique combination of the
    categories `[main_cat] + sub_cats`; for instance, if they are
    ["ccode", "ssp", "iam"], there should be at most one row for each
    countrycode-SSP-IAM combination.

    Parameters
    ----------
    df : pandas DataFrame
        dataframe containing information, that is in a "wide-format"; the category
        variables may be either columns or (named) index levels
    main_cat : str
        name of the main category we want organize by (e.g., "ccode" for
        country-codes)
    sub_cats : array-like of str or None
        names of additional categories we want to organize by (e.g.,
        ["ssp", "iam"]); None is understood as an empty list
    new_vname : str
        name to give to the value column in the long-format output
    hdr : str
        current "header" of the columns in wide-format (e.g., "v_" would
        mean that v_1950, v_1951, v_1952,... are the column names)
    yrs : array-like of int
        years to consider; `hdr`-columns for other years are dropped
    timename : str
        name of the index level that holds the year

    Returns
    -------
    long_df : pandas DataFrame
        containing the data in long-panel (or vertical) format

    """

    sub_cats = [] if sub_cats is None else list(sub_cats)
    cats = [main_cat] + sub_cats
    reorder = [main_cat, timename] + sub_cats

    ## resetting the index to be compliant with `pandas.wide_to_long`; an unnamed
    ## index carries no category information, so it is discarded rather than kept
    ## as a spurious "index" column (`df.index.names` is never None, so it has to
    ## be checked element-wise)
    if all(name is None for name in df.index.names):
        df_reind = df.reset_index(drop=True)
    else:
        df_reind = df.reset_index()

    ## keep all non-year columns, plus only those year columns requested in `yrs`
    v_s = np.intersect1d([hdr + str(x) for x in yrs], df_reind.columns)
    keep = [x for x in df_reind.columns if hdr not in x] + list(v_s)
    df_reind = df_reind[keep]

    long_df = pd.wide_to_long(df_reind, hdr, cats, timename).reset_index()
    long_df.set_index(reorder, inplace=True)
    long_df.sort_index(axis=0, inplace=True)
    long_df.rename(columns={hdr: new_vname}, inplace=True)

    return long_df
Mainly works as a wrapper for pandas.pivot + but repurposed for our purposes (including re-naming the columns) + + Parameters + ---------- + df : pandas DataFrame + dataframe containing information + varname : str + column name of the variable that we want the information about + timename : str + column name of the variable that indicates time or years + ccodename : str + column name of the variable indicating country-codes + total_yrs : array-like + range of the years that we want information about + impose_total : boolean + if True, all years in the `total_yrs` array are represented (even if missing + entirely from the dataset); if False, then only necessary columns are reported + (with at least some non-missing values) + + Returns + ------- + df_rtn : pandas.DataFrame + :py:class:`pandas.DataFrame` containing information specifically about the + variable indicated by "varname", in a wide-panel format. + + """ + + ## necessary to reset the index to pass to pandas.pivot + df_rtn = df.reset_index() + names = np.array([varname, timename, ccodename]) + assert len(np.setdiff1d(names, df_rtn.columns)) == 0, "necessary columns missing." + + df_rtn.sort_values([ccodename, timename], inplace=True) + df_rtn = df_rtn.pivot(index=[ccodename], columns=timename, values=varname) + df_rtn.columns.name = None + df_rtn.columns = ["v_" + str(x) for x in df_rtn.columns] + total_v = ["v_" + str(x) for x in total_yrs] + + df_rtn = df_rtn[[v for v in total_v if v in df_rtn.columns]] + if impose_total: + leftovers = np.setdiff1d(total_v, df_rtn.columns) + df_rtn[leftovers] = np.nan + df_rtn = df_rtn[total_v] + + return df_rtn + + +def ppp_conversion_specific_year( + yr, + to=True, + extrap_sim=True, + fill_msng_ctries=PPP_CCODE_IF_MSNG, + pwtvar="pl_gdpo", +): + """Given a specified year (`yr`), creates a table of PPP conversion factors either + to that year (to=True) or from that year (to=False). 
The range of years to + convert from or to that year is fixed to 1950-2019, which is all the available + years from Penn World Tables. We can specify the `pwtvar` variable to change + whether we would like to use a different price level variable (e.g., `pl_n` for + capital, `pl_gdpo` for output-side GDP). + + Parameters + ---------- + yr : int + specific year that we will calculate PPP conversion rates to or from + to : boolean + boolean for indicating if the target year is the year that one should calculate + the years from (`to`=False) or to (`to`=True). e.g., if yr=2019 and to=True, + this function will calculate the conversion rates from 2019 PPP to PPP of any + year between 1950 and 2019 (but NOT change the base dollar terms) + extrap_sim : boolean + boolean for whether to extrapolate or not, for countries having partial + information (i.e., not all conversion rates for 1950-2019). + fill_msng_ctries : None or dict + indicates if we should fill in for those countries that are either entirely + missing from both WDI and PWT datasets or has too much unreliable / missing + data + pwtvar : str + the name of the price level variable to calculate PPP conversion rates from + for PWT. + + Outputs + ------- + pl_ver : pandas DataFrame + containing countrycode, year, and conversion rates (PPP); information organized + in a vertical (long-panel) format, with extrapolation done for the specified + variable when there are missing variables if `extrap_sim` is equal to True. 
+ + """ + + print("Fetching information from PWT...") + ## reading in the necessary PWT dataframe + pwt = ( + pd.read_excel(PATH_PWT_RAW) + .rename(columns={"countrycode": "ccode"}) + .set_index(["ccode", "year"]) + ) + pwt_years = pwt.index.get_level_values("year").unique() + yr_range = range(pwt_years.min(), pwt_years.max() + 1) + + v_ = ["v_" + str(v) for v in yr_range] + pl = organize_ver_to_hor(pwt, pwtvar, "year", "ccode", yr_range) + pl_ccode = pl.index.get_level_values("ccode").unique() + + ## replace with pl_gdpo information if specific pl values for a country are + ## missing entirely + if pwtvar != "pl_gdpo": + pl_gdpo = organize_ver_to_hor(pwt, "pl_gdpo", "year", "ccode", yr_range) + for c in pl_ccode: + row = pl.loc[c, v_].values + if sum(pd.isnull(row)) == len(row): + pl.loc[c, v_] = pl_gdpo.loc[c, v_].values + + if extrap_sim: + prob = ( + pl.loc[pl[v_].isnull().any(axis=1), :] + .index.get_level_values("ccode") + .unique() + ) + pl = extrap_using_closest(prob, pl, exclude_these=[]) + + pl_ver = organize_hor_to_ver(pl, "ccode", None, pwtvar, yrs=yr_range) + fill_name = "{}_fill".format(pwtvar) + pl_ver.rename(columns={"msng_fill": fill_name}, inplace=True) + pl_ver[fill_name] = [v.split(":")[-1] for v in pl_ver[fill_name]] + + ## making sure that the fill-information is "-" if information was not missing + pl_ver = pl_ver.merge( + pwt[[pwtvar]].rename(columns={pwtvar: "temp"}), + left_index=True, + right_index=True, + how="left", + ) + pl_ver.loc[~pd.isnull(pl_ver["temp"]), fill_name] = "-" + pl_ver.drop(["temp"], axis=1, inplace=True) + + else: + pl_ver = organize_hor_to_ver(pl, "ccode", None, pwtvar, yrs=yr_range) + + ## taking care of the case of Bermuda, since it is sometimes suffering + ## from negative price levels + if (pwtvar == "pl_gdpo") and ("BMU" in pl_ccode): + pl_ccode = np.setdiff1d(pl_ccode, ["BMU"]) + bmu_copy = pl_ver.loc[("GBR", slice(None)), :].reset_index().copy() + bmu_copy[fill_name] = "copy_from_GBR" + bmu_copy["ccode"] = 
"BMU" + bmu_copy.set_index(["ccode", "year"], inplace=True) + pl_ver = pd.concat([pl_ver.loc[(pl_ccode, slice(None)), :], bmu_copy], axis=0) + + ## merging the "base" price level, which is that of the US + pwt_ppp = pl_ver.merge( + ( + pl_ver.loc[("USA", slice(None)), [pwtvar]] + .reset_index() + .drop(["ccode"], axis=1) + .set_index(["year"]) + .rename(columns={pwtvar: "base"}) + ), + left_index=True, + right_index=True, + how="left", + ) + + ## note that according to Feenstra et al. (2015), PPP / XR = pl / pl_base + ## with the "base" again being the United States; `ppp` below is PPP / XR + pwt_ppp["ppp"] = pwt_ppp[pwtvar] / pwt_ppp["base"] + + ## multiplying `ppp` can be understood as turning PPP-adjusted value of a certain + ## year to nominal value; turning base-year-a PPP values to base-year-b PPP values + ## therefore requires multiplying `ppp`(a) / `ppp`(b) + tgtyr_ppp = f"ppp_{yr}" + pwt_ppp = pwt_ppp.merge( + ( + pwt_ppp.loc[(slice(None), yr), ["ppp"]] + .rename(columns={"ppp": tgtyr_ppp}) + .reset_index() + .drop(["year"], axis=1) + .set_index(["ccode"]) + ), + left_index=True, + right_index=True, + how="left", + ) + + ## conversion rates + if to: + pwt_ppp["conv"] = pwt_ppp["ppp"] / pwt_ppp[tgtyr_ppp] + else: + pwt_ppp["conv"] = pwt_ppp[tgtyr_ppp] / pwt_ppp["ppp"] + pwt_ppp.drop([tgtyr_ppp], axis=1, inplace=True) + + to_keep = [] + for i in pwt_ppp.index.get_level_values("ccode").unique(): + i_case = pwt_ppp.loc[i, "conv"].isnull().all() + if not i_case: + to_keep.append(i) + + pwt_ppp = pwt_ppp.loc[(to_keep, slice(None)), :].sort_index() + + ## filling in the missing countries with known values + if fill_msng_ctries is not None: + print("Filling in the missing countries...") + pwt_ppp["conv_fill"] = "refer_to_other_cols" + + for key, replace_these in fill_msng_ctries.items(): + conv_fill_key = "copy_from_{}".format(key) + no_replace_these_ccodes = np.setdiff1d( + pwt_ppp.index.get_level_values("ccode").unique(), replace_these + ) + pwt_ppp = 
pwt_ppp.loc[(no_replace_these_ccodes, slice(None)), :] + + copies = [pwt_ppp] + for rep_ctry in replace_these: + ctry_copied = pwt_ppp.loc[(key, slice(None)), :].copy().reset_index() + ctry_copied["ccode"] = rep_ctry + ctry_copied["conv_fill"] = conv_fill_key + ctry_copied.set_index(["ccode", "year"], inplace=True) + copies.append(ctry_copied) + pwt_ppp = pd.concat(copies, axis=0) + + pwt_ppp = pwt_ppp[["conv", "conv_fill", fill_name]].copy() + pwt_ppp.sort_index(inplace=True) + print("...done") + + return pwt_ppp + + +def smooth_fill( + da1_in, + da2_in, + fill_all_null=True, + time_dim="time", + other_dim="storm", +): + """Fill values from 2D dataarray `da1_in` with values from 2D dataarray + `da2_in`. + + For instance, one may use this with storm datasets. If filling the beginning or end + of a storm, pin the `da2_in` value to the `da1_in` value at the first/last point of + overlap and then use the `da2_in` values only to estimate the "change" in values + over time, using a ratio of predicted value in the desired time to the reference + time. This can also be used when, for example, `da1_in` refers to RMW and `da2_in` + refers to ROCI. In this case, you want to define ``fill_all_null=False`` to avoid + filling RMW with ROCI when no RMW values are available but some ROCI values are + available. + + Parameters + ---------- + da1_in, da2_in : xarray.DataArray + DataArrays indexed by other dimension (defined by `other_dim`) and time + dimension (defined by `time_dim`) + fill_all_null : bool, optional + If True, fills even when there are no known (or non-NA) values in `da1_in` + time_dim : str, optional + variable name to indicate the time dimension, default set to be "time" + other_dim : str, optional + variable name to indicate the other dimension, default set to be "storm" but + can also indicate country or region names, for instance + + Returns + ------- + :class:`xarray.DataArray` + Same as ``da1`` but with NaN's filled by the described algorithm. 
+ + Raises + ------ + AssertionError : + If there are "interior" NaN's in either dataset, i.e. if any storm has a NaN + after the first non-NaN but before the last non-NaN. These should have + previously been interpolated. + + Examples + -------- + >>> import xarray as xr + >>> da1 = xr.DataArray( + ... np.array( + ... [ + ... [np.nan, 1, 2, 3], + ... [np.nan, np.nan, 4, 5], + ... [6, 7, np.nan, np.nan], + ... [8, 9, 10, np.nan], + ... [11, 12, 13, 14], + ... [np.nan, np.nan, np.nan, np.nan], + ... ] + ... ), + ... coords = {"storm": range(6), "time": range(4)}, + ... dims = ["storm", "time"] + ... ) + >>> da2 = xr.DataArray( + ... np.array( + ... [ + ... [15, 16, 17, 18], + ... [19, 20, 21, 22], + ... [23, 24, 25, 26], + ... [27, 28, 29, 30], + ... [31, 32, 33, 34], + ... [35, 36, 37, 38], + ... ] + ... ), + ... coords = {"storm": range(6), "time": range(4)}, + ... dims = ["storm", "time"] + ... ) + >>> smooth_fill(da1, da2) + + array([[ 0.9375 , 1. , 2. , 3. ], + [ 3.61904762, 3.80952381, 4. , 5. ], + [ 6. , 7. , 7.29166667, 7.58333333], + [ 8. , 9. , 10. , 10.34482759], + [11. , 12. , 13. , 14. ], + [35. , 36. , 37. , 38. 
]]) + Coordinates: + * storm (storm) int64 0 1 2 3 4 5 + * time (time) int64 0 1 2 3 + """ + + da1 = da1_in.copy() + da2 = da2_in.copy() + either_non_null = da1.notnull() | da2.notnull() + + da1 = da1.interpolate_na(dim=time_dim, use_coordinate=True) + da2 = da2.interpolate_na(dim=time_dim, use_coordinate=True) + for da in [da1, da2]: + assert da.interpolate_na(dim=time_dim).notnull().sum() == da.notnull().sum() + + adjust = da1.reindex({other_dim: da2[other_dim]}) + first_valid_index = (adjust.notnull() & da2.notnull()).argmax(dim=time_dim) + last_valid_index = ( + adjust.bfill(time_dim).isnull() | da2.bfill(time_dim).isnull() + ).argmax(dim=time_dim) - 1 + + all_null = adjust.isnull().all(dim=time_dim) + if not fill_all_null: + all_null *= False + + est_to_obs_rat_first = adjust.isel({time_dim: first_valid_index}) / da2.isel( + {time_dim: first_valid_index} + ) + + est_val = da2.where( + all_null | adjust.ffill(time_dim).notnull(), + da2 * est_to_obs_rat_first, + ) + + est_to_obs_rat_last = adjust.isel({time_dim: last_valid_index}) / da2.isel( + {time_dim: last_valid_index} + ) + + est_val = est_val.where( + all_null | adjust.bfill(time_dim).notnull(), + da2 * est_to_obs_rat_last, + ) + + # fill storms with da1 vals using the full da2 time series. For storms with some da1 + # vals, fill the tails using da2 scaled so that it matches at the first and last + # points seen in both da1 and da2 + out = da1.fillna(est_val) + + # make sure we didn't add vals + return out.where(either_non_null) + + +def minimize_simple_production(x, K_values, Y_values): + """Helper function for getting at the `A` (TFP) and `alpha` (GDP elasticity of + capital). Returns the sum of squared errors with respect to actual GDP values. 
+ + Parameters + ---------- + x : array-like of floats + divided into `A` (TFP) and `alpha` (GDP elasticity of capital) + K_values : array-like of floats + containing historical capital values (in the case of only-capital production + functional form) or historical per-capita capital values (in the case of Cobb- + Douglas functional form) + Y_values : array-like of floats + containing historical GDP values (in the case of only-capital production + functional form) or historical per-capita GDP values (in the case of Cobb- + Douglas functional form) + + Returns + ------- + sse : float + Sum of squared errors from netting the actual GDP values from estimated + GDP values assuming a functional form + + """ + A, alpha = x + + diff = A * (K_values**alpha) - Y_values + sse = np.sum(diff**2) + + return sse + + +def MPK_init_calc( + ccode, + hist_df, + base2010_df, + alpha_overall, + hist_YKP=["rgdpna_19", "rnna_19", "pop"], + base_YKP=["gdp", "capital", "pop"], + init_A_alpha=[100, 1], +): + """Function for calculating the value of MPK (marginal product of capital) of the + country specified by `ccode` and in the year specified by `year`. + Parameters + ---------- + ccode : str + country code of the country we need to calculate the MPK for + hist_df : pandas.DataFrame + dataframe containing historical (1950-2020) information on GDP, capital stock, + and population; should contain the column names in `hist_YKP`. Note that its + values are in millions (of dollars for GDP and capital, of people + for population) + base2010_df : pandas.DataFrame + dataframe containing projected 2010 information on GDP and population and + baseline 2010 historical capital stock information; should contain the column + names in `base_YKP` + alpha_overall : array-like of floats + should contain elasticities of GDP w.r.t. 
capital values that have been pre- + calculated; two elements, the former being our own calculation of the elasticity + and the latter being the elasticity calculated in Crespo Cuaresma (2017) + hist_YKP : array-like of str + column names (of `hist_df`) in the following order: constant PPP GDP + variable, constant PPP capital stock variable, and population variable + base_YKP : array-like of str + column names (of `base2010_df`) in the following order: constant PPP GDP + variable, constant PPP capital stock variable, and population variable + init_A_alpha : array-like of floats + points of initialization for `A` (TFP) and `alpha` (elasticity of GDP w.r.t. + capital) + + Returns + ------- + MPK_overall_our, MPK_overall_iiasa, MPK_country_pc, MPK_country : tuple of floats + calculated MPKs using different versions of the GDP elasticity w.r.t. capital, + first - using our self-calculated elasticity, + second - using the Crespo Cuaresma (2017) elasticity, + third - using the country-specific elasticity assuming Cobb-Douglass function, + fourth - using the country-specific elasticity assuming capital-only function + + """ + + # Y, K, pop values of the years that we want to examine + histccodes = hist_df.index.get_level_values("ccode").unique() + baseccodes = base2010_df.index.get_level_values("ccode").unique() + msg_error = "`ccode` must be in both `hist_df` and `base2010_df`." 
+ assert (ccode in histccodes) and (ccode in baseccodes), msg_error + + # multiplying 1,000,000 since they are in millions + YKP = hist_df.loc[ccode, hist_YKP].dropna() + Ys = YKP[hist_YKP[0]].values * 1000000 + Ks = YKP[hist_YKP[1]].values * 1000000 + Ps = YKP[hist_YKP[2]].values * 1000000 + + # creating the capital intensity values using projected base 2010 data + yk_df = base2010_df.loc[ + base2010_df.index.get_level_values("ccode") == ccode, : + ].copy() + yk_df = yk_df.reset_index().set_index(["ccode", "ssp", "iam"])[base_YKP] + yk_df["yk"] = yk_df[base_YKP[0]] / yk_df[base_YKP[1]] + + # if all zeros for any of the three variables, no reason to calculate MPK + if (Ys == 0).all() or (Ks == 0).all() or (Ps == 0).all(): + yk_df["mpk_our"], yk_df["mpk_iiasa"] = 0, 0 + yk_df["mpk_ctry_cd"], yk_df["mpk_ctry_co"] = 0, 0 + return yk_df + + # let us sort them in the order of Ks, just in case + KYPs = np.array(sorted(zip(Ks, Ys, Ps))) + Ks, Ys, Ps = KYPs[:, 0], KYPs[:, 1], KYPs[:, 2] + + # optimizing values for A (total factor productivity) and alpha (GDP elasticity + # wrt. 
capital); capital-only + A_alpha_getter = lambda x: minimize_simple_production(x, Ks, Ys) + A, alpha = opt_min( + A_alpha_getter, init_A_alpha, bounds=((0, np.inf), (0, np.inf)) + ).x + + # optimizing values for A and alpha; Cobb-Douglas + A_alpha_getter = lambda x: minimize_simple_production(x, Ks / Ps, Ys / Ps) + A_pc, alpha_pc = opt_min( + A_alpha_getter, init_A_alpha, bounds=((0, np.inf), (0, np.inf)) + ).x + + # calculating MPK values based on the above A and alpha calculations + yk_df["mpk_our"] = alpha_overall[0] * yk_df["yk"] + yk_df["mpk_iiasa"] = alpha_overall[-1] * yk_df["yk"] + yk_df["mpk_ctry_cd"] = alpha_pc * yk_df["yk"] + yk_df["mpk_ctry_co"] = alpha * yk_df["yk"] + + return yk_df + + +def pim_single_ctry( + ccode_df, + MPK_init, + alpha_overall, + MPK_var="mpk_our", + scenarios=SCENARIOS, + yr_startend=[2010, 2100], + MPK_bar=0.1, + gamma_MPK=0.985, + gamma_I=0.98, + Yvar="gdp", + Kvar="capital", + iy_var="iy_ratio", + depre_overall_var="delta", + depre_ctry_var="delta_c", +): + """Function for running the perpetual inventory method (PIM, as described in + Dellink et al., 2017), for a specific country, for each SSP-IAM scenario. + + ---------- + ccode_df : pandas.DataFrame + DataFrame containing country-specific information for conducting the by-country + PIM process to acquire capital stock projections. Needs to contain `Yvar`, + `Kvar`, with indices `ccode`, `year`, `ssp`, and `iam`. + MPK_init : pandas.DataFrame + DataFrame containing initial-year marginal product of capital. Also should + contain depreciation rates; so should contain `MPK_var`, `depre_overall_var`, + `iy_var`, and `depre_ctry_var` and with index `ccode`, `ssp`, and `iam` + alpha_overall : float + elasticity of GDP w.r.t. 
capital, global and not country-specific + MPK_var : str + column name for the initial-year marginal product of capital + scenarios : array-like of tuples of str + array-like of tuples containing SSP and IAM (in that order) scenarios + yr_startend : array-like of ints + starting year and end year of projection + MPK_bar : float + long-term elasticity of GDP w.r.t. capital, value 0.1 taken from Dellink et al. + (2017) + gamma_MPK : float + velocity of converging to long-term elasticity of GDP w.r.t. capital, value + 0.985 taken from Dellink et al. (2017) + gamma_I : float + velocity of converging to long-term investment-to-GDP ratio, value 0.98 taken + from Dellink et al. (2017) + Yvar : str + column name of the constant PPP GDP variable + Kvar : str + column name of the (initial-year) constant PPP capital stock variable + iy_var : str + column name of the (initial-year) investment-to-GDP variable + depre_overall_var : str + depreciation rate variable (over all countries) + depre_ctry_var : str + country-specific depreciation rate variable + + Returns + ------- + ccode_df : pandas.DataFrame + DataFrame containing the updated values of capital stock projection estimates + + """ + newvar = "{}_estim".format(Kvar) + ccode = ccode_df.index.get_level_values("ccode").values[0] + + ## delta is same across all scenarios + delta, delta_r = MPK_init.loc[ + (ccode, "SSP1", "OECD"), [depre_overall_var, depre_ctry_var] + ].values + + ccode_df["MPK"], ccode_df["IY"] = np.nan, np.nan + ccode_df[newvar], ccode_df["KY"] = np.nan, np.nan + for yr in range(yr_startend[0], yr_startend[-1] + 1): + for scen in scenarios: + ## advancing MPK values annually + ssp, iam = scen + if yr == yr_startend[0]: + MPK = MPK_init.loc[(ccode, ssp, iam), MPK_var] + else: + prev_MPK = ccode_df.loc[(ccode, yr - 1, ssp, iam), "MPK"] + MPK = gamma_MPK * prev_MPK + (1 - gamma_MPK) * MPK_bar + ccode_df.loc[(slice(None), yr, ssp, iam), "MPK"] = MPK + ky_LT = alpha_overall / MPK + + ## year-to-year GDP growth 
def examine_against_fig6(pim_df, intensity="KY", fig_size=(18, 9)):
    """
    Function to examine the estimated capital intensity (the variable `intensity` in the
    DataFrame `pim_df`) against the Dellink et al. (2017) output of the same variable
    for four countries Tanzania, India, China, and the United States (shown in Fig. 6 of
    the paper). Also calculates the SSE across own estimates and Dellink et al. (2017)'s
    numbers. Only the "OECD" IAM rows of `pim_df` are used.

    Parameters
    ----------
    pim_df : pandas DataFrame
        containing the `intensity` variable; should have indices `ccode`, `year`,
        `ssp`, and `iam` (in that order)
    intensity : str
        capital intensity variable in `pim_df`
    fig_size : tuple of floats or ints
        to set the output figure size

    Returns
    -------
    sse : float
        SSE (w.r.t. Dellink et al. (2017) Figure 6) calculated
        also, presents the bar graphs (containing capital intensity values from data)
        plotted in comparison to Dellink et al. (2017) Figure 6

    """

    FOUR_CTRIES = ["TZA", "IND", "CHN", "USA"]
    SSP = ["SSP{}".format(x) for x in range(5, 0, -1)]
    FIG_YRS = [2100, 2050, 2020]

    ## preparing the figures; the size is taken from `fig_size`
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=fig_size)
    ## vertical positions of the bars: 15 (= 5 SSPs x 3 years) slots
    which = np.arange(0.1, 15 * 0.5 + 0.1, 0.5)
    which = which + np.array(range(0, len(which))) * 0.1

    ## from Dellink et al. (2017); had to measure the figure with a ruler
    ky_cm = 3 / 6.525
    dellink_case = pd.DataFrame(
        list(lstprod(*[FOUR_CTRIES, FIG_YRS, SSP])), columns=["ccode", "year", "ssp"]
    )
    dellink_case[intensity] = np.nan
    dellink_case["year"] = dellink_case["year"].astype("int64")
    dellink_case.set_index(["ccode", "year", "ssp"], inplace=True)

    ## Values from Figure 6, in the order of SSP5 -> SSP1 and 2100, 2050, 2020
    TZN = [
        np.array([6.2, 4.45, 4.4]) * ky_cm,
        np.array([6.95, 6.1, 4.475]) * ky_cm,
        np.array([6.25, 5.675, 4.5]) * ky_cm,
        np.array([5.95, 5.175, 4.45]) * ky_cm,
        np.array([6.25, 4.65, 4.45]) * ky_cm,
    ]
    IND = [
        np.array([7.4, 5.7, 5.75]) * ky_cm,
        np.array([7.575, 6.7, 5.8]) * ky_cm,
        np.array([7.65, 7.45, 5.85]) * ky_cm,
        np.array([7.3, 6.525, 5.775]) * ky_cm,
        np.array([7.6, 5.95, 5.75]) * ky_cm,
    ]
    CHN = [
        np.array([9.9, 8.45, 6.35]) * ky_cm,
        np.array([10.25, 9.55, 6.475]) * ky_cm,
        np.array([9.8, 10.55, 6.525]) * ky_cm,
        np.array([9.6, 9.55, 6.45]) * ky_cm,
        np.array([10.45, 8.8, 6.4]) * ky_cm,
    ]
    USA = [
        np.array([6.25, 5.4, 5.05]) * ky_cm,
        np.array([6.75, 5.9, 5.1]) * ky_cm,
        np.array([7.275, 6.325, 5.15]) * ky_cm,
        np.array([6.9, 6.05, 5.1]) * ky_cm,
        np.array([6.65, 5.8, 5.1]) * ky_cm,
    ]
    for ctry, ct in zip(FOUR_CTRIES, [TZN, IND, CHN, USA]):
        for ssp, row in zip(SSP, ct):
            dellink_case.loc[(ctry, FIG_YRS, ssp), intensity] = row

    ## tick labels; the SSP name is attached to the middle (2050) bar of each group
    labs = []
    for ssp in SSP:
        labs += ["2100", "{} 2050".format(ssp), "2020"]

    ax1.set_yticks(which + 0.15)
    ax1.set_yticklabels(labs)
    dellink_vals = []
    for pos, ctry in enumerate(FOUR_CTRIES):
        ctry_vals = []
        for ssp in SSP:
            ctry_vals += list(dellink_case.loc[(ctry, FIG_YRS, ssp), intensity].values)
        ax1.barh(which + pos * 0.1, ctry_vals, height=0.1, label=ctry)
        dellink_vals += ctry_vals
    ax1.legend()
    ax1.set_title("Capital intensities for selected countries, Dellink et al. (2017)")

    ax2.set_yticks(which + 0.15)
    ax2.set_yticklabels(labs)
    our_vals = []
    for pos, ctry in enumerate(FOUR_CTRIES):
        ctry_vals = []
        for ssp in SSP:
            ctry_vals += list(
                pim_df.loc[(ctry, FIG_YRS, ssp, "OECD"), intensity].values
            )
        ax2.barh(which + pos * 0.1, ctry_vals, height=0.1, label=ctry)
        our_vals += ctry_vals

    ## same x-axis ticks on both panels so that the bars are directly comparable
    mx = max(our_vals + dellink_vals)
    ax_set = np.linspace(0, np.ceil(mx), 5)
    sse = ((np.array(our_vals) - np.array(dellink_vals)) ** 2).sum()
    sser = round(sse, 3)

    ax1.set_xticks(ax_set)
    ax2.set_xticks(ax_set)

    ax2.legend()
    ax2.set_title("Capital intensities, our own replication using the OECD method")

    fig.suptitle("Comparison of capital intensities; SSE={}".format(sser), fontsize=12)
    fig.show()

    return sse
+ + Parameters + ---------- + df : pandas DataFrame + containing `capvar` variable, with indices `ccode`, `year`, + `ssp`, and `iam` (in that order) + yr : int + year in which we would like to compare the `capvar` values across countries + ssp : str + SSP scenario that we would like to examine + capvar : str + the name of the variable to produce top 10 and bottom 10 countries from + + Returns + ------- + None, but presents the top 10 and bottom 10 countries by IAMs in bar graphs + + """ + + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 14)) + + iiasa_df = df.loc[(slice(None), yr, ssp, "IIASA"), [capvar]].copy() + iiasa_df.sort_values([capvar], inplace=True) + iiasa_sma = iiasa_df.index.get_level_values("ccode")[0:10] + iiasa_sma_vals = np.log(iiasa_df[capvar].values[0:10]) + + small = list(range(1, 11)) + ax1.barh(small, iiasa_sma_vals, label="Bottom 10", color="orange", height=0.8) + ax1.set_yticks(small) + ax1.set_yticklabels(iiasa_sma) + + iiasa_big = iiasa_df.index.get_level_values("ccode")[-10:] + iiasa_big_vals = np.log(iiasa_df[capvar].values[-10:]) + + big = list(range(11, 21)) + ax1.barh(big, iiasa_big_vals, label="Top 10", color="#87CEEB", height=0.8) + ax1.set_yticks(small + big) + ax1.set_yticklabels(np.hstack([iiasa_sma, iiasa_big])) + fig.suptitle("Log of capital stock in the year {} and {} scenario".format(yr, ssp)) + ax1.set_title("Case for IIASA") + ax1.set_xlabel("Log of millions of dollars") + + oecd_df = df.loc[(slice(None), yr, ssp, "OECD"), [capvar]].copy() + oecd_df.sort_values([capvar], inplace=True) + oecd_sma = oecd_df.index.get_level_values("ccode")[0:10] + oecd_sma_vals = np.log(oecd_df[capvar].values[0:10]) + + ax2.barh(small, oecd_sma_vals, label="Bottom 10", color="orange", height=0.8) + ax2.set_yticks(small) + ax2.set_yticklabels(oecd_sma) + + oecd_big = oecd_df.index.get_level_values("ccode")[-10:] + oecd_big_vals = np.log(oecd_df[capvar].values[-10:]) + + ax2.barh(big, oecd_big_vals, label="Top 10", color="#87CEEB", height=0.8) 
def upload_pkg(client, pkg_dir):
    """Zip the directory tree at `pkg_dir` and hand the archive to `client`.

    Every file found while walking `pkg_dir` is written to a temporary zip
    archive. Archive member names are computed relative to the *parent* of
    `pkg_dir`, so the directory's own name is kept as the top-level folder
    inside the archive. The path of the finished archive is then passed to
    ``client.upload_file``.

    Parameters
    ----------
    client : object
        Any object exposing an ``upload_file(path)`` method (presumably a
        ``dask.distributed.Client`` -- confirm against callers).
    pkg_dir : str or path-like
        Root directory of the package to upload.

    Returns
    -------
    None
    """
    # Member names are taken relative to this directory (the parent of
    # `pkg_dir`), which keeps the package folder name in the archive.
    archive_root = os.path.join(pkg_dir, "..")

    with tmpfile(extension="zip") as f:
        # The context manager closes (and thereby finalizes) the archive even if
        # walking or writing fails part-way, instead of relying on a manual
        # ``close()`` that is skipped when an exception is raised.
        with zipfile.ZipFile(f, "w", zipfile.ZIP_DEFLATED) as zipf:
            for root, _dirs, files in os.walk(pkg_dir):
                for fname in files:
                    fpath = os.path.join(root, fname)
                    zipf.write(fpath, os.path.relpath(fpath, archive_root))
        # Upload only after the archive has been closed, as in the original.
        client.upload_file(f)
def gcsmap_to_fuse(gcsmap):
    """Return the GCS FUSE path for a mapper.

    Parameters
    ----------
    gcsmap : object
        Mapper exposing a ``root`` attribute (e.g. the object returned by
        ``fuse_to_gcsmap``).

    Returns
    -------
    :class:`pathlib.Path`
        ``Path("/gcs", gcsmap.root)``
    """
    return Path("/gcs", gcsmap.root)


def _swap_fuse_prefix(path, new_prefix):
    """Replace a leading ``/gcs/`` in `path` with `new_prefix`.

    Only the prefix is touched: a ``/gcs/`` segment occurring later in the path
    is part of the object name and must be left alone. Paths that do not start
    with ``/gcs/`` are returned unchanged (as a string).
    """
    text = str(path)
    fuse_prefix = "/gcs/"
    if text.startswith(fuse_prefix):
        return new_prefix + text[len(fuse_prefix) :]
    return text


def fuse_to_url(path):
    """Convert a GCS FUSE path (``/gcs/<bucket>/...``) to a ``gs://`` URL.

    Parameters
    ----------
    path : str or :class:`pathlib.Path`
        Path on GCS FUSE

    Returns
    -------
    str
        `path` with its leading ``/gcs/`` replaced by ``gs://``
    """
    return _swap_fuse_prefix(path, "gs://")


def fuse_to_gspath(path):
    """Convert a GCS FUSE path (``/gcs/<bucket>/...``) to ``<bucket>/...``.

    Parameters
    ----------
    path : str or :class:`pathlib.Path`
        Path on GCS FUSE

    Returns
    -------
    str
        `path` with its leading ``/gcs/`` removed
    """
    return _swap_fuse_prefix(path, "")
def read_gdf(fpath, skiprows=36):
    """Read a whitespace-delimited `.gdf` file into an `xarray.DataArray`.

    The file at `fpath` is parsed as a three-column table, with the columns
    named `lon` (longitude), `lat` (latitude) and `z` (data, such as geoid).
    The table is indexed by (`lon`, `lat`) and the `z` column is converted to a
    `xarray.DataArray`.

    Parameters
    ----------
    fpath : pathlib.Path-like
        path where the `.gdf` file of interest is located at
    skiprows : int, optional
        number of leading lines to skip before the data rows start. Defaults
        to 36, the value previously hard-coded in this function.

    Returns
    -------
    xarray.DataArray
        containing values of `z` with coordinates `lon` and `lat`

    """
    table = pd.read_table(
        fpath,
        skiprows=skiprows,
        names=["lon", "lat", "z"],
        # `sep=r"\s+"` is the documented equivalent of the deprecated
        # `delim_whitespace=True`.
        sep=r"\s+",
    )
    return table.set_index(["lon", "lat"]).z.to_xarray()
"CRS mismatch between the CRS", + "Geometry is in a geographic CRS", + "initial implementation of Parquet.", + "Iteration over", + "__len__ for multi-part geometries", + "The array interface is deprecated", + "Only Polygon objects have interior rings", +] + +# SLIIDERS-SLR PARAMS +LOCALIZESL_COREFILES = { + "SLRProjections190726core_SEJ_full": ["L", "H"], + "SLRProjections170113GRIDDEDcore": [None], + "SLRProjections200204GRIDDEDcore_D20": [None], + "SLRProjections210628GRIDDEDcore_SROCC": [None], +} +LOCALIZESL_REV = "c9b020a0f9409cde3f6796ca936f229c90f7d5c6" + +# Aland Islands, Western Sahara, Libya, Palestine, South Sudan, Syria, Kosovo +ISOS_IN_GEG_NOT_LITPOP = ["ALA", "ESH", "LBY", "PSE", "SSD", "SYR", "XKX"] + +# for organizing scenarios +SSP_PROJ_ORG_SER = pd.Series( + { + "SSP1_v9_130219": "SSP1", + "SSP1_v9_130325": "SSP1", + "SSP1_v9_130424": "SSP1", + "SSP1_v9_130115": "SSP1", + "SSP2_v9_130219": "SSP2", + "SSP2_v9_130325": "SSP2", + "SSP2_v9_130424": "SSP2", + "SSP2_v9_130115": "SSP2", + "SSP3_v9_130219": "SSP3", + "SSP3_v9_130325": "SSP3", + "SSP3_v9_130424": "SSP3", + "SSP3_v9_130115": "SSP3", + "SSP4_v9_130219": "SSP4", + "SSP4_v9_130325": "SSP4", + "SSP4_v9_130424": "SSP4", + "SSP4_v9_130115": "SSP4", + "SSP4d_v9_130115": "SSP4", + "SSP5_v9_130219": "SSP5", + "SSP5_v9_130325": "SSP5", + "SSP5_v9_130424": "SSP5", + "SSP5_v9_130115": "SSP5", + "IIASA GDP": "IIASA", + "IIASA-WiC POP": "IIASA-WiC", + "NCAR": "NCAR", + "OECD Env-Growth": "OECD", + "PIK GDP-32": "PIK", + } +) +SCENARIOS = [ + ("SSP1", "OECD"), + ("SSP1", "IIASA"), + ("SSP2", "OECD"), + ("SSP2", "IIASA"), + ("SSP3", "OECD"), + ("SSP3", "IIASA"), + ("SSP4", "OECD"), + ("SSP4", "IIASA"), + ("SSP5", "OECD"), + ("SSP5", "IIASA"), +] + +# country ISO code groupings +EXCLUDED_ISOS = ["ATA", "XCA"] + +FRA_MSNG = [ + "REU", + "WLF", + "ATF", + "SPM", + "AND", + "BLM", + "GLP", + "GUF", + "MAF", + "MCO", + "MTQ", + "MYT", + "NCL", + "PYF", +] +USA_MSNG = [ + "ASM", + "GUM", + "LIE", + "MNP", + 
"PRK", + "SOM", + "MHL", + "FSM", + "ERI", + "CUB", + "UMI", + "VIR", +] +PPP_CCODE_IF_MSNG = { + "AUS": ["CCK", "CXR", "HMD", "NFK"], + "DNK": ["GRL", "FRO"], + "FRA": FRA_MSNG, + "FIN": ["ALA"], + "ITA": ["VAT", "SMR"], + "USA": USA_MSNG, + "MAR": ["ESH"], + "CUW": ["BES"], + "NZL": ["NIU", "COK", "TKL"], + "NOR": ["BVT", "SJM"], + "GBR": ["IMN", "FLK", "GGY+JEY", "GIB", "PCN", "SGS", "SHN", "GGY", "JEY"], + "ESH": ["MAR"], +} + +PWT_ISOS = [ + "ABW", + "AGO", + "AIA", + "ALB", + "ARE", + "ARG", + "ARM", + "ATG", + "AUS", + "AUT", + "AZE", + "BDI", + "BEL", + "BEN", + "BFA", + "BGD", + "BGR", + "BHR", + "BHS", + "BIH", + "BLR", + "BLZ", + "BMU", + "BOL", + "BRA", + "BRB", + "BRN", + "BTN", + "BWA", + "CAF", + "CAN", + "CHE", + "CHL", + "CHN", + "CIV", + "CMR", + "COD", + "COG", + "COL", + "COM", + "CPV", + "CRI", + "CUW", + "CYM", + "CYP", + "CZE", + "DEU", + "DJI", + "DMA", + "DNK", + "DOM", + "DZA", + "ECU", + "EGY", + "ESP", + "EST", + "ETH", + "FIN", + "FJI", + "FRA", + "GAB", + "GBR", + "GEO", + "GHA", + "GIN", + "GMB", + "GNB", + "GNQ", + "GRC", + "GRD", + "GTM", + "GUY", + "HKG", + "HND", + "HRV", + "HTI", + "HUN", + "IDN", + "IND", + "IRL", + "IRN", + "IRQ", + "ISL", + "ISR", + "ITA", + "JAM", + "JOR", + "JPN", + "KAZ", + "KEN", + "KGZ", + "KHM", + "KNA", + "KOR", + "KWT", + "LAO", + "LBN", + "LBR", + "LCA", + "LKA", + "LSO", + "LTU", + "LUX", + "LVA", + "MAC", + "MAR", + "MDA", + "MDG", + "MDV", + "MEX", + "MKD", + "MLI", + "MLT", + "MMR", + "MNE", + "MNG", + "MOZ", + "MRT", + "MSR", + "MUS", + "MWI", + "MYS", + "NAM", + "NER", + "NGA", + "NIC", + "NLD", + "NOR", + "NPL", + "NZL", + "OMN", + "PAK", + "PAN", + "PER", + "PHL", + "POL", + "PRT", + "PRY", + "PSE", + "QAT", + "ROU", + "RUS", + "RWA", + "SAU", + "SDN", + "SEN", + "SGP", + "SLE", + "SLV", + "SRB", + "STP", + "SUR", + "SVK", + "SVN", + "SWE", + "SWZ", + "SXM", + "SYC", + "SYR", + "TCA", + "TCD", + "TGO", + "THA", + "TJK", + "TKM", + "TTO", + "TUN", + "TUR", + "TWN", + "TZA", + "UGA", + "UKR", + 
"URY", + "USA", + "UZB", + "VCT", + "VEN", + "VGB", + "VNM", + "YEM", + "ZAF", + "ZMB", + "ZWE", +] + +UNINHABITED_ISOS = ["ATF", "BVT", "CL-", "HMD", "IOT", "SGS"] +OTHER_ISOS = [ + "AFG", + "ALA", + "AND", + "ASM", + "BES", + "BLM", + "CCK", + "COK", + "CUB", + "CXR", + "ERI", + "ESH", + "FLK", + "FRO", + "FSM", + "GGY", + "GIB", + "GLP", + "GRL", + "GUF", + "GUM", + "IMN", + "JEY", + "KIR", + "KO-", + "LBY", + "LIE", + "MAF", + "MCO", + "MHL", + "MNP", + "MTQ", + "MYT", + "NCL", + "NFK", + "NIU", + "NRU", + "PCN", + "PLW", + "PNG", + "PRI", + "PRK", + "PYF", + "REU", + "SHN", + "SJM", + "SLB", + "SMR", + "SOM", + "SPM", + "SSD", + "TKL", + "TLS", + "TON", + "TUV", + "UMI", + "VAT", + "VIR", + "VUT", + "WLF", + "WSM", +] + +ALL_ISOS = np.sort(np.union1d(PWT_ISOS, UNINHABITED_ISOS + OTHER_ISOS)) +EXTENDED_ISOS = ["GGY+JEY", "CHI", "XKX"] +ALL_ISOS_EXTENDED = np.sort(np.union1d(ALL_ISOS, EXTENDED_ISOS)) + +# Dask image name +DASK_IMAGE = "gcr.io/rhg-project-1/pytc-image-devbase:latest" + +# Constants +# Data +LITPOP_GRID_WIDTH = 1 / 120 +GEG_GRID_WIDTH = 1 / 24 +LANDSCAN_GRID_WIDTH = 1 / 120 + +EXPOSURE_BIN_WIDTH_V = 1 / 10 # meters +EXPOSURE_BIN_WIDTH_H = 1 / 10 # 10cm +HIGHEST_WITHELEV_EXPOSURE_METERS = 20 +ELEV_CAP = HIGHEST_WITHELEV_EXPOSURE_METERS + 1 # "higher than coastal" value + +## Spatial + +# Area, in "square degrees", above which we will consider endorheic basins as protected areas +# N.B. this is an arbitrary choice (something more robust could use something like a bathtub model +# over a highly resolved elevation grid). +MIN_BASIN_TILE_DEGREE_AREA = 20.0 + +# minimum distance in degrees from the ocean to include an endorheic basin as +# a "protected area" +ENDORHEIC_BASIN_OCEAN_BUFFER = 0.2 + +MAX_VORONOI_COMPLEXITY = ( + 40e6 # Maximum number of initial points in shapefile when generating Voronoi +) + +# Width, in degrees, of squares in which to divide the shapes of administrative regions. 
+# The smaller shapes are more manageable and computationally efficient in many +# geometry-processing algorithms +DEFAULT_BOX_SIZE = 1.0 + +DENSIFY_TOLERANCE = 0.01 +MARGIN_DIST = 0.001 +ROUND_INPUT_POINTS = 6 +SMALLEST_INTERIOR_RING = 1e-13 + +# What are the return periods (in years) we allow for retreat and protect standards +SVALS = np.array([10, 100, 1000, 10000]) + +# Paths and Directories +DIR_DATA = Path("/gcs/rhg-data/impactlab-rhg/coastal/sliiders") + +DIR_DATA_RAW = DIR_DATA / "raw" +DIR_DATA_INT = DIR_DATA / "int" +DIR_RESULTS = DIR_DATA / "output" + +DIR_EXPOSURE_RAW = DIR_DATA_RAW / "exposure" +DIR_EXPOSURE_INT = DIR_DATA_INT / "exposure" + +DIR_LITPOP_RAW = DIR_EXPOSURE_RAW / "asset_value" / "litpop" / LITPOP_DATESTAMP +PATH_LITPOP_RAW = DIR_LITPOP_RAW / LITPOP_VERS / "LitPop_pc_30arcsec_*.csv" + +DIR_GEG15_RAW = DIR_EXPOSURE_RAW / "asset_value" / "geg15" +DIR_GEG15_INT = DIR_EXPOSURE_INT / "asset_value" / "geg15" / GEG15_VERS +PATH_GEG15_INT = DIR_GEG15_INT / "gar_exp.parquet" + +DIR_SLR_RAW = DIR_DATA_RAW / "slr" +DIR_SLR_INT = DIR_DATA_INT / "slr" + +DIR_IFILES_RAW = DIR_SLR_RAW / "ifiles" +DIR_IFILES_INT = DIR_SLR_INT / "ifiles" +PATH_SLR_N_GCMS = fuse_to_gcsmap(DIR_SLR_INT / f"numGCMs_{SLIIDERS_VERS}.zarr", FS) + +DIR_GEOG_RAW = DIR_DATA_RAW / "geography" +DIR_GEOG_INT = DIR_DATA_INT / "geography" + +PATH_CIAM_2016 = fuse_to_gcsmap( + DIR_DATA_RAW / "CIAM_2016" / "diaz2016_inputs_raw.zarr", FS +) + +PATH_SLIIDERS_ECON = fuse_to_gcsmap( + DIR_RESULTS / f"sliiders-econ-{SLIIDERS_VERS}.zarr", FS +) +PATH_SLIIDERS_SLR = fuse_to_gcsmap( + DIR_RESULTS / f"sliiders-slr-{SLIIDERS_VERS}.zarr", FS +) + +PATH_SEG_CENTROIDS = DIR_GEOG_INT / "gtsm_stations_thinned_ciam" + +PATH_CIAM_COASTLINES = DIR_GEOG_INT / "ne_coastline_lines_CIAM_wexp_or_gtsm" + +DIR_GTSM_STATIONS_TOTHIN = DIR_GEOG_RAW / "gtsm_stations_eur_tothin" + +DIR_CIAM_VORONOI = DIR_GEOG_INT / "ciam_and_adm1_intersections" / EXPOSURE_BINNED_VERS +PATH_CIAM_ADM1_VORONOI_INTERSECTIONS = ( + 
DIR_CIAM_VORONOI / "ciam_and_adm1_intersections.parquet" +) + +PATH_CIAM_ADM1_VORONOI_INTERSECTIONS_SHP = ( + DIR_CIAM_VORONOI / "ciam_and_adm1_intersections.shp" +) + +DIR_SHAPEFILES = Path("/gcs/rhg-data/impactlab-rhg/spatial/shapefiles/source") + +DIR_GADM = Path(DIR_SHAPEFILES / "gadm" / GADM_VERS) + +PATH_GADM = DIR_GADM / f"{GADM_VERS}_levels" / f"{GADM_VERS}_levels.gpkg" +PATH_GADM_ADM1 = DIR_GADM / "adm1.parquet" +PATH_GADM_ADM0_VORONOI = DIR_GADM / "adm0_voronoi.parquet" +PATH_GADM_ADM1_VORONOI = DIR_GADM / "adm1_voronoi.parquet" + +PATH_EXPOSURE_BLENDED = ( + DIR_EXPOSURE_INT + / "asset_value" + / "litpop" + / EXPOSURE_BLENDED_VERS + / "LitPop_pc_30arcsec.parquet" +) + +PATH_NATURALEARTH_OCEAN = DIR_SHAPEFILES / "natural_earth" / "ne_10m_ocean" +DIR_HYDROBASINS_RAW = DIR_DATA_RAW / "hydrosheds" / "hydrobasins" + +DIR_GLOBAL_PROTECTED_AREAS = ( + DIR_EXPOSURE_INT + / "protected_locations" + / "global" + / "historical" + / GLOBAL_PROTECTED_AREAS_VERS +) + +PATH_US_MANUAL_PROTECTED_AREAS = ( + DIR_EXPOSURE_RAW + / "protected_areas" + / "usa" + / "manual" + / "us_manual_protected_areas.parquet" +) + +PATH_MANUAL_PROTECTED_AREAS = ( + DIR_GLOBAL_PROTECTED_AREAS / "manual_global_basins.parquet" +) +PATH_GLOBAL_PROTECTED_AREAS = DIR_GLOBAL_PROTECTED_AREAS / "all_protected_areas.parquet" + +DIR_WETLANDS_RAW = DIR_DATA_RAW / "wetlands_mangroves" +DIR_WETLANDS_INT = DIR_DATA_INT / "wetlands_mangroves" +PATH_GLOBCOVER_2009 = ( + DIR_WETLANDS_RAW + / "Globcover2009_V2.3_Global" + / "GLOBCOVER_L4_200901_200912_V2.3.tif" +) + +PATH_GLOBAL_MANGROVES = ( + DIR_WETLANDS_RAW + / "GMW_001_GlobalMangroveWatch_2016" + / "01_Data" + / "GMW_2016_v2.shp" +) + +PATH_WETLANDS_INT = DIR_WETLANDS_INT / "wetlands.shp" + +DIR_ELEVATION = Path("/gcs/rhg-data/impactlab-rhg/common_data/elevation") +DIR_ELEVATION_RAW = DIR_ELEVATION / "raw" +DIR_ELEVATION_INT = DIR_ELEVATION / "int" + +PATH_SRTM15_PLUS = DIR_ELEVATION_RAW / "srtm15_plus" / "SRTM15_V2.3.nc" +DIR_MSS = DIR_ELEVATION_INT / 
"CoastalDEM_mss_corrected" +DIR_COASTALDEM = ( + DIR_ELEVATION_RAW / "climate_central" / "coastal_dem_30as" / "CoastalDEM_Global_30m" +) + +DIR_LANDSCAN_RAW = DIR_EXPOSURE_RAW / "landscan" +DIR_LANDSCAN_INT = DIR_EXPOSURE_INT / "landscan" / f"ls{LANDSCAN_YEAR}" +PATH_LANDSCAN_INT = DIR_LANDSCAN_INT / "population.parquet" + +DIR_EXPOSURE_BINNED = ( + DIR_EXPOSURE_INT / "asset_value" / "binned" / "global" / "historical" +) +DIR_EXPOSURE_BINNED_TMP = DIR_EXPOSURE_BINNED / "tmp" +DIR_EXPOSURE_BINNED_TMP_TILES = DIR_EXPOSURE_BINNED_TMP / "tiles" +DIR_EXPOSURE_BINNED_TMP_TILES_NOLAND = DIR_EXPOSURE_BINNED_TMP / "tiles_noland" +DIR_EXPOSURE_BINNED_TMP_TILES_SEGMENT_AREA = ( + DIR_EXPOSURE_BINNED_TMP / "tiles_segment_area" +) + +PATH_EXPOSURE_TILE_LIST = DIR_EXPOSURE_BINNED / "tmp" / "meta" / "tile_list.parquet" + +PATH_EXPOSURE_AREA_BY_CIAM_AND_ELEVATION = ( + DIR_EXPOSURE_BINNED / EXPOSURE_BINNED_VERS / "ciam_segs_area_by_elev.parquet" +) + +PATH_EXPOSURE_BINNED_WITHOUTELEV = ( + DIR_EXPOSURE_BINNED + / EXPOSURE_BINNED_VERS + / "binned_exposure_withoutelev_base.parquet" +) + +PATH_EXPOSURE_BINNED_WITHELEV = ( + DIR_EXPOSURE_BINNED / EXPOSURE_BINNED_VERS / "binned_exposure_withelev_base.parquet" +) + +DIR_GEOG_DATUMS_RAW = DIR_GEOG_RAW / "datum_conversions" +DIR_GEOG_DATUMS_INT = DIR_GEOG_INT / "datum_conversions" + +DIR_GEOG_DATUMS_EGM96_WGS84 = DIR_GEOG_DATUMS_RAW / "egm96" +DIR_GEOG_DATUMS_XGM2019e_WGS84 = DIR_GEOG_DATUMS_RAW / "xgm2019e" + +PATH_GEOG_MDT_RAW = DIR_GEOG_RAW / "mdt" / "aviso_2018" / "mdt_cnes_cls18_global.nc" + +PATH_GEOG_DATUMS_GRID = fuse_to_gcsmap( + DIR_GEOG_DATUMS_INT / f"datum_conversions_gridded_{DATUM_CONVERSION_VERS}.zarr", FS +) + +PATH_GTSM_SURGE = ( + DIR_DATA_RAW / "esl" / "CODEC_amax_ERA5_1979_2017_coor_mask_GUM_RPS.nc" +) + +DIR_CCI_RAW = DIR_DATA_RAW / "cci" +PATH_EXPOSURE_WB_ICP = DIR_CCI_RAW / "world_bank_ICP_2017.csv" +PATH_EXPOSURE_LINCKE = DIR_CCI_RAW / "lincke_2021_country_input.csv" + +# Various directories and paths for the 
country-level ("YPK") workflow +DIR_YPK_INT = DIR_EXPOSURE_INT / "ypk" +DIR_YPK_FINAL = DIR_YPK_INT / "finalized" +DIR_YPK_RAW = DIR_EXPOSURE_RAW / "ypk" +PATH_COUNTRY_LEVEL_EXPOSURE = DIR_YPK_FINAL / "gdp_gdppc_pop_capital_1950_2020.parquet" +PATH_COUNTRY_LEVEL_EXPOSURE_PROJ = ( + DIR_YPK_FINAL / "gdp_gdppc_pop_capital_proj_2010_2100.parquet" +) + +DIR_UN_AMA_RAW = DIR_YPK_RAW / "un_ama" / UN_AMA_DATESTAMP +DIR_UN_WPP_RAW = DIR_YPK_RAW / "un_wpp" / UN_WPP_VERS +DIR_WB_WDI_RAW = DIR_YPK_RAW / "wb_wdi" / WB_WDI_DATESTAMP +DIR_OECD_REGIONS_RAW = DIR_YPK_RAW / "oecd_regions" / OECD_DATESTAMP +DIR_IIASA_PROJECTIONS = ( + DIR_YPK_RAW / "iiasa_projections" / IIASA_PROJECTIONS_DOWNLOAD_VERS +) +DIR_ALAND_STATISTICS_RAW = DIR_YPK_RAW / "asub" / ALAND_STATISTICS_DATESTAMP +PATH_GWDB2021_RAW = ( + DIR_YPK_RAW / "gwdb" / GWDB_DATESTAMP / "global-wealth-databook-2021.pdf" +) +PATH_PWT_RAW = DIR_YPK_RAW / "pwt" / PWT_DATESTAMP / "pwt_100.xlsx" +PATH_IMF_WEO_RAW = DIR_YPK_RAW / "imf_weo" / IMF_WEO_VERS / "WEO_iy_ratio_pop_gdp.xlsx" +PATH_MPD_RAW = DIR_YPK_RAW / "mpd" / MPD_DATESTAMP / "maddison_project.xlsx" diff --git a/sliiders/spatial.py b/sliiders/spatial.py new file mode 100644 index 0000000..f43171f --- /dev/null +++ b/sliiders/spatial.py @@ -0,0 +1,3331 @@ +import random +import warnings +import zipfile +from collections import defaultdict +from operator import itemgetter +from typing import Any, Sequence, Union + +import geopandas as gpd +import matplotlib._color_data as mcd +import networkx as nx +import numpy as np +import pandas as pd +import pygeos +import regionmask +import rioxarray +import shapely as shp +import xarray as xr +from dask_gateway import Gateway +from IPython.display import display +from numba import jit +from pyinterp.backends.xarray import Grid2D +from scipy.spatial import SphericalVoronoi, cKDTree +from shapely.geometry import ( + GeometryCollection, + LineString, + MultiLineString, + MultiPolygon, + Point, + Polygon, + box, +) +from shapely.ops 
def iso_poly_box_getter(iso, shp_df):
    """Get the bounding boxes, rounded outward to whole coordinate units, of each
    Polygon belonging to the geometry of the country specified by the ISO code.

    Parameters
    ----------
    iso : str
        ISO code of the country that we are interested in
    shp_df : geopandas DataFrame
        with the indices being the iso codes and with a column called `geometry`
        containing the shapefile of the relevant countries

    Returns
    -------
    list of tuples (of length four)
        each tuple is ``(xmin, xmax, ymin, ymax)``: the smallest x and y
        coordinates rounded down (``np.floor``) and the largest rounded up
        (``np.ceil``). Duplicate boxes are removed, so the order of the list is
        not meaningful.

    """
    # NB: named `geom` rather than `shp` so the module-level alias `shp`
    # (shapely) is not shadowed inside this function.
    geom = shp_df.loc[iso, "geometry"]
    polys = geom.geoms if isinstance(geom, MultiPolygon) else [geom]

    poly_bounds = set()
    for poly in polys:
        # A polygon's bounds are those of its exterior ring.
        xmin, ymin, xmax, ymax = poly.bounds
        poly_bounds.add(
            (np.floor(xmin), np.ceil(xmax), np.floor(ymin), np.ceil(ymax))
        )

    return list(poly_bounds)
def filter_spatial_warnings():
    """Install "ignore" filters for every entry of
    `sliiders.settings.SPATIAL_WARNINGS_TO_IGNORE`.

    Each entry is interpolated, unescaped, into the regular expression given to
    ``warnings.filterwarnings``. Any regex metacharacters in an entry are
    therefore active, and the trailing ``*`` quantifies the entry's final
    character.
    """
    patterns = [f".*{text}*" for text in sset.SPATIAL_WARNINGS_TO_IGNORE]
    for pattern in patterns:
        warnings.filterwarnings("ignore", message=pattern)
def get_points_on_lines(geom, distance, starting_length=0.0):
    """Return evenly spaced points on a LineString or
    MultiLineString object.

    Parameters
    ----------
    geom : :py:class:`shapely.geometry.MultiLineString` or
        :py:class:`shapely.geometry.LineString`
    distance : float
        Interval desired between points along LineString(s).
    starting_length : float
        How far in from one end of the LineString you would like
        to put your first point.

    Returns
    -------
    points : :py:class:`shapely.geometry.MultiPoint` or list
        All of the points placed on the line(s). An empty list is returned for
        a LineString that is not longer than `starting_length` (no point fits).
    remaining_length : float
        For a LineString that received points: its length beyond the last
        point. For a LineString too short to receive a point:
        ``starting_length - geom.length``. For a MultiLineString: the value
        produced by its last part.

    Raises
    ------
    ValueError
        If `geom` is neither a LineString nor a MultiLineString.
    """

    if geom.geom_type == "LineString":
        short_length = geom.length - starting_length

        # if no points should be on this linestring, return
        # empty list
        if short_length <= 0:
            return [], -short_length

        # else return the points at `starting_length + n * distance`
        num_vert = int(short_length / distance) + 1
        remaining_length = geom.length - ((num_vert - 1) * distance + starting_length)
        points = [
            geom.interpolate(n * distance + starting_length, normalized=False)
            for n in range(num_vert)
        ]
        return shp.geometry.MultiPoint(points), remaining_length

    if geom.geom_type == "MultiLineString":
        # NOTE(review): the value carried from one part to the next is the
        # leftover length *after* the last point, and it is used as the offset
        # of the first point on the next part. Confirm this is the intended
        # spacing across part boundaries (vs. `distance - leftover`).
        this_length = starting_length
        parts = []
        # Multi-part geometries are iterated through `.geoms`, as elsewhere in
        # this module.
        for part in geom.geoms:
            res, this_length = get_points_on_lines(part, distance, this_length)
            # `res` is either a MultiPoint or an empty list (see above)
            parts.extend(getattr(res, "geoms", res))
        return shp.geometry.MultiPoint(parts), this_length

    raise ValueError(f"unhandled geometry {geom.geom_type}")
def strip_line_interiors_poly(g):
    """Remove tiny interior rings (holes) from a Polygon.

    Parameters
    ----------
    g : shapely.Polygon
        A Shapely Polygon

    Returns
    -------
    shapely.Polygon
        A Shapely Polygon with the exterior of `g`, keeping only the interior
        rings whose enclosed area is strictly larger than
        `sliiders.settings.SMALLEST_INTERIOR_RING` (measured in
        "square degrees").
    """
    kept_rings = [
        ring
        for ring in g.interiors
        if Polygon(ring).area > sset.SMALLEST_INTERIOR_RING
    ]
    return Polygon(g.exterior, kept_rings)


def strip_line_interiors(g):
    """Remove tiny interior rings (holes) from the Polygons of a Geometry.

    Parameters
    ----------
    g : shapely.Geometry
        A Shapely Geometry. Must be an object containing Polygons, i.e.
        shapely.Polygon, shapely.MultiPolygon or shapely.GeometryCollection

    Returns
    -------
    shapely.Polygon or shapely.MultiPolygon
        The union of the Polygons contained in `g`, each stripped of interior
        rings whose area is smaller than or equal to
        `sliiders.settings.SMALLEST_INTERIOR_RING`, measured in
        "square degrees". Non-polygonal members of a GeometryCollection are
        dropped.

    Raises
    ------
    ValueError
        If `g` is not a Polygon, MultiPolygon or GeometryCollection.
    """
    if isinstance(g, Polygon):
        return strip_line_interiors_poly(g)

    if isinstance(g, MultiPolygon):
        return unary_union(
            [
                strip_line_interiors_poly(component)
                for component in g.geoms
                if isinstance(component, Polygon)
            ]
        )

    # Recursively call this function for each Polygon or MultiPolygon contained
    # in the collection; other member types are ignored.
    if isinstance(g, GeometryCollection):
        return unary_union(
            [
                strip_line_interiors(member)
                for member in g.geoms
                if isinstance(member, (Polygon, MultiPolygon))
            ]
        )

    raise ValueError(
        "Geometry must be of type `Polygon`, `MultiPolygon`, or `GeometryCollection`."
    )
"geometry"].union( + intersects_missing.loc[use_new_buffer_mask, "new_buffer"] + ) + + assert current_coverage.area == 0 + + out = gser[~intersects_missing_mask].copy() + + out = pd.concat( + [out, intersects_missing.geometry], + ).rename(out.name) + + assert intersects_missing.is_valid.all() + + return out + + +def get_polys_in_box(all_polys, lx, ly, ux, uy): + """Get the subset of shapes in `all_polys` that overlap with the box defined + by `lx`, `ly`, `ux`, `uy`. + + Parameters + ---------- + all_polys : pygeos.Geometry + Array of pygeos Polygons + + lx : float + Left (western) bound of box + + ly : float + Lower (southern) bound of box + + ux : float + Right (eastern) bound of box + + uy : float + Upper (northern) bound of box + + Returns + ------- + vertical_slab : pygeos.Geometry + List of the pygeos polygons from `all_polys` overlapped with (cut by) + the box. + + slab_polys : np.array + List of indices from `all_polys` corresponding to the polygons in + `vertical_slab`. + """ + + vertical_slab = pygeos.clip_by_rect(all_polys, lx, ly, ux, uy) + + poly_found_mask = ~pygeos.is_empty(vertical_slab) + slab_polys = np.where(poly_found_mask) + + vertical_slab = vertical_slab[poly_found_mask] + + # invalid shapes may occur from Polygons being cut into what should be MultiPolygons + not_valid = ~pygeos.is_valid(vertical_slab) + vertical_slab[not_valid] = pygeos.make_valid(vertical_slab[not_valid]) + + vertical_slab_shapely = pygeos.to_shapely(vertical_slab) + vertical_slab_shapely = [strip_line_interiors(p) for p in vertical_slab_shapely] + vertical_slab = pygeos.from_shapely(vertical_slab_shapely) + + return vertical_slab, slab_polys + + +def grid_gdf( + orig_gdf, + box_size=sset.DEFAULT_BOX_SIZE, + show_bar=True, +): + """Divide a GeoDataFrame into a grid, returning the gridded shape-parts and + the "empty" areas, each nested within a `box_size`-degree-width square. 
+ This reduces the sizes and rectangular boundaries of geometries, easing + many computational processes, especially those that depend on a spatial + index. + + Note: This may be deprecated in a future version if something like this + becomes available: https://github.com/pygeos/pygeos/pull/256 + + Parameters + ---------- + orig_gdf : :py:class:`geopandas.GeoSeries` or :py:class:`geopandas.GeoSeries` + GeoDataFrame/GeoSeries to be divided into a grid + + box_size : float + Width and height of boxes to divide geometries into + + show_bar : bool + Show progress bar + + Returns + ------- + gridded_gdf : geopandas.GeoDataFrame + GeoDataFrame containing `orig_gdf` geometries divided into grid cells. + + all_oc : pygeos.Geometry + List of pygeos Polygons corresponding to the "ocean" shapes in each + grid cell. Ocean shapes are defined as areas not covered by any + geometry in `orig_gdf`. + """ + + if isinstance(orig_gdf, gpd.GeoSeries): + orig_gdf = orig_gdf.to_frame(name="geometry") + orig_geos = pygeos.from_shapely(orig_gdf.geometry) + + llon, llat, ulon, ulat = orig_gdf.total_bounds + + boxes = [] + ixs = [] + all_oc = [] + iterator = np.arange(llon - 1, ulon + 1, box_size) + if show_bar: + iterator = tqdm(iterator) + for lx in iterator: + ux = lx + box_size + vertical_slab, slab_polys = get_polys_in_box(orig_geos, lx, llat, ux, ulat) + for ly in np.arange(llat - 1, ulat + 1, box_size): + uy = ly + box_size + res = pygeos.clip_by_rect(vertical_slab, lx, ly, ux, uy) + polygon_found_mask = ~pygeos.is_empty(res) + res = res[polygon_found_mask] + # invalid shapes may occur from Polygons being cut into what should be + # MultiPolygons + not_valid = ~pygeos.is_valid(res) + res[not_valid] = pygeos.make_valid(res[not_valid]) + ix = np.take(slab_polys, np.where(polygon_found_mask)) + if res.shape[0] > 0: + boxes.append(res) + ixs.append(ix) + + if res.shape[0] > 0: + + this_uu = pygeos.union_all(res) + + this_oc = pygeos.difference( + pygeos.from_shapely(box(lx, ly, ux, uy)), 
def _has_neighbor_within(query_pts, target_tree, radius, batch_size=int(1e6)):
    """Flag each row of `query_pts` that has a `target_tree` point within `radius`.

    Parameters
    ----------
    query_pts : np.ndarray
        2D array with one point per row.

    target_tree : scipy.spatial.cKDTree
        Tree of the points to search for neighbors in.

    radius : float
        Search radius passed to ``cKDTree.query_ball_tree``.

    batch_size : int
        Number of rows of `query_pts` processed per KD-tree query.

    Returns
    -------
    np.ndarray
        1D boolean array with one entry per row of `query_pts`. Empty when
        `query_pts` has no rows.
    """
    n_pts = query_pts.shape[0]
    has_neighbor = np.zeros(n_pts, dtype=bool)
    for start in range(0, n_pts, batch_size):
        batch = query_pts[start : start + batch_size]
        neighbors = cKDTree(batch).query_ball_tree(target_tree, r=radius)
        # A non-empty neighbor list means at least one target point is in range
        has_neighbor[start : start + batch.shape[0]] = [bool(n) for n in neighbors]
    return has_neighbor


def divide_pts_into_categories(
    pts,
    pt_gadm_ids,
    all_oc,
    tolerance=sset.DENSIFY_TOLERANCE,
    at_blank_tolerance=sset.MARGIN_DIST,
):
    """From a set of points and IDs, divide points into "coastal-coastal" and
    "coastal-border" categories.

    "Coastal" indicates proximity to the "coast", i.e. the edges of the union
    of all original polygons, defined by `all_oc`. "Border" indicates
    non-proximity to the coast. Proximity to the coast is calculated as being
    within `at_blank_tolerance` of a coordinate of `all_oc`.

    "Coastal-border" points are defined as all coastal points that are within
    `tolerance` of "border" points (points that are not near the coast).

    "Coastal-coastal" points are defined as the remaining "coastal" points
    (not near a border).

    The motivation for this function is to simplify the point set used to
    generate Voronoi regions from a set of polygons. Precision matters a lot
    in parts of shapes that are near borders with other regions, and less so
    in coastal areas that are distant from the nearest non-same region. Points
    that are neither coastal, nor near a border, can be ignored, as they do not
    define the edges of a region.

    Parameters
    ----------
    pts : np.ndarray
        2D array with dimensions Nx2 (one longitude-latitude pair per row).

    pt_gadm_ids : np.ndarray
        1D array representing N IDs corresponding to the rows of `pts`.

    all_oc : pygeos.Geometry
        Array of pygeos Polygons corresponding to the "ocean" shapes in each
        grid cell. Only their coordinates are used here.

    tolerance : float
        Maximum distance from a coastal point to a non-coastal point for the
        coastal point to be classified as "coastal-border". Padded by 10%
        before use.

    at_blank_tolerance : float
        Maximum distance from a coordinate of `all_oc` for a point to be
        considered "coastal". Padded by 10% before use.

    Returns
    -------
    coastal_coastal_pts : np.ndarray
        2D array representing coastal points that are not near borders.
    coastal_border_pts : np.ndarray
        2D array representing coastal points that are near borders.
    coastal_coastal_gadm : np.ndarray
        1D array representing IDs corresponding to `coastal_coastal_pts`.
    coastal_border_gadm : np.ndarray
        1D array representing IDs corresponding to `coastal_border_pts`.
    """
    # Pad both search radii by 10%
    at_blank_tolerance = at_blank_tolerance + (at_blank_tolerance / 10)
    tolerance = tolerance + (tolerance / 10)

    # Stage 1: which points lie near an ocean coordinate?
    ocean_tree = cKDTree(pygeos.get_coordinates(all_oc))
    pts_at_blank = _has_neighbor_within(pts, ocean_tree, at_blank_tolerance)

    coastal_pts = pts[pts_at_blank]
    coastal_pt_gadm_ids = pt_gadm_ids[pts_at_blank]

    # Stage 2: which coastal points lie near a non-coastal ("border") point?
    border_tree = cKDTree(pts[~pts_at_blank])
    pts_at_border = _has_neighbor_within(coastal_pts, border_tree, tolerance)

    # Boolean-mask indexing already yields fresh arrays, so no explicit copy
    return (
        coastal_pts[~pts_at_border],
        coastal_pts[pts_at_border],
        coastal_pt_gadm_ids[~pts_at_border],
        coastal_pt_gadm_ids[pts_at_border],
    )
def explode_gdf_to_pts(geo_array, id_array, rounding_decimals=sset.ROUND_INPUT_POINTS):
    """Transform an array of shapes into an array of coordinate pairs, keeping
    the IDs of shapes aligned with the coordinates.

    Parameters
    ----------
    geo_array : :py:class:`numpy.ndarray`
        Array of ``pygeos`` geometries

    id_array : :py:class:`numpy.ndarray`
        IDs corresponding to shapes in `geo_array`

    rounding_decimals : int
        Number of decimal places coordinates are rounded to before duplicate
        coordinate pairs are dropped.

    Returns
    -------
    pts : np.ndarray
        2D array of the unique longitude-latitude pairs found in the
        geometries of ``geo_array``, after rounding to ``rounding_decimals``.

    pt_ids : np.ndarray
        1D array of IDs corresponding to ``pts``. When several geometries
        share a rounded coordinate, the ID of its first occurrence is kept.
    """
    # `return_index=True` gives, for every coordinate, the position of the
    # geometry it came from, so no per-geometry counting loop is needed
    pts, source_ix = pygeos.get_coordinates(geo_array, return_index=True)
    pt_ids = np.asarray(id_array)[source_ix]

    pts, first_ix = np.unique(
        np.round(pts, rounding_decimals), axis=0, return_index=True
    )

    return pts, pt_ids[first_ix]
def get_hemisphere_shape(hemisphere):
    """Define Shapely boxes for each hemisphere and the globe.

    Parameters
    ----------
    hemisphere : str
        Options are "west", "east", and "both".

    Returns
    -------
    shapely.Polygon
        A single box corresponding to the requested hemisphere(s).

    Raises
    ------
    ValueError
        If `hemisphere` is not one of the supported options.
    """
    # (min lon, min lat, max lon, max lat) for each supported option
    bounds_by_hemisphere = {
        "west": (-180, -90, 0, 90),
        "east": (0, -90, 180, 90),
        "both": (-180, -90, 180, 90),
    }
    try:
        bounds = bounds_by_hemisphere[hemisphere]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which are equally invalid
        raise ValueError(
            f"hemisphere must be 'west', 'east', or 'both'; got {hemisphere!r}"
        ) from None
    return box(*bounds)
def clip_geoseries_by_rect(gs, rect):
    """Clip every geometry of a GeoSeries to a Shapely rectangle.

    The fast path converts `gs` to pygeos geometries and clips them with
    ``pygeos.clip_by_rect``. If that raises, each geometry is intersected
    with `rect` individually instead.
    Should be deprecated upon release of Shapely 2.0.

    NOTE(review): the fast path builds a new GeoSeries without passing an
    index or CRS, while the fallback goes through ``gs.apply`` -- confirm
    whether callers depend on either behavior before unifying them.

    Parameters
    ----------
    gs : geopandas.GeoSeries
        Any Geopandas GeoSeries

    rect : shapely.Polygon
        A Shapely rectangle; only its ``bounds`` are used on the fast path.

    Returns
    -------
    geopandas.GeoSeries
        The geometries of `gs`, bound by `rect`.
    """
    try:
        as_pygeos = pygeos.from_shapely(gs)
        clipped = pygeos.clip_by_rect(as_pygeos, *rect.bounds)
        return gpd.GeoSeries(clipped)
    except Exception:  # weird issue with CYM, clip_by_rect doesn't work

        def _intersect_with_rect(geom):
            return geom.intersection(rect)

        return gs.apply(_intersect_with_rect)
@jit(nopython=True, parallel=False)
def xyz_to_lon_lat(xyz):
    """Convert x-y-z coordinates (cube centered on [0, 0, 0]) to
    longitude-latitude in degrees.

    Parameters
    ----------
    xyz : np.ndarray
        2D array with one x-y-z coordinate on the unit sphere per row

    Returns
    -------
    np.ndarray
        2D array with one (longitude, latitude) pair per row of `xyz`.
        A longitude that evaluates to exactly -180 is reported as 180.
    """
    lon_deg = np.degrees(np.arctan2(xyz[:, 1], xyz[:, 0]).flatten())
    lat_deg = np.degrees(np.arcsin(xyz[:, 2]).flatten())

    # Use a single representation (+180) for points exactly on the antimeridian
    lon_deg = np.where(lon_deg == -180, 180, lon_deg)

    return np.stack((lon_deg, lat_deg), axis=1)
def fix_ring_topology(reg_group_polys, reg_group_loc_ids):
    """Insert holes in polygons that completely surround another polygon so
    that they are distinct. This resolves an issue in Voronoi construction
    where some regions cover others that they surround, rather than including a
    hole where the surrounded polygon should be.

    Each surrounding polygon is rebuilt from its exterior ring with the
    exterior rings of the polygons it properly contains as holes; the
    contained polygons are then dropped from the output. The input lists are
    not modified.

    Parameters
    ----------
    reg_group_polys : list of shapely.geometry.Polygon
        List of all Voronoi polygons in longitude-latitude space.

    reg_group_loc_ids : list of int
        List of IDs corresponding to the generating regions of
        `reg_group_polys`.

    Returns
    -------
    reg_group_polys, reg_group_loc_ids : tuple of list
        New lists in which surrounding polygons have holes where they
        surround other polygons, and the surrounded polygons (and their IDs)
        are removed.

    Raises
    ------
    AssertionError
        If some polygon both contains and is contained by other polygons
        (rings inside rings), which this function does not handle.
    """
    group_polys = pygeos.from_shapely(reg_group_polys)

    tree = pygeos.STRtree(group_polys)

    # Paired index arrays: group_polys[contains[k]] properly contains
    # group_polys[contained[k]]
    contains, contained = tree.query_bulk(group_polys, "contains_properly")

    container_ixs = set(contains.tolist())
    contained_ixs = set(contained.tolist())

    # Check that there are no rings inside rings. If there are, this function
    # and `get_groups_of_regions()` may need to be re-worked. Raised explicitly
    # so the check is not stripped when Python runs with -O.
    if container_ixs & contained_ixs:
        raise AssertionError(
            "Nested rings found: a polygon both contains and is contained by "
            "other polygons"
        )

    # Work on a copy so the caller's list is left untouched
    fixed_polys = list(reg_group_polys)
    for container_ix in sorted(container_ixs):
        fixed_polys[container_ix] = pygeos.to_shapely(
            pygeos.make_valid(
                pygeos.polygons(
                    pygeos.get_exterior_ring(group_polys[container_ix]),
                    holes=pygeos.get_exterior_ring(
                        group_polys[contained[contains == container_ix]]
                    ),
                )
            )
        )

    # Set membership keeps this filter linear in the number of polygons
    kept_polys = [p for i, p in enumerate(fixed_polys) if i not in contained_ixs]
    kept_loc_ids = [
        loc_id
        for i, loc_id in enumerate(reg_group_loc_ids)
        if i not in contained_ixs
    ]

    return kept_polys, kept_loc_ids
def get_polygon_covering_pole(poly_points_lon_lat, nsign):
    """Convert a polygon defined by its edges into a polygon representing
    its latitude-longitude space comprehensively.

    Coordinates that cover poles may define polygon boundaries but not
    their relationship to a pole explicitly. For example, consider a polygon
    represented by these coordinates:

    [[0, 60], [120, 60], [240, 60], [0, 60]]

    This may represent the region of the earth above the 60-degree latitude
    line, or it may represent the region of the earth below that line. This
    function ensures an explicit definition on a projected coordinate system.

    Wherever consecutive vertices jump by more than 180 degrees of longitude,
    four vertices are inserted: the crossing point on the +/-180 meridian,
    the pole at that meridian, the pole at the opposite meridian, and the
    crossing point at the opposite meridian.

    Parameters
    ----------
    poly_points_lon_lat : np.ndarray
        2D array of coordinates (longitude, latitude) representing a polygon
        that covers a pole.

    nsign : int
        Integer representing positive (nsign == 1: north pole) or negative
        (nsign == -1: south pole) sign of latitude of the pole to be covered.

    Returns
    -------
    p : shapely.Polygon
        Polygon defined by `poly_points_lon_lat`, transformed to cover the pole
        indicated by `nsign` in latitude-longitude space.
    """
    diff = poly_points_lon_lat[1:] - poly_points_lon_lat[:-1]
    # Visit crossings from last to first so insertions do not shift the
    # indices of crossings that are still to be processed
    turnpoints = np.flip(np.where(np.abs(diff[:, 0]) > 180)[0])

    for turnpoint in turnpoints:
        esign = 1 if poly_points_lon_lat[turnpoint][0] > 0 else -1

        start = poly_points_lon_lat[turnpoint]
        # Unwrap the next vertex so the segment is continuous across +/-180
        end = poly_points_lon_lat[turnpoint + 1] + np.array([360 * esign, 0])

        refpoint = 180 * esign
        opppoint = 180 * -esign

        xdiff = end[0] - start[0]
        ydiff = end[1] - start[1]

        # Fraction of the segment at which it meets the meridian. `xdiff` is
        # negative for westward crossings, so only an exact zero falls back
        # to the midpoint.
        xpart = (refpoint - start[0]) / xdiff if xdiff != 0 else 0.5
        crossing_lat = start[1] + ydiff * xpart

        insert_pts = np.array(
            [
                [refpoint, crossing_lat],
                [refpoint, 90 * nsign],
                [opppoint, 90 * nsign],
                [opppoint, crossing_lat],
            ]
        )

        poly_points_lon_lat = np.insert(
            poly_points_lon_lat, turnpoint + 1, insert_pts, axis=0
        )

    return Polygon(poly_points_lon_lat)
@jit(nopython=True)
def numba_divide_polys_by_meridians(poly_points_lon_lat):
    """Transform polygons defined by vertices that wrap around the globe,
    into those same polygons represented as 2D shapes.

    Wherever consecutive vertices jump by more than 180 degrees of longitude,
    the crossing point is inserted on both the +180 and -180 meridians. The
    vertex sequence is then cut at every remaining jump of more than 240
    degrees, and the vertices are grouped into one array per side.

    Parameters
    ----------
    poly_points_lon_lat : np.ndarray
        2D array of longitude-latitude coordinates

    Returns
    -------
    list of np.ndarray
        List of 2D array of longitude-latitude coordinates, representing all
        2D polygons formed by `poly_points_lon_lat` when represented in
        projected space that does not wrap around the globe. A single-element
        list holding the input when no crossing is found.
    """

    diff = poly_points_lon_lat[1:] - poly_points_lon_lat[:-1]
    # Visit crossings from last to first so insertions do not shift the
    # indices of crossings that are still to be processed
    turnpoints = np.flip(np.where(np.abs(diff[:, 0]) > 180)[0])
    if turnpoints.shape[0] == 0:
        return [poly_points_lon_lat]
    else:
        for turnpoint in turnpoints:
            esign = 1 if poly_points_lon_lat[turnpoint][0] > 0 else -1

            # Unwrap the next vertex so the segment is continuous across +/-180
            start, end = poly_points_lon_lat[turnpoint], poly_points_lon_lat[
                turnpoint + 1
            ] + np.array([360 * esign, 0])

            refpoint = 180 * esign
            opppoint = 180 * -esign

            xdiff = end[0] - start[0]
            ydiff = end[1] - start[1]

            # Fraction of the segment at which it meets the meridian. `xdiff`
            # is negative for westward crossings, so only an exact zero falls
            # back to the midpoint.
            xpart = (refpoint - start[0]) / xdiff if xdiff != 0 else 0.5

            newpt1 = [refpoint, start[1] + ydiff * xpart]
            newpt4 = [opppoint, start[1] + ydiff * xpart]

            insert_pts = np.array([newpt1, newpt4])

            poly_points_lon_lat = np.concatenate(
                (
                    poly_points_lon_lat[: turnpoint + 1],
                    insert_pts,
                    poly_points_lon_lat[turnpoint + 1 :],
                ),
                axis=0,
            )

        diff = poly_points_lon_lat[1:] - poly_points_lon_lat[:-1]

        # +1 where the path jumps from +180 to -180, -1 for the reverse jump
        turnpoint_switches_off1 = np.zeros((diff[:, 0].shape[0]), dtype=np.int8)

        turnpoint_switches_off1[np.where(diff[:, 0] < -240)[0]] = 1
        turnpoint_switches_off1[np.where(diff[:, 0] > 240)[0]] = -1

        # Shift by one so each switch is attached to the vertex after the jump
        turnpoint_switches = np.zeros(
            (poly_points_lon_lat[:, 0].shape[0]), dtype=np.int8
        )

        turnpoint_switches[1:] = turnpoint_switches_off1

        # Running sum labels every vertex with the piece it belongs to
        shapeset = np.cumsum(turnpoint_switches)

        return [poly_points_lon_lat[shapeset == sh] for sh in np.unique(shapeset)]
def get_groups_of_regions(
    loc_reg_lists, loc, sv, includes_southpole, includes_northpole, combine_by_id=True
):
    """Collect the Voronoi regions of all generator points that share the ID
    `loc`, merging them into one outline when that is possible.

    Parameters
    ----------
    loc_reg_lists : dict from object to list of int
        Mapping from each ID of the Voronoi-generating Polygon, to the list
        of all indices of generator points with that ID.

    loc : object
        Some key in `loc_reg_lists`

    sv : scipy.spatial.SphericalVoronoi
        SphericalVoronoi object based on input points; its ``regions``
        attribute is read.

    includes_southpole : bool
        Whether any of the Voronoi regions for `loc` covers the south pole.

    includes_northpole : bool
        Whether any of the Voronoi regions for `loc` covers the north pole.

    combine_by_id : bool
        Whether to try combining all Voronoi regions with the same ID, or to
        keep them separate.

    Returns
    -------
    list of list of int
        Lists of indices into `sv.vertices`. This is the single merged cycle
        when merging was attempted and produced exactly one cycle; otherwise
        it is the unmerged per-generator regions.
    """
    regions_for_loc = get_reg_group(loc_reg_lists, loc, sv.regions)

    touches_pole = includes_southpole or includes_northpole
    if combine_by_id and not touches_pole:
        # Optimization to combine points from the same region into one large
        # shape.
        # WARNING: Robust to interior rings, with `fix_ring_topology`, but not
        # to rings within those rings. This is ok as long as the related
        # assertion in `fix_ring_topology()` passes.
        merged = combine_reg_group(regions_for_loc)
        if len(merged) == 1:
            return merged

    return regions_for_loc
+ + ix_min : list of int + Indices of most southerly origin points + + ix_max : list of int + Indices of most northerly origin points + + Returns + ------- + reg_group_polys : list of shapely.Polygon + Polygons representing Voronoi outputs in longitude-latitude space + + reg_group_loc_ids : list of int + `UID`s of `reg_group_polys` + + """ + reg_group_polys = [] + reg_group_loc_ids = [] + for i, reg in enumerate(reg_cycles): + + poly_points_lon_lat = numba_process_points(sv.vertices[reg]) + + if (includes_southpole or includes_northpole) and ( + loc_reg_lists[loc][i] in (set(ix_max) | set(ix_min)) + ): + nsign = 1 if loc_reg_lists[loc][i] in ix_max else -1 + poly_points_lon_lat = ensure_validity(poly_points_lon_lat) + p = get_polygon_covering_pole(poly_points_lon_lat, nsign) + reg_polys = [p] + else: + reg_polys = numba_divide_polys_by_meridians( + ensure_validity(poly_points_lon_lat) + ) + reg_polys = list(pygeos.to_shapely([pygeos.polygons(p) for p in reg_polys])) + + reg_group_polys += reg_polys + reg_group_loc_ids += [loc for i in range(len(reg_polys))] + + return reg_group_polys, reg_group_loc_ids + + +def get_spherical_voronoi_gser(pts, show_bar=True): + """From a list of points associated with IDs (which must be specified by + ``pts_df.index``), calculate the region of a globe closest to each ID-set, and + return a GeoSeries representing those "nearest" Polygons/MultiPolygons. + + Parameters + ---------- + pts_df : :py:class:`geopandas.GeoSeries` + GeoSeries of Points to be used as Voronoi generators. + + show_bar : bool + Show progress bar + + Returns + ------- + :py:class:`geopandas.GeoSeries` : GeoSeries representing Voronoi regions for each + input row. 
+ """ + # Get indices of polar Voronoi regions + lats = pts.y.values + ymax = lats.max() + ymin = lats.min() + + ix_max = np.where(lats == ymax)[0] + ix_min = np.where(lats == ymin)[0] + + xyz_candidates = lon_lat_to_xyz(pts.x.values, lats) + + sv = SphericalVoronoi( + xyz_candidates, radius=1, threshold=SPHERICAL_VORONOI_THRESHOLD + ) + sv.sort_vertices_of_regions() + + polys = [] + loc_ids = [] + + loc_reg_lists = ( + pts.rename_axis(index="UID") + .reset_index(drop=False) + .reset_index(drop=False) + .groupby("UID")["index"] + .unique() + .to_dict() + ) + + iterator = tqdm(loc_reg_lists) if show_bar else loc_reg_lists + for loc in iterator: + includes_southpole = bool(set(ix_min) & set(loc_reg_lists[loc])) + includes_northpole = bool(set(ix_max) & set(loc_reg_lists[loc])) + + reg_cycles = get_groups_of_regions( + loc_reg_lists, + loc, + sv, + includes_southpole, + includes_northpole, + combine_by_id=True, + ) + + reg_group_polys, reg_group_loc_ids = get_polys_from_cycles( + loc_reg_lists, + reg_cycles, + sv, + loc, + includes_southpole, + includes_northpole, + ix_min, + ix_max, + ) + + reg_group_polys, reg_group_loc_ids = fix_ring_topology( + reg_group_polys, reg_group_loc_ids + ) + polys += reg_group_polys + loc_ids += reg_group_loc_ids + + # This should resolve some areas where regions are basically slivers, and + # the geometric slerp is too long to capture the correct topology of the + # region so that two lines of the same region cross along their planar coordinates. + # Based on testing and our use case, these are rare and small enough to ignore, + # and correcting for this with smaller slerp sections too computationally + # intensive, but improvements on this would be welcome. 
+ polys = make_valid_shapely(polys) + + return ( + gpd.GeoDataFrame({pts.index.name: loc_ids}, geometry=polys, crs="EPSG:4326") + .dissolve(pts.index.name) + .geometry + ) + + +def append_extra_pts(sites): + """Define three extra points at the pole farthest from any point in `sites`. + These can be necessary for compatibility with `SphericalVoronoi` when the + number of original sites is less than four. + + Parameters + ---------- + sites : :py:class:`geopandas.GeoSeries` + GeoSeries of Points. + + Returns + ------- + :py:class:`geopandas.GeoSeries` + Same as input, but with extra points near one of the poles included. + """ + y = sites.geometry.y + nsign = -1 if np.abs(y.max()) > np.abs(y.min()) else 1 + + out = sites.iloc[[0, 0, 0]].copy() + out.index = pd.Index( + ["placeholder1", "placeholder2", "placeholder3"], name=sites.index.name + ) + + out["geometry"] = gpd.GeoSeries.from_xy( + x=[0, 0, 180], + y=[90 * nsign, 89 * nsign, 89 * nsign], + index=out.index, + crs=sites.crs, + ) + + return pd.concat([sites, out]) + + +def get_voronoi_from_sites(sites): + """Get the Voronoi diagram corresponding to the points defined by `sites`. + + Parameters + ---------- + sites : :py:class:`geopandas.GeoDataFrame` + GeoDataFrame of sites from which to generate a Voronoi diagram. Must include + index, Point `geometry` field. + + Returns + ------- + vor_gdf : :py:class:`geopandas.GeoDataFrame` + GeoDataFrame where the geometry represents Voronoi regions for each site in + ``sites``.
+ """ + sites = remove_duplicate_points(sites) + if sites.index.nunique() == 1: + out = sites.iloc[0:1].copy() + out["geometry"] = box(-180, -90, 180, 90) + else: + if sites.shape[0] <= 3: + sites = append_extra_pts(sites) + vor_gser = get_spherical_voronoi_gser(sites.geometry, show_bar=False) + site_isos = ( + sites.reset_index(drop=False) + .drop(columns="geometry", errors="ignore") + .drop_duplicates() + .set_index("station_id") + ) + + out = vor_gser.to_frame().join(site_isos) + + return out + + +def get_stations_by_iso_voronoi(stations): + """From the GeoDataFrame of GTSM stations with assigned ISO values, + calculate a globally comprehensive set of shapes for each ISO mapping to the + closest station that has that ISO. + + Parameters + ---------- + stations : pandas.DataFrame + A DataFrame with fields `ISO`, `lon`, and `lat` + + Returns + ------- + out : geopandas.GeoDataFrame + A GeoDataFrame with fields `station_id` and `geometry`, indexed by `ISO` + `geometry` represents the region of the globe corresponding to the area + closer to station `station_id` than any other station in that `ISO` + + """ + + # Make sure none of the stations with too few points to calculate SphericalVoronoi + # are anywhere near the poles, so we can introduce the poles as extra points + iso_count = ( + stations.groupby("ISO")[["ISO"]].count().rename(columns={"ISO": "count"}) + ) + stations = stations.join(iso_count, on="ISO") + + lats = stations.geometry.y[stations["count"] <= 3] + assert (lats.max() < 60) and (lats.min() > -60) + + # Iterate through each country, add each Voronoi gdf to `vors` + all_isos = stations["ISO"].unique() + all_isos.sort() + + vors = [] + for iso in all_isos: + print(iso, end=" ") + iso_stations = stations[stations["ISO"] == iso].copy() + vors.append(get_voronoi_from_sites(iso_stations)) + + # Combine all Voronoi diagrams into one GeoDataFrame (results overlap) + vor_gdf = pd.concat(vors).drop( + ["placeholder1", "placeholder2", "placeholder3"], 
errors="ignore" + ) + + # Check that ISOs match + assert set(vor_gdf.index[vor_gdf["ISO"].isnull()].unique()) - set( + ["placeholder1", "placeholder2", "placeholder3"] + ) == set([]) + + # Clean up + vor_gdf = vor_gdf[vor_gdf["ISO"].notnull()].copy() + vor_gdf["geometry"] = vor_gdf["geometry"].apply(grab_polygons) + + return vor_gdf[["ISO", "geometry"]] + + +def remove_duplicate_points(pts, threshold=SPHERICAL_VORONOI_THRESHOLD): + """Remove points in DataFrame that are too close to each other to be + recognized as different in the `SphericalVoronoi` algorithm. + + Parameters + ---------- + pts : :py:class:`geopandas.GeoSeries` + GeoSeries of Points + + Returns + ------- + geopandas.DataFrame or pandas.DataFrame + DataFrame of `pts_df` points, with duplicates removed (i.e. leave one + of each set of duplicates). + """ + + xyz_candidates = lon_lat_to_xyz(pts.geometry.x.values, pts.geometry.y.values) + + res = cKDTree(xyz_candidates).query_pairs(threshold) + + first_point = np.array([p[0] for p in res]) + mask = np.ones(xyz_candidates.shape[0], dtype="bool") + + if len(first_point) > 0: + mask[first_point] = False + + return pts[mask] + + +def remove_already_attributed_land_from_vor( + vor_shapes, + all_gridded, + vor_ix, + existing, + vor_uid, + gridded_uid, + show_bar=True, + crs=None, +): + """Mask Voronoi regions with the pre-existing regions, so that the result + includes only the parts of the Voronoi regions that are not already + assigned to the pre-existing regions. 
+ + Parameters + ---------- + vor_shapes : array of pygeos.Geometry + Shapes of globally comprehensive Voronoi regions + + all_gridded : array of pygeos.Geometry + Shapes of original regions + + vor_ix : np.ndarray + 1D array of indices of `vor_shapes` intersecting with `all_gridded` + + existing : np.ndarray + 1D array of indices of Polygons in `all_gridded` intersecting with + `vor_shapes` + + vor_uid : np.ndarray + 1D array of unique IDs corresponding with `vor_ix` + + gridded_uid : np.ndarray + 1D array of unique IDs corresponding with `existing` + + Returns + ------- + geopandas.GeoSeries + A GeoSeries based on `vor_shapes` that excludes the areas defined in + `all_gridded`. + """ + + calculated = [] + + iterator = range(len(vor_shapes)) + if show_bar: + iterator = tqdm(iterator) + for ix in iterator: + overlapping_ix = list(existing[(vor_ix == ix) & (gridded_uid != vor_uid)]) + if len(overlapping_ix) > 0: + overlapping_land = itemgetter(*overlapping_ix)(all_gridded) + uu = pygeos.union_all(overlapping_land) + remaining = pygeos.difference(vor_shapes[ix], uu) + else: + remaining = vor_shapes[ix] + calculated.append(remaining) + + return gpd.GeoSeries(calculated, crs=crs) + + +def get_voronoi_regions(full_regions): + """Computes a globally comprehensive set of shapes corresponding to the + nearest regions in each place from the set of `full_regions`. + + Parameters + ---------- + full_regions : :py:class:`geopandas.GeoDataFrame` + Contains regions for which you want to create Voronoi shapes + + Returns + ------- + out : :py:class:`geopandas.GeoDataFrame` + Same as input but with the geometry defined as the Voronoi shapes. 
+ """ + + out_cols = [c for c in full_regions.columns if c != "geometry"] + + # avoiding GeoDataFrame.explode until geopandas v0.10.3 b/c of + # https://github.com/geopandas/geopandas/issues/2271 + # region_polys = full_regions.explode(index_parts=False) + region_polys = full_regions.drop(columns="geometry").join( + full_regions.geometry.explode(index_parts=False) + ) + + # This has been tested with up to 40 million coordinates, so cannot + # guarantee performance or memory usage on more complex shapefiles + assert ( + pygeos.count_coordinates(pygeos.from_shapely(region_polys.geometry.values)) + < sset.MAX_VORONOI_COMPLEXITY + ) + + gridded_gdf, all_oc = grid_gdf(region_polys) + + pts = polys_to_vor_pts(region_polys, all_oc) + + vor_gdf = get_spherical_voronoi_gser(pts).to_frame() + + vor_shapes = pygeos.from_shapely(vor_gdf["geometry"]) + all_gridded = pygeos.from_shapely(gridded_gdf["geometry"]) + + tree = pygeos.STRtree(all_gridded) + + vor_ix, existing = tree.query_bulk(vor_shapes, "intersects") + + gridded_uid = np.take(gridded_gdf.index.values, existing) + vor_uid = np.take(vor_gdf.index.values, vor_ix) + + vor_gdf["calculated"] = remove_already_attributed_land_from_vor( + vor_shapes, + all_gridded, + vor_ix, + existing, + vor_uid, + gridded_uid, + crs=full_regions.crs, + ).values + + full_regions = full_regions.join(vor_gdf.drop(columns="geometry"), how="left") + + full_regions["calculated"] = full_regions["calculated"].fillna(Polygon()) + + full_regions["combined"] = full_regions["geometry"].union( + full_regions["calculated"] + ) + + out = full_regions[full_regions.index.notnull()].combined.rename("geometry") + + out = out.apply(grab_polygons) + out = out.apply(strip_line_interiors) + + out = fill_in_gaps(out) + return gpd.GeoDataFrame(full_regions[out_cols].join(out, how="right")) + + +def get_points_along_segments(segments, tolerance=sset.DENSIFY_TOLERANCE): + """Get a set of points along line segments. 
Calls `pygeos.segmentize()` + to interpolate between endpoints of each line segment. + + Parameters + ---------- + segments : :py:class:`geopandas.GeoDataFrame` + Geometry column represents segments (as (Multi)LineStrings or + (Multi)Polygons). + + Returns + ------- + :py:class:`geopandas.GeoDataFrame` + GeoDataFrame of resulting endpoints and interpolated points, with same + non-geometry columns as ``segments``. + """ + + segments = segments[~segments.geometry.type.isnull()].copy() + + # avoiding GeoDataFrame.explode until geopandas v0.10.3 b/c of + # https://github.com/geopandas/geopandas/issues/2271 + # segments = segments.explode(index_parts=False) + segments = segments.drop(columns="geometry").join( + segments.geometry.explode(index_parts=False) + ) + + segments = segments[~segments["geometry"].is_empty].copy() + + segments["geometry"] = pygeos.segmentize( + pygeos.from_shapely(segments["geometry"]), tolerance + ) + + pts, pts_ix = pygeos.get_coordinates( + pygeos.from_shapely(segments["geometry"]), return_index=True + ) + + return gpd.GeoDataFrame( + segments.drop(columns="geometry").iloc[pts_ix], + geometry=gpd.points_from_xy(pts[:, 0], pts[:, 1]), + crs=segments.crs, + ) + + +def constrain_lons(arr, lon_mask): + if lon_mask is False: + return arr + out = arr.copy() + out = np.where((out > 180) & lon_mask, -360 + out, out) + out = np.where((out <= -180) & lon_mask, 360 + out, out) + return out + + +def grid_val_to_ix( + vals: Any, + cell_size: Union[int, Sequence], + map_nans: Union[int, Sequence] = None, + lon_mask: Union[bool, Sequence] = False, +) -> Any: + """Converts grid cell lon/lat/elevation values to i/j/k values. The function is + indifferent to the order of dimensions, but the order returned matches the order of the + inputs, which in turn must match the order of ``cell_size``. The origin of the grid + is the grid cell that has West, South, and bottom edges at (0,0,0) in + (lon, lat, elev) space, and we map everything to (-180,180] longitude.
+ + Parameters + ---------- + vals : array-like + The values in lon, lat, or elevation-space. The dimensions of this array should + be n_vals X n_dims (where dims is either 1, 2, or 3 depending on which of + lat/lon/elev are in the array). + cell_size : int or Sequence + The size of a cell along the dimensions included in ``vals``. If int, applies + to all columns of ``vals``. If Sequence, must be same length as the number of + columns of ``vals``. + map_nans : int or Sequence, optional + If not None, map ``np.nan`` values in the input array to this value in the output + array. If int, applies to all columns of ``vals``. If Sequence, must be the same + length as ``vals``, with each element applied to the corresponding column of + ``vals``. + lon_mask : bool or array-like, optional + Specify a mask for values to constrain to (-180, 180] space. If value is a + bool, apply mask to all (True) or none of (False) the input ``vals``. If value + is array-like, must be broadcastable to the shape of ``vals`` and castable to + bool. + + Returns + ------- + out : array-like + An integer dtype object of the same type as vals defining the index of each grid + cell in ``vals``. + + Raises + ------ + ValueError + If `vals` contains null values but `map_nans` is None. + + Example + ------- + >>> import numpy as np + >>> lons = [-180.5, np.nan] + >>> lats = [-45, 0] + >>> elevs = [-5, 3.2] + >>> inputs = np.stack((lons, lats, elevs)).T + >>> grid_val_to_ix( + ... inputs, + ... cell_size=(.25, .25, .1), + ... map_nans=-9999, + ... lon_mask=np.array([1, 0, 0]) + ... ) # doctest: +NORMALIZE_WHITESPACE + array([[ 718, -180, -50], + [-9999, 0, 32]], dtype=int32) + """ + + # handle nans + nan_mask = np.isnan(vals) + is_nans = nan_mask.sum() + + out = vals.copy() + + if is_nans != 0: + if map_nans is None: + raise ValueError( + "NaNs not allowed in `vals`, unless `map_nans` is specified."
+ ) + else: + # convert to 0s to avoid warning in later type conversion + out = np.where(nan_mask, 0, out) + + out = constrain_lons(out, lon_mask) + + # convert to index + out = np.floor(out / cell_size).astype(np.int32) + + # fix nans to our chosen no data int value + if is_nans: + out = np.where(nan_mask, map_nans, out) + + return out + + +def grid_ix_to_val( + vals: Any, + cell_size: Union[int, Sequence], + map_nans: Union[int, Sequence] = None, + lon_mask: Union[bool, Sequence] = False, +) -> Any: + """Converts grid cell i/j/k values to lon/lat/elevation values. The function is + indifferent to the order of dimensions, but the order returned matches the order of the + inputs, which in turn must match the order of ``cell_size``. The origin of the grid + is the grid cell that has West, South, and bottom edges at (0,0,0) in + (lon, lat, elev) space, and we map everything to (-180,180] longitude. + + Parameters + ---------- + vals : array-like + The values in i, j, or k-space. The dimensions of this array should be + n_vals X n_dims (where dims is either 1, 2, or 3 depending on which of i/j/k are + in the array). + cell_size : int or Sequence + The size of a cell along the dimensions included in ``vals``. If int, applies + to all columns of ``vals``. If Sequence, must be same length as the number of + columns of ``vals``. + map_nans : int or Sequence, optional + If not None, map this value in the input array to ``np.nan`` in the output + array. If int, applies to all columns of ``vals``. If Sequence, must be the same + length as ``vals``, with each element applied to the corresponding column of + ``vals``. + lon_mask : bool or array-like, optional + Specify a mask for values to constrain to (-180, 180] space. If value is a + bool, apply mask to all (True) or none of (False) the input ``vals``. If value + is array-like, must be broadcastable to the shape of ``vals`` and castable to + bool.
+ + Returns + ------- + out : array-like + A float dtype object of the same type as vals defining the lat/lon/elev of each + grid cell in ``vals``. + + Raises + ------ + AssertionError + If `vals` is not an integer object + + Example + ------- + >>> i = [750, 100] + >>> j = [-3, 2] + >>> k = [-14, 34] + >>> inputs = np.stack((i, j, k)).T + >>> grid_ix_to_val( + ... inputs, + ... cell_size=(.25, .25, .1), + ... map_nans=-14, + ... lon_mask=np.array([1, 0, 0]) + ... ) # doctest: +NORMALIZE_WHITESPACE + array([[-172.375, -0.625, nan], + [ 25.125, 0.625, 3.45 ]]) + """ + + assert np.issubdtype(vals.dtype, np.integer) + + out = cell_size * (vals + 0.5) + out = constrain_lons(out, lon_mask) + + # apply nans + if map_nans is not None: + valid = vals != map_nans + out = np.where(valid, out, np.nan) + + return out + + +def great_circle_dist( + ax, + ay, + bx, + by, +): + """Calculate pair-wise Great Circle Distance (in km) between two sets of points. + + Note: ``ax``, ``ay``, ``bx``, ``by`` must be either: + a. 1-D, with the same length, in which case the distances are element-wise and + a 1-D array is returned, or + b. Broadcastable to a common shape, in which case a distance matrix is returned. + + Parameters + ---------- + ax, bx : array-like + Longitudes of the two point sets + ay, by : array-like + Latitudes of the two point sets + + Returns + ------- + array-like + The distance vector (if inputs are 1-D) or distance matrix (if inputs are + multidimensional and broadcastable to the same shape). 
+ + Examples + -------- + We can calculate element-wise distances + + >>> lon1 = [0, 90] + >>> lat1 = [-45, 0] + >>> lon2 = [10, 100] + >>> lat2 = [-45, 10] + + >>> great_circle_dist(lon1, lat1, lon2, lat2) + array([ 785.76833086, 1568.52277257]) + + We can also create a distance matrix w/ 2-D inputs + + >>> lon1 = np.array(lon1)[:,np.newaxis] + >>> lat1 = np.array(lat1)[:,np.newaxis] + >>> lon2 = np.array(lon2)[np.newaxis,:] + >>> lat2 = np.array(lat2)[np.newaxis,:] + + >>> great_circle_dist(lon1, lat1, lon2, lat2) + array([[ 785.76833086, 11576.03341028], + [ 9223.29614889, 1568.52277257]]) + """ + radius = 6371.009 # earth radius + lat1, lon1, lat2, lon2 = map(np.radians, (ay, ax, by, bx)) + + # broadcast so it's easier to work with einstein summation below + if all(map(lambda x: isinstance(x, xr.DataArray), (lat1, lon1, lat2, lon2))): + broadcaster = xr.broadcast + else: + broadcaster = np.broadcast_arrays + lat1, lon1, lat2, lon2 = broadcaster(lat1, lon1, lat2, lon2) + + dlat = 0.5 * (lat2 - lat1) + dlon = 0.5 * (lon2 - lon1) + + # haversine formula: + hav = np.sin(dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon) ** 2 + return 2 * np.arcsin(np.sqrt(hav)) * radius + + +def get_great_circle_nearest_index(df1, df2, x1="lon", y1="lat", x2="lon", y2="lat"): + """ + Finds the index in df2 of the nearest point to each element in df1 + + Parameters + ---------- + df1 : pandas.DataFrame + Points that will be assigned great circle nearest neighbors from df2 + df2 : pandas.DataFrame + Locations of the candidate points from which nearest neighbors are selected + x1 : str + Name of x column in df1 + y1 : str + Name of y column in df1 + x2 : str + Name of x column in df2 + y2 : str + Name of y column in df2 + + Returns + ------- + nearest_indices : pandas.Series + :py:class:`pandas.Series` of indices in df2 for the nearest entries to + each row in df1, indexed by df1's index.
+ """ + + dists = great_circle_dist( + df1[[x1]].values, df1[[y1]].values, df2[x2].values, df2[y2].values + ) + + nearest_indices = pd.Series(df2.index.values[dists.argmin(axis=1)], index=df1.index) + + return nearest_indices + + +def coastlen_poly( + i, + coastlines_shp_path, + seg_adm_voronoi_parquet_path, + seg_var="seg_adm", + filesystem=None, +): + lensum = 0 + + # Import coastlines, CIAM seg and ADM1 voronoi polygons + clines = gpd.read_file(coastlines_shp_path) + poly = gpd.read_parquet( + seg_adm_voronoi_parquet_path, + filesystem=filesystem, + columns=["geometry"], + filters=[(seg_var, "=", i)], + ) + + assert len(poly) == 1 + + # Intersect polygon with coastlines + if not clines.intersects(poly.iloc[0].loc["geometry"]).any(): + return 0 + lines_int = gpd.overlay(clines, poly, how="intersection", keep_geom_type=True) + if len(lines_int) > 0: + for idx0 in range(len(lines_int)): + + def line_dist(line, npts=50): + dist = 0 + pts = get_points_on_lines(line, line.length / npts)[0] + for p in range(1, len(pts.geoms)): + dist += great_circle_dist( + pts.geoms[p - 1].x, + pts.geoms[p - 1].y, + pts.geoms[p].x, + pts.geoms[p].y, + ) + return dist + + line = lines_int.iloc[idx0] + + if line.geometry.type == "MultiLineString": + lines = line.geometry.explode() + for idx1 in range(len(lines)): + line = lines.iloc[idx1] + lensum += line_dist(line) + else: + lensum += line_dist(line.geometry) + + return lensum + + +def simplify_coastlines(coastlines): + """Read in coastlines and break them up into their component (2-point) + line segments + + Parameters + ---------- + coastlines : :py:class:`geopandas.GeoSeries` + GeoSeries containing a set of global coastline `LINESTRING`s. + + Returns + ------- + :py:class:`geopandas.GeoSeries` : + GeoSeries containing broken-up coastlines with their original + associated index. 
+ + """ + + coords, linestring_ix = pygeos.get_coordinates( + pygeos.from_shapely(coastlines.values), return_index=True + ) + + start, end = coords[:-1], coords[1:] + + tiny_segs = pygeos.linestrings( + np.stack((start[:, 0], end[:, 0]), axis=1), + np.stack((start[:, 1], end[:, 1]), axis=1), + ) + + tiny_segs = tiny_segs[linestring_ix[:-1] == linestring_ix[1:]] + + linestring_ix = linestring_ix[:-1][linestring_ix[:-1] == linestring_ix[1:]] + + return gpd.GeoSeries( + tiny_segs, crs=coastlines.crs, index=coastlines.iloc[linestring_ix].index + ) + + +def join_coastlines_to_isos(coastlines, regions_voronoi): + """Get country-level coastlines by calculating intersection between + coastlines and countries. + + Parameters + ---------- + coastlines: :py:class:`geopandas.GeoSeries` + GeoSeries representing simplified global coastlines, i.e. outputs of + ``simplify_coastlines``. + + regions_voronoi: :py:class:`geopandas.GeoDataFrame` + GeoDataFrame including an `ISO` field and a `geometry` field. Should be globally + comprehensive, with a one-to-one mapping from coordinates to ISO values. + + Returns + ------- + joined : geopandas.GeoDataFrame + A GeoDataFrame with fields `region_geo`, `ISO`, and `geometry`, where `geometry` + represents the (entire) original linestring that overlaps with the + `region_geo` defined by ``regions_voronoi``. + """ + regions = regions_voronoi.to_crs(coastlines.crs) + + # Use regions as a proxy for countries.
It's faster because the regions are more + # narrowly located than the countries in the STRtree, but could instead subdivide + # countries + tree = pygeos.STRtree(pygeos.from_shapely(regions["geometry"])) + + coastal_ix, region_ix = tree.query_bulk( + pygeos.from_shapely(coastlines), "intersects" + ) + + coastal_geo = coastlines.iloc[coastal_ix] + regions_out = regions.iloc[region_ix] + + joined = gpd.GeoDataFrame( + { + "region_geo": regions_out.geometry.values, + "ISO": regions_out.ISO.values, + }, + geometry=coastal_geo.values, + crs=coastal_geo.crs, + index=coastal_geo.index, + ) + + return joined + + +def get_coastlines_by_iso(coastlines, regions_voronoi, plot=True): + """Get country-level coastlines by calculating intersection between + coastlines and countries. + + Parameters + ---------- + coastlines : :py:class:`geopandas.GeoSeries` + GeoSeries containing a set of global coastline `LINESTRING`s. + + regions_voronoi: :py:class:`geopandas.GeoDataFrame` + GeoDataFrame including an `ISO` field and a `geometry` field. Should be globally + comprehensive, with a one-to-one mapping from coordinates to ISO values. + + plot : bool + True to see resulting coastlines by country, False to suppress plotting + + Returns + ------- + :py:class:`geopandas.GeoSeries` + Indexed by country, contains coastlines for each country. 
+ """ + + # Get coastal components (line segments) + coastlines = simplify_coastlines(coastlines) + + # Get all matches between coastal components and regions + coastlines = join_coastlines_to_isos(coastlines, regions_voronoi) + + # Clip matched coastal components to the regions they are matched with + coastlines["geometry"] = coastlines["geometry"].intersection( + coastlines["region_geo"] + ) + coastlines = coastlines.drop(columns=["region_geo"]) + coastlines = coastlines[~coastlines["geometry"].is_empty] + + # Merge LineStrings where possible + coastlines["geometry"] = coastlines["geometry"].apply(grab_lines) + + out = coastlines.dissolve("ISO").geometry + + # Check output + if plot: + coastlines.reset_index(drop=False).plot( + color=add_rand_color(coastlines, col="ISO"), figsize=(20, 20) + ) + + return out + + +def get_coastal_segments_by_ciam_site(site_vor, coastlines, plot=True): + """Generate coastal segments corresponding to each CoDEC site. + + Parameters + ---------- + site_vor : :py:class:`geopandas.GeoDataFrame` + GeoDataFrame with fields `ISO` and `geometry`, indexed by `station_id`, + where `geometry` represents the region of the globe corresponding + to the area closer to station `station_id` than any other station + in that `ISO`. (i.e. the output of ``get_stations_by_iso_voronoi``) + + coastlines : :py:class:`geopandas.GeoSeries` + Contains coastlines by country (i.e. the output of ``get_coastlines_by_iso``) + + Returns + ------- + coastal_segs : :py:class:`geopandas.GeoDataFrame` + Contains `ISO` and `geometry`, where `geometry` represents the coastline within + some ISO that is closer to the associated `station_id` than any other site + within that ISO. 
+ """ + + # Join coastlines to CIAM site Voronoi + site_vor = site_vor.join(coastlines.rename("coastline"), on="ISO", how="left") + + assert site_vor["ISO"].isnull().sum() == 0 + + # Clip coastal segments within point-based Voronoi shapes + site_vor["segment"] = site_vor["coastline"].intersection(site_vor["geometry"]) + + coastal_segs = site_vor.drop(columns=["geometry", "coastline"]).rename( + columns={"segment": "geometry"} + ) + + # Merge LineStrings where possible + coastal_segs["geometry"] = coastal_segs["geometry"].apply(grab_lines) + + # Check output + if plot: + coastal_segs.plot(color=add_rand_color(coastal_segs, "ISO"), figsize=(20, 20)) + + return coastal_segs + + +def dist_matrix( + ax: Any, ay: Any, bx: Any, by: Any, radius: float = EARTH_RADIUS +) -> Any: + """Get the distance matrix (in km) between two sets of points defined by lat/lon. + + Parameters + ---------- + ax, bx : 1-d array-like + Longitudes of the two point sets + ay, by : 1-d array-like + Latitudes of the two point sets + + Returns + ------- + :class:`numpy.ndarray` + The distance matrix between the two point sets. + + Example + ------- + >>> lon1 = np.array([0, 90, 270]) + >>> lat1 = np.array([-45, 0, -60]) + >>> lon2 = np.array([10, 100]) + >>> lat2 = np.array([-45, 10]) + + >>> dist_matrix(lon1, lat1, lon2, lat2) + array([[ 785.76833086, 11576.03341028], + [ 9223.29614889, 1568.52277257], + [ 6289.84215841, 14393.39737057]]) + """ + + # broadcast manually + ax1 = ax[:, np.newaxis].repeat(bx.shape[0], axis=1) + ay1 = ay[:, np.newaxis].repeat(by.shape[0], axis=1) + bx1 = bx[np.newaxis, :].repeat(ax.shape[0], axis=0) + by1 = by[np.newaxis, :].repeat(ay.shape[0], axis=0) + + # get dist + return great_circle_dist(ax1, ay1, bx1, by1) + + +def create_overlay_voronois( + regions, seg_centroids, coastlines, overlay_name, plot=False +): + """Create two Voronoi objects necessary for assigning values to coastal segments in + SLIIDERS.
+ + Parameters + ---------- + regions : :py:class:`geopandas.GeoDataFrame` + Contains the Polygon/MultiPolygons of each region that you wish to run analyses + on separately. Columns are ``ISO`` and ``geometry``. Each region must be mapped + to a country (``ISO``). + seg_centroids : :py:class:`pandas.DataFrame` + Contains ``lon`` and ``lat`` columns, specifying the location of coastal segment + centroids. + coastlines : :py:class:`geopandas.GeoSeries` + Contains LineStrings representing global coastlines. The index is not important. + overlay_name : str + What you would like the variable representing each combination of segment and + region to be called + plot : bool + Whether to produce some diagnostic plots during calculation. Only valuable if + running in an interactive setting. + + Returns + ------- + all_overlays : :py:class:`geopandas.GeoDataFrame` + GeoDataFrame representing Voronoi shapes, as administrative Voronoi + regions intersected with segment-based Voronoi regions. + + ciam_polys : :py:class:`geopandas.GeoDataFrame` + GeoDataFrame representing segment-based Voronoi regions. 
+ """ + # Generate global Voronoi shapes for regions + print("Generating global Voronoi shapes for regions...") + reg_vor = get_voronoi_regions(regions) + reg_vor["ISO"] = regions.ISO + adm0 = reg_vor.dissolve("ISO") + + # Assign ISO to seg centroids based on country Voronoi + print("Assigning countries to segment centroids...") + stations = ( + seg_centroids.rename("geometry") + .to_frame() + .sjoin(adm0, how="left", predicate="within") + .rename(columns={"index_right": adm0.index.name}) + ) + + # Generate ISO-level point-voronoi from CIAM points + print("Generating within-country Voronoi shapes for segment centroids...") + vor_gdf = get_stations_by_iso_voronoi(stations) + + # Get coastline by country + print("Generating country-level coastlines...") + coastlines_by_iso = get_coastlines_by_iso(coastlines, reg_vor, plot=plot) + + # Get coast-seg-by-CIAM point + print("Assigning segments to each centroid point...") + coastal_segs = get_coastal_segments_by_ciam_site( + vor_gdf, coastlines_by_iso, plot=plot + ) + + # Overlap coastline vor with region vor to get spatially comprehensive seg_reg. + print("Creating segment X region Voronoi shapes...") + return generate_voronoi_from_segments( + coastal_segs, + reg_vor, + overlay_name, + ) + + +def get_country_level_voronoi_gdf(all_pts_df): + """Get Voronoi diagram within a country based on a set of points derived + from that country's coastal segments. + + Parameters + ---------- + all_pts_df : :py:class:`geopandas.GeoDataFrame` + Voronoi-generator points within a country, containing `ISO` and `geometry` + columns. 
+ + Returns + ------- + vor_gdf : :py:class:`geopandas.GeoDataFrame` + GeoDataFrame representing Voronoi regions for each input point + """ + + all_isos = all_pts_df["ISO"].unique() + all_isos.sort() + + vors = [] + + for iso in all_isos: + print(iso, end=" ") + station_pts = all_pts_df[all_pts_df["ISO"] == iso].copy() + vors.append(get_voronoi_from_sites(station_pts)) + + vor_gdf = pd.concat(vors).drop( + ["placeholder1", "placeholder2", "placeholder3"], errors="ignore" + ) + + # Assign ISO to point-region shapes + assert vor_gdf["ISO"].isnull().sum() == 0 + + return vor_gdf + + +def generate_voronoi_from_segments(segments, region_gdf, overlay_name): + """Get global Voronoi diagram based on a set of coastal segments and + administrative regions. + + Parameters + ---------- + segments : :py:class:`geopandas.GeoDataFrame` + Coastal segments, including `ISO` column. + + region_gdf : :py:class:`geopandas.GeoDataFrame` + GeoDataFrame representing administrative Voronoi regions + + overlay_name : str + Name of the field in the returned GeoDataFrame representing the + intersections between administrative Voronoi regions and segment-based + Voronoi regions. + + Returns + ------- + all_overlays : geopandas.GeoDataFrame + GeoDataFrame representing Voronoi shapes, as administrative Voronoi + regions intersected with segment-based Voronoi regions. + + ciam_polys : geopandas.GeoDataFrame + GeoDataFrame representing segment-based Voronoi regions. 
+ """ + + all_pts_df = get_points_along_segments(segments) + + vor_gdf = get_country_level_voronoi_gdf(all_pts_df) + + # Calculate Voronoi diagram of all coastal segments, independent of ISO + all_stations_vor = get_voronoi_from_sites(all_pts_df.drop(columns="ISO")) + + # Join ISO-level Voronoi diagrams with country shapes to get final seg-region + # polygons + + coastal_isos = vor_gdf["ISO"].unique() + coastal_isos.sort() + landlocked_isos = sorted(list(set(region_gdf["ISO"].unique()) - set(coastal_isos))) + + coastal_overlays = [] + + for iso in tqdm(coastal_isos): + print(iso, end=" ") + + ciam_iso = vor_gdf[vor_gdf["ISO"] == iso].copy() + + region_iso = region_gdf[region_gdf["ISO"] == iso].copy() + + coastal_overlays.append( + gpd.overlay( + ciam_iso.reset_index(), + region_iso.reset_index().drop(columns=["ISO"]), + keep_geom_type=True, + ) + ) + + coastal_overlays = pd.concat(coastal_overlays, ignore_index=True) + + landlocked_overlays = [] + for iso in tqdm(landlocked_isos): + print(iso, end=" ") + + region_iso = region_gdf[region_gdf["ISO"] == iso].copy() + + landlocked_overlays.append( + gpd.overlay( + all_stations_vor.reset_index(), + region_iso.reset_index(), + keep_geom_type=True, + ) + ) + + if len(landlocked_overlays): + landlocked_overlays = pd.concat(landlocked_overlays, ignore_index=True) + all_overlays = pd.concat( + [landlocked_overlays, coastal_overlays], ignore_index=True + ) + else: + all_overlays = coastal_overlays + + assert all_overlays.is_valid.all() + + all_overlays["geometry"] = fill_in_gaps(all_overlays.geometry) + + all_overlays[overlay_name] = ( + "seg_" + + all_overlays[segments.index.name].str.split("_").str[-1] + + f"_{region_gdf.index.name}_" + + all_overlays[region_gdf.index.name].astype(str) + ) + + return all_overlays + + +def get_degree_box(row): + """ + Get a 1-degree box containing a centroid + defined by row["lon"] and row["lat"] + + Parameters + ---------- + row : dict + A dictionary including values for "lon" and "lat" 
def get_tile_names(df, lon_col, lat_col):
    """Get tile names in the format used by CoastalDEM.

    A name is built from the tile's southwestern corner (the floor of the
    latitude and longitude): "N" or "S" plus the 2-digit zero-padded absolute
    latitude, followed by "E" or "W" plus the 3-digit zero-padded absolute
    longitude, e.g. ``N45E010`` or ``S01W001``.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with latitude and longitude

    lon_col : str
        Name of field representing longitude in `df`

    lat_col : str
        Name of field representing latitude in `df`

    Returns
    -------
    pandas.Series
        Tile names, one per row of `df`, sharing the index of `df`.
    """
    tlon = np.floor(df[lon_col]).astype(int)
    tlat = np.floor(df[lat_col]).astype(int)

    # Wrap the direction letters in Series that share the input index so the
    # concatenation below is plain Series + Series string addition.
    lat_dir = pd.Series(np.where(tlat >= 0, "N", "S"), index=tlat.index)
    lon_dir = pd.Series(np.where(tlon >= 0, "E", "W"), index=tlon.index)

    lat_num = tlat.abs().astype(str).str.zfill(2)
    lon_num = tlon.abs().astype(str).str.zfill(3)

    return lat_dir + lat_num + lon_dir + lon_num


def get_all_exp_tiles(lon, lat):
    """Get the list of CoastalDEM tiles included in an exposure dataset.

    Parameters
    ----------
    lon, lat : array-like
        Locate the valid grid cells of an exposure dataset. They are converted
        with `grid_ix_to_val` at `sset.LITPOP_GRID_WIDTH`, so they are
        presumably grid indices rather than degrees -- confirm with callers.

    Returns
    -------
    :py:class:`numpy.ndarray`
        1D array of unique 1-degree tile names
    """
    cell_vals = grid_ix_to_val(
        np.stack((lon, lat)).T,
        cell_size=sset.LITPOP_GRID_WIDTH,
        lon_mask=[True, False],
    )

    # Southwestern corner of the 1-degree tile containing each cell
    corners = pd.DataFrame(np.floor(cell_vals).astype(int), columns=["lon", "lat"])
    corners = corners.drop_duplicates(["lon", "lat"]).reset_index(drop=True)

    # Single source of truth for the naming convention
    return get_tile_names(corners, "lon", "lat").to_numpy()
def get_bbox(tile_name):
    """Return the bounding box of a 1-degree tile from its name.

    The name has the form "VXXHYYY" and identifies the southwestern corner of
    the tile: "V" is "N" (north) or "S" (south), "XX" is the two-digit
    zero-padded number of degrees north or south of 0,0, "H" is "E" (east) or
    "W" (west), and "YYY" is the three-digit zero-padded number of degrees
    east or west of 0,0.

    Parameters
    ----------
    tile_name : str
        Tile name in the format described above

    Returns
    -------
    shapely.Polygon
        A box representing the spatial coverage of the tile
    """
    lat_part, lon_part = tile_name[:3], tile_name[3:]

    # Any letter other than "N" / "E" is treated as the negative direction
    south = int(lat_part[1:]) * (1 if lat_part[0] == "N" else -1)
    west = int(lon_part[1:]) * (1 if lon_part[0] == "E" else -1)

    return box(west, south, west + 1, south + 1)
def get_partial_covering_matches(elev_tile, bbox, gdf, id_name=None):
    """Match the pixels of `elev_tile` to the shapes in `gdf` overlapping `bbox`.

    Parameters
    ----------
    elev_tile : xarray.DataArray
        Elevation raster tile with coordinates `x` and `y`

    bbox : shapely.Polygon
        Bounding box of `elev_tile`

    gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geometries to match to `elev_tile` coordinates

    id_name : str, optional
        Field of `gdf` to use as IDs in the returned array

    Returns
    -------
    region_matches : np.ndarray
        1D array with one entry per pixel, flattened along the coordinates of
        `elev_tile`. If `id_name` is None, a boolean flag that is True where
        some shape matches. Otherwise the ID of the matching shape, or -1
        where there is no match (NumPy stores the -1 as a string when the IDs
        are strings).
    """
    flag_only = id_name is None

    # Clip candidate shapes to the tile and discard zero-area intersections
    gdf = gdf[gdf["geometry"].intersects(bbox)].copy()
    gdf["geometry"] = gdf["geometry"].intersection(bbox)
    gdf = gdf[gdf["geometry"].area > 0].copy()

    if len(gdf) == 0:
        if flag_only:
            # Same dtype as the flag returned when shapes are present
            return np.zeros(elev_tile.size, dtype=bool)
        return np.full(elev_tile.size, -1, dtype=int)

    # A fresh 0..n-1 index so that region numbers are positions in `names`
    gdf = gdf.reset_index(drop=True)

    regions = regionmask.from_geopandas(gdf, names=id_name, name="regions")
    mask = regions.mask(elev_tile.x.values, elev_tile.y.values)

    if flag_only:
        matched = (~np.isnan(mask)).astype(bool)
        flag_df = (
            matched.to_pandas()
            .stack()
            .reset_index()
            .rename(columns={0: "region_id"})
        )
        return flag_df["region_id"].to_numpy()

    # Unmatched pixels are NaN. Casting NaN to int is undefined and differs
    # between platforms, so mark them with an explicit sentinel first.
    region_ix = mask.fillna(-1).astype(int)
    mask_df = (
        region_ix.to_pandas().stack().reset_index().rename(columns={0: "region_id"})
    )

    # Point the sentinel at the extra trailing "-1" entry appended to the names
    no_match_ix = len(regions.names)
    mask_df.loc[mask_df["region_id"] < 0, "region_id"] = no_match_ix

    region_matches = np.take(
        np.array(regions.names + [-1]), np.array(mask_df["region_id"])
    )

    return region_matches
def get_vor_matches(elev_tile, bbox, regions_df, id_name, out_name, assert_filled=True):
    """For each pixel of `elev_tile`, assign the corresponding shape in
    `regions_df`.

    Parameters
    ----------
    elev_tile : xarray.DataArray
        Elevation raster tile with coordinates `x` and `y`

    bbox : shapely.Polygon
        Bounding box of `elev_tile`. Not used by the current implementation.

    regions_df : geopandas.GeoDataFrame
        GeoDataFrame containing geometries to match to `elev_tile` coordinates.
        NOTE(review): region numbers are used as positions into the list of
        names, which presumably requires a 0..n-1 index -- confirm with
        callers.

    id_name : str
        Field of `regions_df` to use as IDs in returned array

    out_name : str
        Name to use for `id_name` field in output

    assert_filled : bool
        Whether to require that regions are defined over the entire
        `elev_tile`. If False, pixels without a region get an empty string.

    Returns
    -------
    pandas.Series
        Matches between `elev_tile` and `regions_df`, one per pixel, defined
        by the field `id_name` in `regions_df`.

    Raises
    ------
    AssertionError
        If `assert_filled` is True and some pixels are not covered by any
        shape. If this happens, the shapes likely need to be buffered.
    """
    regions = regionmask.from_geopandas(regions_df, names=id_name, name=out_name)

    mask = regions.mask(elev_tile.x.values, elev_tile.y.values, wrap_lon=False)

    n_unmatched = int(mask.isnull().sum().item())
    if assert_filled and n_unmatched > 0:
        # Raised explicitly so the check also runs under `python -O`
        raise AssertionError(
            f"{n_unmatched} pixels of the tile are not covered by any shape"
        )

    # Unmatched pixels (NaN) point at the extra trailing "" entry. Filling
    # before the int cast avoids the undefined NaN-to-int conversion.
    no_match_ix = len(regions.names)
    mask_df = (
        mask.fillna(no_match_ix)
        .astype(int)
        .to_pandas()
        .stack()
        .reset_index()
        .rename(columns={0: "region_ix"})
    )

    mask_df[out_name] = np.take(
        np.array(regions.names + [""]), np.array(mask_df["region_ix"])
    )

    return mask_df[out_name]
def get_empty_exp_grid(elev_tile, grid_width):
    """Initialize a DataFrame template representing an exposure tile covering
    the same space as `elev_tile`.

    Parameters
    ----------
    elev_tile : xarray.DataArray
        Elevation raster tile with coordinates `x` and `y`

    grid_width : float
        Width of grid cells indexing exposure, in degrees

    Returns
    -------
    df : pandas.DataFrame
        One row per pixel of `elev_tile` with fields `lat`, `lon` (float32)
        and `x_ix`, `y_ix` (int16), providing a template to fill in exposure
        fields like asset value and population
    """
    lon_mesh, lat_mesh = np.meshgrid(elev_tile.x.values, elev_tile.y.values)

    template = pd.DataFrame({"lat": lat_mesh.flatten(), "lon": lon_mesh.flatten()})

    # Indices are derived from the full-precision coordinates, before the
    # float32 downcast below
    template["x_ix"] = grid_val_to_ix(template["lon"], grid_width)
    template["y_ix"] = grid_val_to_ix(template["lat"], grid_width)

    return template.astype(
        {
            "lat": np.float32,
            "lon": np.float32,
            "x_ix": np.int16,
            "y_ix": np.int16,
        }
    )


def get_cell_size_km(elev_tile, bbox):
    """Get the approximate area of a grid cell in `elev_tile`.

    The latitude of the centroid of `bbox` is applied to the whole tile, so
    all grid cells are treated as equal in size. Could be improved by
    evaluating the size at each pixel's latitude in `elev_tile`.

    Parameters
    ----------
    elev_tile : xarray.DataArray
        Elevation raster tile

    bbox : shapely.Polygon
        Bounding box of `elev_tile`

    Returns
    -------
    cell_size_km : float
        Approximate size in km2 of each pixel in `elev_tile`
    """
    # East-west extent shrinks with the cosine of the latitude
    lat_rad = np.deg2rad(bbox.centroid.y)
    km_per_degree = LAT_TO_M / 1000  # presumably metres per degree -- confirm

    tile_area_km2 = np.cos(lat_rad) * km_per_degree**2

    return tile_area_km2 / elev_tile.size
def get_closest_valid_exp_tiles(
    missing_exp_tiles, valid_exp_tiles, max_batch_comparisons=int(2e7)
):
    """Get the closest valid exposure tiles to the invalid tiles with exposure
    values that need to be re-assigned.

    Parameters
    ----------
    missing_exp_tiles : pandas.DataFrame
        Tiles with attributed exposure (asset value or population) that do not
        overlap any land areas in the Digital Elevation Model. Must contain
        `lon`, `lat`, `x_ix` and `y_ix`. Modified in place: the columns
        `closest_ix`, `valid_x_ix` and `valid_y_ix` are added.

    valid_exp_tiles : pandas.DataFrame
        Tiles that overlap land areas in the Digital Elevation Model (i.e.,
        candidates for re-assignment of the missing exposure). Must contain
        `lon`, `lat`, `x_ix` and `y_ix`.

    max_batch_comparisons : int
        Maximum number of simultaneous comparisons to make using
        `dist_matrix()`. High number of comparisons can reduce computation time
        but increase memory footprint. A batch always holds at least one
        missing tile, even if that alone exceeds the limit.

    Returns
    -------
    pandas.DataFrame or None
        DataFrame mapping `missing_exp_tiles` to their closest
        `valid_exp_tiles`. Original indices are `x_ix` and `y_ix`, and the
        valid indices are labelled `valid_x_ix` and `valid_y_ix`. None if
        `valid_exp_tiles` is empty.
    """
    if len(valid_exp_tiles) == 0:
        return None

    src_locs = missing_exp_tiles[["lon", "lat"]].to_numpy()
    dst_locs = valid_exp_tiles[["lon", "lat"]].to_numpy()

    n_src = len(src_locs)

    # Rows of `src_locs` per batch such that rows * len(dst_locs) stays within
    # the limit. Stepping through the rows by this fixed stride covers every
    # row, including a final partial batch.
    rows_per_batch = max(1, int(max_batch_comparisons) // len(dst_locs))

    closest_ix = np.full(n_src, -1, dtype=int)
    for start in range(0, n_src, rows_per_batch):
        stop = min(start + rows_per_batch, n_src)
        closest_ix[start:stop] = dist_matrix(
            src_locs[start:stop, 0],
            src_locs[start:stop, 1],
            dst_locs[:, 0],
            dst_locs[:, 1],
        ).argmin(axis=1)

    missing_exp_tiles["closest_ix"] = closest_ix
    missing_exp_tiles["valid_x_ix"] = np.take(
        valid_exp_tiles["x_ix"].to_numpy(), closest_ix
    )
    missing_exp_tiles["valid_y_ix"] = np.take(
        valid_exp_tiles["y_ix"].to_numpy(), closest_ix
    )

    return missing_exp_tiles[["x_ix", "y_ix", "valid_x_ix", "valid_y_ix"]]
def get_granular_grid(bbox, grid_width=3601, cap=sset.ELEV_CAP):
    """Generate a dummy grid on a 1-degree tile using the same format and level
    of granularity as the elevation tiles used in `sliiders` (i.e. 1 arcsec).

    Parameters
    ----------
    bbox : shapely.Polygon
        Bounding box of the 1-degree tile

    grid_width : int
        Number of pixels to use as width and height in the area defined by
        `bbox`. The pixel size is ``1 / grid_width`` degrees.

    cap : int
        An arbitrary integer higher than any elevations saved in SLIIDERS-ECON
        outputs. Allows compatibility with functions that process elevation
        tiles.

    Returns
    -------
    granular_grid : xarray.DataArray
        Grid indexed by `y` and `x` with a dummy elevation variable set at
        `cap`, in the same format as `sliiders` 1-degree elevation tiles.
    """
    step = 1 / grid_width
    west, south, east, north = bbox.bounds

    # Pixel centers: half a step in from the western / southern edge
    lon_centers = np.arange(west + (step / 2), east, step)
    lat_centers = np.flip(np.arange(south + (step / 2), north, step))

    lon_mesh, lat_mesh = np.meshgrid(lon_centers, lat_centers)

    flat_grid = pd.DataFrame(
        {
            "y": lat_mesh.flatten(),
            "x": lon_mesh.flatten(),
            "v": cap * np.ones(grid_width**2),
        }
    )

    return flat_grid.set_index(["y", "x"]).to_xarray().v
def process_landscan(
    landscan_zip,
    dir_landscan_raw,
    dir_landscan_int,
    landscan_year,
    save_to_file=True,
    NWORKERS=20,
):
    """Convert raw LandScan Zip-file into format suitable for global `sliiders`
    grid.

    Parameters
    ----------
    landscan_zip : pathlib.Path
        Path to raw Zip-file downloaded from LandScan

    dir_landscan_raw : pathlib.Path
        Path to unzipped LandScan directory containing raw files. The archive
        is extracted here if ``lspop<landscan_year>/hdr.adf`` is missing.

    dir_landscan_int : pathlib.Path
        Path to directory in which to store output of this function

    landscan_year : str
        Year (i.e. version) of LandScan dataset, e.g. "2019"

    save_to_file : bool
        Whether to save the output in a file. If True, two files will be saved
        within `dir_landscan_int`: ``population.parquet`` with `x_ix`, `y_ix`,
        and `population`, and ``population_with_xy.parquet`` with the
        additional fields `x` and `y` to represent longitude and latitude.

    NWORKERS : int
        Number of Dask workers with which to run this function.

    Returns
    -------
    pop_df : pandas.DataFrame
        DataFrame of global population with fields `population`, `x_ix` and
        `y_ix`; only cells with positive population are kept.
    """
    warnings.filterwarnings("ignore", message=".*Reshaping is producing a large chunk*")

    # Unzip the raw archive only if it has not been extracted yet
    path_landscan = dir_landscan_raw / f"lspop{landscan_year}" / "hdr.adf"

    if not path_landscan.exists():
        with zipfile.ZipFile(landscan_zip, "r") as zip_ref:
            zip_ref.extractall(dir_landscan_raw)

    gateway = Gateway()
    cluster = gateway.new_cluster(worker_image=sset.DASK_IMAGE, profile="micro")
    client = None

    # Everything that needs the cluster runs inside `try` so that the workers
    # are released even if a step fails part-way through.
    try:
        client = cluster.get_client()
        cluster.scale(NWORKERS)
        display(cluster)

        # Open raw population raster
        pop_ds = rioxarray.open_rasterio(path_landscan, chunks={"x": 2700, "y": 10440})
        pop_ds = pop_ds.squeeze().drop("band")

        # Replace negative and null values with 0's
        pop_ds = pop_ds.where(pop_ds >= 0, 0)
        pop_ds = pop_ds.persist()

        # Transform to a dataframe and keep populated cells only
        pop_da = pop_ds.to_dataset(name="population")
        pop_ddf = pop_da.to_dask_dataframe()
        pop_ddf = pop_ddf.drop(columns=["spatial_ref"])
        pop_ddf = pop_ddf.persist()
        pop_ddf = pop_ddf[pop_ddf["population"] > 0].persist()

        # Bring to local
        pop_df = pop_ddf.compute()

        # Convert coordinates to indices
        pop_df["x_ix"] = grid_val_to_ix(
            pop_df["x"].to_numpy(), sset.LANDSCAN_GRID_WIDTH
        )
        pop_df["y_ix"] = grid_val_to_ix(
            pop_df["y"].to_numpy(), sset.LANDSCAN_GRID_WIDTH
        )

        # Keep one copy with coordinates; drop them from the returned frame
        pop_with_xy = pop_df.copy()
        pop_df = pop_df.drop(columns=["x", "y"]).reset_index(drop=True)
        pop_with_xy = pop_with_xy.reset_index(drop=True)

        if save_to_file:
            dir_landscan_int.mkdir(exist_ok=True)
            pop_df.to_parquet(dir_landscan_int / "population.parquet", index=False)
            pop_with_xy.to_parquet(
                dir_landscan_int / "population_with_xy.parquet", index=False
            )
    finally:
        # Shut down workers
        cluster.scale(0)
        if client is not None:
            client.close()
        cluster.close()
        cluster.shutdown()

    return pop_df
def interpolate_da_like(da_in, da_out):
    """Interpolate (bicubic) the data in `da_in` onto the grid of `da_out`.

    Both `da_in` and `da_out` need to be `xarray.DataArray`s in
    two-dimensional grid format, with coordinates `lon` and `lat`.

    Parameters
    ----------
    da_in : xarray.DataArray
        containing data that needs interpolation
    da_out : xarray.DataArray
        containing grid structure that `da_in` data will be interpolated over

    Returns
    -------
    xarray.DataArray
        containing bicubic interpolated version of `da_in` based on the grids of
        `da_out`, with dimensions ``("lat", "lon")`` and the coordinates of
        `da_out`
    """
    n_lat, n_lon = len(da_out.lat), len(da_out.lon)

    # Every (lon, lat) pair of the target grid, flattened row by row
    lon_mesh, lat_mesh = np.meshgrid(da_out.lon.values, da_out.lat.values)
    targets = {"lon": lon_mesh.flatten(), "lat": lat_mesh.flatten()}

    source_grid = Grid2D(da_in, geodetic=True)
    flat_values = source_grid.bicubic(coords=targets)

    return xr.DataArray(
        flat_values.reshape(n_lat, n_lon),
        dims=["lat", "lon"],
        coords=dict(da_out.coords),
    )