From 7c5c51064164dd2d6e8815324fdf7dc6e262cbf8 Mon Sep 17 00:00:00 2001 From: Junho Choi Date: Fri, 8 Apr 2022 23:48:15 +0000 Subject: [PATCH 1/3] CIA WFB workflow: functions for cleaning and new directories added --- sliiders/cia_wfb_clean.py | 1668 +++++++++++++++++++++++++++++++++ sliiders/country_level_ypk.py | 5 +- sliiders/settings.py | 58 +- 3 files changed, 1689 insertions(+), 42 deletions(-) create mode 100644 sliiders/cia_wfb_clean.py diff --git a/sliiders/cia_wfb_clean.py b/sliiders/cia_wfb_clean.py new file mode 100644 index 0000000..78b71b5 --- /dev/null +++ b/sliiders/cia_wfb_clean.py @@ -0,0 +1,1668 @@ +""" +Contains functions to clean CIA World Factbook (WFB). There are different versions +across the years, each with its own format -- this is why there are multiple functions +to organize different versions, some of which that can be grouped with one another due +to sharing similar formats. +""" + +import os +import re +from codecs import open as copen +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from bs4 import BeautifulSoup as BSoup +from tqdm.auto import tqdm + +from .settings import DIR_CIA_RAW, PATH_MPD_RAW, PATH_PWT_RAW + +REGIONS_TO_SKIP_CIA_WFB = [ + "Southern Ocean", + "Indian Ocean", + "Arctic Ocean", + "Atlantic Ocean", + "Pacific Ocean", + "Baker Island", +] + +# manually cleaning the country codes +CCODE_MANUAL = [ + ["American Samoa", "ASM"], + ["Andorra", "AND"], + ["Bahamas, The", "BHS"], + ["Bolivia", "BOL"], + ["Bouvet Island", "BVT"], + ["Brunei", "BRN"], + ["British Indian Ocean Territory", "IOT"], + ["Burma", "MMR"], + ["Cape Verde", "CPV"], + ["Christmas Island", "CXR"], + ["Cocos (Keeling) Islands", "CCK"], + ["Congo, Democratic Republic of the", "COD"], + ["Congo, Republic of the", "COG"], + ["Cook Islands", "COK"], + ["Cote d'Ivoire", "CIV"], + ["Curacao", "CUW"], + ["Czechia", "CZE"], + ["East Timor", "TLS"], + ["Falkland Islands (Islas Malvinas)", "FLK"], + ["Eritrea", "ERI"], + ["Faroe Islands", "FRO"], + ["Francetotal: ", "FRA"], + ["French Guiana", "GUF"], + ["French Polynesia", "PYF"], + ["French Southern and Antarctic Lands", "ATF"], + ["Gambia, The", "GMB"], + ["Gaza Strip", "PSE"], # Gaza Strip and West Bank will together constitute PSE + ["West Bank", "PSE"], + ["Jersey", "JEY"], + ["Guernsey", "GGY"], + ["Gibraltar", "GIB"], + ["Greenland", "GRL"], + ["Guadeloupe", "GLP"], + ["Guam", "GUM"], + ["Heard Island and McDonald Islands", "HMD"], + ["Holy See (Vatican City)", "VAT"], + ["Hong Kong", "HKG"], + ["Jan Mayen", "SJM"], # Svalbard and Jan Mayen are grouped together + ["Svalbard", "SJM"], + ["Kiribati", "KIR"], + ["Korea, North", "PRK"], + ["Korea, South", "KOR"], + ["Kosovo", "KO-"], # Kosovo's original code is XKX, but we use KO- + ["Laos", "LAO"], + ["Liechtenstein", "LIE"], + ["Macau", "MAC"], + ["Macedonia", "MKD"], + ["Man, Isle of", "IMN"], + ["Macedonia, The Former Yugoslav Republic of", "MKD"], + ["Martinique", "MTQ"], + ["Marshall Islands", "MHL"], + ["Mayotte", "MYT"], + ["Micronesia, Federated States of", "FSM"], + ["Moldova", "MDA"], + ["Monaco", "MCO"], + ["Nauru", "NRU"], + ["Netherlands Antilles", "BES+CUW+SXM"], # CIA WFB includes SXM, BES, CUW (not ABW) + ["Niue", "NIU"], + ["New Caledonia", "NCL"], + ["Norfolk Island", "NFK"], + ["Northern Mariana Islands", "MNP"], + ["Palau", "PLW"], + ["Papua New Guinea", "PNG"], + ["Pitcairn Islands", "PCN"], + ["Reunion", "REU"], + ["Russia", "RUS"], + ["Saint Barthelemy", "BLM"], + ["Saint Helena", "SHN"], # will use the 
below version whenever possible + ["Saint Helena, Ascension, and Tristan da Cunha", "SHN"], + ["Saint Martin", "MAF"], + ["Saint Pierre and Miquelon", "SPM"], + ["Saint Vincent and the Grenadines", "VCT"], + ["Samoa", "WSM"], + ["San Marino", "SMR"], + ["Serbia and Montenegro", "SRB+MNE"], + ["Sint Maarten", "SXM"], + ["Solomon Islands", "SLB"], + ["Somalia", "SOM"], + ["South Georgia and South Sandwich Islands", "SGS"], + ["South Sudan", "SSD"], + ["Syria", "SYR"], + ["Tanzania", "TZA"], + ["Timor-Leste", "TLS"], + ["Tokelau", "TKL"], + ["Tonga", "TON"], + ["Tuvalu", "TUV"], + ["Vanuatu", "VUT"], + ["Venezuela", "VEN"], + ["Virgin Islands", "VIR"], + ["Wallis and Futuna", "WLF"], + ["Western Sahara", "ESH"], +] +CCODE_MANUAL = pd.DataFrame(CCODE_MANUAL, columns=["country", "countrycode"]) + + +def helper_wfb_million_cleaner(string): + """Helper function for cleaning CIA WFB GDP values in millions of USD. + + Parameters + ---------- + string : str + containing information about the GDP value (e.g., '$42 million') + + Returns + ------- + numeric : float + containing GDP information in millions of USD + + """ + numeric = float(re.sub(r"[a-zA-Z]|\$| |\,|-", "", string)) + if "trillion" in string: + numeric = numeric * 1000000 + elif "billion" in string: + numeric = numeric * 1000 + elif "million" not in string: + numeric = numeric / 1000000 + + return numeric + + +def helper_wfb_gather_soups( + directory, subdirectory="geos", print_ver=False, encoding=None +): + """Helper function to go over each geographic location files (in `subdirectory`) + and gather `bs4.BeautifulSoup` for each file. + + Parameters + ---------- + directory : str or Path-like + containing the overall directory containing CIA WFB information for a specific + version + subdirectory : str + subdirectory (under) `directory` that contains all the geographic location files + print_ver : bool + if `True`, will gather `bs4.BeautifulSoup` for files with the header 'print_' + (e.g., `print_us.html`); if `False`, will gather those for files without such + headers + encoding : None or str + how the `codecs.open` function will process the .html file; default is `None`, + and this will process it as utf-8 + + Returns + ------- + soups : list of `bs4.BeautifulSoup` + for each of the geographic locations in the `subdirectory` under `directory` + + """ + + direc = Path(directory) / subdirectory + soups = [] + length = 7 + if print_ver: + length = 13 + for g in os.listdir(direc): + if not ((".html" in g) and (len(g) == length)): + continue + file = copen(str(direc / g), "r", encoding).read() + soup = BSoup(file, "html.parser") + soups.append(soup) + + return soups + + +def helper_fy_cleaner(list_of_years): + """Helper function for cleaning a list of years (in string format) that may have + financial year designations instead of YYYY format. 
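+
+    For example, `helper_fy_cleaner("FY01/02")` returns `2002` and
+    `helper_fy_cleaner(["FY98/99", "2000"])` returns `[1999, 2000]`: the ending
+    two-digit year is mapped to the 1900s and shifted forward a century when the
+    result falls before 1950.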
+ + Parameters + ---------- + list_of_years : array-like of str or str + containing years in string format + + Returns + ------- + list of int or int + of the year(s) cleaned in YYYY format + + """ + + single = False + if type(list_of_years) is str: + list_of_years = [list_of_years] + single = True + + if np.any(["FY" in x for x in list_of_years]): + fix = [] + for yr in list_of_years: + if "FY" in yr: + yr = int(yr.split("/")[-1]) + 1900 + if yr < 1950: + yr += 100 + fix.append(str(yr)) + if single: + return int(fix[0]) + return [int(x) for x in fix] + + if single: + return int(list_of_years[0]) + return [int(x) for x in list_of_years] + + +def organize_cia_wfb_2000_2001( + directory=(DIR_CIA_RAW / "factbook-2001"), + no_info_names=REGIONS_TO_SKIP_CIA_WFB, + wfb_year=2001, +): + """Organizes the population, GDP, and GDP per capita information from the CIA World + Factbook (WFB) versions 2000 and 2001 into `pandas.DataFrame` formats. Baseline + function is based on organizing the 2001 version, but can be specified to take + care of 2000 version as well. + + Parameters + ---------- + directory : pathlib.Path or str + containing the relevant WFB information + no_info_names : array-like of str + containing country/region names to be excluded when cleaning the information, + largely due to their pages containing no usable population and GDP information + (e.g., Arctic Ocean) + wfb_year : int + year that the WFB version was released in + + Returns + ------- + pop_collect : pandas.DataFrame + containing country/region-level population (in ones of people) + gdp_collect : pandas.DataFrame + containing country/region-level PPP GDP (in millions of USD) and PPP GDP per + capita (in ones of USD) + + """ + + msg = "Cleans only 2000 to 2001 versions of CIA WFB." + assert wfb_year in range(2000, 2002), msg + soups = helper_wfb_gather_soups(directory) + + # population + pop_collect = [] + for soup in soups: + name = soup.find("title").text.split(" -- ")[-1].strip() + if name in no_info_names: + continue + + popstr = soup.text[ + soup.text.find("Population:") : soup.text.find("Age structure:") + ].replace("\n", "") + if ("no indigenous" in popstr) or ("uninhabited" in popstr): + continue + + popstr = [ + x + for x in re.split(r"\(|\)", re.sub(r"(Population:)|\,|(est.)", "", popstr)) + if len(x.replace(" ", "")) > 0 + ] + pop_val, pop_year = popstr[0], popstr[1] + + if name in ["South Africa", "Syria"]: + pop_year = popstr[-1] + + if "note:" in pop_val: + pop_val = pop_val.split("note:")[0] + + pop_collect.append([name, float(pop_val.strip()), int(pop_year.strip()[-4:])]) + pop_collect = pd.DataFrame(pop_collect, columns=["country", "pop", "year"]) + pop_collect["wfb_year"] = wfb_year + + # GDP and GDPpc + gdp_collect = [] + for soup in soups: + name = soup.find("title").text.split(" -- ")[-1].strip() + if name in no_info_names: + continue + + # GDP + gdp_txt = soup.text.replace("\n", " ") + front_txt = "GDP: purchasing power parity" + if wfb_year == 2001: + front_txt = "GDP: purchasing power parity" + if front_txt not in gdp_txt: + continue + + gdp_txt, gdppc_txt = gdp_txt[ + gdp_txt.find(front_txt) : gdp_txt.find("GDP - composition by sector") + ].split("GDP - real growth rate:") + gdp_txt = [ + x.strip() + for x in re.split( + r"\(|\)", re.sub(r"({} - \$)|( est.)".format(front_txt), "", gdp_txt) + ) + if len(x.strip()) > 0 + ] + if gdp_txt[0] == "NA": + continue + gdp_val = helper_wfb_million_cleaner(gdp_txt[0]) + gdp_year = helper_fy_cleaner([gdp_txt[1]])[0] + + # GDPpc + front_txt = "GDP - per capita: 
purchasing power parity - " + if wfb_year == 2001: + front_txt = "GDP - per capita: purchasing power parity - " + gdppc_txt = re.sub(r"\$|( est.)", "", gdppc_txt.split(front_txt)[-1]).strip() + additional_condition = (name in ["Svalbard", "Norway"]) and (wfb_year == 2001) + if (gdppc_txt == "NA") or additional_condition: + continue + gdppc_val, _ = gdppc_txt.split("(") + gdppc_val = float(gdppc_val.replace(",", "")) + # _ = helper_fy_cleaner([gdppc_year.replace(")", "")])[0] + + gdp_collect.append([name, gdp_val, gdppc_val, gdp_year]) + + gdp_collect = pd.DataFrame(gdp_collect, columns=["country", "gdp", "gdppc", "year"]) + gdp_collect["wfb_year"] = wfb_year + + return pop_collect, gdp_collect + + +def organize_cia_wfb_2002_2004( + directory=(DIR_CIA_RAW / "factbook-2002"), wfb_year=2002 +): + """Organizes the population, GDP, and GDP per capita information from the CIA World + Factbook (WFB) versions 2002-2004 into `pandas.DataFrame` formats. Baseline + function is based on organizing the 2002 version, but can be specified to take + care of 2003-2004 versions as well. + + Parameters + ---------- + directory : pathlib.Path or str + containing the relevant WFB information + wfb_year : int + year that the WFB version was released in + + Returns + ------- + pop_df : pandas.DataFrame + containing country/region-level population information (in ones of people) + gdp_df : pandas.DataFrame + containing country/region-level PPP GDP information (in millions of USD) + gdppc_df : pandas.DataFrame + containing country/region-level PPP GDP per capita information (in ones of USD) + + """ + + s_yr, e_yr = 2002, 2004 + msg = "Cleans only {} to {} versions of CIA WFB.".format(s_yr, e_yr) + assert wfb_year in range(s_yr, e_yr + 1), msg + + lst_directory = Path(directory) / "fields" + soups = [] + for i in [2001, 2004, 2119]: + file = copen(str(lst_directory / "{}.html".format(i)), "r").read() + soups.append(BSoup(file, "html.parser")) + + # GDP and GDP per capita + gdp_case = True + for soup in soups[0:2]: + gdp_lst = [ + re.sub(r"\n|\t", "", x.text) + for x in soup.find_all("tr") + if "power parity" in x.text + ][1:] + gdp_lst = [x.split("purchasing power parity - $") for x in gdp_lst] + if not gdp_case: + gdp_lst = [[x[0]] + [f.replace(",", "") for f in x[1:]] for x in gdp_lst] + gdp_collect = [] + for i in gdp_lst: + # let us manually take care of Cyprus; skip if only containing NA + if ("Cyp" in i[0]) or (" - NA " in i[0]): + continue + + gdp_val = i[1].strip().split(" (") + gdp_val, gdp_year = gdp_val[0], gdp_val[1:] + if "note" in gdp_val: + gdp_val = gdp_val.split("note")[0].strip() + elif "NA" in gdp_val: + continue + + if gdp_case: + gdp_val = helper_wfb_million_cleaner(gdp_val) + else: + gdp_val = float(gdp_val.strip()) + + if not gdp_year: + gdp_year = wfb_year + else: + gdp_year = gdp_year[0].split("est.")[0].replace(")", "").strip() + gdp_year = helper_fy_cleaner([gdp_year])[0] + + if "World" in i[0]: + gdp_collect.append(["World", gdp_val, gdp_year]) + else: + gdp_collect.append([i[0], gdp_val, gdp_year]) + + if gdp_case: + gdp_df = gdp_collect.copy() + gdp_case = False + else: + gdppc_df = gdp_collect.copy() + + gdppc_df = pd.DataFrame(gdppc_df, columns=["country", "gdppc", "year"]) + gdp_df = pd.DataFrame(gdp_df, columns=["country", "gdp", "year"]) + gdp_df["wfb_year"], gdppc_df["wfb_year"] = wfb_year, wfb_year + + # Population + pop_df = [] + pop_lst = [ + re.sub(r"\n|\t", "", x.text) + for x in soups[-1].find_all("tr") + if "est.)" in x.text + ][1:] + for i in pop_lst: + if ("no 
indigenous" in i) or ("uninhabited" in i): + continue + + pop_idx = re.search(r"[0-9]", i).span()[0] + name, pop_info = i[0:pop_idx], i[pop_idx:] + pop_val = pop_info.split("(") + pop_val, pop_year = pop_val[0], pop_val[-1] + if "note" in pop_val: + pop_val = pop_val.split("note")[0] + if "million" in pop_val: + pop_val = float(pop_val.strip().replace(" million", "")) * 1000000 + else: + pop_val = float(pop_val.strip().replace(",", "")) + pop_year = int(re.sub(r"[a-zA-Z]|\.|\)", "", pop_year)) + pop_df.append([name, pop_val, pop_year]) + + pop_df = pd.DataFrame(pop_df, columns=["country", "pop", "year"]) + pop_df["wfb_year"] = wfb_year + + return pop_df, gdp_df, gdppc_df + + +def organize_cia_wfb_2005_2008( + directory=(DIR_CIA_RAW / "factbook-2005"), wfb_year=2005 +): + """Organizes the population, GDP, and GDP per capita information from the CIA World + Factbook (WFB) versions 2005 and 2007 into `pandas.DataFrame` formats. Baseline + function is based on organizing the 2005 version, but can be specified to take + care of 2006-2007 versions as well. + + Parameters + ---------- + directory : pathlib.Path or str + containing the relevant WFB information + wfb_year : int + year that the WFB version was released in + + Returns + ------- + pop_df : pandas.DataFrame + containing country/region-level population information (in ones of people) + gdp_df : pandas.DataFrame + containing country/region-level PPP GDP information (in millions of USD) + gdppc_df : pandas.DataFrame + containing country/region-level PPP GDP per capita information (in ones of USD) + """ + + s_yr, e_yr = 2005, 2008 + msg = "Cleans only {} to {} versions of CIA WFB.".format(s_yr, e_yr) + assert wfb_year in range(s_yr, e_yr + 1), msg + + lst_directory = Path(directory) / "fields" + soups = [] + for i in [2001, 2004, 2119]: + file = copen(str(lst_directory / "{}.html".format(i)), "r").read() + soups.append(BSoup(file, "html.parser")) + + # GDP and GDP per capita + for case, soup in enumerate(soups): + collect = [] + lst = [ + re.sub(r"\n|\t", "", x.text) + for x in soup.find_all("tr") + if "est.)" in x.text + ][1:] + + for i in lst: + cnd_check = ("no indigenous" in i) or ("uninhabited" in i) or ("NA" in i) + if ("Cyprus" in i) or cnd_check: + continue + + searchby = r"\$" + if case == 2: + searchby = r"[0-9]" + + idx = re.search(searchby, i).span()[0] + name, value = i[0:idx].replace("purchasing power parity - ", ""), i[idx:] + + if "World" in name: + name = "World" + + value = value.split(" (") + value, year = value[0], value[-1] + if ("- supplemented" in value) or ("note" in value): + value = re.split(r"note|- supplemented", value)[0] + value = re.sub(r"\;", "", value) + if case == 0: + value = helper_wfb_million_cleaner(value.strip()) + else: + value = re.sub(r"\$|\,| for Serbia", "", value).strip() + if "million" in value: + value = float(value.replace("million", "").strip()) * 1000000 + value = int(value) + year = int( + re.sub( + r"[a-zA-Z]", "", year.replace(" est.", "").replace(")", "").strip() + ) + ) + + collect.append([name, value, year]) + if case == 0: + gdp_df = collect.copy() + elif case == 1: + gdppc_df = collect.copy() + + # GDP and GDPpc + gdp_df = pd.DataFrame(gdp_df, columns=["country", "gdp", "year"]) + gdppc_df = pd.DataFrame(gdppc_df, columns=["country", "gdppc", "year"]) + gdp_df["wfb_year"], gdppc_df["wfb_year"] = wfb_year, wfb_year + + # population + pop_df = pd.DataFrame(collect, columns=["country", "pop", "year"]) + pop_df["wfb_year"] = wfb_year + + return pop_df, gdp_df, gdppc_df + + +def 
organize_cia_wfb_2009_2012( + directory=(DIR_CIA_RAW / "factbook-2009"), wfb_year=2009 +): + """Organizes the population, GDP, and GDP per capita information from the CIA World + Factbook (WFB) versions 2009-2012 into `pandas.DataFrame` formats. Baseline + function is based on organizing the 2009 version, but can be specified to take + care of 2010-2012 versions as well. + + Parameters + ---------- + directory : pathlib.Path or str + containing the relevant WFB information + wfb_year : int + year that the WFB version was released in + + Returns + ------- + pop_df : pandas.DataFrame + containing country/region-level population information (in ones of people) + gdp_df : pandas.DataFrame + containing country/region-level PPP GDP information (in millions of USD) + gdppc_df : pandas.DataFrame + containing country/region-level PPP GDP per capita information (in ones of USD) + """ + + s_yr, e_yr = 2009, 2012 + msg = "Cleans only {} to {} versions of CIA WFB.".format(s_yr, e_yr) + assert wfb_year in range(s_yr, e_yr + 1), msg + + lst_directory = Path(directory) / "fields" + soups = [] + for i in [2001, 2004, 2119]: + file = copen(str(lst_directory / "{}.html".format(i)), "r").read() + soups.append(BSoup(file, "html.parser")) + + cat_dict = {"class": "category_data"} + for i, soup in enumerate(soups): + # every displayed "row" is organized as a "table" + souptable = [ + x + for x in soup.find_all("table") + if x.find("td", attrs=cat_dict) is not None + ] + souptable = [t.find("td", attrs={"class": "fl_region"}) for t in souptable] + names = [t.find("a").text for t in souptable] + values = [t.find("td", attrs=cat_dict) for t in souptable] + + # for reducing redundancies, as the tables are in a nested structure + # same country information can be searched multiple times + already_names = ["Akrotiri", "Dhekelia"] + cases = [] + collect_df = [] + for j, value in enumerate(values): + name, v = names[j], value.text + if name in already_names: + continue + if i != 2: + org = [ + x.strip() + for x in v.split("\n") + if (len(x.strip()) > 0) and ("NA" not in x) + ] + numbers = [ + x.replace("note:", "").strip() + for x in org + if (("(" in x) and (")" in x) and ("MADDISON" not in x)) + or ("est." in x) + ] + note = [x for x in org if (x not in numbers) and ("data are in" in x)] + num_orgs, years = [], [] + for num in numbers: + n, year = num.split("(")[0], num.split("(")[-1] + if i == 0: + n = helper_wfb_million_cleaner(n) + else: + n = int(re.sub("\$|\,", "", n).strip()) + num_orgs.append(n) + years.append( + int(re.sub(r"[a-zA-Z]|\)|\.|\;|\$", "", year).strip()[0:4]) + ) + + usd_years = years.copy() + if note: + nn = note[0].split(";")[0] + usd_years = [int(re.sub(r"[a-zA-Z]|\:|\.", "", nn).strip())] * len( + years + ) + df = pd.DataFrame( + data=dict( + zip( + ["country", "year", "gdp", "usd_year"], + [[name] * len(years), years, num_orgs, usd_years], + ) + ) + ) + else: + if ("no indigenous" in v) or ("uninhabited" in v): + continue + org = v.strip().replace("\n", "").replace(",", "").split("(") + num = org[0].split("note")[0].strip() + if "million" in num: + num = int(float(num.replace("million", "").strip()) * 1000000) + else: + num = int(num.replace("total:", "").strip()) + + if (name == "Curacao") and (wfb_year in [2010, 2011]): + year = re.sub(r"\)|\.", "", [x for x in org if "est." 
in x][0]) + elif (name == "South Sudan") and (wfb_year == 2011): + year = re.sub(r"\)", "", org[-1]) + else: + year = [x.split("est.)")[0] for x in org if "est.)" in x][0] + year = int(re.sub(r"[a-zA-Z]| ", "", year)) + df = [name, num, year] + + already_names.append(name) + collect_df.append(df) + if i == 0: + gdp_df = pd.concat(collect_df, axis=0).reset_index(drop=True) + elif i == 1: + gdppc_df = pd.concat(collect_df, axis=0).reset_index(drop=True) + gdppc_df.rename(columns={"gdp": "gdppc"}, inplace=True) + pop_df = pd.DataFrame(collect_df, columns=["country", "pop", "year"]) + gdp_df["wfb_year"] = wfb_year + gdppc_df["wfb_year"], pop_df["wfb_year"] = wfb_year, wfb_year + + # drop duplicates, due to multiple entries for North Korea in 2011 + gdppc_df.drop_duplicates(inplace=True) + gdp_df.drop_duplicates(inplace=True) + + # manual cleaning to fix or drop unreliable data + if wfb_year in [2011, 2012]: + gdppc_df.loc[ + (gdppc_df.country == "Gibraltar") & (gdppc_df.gdppc == 43000), + ["year", "usd_year"], + ] = 2008 + if wfb_year == 2012: + gdppc_df.loc[ + (gdppc_df.country == "Kosovo") & (gdppc_df.gdppc == 7400), + ["year", "usd_year"], + ] = 2012 + + return pop_df, gdp_df, gdppc_df + + +def organize_cia_wfb_2013_2014( + directory=(DIR_CIA_RAW / "factbook-2013"), wfb_year=2013 +): + """Organizes the population, GDP, and GDP per capita information from the CIA World + Factbook (WFB) versions 2013-2014 into `pandas.DataFrame` formats. Baseline + function is based on organizing the 2013 version, but can be specified to take + care of 2014 version as well. + + Parameters + ---------- + directory : pathlib.Path or str + containing the relevant WFB information + wfb_year : int + year that the WFB version was released in + + Returns + ------- + pop_df : pandas.DataFrame + containing country/region-level population information (in ones of people) + gdp_df : pandas.DataFrame + containing country/region-level PPP GDP information (in millions of USD) + gdppc_df : pandas.DataFrame + containing country/region-level PPP GDP per capita information (in ones of USD) + + """ + + s_yr, e_yr = 2013, 2014 + msg = "Cleans only {} to {} versions of CIA WFB.".format(s_yr, e_yr) + assert wfb_year in range(s_yr, e_yr + 1), msg + + lst_directory = Path(directory) / "fields" + soups = [] + for i in [2001, 2004, 2119]: + file = copen(str(lst_directory / "{}.html".format(i)), "r").read() + soups.append(BSoup(file, "html.parser")) + + for i, soup in enumerate(soups): + if (i == 2) and (wfb_year == 2015): + continue + soupfind = soup.find("div", attrs={"class": "text-holder-full"}).find_all( + "td", attrs={"class": "fl_region"} + ) + df_agg = [] + for j, case in enumerate(soupfind): + name = case.text.split("\n\n")[0] + values = case.find("td").text + + cnd_skip1 = name in ["Akrotiri", "Dhekelia"] + cnd_skip2 = (i != 2) and (("NA" in values) or (name == "Gaza Strip")) + cnd_skip3 = ("no indigenous" in values) or ("uninhabited" in values) + if cnd_skip1 or cnd_skip2 or cnd_skip3: + continue + values = [x for x in values.split("\n") if len(x.strip()) > 0] + if (i == 2) and (wfb_year == 2014): + note = [x for x in values if ("note" in x)] + values = [ + x for x in values if ("note" not in x) and ("top ten" not in x) + ] + else: + note = [x for x in values if ("note: data are in" in x)] + if np.any(["est." in x for x in values]) and (i != 2): + values = [ + x + for x in values + if ("est." 
in x) and ("note" not in x) and ("top ten" not in x) + ] + else: + values = [ + x for x in values if ("note" not in x) and ("top ten" not in x) + ] + + nums, years = [], [] + for val in values: + if (name == "Bahrain") and (i == 2) and (wfb_year == 2013): + num, year = val.split("July") + elif (i == 2) and (wfb_year == 2014) and (len(note) > 0): + num = val.strip() + if "(" in num: + num, year = num.split("(") + if "est." in note[0]: + year = note[0].split("(")[-1] + elif "(" in val: + num, year = val.split("(") + else: + num, year = val.strip(), str(wfb_year) + year = re.sub(r"\(|\)|est.", "", year).strip() + if "FY" in year: + year = helper_fy_cleaner(year) + else: + year = int(re.sub(r"[a-zA-Z]", "", year).strip()) + if i == 0: + num = helper_wfb_million_cleaner(num.strip()) + else: + num = int(re.sub(r"\$|\,", "", num)) + years.append(year) + nums.append(num) + + if len(nums) == 0: + continue + + if i != 2: + usd_years = years.copy() + if len(note) > 0: + note = note[0].split("note: data are in")[-1].split("US dollars")[0] + usd_years = [int(note.strip())] * len(years) + + columns = ["country", "year", "gdp", "usd_year"] + datavals = [[name] * len(years), years, nums, usd_years] + + else: + columns = ["country", "year", "pop"] + datavals = [[name] * len(years), years, nums] + + df_agg.append(pd.DataFrame(data=dict(zip(columns, datavals)))) + + if i == 0: + gdp_df = pd.concat(df_agg, axis=0).reset_index(drop=True) + gdp_df["wfb_year"] = wfb_year + elif i == 1: + gdppc_df = pd.concat(df_agg, axis=0).reset_index(drop=True) + gdppc_df.rename(columns={"gdp": "gdppc"}, inplace=True) + + pop_df = pd.concat(df_agg, axis=0).reset_index(drop=True) + gdppc_df["wfb_year"], pop_df["wfb_year"] = wfb_year, wfb_year + + # manual cleaning to fix or drop unreliable data + if wfb_year == 2013: + gdp_df.loc[(gdp_df.country == "Macau") & (gdp_df.gdp > 47000), "year"] = 2012 + gdp_df = gdp_df.loc[ + ~((gdp_df.country == "Syria") & (gdp_df.year == 2010)), : + ].copy() + gdppc_df.loc[ + (gdppc_df.country == "Gibraltar") & (gdppc_df.gdppc == 43000), "year" + ] = 2008 + else: + gdp_df = gdp_df.loc[ + ~((gdp_df.country == "Croatia") & (gdp_df.year == 2012)), : + ].copy() + gdppc_df = gdppc_df.loc[ + ~((gdppc_df.country == "Kenya") & (gdppc_df.year == 2013)), : + ].copy() + gdppc_df = gdppc_df.loc[ + ~((gdppc_df.country == "Syria") & (gdppc_df.year == 2010)), : + ].copy() + + return pop_df, gdp_df, gdppc_df + + +def organize_cia_wfb_2015(directory=(DIR_CIA_RAW / "factbook-2015")): + """Organizes the population, GDP, and GDP per capita information from the CIA World + Factbook (WFB) version 2015 into `pandas.DataFrame` formats. 
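+
+    Unlike the versions parsed from the "fields" listing pages, the 2015 data are
+    read from the rank-order pages (`rankorder/2001rank.html` for GDP,
+    `2004rank.html` for GDP per capita, and `2119rank.html` for population).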
+ + Parameters + ---------- + directory : pathlib.Path or str + containing the relevant WFB information + + Returns + ------- + pop_df : pandas.DataFrame + containing country/region-level population information (in ones of people) + gdp_df : pandas.DataFrame + containing country/region-level PPP GDP information (in millions of USD) + gdppc_df : pandas.DataFrame + containing country/region-level PPP GDP per capita information (in ones of USD) + """ + + lst_directory = Path(directory) / "rankorder" + soups = [] + for i in [2001, 2004, 2119]: + file = copen(str(lst_directory / "{}rank.html".format(i)), "r").read() + soups.append(BSoup(file, "html.parser")) + + for i, soup in enumerate(soups): + ranks = soup.find("table", attrs={"id": "rankOrder"}) + rows = ranks.find_all("tr") + df = [] + for tr in rows: + if "Date of Information" in tr.text: + continue + + ranking, name, value, year = tr.find_all("td") + if len(value.text.strip()) == 0: + continue + if len(year.text.strip()) == 0: + year = 2014 + elif "FY" in year.text: + front, back = year.text.split("/") + back = re.sub(r"[a-zA-Z]|\.", "", back).strip() + year = int(back) + 2000 + if year > 2050: + year -= 100 + else: + year = int(re.sub(r"[a-zA-Z]|\.", "", year.text).strip()) + + value = int(re.sub(r"\$|\,", "", value.text).strip()) + if i == 0: + value /= 1000000 + df.append([name.text, year, value]) + + if i == 0: + gdp_df = pd.DataFrame(df, columns=["country", "year", "gdp"]) + elif i == 1: + gdppc_df = pd.DataFrame(df, columns=["country", "year", "gdppc"]) + + gdppc_df["usd_year"], gdp_df["usd_year"] = gdppc_df["year"], gdp_df["year"] + gdppc_df["wfb_year"], gdp_df["wfb_year"] = 2015, 2015 + pop_df = pd.DataFrame(df, columns=["country", "year", "pop"]) + pop_df["wfb_year"] = 2015 + + return pop_df, gdp_df, gdppc_df + + +def organize_cia_wfb_2016_2017( + directory=(DIR_CIA_RAW / "factbook-2016"), wfb_year=2016 +): + """Organizes the population, GDP, and GDP per capita information from the CIA World + Factbook (WFB) versions 2016-2017 into `pandas.DataFrame` formats. Baseline + function is based on organizing the 2016 version, but can be specified to take + care of 2017 version as well. 
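+
+    The data are parsed from the field-listing pages `fields/2001.html` (PPP GDP),
+    `fields/2004.html` (PPP GDP per capita), and `fields/2119.html` (population).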
+ + Parameters + ---------- + directory : pathlib.Path or str + containing the relevant WFB information + wfb_year : int + year that the WFB version was released in + + Returns + ------- + pop_df : pandas.DataFrame + containing country/region-level population information (in ones of people) + gdp_df : pandas.DataFrame + containing country/region-level PPP GDP information (in millions of USD) + gdppc_df : pandas.DataFrame + containing country/region-level PPP GDP per capita information (in ones of USD) + + """ + + s_yr, e_yr = 2016, 2017 + msg = "Cleans only {} to {} versions of CIA WFB.".format(s_yr, e_yr) + assert wfb_year in range(s_yr, e_yr + 1), msg + + lst_directory = Path(directory) / "fields" + soups = [] + for i in [2001, 2004, 2119]: + file = copen(str(lst_directory / "{}.html".format(i)), "r").read() + soups.append(BSoup(file, "html.parser")) + + soup_case = [] + for i, soup in enumerate(soups): + names = [x.text for x in soup.find_all("td", attrs={"class": "country"})] + cases = soup.find_all("td", attrs={"class": "fieldData"}) + if i == 2: + cases = [x.text for x in cases] + else: + cases = [x.text.split("\n") for x in cases] + df_agg = [] + for j, name in enumerate(names): + cnd_skip1 = name in ["Akrotiri", "Dhekelia"] + cnd_skip2 = (i != 2) and np.any(["NA" in x for x in cases[j]]) + cnd_skip3 = (i == 2) and ( + ("no indigenous" in cases[j]) or ("uninhabited" in cases[j]) + ) + if cnd_skip1 or cnd_skip2 or cnd_skip3: + continue + if i != 2: + values = [ + x.strip() + for x in cases[j] + if ("est." in x) + and ("top ten" not in x) + and ("note" not in x) + and (len(x.strip()) > 0) + ] + note = [ + x.strip() + for x in cases[j] + if (len(x.strip()) > 0) and ("note" in x) + ] + + nums, years = [], [] + for val in values: + num, year = val.split("(") + if i == 0: + num = helper_wfb_million_cleaner(num) + else: + num = int(re.sub(r"\,|\$", "", num).strip()) + year = re.sub(r"\)|est.", "", year).strip() + if "FY" in year: + year = helper_fy_cleaner(year) + else: + year = int(year.strip()) + nums.append(num), years.append(year) + usd_years = years.copy() + if len(note) > 0: + note_fix = note[0].replace(" US", "") + idx = note_fix.find(" dollars") + if idx != -1: + usd_years = [int(note_fix[(idx - 4) : idx])] * len(years) + + else: + case = cases[j].replace("\n", "").split("(") + if np.any(["top ten" in x for x in case]): + num, year = case[0], case[1].split("top ten")[0] + else: + num = re.split(r"note|rank by population", case[0])[0] + year = case[-1] + year = year.replace(")", "").split("note")[0] + if "FY" in year: + year = helper_fy_cleaner( + "FY" + re.sub(r"[a-zA-Z]|\.", "", year).strip() + ) + else: + year = int(re.sub(r"[a-zA-Z]|\.", "", year).strip()) + if "million" in num: + num = int(float(num.replace("million", "").strip()) * 1000000) + else: + num = int(re.sub(r"\,|[a-zA-Z]|\:", "", num).strip()) + + if i != 2: + columns = ["country", "year", "gdp", "usd_year"] + datavals = [[name] * len(nums), years, nums, usd_years] + df_agg.append(pd.DataFrame(data=dict(zip(columns, datavals)))) + else: + df_agg.append([name, year, num]) + + if i == 0: + gdp_df = pd.concat(df_agg, axis=0).reset_index(drop=True) + gdp_df["wfb_year"] = wfb_year + elif i == 1: + gdppc_df = pd.concat(df_agg, axis=0).reset_index(drop=True) + gdppc_df.rename(columns={"gdp": "gdppc"}, inplace=True) + + pop_df = pd.DataFrame(df_agg, columns=["country", "year", "pop"]) + gdppc_df["wfb_year"], pop_df["wfb_year"] = wfb_year, wfb_year + gdppc_df.drop_duplicates(inplace=True) + + # manual cleaning to fix or drop 
unreliable data + gdp_df = gdp_df.loc[ + ~((gdp_df.country == "American Samoa") & (gdp_df.year == 2012)), : + ].copy() + gdp_df = gdp_df.loc[ + ~((gdp_df.country == "Faroe Islands") & (gdp_df.year == 2013)), : + ].copy() + if wfb_year == 2016: + gdppc_df = gdppc_df.loc[ + ~((gdppc_df.country == "Syria") & (gdppc_df.year == 2010)), : + ].copy() + + return pop_df, gdp_df, gdppc_df + + +def organize_cia_wfb_2018_2019( + directory=(DIR_CIA_RAW / "factbook-2018"), wfb_year=2018 +): + """Organizes the population, GDP, and GDP per capita information from the CIA World + Factbook (WFB) versions 2018-2019 into `pandas.DataFrame` formats. Baseline + function is based on organizing the 2018 version, but can be specified to take + care of 2019 version as well. + + Parameters + ---------- + directory : pathlib.Path or str + containing the relevant WFB information + wfb_year : int + year that the WFB version was released in + + Returns + ------- + pop_df : pandas.DataFrame + containing country/region-level population information (in ones of people) + gdp_df : pandas.DataFrame + containing country/region-level PPP GDP information (in millions of USD) + gdppc_df : pandas.DataFrame + containing country/region-level PPP GDP per capita information (in ones of USD) + + """ + + s_yr, e_yr = 2018, 2019 + msg = "Cleans only {} to {} versions of CIA WFB.".format(s_yr, e_yr) + assert wfb_year in range(s_yr, e_yr + 1), msg + + lst_directory = Path(directory) / "fields" + soups = [] + + # html names for storing GDP, GDP per capita (both in PPP), and population + html_lst = [208, 211, 335] + for i in html_lst: + file = copen(str(lst_directory / "{}.html".format(i)), "r").read() + soups.append(BSoup(file, "html.parser")) + + find_val_fields = [ + "field-gdp-purchasing-power-parity", + "field-gdp-per-capita-ppp", + "field-population", + ] + find_category = "category_data subfield historic" + df_cols_list = [ + ["country", "year", "gdp", "usd_year"], + ["country", "year", "gdppc", "usd_year"], + ["country", "year", "pop"], + ] + for i, soup in enumerate(soups): + find_val_field, df_cols = find_val_fields[i], df_cols_list[i] + souptable = soup.find("table", attrs={"id": "fieldListing"}) + countries = [ + x.text.replace("\n", "") + for x in souptable.find_all("td", attrs={"class": "country"}) + ] + if i == 2: + find_category = "category_data subfield numeric" + values = souptable.find_all("div", attrs={"id": find_val_field}) + notes = [v.find("div", attrs={"class": "category_data note"}) for v in values] + values = [v.find_all("div", attrs={"class": find_category}) for v in values] + + df_collect = [] + for j, val in enumerate(values): + # case when there are no information available + if len(val) == 0: + continue + + # getting the country name and note (note could be None) + name, note = countries[j], notes[j] + + # multiple years and values available in versions 2017 and onwards + numbers, years = [], [] + no_known = False + for v in val: + year = None + num = v.text.replace("\n", "").split("(") + if len(num) > 1: + num, year = num[0], num[-1] + year = re.sub(r"[a-zA-Z]|\)| |\.", "", year) + if ("FY" in v.text) and ("/" in year): + year = "FY" + year + year = helper_fy_cleaner(year) + + if i == 0: + numbers.append(helper_wfb_million_cleaner(num)) + years.append(year) + else: + cnd_check = ( + ("no indigenous" in num) + or ("uninhabited" in num) + or ("Akrotiri" in num) + or ("NA" in num) + or (year is None) + ) + if cnd_check: + continue + if ("million" in num) and (i == 2): + num = int(float(num.replace("million", 
"").strip()) * 1000000) + numbers.append(num) + else: + numbers.append(int(re.sub(r"\$|\,|[a-zA-Z]", "", num.strip()))) + years.append(year) + if len(numbers) == 0: + continue + + name = [name] * len(years) + + # what year the GDP values are in + if i != 2: + usd_years = years.copy() + if note is not None: + note = note.text + if not (("data are in" in note) and ("dollars" in note)): + continue + if (";" in note) or ("the war-driven" in note): + note = [ + x + for x in re.split(r"\;|the war-driven", note) + if ("data are in" in x) and ("dollars" in x) + ][0] + noteyear = re.sub(r"[a-zA-Z]| |\n|\:", "", note) + usd_years = [int(noteyear)] * len(years) + + df_vals = [name, years, numbers, usd_years] + else: + df_vals = [name, years, numbers] + df_collect.append(pd.DataFrame(data=dict(zip(df_cols, df_vals)))) + + if i == 0: + gdp_df = pd.concat(df_collect, axis=0).reset_index(drop=True) + elif i == 1: + gdppc_df = pd.concat(df_collect, axis=0).reset_index(drop=True) + + pop_df = pd.concat(df_collect, axis=0).reset_index(drop=True) + gdp_df["wfb_year"], gdppc_df["wfb_year"] = wfb_year, wfb_year + pop_df["wfb_year"] = wfb_year + + return pop_df, gdp_df, gdppc_df + + +def helper_wfb_2020(soup): + """Simple helper function for finding and cleaning the name of a country/region, + used for organizing CIA World Factbook versions 2018 to 2020 (in conjunction with + the function `organize_cia_wfb_2018_2020`). + + Parameters + ---------- + soup : bs4.BeautifulSoup + containing country/region information + + Returns + ------- + name : str + of the country/region being represented in `soup` + + """ + name = soup.find("title").text + if " :: " in name: + name = name.split(" :: ")[1].split(" — ")[0] + else: + name = name.split(" - ")[0] + + return name + + +def organize_cia_wfb_2020( + directory=(DIR_CIA_RAW / "factbook-2020"), + wfb_year=2020, + no_info_names=REGIONS_TO_SKIP_CIA_WFB, +): + """Organizes the population, GDP, and GDP per capita information from the CIA World + Factbook (WFB) version 2020 into `pandas.DataFrame` format. + + Parameters + ---------- + directory : pathlib.Path or str + containing the relevant WFB information + wfb_year : int + year that the WFB version was released in + no_info_names : array-like of str + containing country/region names to be excluded when cleaning the information, + largely due to their pages containing no usable population and GDP information + (e.g., Arctic Ocean) + + Returns + ------- + pop_collect : pandas.DataFrame + containing population information (units in ones of people) + gdp_collect : pandas.DataFrame + containing PPP GDP information (units in millions of USD, USD year designated + by the column `usd_year`) + gdppc_collect : pandas.DataFrame + containing PPP GDP per capita information (units in ones of USD, USD year + designated by the column `usd_year`) + """ + + msg = "Cleans only 2020 version of CIA WFB." 
+ assert wfb_year == 2020, msg + + # gathering soups + soups = helper_wfb_gather_soups(directory, print_ver=True) + + # population + pop_collect = [] + for soup in soups: + name = helper_wfb_2020(soup) + if name in no_info_names: + continue + + pop_text = ( + soup.text[ + soup.text.find("People and Society ::") : soup.text.find("Nationality:") + ] + .split("Population:\n")[1] + .replace("\n", " ") + ) + if ("no indigenous" in pop_text) or ("uninhabited" in pop_text): + pop_val, pop_year = 0, 2020 + else: + if "note" in pop_text: + pop_text = pop_text.split("note")[0] + + if name in ["Akrotiri", "Dhekelia"]: + continue + elif name == "European Union": + pop_val = float( + re.sub(r" |\,", "", pop_text.split("rank by population:")[0]) + ) + pop_year = 2020 + else: + pop_val = pop_text.split(" (")[0].replace(" ", "") + pop_year = pop_text.split(" (")[-1] + split_by = ")" + if "est. est.)" in pop_year: + split_by = "est. est.)" + elif "est.)" in pop_year: + split_by = "est.)" + + pop_year = [ + x for x in pop_year.split(split_by)[0].split(" ") if len(x) > 0 + ][-1] + if "million" in pop_val: + pop_val = float(pop_val.replace("million", "")) * 1000000 + else: + pop_val = float(re.sub(r"[a-zA-Z\W]", "", pop_val)) + + pop_collect += [[name, pop_val, int(pop_year)]] + + pop_collect = pd.DataFrame(pop_collect, columns=["country", "pop", "year"]) + pop_collect["wfb_year"] = wfb_year + + gdp_str_first = "GDP (purchasing power parity) - real:" + if wfb_year != 2020: + gdp_str_first = gdp_str_first.split(" - ")[0] + + # GDP and GDP per capita + gdp_collect, gdppc_collect = [], [] + for soup in soups: + name = helper_wfb_2020(soup) + if name in no_info_names + ["Gaza Strip"]: + continue + + # GDP (not GDPpc) information + gdp_info_all = ( + soup.text[ + soup.text.find(gdp_str_first) : soup.text.find("Gross national saving:") + ] + .replace("\n", " ") + .split("GDP (official exchange rate):") + ) + + gdp_info = gdp_info_all[0].replace(gdp_str_first, "") + if "NA" in gdp_info: + continue + + if len(gdp_info) > 0: + if (wfb_year != 2020) and (gdp_info[0] in [":", ";"]): + gdp_info = gdp_info[1:] + + note = None + if ("note: " in gdp_info) and (name != "Saint Pierre and Miquelon"): + gdp_info = gdp_info.split("note: ") + gdp_info, note = gdp_info[0], gdp_info[1:] + + if (wfb_year != 2020) and ("country comparison to" in gdp_info): + gdp_info = gdp_info.split("country comparison to")[0] + gdp_info = [ + x.strip() for x in re.split(r"\(|\)", gdp_info) if len(x.strip()) > 0 + ] + + if len(gdp_info) > 0: + gdp_vals = gdp_info[0::2] + if name == "Saint Pierre and Miquelon": + gdp_vals = gdp_vals[0:-1] + gdp_vals = [helper_wfb_million_cleaner(x) for x in gdp_vals] + gdp_yrs = helper_fy_cleaner( + [x.replace("est.", "").strip() for x in gdp_info[1::2]] + ) + + usd_year_assumed = "usd_year_assumed" + if note is not None: + note = re.sub(r"[a-zA-Z]| ", "", note[0]) + if note[0] == ";": + note = note[1:] + elif (";" in note) or ("-" in note): + note = re.split(r";|-", note)[0] + + if (":" in note) and (wfb_year != 2020): + note = note.split(":")[0] + + gdp_usd_yrs = [int(note.replace(".", ""))] * len(gdp_yrs) + usd_year_assumed = "usd_year_original" + else: + gdp_usd_yrs = gdp_yrs + append_this = [] + for l, yr in enumerate(gdp_yrs): + append_this.append( + [name, yr, gdp_usd_yrs[l], gdp_vals[l], usd_year_assumed] + ) + gdp_collect += append_this + + # GDPpc information + gdppc_info = gdp_info_all[-1].split("GDP - per capita (PPP):")[-1] + if len(gdppc_info.strip()) > 0: + if "country comparison" not in gdppc_info: + 
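+                # when the "country comparison" marker is absent, the GDPpc text
+                # runs straight into the next section, so cut at that section's
+                # header instead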
if "GDP - composition, by sector of origin" in gdppc_info: + gdppc_info = gdppc_info.split( + "GDP - composition, by sector of origin" + )[0] + else: + gdppc_info = gdppc_info.split("country comparison")[0] + + for string in ["Ease of Doing Business", "GDP - composition, by sector"]: + if string in gdppc_info: + gdppc_info = gdppc_info.split(string)[0] + + if "NA" in gdppc_info: + continue + + note = None + if "note:" in gdppc_info: + gdppc_info, note = gdppc_info.split("note:") + + gdppc_info = [ + x for x in re.split(r"\(|\)", gdppc_info) if len(x.replace(" ", "")) > 0 + ] + gdppc_vals, gdppc_years = gdppc_info[0::2], gdppc_info[1::2] + gdppc_vals = [float(re.sub(r"\$|,", "", x.strip())) for x in gdppc_vals] + gdppc_years = helper_fy_cleaner( + [x.strip().replace(" est.", "") for x in gdppc_years] + ) + + usd_year_assumed = "usd_year_assumed" + gdppc_usd_years = gdppc_years + if (note is not None) and (name != "West Bank"): + gdppc_usd_years = [int(re.sub(r"[a-zA-Z]|\.", "", note).strip())] * len( + gdppc_years + ) + usd_year_assumed = "usd_year_orig" + + append_this = [] + for l, yr in enumerate(gdppc_usd_years): + append_this.append( + [name, gdppc_years[l], yr, gdppc_vals[l], usd_year_assumed] + ) + gdppc_collect += append_this + + # organizing in pandas.DataFrame format + gdp_columns = ["country", "year", "usd_year", "gdp", "usd_year_source"] + gdp_collect = pd.DataFrame(gdp_collect, columns=gdp_columns) + gdp_collect["wfb_year"] = wfb_year + + gdp_columns[3] = "gdppc" + gdppc_collect = pd.DataFrame(gdppc_collect, columns=gdp_columns) + gdppc_collect["wfb_year"] = wfb_year + + # fixing Cote d'Ivoire name + gdp_collect.loc[ + gdp_collect.country == "Cote d'Ivoire", "country" + ] = "Cote d'Ivoire" + gdppc_collect.loc[ + gdppc_collect.country == "Cote d'Ivoire", "country" + ] = "Cote d'Ivoire" + pop_collect.loc[ + pop_collect.country == "Cote d'Ivoire", "country" + ] = "Cote d'Ivoire" + + # manual cleaning to fix or drop unreliable data + gdppc_error_ctries = [ + "Togo", + "Zimbabwe", + "Turkmenistan", + "Venezuela", + "Sierra Leone", + "Kosovo", + "Guinea-Bissau", + "Benin", + "Cote d'Ivoire", + "Kuwait", + "Niger", + "Taiwan", + "Germany", + ] + gdppc_collect = gdppc_collect.loc[ + ~( + (gdppc_collect.country.isin(gdppc_error_ctries)) + & (gdppc_collect.year == 2017) + ), + :, + ].copy() + + gdp_error_ctries = [ + x for x in gdppc_error_ctries if x not in ["Kosovo", "Sierra Leone", "Taiwan"] + ] + gdp_error_ctries += ["Mozambique", "Mauritania", "Pakistan", "Jordan"] + gdp_collect = gdp_collect.loc[ + ~((gdp_collect.country.isin(gdp_error_ctries)) & (gdp_collect.year == 2017)), : + ].copy() + + return pop_collect, gdp_collect, gdppc_collect + + +def organize_gather_cia_wfb_2000_2020(years=list(range(2000, 2021))): + """Cleaning all CIA WFB versions, from 2000 to 2020, and gathering them in list + format (one list each for population, GDP, and GDP per capita). + + Parameters + ---------- + years : array-like of int + containing the version years to be cleaned; default runs from 2000 to 2020. 
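+        versions outside this range are not supported (see the assertion at the
+        top of the function body)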
+
+    Returns
+    -------
+    cia_pop_gather : list of pandas.DataFrame
+        containing population data from the oldest version to the newest (data are
+        in ones of people)
+    cia_gdp_gather : list of pandas.DataFrame
+        containing GDP data from the oldest version to the newest (data are in
+        millions of USD)
+    cia_gdppc_gather : list of pandas.DataFrame
+        containing GDP per capita data from the oldest version to the newest (data
+        are in ones of USD)
+
+    """
+
+    years = np.sort(years)
+    msg = "Only cleans versions 2000 to 2020."
+    assert (years.max() <= 2020) and (years.min() >= 2000), msg
+
+    # gathering country name to country code conversion
+    CCODE_PWT = pd.read_excel(PATH_PWT_RAW)[
+        ["countrycode", "country"]
+    ].drop_duplicates()
+    CCODE_MPD = pd.read_excel(PATH_MPD_RAW)[
+        ["countrycode", "country"]
+    ].drop_duplicates()
+    CCODE_DF = (
+        pd.concat([CCODE_MPD, CCODE_PWT, CCODE_MANUAL], axis=0)
+        .drop_duplicates()
+        .reset_index(drop=True)
+        .rename(columns={"countrycode": "ccode"})
+    )
+
+    cia_gdp_gather = []
+    cia_pop_gather = []
+    cia_gdppc_gather = []
+    for yr in tqdm(years):
+        directory = DIR_CIA_RAW / "factbook-{}".format(yr)
+        if yr in [2000, 2001]:
+            pop_df, gdp_df = organize_cia_wfb_2000_2001(directory, wfb_year=yr)
+            gdppc_df = gdp_df.copy()[["country", "year", "gdppc", "wfb_year"]]
+            gdp_df = gdp_df[["country", "year", "gdp", "wfb_year"]]
+        elif yr in [2002, 2003, 2004]:
+            pop_df, gdp_df, gdppc_df = organize_cia_wfb_2002_2004(directory, yr)
+        elif yr in range(2005, 2009):
+            pop_df, gdp_df, gdppc_df = organize_cia_wfb_2005_2008(directory, yr)
+        elif yr in range(2009, 2013):
+            pop_df, gdp_df, gdppc_df = organize_cia_wfb_2009_2012(directory, yr)
+        elif yr in [2013, 2014]:
+            pop_df, gdp_df, gdppc_df = organize_cia_wfb_2013_2014(directory, yr)
+        elif yr == 2015:
+            pop_df, gdp_df, gdppc_df = organize_cia_wfb_2015(directory)
+        elif yr in [2016, 2017]:
+            pop_df, gdp_df, gdppc_df = organize_cia_wfb_2016_2017(directory, yr)
+        elif yr in [2018, 2019]:
+            pop_df, gdp_df, gdppc_df = organize_cia_wfb_2018_2019(directory, yr)
+        else:
+            pop_df, gdp_df, gdppc_df = organize_cia_wfb_2020(directory)
+
+        if "usd_year" not in gdp_df.columns:
+            gdp_df["usd_year"] = gdp_df["year"]
+        if "usd_year" not in gdppc_df.columns:
+            gdppc_df["usd_year"] = gdppc_df["year"]
+
+        pop_df = pop_df.merge(CCODE_DF, on=["country"], how="left")
+        gdp_df = gdp_df.merge(CCODE_DF, on=["country"], how="left")
+        gdppc_df = gdppc_df.merge(CCODE_DF, on=["country"], how="left")
+
+        # aggregating West Bank and Gaza Strip into Palestine (PSE)
+        if "PSE" in pop_df.ccode.values:
+            pse_df = pop_df.loc[pop_df.ccode == "PSE", :].reset_index(drop=True)
+            pse_df = pse_df.groupby(["ccode", "year"]).sum()[["pop"]].reset_index()
+            pse_df["country"] = "Palestine"
+            pse_df["wfb_year"] = yr
+            pop_df = pd.concat(
+                [pse_df, pop_df.loc[pop_df.ccode != "PSE", :].copy()], axis=0
+            ).reset_index(drop=True)
+        if "PSE" in gdp_df.ccode.values:
+            pse_df = gdp_df.loc[gdp_df.ccode == "PSE", :].reset_index(drop=True)
+            pse_df = (
+                pse_df.groupby(["ccode", "year", "usd_year"])
+                .sum()[["gdp"]]
+                .reset_index()
+            )
+            pse_df["country"] = "Palestine"
+            pse_df["wfb_year"] = yr
+            gdp_df = pd.concat(
+                [pse_df, gdp_df.loc[gdp_df.ccode != "PSE", :].copy()], axis=0
+            ).reset_index(drop=True)
+        if "PSE" in gdppc_df.ccode.values:
+            # keeping only the ccode-year-usd_year groups with a single observation
+            pse_df = gdppc_df.loc[gdppc_df.ccode == "PSE", :].reset_index(drop=True)
+            pse_df["counter"] = 1
+            pse_counter = (
+                pse_df.groupby(["ccode", "year", "usd_year"])
+                .sum()[["counter"]]
.reset_index() + ) + pse_df.drop(["counter"], axis=1, inplace=True) + pse_df = pse_df.merge( + pse_counter, on=["ccode", "year", "usd_year"], how="left" + ) + pse_df = pse_df.loc[ + pse_df.counter == 1, ["ccode", "year", "gdppc", "usd_year", "wfb_year"] + ] + pse_df["country"] = "Palestine" + gdppc_df = pd.concat( + [pse_df, gdppc_df.loc[gdppc_df.ccode != "PSE", :].copy()], axis=0 + ).reset_index(drop=True) + + cia_pop_gather.append(pop_df) + cia_gdp_gather.append(gdp_df) + cia_gdppc_gather.append(gdppc_df) + + return cia_pop_gather, cia_gdp_gather, cia_gdppc_gather + + +def wfb_merge_year_by_year(df_old, df_new, varname="gdp"): + """Based on the version year (column `wfb_year`), updates the information of + dataset `df_old` with that of dataset `df_new`, and also merges any information that + is newly introduced in `df_new`. + + Parameters + ---------- + df_old : pandas.DataFrame + contains older data, whose information is from CIA WFB versions that are older + than of `df_new` + df_new : pandas.DataFrame + contains newer data, whose information is from CIA WFB version that is newer + than any version present in `df_new` + varname : str + variable name to aggregate for, can be either `gdp`, `gdppc`, or `pop` + + Returns + ------- + pandas.DataFrame + cleaned data containing the updated and newer data using versions of CIA WFB + contained in both `df_old` and `df_new`. + + """ + + msg = "Only able to clean 'gdp', 'gdppc' or 'pop'" + assert varname in ["gdp", "gdppc", "pop"], msg + + msg = "`df_old` should be older than `df_new`; check the columns `wfb_year`" + old_ver = df_old.wfb_year.max() + new_ver = df_new.wfb_year.unique()[0] + assert old_ver < new_ver, msg + + col_rename = {varname: varname + "_old", "wfb_year": "wfb_year_old"} + col_select = ["ccode", "year", "wfb_year", varname] + if varname != "pop": + col_rename["usd_year"] = "usd_year_old" + col_select.append("usd_year") + if "usd_year" not in df_new.columns: + df_new["usd_year"] = df_new["year"] + + df_old_merge = ( + df_old.loc[~pd.isnull(df_old.ccode), col_select] + .rename(columns=col_rename) + .set_index(["ccode", "year"]) + ) + merged_df = df_old_merge.merge( + df_new.loc[~pd.isnull(df_new.ccode), col_select].set_index(["ccode", "year"]), + left_index=True, + right_index=True, + how="outer", + ) + + merged_df.loc[pd.isnull(merged_df[varname]), "wfb_year"] = merged_df.loc[ + pd.isnull(merged_df[varname]), "wfb_year_old" + ].values + merged_df.loc[pd.isnull(merged_df[varname]), varname] = merged_df.loc[ + pd.isnull(merged_df[varname]), varname + "_old" + ].values + if varname != "pop": + merged_df.loc[pd.isnull(merged_df["usd_year"]), "usd_year"] = merged_df.loc[ + pd.isnull(merged_df["usd_year"]), "usd_year_old" + ].values + + return merged_df[col_select[2:]].sort_index().reset_index() diff --git a/sliiders/country_level_ypk.py b/sliiders/country_level_ypk.py index 3d0503a..bb33d85 100644 --- a/sliiders/country_level_ypk.py +++ b/sliiders/country_level_ypk.py @@ -1,4 +1,7 @@ -# various functions used for the country-level information workflow +""" +various functions used for the country-level information workflow in +`notebooks/create-SLIIDERS-ECON/country_level_ypk` +""" from itertools import product as lstprod import matplotlib.pyplot as plt diff --git a/sliiders/settings.py b/sliiders/settings.py index 286759d..ded488d 100644 --- a/sliiders/settings.py +++ b/sliiders/settings.py @@ -6,8 +6,9 @@ from .gcs import FS, fuse_to_gcsmap # Versions -GLOBAL_PROTECTED_AREAS_VERS = "v0.2" -LEVEES_VERS = "v0.2" 
+GLOBAL_PROTECTED_AREAS_VERS = "v0.1" +US_PROTECTED_AREAS_VERS = "v0.1" +LEVEES_VERS = "v0.1" GPW_VERS = "v4rev11" LANDSCAN_YEAR = "2019" LANDSCAN_VERS = f"LandScan Global {LANDSCAN_YEAR}" @@ -30,6 +31,7 @@ IMF_WEO_VERS = "October_2021" UN_WPP_VERS = "2019" IIASA_PROJECTIONS_DOWNLOAD_VERS = "2018" +CIA_WFB_VERS = "20220408" # Definitions SPATIAL_WARNINGS_TO_IGNORE = [ @@ -415,17 +417,7 @@ HIGHEST_WITHELEV_EXPOSURE_METERS = 20 ELEV_CAP = HIGHEST_WITHELEV_EXPOSURE_METERS + 1 # "higher than coastal" value -## Spatial - -# Area, in "square degrees", above which we will consider endorheic basins as protected areas -# N.B. this is an arbitrary choice (something more robust could use something like a bathtub model -# over a highly resolved elevation grid). -MIN_BASIN_TILE_DEGREE_AREA = 20.0 - -# minimum distance in degrees from the ocean to include an endorheic basin as -# a "protected area" -ENDORHEIC_BASIN_OCEAN_BUFFER = 0.2 - +# Spatial MAX_VORONOI_COMPLEXITY = ( 40e6 # Maximum number of initial points in shapefile when generating Voronoi ) @@ -467,8 +459,7 @@ DIR_IFILES_INT = DIR_SLR_INT / "ifiles" PATH_SLR_N_GCMS = fuse_to_gcsmap(DIR_SLR_INT / f"numGCMs_{SLIIDERS_VERS}.zarr", FS) -DIR_GEOG_RAW = DIR_DATA_RAW / "geography" -DIR_GEOG_INT = DIR_DATA_INT / "geography" +DIR_GEOGRAPHY_INT = DIR_DATA_INT / "geography" PATH_CIAM_2016 = fuse_to_gcsmap( DIR_DATA_RAW / "CIAM_2016" / "diaz2016_inputs_raw.zarr", FS @@ -481,13 +472,13 @@ DIR_RESULTS / f"sliiders-slr-{SLIIDERS_VERS}.zarr", FS ) -PATH_SEG_CENTROIDS = DIR_GEOG_INT / "gtsm_stations_thinned_ciam" +PATH_SEG_CENTROIDS = DIR_GEOGRAPHY_INT / "gtsm_stations_thinned_ciam" -PATH_CIAM_COASTLINES = DIR_GEOG_INT / "ne_coastline_lines_CIAM_wexp_or_gtsm" +PATH_CIAM_COASTLINES = DIR_GEOGRAPHY_INT / "ne_coastline_lines_CIAM_wexp_or_gtsm" -DIR_GTSM_STATIONS_TOTHIN = DIR_GEOG_RAW / "gtsm_stations_eur_tothin" - -DIR_CIAM_VORONOI = DIR_GEOG_INT / "ciam_and_adm1_intersections" / EXPOSURE_BINNED_VERS +DIR_CIAM_VORONOI = ( + DIR_GEOGRAPHY_INT / "ciam_and_adm1_intersections" / EXPOSURE_BINNED_VERS +) PATH_CIAM_ADM1_VORONOI_INTERSECTIONS = ( DIR_CIAM_VORONOI / "ciam_and_adm1_intersections.parquet" ) @@ -513,10 +504,7 @@ / "LitPop_pc_30arcsec.parquet" ) -PATH_NATURALEARTH_OCEAN = DIR_SHAPEFILES / "natural_earth" / "ne_10m_ocean" -DIR_HYDROBASINS_RAW = DIR_DATA_RAW / "hydrosheds" / "hydrobasins" - -DIR_GLOBAL_PROTECTED_AREAS = ( +DIR_GLOBAL_PROTECTED_AREAS = Path( DIR_EXPOSURE_INT / "protected_locations" / "global" @@ -524,17 +512,6 @@ / GLOBAL_PROTECTED_AREAS_VERS ) -PATH_US_MANUAL_PROTECTED_AREAS = ( - DIR_EXPOSURE_RAW - / "protected_areas" - / "usa" - / "manual" - / "us_manual_protected_areas.parquet" -) - -PATH_MANUAL_PROTECTED_AREAS = ( - DIR_GLOBAL_PROTECTED_AREAS / "manual_global_basins.parquet" -) PATH_GLOBAL_PROTECTED_AREAS = DIR_GLOBAL_PROTECTED_AREAS / "all_protected_areas.parquet" DIR_WETLANDS_RAW = DIR_DATA_RAW / "wetlands_mangroves" @@ -594,14 +571,9 @@ DIR_EXPOSURE_BINNED / EXPOSURE_BINNED_VERS / "binned_exposure_withelev_base.parquet" ) -DIR_GEOG_DATUMS_RAW = DIR_GEOG_RAW / "datum_conversions" +DIR_GEOG_INT = DIR_DATA_INT / "geography" DIR_GEOG_DATUMS_INT = DIR_GEOG_INT / "datum_conversions" -DIR_GEOG_DATUMS_EGM96_WGS84 = DIR_GEOG_DATUMS_RAW / "egm96" -DIR_GEOG_DATUMS_XGM2019e_WGS84 = DIR_GEOG_DATUMS_RAW / "xgm2019e" - -PATH_GEOG_MDT_RAW = DIR_GEOG_RAW / "mdt" / "aviso_2018" / "mdt_cnes_cls18_global.nc" - PATH_GEOG_DATUMS_GRID = fuse_to_gcsmap( DIR_GEOG_DATUMS_INT / f"datum_conversions_gridded_{DATUM_CONVERSION_VERS}.zarr", FS ) @@ -623,6 +595,10 @@ 
DIR_YPK_FINAL / "gdp_gdppc_pop_capital_proj_2010_2100.parquet" ) +DIR_CIA_RAW = DIR_YPK_RAW / "cia_wfb" +PATH_CIA_INT = ( + DIR_YPK_INT / "cia_wfb" / CIA_WFB_VERS / "cia_wfb_constant_2017_ppp_usd.parquet" +) DIR_UN_AMA_RAW = DIR_YPK_RAW / "un_ama" / UN_AMA_DATESTAMP DIR_UN_WPP_RAW = DIR_YPK_RAW / "un_wpp" / UN_WPP_VERS DIR_WB_WDI_RAW = DIR_YPK_RAW / "wb_wdi" / WB_WDI_DATESTAMP From 3bdc4825a9d1b2e1a6f67f3a4dddce15ffb590ad Mon Sep 17 00:00:00 2001 From: Junho Choi Date: Fri, 8 Apr 2022 23:50:16 +0000 Subject: [PATCH 2/3] Notebooks updated for cleaning and incorporating newer CIA WFB --- .../country_level_ypk/ypk1_prep_clean.ipynb | 702 ++++++------------ .../ypk2_reorg_and_impute_ypk.ipynb | 393 +--------- 2 files changed, 249 insertions(+), 846 deletions(-) diff --git a/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk1_prep_clean.ipynb b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk1_prep_clean.ipynb index 0894701..f49226c 100644 --- a/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk1_prep_clean.ipynb +++ b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk1_prep_clean.ipynb @@ -42,9 +42,13 @@ "from py7zr import unpack_7zarchive\n", "from tqdm.auto import tqdm\n", "\n", - "from sliiders import country_level_ypk as ypk_fn\n", + "from sliiders.country_level_ypk import ppp_conversion_specific_year\n", "from sliiders import settings as sset\n", "from sliiders import spatial\n", + "from sliiders.cia_wfb_clean import (\n", + " organize_gather_cia_wfb_2000_2020,\n", + " wfb_merge_year_by_year,\n", + ")\n", "\n", "# dask gateway setup\n", "gateway = Gateway()\n", @@ -59,7 +63,8 @@ "outputs": [], "source": [ "# creating necessary directory\n", - "sset.DIR_YPK_INT.mkdir(parents=True, exist_ok=True)" + "sset.DIR_YPK_INT.mkdir(exist_ok=True, parents=True)\n", + "sset.PATH_CIA_INT.parent.mkdir(exist_ok=True, parents=True)" ] }, { @@ -251,20 +256,20 @@ "metadata": {}, "outputs": [], "source": [ - "# attaching country codes; first import un_pop information\n", + "## attaching country codes; first import un_pop information\n", "by_age = pd.read_csv(sset.DIR_UN_WPP_RAW / \"UN_WPP2019_Population_by_Age.csv\")\n", "\n", - "# attaching the country codes\n", + "## attaching the country codes\n", "un_df_dic = dict(zip(un_df.Location, un_df.index.get_level_values(\"ccode\")))\n", "by_age[\"ccode\"] = by_age.Location.map(un_df_dic)\n", "\n", - "# double checking if any are missing country codes\n", + "## double checking if any are missing country codes\n", "print(\"The missing-ccode rows are:\", by_age[pd.isnull(by_age.ccode)].shape[0])\n", "\n", - "# saving the ccodes as indices\n", + "## saving the ccodes as indices\n", "by_age.set_index([\"ccode\"], inplace=True)\n", "\n", - "# exporting\n", + "## exporting\n", "by_age.to_parquet(sset.DIR_YPK_INT / \"un_population_by_age.parquet\")" ] }, @@ -396,570 +401,312 @@ }, { "cell_type": "markdown", - "id": "32e2ac64", + "id": "0bd0ad1d-0a6b-4c62-8855-8abdada0d747", "metadata": {}, "source": [ - "## CIA World Factbook: gathering GDP PPP terms\n", - "\n", - "The information gathered through sources such as PWT, World Bank WDI, and OECD Regional data often lack GDP information about many of the smaller or disputed countries and territories. In order to account for these countries, we incorporate data from CIA World Factbook dataset which has not much year-to-year information but has more countries covered.\n", + "## CIA World Factbook organization\n", "\n", - "### Unzipping and organizing the files\n", + "Here, the following are carried out:\n", + "1. 
Clean each yearly version into `pandas.DataFrame` format\n", + "2. Attach ISO-3166 alpha-3 codes for easier merging\n", + "3. Merge the different versions into one dataset; update older data with newer data whenever possible\n", + "4. For GDP and GDP per capita, make sure that they are in constant 2017 PPP USD terms, as the raw dataset has varying PPP USD years\n", "\n", - "Note that the cell directly below needs to be run **only once** since it is basically unzipping the `.7z` zip file and may take a long time to repeat over again." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c92fa2c", - "metadata": {}, - "outputs": [], - "source": [ - "# unzipping: this may take a long time\n", - "CIA_DIR, zip_file_name = sset.DIR_YPK_RAW, \"weekly_json.7z\"\n", - "shutil.register_unpack_format(\"7zip\", [\".7z\"], unpack_7zarchive)\n", - "shutil.unpack_archive(CIA_DIR / zip_file_name, CIA_DIR)" + "### Cleaning the yearly versions and attaching country codes" ] }, { "cell_type": "code", "execution_count": null, - "id": "20797d4d", + "id": "00cb5c92-703f-4659-b19c-baec2326c046", "metadata": {}, "outputs": [], "source": [ - "# ordering them by time (Earlier entries first)\n", - "CIA_DIR_week = sset.DIR_YPK_RAW / \"weekly_json\"\n", - "file_lst = np.sort(list(CIA_DIR_week.glob(\"*\")))" + "# yearly versions, 2000 to 2020\n", + "cia_wfb_pop, cia_wfb_gdp, cia_wfb_gdppc = organize_gather_cia_wfb_2000_2020()" ] }, { "cell_type": "markdown", - "id": "2df988e2", + "id": "2cee6533-dd88-4a56-87da-b26632cb2c6f", "metadata": {}, "source": [ - "### Fetch necessary information from the individual `.json` files" + "### Merge the yearly datasets, updating the previous version with a newer version" ] }, { "cell_type": "code", "execution_count": null, - "id": "4132539f", + "id": "69f7432a-b605-484a-ac71-5a4b7fcdc7fa", "metadata": {}, "outputs": [], "source": [ - "def file_gdp_fetcher(filename):\n", - " \"\"\"From weekly-scraped CIA World Factbook data (in json format), gather relevant GDP\n", - " information and save as a dictionary.\n", - "\n", - " Parameters\n", - " ----------\n", - " filename : Path-like or str\n", - " individual weekly-scraped CIA World Factbook data file path\n", - "\n", - " overall_dict : dict\n", - " information (in dictionary format) containing the countries' GDP information\n", - " (in purchasing power parity) and for which year(s) those information is provided\n", - "\n", - " \"\"\"\n", - "\n", - " with open(filename) as fp:\n", - " data = json.load(fp)\n", - " ctries = list(data[\"countries\"].keys())\n", - " ctries.sort()\n", - "\n", - " note_phrase_1 = \"data are in \"\n", - " note_phrase_2 = \" dollars\"\n", - " note_phrase_3 = \" us dollars\"\n", - "\n", - " overall_dict = dict([])\n", - " for c in ctries:\n", - "\n", - " try:\n", - " info = data[\"countries\"][c][\"data\"][\"economy\"][\"gdp\"]\n", - " info = info[\"purchasing_power_parity\"]\n", - " note = info.get(\"note\")\n", - "\n", - " base_yr = None\n", - " if note is not None:\n", - " note = note.lower()\n", - " if (note_phrase_1 in note) and (note_phrase_3 in note):\n", - " note_ = note.split(note_phrase_1)[1]\n", - " note_ = note_.split(note_phrase_3)[0]\n", - " base_yr = int(note_[0:4])\n", - " elif (note_phrase_1 in note) and (note_phrase_2 in note):\n", - " note_ = note.split(note_phrase_1)[1]\n", - " note_ = note_.split(note_phrase_2)[0]\n", - " base_yr = int(note_[0:4])\n", - " info_values = info.get(\"annual_values\")\n", - " if (info_values is not None) and (type(info_values) in [tuple, 
list]):\n", - " keys = []\n", - " values = []\n", - " for i in info_values:\n", - " keys.append(int(i[\"date\"]))\n", - " values.append((i[\"value\"], int(i[\"date\"])))\n", - " if base_yr is not None:\n", - " values = [(x[0], base_yr) for x in values]\n", - " yr_dict = dict(zip(keys, values))\n", - " overall_dict[c] = yr_dict\n", - "\n", - " except KeyError:\n", - " continue\n", - "\n", - " return overall_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e8e681f", - "metadata": {}, - "outputs": [], - "source": [ - "# individual results of the file_gdp_fetcher function stored in a list\n", - "lst_results = []\n", - "for f in tqdm(file_lst):\n", - " lst_results.append(file_gdp_fetcher(f))" + "# merging year-by-year, separately for GDP, GDPpc, and population\n", + "for i, gdp_df in enumerate(cia_wfb_gdp):\n", + " pop_df = cia_wfb_pop[i]\n", + " gdppc_df = cia_wfb_gdppc[i]\n", + " if i == 0:\n", + " updated_gdp_df = gdp_df.copy()\n", + " updated_gdppc_df = gdppc_df.copy()\n", + " updated_pop_df = pop_df.copy()\n", + " else:\n", + " updated_gdp_df = wfb_merge_year_by_year(updated_gdp_df, gdp_df, \"gdp\")\n", + " updated_gdppc_df = wfb_merge_year_by_year(updated_gdppc_df, gdppc_df, \"gdppc\")\n", + " updated_pop_df = wfb_merge_year_by_year(updated_pop_df, pop_df, \"pop\")\n", + "\n", + "# we will only use positively-valued GDP and GDPpc datasets\n", + "updated_gdppc_df = updated_gdppc_df.loc[updated_gdppc_df.gdppc > 0, :].reset_index()" ] }, { "cell_type": "markdown", - "id": "c5d2d1e9", + "id": "c9684469-1868-4fda-b989-db794fd4787c", "metadata": {}, "source": [ - "### Updating the individual dictionaries with the most recent information" + "### Turning into constant 2017 PPP USD terms for GDP and GDP per capita" ] }, { "cell_type": "code", "execution_count": null, - "id": "9248d602", + "id": "42f0889b-2e52-45ea-b3d0-c13edcd6c1cf", "metadata": {}, "outputs": [], "source": [ - "def update_one_with_two(dict1, dict2):\n", - " \"\"\"For simple updating of dictionaries, from `dict2` onto `dict1` in order to make\n", - " sure that all relevant CIA World Factbook data are gathered\n", + "# fetching the PPP conversion factors\n", + "ppp_to_17 = ppp_conversion_specific_year(2017, to=True, extrap_sim=True)\n", "\n", - " Parameters\n", - " ----------\n", - " dict1 : dict\n", - " dictionary to implement the updates onto\n", - " dict2 : dict\n", - " dictionary to gather new information from\n", + "# We will clean Netherland Antilles (BES+CUW+SXM, as defined in CIA WFB) with\n", + "# Curacao (CUW) PPP conversion rates; CUW has the largest economy, based on GDP\n", + "ppp_neth_antil_17 = ppp_to_17.loc[(\"CUW\", slice(None)), :].reset_index()\n", + "ppp_neth_antil_17[\"ccode\"] = \"BES+CUW+SXM\"\n", + "ppp_neth_antil_17.set_index([\"ccode\", \"year\"], inplace=True)\n", + "ppp_neth_antil_17[\"conv_fill\"] = \"copy_from_CUW\"\n", + "ppp_neth_antil_17[\"pl_gdpo_fill\"] = \"copy_from_CUW\"\n", "\n", - " Returns\n", - " -------\n", - " dict_ : dict\n", - " updated dictionary containing the information of both dictionaries\n", + "# We use Montenegro (MNE) PPP conversion rates for SRB+MNE (Serbia and Montenegro)\n", + "# This is arbitrarily chosen, but for the years 1995-2019 there is not much difference\n", + "# in the conversion rates\n", + "ppp_srbmnt_17 = ppp_to_17.loc[(\"MNE\", slice(None)), :].reset_index()\n", + "ppp_srbmnt_17[\"ccode\"] = \"SRB+MNE\"\n", + "ppp_srbmnt_17.set_index([\"ccode\", \"year\"], inplace=True)\n", + "ppp_srbmnt_17[\"conv_fill\"] = \"copy_from_MNE\"\n", + 
"ppp_srbmnt_17[\"pl_gdpo_fill\"] = \"copy_from_MNE\"\n", "\n", - " \"\"\"\n", + "# merging\n", + "ppp_to_17 = pd.concat(\n", + " [ppp_to_17, ppp_neth_antil_17, ppp_srbmnt_17], axis=0\n", + ").sort_index()\n", "\n", - " dict_ = dict(dict1)\n", - " lst1 = list(dict1.keys())\n", - "\n", - " for key in dict2.keys():\n", - " if key not in lst1:\n", - " dict_[key] = dict2[key]\n", - " continue\n", - "\n", - " subdict = dict2[key]\n", - " subkeys = list(subdict.keys())\n", - " for subkey in subkeys:\n", - " dict_[key][subkey] = subdict[subkey]\n", - "\n", - " return dict_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6c6aa475", - "metadata": {}, - "outputs": [], - "source": [ - "i = 0\n", - "for res in tqdm(lst_results[1:]):\n", - " if i == 0:\n", - " midres = update_one_with_two(lst_results[0], res)\n", - " else:\n", - " midres = update_one_with_two(midres, res)\n", - " i += 1" - ] - }, - { - "cell_type": "markdown", - "id": "fe86819a", - "metadata": {}, - "source": [ - "### Saving into a long-panel format dataframe" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23ddf2f1", - "metadata": {}, - "outputs": [], - "source": [ - "ctry_dfs = []\n", - "for i in midres.keys():\n", - " info = midres[i]\n", - " i_k = list(info.keys())\n", - " i_v = [info[i_k_] for i_k_ in i_k]\n", - " ctry_info = [[i, i_k[l]] + list(i_v[l]) for l in range(len(i_k))]\n", - " ctry_df = pd.DataFrame(ctry_info, columns=[\"country\", \"year\", \"gdp\", \"ppp_year\"])\n", - " ctry_dfs.append(ctry_df)\n", - "ctry_agg_df = pd.concat(ctry_dfs, axis=0)\n", - "ctry_agg_df[\"country\"] = [x.replace(\"_\", \" \") for x in ctry_agg_df[\"country\"]]\n", - "ctry_agg_df.set_index([\"country\", \"year\"], inplace=True)\n", - "ctry_agg_df.sort_index(inplace=True)" - ] - }, - { - "cell_type": "markdown", - "id": "dabdcdca", - "metadata": {}, - "source": [ - "### Assigning countrycodes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56b9a0fb", - "metadata": {}, - "outputs": [], - "source": [ - "# let's use the UN populations data, since it should have the most countries\n", - "# to match names with values\n", - "un_loc = sset.DIR_YPK_INT\n", - "unpop = pd.read_parquet(un_loc / \"un_population.parquet\").reset_index()\n", - "\n", - "unpop[\"Location_lower\"] = [x.lower() for x in unpop.Location]\n", - "initial_cleanup = dict(zip(unpop.Location_lower, unpop.ccode))\n", - "\n", - "## attaching the cleaned countrycodes\n", - "initial_df = [list(initial_cleanup.keys()), list(initial_cleanup.values())]\n", - "initial_df = pd.DataFrame(\n", - " np.array(initial_df).T, columns=[\"country\", \"ccode\"]\n", - ").set_index([\"country\"])\n", - "ctry_agg_df = ctry_agg_df.merge(\n", - " initial_df, left_index=True, right_index=True, how=\"left\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c2eeaea", - "metadata": {}, - "outputs": [], - "source": [ - "## checking which didn't get country codes\n", - "cia_ccodes_only = ctry_agg_df.reset_index()[[\"country\", \"ccode\"]].drop_duplicates()\n", - "unknown_case = []\n", - "for i, case in enumerate(cia_ccodes_only[\"ccode\"]):\n", - " if pd.isnull(case):\n", - " unknown_case.append(cia_ccodes_only[\"country\"].values[i])\n", - "unknown_case = np.sort(np.unique(unknown_case))\n", - "print(unknown_case)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7ab6757", - "metadata": {}, - "outputs": [], - "source": [ - "# manual cleanup\n", - "unknown_case_ccodes = [\"BHS\", \"BOL\", 
\"BRN\", \"MMR\", \"CPV\", \"COD\", \"COG\", \"CIV\", \"CUW\"]\n", - "unknown_case_ccodes += [\"CZE\", \"TLS\", \"-\", \"FLK\", \"GMB\", \"-\", \"GGY\", \"GNB\", \"HKG\"]\n", - "unknown_case_ccodes += [\"IRN\", \"JEY\", \"PRK\", \"KOR\", \"KO-\", \"LAO\", \"MAC\", \"MKD\", \"FSM\"]\n", - "unknown_case_ccodes += [\"MDA\", \"-\", \"RUS\", \"SHN\", \"MAF\", \"SXM\", \"SWZ\", \"SYR\", \"TWN\"]\n", - "unknown_case_ccodes += [\"TZA\", \"TLS\", \"USA\", \"VEN\", \"VNM\", \"VIR\", \"WLF\", \"-\"]\n", - "\n", - "# double-checking the names' lengths\n", - "print(len(unknown_case) == len(unknown_case_ccodes))\n", - "\n", - "# getting a dataframe\n", - "update_df = pd.DataFrame(data={\"country\": unknown_case, \"ccode2\": unknown_case_ccodes})\n", - "update_df.set_index([\"country\"], inplace=True)\n", - "ctry_agg_df = ctry_agg_df.merge(\n", - " update_df, left_index=True, right_index=True, how=\"left\"\n", + "# checking the country codes that are not in `ppp_to_17`\n", + "print()\n", + "print(\n", + " \"Missing from the PPP conversion table:\\n\",\n", + " np.setdiff1d(\n", + " np.union1d(\n", + " updated_gdp_df[\"ccode\"].unique(), updated_gdppc_df[\"ccode\"].unique()\n", + " ),\n", + " ppp_to_17.index.get_level_values(\"ccode\").unique(),\n", + " ),\n", ")\n", - "ctry_agg_df.loc[pd.isnull(ctry_agg_df.ccode), \"ccode\"] = ctry_agg_df.loc[\n", - " pd.isnull(ctry_agg_df.ccode), \"ccode2\"\n", - "].values" - ] - }, - { - "cell_type": "markdown", - "id": "6631b61c", - "metadata": {}, - "source": [ - "### Fetching the PPP conversion rates (to constant 2017 PPP USD), and applying the conversion rates\n", "\n", - "Also, turn it into millions of USD (currently in ones of USD)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a08c1ebf", - "metadata": {}, - "outputs": [], - "source": [ - "ppp_to_17 = ypk_fn.ppp_conversion_specific_year(2017, to=True, extrap_sim=True)" + "# changing the 'year' index to be named 'usd_year'\n", + "ppp_to_17 = (\n", + " ppp_to_17.reset_index()\n", + " .rename(columns={\"year\": \"usd_year\"})\n", + " .set_index([\"ccode\", \"usd_year\"])\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "cf2edb1f", + "id": "8ad1af34-c5e0-487b-8635-f82c7d347f37", "metadata": {}, "outputs": [], "source": [ - "# neutral assumption when conversion rates are missing\n", - "ctry_agg_df = (\n", - " ctry_agg_df.reset_index()\n", - " .set_index([\"ccode\", \"year\"])\n", - " .drop([\"ccode2\"], axis=1)\n", - " .merge(ppp_to_17, left_index=True, right_index=True, how=\"left\")\n", + "# fetching the USD GDP deflators\n", + "defla_to_17 = (\n", + " pd.read_excel(sset.PATH_PWT_RAW)\n", + " .rename(columns={\"year\": \"usd_year\"})\n", + " .set_index([\"countrycode\", \"usd_year\"])\n", ")\n", - "ctry_agg_df.loc[pd.isnull(ctry_agg_df.conv), \"conv\"] = 1\n", - "\n", - "# first, divide by 1000000\n", - "ctry_agg_df[\"gdp\"] = ctry_agg_df[\"gdp\"] / 1000000\n", - "\n", - "# applying the conversion by multiplying\n", - "ctry_agg_df[\"gdp_ppp2017_currUSD\"] = ctry_agg_df[\"gdp\"] * ctry_agg_df[\"conv\"]" - ] - }, - { - "cell_type": "markdown", - "id": "6ba4de36", - "metadata": {}, - "source": [ - "### Attaching the US deflators and generating constant 2017 PPP USD values\n", - "\n", - "Note that while they are now in PPP of 2017, they are yet to be turned into constant 2017 PPP (since they are in current USD, for many). Therefore, we will need to fetch the US deflators (using `pl_gdpo` from PWT)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79e9dc22", - "metadata": {}, - "outputs": [], - "source": [ - "pwt = pd.read_excel(sset.PATH_PWT_RAW).rename(columns={\"countrycode\": \"ccode\"})\n", - "pwt.set_index([\"ccode\", \"year\"], inplace=True)\n", - "\n", - "us_defla = (\n", - " pwt.loc[\"USA\", [\"pl_gdpo\"]]\n", + "defla_to_17 = (\n", + " defla_to_17.loc[([\"USA\"], slice(None)), [\"pl_gdpo\"]]\n", " .reset_index()\n", - " .rename(columns={\"pl_gdpo\": \"pl_usa\", \"year\": \"ppp_year\"})\n", - ")\n", - "ctry_agg_df = (\n", - " ctry_agg_df.reset_index()\n", - " .merge(us_defla, on=[\"ppp_year\"], how=\"left\")\n", - " .set_index([\"ccode\", \"year\"])\n", + " .drop([\"countrycode\"], axis=1)\n", + " .set_index([\"usd_year\"])\n", ")\n", + "defla_to_17[\"gdp_defla\"] = defla_to_17.loc[2017, \"pl_gdpo\"] / defla_to_17[\"pl_gdpo\"]\n", + "defla_to_17.drop([\"pl_gdpo\"], axis=1, inplace=True)\n", "\n", - "# generating constant 2017 ppp\n", - "ctry_agg_df[\"gdp_constant2017ppp\"] = (\n", - " ctry_agg_df[\"gdp_ppp2017_currUSD\"] / ctry_agg_df[\"pl_usa\"]\n", - ")\n", - "\n", - "ctry_agg_df_reorg = ctry_agg_df[[\"gdp_constant2017ppp\", \"country\"]].sort_index()" - ] - }, - { - "cell_type": "markdown", - "id": "81fed947", - "metadata": {}, - "source": [ - "### Checking for redundancies in country (ISO) codes\n", - "\n", - "Except when there aren't any country-codes attached, these redundancies are occurring because there have been changes to the countries' names over the years or there are multiple names for one country. We will use the following rule to remove some of the overlaps:\n", - "- SHN: Take only `saint helena ascension and tristan da cunha`\n", - "- CZE: For 2006-2012, use `czech republic` information; for 2013 and onwards, use `czechia` information.\n", - "- MKD: For 2006-2014, use `macedonia` information; for 2015 and onwards, use `north macedonia` information.\n", - "- SWZ: For 2006-2014, use `swaziland` information; for 2015 and onwards, use `eswatini` information.\n", - "- CPV: For 2006-2011, use `cape verde` information; for 2012 and onwards, use `cabo verde` information.\n", - "- TLS: Take only `timor leste`." 
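The `gdp_defla` factor constructed in the cell above is simply the ratio of the 2017 US output price level (`pl_gdpo` from PWT) to the reporting year's price level, so multiplying a nominal figure by it restates that figure in 2017 USD before any PPP conversion is applied. A minimal sketch of the arithmetic, with made-up price levels (real values come from `sset.PATH_PWT_RAW`):

```python
# Illustrative sketch of the deflation step; the price levels below are
# hypothetical stand-ins for PWT's `pl_gdpo` series for the USA.
import pandas as pd

pl_gdpo = pd.Series({2009: 0.85, 2017: 1.00})  # hypothetical US price levels
gdp_defla = pl_gdpo.loc[2017] / pl_gdpo        # deflator to 2017 USD

gdp_2009_usd = 31.1                            # a figure reported in 2009 USD
print(round(gdp_2009_usd * gdp_defla.loc[2009], 1))  # ~36.6 in 2017 USD
```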
+ "# merging with the PPP conversion rates\n", + "ppp_to_17 = ppp_to_17.merge(defla_to_17, left_index=True, right_index=True, how=\"left\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "37ee6478", + "id": "f2b3b5b5-0616-49a7-9064-1b08e16d8ed4", "metadata": {}, "outputs": [], "source": [ - "reorg_ccodes = ctry_agg_df_reorg.reset_index()[[\"ccode\", \"country\"]].drop_duplicates()\n", - "reorg_ccodes.set_index([\"ccode\"], inplace=True)\n", - "for i, ccode in enumerate(np.unique(reorg_ccodes.index.values)):\n", - " countrycases = reorg_ccodes.loc[ccode, \"country\"]\n", - " if (ccode != \"-\") and (type(countrycases) != str):\n", - " print(ccode, countrycases.values)" + "# we manually check if USD year terms agree with one another; if they don't, we check\n", + "# the WFB versions and use the available USD years (some are assumed from their years)\n", + "check_usd_year = updated_gdppc_df.set_index([\"ccode\", \"year\"]).merge(\n", + " updated_gdp_df.set_index([\"ccode\", \"year\"]),\n", + " how=\"outer\",\n", + " left_index=True,\n", + " right_index=True,\n", + ")\n", + "check_usd_year = check_usd_year.loc[\n", + " (check_usd_year.usd_year_y != check_usd_year.usd_year_x)\n", + " & ~pd.isnull(check_usd_year.usd_year_y)\n", + " & ~pd.isnull(check_usd_year.usd_year_x)\n", + "]\n", + "\n", + "print(\n", + " \"Manually check the following countries:\\n\",\n", + " check_usd_year.index.get_level_values(\"ccode\").unique().values,\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "ab54b382", + "id": "71afec1f-7077-4510-aa67-d284338ad4cd", "metadata": {}, "outputs": [], "source": [ - "redundant_ones = [\"SHN\", \"CZE\", \"MKD\", \"SWZ\", \"CPV\", \"TLS\"]\n", - "ctry_agg_df_redun = ctry_agg_df_reorg.reset_index()\n", + "# manual cleansing for USD years\n", + "take_usd_year_from_gdp = [\n", + " (\"AND\", [2010, 2011, 2013, 2014, 2015]),\n", + " (\"ASM\", [2014, 2015]),\n", + " (\"GGY\", [2014]),\n", + " (\"GNQ\", [2011, 2012]),\n", + " (\"GRL\", [2013, 2014]),\n", + " (\"JEY\", [2015]),\n", + " (\"MAC\", [2006, 2008, 2014, 2016]),\n", + " (\"MCO\", [2006, 2009, 2011, 2013, 2014]),\n", + " (\"MHL\", [2008]),\n", + " (\"MNP\", [2014, 2015, 2016]),\n", + " (\"PLW\", [2008]),\n", + " (\"PSE\", [2012, 2013]),\n", + " (\"SOM\", [2013, 2009, 2008]),\n", + " (\"SSD\", [2010]),\n", + " (\"TUV\", [2010]),\n", + " (\"VIR\", [2011, 2012, 2014, 2015, 2016]),\n", + "]\n", "\n", - "ctry_shn = ctry_agg_df_redun.loc[\n", - " ctry_agg_df_redun.country == \"saint helena ascension and tristan da cunha\"\n", - "].set_index([\"ccode\", \"year\"])\n", + "take_usd_year_from_gdppc = [\n", + " ([\"FSM\", \"NRU\", \"PLW\"], 2013),\n", + "]\n", "\n", - "ctry_cze = ctry_agg_df_redun.loc[\n", - " ((ctry_agg_df_redun.country == \"czechia\") & (ctry_agg_df_redun.year >= 2013))\n", - " | (\n", - " (ctry_agg_df_redun.country == \"czech republic\")\n", - " & (ctry_agg_df_redun.year <= 2012)\n", - " )\n", - "].set_index([\"ccode\", \"year\"])\n", - "\n", - "ctry_mkd = ctry_agg_df_redun[\n", - " ((ctry_agg_df_redun.country == \"macedonia\") & (ctry_agg_df_redun.year <= 2014))\n", - " | (\n", - " (ctry_agg_df_redun.country == \"north macedonia\")\n", - " & (ctry_agg_df_redun.year >= 2015)\n", - " )\n", - "].set_index([\"ccode\", \"year\"])\n", + "updated_gdppc_df.set_index([\"ccode\", \"year\"], inplace=True)\n", + "updated_gdp_df.set_index([\"ccode\", \"year\"], inplace=True)\n", + "for i in take_usd_year_from_gdp:\n", + " updated_gdppc_df.loc[i, \"usd_year\"] = updated_gdp_df.loc[i, 
\"usd_year\"].values\n", "\n", - "ctry_swz = ctry_agg_df_redun[\n", - " ((ctry_agg_df_redun.country == \"swaziland\") & (ctry_agg_df_redun.year <= 2014))\n", - " | ((ctry_agg_df_redun.country == \"eswatini\") & (ctry_agg_df_redun.year >= 2015))\n", - "].set_index([\"ccode\", \"year\"])\n", + "for i in take_usd_year_from_gdppc:\n", + " updated_gdp_df.loc[i, \"usd_year\"] = updated_gdppc_df.loc[i, \"usd_year\"].values\n", "\n", - "ctry_cpv = ctry_agg_df_redun[\n", - " ((ctry_agg_df_redun.country == \"cape verde\") & (ctry_agg_df_redun.year <= 2011))\n", - " | ((ctry_agg_df_redun.country == \"cabo verde\") & (ctry_agg_df_redun.year >= 2012))\n", - "].set_index([\"ccode\", \"year\"])\n", - "\n", - "ctry_tls = ctry_agg_df_redun.loc[\n", - " ctry_agg_df_redun.country == \"timor leste\", :\n", - "].set_index([\"ccode\", \"year\"])" + "updated_gdppc_df.reset_index(inplace=True)\n", + "updated_gdp_df.reset_index(inplace=True)" ] }, { "cell_type": "code", "execution_count": null, - "id": "a7ca3e2f", + "id": "a3418fc2-2650-4fc2-878f-2b6bfab7b23a", "metadata": {}, "outputs": [], "source": [ - "ctry_agg_df_final = ctry_agg_df_reorg[\n", - " ~ctry_agg_df_reorg.index.get_level_values(\"ccode\").isin(\n", - " [\"-\", \"WIDE\"] + redundant_ones\n", - " )\n", - "].copy()\n", - "\n", - "ctry_agg_df_final = pd.concat(\n", - " [ctry_agg_df_final, ctry_shn, ctry_cze, ctry_mkd, ctry_swz, ctry_cpv, ctry_tls],\n", - " axis=0,\n", - ").sort_index()" - ] - }, - { - "cell_type": "markdown", - "id": "633df76b", - "metadata": {}, - "source": [ - "### Adding those that are not in the files\n", + "# GDP per capita; not using index merging due to ccode-usd_year indices not being\n", + "# unique in CIA WFB datasets\n", + "ppp_17_gdppc_df = updated_gdppc_df.merge(\n", + " ppp_to_17.reset_index(), how=\"left\", on=[\"ccode\", \"usd_year\"]\n", + ")\n", + "ppp_17_gdppc_df.loc[\n", + " pd.isnull(ppp_17_gdppc_df.conv), [\"conv_fill\", \"pl_gdpo_fill\"]\n", + "] = \"neutral_assumption\"\n", + "ppp_17_gdppc_df.loc[pd.isnull(ppp_17_gdppc_df.conv), \"conv\"] = 1\n", + "\n", + "# only turning USD values to 2017 USD values, as we aren't too sure about PPP base year\n", + "ppp_17_gdppc_df[\"gdppc_usd_17\"] = ppp_17_gdppc_df[[\"gdppc\", \"gdp_defla\"]].product(\n", + " axis=1\n", + ")\n", "\n", - "**Tokelau `TKL`**\n", + "# assuming PPP year = USD year, turning to constant 2017 PPP USD terms\n", + "ppp_17_gdppc_df[\"rgdpna_pc_17\"] = ppp_17_gdppc_df[[\"conv\", \"gdppc_usd_17\"]].product(\n", + " axis=1\n", + ")\n", "\n", - "According to Tokelau government (link [here](https://www.tokelau.org.nz/Bulletin/April+2017/GDP+first.html)), its PPP USD was 10 million (in 2017). So we will fill this in." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1c760d2", - "metadata": {}, - "outputs": [], - "source": [ - "tkl = pd.DataFrame(\n", - " [\n", - " [\"TKL\", 2017, 10, \"tokelau\"],\n", - " ],\n", - " columns=[\"ccode\", \"year\", \"gdp_constant2017ppp\", \"country\"],\n", - ").set_index([\"ccode\", \"year\"])\n", - "ctry_agg_df_final = pd.concat([ctry_agg_df_final, tkl], axis=0)" + "# similar process for GDP\n", + "ppp_17_gdp_df = updated_gdp_df.merge(\n", + " ppp_to_17.reset_index(), how=\"left\", on=[\"ccode\", \"usd_year\"]\n", + ")\n", + "ppp_17_gdp_df.loc[\n", + " pd.isnull(ppp_17_gdp_df.conv), [\"conv_fill\", \"pl_gdpo_fill\"]\n", + "] = \"neutral_assumption\"\n", + "ppp_17_gdp_df.loc[pd.isnull(ppp_17_gdp_df.conv), \"conv\"] = 1\n", + "ppp_17_gdp_df[\"gdp_usd_17\"] = ppp_17_gdp_df[[\"gdp\", \"gdp_defla\"]].product(axis=1)\n", + "ppp_17_gdp_df[\"rgdpna_17\"] = ppp_17_gdp_df[[\"conv\", \"gdp_usd_17\"]].product(axis=1)" ] }, { "cell_type": "markdown", - "id": "e7b2ec2f", + "id": "6031352d-4d54-479d-9f99-3ae2c73efd7a", "metadata": {}, "source": [ - "**Saint Helena (`SHN`)**\n", - "\n", - "I update the latest values using the CIA World Factbook's January 7, 2021 vintage (link [here](https://www.cia.gov/the-world-factbook/)). For `SHN`, it is said that the 2009 value of GDP (in constant 2009 PPP USD) is 31.1 million, but we do not have the explicit PPP conversion for `SHN`. Since `SHN` is a British territory, `GBR` PPP rates are used." + "### Merging population, GDP, and GDP per capita datasets altogether" ] }, { "cell_type": "code", "execution_count": null, - "id": "ab2c1cbd", + "id": "d3157d12-c0a1-4d7e-8278-3e7da0552769", "metadata": {}, "outputs": [], "source": [ - "shn_rate = ppp_to_17.loc[(\"SHN\", 2009), \"conv\"]\n", - "us_def09 = pwt.loc[(\"USA\", 2009), \"pl_gdpo\"]\n", - "shn = pd.DataFrame(\n", - " [\n", - " [\"SHN\", 2009, shn_rate / us_def09 * 31.1, \"saint helena\"],\n", - " ],\n", - " columns=[\"ccode\", \"year\", \"gdp_constant2017ppp\", \"country\"],\n", - ").set_index([\"ccode\", \"year\"])\n", + "# merging GDP and GDPpc\n", + "gdp_rename = {\n", + " \"usd_year\": \"orig_usd_year_gdp\",\n", + " \"wfb_year\": \"wfb_year_gdp\",\n", + " \"conv_fill\": \"conv_fill_gdp\",\n", + " \"pl_gdpo_fill\": \"pl_gdpo_fill_gdp\",\n", + "}\n", + "gdp_merge_ready = ppp_17_gdp_df.rename(columns=gdp_rename).drop(\n", + " [\"gdp_defla\", \"conv\", \"gdp\"], axis=1\n", + ")\n", "\n", - "ctry_agg_df_final = pd.concat([ctry_agg_df_final, shn], axis=0)" - ] - }, - { - "cell_type": "markdown", - "id": "207fa442", - "metadata": {}, - "source": [ - "**Vatican (`VAT`)**\n", + "gdppc_rename = {\n", + " \"usd_year\": \"orig_usd_year_gdppc\",\n", + " \"wfb_year\": \"wfb_year_gdppc\",\n", + " \"conv_fill\": \"conv_fill_gdppc\",\n", + " \"pl_gdpo_fill\": \"pl_gdpo_fill_gdppc\",\n", + "}\n", + "gdppc_merge_ready = ppp_17_gdppc_df.rename(columns=gdppc_rename).drop(\n", + " [\"gdp_defla\", \"conv\", \"gdppc\"], axis=1\n", + ")\n", "\n", - "While not in the latest CIA World Factbook, the 2000 version has some information about Vatican city (archived [here](https://www.encyclopedia.com/places/spain-portugal-italy-greece-and-balkans/italian-political-geography/vatican-city)) which we will be able to use. It says that the 1999 estimate of the Vatican GDP (assuming it's constant 1999 PPP) was 21 million USD. Let us use the PPP conversion rates of Italy." 
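The per-variable renaming above (e.g., `usd_year` becoming `orig_usd_year_gdp` versus `orig_usd_year_gdppc`) is what lets the outer join in the following cell keep each variable's provenance side by side without `_x`/`_y` collisions, while `how="outer"` retains `(ccode, year)` pairs observed in only one of the panels. A toy sketch of the pattern (invented rows; column names follow the cells above):

```python
# Toy sketch of the suffix-then-outer-join pattern; rows are invented.
import pandas as pd

gdp = pd.DataFrame(
    {"ccode": ["WSM"], "year": [2015], "rgdpna_17": [1.1],
     "orig_usd_year_gdp": [2014]}
).set_index(["ccode", "year"])

gdppc = pd.DataFrame(
    {"ccode": ["NRU"], "year": [2015], "rgdpna_pc_17": [9000.0],
     "orig_usd_year_gdppc": [2015]}
).set_index(["ccode", "year"])

# the outer join keeps both (ccode, year) pairs, with NaN where the other
# variable is unobserved, and no column-name collisions
merged = gdp.merge(gdppc, left_index=True, right_index=True, how="outer")
```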
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "554deb3e", - "metadata": {}, - "outputs": [], - "source": [ - "vat_rate = ppp_to_17.loc[(\"VAT\", 1999), \"conv\"]\n", - "us_def99 = pwt.loc[(\"USA\", 1999), \"pl_gdpo\"]\n", - "vat = pd.DataFrame(\n", - " [\n", - " [\"VAT\", 1999, vat_rate / us_def99 * 21, \"vatican\"],\n", - " ],\n", - " columns=[\"ccode\", \"year\", \"gdp_constant2017ppp\", \"country\"],\n", - ").set_index([\"ccode\", \"year\"])\n", + "gdp_and_gdppc_merge_ready = gdp_merge_ready.set_index([\"ccode\", \"year\"]).merge(\n", + " gdppc_merge_ready.set_index([\"ccode\", \"year\"]),\n", + " left_index=True,\n", + " right_index=True,\n", + " how=\"outer\",\n", + ")\n", "\n", - "ctry_agg_df_final = pd.concat([ctry_agg_df_final, vat], axis=0)" + "# merging GDP + GDPpc with population\n", + "all_merged = (\n", + " updated_pop_df.set_index([\"ccode\", \"year\"])\n", + " .rename(columns={\"wfb_year\": \"wfb_year_pop\"})\n", + " .merge(gdp_and_gdppc_merge_ready, left_index=True, right_index=True, how=\"outer\")\n", + ")" ] }, { "cell_type": "markdown", - "id": "483b6c8c", + "id": "d919582c-848a-40cd-89b8-8badf243b45e", "metadata": {}, "source": [ "### Exporting" @@ -968,15 +715,34 @@ { "cell_type": "code", "execution_count": null, - "id": "2aad9b65", + "id": "32bb2826-a34a-4555-8ee8-61f682cf6f91", "metadata": {}, "outputs": [], "source": [ - "ctry_agg_df_final.sort_index(inplace=True)\n", - "ctry_agg_df_final.rename(columns={\"gdp_constant2017ppp\": \"cia_rgdpna\"}, inplace=True)\n", - "ctry_agg_df_final.to_parquet(\n", - " sset.DIR_YPK_INT / \"cia_wf_gdp_constant_2017_ppp_usd_ver.parquet\"\n", - ")" + "# re-ordering and changing data types for cleaner viewing\n", + "ordering = [\n", + " \"pop\",\n", + " \"gdp_usd_17\",\n", + " \"rgdpna_17\",\n", + " \"gdppc_usd_17\",\n", + " \"rgdpna_pc_17\",\n", + " \"wfb_year_pop\",\n", + " \"wfb_year_gdp\",\n", + " \"wfb_year_gdppc\",\n", + " \"orig_usd_year_gdp\",\n", + " \"orig_usd_year_gdppc\",\n", + " \"conv_fill_gdp\",\n", + " \"conv_fill_gdppc\",\n", + " \"pl_gdpo_fill_gdp\",\n", + " \"pl_gdpo_fill_gdppc\",\n", + "]\n", + "all_merged = all_merged[ordering].reset_index()\n", + "all_merged[\"year\"] = all_merged[\"year\"].astype(\"int64\")\n", + "all_merged.set_index([\"ccode\", \"year\"], inplace=True)\n", + "all_merged.sort_index(inplace=True)\n", + "\n", + "# exporting\n", + "all_merged.to_parquet(sset.PATH_CIA_INT)" ] } ], diff --git a/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk2_reorg_and_impute_ypk.ipynb b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk2_reorg_and_impute_ypk.ipynb index c281a92..120772b 100644 --- a/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk2_reorg_and_impute_ypk.ipynb +++ b/notebooks/create-SLIIDERS-ECON/country_level_ypk/ypk2_reorg_and_impute_ypk.ipynb @@ -41,10 +41,18 @@ { "cell_type": "markdown", "id": "7444df03-7e87-43d1-bf13-bf7f7cd9885e", + "metadata": { + "tags": [] + }, + "source": [ + "## Importing all raw data, and creating a merged, long-panel version" + ] + }, + { + "cell_type": "markdown", + "id": "dacf6537-d3e6-48b0-92ef-4f6a71ced27c", "metadata": {}, "source": [ - "## Importing all raw data, and creating a merged, long-panel version\n", - "\n", "### PWT" ] }, @@ -320,9 +328,11 @@ "metadata": {}, "outputs": [], "source": [ - "cia = pd.read_parquet(sset.DIR_YPK_INT / \"cia_wf_gdp_constant_2017_ppp_usd_ver.parquet\")\n", + "cia = pd.read_parquet(sset.PATH_CIA_INT)[[\"rgdpna_17\", \"rgdpna_pc_17\"]].rename(\n", + " columns={\"rgdpna_17\": \"cia_rgdpna\", 
\"rgdpna_pc_17\": \"cia_rgdpna_pc\"}\n", + ")\n", "gdp_pop_df = gdp_pop_df.merge(\n", - " cia[[\"cia_rgdpna\"]], left_index=True, right_index=True, how=\"outer\"\n", + " cia[[\"cia_rgdpna\", \"cia_rgdpna_pc\"]], left_index=True, right_index=True, how=\"outer\"\n", ")" ] }, @@ -3406,380 +3416,7 @@ }, "widgets": { "application/vnd.jupyter.widget-state+json": { - "state": { - "0b9dcb8e7e194842ae99b1a9baea9759": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_ee7a58fc46dd4b4d9da4f91588307bb2", - "style": "IPY_MODEL_561a97a4b45448a9a8eae441f8e5d0da", - "value": "100%" - } - }, - "1c73fdc454c6491fb61f02cb7ed9fb60": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "1c9166276ee347d39c95faf165bc9450": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_53f1e8cca07a4d38aee57777f84c46e9", - "style": "IPY_MODEL_fdee249ae0a74cd5883c86469c39aab9", - "value": " 58/128 [00:03<00:03, 18.74it/s]" - } - }, - "2219a9c4f88340f88bdac773d3ff4b81": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_0b9dcb8e7e194842ae99b1a9baea9759", - "IPY_MODEL_bfb81f9cf73640fa97dd0610729aff52", - "IPY_MODEL_7898ad926b0d4883bcba893ae7fc4767" - ], - "layout": "IPY_MODEL_fd84190ae61144e09e5218bfe17d1fc7" - } - }, - "28f49eb672ba4d298a6053625cd08683": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "bar_style": "success", - "layout": "IPY_MODEL_93f37f94c1c743138ac2c6ee01c067a8", - "max": 41, - "style": "IPY_MODEL_7f3aaa711aac4b12852fd21f7e39945d", - "value": 41 - } - }, - "2ca5cda929ec4f598887d57499131c4e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "2cf7257329bc449a8f7f2cea82f102e2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "2f2ef046bcef46b8977cc0ec176e5e3d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "3e4ddebc6d8146c5aa104043051a7d9c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_7080a43f19074187898a40c3a20ee40d", - "style": "IPY_MODEL_ca9b28f1e0f14fc8b47d9c8744aed724", - "value": "100%" - } - }, - "3f5b1a2098e44766aa993c6b82496274": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_404dbe7264f44e20bfcf76a2442bc307", - "IPY_MODEL_d472be28fc3f41e9a88461090353c6e7", - "IPY_MODEL_1c9166276ee347d39c95faf165bc9450" - ], - "layout": "IPY_MODEL_9c15bafa92dc45289705416df944586c" - } - }, - "404dbe7264f44e20bfcf76a2442bc307": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_d70e0b24d77f4ae8a376588cb6ed1950", - "style": "IPY_MODEL_7552bca25c9a43a2905ae9fde45c5edb", - "value": " 
45%" - } - }, - "45da96f00ed548cfa6d134939acbadd5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_c21431b349d84789982ea663823146b3", - "style": "IPY_MODEL_8cb6432b2f084c3cb409d6c7beba6632", - "value": "100%" - } - }, - "4d24dc58e4b84e11a720130e59a5cd77": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_45da96f00ed548cfa6d134939acbadd5", - "IPY_MODEL_28f49eb672ba4d298a6053625cd08683", - "IPY_MODEL_f0842f58af714d499f34aedec233b7c9" - ], - "layout": "IPY_MODEL_1c73fdc454c6491fb61f02cb7ed9fb60" - } - }, - "50b4c5c87b804e0f9214977cd06fef37": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "bar_style": "success", - "layout": "IPY_MODEL_f1fd6be11513474aa87063932f6eac2b", - "max": 128, - "style": "IPY_MODEL_729bb13470b447e79b3caea1241f1e93", - "value": 128 - } - }, - "53f1e8cca07a4d38aee57777f84c46e9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "55b9a6a30f8b416cbba6c9d62e2087fd": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_3e4ddebc6d8146c5aa104043051a7d9c", - "IPY_MODEL_50b4c5c87b804e0f9214977cd06fef37", - "IPY_MODEL_5c2902f5fee74ae3aa47520fef604943" - ], - "layout": "IPY_MODEL_63385f1808b547e6a8d3d4ed7b96b70a" - } - }, - "561a97a4b45448a9a8eae441f8e5d0da": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "5c2902f5fee74ae3aa47520fef604943": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_f81dbdbbbbf54f7ca9164896bc1bff6e", - "style": "IPY_MODEL_2ca5cda929ec4f598887d57499131c4e", - "value": " 128/128 [00:06<00:00, 18.85it/s]" - } - }, - "63385f1808b547e6a8d3d4ed7b96b70a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "7080a43f19074187898a40c3a20ee40d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "729bb13470b447e79b3caea1241f1e93": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "7552bca25c9a43a2905ae9fde45c5edb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "7898ad926b0d4883bcba893ae7fc4767": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_a21a9092bda140fdafae94f953e21434", - "style": "IPY_MODEL_2cf7257329bc449a8f7f2cea82f102e2", - "value": " 229/229 [00:00<00:00, 517.42it/s]" - } - }, - "7f3aaa711aac4b12852fd21f7e39945d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "8b21dc66894e4f06aaf34585481aae6e": { - "model_module": 
"@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "8cb6432b2f084c3cb409d6c7beba6632": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "93f37f94c1c743138ac2c6ee01c067a8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "9c15bafa92dc45289705416df944586c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "a21a9092bda140fdafae94f953e21434": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "a2f1698ea33a4075b704c93fbfe2a56d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "ab97341f3436466aa6f4964db63be7f6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "bfb81f9cf73640fa97dd0610729aff52": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "bar_style": "success", - "layout": "IPY_MODEL_8b21dc66894e4f06aaf34585481aae6e", - "max": 229, - "style": "IPY_MODEL_d578d2e01b464318a2a7aa63c44b45a2", - "value": 229 - } - }, - "c21431b349d84789982ea663823146b3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "ca9b28f1e0f14fc8b47d9c8744aed724": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "d472be28fc3f41e9a88461090353c6e7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "bar_style": "danger", - "layout": "IPY_MODEL_f300e0de6459495c925d508aa567cc10", - "max": 128, - "style": "IPY_MODEL_ab97341f3436466aa6f4964db63be7f6", - "value": 58 - } - }, - "d578d2e01b464318a2a7aa63c44b45a2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "d70e0b24d77f4ae8a376588cb6ed1950": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "ee7a58fc46dd4b4d9da4f91588307bb2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "f0842f58af714d499f34aedec233b7c9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_a2f1698ea33a4075b704c93fbfe2a56d", - "style": "IPY_MODEL_2f2ef046bcef46b8977cc0ec176e5e3d", - "value": " 41/41 [00:02<00:00, 17.28it/s]" - } - }, - "f1fd6be11513474aa87063932f6eac2b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "f300e0de6459495c925d508aa567cc10": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "f81dbdbbbbf54f7ca9164896bc1bff6e": { - "model_module": 
"@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "fd84190ae61144e09e5218bfe17d1fc7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "fdee249ae0a74cd5883c86469c39aab9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - } - }, + "state": {}, "version_major": 2, "version_minor": 0 } From ccea0034900c86fa4bf439fdc9e2c08ac6965a52 Mon Sep 17 00:00:00 2001 From: Junho Choi Date: Sat, 9 Apr 2022 00:05:18 +0000 Subject: [PATCH 3/3] CIA WFB download code updated --- .../download-sliiders-econ-input-data.ipynb | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/notebooks/create-SLIIDERS-ECON/download-sliiders-econ-input-data.ipynb b/notebooks/create-SLIIDERS-ECON/download-sliiders-econ-input-data.ipynb index f9391ae..528df09 100644 --- a/notebooks/create-SLIIDERS-ECON/download-sliiders-econ-input-data.ipynb +++ b/notebooks/create-SLIIDERS-ECON/download-sliiders-econ-input-data.ipynb @@ -21,7 +21,7 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", + "from os import remove as osrem\n", "import ssl\n", "import subprocess\n", "import tarfile\n", @@ -341,6 +341,30 @@ "file.close()" ] }, + { + "cell_type": "markdown", + "id": "2d45bbf7-2569-4365-b810-cd81c075286d", + "metadata": {}, + "source": [ + "### CIA World Factbook, versions 2000 to 2020" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a86932d-877b-4788-a25e-1764ba958212", + "metadata": {}, + "outputs": [], + "source": [ + "cia_download_url = \"https://www.cia.gov/the-world-factbook/about/archives/download\"\n", + "cia_files = [f\"factbook-{x}.zip\" for x in range(2000, 2021)]\n", + "\n", + "for i in tqdm(cia_files):\n", + " cia_req = requests.get(\"/\".join([cia_download_url, i]))\n", + " cia_zip = ZipFile(BytesIO(cia_req.content))\n", + " cia_zip.extractall(str(sset.DIR_CIA_RAW))" + ] + }, { "cell_type": "markdown", "id": "a9f0b8fa-7c93-4735-9caa-e07777d150a2", @@ -469,7 +493,7 @@ " file.extractall(sset.DIR_LITPOP_RAW)\n", "\n", "# clear storage for the existing tar file\n", - "os.remove(regular_litpop)" + "osrem(regular_litpop)" ] }, { @@ -499,7 +523,7 @@ "\n", "# unzipping\n", "outpath = sset.DIR_GEG15_RAW / zip_path.stem\n", - "os.makedirs(outpath, exist_ok=True)\n", + "outpath.mkdir(parents=True, exist_ok=True)\n", "subprocess.Popen([\"unzip\", f\"{zip_path}\", \"-d\", f\"{outpath}\"])" ] }, @@ -511,7 +535,7 @@ "outputs": [], "source": [ "# remove zip file (use after unzipping)\n", - "os.remove(zip_path)" + "osrem(zip_path)" ] }, { @@ -672,13 +696,6 @@ "2. Once on the page, download the dataset through your MY AVISO+ account (click on `access via MY AVISO+` link and follow the instructions).\n", "3. After following the instructions, you will acquire the file `mdt_cnes_cls18_global.nc.gz`. Extract the file `mdt_cnes_cls18_global.nc` from the `.gz` file and save it as `sset.PATH_GEOG_MDT_RAW`.\n", "\n", - "### CIA World Factbook (compiled by Coleman [2020])\n", - "\n", - "1. Travel to this [link](https://github.com/iancoleman/cia_world_factbook_api) (credit to Coleman [2020]), and scroll down to the `readme.md`.\n", - "2. In the **Data** section of the `readme.md` file, there should be a link on \"Historical\"; click on this link to travel to a `mega.nz` website having `weekly_json.7z` file.\n", - "3. 
After checking that the filename to download is `weekly_json.7z`, download the said file by clicking on the \"Download\" button.\n",
-    "4. When download is successful, import `weekly_json.7z` to the preferred directory (`sset.DIR_YPK_RAW` in this implementation).\n",
-    "\n",
     "### HydroSHEDS\n",
     "1. Go to https://hydrosheds.org/downloads\n",
     "2. Download the \"standard\" level-0 HydroBASINS files for each continent (use the Dropbox link if available--this appears as \"NOTE: you may also download data from here.\" as of 8/16/21). Download the shapefiles into the directory defined in `sset.DIR_HYDROBASINS_RAW`."
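Returning to the CIA World Factbook download cell added in this patch: a slightly more defensive variant is sketched below. The URL pattern and `sset.DIR_CIA_RAW` follow the notebook cell; the helper name, timeout, and status check are illustrative additions, not part of the patch.

```python
# Sketch of a hardened version of the Factbook download loop; the status
# check surfaces HTTP errors instead of letting ZipFile fail on a bad body.
from io import BytesIO
from zipfile import ZipFile

import requests

CIA_URL = "https://www.cia.gov/the-world-factbook/about/archives/download"


def fetch_factbook_year(year, out_dir):
    resp = requests.get(f"{CIA_URL}/factbook-{year}.zip", timeout=60)
    resp.raise_for_status()
    ZipFile(BytesIO(resp.content)).extractall(str(out_dir))


# usage (paths as in the notebook):
# for year in range(2000, 2021):
#     fetch_factbook_year(year, sset.DIR_CIA_RAW)
```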