diff --git a/dag/poverty_inequality.yml b/dag/poverty_inequality.yml index 6c5674ca441..2751caae2e3 100644 --- a/dag/poverty_inequality.yml +++ b/dag/poverty_inequality.yml @@ -8,7 +8,7 @@ steps: # Poverty and inequality file for Joe's PhD data://explorers/poverty_inequality/latest/poverty_inequality_export: - data://garden/wb/2024-03-27/world_bank_pip - - data://garden/wid/2023-08-24/world_inequality_database + - data://garden/wid/2024-05-24/world_inequality_database - data://garden/lis/2023-08-30/luxembourg_income_study - data://garden/wb/2024-01-22/thousand_bins_distribution - data://garden/worldbank_wdi/2023-05-29/wdi @@ -27,18 +27,19 @@ steps: - data://garden/wb/2024-03-27/world_bank_pip # World Inequality Database - data://meadow/wid/2023-08-24/world_inequality_database: + data://meadow/wid/2024-05-24/world_inequality_database: - data://garden/regions/2023-01-01/regions - - snapshot://wid/2023-08-24/world_inequality_database.csv - - snapshot://wid/2023-08-24/world_inequality_database_with_extrapolations.csv - - snapshot://wid/2023-08-24/world_inequality_database_distribution.csv - - snapshot://wid/2023-08-24/world_inequality_database_distribution_with_extrapolations.csv - data://garden/wid/2023-08-24/world_inequality_database: - - data://meadow/wid/2023-08-24/world_inequality_database - data://grapher/wid/2023-08-24/world_inequality_database: - - data://garden/wid/2023-08-24/world_inequality_database + - snapshot://wid/2024-05-24/world_inequality_database.csv + - snapshot://wid/2024-05-24/world_inequality_database_with_extrapolations.csv + - snapshot://wid/2024-05-24/world_inequality_database_distribution.csv + - snapshot://wid/2024-05-24/world_inequality_database_distribution_with_extrapolations.csv + - snapshot://wid/2024-05-24/world_inequality_database_fiscal.csv + data://garden/wid/2024-05-24/world_inequality_database: + - data://meadow/wid/2024-05-24/world_inequality_database + data://grapher/wid/2024-05-24/world_inequality_database: + - 
data://garden/wid/2024-05-24/world_inequality_database data://explorers/wid/latest/world_inequality_database: - - data://garden/wid/2023-08-24/world_inequality_database + - data://garden/wid/2024-05-24/world_inequality_database # Luxembourg Income Study data://meadow/lis/2023-08-30/luxembourg_income_study: diff --git a/etl/steps/data/garden/wb/2024-01-17/shared.py b/etl/steps/data/garden/wb/2024-01-17/shared.py index d2de965bd94..1a5746fbfd0 100644 --- a/etl/steps/data/garden/wb/2024-01-17/shared.py +++ b/etl/steps/data/garden/wb/2024-01-17/shared.py @@ -158,7 +158,7 @@ }, "avg": { "title": "Average", - "description": "The mean {inc_cons_dict[wel]['name_distribution']} per year within the {pct_dict[pct]['decile10']} (tenth of the population).", + "description": "The mean {inc_cons_dict[wel]['name_distribution']} per day within the {pct_dict[pct]['decile10']} (tenth of the population).", "unit": "international-$ in {ppp} prices", "short_unit": "$", "numDecimalPlaces": 2, @@ -172,7 +172,7 @@ }, "thr": { "title": "Threshold", - "description": "The level of {inc_cons_dict[wel]['name_distribution']} per year below which {str(pct)}% of the population falls.", + "description": "The level of {inc_cons_dict[wel]['name_distribution']} per day below which {str(pct)}% of the population falls.", "unit": "international-$ in {ppp} prices", "short_unit": "$", "numDecimalPlaces": 2, diff --git a/etl/steps/data/garden/wb/2024-03-27/shared.py b/etl/steps/data/garden/wb/2024-03-27/shared.py index 62489a9877c..f3a487f2f27 100644 --- a/etl/steps/data/garden/wb/2024-03-27/shared.py +++ b/etl/steps/data/garden/wb/2024-03-27/shared.py @@ -158,7 +158,7 @@ }, "avg": { "title": "Average", - "description": "The mean {inc_cons_dict[wel]['name_distribution']} per year within the {pct_dict[pct]['decile10']} (tenth of the population).", + "description": "The mean {inc_cons_dict[wel]['name_distribution']} per day within the {pct_dict[pct]['decile10']} (tenth of the population).", "unit": 
"international-$ in {ppp} prices", "short_unit": "$", "numDecimalPlaces": 2, @@ -172,7 +172,7 @@ }, "thr": { "title": "Threshold", - "description": "The level of {inc_cons_dict[wel]['name_distribution']} per year below which {str(pct)}% of the population falls.", + "description": "The level of {inc_cons_dict[wel]['name_distribution']} per day below which {str(pct)}% of the population falls.", "unit": "international-$ in {ppp} prices", "short_unit": "$", "numDecimalPlaces": 2, diff --git a/etl/steps/data/garden/wid/2024-05-24/shared.py b/etl/steps/data/garden/wid/2024-05-24/shared.py new file mode 100644 index 00000000000..3cfad39f351 --- /dev/null +++ b/etl/steps/data/garden/wid/2024-05-24/shared.py @@ -0,0 +1,564 @@ +""" +This function creates the metadata for each variable in the WID dataset, from the dictionaries defined below +If new variables are included in the dataset (from `wid` command in Stata) the dictionaries feeding metadata functions have to be updated (if not an error will show up) +""" + +from owid.catalog import Table, VariableMeta, VariablePresentationMeta + +# Define PPP year +# NOTE: Change the year when needed +PPP_YEAR = 2023 + +# Define default tolerance for each variable +TOLERANCE = 5 + +# This is text common to all variables + +ADDITIONAL_DESCRIPTION = [ + "The data is estimated from a combination of household surveys, tax records and national accounts data. This combination can provide a more accurate picture of the incomes of the richest, which tend to be captured poorly in household survey data alone.", + "These underlying data sources are not always available. For some countries, observations are extrapolated from data relating to other years, or are sometimes modeled based on data observed in other countries. 
For more information on this methodology, see this related [technical note](https://wid.world/document/countries-with-regional-income-imputations-on-wid-world-world-inequality-lab-technical-note-2021-15/).", + "In the case of national post-tax income, when the data sources are not available, distributions are constructed by using the more widely available pre-tax distributions, combined with tax revenue and government expenditure aggregates. This method is described in more detail in this [technical note](https://wid.world/document/preliminary-estimates-of-global-posttax-income-distributions-world-inequality-lab-technical-note-2023-02/).", +] + +RELATIVE_POVERTY_DESCRIPTION = "This data has been estimated by calculating the {povline}, and then checking that value against the closest threshold in the percentile distribution. The headcount ratio is then the percentile, the share of the population below that threshold." + +PROCESSING_DESCRIPTION = """We extract estimations of Gini, mean, percentile thresholds, averages, and shares via the [`wid` Stata command](https://github.com/thomasblanchet/wid-stata-tool). We calculate threshold and share ratios by dividing different thresholds and shares, respectively.""" + +PPP_DESCRIPTION = f"The data is measured in international-$ at {PPP_YEAR} prices – this adjusts for inflation and for differences in the cost of living between countries." 
+ +# These are parameters specifically defined for each type of variable +VAR_DICT = { + "avg": { + "title": "Average", + "description": "The mean {WELFARE_DICT[wel]['type']} per year within the {PCT_DICT[pct]['decile10_extra'].lower()}.", + "unit": f"international-$ in {PPP_YEAR} prices", + "short_unit": "$", + "numDecimalPlaces": 0, + }, + "share": { + "title": "Share", + "description": "The share of {WELFARE_DICT[wel]['type']} {WELFARE_DICT[wel]['verb']} by the {PCT_DICT[pct]['decile10_extra'].lower()}.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "p50p90_share": { + "title": "Middle 40% - Share", + "description": "The share of {WELFARE_DICT[wel]['type']} {WELFARE_DICT[wel]['verb']} by the middle 40%. The middle 40% is the share of the population whose {WELFARE_DICT[wel]['type']} lies between the poorest 50% and the richest 10%.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "thr": { + "title": "Threshold", + "description": "The level of {WELFARE_DICT[wel]['type']} per year below which {str(PCT_DICT[pct]['thr_number'])}% of the population falls.", + "unit": f"international-$ in {PPP_YEAR} prices", + "short_unit": "$", + "numDecimalPlaces": 0, + }, + "p0p100_gini": { + "title": "Gini coefficient", + "description": "The [Gini coefficient](#dod:gini) measures inequality on a scale from 0 to 1. 
Higher values indicate higher inequality.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 2, + }, + "p0p100_avg": { + "title": "Mean", + "description": "Mean {WELFARE_DICT[wel]['type']}.", + "unit": f"international-$ in {PPP_YEAR} prices", + "short_unit": "$", + "numDecimalPlaces": 0, + }, + "median": { + "title": "Median", + "description": "Median {WELFARE_DICT[wel]['type']}.", + "unit": f"international-$ in {PPP_YEAR} prices", + "short_unit": "$", + "numDecimalPlaces": 0, + }, + "palma_ratio": { + "title": "Palma ratio", + "description": "The Palma ratio is a measure of inequality that divides the share received by the richest 10% by the share of the poorest 40%. Higher values indicate higher inequality.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "s80_s20_ratio": { + "title": "S80/S20 ratio", + "description": "The S80/S20 ratio is a measure of inequality that divides the share received by the richest 20% by the share of the poorest 20%. Higher values indicate higher inequality.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "p90_p10_ratio": { + "title": "P90/P10 ratio", + "description": "P90 and P10 are the levels of {WELFARE_DICT[wel]['type']} below which 90% and 10% of the population live, respectively. This variable gives the ratio of the two. It is a measure of inequality that indicates the gap between the richest and poorest tenth of the population.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "p90_p50_ratio": { + "title": "P90/P50 ratio", + "description": "The P90/P50 ratio measures the degree of inequality within the richest half of the population. 
A ratio of 2 means that someone just falling in the richest tenth of the population has twice the median {WELFARE_DICT[wel]['type']}.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "p50_p10_ratio": { + "title": "P50/P10 ratio", + "description": "The P50/P10 ratio measures the degree of inequality within the poorest half of the population. A ratio of 2 means that the median {WELFARE_DICT[wel]['type']} is two times higher than that of someone just falling in the poorest tenth of the population.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "headcount_ratio": { + "title": "Share of population in poverty", + "description": "Share of the population living below the poverty line of {povline}", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 0, + }, +} + +# Details for each income variable +WELFARE_DICT = { + "pretax": { + "name": "Pretax", + "type": "income", + "verb": "received", + "description": "Income is ‘pre-tax’ — measured before taxes have been paid and most government benefits have been received. It is, however, measured after the operation of pension schemes, both private and public.", + }, + "posttax_dis": { + "name": "Post-tax disposable", + "type": "income", + "verb": "received", + "description": "Income is ‘post-tax’ — measured after taxes have been paid and most government benefits have been received, but does not include in-kind benefits and therefore does not add up to national income.", + }, + "posttax_nat": { + "name": "Post-tax national", + "type": "income", + "verb": "received", + "description": "Income is ‘post-tax’ — measured after taxes have been paid and most government benefits have been received.", + }, + "wealth": { + "name": "Net national wealth", + "type": "wealth", + "verb": "owned", + "description": "This measure is related to net national wealth, which is the total value of non-financial and financial assets (housing, land, deposits, bonds, equities, etc.) 
held by households, minus their debts.", + }, +} + +# Details for naming each decile/percentile +PCT_DICT = { + "p0p10": { + "decile10": "Poorest decile", + "decile9": "Minimum value", + "thr_number": 0, + "decile10_extra": "Poorest decile (tenth of the population)", + }, + "p10p20": { + "decile10": "2nd decile", + "decile9": "Poorest decile", + "thr_number": 10, + "decile10_extra": "2nd decile (tenth of the population)", + }, + "p20p30": { + "decile10": "3rd decile", + "decile9": "2nd decile", + "thr_number": 20, + "decile10_extra": "3rd decile (tenth of the population)", + }, + "p30p40": { + "decile10": "4th decile", + "decile9": "3rd decile", + "thr_number": 30, + "decile10_extra": "4th decile (tenth of the population)", + }, + "p40p50": { + "decile10": "5th decile", + "decile9": "4th decile", + "thr_number": 40, + "decile10_extra": "5th decile (tenth of the population)", + }, + "p50p60": { + "decile10": "6th decile", + "decile9": "5th decile (median)", + "thr_number": 50, + "decile10_extra": "6th decile (tenth of the population)", + }, + "p60p70": { + "decile10": "7th decile", + "decile9": "6th decile", + "thr_number": 60, + "decile10_extra": "7th decile (tenth of the population)", + }, + "p70p80": { + "decile10": "8th decile", + "decile9": "7th decile", + "thr_number": 70, + "decile10_extra": "8th decile (tenth of the population)", + }, + "p80p90": { + "decile10": "9th decile", + "decile9": "8th decile", + "thr_number": 80, + "decile10_extra": "9th decile (tenth of the population)", + }, + "p90p100": { + "decile10": "Richest decile", + "decile9": "Richest decile", + "thr_number": 90, + "decile10_extra": "Richest decile (tenth of the population)", + }, + "p99p100": {"decile10": "Top 1%", "decile9": "Top 1%", "thr_number": 99, "decile10_extra": "Richest 1%"}, + "p99_9p100": {"decile10": "Top 0.1%", "decile9": "Top 0.1%", "thr_number": 99.9, "decile10_extra": "Richest 0.1%"}, + "p99_99p100": { + "decile10": "Top 0.01%", + "decile9": "Top 0.01%", + "thr_number": 
99.99, + "decile10_extra": "Richest 0.01%", + }, + "p99_999p100": { + "decile10": "Top 0.001%", + "decile9": "Top 0.001%", + "thr_number": 99.999, + "decile10_extra": "Richest 0.001%", + }, + "p0p50": {"decile10": "Bottom 50%", "decile9": "Bottom 50%", "thr_number": "", "decile10_extra": "Poorest 50%"}, + "p90p99": { + "decile10": "Between 90th and 99th percentiles", + "decile9": "", + "thr_number": "", + "decile10_extra": "People between the 90th and 99th percentiles", + }, +} + +# Details for each relative poverty line +REL_DICT = {40: "40% of the median", 50: "50% of the median", 60: "60% of the median"} + +# Details for extrapolations or estimations +EXTRAPOLATION_DICT = { + "": { + "title": "Estimated", + "description": "Interpolations and extrapolations are excluded by using the option `exclude` in the Stata command.", + }, + "_extrapolated": { + "title": "Extrapolated", + "description": "Interpolations and extrapolations are included.", + }, +} + + +def add_metadata_vars(tb_garden: Table) -> Table: + """ + This function adds metadata to all the variables in the WID dataset + """ + # Get a list of all the variables available + cols = list(tb_garden.columns) + + for var in VAR_DICT: + for wel in WELFARE_DICT: + for ext in EXTRAPOLATION_DICT: + # For variables that use income variable + col_name = f"{var}_{wel}{ext}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_income(var, origins, wel, ext) + # Replace income/wealth words according to `wel` + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace( + "{WELFARE_DICT[wel]['type']}", str(WELFARE_DICT[wel]["type"]) + ) + .replace("{WELFARE_DICT[wel]['verb']}", str(WELFARE_DICT[wel]["verb"])) + ) + + for rel in REL_DICT: + # For variables that use income variable, equivalence scale and relative poverty lines + 
col_name = f"{var}_{rel}_median_{wel}{ext}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_income_relative(var, origins, wel, rel, ext) + + # Replace values in description_short according to `rel` + tb_garden[col_name].metadata.description_short = tb_garden[ + col_name + ].metadata.description_short.replace("{povline}", REL_DICT[rel]) + + # Replace values in description_processing according to `rel` + tb_garden[col_name].metadata.description_processing = tb_garden[ + col_name + ].metadata.description_processing.replace("{povline}", REL_DICT[rel]) + + for pct in PCT_DICT: + # For variables that use income variable and percentiles (deciles) + col_name = f"{pct}_{var}_{wel}{ext}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_income_percentiles(var, origins, wel, pct, ext) + + # Replace values in description_short according to `pct`, depending on `var` + if var == "thr": + tb_garden[col_name].metadata.description_short = tb_garden[ + col_name + ].metadata.description_short.replace( + "{str(PCT_DICT[pct]['thr_number'])}", str(PCT_DICT[pct]["thr_number"]) + ) + + else: + tb_garden[col_name].metadata.description_short = tb_garden[ + col_name + ].metadata.description_short.replace( + "{PCT_DICT[pct]['decile10_extra'].lower()}", + PCT_DICT[pct]["decile10_extra"].lower(), + ) + + # Replace income/wealth words according to `wel` + tb_garden[col_name].metadata.description_short = tb_garden[ + col_name + ].metadata.description_short.replace( + "{WELFARE_DICT[wel]['verb']}", str(WELFARE_DICT[wel]["verb"]) + ) + tb_garden[col_name].metadata.description_short = tb_garden[ + col_name + ].metadata.description_short.replace( + "{WELFARE_DICT[wel]['type']}", str(WELFARE_DICT[wel]["type"]) + 
) + + return tb_garden + + +# Metadata functions to show a clearer main code + + +def var_metadata_income(var, origins, wel, ext) -> VariableMeta: + """ + This function assigns each of the metadata fields for the variables not depending on deciles + """ + # For monetary variables I include the PPP description + if var == "p0p100_avg" or var == "median": + meta = VariableMeta( + title=f"{VAR_DICT[var]['title']} ({WELFARE_DICT[wel]['name']}) ({EXTRAPOLATION_DICT[ext]['title']})", + description_short=VAR_DICT[var]["description"], + description_key=[PPP_DESCRIPTION, WELFARE_DICT[wel]["description"]] + ADDITIONAL_DESCRIPTION, + description_processing=f"""{PROCESSING_DESCRIPTION} + +{EXTRAPOLATION_DICT[ext]['description']}""", + unit=VAR_DICT[var]["unit"], + short_unit=VAR_DICT[var]["short_unit"], + origins=origins, + ) + + else: + meta = VariableMeta( + title=f"{VAR_DICT[var]['title']} ({WELFARE_DICT[wel]['name']}) ({EXTRAPOLATION_DICT[ext]['title']})", + description_short=VAR_DICT[var]["description"], + description_key=[WELFARE_DICT[wel]["description"]] + ADDITIONAL_DESCRIPTION, + description_processing=f"""{PROCESSING_DESCRIPTION} + +{EXTRAPOLATION_DICT[ext]['description']}""", + unit=VAR_DICT[var]["unit"], + short_unit=VAR_DICT[var]["short_unit"], + origins=origins, + ) + + meta.display = { + "name": meta.title, + "numDecimalPlaces": VAR_DICT[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta + + +def var_metadata_income_percentiles(var, origins, wel, pct, ext) -> VariableMeta: + """ + This function assigns each of the metadata fields for the variables depending on deciles + """ + if var == "thr": + meta = VariableMeta( + title=f"{PCT_DICT[pct]['decile9']} - {VAR_DICT[var]['title']} ({WELFARE_DICT[wel]['name']}) ({EXTRAPOLATION_DICT[ext]['title']})", + description_short=VAR_DICT[var]["description"], + description_key=[PPP_DESCRIPTION, WELFARE_DICT[wel]["description"]] + 
ADDITIONAL_DESCRIPTION, + description_processing=f"""{PROCESSING_DESCRIPTION} + +{EXTRAPOLATION_DICT[ext]['description']}""", + unit=VAR_DICT[var]["unit"], + short_unit=VAR_DICT[var]["short_unit"], + origins=origins, + ) + + elif var == "avg": + meta = VariableMeta( + title=f"{PCT_DICT[pct]['decile10']} - {VAR_DICT[var]['title']} ({WELFARE_DICT[wel]['name']}) ({EXTRAPOLATION_DICT[ext]['title']})", + description_short=VAR_DICT[var]["description"], + description_key=[PPP_DESCRIPTION, WELFARE_DICT[wel]["description"]] + ADDITIONAL_DESCRIPTION, + description_processing=f"""{PROCESSING_DESCRIPTION} + +{EXTRAPOLATION_DICT[ext]['description']}""", + unit=VAR_DICT[var]["unit"], + short_unit=VAR_DICT[var]["short_unit"], + origins=origins, + ) + + # Shares do not have PPP description + else: + meta = VariableMeta( + title=f"{PCT_DICT[pct]['decile10']} - {VAR_DICT[var]['title']} ({WELFARE_DICT[wel]['name']}) ({EXTRAPOLATION_DICT[ext]['title']})", + description_short=VAR_DICT[var]["description"], + description_key=[WELFARE_DICT[wel]["description"]] + ADDITIONAL_DESCRIPTION, + description_processing=f"""{PROCESSING_DESCRIPTION} + +{EXTRAPOLATION_DICT[ext]['description']}""", + unit=VAR_DICT[var]["unit"], + short_unit=VAR_DICT[var]["short_unit"], + origins=origins, + ) + + meta.display = { + "name": meta.title, + "numDecimalPlaces": VAR_DICT[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta + + +def var_metadata_income_relative(var, origins, wel, rel, ext) -> VariableMeta: + meta = VariableMeta( + title=f"{REL_DICT[rel]} - {VAR_DICT[var]['title']} ({WELFARE_DICT[wel]['name']}) ({EXTRAPOLATION_DICT[ext]['title']})", + description_short=VAR_DICT[var]["description"], + description_key=[WELFARE_DICT[wel]["description"]] + ADDITIONAL_DESCRIPTION, + description_processing=f"""{PROCESSING_DESCRIPTION} + +{RELATIVE_POVERTY_DESCRIPTION} + +{EXTRAPOLATION_DICT[ext]['description']}""", + 
unit=VAR_DICT[var]["unit"], + short_unit=VAR_DICT[var]["short_unit"], + origins=origins, + ) + + meta.display = { + "name": meta.title, + "numDecimalPlaces": VAR_DICT[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta + + +############################################################################################################## +# This is the code for the distribution variables +############################################################################################################## + +VAR_DICT_DISTRIBUTION = { + "avg": { + "title": "Average", + "description": "The mean income or wealth per year within each percentile.", + "unit": f"international-$ in {PPP_YEAR} prices", + "short_unit": "$", + "numDecimalPlaces": 0, + }, + "share": { + "title": "Share", + "description": "The share of income or wealth received/owned by each percentile.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "thr": { + "title": "Threshold", + "description": "The level of income or wealth per year below which 1%, 2%, 3%, ... , 99%, 99.9%, 99.99%, 99.999% of the population falls.", + "unit": f"international-$ in {PPP_YEAR} prices", + "short_unit": "$", + "numDecimalPlaces": 0, + }, +} + +# Define welfare variables + +WELFARE_DEFINITIONS = [ + "Data refers to four types of welfare measures:", + "`welfare = 'pretax'` is ‘pre-tax’ income — measured before taxes have been paid and most government benefits have been received. 
It is, however, measured after the operation of pension schemes, both private and public.", + "`welfare = 'posttax_dis'` is ‘post-tax’ income — measured after taxes have been paid and most government benefits have been received, but does not include in-kind benefits and therefore does not add up to national income.", + "`welfare = 'posttax_nat'` is ‘post-tax’ income — measured after taxes have been paid and most government benefits have been received.", + "`welfare = 'wealth'` is net national wealth, which is the total value of non-financial and financial assets (housing, land, deposits, bonds, equities, etc.) held by households, minus their debts.", +] + +PROCESSING_DESCRIPTION_DISTRIBUTIONS = ( + """Estimations are extracted via the [`wid` Stata command](https://github.com/thomasblanchet/wid-stata-tool).""" +) + + +def add_metadata_vars_distribution(tb_garden: Table) -> Table: + # Get a list of all the variables available + cols = list(tb_garden.columns) + + for var in VAR_DICT_DISTRIBUTION: + for ext in EXTRAPOLATION_DICT: + # All the variables follow this structure + col_name = f"{var}{ext}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_distribution(var, origins, ext) + + return tb_garden + + +def var_metadata_distribution(var: str, origins, ext: str) -> VariableMeta: + """ + This function assigns each of the metadata fields for the distribution variables + """ + # Shares do not include PPP description + if var == "share": + meta = VariableMeta( + title=f"Income or wealth {VAR_DICT_DISTRIBUTION[var]['title'].lower()} ({EXTRAPOLATION_DICT[ext]['title']})", + description_short=VAR_DICT_DISTRIBUTION[var]["description"], + description_key=WELFARE_DEFINITIONS + ADDITIONAL_DESCRIPTION, + description_processing=f"{PROCESSING_DESCRIPTION_DISTRIBUTIONS} {EXTRAPOLATION_DICT[ext]['description']}", + 
unit=VAR_DICT_DISTRIBUTION[var]["unit"], + short_unit=VAR_DICT_DISTRIBUTION[var]["short_unit"], + origins=origins, + ) + + # For monetary variables I include the PPP description + else: + meta = VariableMeta( + title=f"{VAR_DICT_DISTRIBUTION[var]['title']} income or wealth ({EXTRAPOLATION_DICT[ext]['title']})", + description_short=VAR_DICT_DISTRIBUTION[var]["description"], + description_key=[PPP_DESCRIPTION] + WELFARE_DEFINITIONS + ADDITIONAL_DESCRIPTION, + description_processing=f"{PROCESSING_DESCRIPTION_DISTRIBUTIONS} {EXTRAPOLATION_DICT[ext]['description']}", + unit=VAR_DICT_DISTRIBUTION[var]["unit"], + short_unit=VAR_DICT_DISTRIBUTION[var]["short_unit"], + origins=origins, + ) + + meta.display = { + "name": meta.title, + "numDecimalPlaces": VAR_DICT_DISTRIBUTION[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta diff --git a/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.meta.yml b/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.meta.yml new file mode 100644 index 00000000000..2cc4f031a2e --- /dev/null +++ b/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.meta.yml @@ -0,0 +1,140 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions: + common: + presentation: + topic_tags: + - Economic Inequality + grapher_config: + originUrl: https://ourworldindata.org/economic-inequality + $schema: https://files.ourworldindata.org/schemas/grapher-schema.003.json + processing_level: major + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 365 + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + world_inequality_database: + # Learn more about the available fields: + # http://docs.owid.io/projects/etl/architecture/metadata/reference/indicator/ + variables: + p99p100_share_pretax: + presentation: + title_variant: Before tax + grapher_config: + title: Income share of the richest 1% (before tax) + subtitle: >- + The share of income received by the richest 1% of the population. Income here is measured before taxes and benefits. + note: >- + Income is measured before payment of taxes and non-pension benefits, but after + the payment of public and private pensions. + hasMapTab: true + tab: map + variantName: WID + yAxis: + min: 0 + map: + colorScale: + baseColorScheme: OrRd + binningStrategy: manual + customNumericValues: + - 5 + - 10 + - 15 + - 20 + - 25 + - 30 + selectedEntityNames: + - Chile + - Brazil + - United States + - South Africa + - China + - France + p90p100_share_pretax: + presentation: + title_variant: Before tax + grapher_config: + title: Income share of the richest 10% (before tax) + subtitle: >- + The share of income received by the richest 10% of the population. Income here is measured before taxes and benefits. + note: >- + Income is measured before payment of taxes and non-pension benefits, but after + the payment of public and private pensions. 
+ hasMapTab: true + tab: map + variantName: WID + yAxis: + min: 0 + map: + colorScale: + baseColorScheme: OrRd + binningStrategy: manual + customNumericMinValue: 100 + customNumericValues: + - 30 + - 35 + - 40 + - 45 + - 50 + - 55 + - 60 + - 65 + selectedEntityNames: + - Chile + - Brazil + - United States + - South Africa + - China + - France + + world_inequality_database_fiscal: + common: + description_key: + - Income is ‘fiscal’ — it represents the total income that is or should be reported on tax declarations (before any specific deduction allowed by fiscal legislation). + - The data is estimated from a combination of household surveys and tax records. This combination can provide a more accurate picture of the incomes of the richest, which tend to be captured poorly in household survey data alone. + - These underlying data sources are not always available. For some countries, observations are extrapolated from data relating to other years, or are sometimes modeled based on data observed in other countries. + description_processing: We extract estimations via the [`wid` Stata command](https://github.com/thomasblanchet/wid-stata-tool). + variables: + p99p100_share_fiscal992i: + title: Top 1% - Share (Fiscal, individuals) + description_short: The share of income received by the richest 1%. + unit: "%" + short_unit: "%" + display: + name: Top 1% - Share (Fiscal, individuals) + numDecimalPlaces: 1 + tolerance: 5 + presentation: + title_public: Top 1% - Share (Fiscal, individuals) + + p99p100_share_fiscal992j: + title: Top 1% - Share (Fiscal, equal-split adults) + description_short: The share of income received by the richest 1%. 
+ unit: "%" + short_unit: "%" + display: + name: Top 1% - Share (Fiscal, equal-split adults) + numDecimalPlaces: 1 + tolerance: 5 + presentation: + title_public: Top 1% - Share (Fiscal, equal-split adults) + + p99p100_share_fiscal992t: + title: Top 1% - Share (Fiscal, tax units) + description_short: The share of income received by the richest 1%. + unit: "%" + short_unit: "%" + display: + name: Top 1% - Share (Fiscal, tax units) + numDecimalPlaces: 1 + tolerance: 5 + presentation: + title_public: Top 1% - Share (Fiscal, tax units) + + diff --git a/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.py b/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.py new file mode 100644 index 00000000000..0a6ca752ddf --- /dev/null +++ b/etl/steps/data/garden/wid/2024-05-24/world_inequality_database.py @@ -0,0 +1,371 @@ +""" +Load World Inequality Database meadow dataset and create a garden dataset. + +NOTE: To extract the log of the process (to review sanity checks, for example), run the following command in the terminal: + nohup poetry run etl run world_inequality_database > output.log 2>&1 & + +""" + + +import owid.catalog.processing as pr +from owid.catalog import Table +from shared import add_metadata_vars, add_metadata_vars_distribution +from structlog import get_logger +from tabulate import tabulate + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Initialize logger. +log = get_logger() + +# Define combinations of variables to calculate relative poverty +EXTRAPOLATED_DICT = {"no": "", "yes": "_extrapolated"} +WELFARE_VARS = ["pretax", "posttax_nat", "posttax_dis", "wealth"] + +# Set table format when printing +TABLEFMT = "pretty" + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("world_inequality_database") + + # Read tables from meadow dataset. 
+    tb = ds_meadow["world_inequality_database"].reset_index()
+    tb_percentiles = ds_meadow["world_inequality_database_distribution"].reset_index()
+    tb_fiscal = ds_meadow["world_inequality_database_fiscal"].reset_index()
+
+    #
+    # Process data.
+    # Change units and drop unnecessary columns
+    tb = drop_columns_and_transform(tb)
+
+    # Sanity checks
+    sanity_checks(tb)
+
+    ########################################
+    # Percentile data
+    ########################################
+
+    # Process data.
+    # Multiply share and share_extrapolated columns by 100
+    tb_percentiles[["share", "share_extrapolated"]] *= 100
+
+    # Multiply columns containing share in tb_fiscal by 100
+    tb_fiscal[list(tb_fiscal.filter(like="share"))] *= 100
+
+    # Add relative poverty values
+    tb_relative_poverty = add_relative_poverty(tb, tb_percentiles, EXTRAPOLATED_DICT, WELFARE_VARS)
+
+    # Merge tables
+    tb = pr.merge(tb, tb_relative_poverty, on=["country", "year"], how="left")
+
+    # Add metadata by code (key indicators)
+    tb = add_metadata_vars(tb)
+
+    # Add metadata by code (distributions)
+    tb_percentiles = add_metadata_vars_distribution(tb_percentiles)
+
+    # Set index and sort
+    tb = tb.format()
+    tb_percentiles = tb_percentiles.format(["country", "year", "welfare", "p", "percentile"])
+    tb_fiscal = tb_fiscal.format()
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset and add the garden table.
+    ds_garden = create_dataset(
+        dest_dir,
+        tables=[tb, tb_percentiles, tb_fiscal],
+        check_variables_metadata=True,
+        default_metadata=ds_meadow.metadata,
+    )
+
+    # Save changes in the new garden dataset.
+ ds_garden.save() + + +# Data processing function (cleaning and small transformations) +def drop_columns_and_transform(tb: Table) -> Table: + """ + Drop columns and transform shares by multiplying by 100 + """ + # Multiply shares by 100 + tb[list(tb.filter(like="share"))] *= 100 + + # Delete age and pop, two one-value variables + tb = tb.drop(columns=["age", "pop", "age_extrapolated", "pop_extrapolated"]) + + # Delete some share ratios we are not using, and also the p0p40 (share) variable only available for pretax + drop_list = ["s90_s10_ratio", "s90_s50_ratio", "p0p40"] + + for var in drop_list: + tb = tb[tb.columns.drop(list(tb.filter(like=var)))] + + return tb + + +def add_relative_poverty(tb: Table, tb_percentiles: Table, extrapolated_dict: dict, welfare_vars: list) -> Table: + """ + Add relative poverty values by estimating the median and checking that value against the percentile distribution + """ + + # Make copies of the tables + tb = tb.copy() + tb_percentiles = tb_percentiles.copy() + + # Make tb_percentiles wide, by creating a column for each welfare + tb_percentiles = tb_percentiles.pivot( + index=["country", "year", "p", "percentile"], columns="welfare", values=["thr", "thr_extrapolated"] + ) + + # Flatten column names + tb_percentiles.columns = ["_".join(col).strip() for col in tb_percentiles.columns.values] + tb_percentiles = tb_percentiles.reset_index() + + # Calculate 40, 50, and 60 percent of the median for each country and year + for var in welfare_vars: + for yn, extrapolated in extrapolated_dict.items(): + for pct in [40, 50, 60]: + tb[f"median{pct}pct_{var}{extrapolated}"] = tb[f"median_{var}{extrapolated}"] * pct / 100 + + # Merge the two tables + tb_percentiles = pr.merge(tb_percentiles, tb, on=["country", "year"], how="left") + + # Calculate absolute difference between thresholds and percentage of median + for var in welfare_vars: + for yn, extrapolated in extrapolated_dict.items(): + for pct in [40, 50, 60]: + 
tb_percentiles[f"abs_diff{pct}pct_{var}{extrapolated}"] = abs(
+                    tb_percentiles[f"thr{extrapolated}_{var}"] - tb_percentiles[f"median{pct}pct_{var}{extrapolated}"]
+                )
+
+    # For each country and year, find the percentile with the minimum absolute difference for each welfare, extrapolation and pct
+    tb_relative_poverty = Table()
+    for var in welfare_vars:
+        for yn, extrapolated in extrapolated_dict.items():
+            for pct in [40, 50, 60]:
+                tb_percentiles[f"min{pct}pct_{var}{extrapolated}"] = tb_percentiles.groupby(["country", "year"])[
+                    f"abs_diff{pct}pct_{var}{extrapolated}"
+                ].transform("min")
+
+                # Create a table with the minimum absolute difference for each country and year
+                tb_min = tb_percentiles[
+                    tb_percentiles[f"abs_diff{pct}pct_{var}{extrapolated}"]
+                    == tb_percentiles[f"min{pct}pct_{var}{extrapolated}"]
+                ]
+                # The result generates multiple values for some countries and years, so we need to drop duplicates
+                tb_min = tb_min.drop_duplicates(subset=["country", "year"], keep="last")
+
+                # Select only what is needed
+                tb_min = tb_min[["country", "year", "p"]]
+                # Multiply by 100 to get the headcount ratio in percentage and rename
+                tb_min["p"] *= 100
+                tb_min = tb_min.rename(columns={"p": f"headcount_ratio_{pct}_median_{var}{extrapolated}"})
+
+                # Merge this table with tb_relative_poverty
+                if tb_relative_poverty.empty:
+                    tb_relative_poverty = tb_min
+                else:
+                    tb_relative_poverty = pr.merge(tb_relative_poverty, tb_min, on=["country", "year"], how="outer")
+
+    return tb_relative_poverty
+
+
+def sanity_checks(tb: Table) -> Table:
+    """
+    Perform sanity checks on the data
+    """
+
+    tb = tb.copy()
+
+    check_between_0_and_1(tb, variables=["p0p100_gini"], welfare=WELFARE_VARS)
+    check_shares_sum_100(tb, welfare=WELFARE_VARS, margin=0.5)
+    check_negative_values(tb)
+    check_monotonicity(tb, metric=["avg", "thr", "share"], welfare=WELFARE_VARS)
+    check_avg_between_thr(tb, welfare=WELFARE_VARS)
+
+    return tb
+
+
+def check_between_0_and_1(tb: Table, variables: list,
welfare: list): + """ + Check that indicators are between 0 and 1 + """ + + tb = tb.copy() + + for e in EXTRAPOLATED_DICT: + for v in variables: + for w in welfare: + # Filter only values lower than 0 or higher than 1 + col = f"{v}_{w}{EXTRAPOLATED_DICT[e]}" + mask = (tb[col] > 1) | (tb[col] < 0) + tb_error = tb[mask].copy().reset_index() + + if not tb_error.empty and w != "wealth": + log.fatal( + f"""Values for {col} are not between 0 and 1: + {tabulate(tb_error[['country', 'year', col]], headers = 'keys', tablefmt = TABLEFMT)}""" + ) + + elif not tb_error.empty and w == "wealth": + log.warning( + f"""Values for {col} are not between 0 and 1: + {tabulate(tb_error[['country', 'year', col]], headers = 'keys', tablefmt = TABLEFMT)}""" + ) + + return tb + + +def check_shares_sum_100(tb: Table, welfare: list, margin: float): + """ + Check if the sum of the variables is 100 + """ + + tb = tb.copy() + # Create a list of variables containing pxpy_share + variables = [f"p{i}p{i+10}_share" for i in range(0, 100, 10)] + for e in EXTRAPOLATED_DICT: + for w in welfare: + # Set columns to evaluate + cols = [f"{v}_{w}{EXTRAPOLATED_DICT[e]}" for v in variables] + # Get sum of shares + tb["sum_check"] = tb[cols].sum(axis=1) + # Count the nulls between the 10 decile share variables + tb["null_check"] = tb[cols].isnull().sum(1) + + mask = (tb["sum_check"] >= 100 + margin) | (tb["sum_check"] <= 100 - margin) & (tb["null_check"] == 0) + tb_error = tb[mask].reset_index(drop=True).copy() + + if not tb_error.empty: + log.fatal( + f"""{len(tb_error)} share observations ({w}{EXTRAPOLATED_DICT[e]}) are not adding up to 100%: + {tabulate(tb_error[['country', 'year', 'sum_check']].sort_values(by='sum_check', ascending=False).reset_index(drop=True), headers = 'keys', tablefmt = TABLEFMT, floatfmt=".1f")}""" + ) + + return tb + + +def check_negative_values(tb: Table): + """ + Check if there are negative values in the variables + """ + + tb = tb.copy() + + # Define columns as all the columns 
minus country and year, the ones containing "gini" and the ones containing "wealth"
+    variables = [
+        col for col in tb.columns if "gini" not in col and "wealth" not in col and col not in ["country", "year"]
+    ]
+
+    for v in variables:
+        # Create a mask to check if any value is negative
+        mask = tb[v] < 0
+        tb_error = tb[mask].reset_index(drop=True).copy()
+
+        if not tb_error.empty:
+            log.warning(
+                f"""{len(tb_error)} observations for {v} are negative:
+                {tabulate(tb_error[['country', 'year', v]], headers = 'keys', tablefmt = TABLEFMT)}"""
+            )
+
+    return tb
+
+
+def check_monotonicity(tb: Table, metric: list, welfare: list):
+    """
+    Check monotonicity for shares, thresholds and averages
+    """
+
+    tb = tb.copy()
+
+    # Create a list of variables containing pxpy_share
+    variables = [f"p{i}p{i+10}" for i in range(0, 100, 10)]
+
+    for e in EXTRAPOLATED_DICT:
+        for w in welfare:
+            for m in metric:
+                # Set columns to evaluate
+                cols = [f"{v}_{m}_{w}{EXTRAPOLATED_DICT[e]}" for v in variables]
+
+                check_vars = []
+                for i in range(len(cols) - 1):
+                    # Create a column that checks if the next value is higher than the previous one
+                    tb[f"monotonicity_check_{i}"] = tb[cols[i + 1]] >= tb[cols[i]]
+                    check_vars.append(f"monotonicity_check_{i}")
+
+                # Create a column that checks if all the previous columns are True
+                tb["monotonicity_check"] = tb[check_vars].all(1)
+
+                # Count the nulls between the 10 decile share variables
+                tb["null_check"] = tb[cols].isnull().sum(1)
+
+                # Create a mask to check if all the previous columns are True
+                mask = (~tb["monotonicity_check"]) & (tb["null_check"] == 0)
+                tb_error = tb[mask].reset_index(drop=True).copy()
+
+                if not tb_error.empty:
+                    log.fatal(
+                        f"""{len(tb_error)} observations for {m}_{w}{EXTRAPOLATED_DICT[e]} are not monotonically increasing:
+                        {tabulate(tb_error[['country', 'year'] + cols], headers = 'keys', tablefmt = TABLEFMT, floatfmt=".2f")}"""
+                    )
+
+    return tb
+
+
+def check_avg_between_thr(tb: Table, welfare: list) -> Table:
+    """
+ Check that each avg is between the corresponding thr + """ + + tb = tb.copy() + + for e in EXTRAPOLATED_DICT: + for w in welfare: + check_cols = [] + check_nulls = [] + for i in range(0, 100, 10): + # Create lower bound, avg and upper bound columns + tb["thr_lower"] = tb[f"p{i}p{i+10}_thr_{w}{EXTRAPOLATED_DICT[e]}"] + tb["avg"] = tb[f"p{i}p{i+10}_avg_{w}{EXTRAPOLATED_DICT[e]}"] + + if i < 90: + tb["thr_upper"] = tb[f"p{i+10}p{i+20}_thr_{w}{EXTRAPOLATED_DICT[e]}"] + + # Count the nulls between the vars I am checking + tb[f"null_check_{i}"] = tb[["thr_lower", "avg", "thr_upper"]].isnull().sum(1) + + # Create check column + tb[f"check_{i}"] = (tb["avg"] >= tb["thr_lower"]) & (tb["avg"] <= tb["thr_upper"]) + else: + # Count the nulls between the vars I am checking + tb[f"null_check_{i}"] = tb[["thr_lower", "avg"]].isnull().sum(1) + + # Create check column + tb[f"check_{i}"] = tb["avg"] >= tb["thr_lower"] + + check_cols.append(f"check_{i}") + check_nulls.append(f"null_check_{i}") + + tb["check"] = tb[check_cols].all(1) + tb["null_check"] = tb[check_nulls].sum(1) + + mask = (~tb["check"]) & (tb["null_check"] == 0) + + tb_error = tb[mask].reset_index(drop=True).copy() + + if not tb_error.empty: + log.fatal( + f"""{len(tb_error)} observations for avg {w}{EXTRAPOLATED_DICT[e]} are not between the corresponding thresholds: + {tabulate(tb_error[['country', 'year'] + check_cols], headers = 'keys', tablefmt = TABLEFMT)}""" + ) + + return tb diff --git a/etl/steps/data/grapher/wid/2024-05-24/world_inequality_database.py b/etl/steps/data/grapher/wid/2024-05-24/world_inequality_database.py new file mode 100644 index 00000000000..04a42b95184 --- /dev/null +++ b/etl/steps/data/grapher/wid/2024-05-24/world_inequality_database.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("world_inequality_database") + + # Read table from garden dataset. + tb = ds_garden["world_inequality_database"] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/wid/2024-05-24/world_inequality_database.py b/etl/steps/data/meadow/wid/2024-05-24/world_inequality_database.py new file mode 100644 index 00000000000..fffdb61ebe4 --- /dev/null +++ b/etl/steps/data/meadow/wid/2024-05-24/world_inequality_database.py @@ -0,0 +1,223 @@ +"""Load a snapshot and create the World Inequality Dataset meadow dataset.""" + + +import owid.catalog.processing as pr +from owid.catalog import Table +from structlog import get_logger + +from etl.helpers import PathFinder, create_dataset + +# Initialize logger. +log = get_logger() + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + +# List of countries/regions not included in the ISO2 standard, but added by WID +iso2_missing = { + "CN-RU": "China (rural)", + "CN-UR": "China (urban)", + "DD": "East Germany", + "KS": "Kosovo", + "OA": "Other Russia and Central Asia (WID)", + "OB": "Other East Asia (WID)", + "OC": "Other Western Europe (WID)", + "OD": "Other Latin America (WID)", + "OE": "Other MENA (WID)", + "OH": "Other North America and Oceania (WID)", + "OI": "Other South & South-East Asia (WID)", + "OJ": "Other Sub-Saharan Africa (WID)", + "QB": "Africa (WID)", + "QD": "Asia (WID)", + "QE": "Europe (WID)", + "QF": "Oceania (WID)", + "QJ": "Central Asia (WID)", + "QK": "East Africa (WID)", + "QL": "East Asia (WID)", + "QM": "Eastern Europe (WID)", + "QN": "Middle Africa (WID)", + "QO": "North Africa (WID)", + "QP": "North America (WID)", + "QS": "South-East Asia (WID)", + "QT": "South Africa region (WID)", + "QU": "South Asia (WID)", + "QV": "West Africa (WID)", + "QW": "West Asia (WID)", + "QX": "Western Europe (WID)", + "QY": "European Union (WID)", + "WO": "World", + "XA": "Asia (excluding Middle East) (WID)", + "XB": "North America and Oceania (WID)", + "XF": "Sub-Saharan Africa (WID)", + "XL": "Latin America (WID)", + "XM": "Middle East (WID)", + "XN": "MENA (WID)", + "XR": "Russia and Central Asia (WID)", + "XS": "South & South-East Asia (WID)", + "ZZ": "Zanzibar", +} + +# Market exchange rates regions (we are not using them) +iso2_missing_mer = { + "OA-MER": "Other Russia and Central Asia (at market exchange rate) (WID)", + "OB-MER": "Other East Asia (at market exchange rate) (WID)", + "OC-MER": "Other Western Europe (at market exchange rate) (WID)", + "OD-MER": "Other Latin America (at market exchange rate) (WID)", + "OE-MER": "Other MENA (at market exchange rate) (WID)", + "OH-MER": "Other North America and Oceania (at market exchange rate) (WID)", + "OI-MER": "Other South & South-East Asia (at market exchange rate) (WID)", + "OJ-MER": "Other 
Sub-Saharan Africa (at market exchange rate) (WID)", + "QB-MER": "Africa (at market exchange rate) (WID)", + "QD-MER": "Asia (at market exchange rate) (WID)", + "QE-MER": "Europe (at market exchange rate) (WID)", + "QF-MER": "Oceania (at market exchange rate) (WID)", + "QJ-MER": "Central Asia (at market exchange rate) (WID)", + "QK-MER": "East Africa (at market exchange rate) (WID)", + "QL-MER": "East Asia (at market exchange rate) (WID)", + "QM-MER": "Eastern Europe (at market exchange rate) (WID)", + "QN-MER": "Middle Africa (at market exchange rate) (WID)", + "QO-MER": "North Africa (at market exchange rate) (WID)", + "QP-MER": "North America (at market exchange rate) (WID)", + "QS-MER": "South-East Asia (at market exchange rate) (WID)", + "QT-MER": "South Africa region (at market exchange rate) (WID)", + "QU-MER": "South Asia (at market exchange rate) (WID)", + "QV-MER": "West Africa (at market exchange rate) (WID)", + "QW-MER": "West Asia (at market exchange rate) (WID)", + "QX-MER": "Western Europe (at market exchange rate) (WID)", + "QY-MER": "European Union (at market exchange rate) (WID)", + "WO-MER": "World (at market exchange rate) (WID)", + "XA-MER": "Asia (excluding Middle East) (at market exchange rate) (WID)", + "XB-MER": "North America and Oceania (at market exchange rate) (WID)", + "XF-MER": "Sub-Saharan Africa (at market exchange rate) (WID)", + "XL-MER": "Latin America (at market exchange rate) (WID)", + "XM-MER": "Middle East (at market exchange rate) (WID)", + "XN-MER": "MENA (at market exchange rate) (WID)", + "XR-MER": "Russia and Central Asia (at market exchange rate) (WID)", + "XS-MER": "South & South-East Asia (at market exchange rate) (WID)", +} + +# Create a dictionary with the names of the snapshots and their id variables +snapshots_dict = { + "world_inequality_database": ["country", "year"], + "world_inequality_database_distribution": ["country", "year", "welfare", "p", "percentile"], +} + + +def run(dest_dir: str) -> None: + # Keep 
snapshot info for the main snapshot + snap_main = paths.load_snapshot("world_inequality_database.csv") + + # Load regions table + ds_regions = paths.load_dataset("regions") + tb_regions = ds_regions["regions"].reset_index() + + # + # Load inputs. + + # Initialize tables list + tables = [] + for tb_name, tb_ids in snapshots_dict.items(): + # Load data from snapshot. + # `keep_default_na` and `na_values` are included because there is a country labeled NA, Namibia, which becomes null without the parameters + na_values = [ + "-1.#IND", + "1.#QNAN", + "1.#IND", + "-1.#QNAN", + "#N/A N/A", + "#N/A", + "N/A", + "n/a", + "", + "#NA", + "NULL", + "null", + "NaN", + "-NaN", + "nan", + "-nan", + "", + ] + + # Retrieve snapshot. + snap = paths.load_snapshot(f"{tb_name}.csv") + tb = snap.read(keep_default_na=False, na_values=na_values) + + # Retrieve snapshot with extrapolations + snap = paths.load_snapshot(f"{tb_name}_with_extrapolations.csv") + # Load data from snapshot. + tb_extrapolations = snap.read(keep_default_na=False, na_values=na_values) + + # Combine both datasets + tb = pr.merge(tb, tb_extrapolations, on=tb_ids, how="outer", suffixes=("", "_extrapolated"), short_name=tb_name) + + # + # Process data. + # + # Harmonize countries + tb = harmonize_countries(tb, tb_regions, iso2_missing, iso2_missing_mer) + + # Set index and sort + tb = tb.format(tb_ids) + + # Append current table + tables.append(tb) + + # Add fiscal income data + snap_fiscal = paths.load_snapshot("world_inequality_database_fiscal.csv") + tb_fiscal = snap_fiscal.read(keep_default_na=False, na_values=na_values) + + # Harmonize countries + tb_fiscal = harmonize_countries(tb_fiscal, tb_regions, iso2_missing, iso2_missing_mer) + tb_fiscal.metadata.short_name = "world_inequality_database_fiscal" + tb_fiscal = tb_fiscal.format() + + # Create a new meadow dataset with the same metadata as the snapshot. 
+    ds_meadow = create_dataset(
+        dest_dir, tables=tables + [tb_fiscal], check_variables_metadata=True, default_metadata=snap_main.metadata
+    )
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
+
+
+# Country harmonization function, using both the reference country/regional OWID dataset and WID's `iso2_missing` list
+def harmonize_countries(tb: Table, tb_regions: Table, iso2_missing: dict, iso_2_missing_mer: dict) -> Table:
+    # Merge dataset and country dictionary to get the name of the country
+    tb = pr.merge(tb, tb_regions[["name", "iso_alpha2"]], left_on="country", right_on="iso_alpha2", how="left")
+
+    # Several countries are not matched, because WID amends the ISO-2 list with additional countries and regions
+    # See https://wid.world/codes-dictionary/#country-code
+
+    # Make country string to avoid problems with categorical data
+    tb["name"] = tb["name"].astype(str)
+    tb["country"] = tb["country"].astype(str)
+
+    # Replace missing items
+    for x, y in iso2_missing.items():
+        tb.loc[tb["country"] == x, "name"] = y
+
+    # Create list of unmatched entities
+    missing_list = list(tb[tb["name"] == "nan"]["country"].unique())
+    # Subtract iso2_missing_mer from missing_list
+    missing_list = [x for x in missing_list if x not in iso_2_missing_mer.keys()]
+    missing_count = len(missing_list)
+
+    # Warn if there are still entities missing
+    if missing_count > 0:
+        log.warning(
+            f"There are still {missing_count} unnamed WID countries/regions in {tb.m.short_name}! Take a look at this list:\n {missing_list}"
+        )
+
+    # Drop rows without match (MER if there was not any error)
+    tb = tb[~(tb["name"] == "nan")].reset_index(drop=True)
+
+    # Drop old country and ISO alpha 2 variable.
Rename the newly built variable as `country` + tb = tb.drop(columns=["country", "iso_alpha2"]) + tb = tb.rename(columns={"name": "country"}) + + # Move country and year to the beginning + cols_to_move = ["country", "year"] + tb = tb[cols_to_move + [col for col in tb.columns if col not in cols_to_move]] + + return tb diff --git a/snapshots/wid/2024-05-24/wid_indices.do b/snapshots/wid/2024-05-24/wid_indices.do new file mode 100644 index 00000000000..9833c695974 --- /dev/null +++ b/snapshots/wid/2024-05-24/wid_indices.do @@ -0,0 +1,308 @@ +/* +WID COMMANDS FOR OUR WORLD IN DATA + +This program extracts inequality data from LIS for three types of income and one type of wealth: + - Pretax national income, income before the payment and receipt of taxes and benefits, but after payment of public and private pensions. + - Post-tax disposable income, income that includes all cash redistribution through the tax and transfer system, but does not include in-kind benefits and therefore does not add up to national income. + - Post-tax national income, income that includes all cash redistribution through the tax and transfer system and also in-kind transfers (i.e., government consumption expenditures) to individuals. + - Net national wealth, the total value of non-financial and financial assets (housing, land, deposits, bonds, equities, etc.) held by households, minus their debts. + +The inequality variables extracted from here include Gini coefficients, averages, thresholds and shares per decile, statistics for the top 1, 0.1, 0.01 and 0.001% percentile and share ratios. +When needed, values are converted to PPP (2011 vintage) adjusted to prices of the most recent year available. + +HOW TO EXECUTE: + +1. Open this do-file in a local installation of Stata (execution time: ~40 minutes) +2. It generates four files, which exclude and include extrapolations. 
They need to be imported as snapshots in the ETL, as + python snapshots/wid/2024-05-24/world_inequality_database.py --path-to-file snapshots/wid/2024-05-24/wid_indices_992j_exclude.csv + python snapshots/wid/2024-05-24/world_inequality_database_with_extrapolations.py --path-to-file snapshots/wid/2024-05-24/wid_indices_992j_include.csv + python snapshots/wid/2024-05-24/world_inequality_database_distribution.py --path-to-file snapshots/wid/2024-05-24/wid_distribution_992j_exclude.csv + python snapshots/wid/2024-05-24/world_inequality_database_distribution_with_extrapolations.py --path-to-file snapshots/wid/2024-05-24/wid_distribution_992j_include.csv + python snapshots/wid/2024-05-24/world_inequality_database_fiscal.py --path-to-file snapshots/wid/2024-05-24/wid_indices_fiscal_992ijt_exclude.csv +3. After the execution above ends, delete the csv files created by this do-file. + + (Change the date for future updates) + +*/ + +////////////////////////////////////////////////////////////////////////////////////// +/* SETTINGS + +This code will run these two options automatically +1 is the main dataset with key indicators: Gini, thresholds, shares, averages +2 is the distributional dataset, that includes 130 fractiles +*/ + +global options 1 2 + +* Select age. The default is individuals over age 20 (992). See the full list: https://wid.world/codes-dictionary/#three-digit-code +global age 992 + +* Select population unit. The default is equal-split adults (j).See the full list (2.1.5): https://wid.world/codes-dictionary/#one-letter-code +global unit j + +*Select the dataset to extract. 
"all" for the entire LIS data, "test" for test data, small (CL GB) +global dataset = "all" + +/////////////////////////////////////////////////////////////////////////////////////// + +* If condition to select all the data or a part of it +if "$dataset" == "all" { + global areas _all +} + +else if "$dataset" == "test" { + global areas CL GB +} + +*Run this code to include and exclude extrapolations +global exclude_extrapolations 1 0 + +* Average and threshold indicators do not vary between key variables and distributional datasets +global indicators_avg_thr aptinc tptinc adiinc tdiinc acainc tcainc ahweal thweal + +*Show entire output +set more off + +*Get maximum year value to have to correct PPP conversion +qui wid, indicators(xlcusp) clear +qui sum year +global max_year = r(max) + +dis "Year of PPP data: $max_year" + +*Get ppp data to convert to USD +wid, indicators(xlcusp) year($max_year) clear +rename value ppp +tempfile ppp +save "`ppp'" + + +foreach option in $options { + + * Define different indicators and percentiles depending on the dataset + if `option' == 1 { + global indicators_gini_share sptinc gptinc sdiinc gdiinc scainc gcainc shweal ghweal + global percentiles p0p10 p10p20 p20p30 p30p40 p40p50 p50p60 p60p70 p70p80 p80p90 p90p100 p0p100 p0p50 p99p100 p99.9p100 p99.99p100 p99.999p100 + } + + else if `option' == 2 { + global indicators_gini_share sptinc sdiinc scainc shweal + global percentiles p0p1 p1p2 p2p3 p3p4 p4p5 p5p6 p6p7 p7p8 p8p9 p9p10 p10p11 p11p12 p12p13 p13p14 p14p15 p15p16 p16p17 p17p18 p18p19 p19p20 p20p21 p21p22 p22p23 p23p24 p24p25 p25p26 p26p27 p27p28 p28p29 p29p30 p30p31 p31p32 p32p33 p33p34 p34p35 p35p36 p36p37 p37p38 p38p39 p39p40 p40p41 p41p42 p42p43 p43p44 p44p45 p45p46 p46p47 p47p48 p48p49 p49p50 p50p51 p51p52 p52p53 p53p54 p54p55 p55p56 p56p57 p57p58 p58p59 p59p60 p60p61 p61p62 p62p63 p63p64 p64p65 p65p66 p66p67 p67p68 p68p69 p69p70 p70p71 p71p72 p72p73 p73p74 p74p75 p75p76 p76p77 p77p78 p78p79 p79p80 p80p81 p81p82 p82p83 
p83p84 p84p85 p85p86 p86p87 p87p88 p88p89 p89p90 p90p91 p91p92 p92p93 p93p94 p94p95 p95p96 p96p97 p97p98 p98p99 p99p100 p99p99.1 p99.1p99.2 p99.2p99.3 p99.3p99.4 p99.4p99.5 p99.5p99.6 p99.6p99.7 p99.7p99.8 p99.8p99.9 p99.9p100 p99.9p99.91 p99.91p99.92 p99.92p99.93 p99.93p99.94 p99.94p99.95 p99.95p99.96 p99.96p99.97 p99.97p99.98 p99.98p99.99 p99.99p100 p99.99p99.991 p99.991p99.992 p99.992p99.993 p99.993p99.994 p99.994p99.995 p99.995p99.996 p99.996p99.997 p99.997p99.998 p99.998p99.999 p99.999p100 + } + + foreach excl_option in $exclude_extrapolations { + + * If excl_option is 1 we will add exclude_option to the wid command and add "exclude" as a variable suffix + if `excl_option' == 1 { + global exclude_option exclude + local excl_option_slug = "exclude" + } + + * If excl_option is 0 no text is included to the wid command and "include" is added as a variable suffix + else if `excl_option' == 0 { + global exclude_option + local excl_option_slug = "include" + } + + *Get average and threshold income for pre tax and post tax (nat and dis) data + wid, indicators($indicators_avg_thr) perc($percentiles) areas($areas) ages($age) pop($unit) $exclude_option clear + + *Merge with ppp data to transform monetary values to international-$ + merge n:1 country using "`ppp'", keep(match) + replace value = value/ppp + drop ppp + drop _merge + tempfile avgthr + save "`avgthr'" + + *Gets shares and Gini for pre and post tax income + wid, indicators($indicators_gini_share) perc($percentiles) areas($areas) ages($age) pop($unit) $exclude_option clear + + *Union with average and threshold income + append using "`avgthr'" + + if `option' == 1 { + + *Variable adjustments to create a wide dataset + + *Create percentile-variable and country-year variables (used as indices when the table is reshaped) + egen varp = concat(percentile variable), punct(_) + egen couy = concat(country year), punct(+) + + *Drop variables to only keep joined indices + drop variable percentile country year + + *Replace 
all occurrences of "." in the newly created `varp` (mainly in p99.9p100 and similar) + *This is because names of variables with "." are not allowed + replace varp = subinstr(varp, ".", "_", .) + + *Reshape dataset: couy is the main index and varp are what Stata calls subobservations, in this case metrics associated with percentiles + reshape wide value, j(varp) i(couy) string + + *After the reshape, country and years are split into two variables again and the outcome is renamed + split couy, p(+) destring + rename couy1 country + rename couy2 year + + *Drop couy, as it is not longer needed + drop couy + + *Internal WID codes are replaced for more human-readable variable names + + rename value* * + rename *sptinc* *share_pretax + rename *gptinc* *gini_pretax + rename *aptinc* *avg_pretax + rename *tptinc* *thr_pretax + rename *sdiinc* *share_posttax_nat + rename *gdiinc* *gini_posttax_nat + rename *adiinc* *avg_posttax_nat + rename *tdiinc* *thr_posttax_nat + rename *scainc* *share_posttax_dis + rename *gcainc* *gini_posttax_dis + rename *acainc* *avg_posttax_dis + rename *tcainc* *thr_posttax_dis + rename *shweal* *share_wealth + rename *ghweal* *gini_wealth + rename *ahweal* *avg_wealth + rename *thweal* *thr_wealth + + *Drop shares and thresholds for the entire distribution, as they do not have relevance for analysis (or they repeat other numbers from the dataset) + *Same for some p0p50 indicators + drop p0p100_share* + drop p0p100_thr* + drop p0p50_avg* + drop p0p50_thr* + + *Define each income/wealth variable + local var_names pretax posttax_nat posttax_dis wealth + + *Calculate ratios for each variable + create a duplicate variable for median + * Also, generate a variable for the share between p90 and p99 and recalculate p50p90_share, because their components are more available. 
+ foreach var in `var_names' { + + gen palma_ratio_`var' = p90p100_share_`var' / (p0p50_share_`var' - p40p50_share_`var') + gen s90_s10_ratio_`var' = p90p100_share_`var' / p0p10_share_`var' + gen s80_s20_ratio_`var' = (p80p90_share_`var' + p90p100_share_`var') / (p0p10_share_`var' + p10p20_share_`var') + gen s90_s50_ratio_`var' = p90p100_share_`var' / p0p50_share_`var' + gen p90_p10_ratio_`var' = p90p100_thr_`var' / p10p20_thr_`var' + gen p90_p50_ratio_`var' = p90p100_thr_`var' / p50p60_thr_`var' + gen p50_p10_ratio_`var' = p50p60_thr_`var' / p10p20_thr_`var' + + gen median_`var' = p50p60_thr_`var' + + gen p90p99_share_`var' = p90p100_share_`var' - p99p100_share_`var' + + gen p50p90_share_`var' = p50p60_share_`var' + p60p70_share_`var' + p70p80_share_`var' + p80p90_share_`var' + + } + + *Order variables according to different variable groups + order country year *gini_pretax *gini*dis *gini*nat *gini_wealth *_ratio*pretax *_ratio*dis *_ratio*nat *_ratio*wealth *share_pretax *share*dis *share*nat *share_wealth *avg_pretax *avg*dis *avg*nat *avg_wealth *thr_pretax *thr*dis *thr*nat *thr_wealth median* + + + *Sort country and year + sort country year + + *Export csv + export delimited using "wid_indices_${age}${unit}_`excl_option_slug'.csv", replace + + } + + else if `option' == 2 { + + * Extract from variable the indicator (a,t,s) and welfare (ptinc, diinc, cainc, hweal) + gen indicator = substr(variable, 1, 1) + gen welfare = substr(variable, 2, 5) + + * Create an index variable to make the table wide + egen couypw = concat(country year percentile welfare), punct(+) + drop country year percentile welfare variable age pop + + * Make the table wide + reshape wide value, j(indicator) i(couypw) string + + * Split the index variable to recover the columns + split couypw, p(+) destring + + * Rename resulting columns and drop what's not needed + rename couypw1 country + rename couypw2 year + rename couypw3 percentile + rename couypw4 welfare + + drop couyp + + * Rename 
resulting average, share and threshold columns + rename valuea avg + rename values share + rename valuet thr + + * Replace welfare codes with new text + replace welfare = "pretax" if welfare == "ptinc" + replace welfare = "posttax_nat" if welfare == "diinc" + replace welfare = "posttax_dis" if welfare == "cainc" + replace welfare = "wealth" if welfare == "hweal" + + * Extract percentile from WID's name + split percentile, p(p) + destring percentile2, generate(p) + replace p = p/100 + drop percentile1 percentile2 percentile3 + + * Sort, order and save + sort country year p welfare + + order country year welfare percentile p thr avg share + + export delimited using "wid_distribution_${age}${unit}_`excl_option_slug'.csv", replace + + } + + } + +} + +* Add fiscal income data (Chartbook of Economic Inequality) +wid, indicators(sfiinc) perc(p99p100) ages(992) pop(i j t) exclude clear + +*Variable adjustments to create a wide dataset + +*Create percentile-variable and country-year variables (used as indices when the table is reshaped) +egen varp = concat(percentile variable), punct(_) +egen couy = concat(country year), punct(+) + +*Drop variables to only keep joined indices +drop variable percentile country year pop age + +*Replace all occurrences of "." in the newly created `varp` +*This is because names of variables with "." are not allowed +replace varp = subinstr(varp, ".", "_", .) 
+ +*Reshape dataset: couy is the main index and varp are what Stata calls subobservations, in this case metrics associated with percentiles +reshape wide value, j(varp) i(couy) string + +*After the reshape, country and years are split into two variables again and the outcome is renamed +split couy, p(+) destring +rename couy1 country +rename couy2 year + +*Drop couy, as it is not longer needed +drop couy + +*Internal WID codes are replaced for more human-readable variable names + +rename value* * +rename *sfiinc* *share_fiscal* + +*Order variables according to different variable groups +order country year *share* + +*Sort country and year +sort country year + +*Export csv +export delimited using "wid_indices_fiscal_992ijt_exclude.csv", replace + +exit, clear diff --git a/snapshots/wid/2024-05-24/world_inequality_database.csv.dvc b/snapshots/wid/2024-05-24/world_inequality_database.csv.dvc new file mode 100644 index 00000000000..e2118271a31 --- /dev/null +++ b/snapshots/wid/2024-05-24/world_inequality_database.csv.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Inequality Database (WID) + description: |- + The World Inequality Database (WID.world) aims to provide open and convenient access to the most extensive available database on the historical evolution of the world distribution of income and wealth, both within countries and between countries. 
+ date_published: 2024-03-28 + title_snapshot: World Inequality Database (WID) - Key indicators + + # Citation + producer: World Inequality Database (WID.world) + citation_full: |- + World Inequality Database (WID), https://wid.world + attribution_short: WID + + # Files + url_main: https://wid.world + date_accessed: 2024-04-04 + + # License + license: + name: CC BY 4.0 + url: https://wid.world/ + +wdir: ../../../data/snapshots/wid/2024-05-24 +outs: + - md5: d536ea088536db6cb0ee94ce47b22232 + size: 15409866 + path: world_inequality_database.csv diff --git a/snapshots/wid/2024-05-24/world_inequality_database.py b/snapshots/wid/2024-05-24/world_inequality_database.py new file mode 100644 index 00000000000..7a8ff0a3aac --- /dev/null +++ b/snapshots/wid/2024-05-24/world_inequality_database.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"wid/{SNAPSHOT_VERSION}/world_inequality_database.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/wid/2024-05-24/world_inequality_database_distribution.csv.dvc b/snapshots/wid/2024-05-24/world_inequality_database_distribution.csv.dvc new file mode 100644 index 00000000000..7c765a72f65 --- /dev/null +++ b/snapshots/wid/2024-05-24/world_inequality_database_distribution.csv.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Inequality Database (WID) + description: |- + The World Inequality Database (WID.world) aims to provide open and convenient access to the most extensive available database on the historical evolution of the world distribution of income and wealth, both within countries and between countries. + date_published: 2024-03-28 + title_snapshot: World Inequality Database (WID) - Distributional data + + # Citation + producer: World Inequality Database (WID.world) + citation_full: |- + World Inequality Database (WID), https://wid.world + attribution_short: WID + + # Files + url_main: https://wid.world + date_accessed: 2024-04-04 + + # License + license: + name: CC BY 4.0 + url: https://wid.world/ + +wdir: ../../../data/snapshots/wid/2024-05-24 +outs: + - md5: 3ce5fb14f333c1ba5be44a2453e8a1d9 + size: 201045888 + path: world_inequality_database_distribution.csv diff --git a/snapshots/wid/2024-05-24/world_inequality_database_distribution.py b/snapshots/wid/2024-05-24/world_inequality_database_distribution.py new file mode 100644 index 00000000000..93f9612730c --- /dev/null +++ b/snapshots/wid/2024-05-24/world_inequality_database_distribution.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. 
+SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"wid/{SNAPSHOT_VERSION}/world_inequality_database_distribution.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/wid/2024-05-24/world_inequality_database_distribution_with_extrapolations.csv.dvc b/snapshots/wid/2024-05-24/world_inequality_database_distribution_with_extrapolations.csv.dvc new file mode 100644 index 00000000000..31ec4a60a27 --- /dev/null +++ b/snapshots/wid/2024-05-24/world_inequality_database_distribution_with_extrapolations.csv.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Inequality Database (WID) + description: |- + The World Inequality Database (WID.world) aims to provide open and convenient access to the most extensive available database on the historical evolution of the world distribution of income and wealth, both within countries and between countries. 
+ date_published: 2024-03-28 + title_snapshot: World Inequality Database (WID) - Distributional data (with extrapolations) + + # Citation + producer: World Inequality Database (WID.world) + citation_full: |- + World Inequality Database (WID), https://wid.world + attribution_short: WID + + # Files + url_main: https://wid.world + date_accessed: 2024-04-04 + + # License + license: + name: CC BY 4.0 + url: https://wid.world/ + +wdir: ../../../data/snapshots/wid/2024-05-24 +outs: + - md5: bc3109d6abc222ee3be0e447ac93f216 + size: 269127003 + path: world_inequality_database_distribution_with_extrapolations.csv diff --git a/snapshots/wid/2024-05-24/world_inequality_database_distribution_with_extrapolations.py b/snapshots/wid/2024-05-24/world_inequality_database_distribution_with_extrapolations.py new file mode 100644 index 00000000000..37f2485c188 --- /dev/null +++ b/snapshots/wid/2024-05-24/world_inequality_database_distribution_with_extrapolations.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"wid/{SNAPSHOT_VERSION}/world_inequality_database_distribution_with_extrapolations.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/wid/2024-05-24/world_inequality_database_fiscal.csv.dvc b/snapshots/wid/2024-05-24/world_inequality_database_fiscal.csv.dvc new file mode 100644 index 00000000000..fbb8f7ac8b1 --- /dev/null +++ b/snapshots/wid/2024-05-24/world_inequality_database_fiscal.csv.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Inequality Database (WID) + description: |- + The World Inequality Database (WID.world) aims to provide open and convenient access to the most extensive available database on the historical evolution of the world distribution of income and wealth, both within countries and between countries. + date_published: 2024-03-28 + title_snapshot: World Inequality Database (WID) - Fiscal income + + # Citation + producer: World Inequality Database (WID.world) + citation_full: |- + World Inequality Database (WID), https://wid.world + attribution_short: WID + + # Files + url_main: https://wid.world + date_accessed: 2024-05-09 + + # License + license: + name: CC BY 4.0 + url: https://wid.world/ + +wdir: ../../../data/snapshots/wid/2024-05-24 +outs: + - md5: 7f55e871e47be0fc321c02d26d4389ea + size: 142857 + path: world_inequality_database_fiscal.csv diff --git a/snapshots/wid/2024-05-24/world_inequality_database_fiscal.py b/snapshots/wid/2024-05-24/world_inequality_database_fiscal.py new file mode 100644 index 00000000000..edd8e971ddf --- /dev/null +++ b/snapshots/wid/2024-05-24/world_inequality_database_fiscal.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. 
+SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"wid/{SNAPSHOT_VERSION}/world_inequality_database_fiscal.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/wid/2024-05-24/world_inequality_database_with_extrapolations.csv.dvc b/snapshots/wid/2024-05-24/world_inequality_database_with_extrapolations.csv.dvc new file mode 100644 index 00000000000..8763c078efc --- /dev/null +++ b/snapshots/wid/2024-05-24/world_inequality_database_with_extrapolations.csv.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Inequality Database (WID) + description: |- + The World Inequality Database (WID.world) aims to provide open and convenient access to the most extensive available database on the historical evolution of the world distribution of income and wealth, both within countries and between countries. 
+ date_published: 2024-03-28 + title_snapshot: World Inequality Database (WID) - Key indicators (with extrapolations) + + # Citation + producer: World Inequality Database (WID.world) + citation_full: |- + World Inequality Database (WID), https://wid.world + attribution_short: WID + + # Files + url_main: https://wid.world + date_accessed: 2024-04-04 + + # License + license: + name: CC BY 4.0 + url: https://wid.world/ + +wdir: ../../../data/snapshots/wid/2024-05-24 +outs: + - md5: 152114c0fee66bce7939867c18dc5b7f + size: 20368809 + path: world_inequality_database_with_extrapolations.csv diff --git a/snapshots/wid/2024-05-24/world_inequality_database_with_extrapolations.py b/snapshots/wid/2024-05-24/world_inequality_database_with_extrapolations.py new file mode 100644 index 00000000000..dfbf256c0d8 --- /dev/null +++ b/snapshots/wid/2024-05-24/world_inequality_database_with_extrapolations.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"wid/{SNAPSHOT_VERSION}/world_inequality_database_with_extrapolations.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main()