diff --git a/apps/wizard/app_pages/chart_diff/chart_diff.py b/apps/wizard/app_pages/chart_diff/chart_diff.py index af74d1fe8f2..3ed8b71f3a0 100644 --- a/apps/wizard/app_pages/chart_diff/chart_diff.py +++ b/apps/wizard/app_pages/chart_diff/chart_diff.py @@ -1,5 +1,6 @@ import datetime as dt import difflib +import json import pprint from typing import Any, Dict, List, Optional @@ -653,6 +654,7 @@ def _modified_chart_configs_on_staging( select c.id as chartId, MD5(cc.full) as chartChecksum, + cc.full as chartConfig, c.lastEditedByUserId as chartLastEditedByUserId, c.publishedByUserId as chartPublishedByUserId, c.lastEditedAt as chartLastEditedAt @@ -699,6 +701,20 @@ def _modified_chart_configs_on_staging( diff = source_df.copy() diff["configEdited"] = source_df["chartChecksum"] != target_df["chartChecksum"] + # Go through edited configs and do a more detailed comparison + ix = diff["configEdited"] & target_df["chartChecksum"].notnull() + equal_configs = [] + for chart_id, row in diff.loc[ix].iterrows(): + source_config = json.loads(row["chartConfig"]) + target_config = json.loads(target_df.loc[chart_id, "chartConfig"]) + + # Compare configs + if configs_are_equal(source_config, target_config): + equal_configs.append(chart_id) + + # Exclude configs that have different chartChecksum, but are actually the same (e.g. have just different version) + diff = diff[~diff.index.isin(equal_configs)] + # Add flag 'edited in staging' diff["chartEditedInStaging"] = True diff --git a/etl/steps/data/garden/covid/latest/sequence.meta.yml b/etl/steps/data/garden/covid/latest/sequence.meta.yml index acebff8b8e4..1f79931b216 100644 --- a/etl/steps/data/garden/covid/latest/sequence.meta.yml +++ b/etl/steps/data/garden/covid/latest/sequence.meta.yml @@ -21,13 +21,13 @@ tables: num_sequences: title: "Number of sequenced COVID-19 genomes - Variant: << variant >>" description_short: |- - <% set mapping = dict( - non_who="The number of analyzed sequences in the preceding two weeks that correspond to non-relevant variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.", - other="The number of analyzed sequences in the preceding two weeks that correspond to non-categorised variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.", - else="The number of analyzed sequences in the preceding two weeks that correspond to variant group '<< variant >>'. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced." - ) %> - - << mapping.get(variant, mapping['else']) >> + <% if variant == 'non_who' %> + The number of analyzed sequences in the preceding two weeks that correspond to non-relevant variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced. + <% elif variant == 'other' %> + The number of analyzed sequences in the preceding two weeks that correspond to non-categorised variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced. + <% else %> + The number of analyzed sequences in the preceding two weeks that correspond to variant group '<< variant >>'. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced. + <%- endif -%> unit: "sequenced genomes" display: tolerance: 28 diff --git a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml index fd736fb1874..f3f04bbd41d 100644 --- a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml +++ b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml @@ -29,26 +29,28 @@ definitions: Non-market sources of income, including food grown by subsistence farmers for their own consumption, are taken into account. description_key_scenarios: |- - <% if scenario == "Historical" %> + <% if scenario == "Historical estimates" %> Estimates are based on household surveys or extrapolated up until the year of the data release using GDP growth estimates and forecasts. For more details about the methodology, please refer to the [World Bank PIP documentation](https://datanalytics.worldbank.org/PIP-Methodology/lineupestimates.html#nowcasts). - <% elif scenario == "Current forecast + historical growth" %> + <% elif scenario == "Current forecast + historical growth projections" %> This data is a projection of the estimates based on GDP growth projections from the World Bank's Global Economic Prospects and the the Macro Poverty Outlook, together with IMF's World Economic Outlook, in the period 2025-2029. For the period 2030-2050, the data is projected using the average annual historical GDP per capita growth over 2010-2019. - <% elif scenario == "2% growth" %> + <% elif scenario == "Historical estimates + projections" %> + This data combines data based on household surveys or extrapolated up until the year of the data release using GDP growth estimates and forecasts, with projections based on GDP growth projections from the World Bank's Global Economic Prospects and the the Macro Poverty Outlook, together with IMF's World Economic Outlook, in the period 2025-2029. For the period 2030-2050, the data is projected using the average annual historical GDP per capita growth over 2010-2019. + <% elif scenario == "2% growth projections" %> This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while keeping income inequality constant. - <% elif scenario == "2% growth + Gini reduction 1%" %> + <% elif scenario == "2% growth + Gini reduction 1% projections" %> This data is a projection of the estimates based on a scenatio of 2% average GDP per capita growth, while reducing income inequality by 1% of the Gini coefficient per year. - <% elif scenario == "2% growth + Gini reduction 2%" %> + <% elif scenario == "2% growth + Gini reduction 2% projections" %> This data is a projection of the estimates based on a scenatio of 2% average GDP per capita growth, while reducing income inequality by 2% of the Gini coefficient per year. - <% elif scenario == "4% growth" %> + <% elif scenario == "4% growth projections" %> This data is a projection of the estimates based on a scenario of 4% average GDP per capita growth, while keeping income inequality constant. - <% elif scenario == "6% growth" %> + <% elif scenario == "6% growth projections" %> This data is a projection of the estimates based on a scenario of 6% average GDP per capita growth, while keeping income inequality constant. - <% elif scenario == "8% growth" %> + <% elif scenario == "8% growth projections" %> This data is a projection of the estimates based on a scenario of 8% average GDP per capita growth, while keeping income inequality constant. <%- endif -%> isprojection_by_scenario: |- - <% if scenario == "Historical" %> + <% if scenario == "Historical estimates" or scenario == "Historical estimates + projections" %> false <% else %> true diff --git a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py index fa62ff97730..66e637c2fcd 100644 --- a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py +++ b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py @@ -18,14 +18,14 @@ # Define scenarios and new names SCENARIOS = { - "historical": "Historical", - "current_forecast": "Current forecast + historical growth", - "2pct": "2% growth", - "2pct_gini1": "2% growth + Gini reduction 1%", - "2pct_gini2": "2% growth + Gini reduction 2%", - "4pct": "4% growth", - "6pct": "6% growth", - "8pct": "8% growth", + "historical": "Historical estimates", + "current_forecast": "Current forecast + historical growth projections", + "2pct": "2% growth projections", + "2pct_gini1": "2% growth + Gini reduction 1% projections", + "2pct_gini2": "2% growth + Gini reduction 2% projections", + "4pct": "4% growth projections", + "6pct": "6% growth projections", + "8pct": "8% growth projections", } # Define index columns @@ -92,6 +92,10 @@ def connect_estimates_with_projections(tb: Table) -> Table: tb = tb.copy() + # Save tb_historical and tb_current_forecast, by filtering scenario in historical and current_forecast + tb_historical = tb[tb["scenario"] == "historical"].copy().reset_index(drop=True) + tb_current_forecast = tb[tb["scenario"] == "current_forecast"].copy().reset_index(drop=True) + # Make table wider, by using scenario as columns tb = tb.pivot(index=["country", "year", "povertyline"], columns="scenario", values=INDICATOR_COLUMNS) @@ -116,4 +120,16 @@ def connect_estimates_with_projections(tb: Table) -> Table: for indicator in INDICATOR_COLUMNS: tb[indicator] = tb[indicator].copy_metadata(tb["country"]) + # Combine historical and current_forecast, by concatenating tb_historical and tb_current_forecast + tb_connected = pr.concat([tb_historical, tb_current_forecast], ignore_index=True) + + # Rename scenario column to "Historical + current forecast + historical growth" + tb_connected["scenario"] = "Historical estimates + projections" + + # Keep only the columns in INDEX_COLUMNS and INDICATOR_COLUMNS + tb_connected = tb_connected[INDEX_COLUMNS + INDICATOR_COLUMNS] + + # Concatenate tb and tb_connected + tb = pr.concat([tb, tb_connected], ignore_index=True) + return tb diff --git a/lib/catalog/owid/catalog/datasets.py b/lib/catalog/owid/catalog/datasets.py index 745563a2d81..d24f55d2c5f 100644 --- a/lib/catalog/owid/catalog/datasets.py +++ b/lib/catalog/owid/catalog/datasets.py @@ -119,7 +119,7 @@ def add( utils.validate_underscore(col, "Variable's name") if not table.primary_key: - if "OWID_STRICT" in environ: + if environ.get("OWID_STRICT"): raise PrimaryKeyMissing( f"Table `{table.metadata.short_name}` does not have a primary_key -- please use t.set_index([col, ...], verify_integrity=True) to indicate dimensions before saving" ) @@ -128,7 +128,7 @@ def add( f"Table `{table.metadata.short_name}` does not have a primary_key -- please use t.set_index([col, ...], verify_integrity=True) to indicate dimensions before saving" ) - if not table.index.is_unique and "OWID_STRICT" in environ: + if not table.index.is_unique and environ.get("OWID_STRICT"): [(k, dups)] = table.index.value_counts().head(1).to_dict().items() raise NonUniqueIndex( f"Table `{table.metadata.short_name}` has duplicate values in the index -- could you have made a mistake?\n\n" diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc index 555d87e87a4..d103b37e950 100644 --- a/snapshots/climate/latest/weekly_wildfires.csv.dvc +++ b/snapshots/climate/latest/weekly_wildfires.csv.dvc @@ -9,12 +9,12 @@ meta: citation_full: Global Wildfire Information System attribution_short: GWIS url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend - date_accessed: 2024-12-10 - date_published: 2024-12-10 + date_accessed: 2024-12-11 + date_published: 2024-12-11 license: name: CC BY 4.0 url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license outs: - - md5: d1de4bd7ac3c08a0dcc6eb63f891f71b - size: 12799309 + - md5: fc6f8b908a2988b2d8048707526c460a + size: 12799310 path: weekly_wildfires.csv diff --git a/snapshots/covid/latest/cases_deaths.csv.dvc b/snapshots/covid/latest/cases_deaths.csv.dvc index 2552d99dd6e..247507293dc 100644 --- a/snapshots/covid/latest/cases_deaths.csv.dvc +++ b/snapshots/covid/latest/cases_deaths.csv.dvc @@ -22,7 +22,7 @@ meta: version_producer: WHO COVID-19 Dashboard - Daily cases and deaths url_main: https://covid19.who.int/ url_download: https://srhdpeuwpubsa.blob.core.windows.net/whdh/COVID/WHO-COVID-19-global-daily-data.csv - date_accessed: 2024-12-10 + date_accessed: 2024-12-11 date_published: '2024-07-07' license: name: CC BY 4.0 diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc index bdc68216782..29102377237 100644 --- a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc +++ b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc @@ -13,7 +13,7 @@ meta: HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality). url: https://www.mortality.org/Data/STMF source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv - date_accessed: 2024-12-10 + date_accessed: 2024-12-11 publication_date: 2024-11-11 publication_year: 2024 published_by: |- diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc index ffdbd6a06db..d7b4d86e4fd 100644 --- a/snapshots/excess_mortality/latest/wmd.csv.dvc +++ b/snapshots/excess_mortality/latest/wmd.csv.dvc @@ -13,7 +13,7 @@ meta: Published paper available at https://elifesciences.org/articles/69336. url: https://github.com/akarlinsky/world_mortality/ source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv - date_accessed: 2024-12-10 + date_accessed: 2024-12-11 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc index 71e3a116720..91a48ea6e6d 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc @@ -7,7 +7,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv - date_accessed: 2024-12-10 + date_accessed: 2024-12-11 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc index b3cf92134a5..4571161e0bd 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc @@ -6,7 +6,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv - date_accessed: 2024-12-10 + date_accessed: 2024-12-11 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/health/latest/global_health_mpox.csv.dvc b/snapshots/health/latest/global_health_mpox.csv.dvc index 516ded55647..4d27ab747a3 100644 --- a/snapshots/health/latest/global_health_mpox.csv.dvc +++ b/snapshots/health/latest/global_health_mpox.csv.dvc @@ -22,6 +22,6 @@ meta: url: https://global.health/terms-of-use/ outs: - - md5: 7928d79ed3caf862d86ba729737fc255 - size: 16733780 + - md5: 08388d2230adafbb7fe28ddcd1eb0dc8 + size: 16813136 path: global_health_mpox.csv diff --git a/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc b/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc index 2e560863971..08689d46ba7 100644 --- a/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc +++ b/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc @@ -12,6 +12,7 @@ meta: producer: Lakner et al. citation_full: |- Lakner, C., Genoni, M. E., Stemmler, H., Yonzan, N., & Tetteh Baah, S. K. (2024). Reproducibility package for Poverty, Prosperity and Planet Report 2024. World Bank. https://doi.org/10.60572/KGE4-CX54 + attribution: Lakner et al. (2024). Reproducibility package for Poverty, Prosperity and Planet Report 2024 # Files url_main: https://reproducibility.worldbank.org/index.php/catalog/189/ diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc index c98234464de..30b2f569464 100644 --- a/snapshots/who/latest/fluid.csv.dvc +++ b/snapshots/who/latest/fluid.csv.dvc @@ -16,6 +16,6 @@ meta: The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza. wdir: ../../../data/snapshots/who/latest outs: - - md5: 516f378e03682d099c5bdcecb732b38b - size: 168097330 + - md5: 811f5ca9e719e680bc1cde286e599f9d + size: 168107745 path: fluid.csv diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc index dc62b167acc..6a11439d09e 100644 --- a/snapshots/who/latest/flunet.csv.dvc +++ b/snapshots/who/latest/flunet.csv.dvc @@ -16,6 +16,6 @@ meta: The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases. wdir: ../../../data/snapshots/who/latest outs: - - md5: 50775d6806b50d572bc79031134bc3e3 - size: 27221232 + - md5: b687f5f92351d148e71bb3b5d60c0c50 + size: 27222953 path: flunet.csv diff --git a/tests/test_datadiff.py b/tests/test_datadiff.py index 316ecfdeaf1..be4466781c1 100644 --- a/tests/test_datadiff.py +++ b/tests/test_datadiff.py @@ -1,3 +1,6 @@ +import os +from unittest.mock import patch + import pandas as pd from owid.catalog import Dataset, DatasetMeta, Table @@ -19,6 +22,7 @@ def _create_datasets(tmp_path): return ds_a, ds_b +@patch.dict(os.environ, {"OWID_STRICT": ""}) def test_DatasetDiff_summary(tmp_path): ds_a, ds_b = _create_datasets(tmp_path) @@ -43,6 +47,7 @@ def test_DatasetDiff_summary(tmp_path): ] +@patch.dict(os.environ, {"OWID_STRICT": ""}) def test_new_data(tmp_path): ds_a, ds_b = _create_datasets(tmp_path) diff --git a/tests/test_steps.py b/tests/test_steps.py index ff266f1917d..5693fcd05fd 100644 --- a/tests/test_steps.py +++ b/tests/test_steps.py @@ -15,6 +15,7 @@ from unittest.mock import patch import pandas as pd +import requests from owid.catalog import Dataset from etl import paths @@ -162,7 +163,11 @@ def test_select_dirty_steps(): def test_get_etag(): - etag = get_etag("https://raw.githubusercontent.com/owid/owid-grapher/master/README.md") + try: + etag = get_etag("https://raw.githubusercontent.com/owid/owid-grapher/master/README.md") + # ignore SSL errors + except requests.exceptions.SSLError: + return assert etag