Merge branch 'master' of github.com:owid/etl into wizard-on-provider-…

…analytics
owid · Dec 11, 2024 · 0983da2 · 0983da2
2 parents 29f00b6 + b8d4a0c
commit 0983da2
Show file tree

Hide file tree

Showing 17 changed files with 87 additions and 42 deletions.
diff --git a/apps/wizard/app_pages/chart_diff/chart_diff.py b/apps/wizard/app_pages/chart_diff/chart_diff.py
@@ -1,5 +1,6 @@
 import datetime as dt
 import difflib
+import json
 import pprint
 from typing import Any, Dict, List, Optional
 
@@ -653,6 +654,7 @@ def _modified_chart_configs_on_staging(
     select
         c.id as chartId,
         MD5(cc.full) as chartChecksum,
+        cc.full as chartConfig,
         c.lastEditedByUserId as chartLastEditedByUserId,
         c.publishedByUserId as chartPublishedByUserId,
         c.lastEditedAt as chartLastEditedAt
@@ -699,6 +701,20 @@ def _modified_chart_configs_on_staging(
     diff = source_df.copy()
     diff["configEdited"] = source_df["chartChecksum"] != target_df["chartChecksum"]
 
+    # Go through edited configs and do a more detailed comparison
+    ix = diff["configEdited"] & target_df["chartChecksum"].notnull()
+    equal_configs = []
+    for chart_id, row in diff.loc[ix].iterrows():
+        source_config = json.loads(row["chartConfig"])
+        target_config = json.loads(target_df.loc[chart_id, "chartConfig"])
+
+        # Compare configs
+        if configs_are_equal(source_config, target_config):
+            equal_configs.append(chart_id)
+
+    # Exclude configs that have different chartChecksum, but are actually the same (e.g. have just different version)
+    diff = diff[~diff.index.isin(equal_configs)]
+
     # Add flag 'edited in staging'
     diff["chartEditedInStaging"] = True
 

diff --git a/etl/steps/data/garden/covid/latest/sequence.meta.yml b/etl/steps/data/garden/covid/latest/sequence.meta.yml
@@ -21,13 +21,13 @@ tables:
       num_sequences:
         title: "Number of sequenced COVID-19 genomes - Variant: << variant >>"
         description_short: |-
-          <% set mapping = dict(
-            non_who="The number of analyzed sequences in the preceding two weeks that correspond to non-relevant variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.",
-            other="The number of analyzed sequences in the preceding two weeks that correspond to non-categorised variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.",
-            else="The number of analyzed sequences in the preceding two weeks that correspond to variant group '<< variant >>'. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced."
-          ) %>
-
-          << mapping.get(variant, mapping['else']) >>
+          <% if variant == 'non_who' %>
+          The number of analyzed sequences in the preceding two weeks that correspond to non-relevant variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.
+          <% elif variant == 'other' %>
+          The number of analyzed sequences in the preceding two weeks that correspond to non-categorised variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.
+          <% else %>
+          The number of analyzed sequences in the preceding two weeks that correspond to variant group '<< variant >>'. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.
+          <%- endif -%>
         unit: "sequenced genomes"
         display:
           tolerance: 28

diff --git a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml
@@ -29,26 +29,28 @@ definitions:
     Non-market sources of income, including food grown by subsistence farmers for their own consumption, are taken into account.
 
   description_key_scenarios: |-
-    <% if scenario == "Historical" %>
+    <% if scenario == "Historical estimates" %>
     Estimates are based on household surveys or extrapolated up until the year of the data release using GDP growth estimates and forecasts. For more details about the methodology, please refer to the [World Bank PIP documentation](https://datanalytics.worldbank.org/PIP-Methodology/lineupestimates.html#nowcasts).
-    <% elif scenario == "Current forecast + historical growth" %>
+    <% elif scenario == "Current forecast + historical growth projections" %>
     This data is a projection of the estimates based on GDP growth projections from the World Bank's Global Economic Prospects and the Macro Poverty Outlook, together with IMF's World Economic Outlook, in the period 2025-2029. For the period 2030-2050, the data is projected using the average annual historical GDP per capita growth over 2010-2019.
-    <% elif scenario == "2% growth" %>
+    <% elif scenario == "Historical estimates + projections" %>
+    This data combines data based on household surveys or extrapolated up until the year of the data release using GDP growth estimates and forecasts, with projections based on GDP growth projections from the World Bank's Global Economic Prospects and the Macro Poverty Outlook, together with IMF's World Economic Outlook, in the period 2025-2029. For the period 2030-2050, the data is projected using the average annual historical GDP per capita growth over 2010-2019.
+    <% elif scenario == "2% growth projections" %>
     This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while keeping income inequality constant.
-    <% elif scenario == "2% growth + Gini reduction 1%" %>
+    <% elif scenario == "2% growth + Gini reduction 1% projections" %>
     This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while reducing income inequality by 1% of the Gini coefficient per year.
-    <% elif scenario == "2% growth + Gini reduction 2%" %>
+    <% elif scenario == "2% growth + Gini reduction 2% projections" %>
     This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while reducing income inequality by 2% of the Gini coefficient per year.
-    <% elif scenario == "4% growth" %>
+    <% elif scenario == "4% growth projections" %>
     This data is a projection of the estimates based on a scenario of 4% average GDP per capita growth, while keeping income inequality constant.
-    <% elif scenario == "6% growth" %>
+    <% elif scenario == "6% growth projections" %>
     This data is a projection of the estimates based on a scenario of 6% average GDP per capita growth, while keeping income inequality constant.
-    <% elif scenario == "8% growth" %>
+    <% elif scenario == "8% growth projections" %>
     This data is a projection of the estimates based on a scenario of 8% average GDP per capita growth, while keeping income inequality constant.
     <%- endif -%>
 
   isprojection_by_scenario: |-
-    <% if scenario == "Historical" %>
+    <% if scenario == "Historical estimates" or scenario == "Historical estimates + projections" %>
     false
     <% else %>
     true

diff --git a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py
@@ -18,14 +18,14 @@
 
 # Define scenarios and new names
 SCENARIOS = {
-    "historical": "Historical",
-    "current_forecast": "Current forecast + historical growth",
-    "2pct": "2% growth",
-    "2pct_gini1": "2% growth + Gini reduction 1%",
-    "2pct_gini2": "2% growth + Gini reduction 2%",
-    "4pct": "4% growth",
-    "6pct": "6% growth",
-    "8pct": "8% growth",
+    "historical": "Historical estimates",
+    "current_forecast": "Current forecast + historical growth projections",
+    "2pct": "2% growth projections",
+    "2pct_gini1": "2% growth + Gini reduction 1% projections",
+    "2pct_gini2": "2% growth + Gini reduction 2% projections",
+    "4pct": "4% growth projections",
+    "6pct": "6% growth projections",
+    "8pct": "8% growth projections",
 }
 
 # Define index columns
@@ -92,6 +92,10 @@ def connect_estimates_with_projections(tb: Table) -> Table:
 
     tb = tb.copy()
 
+    # Save tb_historical and tb_current_forecast, by filtering scenario in historical and current_forecast
+    tb_historical = tb[tb["scenario"] == "historical"].copy().reset_index(drop=True)
+    tb_current_forecast = tb[tb["scenario"] == "current_forecast"].copy().reset_index(drop=True)
+
     # Make table wider, by using scenario as columns
     tb = tb.pivot(index=["country", "year", "povertyline"], columns="scenario", values=INDICATOR_COLUMNS)
 
@@ -116,4 +120,16 @@ def connect_estimates_with_projections(tb: Table) -> Table:
     for indicator in INDICATOR_COLUMNS:
         tb[indicator] = tb[indicator].copy_metadata(tb["country"])
 
+    # Combine historical and current_forecast, by concatenating tb_historical and tb_current_forecast
+    tb_connected = pr.concat([tb_historical, tb_current_forecast], ignore_index=True)
+
+    # Label every row of the combined table with the scenario "Historical estimates + projections"
+    tb_connected["scenario"] = "Historical estimates + projections"
+
+    # Keep only the columns in INDEX_COLUMNS and INDICATOR_COLUMNS
+    tb_connected = tb_connected[INDEX_COLUMNS + INDICATOR_COLUMNS]
+
+    # Concatenate tb and tb_connected
+    tb = pr.concat([tb, tb_connected], ignore_index=True)
+
     return tb
diff --git a/lib/catalog/owid/catalog/datasets.py b/lib/catalog/owid/catalog/datasets.py
@@ -119,7 +119,7 @@ def add(
             utils.validate_underscore(col, "Variable's name")
 
         if not table.primary_key:
-            if "OWID_STRICT" in environ:
+            if environ.get("OWID_STRICT"):
                 raise PrimaryKeyMissing(
                     f"Table `{table.metadata.short_name}` does not have a primary_key -- please use t.set_index([col, ...], verify_integrity=True) to indicate dimensions before saving"
                 )
@@ -128,7 +128,7 @@ def add(
                     f"Table `{table.metadata.short_name}` does not have a primary_key -- please use t.set_index([col, ...], verify_integrity=True) to indicate dimensions before saving"
                 )
 
-        if not table.index.is_unique and "OWID_STRICT" in environ:
+        if not table.index.is_unique and environ.get("OWID_STRICT"):
             [(k, dups)] = table.index.value_counts().head(1).to_dict().items()
             raise NonUniqueIndex(
                 f"Table `{table.metadata.short_name}` has duplicate values in the index -- could you have made a mistake?\n\n"

diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc
@@ -9,12 +9,12 @@ meta:
     citation_full: Global Wildfire Information System
     attribution_short: GWIS
     url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend
-    date_accessed: 2024-12-10
-    date_published: 2024-12-10
+    date_accessed: 2024-12-11
+    date_published: 2024-12-11
     license:
       name: CC BY 4.0
       url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license
 outs:
-  - md5: d1de4bd7ac3c08a0dcc6eb63f891f71b
-    size: 12799309
+  - md5: fc6f8b908a2988b2d8048707526c460a
+    size: 12799310
     path: weekly_wildfires.csv
diff --git a/snapshots/covid/latest/cases_deaths.csv.dvc b/snapshots/covid/latest/cases_deaths.csv.dvc
@@ -22,7 +22,7 @@ meta:
     version_producer: WHO COVID-19 Dashboard - Daily cases and deaths
     url_main: https://covid19.who.int/
     url_download: https://srhdpeuwpubsa.blob.core.windows.net/whdh/COVID/WHO-COVID-19-global-daily-data.csv
-    date_accessed: 2024-12-10
+    date_accessed: 2024-12-11
     date_published: '2024-07-07'
     license:
       name: CC BY 4.0

diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc
@@ -13,7 +13,7 @@ meta:
       HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality).
     url: https://www.mortality.org/Data/STMF
     source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv
-    date_accessed: 2024-12-10
+    date_accessed: 2024-12-11
     publication_date: 2024-11-11
     publication_year: 2024
     published_by: |-

diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc
@@ -13,7 +13,7 @@ meta:
       Published paper available at https://elifesciences.org/articles/69336.
     url: https://github.com/akarlinsky/world_mortality/
     source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv
-    date_accessed: 2024-12-10
+    date_accessed: 2024-12-11
     publication_date: '2021-06-30'
     publication_year: 2021
     published_by: |-

diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc
@@ -7,7 +7,7 @@ meta:
       For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic.
     url: https://github.com/dkobak/excess-mortality
     source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv
-    date_accessed: 2024-12-10
+    date_accessed: 2024-12-11
     publication_date: '2021-06-30'
     publication_year: 2021
     published_by: |-

diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc
@@ -6,7 +6,7 @@ meta:
       For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic.
     url: https://github.com/dkobak/excess-mortality
     source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv
-    date_accessed: 2024-12-10
+    date_accessed: 2024-12-11
     publication_date: '2021-06-30'
     publication_year: 2021
     published_by: |-

diff --git a/snapshots/health/latest/global_health_mpox.csv.dvc b/snapshots/health/latest/global_health_mpox.csv.dvc
@@ -22,6 +22,6 @@ meta:
       url: https://global.health/terms-of-use/
 
 outs:
-  - md5: 7928d79ed3caf862d86ba729737fc255
-    size: 16733780
+  - md5: 08388d2230adafbb7fe28ddcd1eb0dc8
+    size: 16813136
     path: global_health_mpox.csv
diff --git a/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc b/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc
@@ -12,6 +12,7 @@ meta:
     producer: Lakner et al.
     citation_full: |-
       Lakner, C., Genoni, M. E., Stemmler, H., Yonzan, N., & Tetteh Baah, S. K. (2024). Reproducibility package for Poverty, Prosperity and Planet Report 2024. World Bank. https://doi.org/10.60572/KGE4-CX54
+    attribution: Lakner et al. (2024). Reproducibility package for Poverty, Prosperity and Planet Report 2024
 
     # Files
     url_main: https://reproducibility.worldbank.org/index.php/catalog/189/

diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc
@@ -16,6 +16,6 @@ meta:
     The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza.
 wdir: ../../../data/snapshots/who/latest
 outs:
-  - md5: 516f378e03682d099c5bdcecb732b38b
-    size: 168097330
+  - md5: 811f5ca9e719e680bc1cde286e599f9d
+    size: 168107745
     path: fluid.csv
diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc
@@ -16,6 +16,6 @@ meta:
     The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases.
 wdir: ../../../data/snapshots/who/latest
 outs:
-  - md5: 50775d6806b50d572bc79031134bc3e3
-    size: 27221232
+  - md5: b687f5f92351d148e71bb3b5d60c0c50
+    size: 27222953
     path: flunet.csv
diff --git a/tests/test_datadiff.py b/tests/test_datadiff.py
@@ -1,3 +1,6 @@
+import os
+from unittest.mock import patch
+
 import pandas as pd
 from owid.catalog import Dataset, DatasetMeta, Table
 
@@ -19,6 +22,7 @@ def _create_datasets(tmp_path):
     return ds_a, ds_b
 
 
+@patch.dict(os.environ, {"OWID_STRICT": ""})
 def test_DatasetDiff_summary(tmp_path):
     ds_a, ds_b = _create_datasets(tmp_path)
 
@@ -43,6 +47,7 @@ def test_DatasetDiff_summary(tmp_path):
     ]
 
 
+@patch.dict(os.environ, {"OWID_STRICT": ""})
 def test_new_data(tmp_path):
     ds_a, ds_b = _create_datasets(tmp_path)
 

diff --git a/tests/test_steps.py b/tests/test_steps.py
@@ -15,6 +15,7 @@
 from unittest.mock import patch
 
 import pandas as pd
+import requests
 from owid.catalog import Dataset
 
 from etl import paths
@@ -162,7 +163,11 @@ def test_select_dirty_steps():
 
 
 def test_get_etag():
-    etag = get_etag("https://raw.githubusercontent.com/owid/owid-grapher/master/README.md")
+    try:
+        etag = get_etag("https://raw.githubusercontent.com/owid/owid-grapher/master/README.md")
+    # ignore SSL errors
+    except requests.exceptions.SSLError:
+        return
     assert etag