Skip to content

Commit

Permalink
Merge branch 'master' of github.com:owid/etl into wizard-on-provider-…
Browse files Browse the repository at this point in the history
…analytics
  • Loading branch information
pabloarosado committed Dec 11, 2024
2 parents 29f00b6 + b8d4a0c commit 0983da2
Show file tree
Hide file tree
Showing 17 changed files with 87 additions and 42 deletions.
16 changes: 16 additions & 0 deletions apps/wizard/app_pages/chart_diff/chart_diff.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime as dt
import difflib
import json
import pprint
from typing import Any, Dict, List, Optional

Expand Down Expand Up @@ -653,6 +654,7 @@ def _modified_chart_configs_on_staging(
select
c.id as chartId,
MD5(cc.full) as chartChecksum,
cc.full as chartConfig,
c.lastEditedByUserId as chartLastEditedByUserId,
c.publishedByUserId as chartPublishedByUserId,
c.lastEditedAt as chartLastEditedAt
Expand Down Expand Up @@ -699,6 +701,20 @@ def _modified_chart_configs_on_staging(
diff = source_df.copy()
diff["configEdited"] = source_df["chartChecksum"] != target_df["chartChecksum"]

# Go through edited configs and do a more detailed comparison
ix = diff["configEdited"] & target_df["chartChecksum"].notnull()
equal_configs = []
for chart_id, row in diff.loc[ix].iterrows():
source_config = json.loads(row["chartConfig"])
target_config = json.loads(target_df.loc[chart_id, "chartConfig"])

# Compare configs
if configs_are_equal(source_config, target_config):
equal_configs.append(chart_id)

# Exclude configs that have different chartChecksum, but are actually the same (e.g. have just different version)
diff = diff[~diff.index.isin(equal_configs)]

# Add flag 'edited in staging'
diff["chartEditedInStaging"] = True

Expand Down
14 changes: 7 additions & 7 deletions etl/steps/data/garden/covid/latest/sequence.meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ tables:
num_sequences:
title: "Number of sequenced COVID-19 genomes - Variant: << variant >>"
description_short: |-
<% set mapping = dict(
non_who="The number of analyzed sequences in the preceding two weeks that correspond to non-relevant variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.",
other="The number of analyzed sequences in the preceding two weeks that correspond to non-categorised variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.",
else="The number of analyzed sequences in the preceding two weeks that correspond to variant group '<< variant >>'. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced."
) %>
<< mapping.get(variant, mapping['else']) >>
<% if variant == 'non_who' %>
The number of analyzed sequences in the preceding two weeks that correspond to non-relevant variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.
<% elif variant == 'other' %>
The number of analyzed sequences in the preceding two weeks that correspond to non-categorised variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.
<% else %>
The number of analyzed sequences in the preceding two weeks that correspond to variant group '<< variant >>'. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.
<%- endif -%>
unit: "sequenced genomes"
display:
tolerance: 28
Expand Down
20 changes: 11 additions & 9 deletions etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,28 @@ definitions:
Non-market sources of income, including food grown by subsistence farmers for their own consumption, are taken into account.
description_key_scenarios: |-
<% if scenario == "Historical" %>
<% if scenario == "Historical estimates" %>
Estimates are based on household surveys or extrapolated up until the year of the data release using GDP growth estimates and forecasts. For more details about the methodology, please refer to the [World Bank PIP documentation](https://datanalytics.worldbank.org/PIP-Methodology/lineupestimates.html#nowcasts).
<% elif scenario == "Current forecast + historical growth" %>
<% elif scenario == "Current forecast + historical growth projections" %>
This data is a projection of the estimates based on GDP growth projections from the World Bank's Global Economic Prospects and the Macro Poverty Outlook, together with IMF's World Economic Outlook, in the period 2025-2029. For the period 2030-2050, the data is projected using the average annual historical GDP per capita growth over 2010-2019.
<% elif scenario == "2% growth" %>
<% elif scenario == "Historical estimates + projections" %>
This data combines data based on household surveys or extrapolated up until the year of the data release using GDP growth estimates and forecasts, with projections based on GDP growth projections from the World Bank's Global Economic Prospects and the Macro Poverty Outlook, together with IMF's World Economic Outlook, in the period 2025-2029. For the period 2030-2050, the data is projected using the average annual historical GDP per capita growth over 2010-2019.
<% elif scenario == "2% growth projections" %>
This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while keeping income inequality constant.
<% elif scenario == "2% growth + Gini reduction 1%" %>
<% elif scenario == "2% growth + Gini reduction 1% projections" %>
This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while reducing income inequality by 1% of the Gini coefficient per year.
<% elif scenario == "2% growth + Gini reduction 2%" %>
<% elif scenario == "2% growth + Gini reduction 2% projections" %>
This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while reducing income inequality by 2% of the Gini coefficient per year.
<% elif scenario == "4% growth" %>
<% elif scenario == "4% growth projections" %>
This data is a projection of the estimates based on a scenario of 4% average GDP per capita growth, while keeping income inequality constant.
<% elif scenario == "6% growth" %>
<% elif scenario == "6% growth projections" %>
This data is a projection of the estimates based on a scenario of 6% average GDP per capita growth, while keeping income inequality constant.
<% elif scenario == "8% growth" %>
<% elif scenario == "8% growth projections" %>
This data is a projection of the estimates based on a scenario of 8% average GDP per capita growth, while keeping income inequality constant.
<%- endif -%>
isprojection_by_scenario: |-
<% if scenario == "Historical" %>
<% if scenario == "Historical estimates" or scenario == "Historical estimates + projections" %>
false
<% else %>
true
Expand Down
32 changes: 24 additions & 8 deletions etl/steps/data/garden/wb/2024-12-03/poverty_projections.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@

# Define scenarios and new names
SCENARIOS = {
"historical": "Historical",
"current_forecast": "Current forecast + historical growth",
"2pct": "2% growth",
"2pct_gini1": "2% growth + Gini reduction 1%",
"2pct_gini2": "2% growth + Gini reduction 2%",
"4pct": "4% growth",
"6pct": "6% growth",
"8pct": "8% growth",
"historical": "Historical estimates",
"current_forecast": "Current forecast + historical growth projections",
"2pct": "2% growth projections",
"2pct_gini1": "2% growth + Gini reduction 1% projections",
"2pct_gini2": "2% growth + Gini reduction 2% projections",
"4pct": "4% growth projections",
"6pct": "6% growth projections",
"8pct": "8% growth projections",
}

# Define index columns
Expand Down Expand Up @@ -92,6 +92,10 @@ def connect_estimates_with_projections(tb: Table) -> Table:

tb = tb.copy()

# Save tb_historical and tb_current_forecast, by filtering scenario in historical and current_forecast
tb_historical = tb[tb["scenario"] == "historical"].copy().reset_index(drop=True)
tb_current_forecast = tb[tb["scenario"] == "current_forecast"].copy().reset_index(drop=True)

# Make table wider, by using scenario as columns
tb = tb.pivot(index=["country", "year", "povertyline"], columns="scenario", values=INDICATOR_COLUMNS)

Expand All @@ -116,4 +120,16 @@ def connect_estimates_with_projections(tb: Table) -> Table:
for indicator in INDICATOR_COLUMNS:
tb[indicator] = tb[indicator].copy_metadata(tb["country"])

# Combine historical and current_forecast, by concatenating tb_historical and tb_current_forecast
tb_connected = pr.concat([tb_historical, tb_current_forecast], ignore_index=True)

# Label every combined row with the scenario name "Historical estimates + projections"
tb_connected["scenario"] = "Historical estimates + projections"

# Keep only the columns in INDEX_COLUMNS and INDICATOR_COLUMNS
tb_connected = tb_connected[INDEX_COLUMNS + INDICATOR_COLUMNS]

# Concatenate tb and tb_connected
tb = pr.concat([tb, tb_connected], ignore_index=True)

return tb
4 changes: 2 additions & 2 deletions lib/catalog/owid/catalog/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def add(
utils.validate_underscore(col, "Variable's name")

if not table.primary_key:
if "OWID_STRICT" in environ:
if environ.get("OWID_STRICT"):
raise PrimaryKeyMissing(
f"Table `{table.metadata.short_name}` does not have a primary_key -- please use t.set_index([col, ...], verify_integrity=True) to indicate dimensions before saving"
)
Expand All @@ -128,7 +128,7 @@ def add(
f"Table `{table.metadata.short_name}` does not have a primary_key -- please use t.set_index([col, ...], verify_integrity=True) to indicate dimensions before saving"
)

if not table.index.is_unique and "OWID_STRICT" in environ:
if not table.index.is_unique and environ.get("OWID_STRICT"):
[(k, dups)] = table.index.value_counts().head(1).to_dict().items()
raise NonUniqueIndex(
f"Table `{table.metadata.short_name}` has duplicate values in the index -- could you have made a mistake?\n\n"
Expand Down
8 changes: 4 additions & 4 deletions snapshots/climate/latest/weekly_wildfires.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ meta:
citation_full: Global Wildfire Information System
attribution_short: GWIS
url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend
date_accessed: 2024-12-10
date_published: 2024-12-10
date_accessed: 2024-12-11
date_published: 2024-12-11
license:
name: CC BY 4.0
url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license
outs:
- md5: d1de4bd7ac3c08a0dcc6eb63f891f71b
size: 12799309
- md5: fc6f8b908a2988b2d8048707526c460a
size: 12799310
path: weekly_wildfires.csv
2 changes: 1 addition & 1 deletion snapshots/covid/latest/cases_deaths.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ meta:
version_producer: WHO COVID-19 Dashboard - Daily cases and deaths
url_main: https://covid19.who.int/
url_download: https://srhdpeuwpubsa.blob.core.windows.net/whdh/COVID/WHO-COVID-19-global-daily-data.csv
date_accessed: 2024-12-10
date_accessed: 2024-12-11
date_published: '2024-07-07'
license:
name: CC BY 4.0
Expand Down
2 changes: 1 addition & 1 deletion snapshots/excess_mortality/latest/hmd_stmf.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ meta:
HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality).
url: https://www.mortality.org/Data/STMF
source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv
date_accessed: 2024-12-10
date_accessed: 2024-12-11
publication_date: 2024-11-11
publication_year: 2024
published_by: |-
Expand Down
2 changes: 1 addition & 1 deletion snapshots/excess_mortality/latest/wmd.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ meta:
Published paper available at https://elifesciences.org/articles/69336.
url: https://github.com/akarlinsky/world_mortality/
source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv
date_accessed: 2024-12-10
date_accessed: 2024-12-11
publication_date: '2021-06-30'
publication_year: 2021
published_by: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ meta:
For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic.
url: https://github.com/dkobak/excess-mortality
source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv
date_accessed: 2024-12-10
date_accessed: 2024-12-11
publication_date: '2021-06-30'
publication_year: 2021
published_by: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ meta:
For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic.
url: https://github.com/dkobak/excess-mortality
source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv
date_accessed: 2024-12-10
date_accessed: 2024-12-11
publication_date: '2021-06-30'
publication_year: 2021
published_by: |-
Expand Down
4 changes: 2 additions & 2 deletions snapshots/health/latest/global_health_mpox.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@ meta:
url: https://global.health/terms-of-use/

outs:
- md5: 7928d79ed3caf862d86ba729737fc255
size: 16733780
- md5: 08388d2230adafbb7fe28ddcd1eb0dc8
size: 16813136
path: global_health_mpox.csv
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ meta:
producer: Lakner et al.
citation_full: |-
Lakner, C., Genoni, M. E., Stemmler, H., Yonzan, N., & Tetteh Baah, S. K. (2024). Reproducibility package for Poverty, Prosperity and Planet Report 2024. World Bank. https://doi.org/10.60572/KGE4-CX54
attribution: Lakner et al. (2024). Reproducibility package for Poverty, Prosperity and Planet Report 2024

# Files
url_main: https://reproducibility.worldbank.org/index.php/catalog/189/
Expand Down
4 changes: 2 additions & 2 deletions snapshots/who/latest/fluid.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@ meta:
The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza.
wdir: ../../../data/snapshots/who/latest
outs:
- md5: 516f378e03682d099c5bdcecb732b38b
size: 168097330
- md5: 811f5ca9e719e680bc1cde286e599f9d
size: 168107745
path: fluid.csv
4 changes: 2 additions & 2 deletions snapshots/who/latest/flunet.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@ meta:
The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases.
wdir: ../../../data/snapshots/who/latest
outs:
- md5: 50775d6806b50d572bc79031134bc3e3
size: 27221232
- md5: b687f5f92351d148e71bb3b5d60c0c50
size: 27222953
path: flunet.csv
5 changes: 5 additions & 0 deletions tests/test_datadiff.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
from unittest.mock import patch

import pandas as pd
from owid.catalog import Dataset, DatasetMeta, Table

Expand All @@ -19,6 +22,7 @@ def _create_datasets(tmp_path):
return ds_a, ds_b


@patch.dict(os.environ, {"OWID_STRICT": ""})
def test_DatasetDiff_summary(tmp_path):
ds_a, ds_b = _create_datasets(tmp_path)

Expand All @@ -43,6 +47,7 @@ def test_DatasetDiff_summary(tmp_path):
]


@patch.dict(os.environ, {"OWID_STRICT": ""})
def test_new_data(tmp_path):
ds_a, ds_b = _create_datasets(tmp_path)

Expand Down
7 changes: 6 additions & 1 deletion tests/test_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from unittest.mock import patch

import pandas as pd
import requests
from owid.catalog import Dataset

from etl import paths
Expand Down Expand Up @@ -162,7 +163,11 @@ def test_select_dirty_steps():


def test_get_etag():
etag = get_etag("https://raw.githubusercontent.com/owid/owid-grapher/master/README.md")
try:
etag = get_etag("https://raw.githubusercontent.com/owid/owid-grapher/master/README.md")
# ignore SSL errors
except requests.exceptions.SSLError:
return
assert etag


Expand Down

0 comments on commit 0983da2

Please sign in to comment.