From 05f552ab972319afc67c0618165be4ed7c2da5fb Mon Sep 17 00:00:00 2001
From: Pablo Arriagada
Date: Mon, 25 Nov 2024 16:08:59 +0000
Subject: [PATCH 1/4] empty


From 8a24b53c68e53517aa34bbe448115acce666d98f Mon Sep 17 00:00:00 2001
From: Pablo Arriagada
Date: Tue, 26 Nov 2024 14:45:48 +0000
Subject: [PATCH 2/4] :sparkles: add sanity checks

---
 Review notes (this free-text area is ignored when the patch is applied):
 - sanity_checks() is annotated `-> Table` instead of `-> None`: its body
   ends with `return tb` and run() assigns the result back to `tb`.
 - The `index` blob ids below are the recorded ones and predate that
   one-token correction.
 - NOTE(review): leading whitespace in all hunks was reconstructed from the
   Python syntax and the hunk line counts; confirm against the target file
   before applying.

 .../2023-11-01/government_revenue_dataset.py  | 63 ++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py b/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py
index acc32e890b5..aec62604618 100644
--- a/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py
+++ b/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py
@@ -1,7 +1,15 @@
-"""Load a meadow dataset and create a garden dataset."""
+"""
+Load a meadow dataset and create a garden dataset.
+
+NOTE: To extract the log of the process (to review sanity checks, for example), follow these steps:
+    1. Define LONG_FORMAT as True.
+    2. Run the following command in the terminal:
+        nohup uv run etl run government_revenue_dataset > output.log 2>&1 &
+"""
 
 from owid.catalog import Table
 from structlog import get_logger
+from tabulate import tabulate
 
 from etl.data_helpers import geo
 from etl.helpers import PathFinder, create_dataset
@@ -11,6 +19,12 @@
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
 
+# Set table format when printing
+TABLEFMT = "pretty"
+
+# Define if I show the full table or just the first 5 rows for assertions
+LONG_FORMAT = True
+
 
 def run(dest_dir: str) -> None:
     #
@@ -34,6 +48,9 @@ def run(dest_dir: str) -> None:
         df=tb,
         countries_file=paths.country_mapping_path,
     )
+
+    tb = sanity_checks(tb)
+
     tb = tb.set_index(["country", "year"], verify_integrity=True)
 
     #
@@ -100,3 +117,47 @@ def drop_flagged_rows_and_unnecessary_columns(tb: Table) -> Table:
     )
 
     return tb
+
+
+def sanity_checks(tb: Table) -> Table:
+    """
+    Perform sanity checks on the data.
+    """
+
+    tb = tb.copy()
+
+    tb = check_negative_values(tb)
+
+    return tb
+
+
+def check_negative_values(tb: Table):
+    """
+    Check if there are negative values in the variables
+    """
+
+    tb = tb.copy()
+
+    # Define columns as all the columns minus country and year
+    variables = [col for col in tb.columns if col not in ["country", "year"]]
+
+    for v in variables:
+        # Create a mask to check if any value is negative
+        mask = tb[v] < 0
+        any_error = mask.any()
+
+        if any_error:
+            tb_error = tb[mask].reset_index(drop=True).copy()
+            paths.log.warning(
+                f"""{len(tb_error)} observations for {v} are negative:
+                {_tabulate(tb_error[['country', 'year', v]], long_format=LONG_FORMAT)}"""
+            )
+
+    return tb
+
+
+def _tabulate(tb: Table, long_format: bool, headers="keys", tablefmt=TABLEFMT, **kwargs):
+    if long_format:
+        return tabulate(tb, headers=headers, tablefmt=tablefmt, **kwargs)
+    else:
+        return tabulate(tb.head(5), headers=headers, tablefmt=tablefmt, **kwargs)

From 5c2f665289955af6b72813b00687b9847c3ac1f9 Mon Sep 17 00:00:00 2001
From: Pablo Arriagada
Date: Tue, 26 Nov 2024 15:33:30 +0000
Subject: [PATCH 3/4] :sparkles: sanity check tables

---
 .../2023-11-01/government_revenue_dataset.py  | 33 ++++++++++++++-----
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py b/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py
index aec62604618..2d846acb3a6 100644
--- a/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py
+++ b/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py
@@ -23,7 +23,7 @@
 TABLEFMT = "pretty"
 
 # Define if I show the full table or just the first 5 rows for assertions
-LONG_FORMAT = True
+LONG_FORMAT = False
 
 
 def run(dest_dir: str) -> None:
@@ -41,15 +41,11 @@ def run(dest_dir: str) -> None:
     #
     # Process data.
-    tb = drop_flagged_rows_and_unnecessary_columns(tb)
-
-    #
     tb = geo.harmonize_countries(
         df=tb,
         countries_file=paths.country_mapping_path,
     )
-
-    tb = sanity_checks(tb)
+    tb = drop_flagged_rows_and_unnecessary_columns(tb)
 
     tb = tb.set_index(["country", "year"], verify_integrity=True)
 
     #
@@ -109,11 +105,19 @@ def drop_flagged_rows_and_unnecessary_columns(tb: Table) -> Table:
             "cautionnotes",
             "resourcerevenuenotes",
             "socialcontributionsnotes",
+        ]
+    )
+
+    tb = sanity_checks(tb)
+
+    # Remove all caution columns
+    tb = tb.drop(
+        columns=[
+            "caution1accuracyqualityorco",
             "caution2resourcerevenuestax",
             "caution3unexcludedresourcere",
             "caution4inconsistencieswiths",
         ]
-        + caution_variables
     )
 
     return tb
@@ -139,7 +143,18 @@ def check_negative_values(tb: Table):
     tb = tb.copy()
 
     # Define columns as all the columns minus country and year
-    variables = [col for col in tb.columns if col not in ["country", "year"]]
+    variables = [
+        col
+        for col in tb.columns
+        if col
+        not in ["country", "year"]
+        + [
+            "caution1accuracyqualityorco",
+            "caution2resourcerevenuestax",
+            "caution3unexcludedresourcere",
+            "caution4inconsistencieswiths",
+        ]
+    ]
 
     for v in variables:
         # Create a mask to check if any value is negative
@@ -150,7 +165,7 @@ def check_negative_values(tb: Table):
             tb_error = tb[mask].reset_index(drop=True).copy()
             paths.log.warning(
                 f"""{len(tb_error)} observations for {v} are negative:
-                {_tabulate(tb_error[['country', 'year', v]], long_format=LONG_FORMAT)}"""
+                {_tabulate(tb_error[['country', 'year', 'caution1accuracyqualityorco', 'caution2resourcerevenuestax','caution3unexcludedresourcere','caution4inconsistencieswiths',v]], long_format=LONG_FORMAT)}"""
             )
 
     return tb

From a93c2c774ebcafaae455fc16b549aef48627767b Mon Sep 17 00:00:00 2001
From: Pablo Arriagada
Date: Tue, 26 Nov 2024 15:37:23 +0000
Subject: [PATCH 4/4] :lipstick: format log file name

---
 .../garden/unu_wider/2023-11-01/government_revenue_dataset.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py b/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py
index 2d846acb3a6..3d281c14dde 100644
--- a/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py
+++ b/etl/steps/data/garden/unu_wider/2023-11-01/government_revenue_dataset.py
@@ -4,7 +4,7 @@
 NOTE: To extract the log of the process (to review sanity checks, for example), follow these steps:
     1. Define LONG_FORMAT as True.
     2. Run the following command in the terminal:
-        nohup uv run etl run government_revenue_dataset > output.log 2>&1 &
+        nohup uv run etl run government_revenue_dataset > government_revenue_dataset.log 2>&1 &
 """
 
 from owid.catalog import Table