Skip to content

Commit

Permalink
✨ unu_wider: Create charts and fix data issues for Government Revenue…
Browse files Browse the repository at this point in the history
… Dataset (#3613)

* empty

* ✨ add sanity checks

* ✨ sanity check tables

* 💄 format log file name
  • Loading branch information
paarriagadap authored Dec 3, 2024
1 parent 40e452b commit dcd9874
Showing 1 changed file with 81 additions and 5 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
"""Load a meadow dataset and create a garden dataset."""
"""
Load a meadow dataset and create a garden dataset.
NOTE: To extract the log of the process (to review sanity checks, for example), follow these steps:
1. Set LONG_FORMAT to True, so that the full tables are logged instead of only their first 5 rows.
2. Run the following command in the terminal:
nohup uv run etl run government_revenue_dataset > government_revenue_dataset.log 2>&1 &
"""

from owid.catalog import Table
from structlog import get_logger
from tabulate import tabulate

from etl.data_helpers import geo
from etl.helpers import PathFinder, create_dataset
Expand All @@ -11,6 +19,12 @@
# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

# Table format used by default when rendering sanity-check tables with tabulate.
TABLEFMT = "pretty"

# Controls how much of each sanity-check table is logged:
# True logs the full table, False logs only its first 5 rows.
LONG_FORMAT = False


def run(dest_dir: str) -> None:
#
Expand All @@ -27,13 +41,12 @@ def run(dest_dir: str) -> None:

#
# Process data.
tb = drop_flagged_rows_and_unnecessary_columns(tb)

#
tb = geo.harmonize_countries(
df=tb,
countries_file=paths.country_mapping_path,
)
tb = drop_flagged_rows_and_unnecessary_columns(tb)

tb = tb.set_index(["country", "year"], verify_integrity=True)

#
Expand Down Expand Up @@ -92,11 +105,74 @@ def drop_flagged_rows_and_unnecessary_columns(tb: Table) -> Table:
"cautionnotes",
"resourcerevenuenotes",
"socialcontributionsnotes",
]
)

tb = sanity_checks(tb)

# Remove all caution columns
tb = tb.drop(
columns=[
"caution1accuracyqualityorco",
"caution2resourcerevenuestax",
"caution3unexcludedresourcere",
"caution4inconsistencieswiths",
]
+ caution_variables
)

return tb


def sanity_checks(tb: Table) -> Table:
    """
    Perform sanity checks on the data and return the checked table.

    The only check currently run is `check_negative_values`, which logs a warning for every
    variable that contains negative values. The returned table is a copy of the input.
    """
    # check_negative_values works on its own copy of the table, so no extra copy is made here.
    return check_negative_values(tb)


def check_negative_values(tb: Table) -> Table:
    """
    Check if there are negative values in the variables.

    Every column except "country", "year" and the four caution columns is checked. For each
    column with at least one negative value, a warning is logged with the number of affected
    rows and a table showing country, year, the caution columns and the negative values.
    How many rows of that table are shown depends on LONG_FORMAT.

    Returns a copy of the input table; the data itself is not changed.
    """
    tb = tb.copy()

    caution_columns = [
        "caution1accuracyqualityorco",
        "caution2resourcerevenuestax",
        "caution3unexcludedresourcere",
        "caution4inconsistencieswiths",
    ]

    # Built once, so the exclusion list is not re-created for every column.
    excluded_columns = {"country", "year", *caution_columns}

    # Define variables as all the columns minus country, year and the caution columns
    variables = [col for col in tb.columns if col not in excluded_columns]

    for v in variables:
        # Create a mask to check if any value is negative
        mask = tb[v] < 0
        if not mask.any():
            continue

        # reset_index already returns a new table, so no further copy is needed.
        tb_error = tb[mask].reset_index(drop=True)
        table = _tabulate(
            tb_error[["country", "year", *caution_columns, v]],
            long_format=LONG_FORMAT,
        )
        paths.log.warning(f"{len(tb_error)} observations for {v} are negative:\n{table}")

    return tb


def _tabulate(tb: Table, long_format: bool, headers="keys", tablefmt=TABLEFMT, **kwargs):
    """
    Render a table as text using tabulate.

    When long_format is true the whole table is rendered; otherwise only its first 5 rows are.
    headers, tablefmt and any extra keyword arguments are passed through to tabulate.
    """
    rows = tb if long_format else tb.head(5)
    return tabulate(rows, headers=headers, tablefmt=tablefmt, **kwargs)

0 comments on commit dcd9874

Please sign in to comment.