Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ unu_wider: Create charts and fix data issues for Government Revenue Dataset #3613

Merged
merged 5 commits into from
Dec 3, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
"""Load a meadow dataset and create a garden dataset."""
"""
Load a meadow dataset and create a garden dataset.

NOTE: To extract the log of the process (to review sanity checks, for example), follow these steps:
1. Set LONG_FORMAT to True.
2. Run the following command in the terminal:
nohup uv run etl run government_revenue_dataset > government_revenue_dataset.log 2>&1 &
"""

from owid.catalog import Table
from structlog import get_logger
from tabulate import tabulate

from etl.data_helpers import geo
from etl.helpers import PathFinder, create_dataset
Expand All @@ -11,6 +19,12 @@
# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

# Table format handed to tabulate (default `tablefmt` of `_tabulate`) when printing tables in log messages.
TABLEFMT = "pretty"

# Whether tables printed in sanity-check log messages show every row (True) or only the first 5 rows (False).
LONG_FORMAT = False


def run(dest_dir: str) -> None:
#
Expand All @@ -27,13 +41,12 @@ def run(dest_dir: str) -> None:

#
# Process data.
tb = drop_flagged_rows_and_unnecessary_columns(tb)

#
tb = geo.harmonize_countries(
df=tb,
countries_file=paths.country_mapping_path,
)
tb = drop_flagged_rows_and_unnecessary_columns(tb)

tb = tb.set_index(["country", "year"], verify_integrity=True)

#
Expand Down Expand Up @@ -92,11 +105,74 @@ def drop_flagged_rows_and_unnecessary_columns(tb: Table) -> Table:
"cautionnotes",
"resourcerevenuenotes",
"socialcontributionsnotes",
]
)

tb = sanity_checks(tb)

# Remove all caution columns
tb = tb.drop(
columns=[
"caution1accuracyqualityorco",
"caution2resourcerevenuestax",
"caution3unexcludedresourcere",
"caution4inconsistencieswiths",
]
+ caution_variables
)

return tb


def sanity_checks(tb: Table) -> Table:
    """
    Perform sanity checks on the data.

    The input table is copied before the checks run, so the caller's table is
    left untouched. Currently the only check is `check_negative_values`.

    Returns the table produced by the checks (the caller reassigns it).
    """

    tb = tb.copy()

    tb = check_negative_values(tb)

    return tb


def check_negative_values(tb: Table) -> Table:
    """
    Check if there are negative values in the variables.

    Every column except country, year and the four caution flag columns is
    compared against zero. For each column with at least one negative value, a
    warning is logged with the number of affected rows and a table showing
    country, year, the caution flags and the offending column.

    Nothing is raised and no value is changed: the function works on, and
    returns, a copy of the input table.
    """

    tb = tb.copy()

    id_columns = ["country", "year"]
    caution_columns = [
        "caution1accuracyqualityorco",
        "caution2resourcerevenuestax",
        "caution3unexcludedresourcere",
        "caution4inconsistencieswiths",
    ]

    # Build the exclusion set once instead of concatenating the lists for every column tested.
    excluded = set(id_columns + caution_columns)

    # Define variables as all the columns minus identifiers and caution flags
    variables = [col for col in tb.columns if col not in excluded]

    for v in variables:
        # Create a mask to check if any value is negative
        mask = tb[v] < 0
        if not mask.any():
            continue

        # reset_index already returns a new table, so no extra copy is needed.
        tb_error = tb[mask].reset_index(drop=True)

        # Show the caution flags next to the offending values to help judge whether the rows were already flagged.
        table_text = _tabulate(tb_error[id_columns + caution_columns + [v]], long_format=LONG_FORMAT)
        paths.log.warning(f"{len(tb_error)} observations for {v} are negative:\n{table_text}")

    return tb


def _tabulate(tb: Table, long_format: bool, headers="keys", tablefmt=TABLEFMT, **kwargs):
    """
    Render a table as text with tabulate.

    When `long_format` is true the whole table is rendered; otherwise only its
    first 5 rows. `headers`, `tablefmt` and any extra keyword arguments are
    forwarded to tabulate unchanged.
    """
    # Pick the rows up front so a single tabulate call serves both modes.
    rows = tb if long_format else tb.head(5)
    return tabulate(rows, headers=headers, tablefmt=tablefmt, **kwargs)
Loading