From a7c63db961e5a73c44559946e8c9686c82d903fa Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 14 May 2024 09:38:01 +0200 Subject: [PATCH] Remove data from the last, incomplete year, improve metadata, and fill values of total deaths with deaths --- .../2024-05-09/natural_hazards.meta.yml | 8 ++++---- .../noaa_ncei/2024-05-09/natural_hazards.py | 19 ++++++++++++++----- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/etl/steps/data/garden/noaa_ncei/2024-05-09/natural_hazards.meta.yml b/etl/steps/data/garden/noaa_ncei/2024-05-09/natural_hazards.meta.yml index c8e20bd476c..e8f3965d797 100644 --- a/etl/steps/data/garden/noaa_ncei/2024-05-09/natural_hazards.meta.yml +++ b/etl/steps/data/garden/noaa_ncei/2024-05-09/natural_hazards.meta.yml @@ -27,10 +27,10 @@ tables: unit: deaths missing: title: Number of reported missing persons directly caused by a natural disaster - unit: missing + unit: missing people injuries: title: Number of reported injuries directly caused by a natural disaster - unit: injuries + unit: injured people damage: title: Economic damage directly caused by a natural disaster unit: current US$ @@ -46,10 +46,10 @@ tables: unit: deaths missing_total: title: Total number of missing persons caused by a disaster and secondary effects - unit: missing + unit: missing people injuries_total: title: Total number of injuries caused by a disaster and secondary effects - unit: injuries + unit: injured people damage_total: title: Total economic damage caused by a disaster and secondary effects unit: current US$ diff --git a/etl/steps/data/garden/noaa_ncei/2024-05-09/natural_hazards.py b/etl/steps/data/garden/noaa_ncei/2024-05-09/natural_hazards.py index 7bdbe2f5131..4b960772f8b 100644 --- a/etl/steps/data/garden/noaa_ncei/2024-05-09/natural_hazards.py +++ b/etl/steps/data/garden/noaa_ncei/2024-05-09/natural_hazards.py @@ -11,6 +11,7 @@ # Get paths and naming conventions for current step. paths = PathFinder(__file__) + # The common impact metrics among the three tables are as follows: # Note that the definitions are not easy to be found explicitly anywhere. # But in the resulting search table, by clicking on the header of each column, a pop up appears with a short definition. @@ -301,6 +302,15 @@ def run(dest_dir: str) -> None: tables[table_name], countries_file=paths.country_mapping_path, warn_on_unused_countries=False ) + # Column "deaths_total" is more often informed than "deaths", but there are also cases where there is "deaths" + # but not "deaths_total". Since "deaths_total" includes all deaths (including secondary ones), it would make + # sense to fill empty "deaths_total" with "deaths". + # Note that there are events where "deaths" > "deaths_total", which should not happen. + # We contacted Nicolas Arcos (NOAA Federal) who suggested indeed to fill missing "deaths_total" with "deaths". + # He also explained that events that have a value for "deaths" but not for "deaths_total" are under review, and + # are more uncertain. + tables[table_name]["deaths_total"] = tables[table_name]["deaths_total"].fillna(tables[table_name]["deaths"]) + # Calculate the number of events and the total impact (for each metric) per country-year. # Note that the data on the socio-economic impacts is very sparse. # For example, the percentage of events that lack data on "deaths" is @@ -331,11 +341,10 @@ def run(dest_dir: str) -> None: .assign(**{"type": table_name}) ) - # NOTE: Usually "deathstotal" is more often informed than "deaths", but there are also cases where there is - # "deaths" but not "deathstotal". Since "deathstotal" includes all deaths (including secondary ones), it would make - # sense to fill empty "deathstotal" with "deaths". - # However, note that there are events where "deaths" > "deathstotal", which should not happen. - # These are probably data issues. I contacted the data provider to ask about it. + # Since the current year is never complete, remove it from the data. + tables[table_name] = tables[table_name][ + tables[table_name]["year"] < int(paths.version.split("-")[0]) + ].reset_index(drop=True) # Merge all tables. tb = pr.concat(list(tables.values()), ignore_index=True)