From cff1a2b0a6f523c1ebc465446129ea83ebf8e93d Mon Sep 17 00:00:00 2001
From: Edouard Mathieu
Date: Mon, 13 May 2024 15:42:18 +0200
Subject: [PATCH] computer_memory_storage: fix missing values in ddrives

---
 .../2024-05-13/computer_memory_storage.py     | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
index a4891187b70..c23fe64b7ca 100644
--- a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
+++ b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
@@ -9,7 +9,7 @@
 
 PARSING_INSTRUCTIONS = {
     "MEMORY": {"skiprows": 4, "usecols": [0, 1]},
-    "DDRIVES": {"skiprows": 4, "usecols": [1, 4]},
+    "DDRIVES": {"skiprows": 4, "usecols": [1, 2, 3, 4]},
     "SSD": {"skiprows": 4, "usecols": [1, 3]},
     "FLASH": {"skiprows": 4, "usecols": [1, 5]},
 }
@@ -18,14 +18,23 @@
 def read_sheet(snapshot, sheet_name):
     """Read a sheet from a snapshot."""
     tb = snapshot.read_excel(sheet_name=sheet_name, header=None, **PARSING_INSTRUCTIONS[sheet_name])
+
+    # If sheet_name = "DDRIVES", the price column is based on column 4, filled in with values
+    # from columns 2 and 3 when empty.
+    if sheet_name == "DDRIVES":
+        tb[4] = tb[4].fillna(tb[2])
+        tb[4] = tb[4].fillna(tb[3])
+        # Drop columns 2 and 3
+        tb = tb.drop(columns=[2, 3])
+
     tb.columns = ["year", "price"]
     tb["type"] = sheet_name.lower()
     return tb
 
 
 def clean_data(tb):
-    # Remove NA years
-    tb = tb.dropna(subset=["year"])
+    # Remove NA years and prices
+    tb = tb.dropna(subset=["year", "price"])
 
     # Convert year to integer
     tb["year"] = tb["year"].astype(int)