From cff1a2b0a6f523c1ebc465446129ea83ebf8e93d Mon Sep 17 00:00:00 2001
From: Edouard Mathieu <edmat@pm.me>
Date: Mon, 13 May 2024 15:42:18 +0200
Subject: [PATCH] computer_memory_storage: fix missing values in ddrives

---
 .../2024-05-13/computer_memory_storage.py         | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
index a4891187b70..c23fe64b7ca 100644
--- a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
+++ b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
@@ -9,7 +9,7 @@
 
 PARSING_INSTRUCTIONS = {
     "MEMORY": {"skiprows": 4, "usecols": [0, 1]},
-    "DDRIVES": {"skiprows": 4, "usecols": [1, 4]},
+    "DDRIVES": {"skiprows": 4, "usecols": [1, 2, 3, 4]},
     "SSD": {"skiprows": 4, "usecols": [1, 3]},
     "FLASH": {"skiprows": 4, "usecols": [1, 5]},
 }
@@ -18,14 +18,23 @@
 def read_sheet(snapshot, sheet_name):
     """Read a sheet from a snapshot."""
     tb = snapshot.read_excel(sheet_name=sheet_name, header=None, **PARSING_INSTRUCTIONS[sheet_name])
+
+    # For the "DDRIVES" sheet, the price is taken from column 4; where column 4 is
+    # empty, fall back to column 2 first, then to column 3.
+    if sheet_name == "DDRIVES":
+        tb[4] = tb[4].fillna(tb[2])
+        tb[4] = tb[4].fillna(tb[3])
+        # Drop fallback columns 2 and 3 so that only the year and price columns remain
+        tb = tb.drop(columns=[2, 3])
+
     tb.columns = ["year", "price"]
     tb["type"] = sheet_name.lower()
     return tb
 
 
 def clean_data(tb):
-    # Remove NA years
-    tb = tb.dropna(subset=["year"])
+    # Remove rows where the year or the price is missing
+    tb = tb.dropna(subset=["year", "price"])
 
     # Convert year to integer
     tb["year"] = tb["year"].astype(int)