From 22c050d7683c64622f94f1cc5c926a922f75196f Mon Sep 17 00:00:00 2001
From: Edouard Mathieu <edmat@pm.me>
Date: Mon, 13 May 2024 14:39:36 +0200
Subject: [PATCH 1/9] computer_memory_storage: create snapshot

---
 .../2024-05-13/computer_memory_storage.py     | 24 +++++++++++++++++
 .../computer_memory_storage.xlsx.dvc          | 27 +++++++++++++++++++
 2 files changed, 51 insertions(+)
 create mode 100644 snapshots/technology/2024-05-13/computer_memory_storage.py
 create mode 100644 snapshots/technology/2024-05-13/computer_memory_storage.xlsx.dvc

diff --git a/snapshots/technology/2024-05-13/computer_memory_storage.py b/snapshots/technology/2024-05-13/computer_memory_storage.py
new file mode 100644
index 00000000000..90b0c5d5120
--- /dev/null
+++ b/snapshots/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,24 @@
+"""Script to create a snapshot of dataset."""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"technology/{SNAPSHOT_VERSION}/computer_memory_storage.xlsx")
+
+    # Download data from source, add file to DVC and upload to S3.
+    snap.create_snapshot(upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/technology/2024-05-13/computer_memory_storage.xlsx.dvc b/snapshots/technology/2024-05-13/computer_memory_storage.xlsx.dvc
new file mode 100644
index 00000000000..591c582310b
--- /dev/null
+++ b/snapshots/technology/2024-05-13/computer_memory_storage.xlsx.dvc
@@ -0,0 +1,27 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Price and Performance Changes of Computer Technology with Time
+    date_published: "2023-11-16"
+
+    # Citation
+    producer: John C. McCallum
+    citation_full: |-
+      John C. McCallum, Price and Performance Changes of Computer Technology with Time
+
+    # Files
+    url_main: https://jcmit.net/memoryprice.htm
+    url_download: https://jcmit.net/MemDiskPrice-XL2010.xlsx
+    date_accessed: 2024-05-13
+
+    # License
+    license:
+      name: copyright 2001, 2022, 2023 John C. McCallum
+      url: https://jcmit.net/memoryprice.htm
+
+outs:
+  - md5: 54b3fcc2931bb60e428dc1dcc8fea034
+    size: 171246
+    path: computer_memory_storage.xlsx

From a7dbf19f835fc3e6087ed1afdf3ef3eaaa00e402 Mon Sep 17 00:00:00 2001
From: Edouard Mathieu <edmat@pm.me>
Date: Mon, 13 May 2024 15:17:34 +0200
Subject: [PATCH 2/9] computer_memory_storage: add meadow

---
 .../2024-05-13/computer_memory_storage.py     | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py

diff --git a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
new file mode 100644
index 00000000000..ff262d557c3
--- /dev/null
+++ b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,94 @@
+"""Load a snapshot and create a meadow dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+from owid.catalog import Table
+
+import pandas as pd
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+PARSING_INSTRUCTIONS = {
+    "MEMORY": {"skiprows": 4, "usecols": [0, 1]},
+    "DDRIVES": {"skiprows": 4, "usecols": [1, 4]},
+    "SSD": {"skiprows": 4, "usecols": [1, 3]},
+    "FLASH": {"skiprows": 4, "usecols": [1, 5]},
+}
+
+
+def read_sheet(snapshot, sheet_name):
+    """Read a sheet from a snapshot."""
+    tb = snapshot.read_excel(sheet_name=sheet_name, header=None, **PARSING_INSTRUCTIONS[sheet_name])
+    tb.columns = ["year", "price"]
+    tb["type"] = sheet_name.lower()
+    return tb
+
+
+def clean_data(tb):
+    # Remove NA years
+    tb = tb.dropna(subset=["year"])
+
+    # Convert year to integer
+    tb["year"] = tb["year"].astype(int)
+
+    # Convert price to float
+    tb["price"] = tb["price"].astype(float)
+
+    # Keep the cheapest price for each (year, type) pair
+    tb = tb.groupby(["year", "type"]).min().reset_index()
+
+    # Sort by year
+    tb = tb.sort_values(["year", "type"])
+
+    # For each type, keep the cheapest price seen so far over time (running minimum via cummin)
+    tb["price"] = tb.groupby("type")["price"].cummin()
+
+    # Convert prices to $/TB instead of $/MB
+    tb["price"] = tb.price.mul(1000000).round(2)
+
+    # Add country World
+    tb["country"] = "World"
+
+    return tb
+
+
+def reshape_data(tb):
+    # Move type to columns
+    tb = tb.pivot(index=["country", "year"], columns="type", values="price").reset_index()
+    return tb
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("computer_memory_storage.xlsx")
+
+    # Load data from snapshot.
+    data = []
+    for sheet_name in PARSING_INSTRUCTIONS.keys():
+        data.append(read_sheet(snap, sheet_name))
+    tb = pd.concat(data)
+
+    #
+    # Process data.
+    #
+    tb = clean_data(tb)
+    tb = reshape_data(tb)
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    tb = tb.format(["country", "year"])
+    tb.metadata.short_name = paths.short_name
+
+    # Ensure metadata is correctly associated.
+    for column in tb.columns:
+        tb[column].metadata.origins = [snap.metadata.origin]
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()

From eac93a60e92cf4cca40274eff600e99806139649 Mon Sep 17 00:00:00 2001
From: Edouard Mathieu <edmat@pm.me>
Date: Mon, 13 May 2024 15:17:42 +0200
Subject: [PATCH 3/9] computer_memory_storage: add garden

---
 .../computer_memory_storage.meta.yml          | 41 +++++++++++++++++++
 .../2024-05-13/computer_memory_storage.py     | 33 +++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
 create mode 100644 etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.py

diff --git a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
new file mode 100644
index 00000000000..7a9761c2255
--- /dev/null
+++ b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
@@ -0,0 +1,41 @@
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Technological Change
+    processing_level: major
+
+dataset:
+  update_period_days: 365
+
+
+tables:
+  computer_memory_storage:
+    variables:
+      ddrives:
+        title: Historical price of memory
+        unit: "current US$ per terabyte"
+        short_unit: "$/TB"
+        description_from_producer: "In general, these are the lowest priced disk drives for which I found prices at the time. The floppy drives are not the lowest price per capacity. Floppies are included because they set a low unit price, making disk drives accessible to the masses."
+        display:
+          numDecimalPlaces: 0
+      flash:
+        title: Historical price of memory
+        unit: "current US$ per terabyte"
+        short_unit: "$/TB"
+        description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices."
+        display:
+          numDecimalPlaces: 0
+      memory:
+        title: Historical price of memory
+        unit: "current US$ per terabyte"
+        short_unit: "$/TB"
+        display:
+          numDecimalPlaces: 0
+      ssd:
+        title: Historical price of memory
+        unit: "current US$ per terabyte"
+        short_unit: "$/TB"
+        description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices."
+        display:
+          numDecimalPlaces: 0
diff --git a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.py
new file mode 100644
index 00000000000..1e1c015cf02
--- /dev/null
+++ b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,33 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("computer_memory_storage")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["computer_memory_storage"].reset_index()
+
+    #
+    # Process data.
+    #
+    tb = tb.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()

From b5b1f1afeb15b42474d46e86ac51db6febdc5d98 Mon Sep 17 00:00:00 2001
From: Edouard Mathieu <edmat@pm.me>
Date: Mon, 13 May 2024 15:17:46 +0200
Subject: [PATCH 4/9] computer_memory_storage: add grapher

---
 .../2024-05-13/computer_memory_storage.py     | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 etl/steps/data/grapher/technology/2024-05-13/computer_memory_storage.py

diff --git a/etl/steps/data/grapher/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/grapher/technology/2024-05-13/computer_memory_storage.py
new file mode 100644
index 00000000000..c5dce9b4af0
--- /dev/null
+++ b/etl/steps/data/grapher/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,28 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("computer_memory_storage")
+
+    # Read table from garden dataset.
+    tb = ds_garden["computer_memory_storage"]
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
+    )
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()

From 3c48a5d4bd0c4caeb2af18b0ea1e4c72c48d361f Mon Sep 17 00:00:00 2001
From: Edouard Mathieu <edmat@pm.me>
Date: Mon, 13 May 2024 15:17:50 +0200
Subject: [PATCH 5/9] computer_memory_storage: add to DAG

---
 dag/main.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/dag/main.yml b/dag/main.yml
index 70d73615809..a111642604c 100644
--- a/dag/main.yml
+++ b/dag/main.yml
@@ -605,6 +605,14 @@ steps:
   data://grapher/technology/2023-11-28/dna_sequencing:
     - data://garden/technology/2023-11-28/dna_sequencing
 
+  # John C. McCallum, Price and Performance Changes of Computer Technology with Time
+  data://meadow/technology/2024-05-13/computer_memory_storage:
+    - snapshot://technology/2024-05-13/computer_memory_storage.xlsx
+  data://garden/technology/2024-05-13/computer_memory_storage:
+    - data://meadow/technology/2024-05-13/computer_memory_storage
+  data://grapher/technology/2024-05-13/computer_memory_storage:
+    - data://garden/technology/2024-05-13/computer_memory_storage
+
   # European Social Survey - Trust questions
   data://meadow/ess/2023-08-02/ess_trust:
     - snapshot://ess/2023-08-02/ess_trust.csv

From 0f406459285428015cf5099aba18240926dd9c3b Mon Sep 17 00:00:00 2001
From: Edouard Mathieu <edmat@pm.me>
Date: Mon, 13 May 2024 15:22:20 +0200
Subject: [PATCH 6/9] computer_memory_storage: fix indicator titles

---
 .../technology/2024-05-13/computer_memory_storage.meta.yml  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
index 7a9761c2255..d2b77bfea42 100644
--- a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
+++ b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
@@ -13,14 +13,14 @@ tables:
   computer_memory_storage:
     variables:
       ddrives:
-        title: Historical price of memory
+        title: Historical price of disk drives
         unit: "current US$ per terabyte"
         short_unit: "$/TB"
         description_from_producer: "In general, these are the lowest priced disk drives for which I found prices at the time. The floppy drives are not the lowest price per capacity. Floppies are included because they set a low unit price, making disk drives accessible to the masses."
         display:
           numDecimalPlaces: 0
       flash:
-        title: Historical price of memory
+        title: Historical price of flash memory
         unit: "current US$ per terabyte"
         short_unit: "$/TB"
         description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices."
@@ -33,7 +33,7 @@ tables:
         display:
           numDecimalPlaces: 0
       ssd:
-        title: Historical price of memory
+        title: Historical price of solid-state drives
         unit: "current US$ per terabyte"
         short_unit: "$/TB"
         description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices."

From c8dc7a9229d7743484ee0cf6bcba0016f134bfcd Mon Sep 17 00:00:00 2001
From: Edouard Mathieu <edmat@pm.me>
Date: Mon, 13 May 2024 15:22:52 +0200
Subject: [PATCH 7/9] computer_memory_storage: change to minor processing

---
 .../technology/2024-05-13/computer_memory_storage.meta.yml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
index d2b77bfea42..520017bb2fd 100644
--- a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
+++ b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
@@ -3,7 +3,7 @@ definitions:
     presentation:
       topic_tags:
         - Technological Change
-    processing_level: major
+    processing_level: minor
 
 dataset:
   update_period_days: 365

From 02f86e395818bf4593493dc83d3b14591d50da5a Mon Sep 17 00:00:00 2001
From: Edouard Mathieu <edmat@pm.me>
Date: Mon, 13 May 2024 15:24:44 +0200
Subject: [PATCH 8/9] computer_memory_storage: make lint

---
 .../meadow/technology/2024-05-13/computer_memory_storage.py  | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
index ff262d557c3..a4891187b70 100644
--- a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
+++ b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
@@ -1,10 +1,9 @@
 """Load a snapshot and create a meadow dataset."""
 
-from etl.helpers import PathFinder, create_dataset
-from owid.catalog import Table
-
 import pandas as pd
 
+from etl.helpers import PathFinder, create_dataset
+
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
 

From cff1a2b0a6f523c1ebc465446129ea83ebf8e93d Mon Sep 17 00:00:00 2001
From: Edouard Mathieu <edmat@pm.me>
Date: Mon, 13 May 2024 15:42:18 +0200
Subject: [PATCH 9/9] computer_memory_storage: fix missing values in ddrives

---
 .../2024-05-13/computer_memory_storage.py         | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
index a4891187b70..c23fe64b7ca 100644
--- a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
+++ b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
@@ -9,7 +9,7 @@
 
 PARSING_INSTRUCTIONS = {
     "MEMORY": {"skiprows": 4, "usecols": [0, 1]},
-    "DDRIVES": {"skiprows": 4, "usecols": [1, 4]},
+    "DDRIVES": {"skiprows": 4, "usecols": [1, 2, 3, 4]},
     "SSD": {"skiprows": 4, "usecols": [1, 3]},
     "FLASH": {"skiprows": 4, "usecols": [1, 5]},
 }
@@ -18,14 +18,23 @@
 def read_sheet(snapshot, sheet_name):
     """Read a sheet from a snapshot."""
     tb = snapshot.read_excel(sheet_name=sheet_name, header=None, **PARSING_INSTRUCTIONS[sheet_name])
+
+    # For the "DDRIVES" sheet, the price comes from column 4; where column 4 is empty,
+    # fall back to column 2 first, then to column 3.
+    if sheet_name == "DDRIVES":
+        tb[4] = tb[4].fillna(tb[2])
+        tb[4] = tb[4].fillna(tb[3])
+        # Drop columns 2 and 3
+        tb = tb.drop(columns=[2, 3])
+
     tb.columns = ["year", "price"]
     tb["type"] = sheet_name.lower()
     return tb
 
 
 def clean_data(tb):
-    # Remove NA years
-    tb = tb.dropna(subset=["year"])
+    # Remove NA years and prices
+    tb = tb.dropna(subset=["year", "price"])
 
     # Convert year to integer
     tb["year"] = tb["year"].astype(int)