Merge pull request #2642 from owid/update-comp-mem-storage

Update Price and Performance Changes of Computer Technology with Time

edomt authored May 13, 2024
2 parents 7c0025a + cff1a2b commit 7054e8b

Showing 7 changed files with 263 additions and 0 deletions.
8 changes: 8 additions & 0 deletions dag/main.yml
@@ -605,6 +605,14 @@ steps:
  data://grapher/technology/2023-11-28/dna_sequencing:
    - data://garden/technology/2023-11-28/dna_sequencing

  # John C. McCallum, Price and Performance Changes of Computer Technology with Time
  data://meadow/technology/2024-05-13/computer_memory_storage:
    - snapshot://technology/2024-05-13/computer_memory_storage.xlsx
  data://garden/technology/2024-05-13/computer_memory_storage:
    - data://meadow/technology/2024-05-13/computer_memory_storage
  data://grapher/technology/2024-05-13/computer_memory_storage:
    - data://garden/technology/2024-05-13/computer_memory_storage

  # European Social Survey - Trust questions
  data://meadow/ess/2023-08-02/ess_trust:
    - snapshot://ess/2023-08-02/ess_trust.csv
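The three computer_memory_storage entries added above chain the new dataset through the standard snapshot → meadow → garden → grapher pipeline. As a quick illustration (not part of the PR), the chain can be read straight out of dag/main.yml with PyYAML; the walk_chain helper below is hypothetical:

import yaml  # assumes PyYAML is available

with open("dag/main.yml") as f:
    dag = yaml.safe_load(f)["steps"]


def walk_chain(step: str) -> list[str]:
    """Hypothetical helper: follow each step's first dependency down to the snapshot."""
    chain = [step]
    while step in dag and dag[step]:
        step = dag[step][0]
        chain.append(step)
    return chain


print(walk_chain("data://grapher/technology/2024-05-13/computer_memory_storage"))
# Expected, given the entries above:
# grapher -> garden -> meadow -> snapshot://technology/2024-05-13/computer_memory_storage.xlsx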
41 changes: 41 additions & 0 deletions etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
@@ -0,0 +1,41 @@
definitions:
  common:
    presentation:
      topic_tags:
        - Technological Change
    processing_level: minor

dataset:
  update_period_days: 365


tables:
  computer_memory_storage:
    variables:
      ddrives:
        title: Historical price of disk drives
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        description_from_producer: "In general, these are the lowest priced disk drives for which I found prices at the time. The floppy drives are not the lowest price per capacity. Floppies are included because they set a low unit price, making disk drives accessible to the masses."
        display:
          numDecimalPlaces: 0
      flash:
        title: Historical price of flash memory
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices."
        display:
          numDecimalPlaces: 0
      memory:
        title: Historical price of memory
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        display:
          numDecimalPlaces: 0
      ssd:
        title: Historical price of solid-state drives
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices."
        display:
          numDecimalPlaces: 0
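As a sanity check on this metadata (not part of the PR), the file can be loaded with PyYAML to confirm all four variables share the same unit; the file path below is an assumption based on the garden step location:

import yaml

META_PATH = "etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml"  # assumed path

with open(META_PATH) as f:
    meta = yaml.safe_load(f)

variables = meta["tables"]["computer_memory_storage"]["variables"]
for name, var in variables.items():
    # All four series are expressed in current US$ per terabyte.
    assert var["unit"] == "current US$ per terabyte", name
    assert var["short_unit"] == "$/TB", name

print(sorted(variables))  # ['ddrives', 'flash', 'memory', 'ssd']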
33 changes: 33 additions & 0 deletions etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,33 @@
"""Load a meadow dataset and create a garden dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    #
    # Load inputs.
    #
    # Load meadow dataset.
    ds_meadow = paths.load_dataset("computer_memory_storage")

    # Read table from meadow dataset.
    tb = ds_meadow["computer_memory_storage"].reset_index()

    #
    # Process data.
    #
    tb = tb.format(["country", "year"])

    #
    # Save outputs.
    #
    # Create a new garden dataset with the same metadata as the meadow dataset.
    ds_garden = create_dataset(
        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
    )

    # Save changes in the new garden dataset.
    ds_garden.save()
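After this step runs, the garden output can be inspected directly with the owid.catalog API; a rough sketch, where the local data path is an assumption that depends on how the ETL data directory is configured:

from owid.catalog import Dataset  # owid-catalog, used throughout the ETL

# Assumed local output location; adjust to your ETL data directory.
ds = Dataset("data/garden/technology/2024-05-13/computer_memory_storage")
tb = ds["computer_memory_storage"]

# The table is indexed by (country, year); values are prices in current US$ per terabyte.
print(tb.columns.tolist())  # expected: ['ddrives', 'flash', 'memory', 'ssd']
print(tb.reset_index().tail())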
28 changes: 28 additions & 0 deletions etl/steps/data/grapher/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,28 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    #
    # Load inputs.
    #
    # Load garden dataset.
    ds_garden = paths.load_dataset("computer_memory_storage")

    # Read table from garden dataset.
    tb = ds_garden["computer_memory_storage"]

    #
    # Save outputs.
    #
    # Create a new grapher dataset with the same metadata as the garden dataset.
    ds_grapher = create_dataset(
        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
    )

    # Save changes in the new grapher dataset.
    ds_grapher.save()
102 changes: 102 additions & 0 deletions etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,102 @@
"""Load a snapshot and create a meadow dataset."""

import pandas as pd

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

PARSING_INSTRUCTIONS = {
    "MEMORY": {"skiprows": 4, "usecols": [0, 1]},
    "DDRIVES": {"skiprows": 4, "usecols": [1, 2, 3, 4]},
    "SSD": {"skiprows": 4, "usecols": [1, 3]},
    "FLASH": {"skiprows": 4, "usecols": [1, 5]},
}


def read_sheet(snapshot, sheet_name):
    """Read a sheet from a snapshot."""
    tb = snapshot.read_excel(sheet_name=sheet_name, header=None, **PARSING_INSTRUCTIONS[sheet_name])

    # If sheet_name = "DDRIVES", the price column is based on column 4, filled in with values
    # from columns 2 and 3 when empty.
    if sheet_name == "DDRIVES":
        tb[4] = tb[4].fillna(tb[2])
        tb[4] = tb[4].fillna(tb[3])
        # Drop columns 2 and 3
        tb = tb.drop(columns=[2, 3])

    tb.columns = ["year", "price"]
    tb["type"] = sheet_name.lower()
    return tb


def clean_data(tb):
    # Remove NA years and prices.
    tb = tb.dropna(subset=["year", "price"])

    # Convert year to integer.
    tb["year"] = tb["year"].astype(int)

    # Convert price to float.
    tb["price"] = tb["price"].astype(float)

    # Keep the cheapest price per year.
    tb = tb.groupby(["year", "type"]).min().reset_index()

    # Sort by year.
    tb = tb.sort_values(["year", "type"])

    # For each type, keep the cheapest value seen so far over time, using a cumulative minimum.
    tb["price"] = tb.groupby("type")["price"].cummin()

    # Convert prices to $/TB instead of $/MB.
    tb["price"] = tb.price.mul(1000000).round(2)

    # Add country World.
    tb["country"] = "World"

    return tb


def reshape_data(tb):
    # Move type to columns.
    tb = tb.pivot(index=["country", "year"], columns="type", values="price").reset_index()
    return tb


def run(dest_dir: str) -> None:
    #
    # Load inputs.
    #
    # Retrieve snapshot.
    snap = paths.load_snapshot("computer_memory_storage.xlsx")

    # Load data from snapshot.
    data = []
    for sheet_name in PARSING_INSTRUCTIONS.keys():
        data.append(read_sheet(snap, sheet_name))
    tb = pd.concat(data)

    #
    # Process data.
    #
    tb = clean_data(tb)
    tb = reshape_data(tb)
    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
    tb = tb.format(["country", "year"])
    tb.metadata.short_name = paths.short_name

    # Ensure metadata is correctly associated.
    for column in tb.columns:
        tb[column].metadata.origins = [snap.metadata.origin]

    #
    # Save outputs.
    #
    # Create a new meadow dataset with the same metadata as the snapshot.
    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)

    # Save changes in the new meadow dataset.
    ds_meadow.save()
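To make the cheapest-per-year, cumulative-minimum and $/MB → $/TB logic in clean_data and reshape_data concrete, here is a toy pandas-only rerun on made-up numbers (the prices below are illustrative, not values from the source spreadsheet):

import pandas as pd

# Toy input in the shape produced by read_sheet(): one row per quoted price, in $/MB.
raw = pd.DataFrame(
    {
        "year": [2004, 2004, 2005, 2006],
        "price": [0.60, 0.45, 0.50, 0.30],  # illustrative $/MB quotes
        "type": ["ssd", "ssd", "ssd", "ssd"],
    }
)

tb = raw.dropna(subset=["year", "price"])
tb = tb.groupby(["year", "type"]).min().reset_index()  # cheapest quote per year
tb = tb.sort_values(["year", "type"])
tb["price"] = tb.groupby("type")["price"].cummin()  # never above an earlier year's price
tb["price"] = tb["price"].mul(1_000_000).round(2)  # $/MB -> $/TB
tb["country"] = "World"

wide = tb.pivot(index=["country", "year"], columns="type", values="price").reset_index()
print(wide)
# One row per (country, year); the 'ssd' column reads 450000.0, 450000.0, 300000.0 ($/TB).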
24 changes: 24 additions & 0 deletions snapshots/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,24 @@
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click

from etl.snapshot import Snapshot

# Version for current snapshot dataset.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    # Create a new snapshot.
    snap = Snapshot(f"technology/{SNAPSHOT_VERSION}/computer_memory_storage.xlsx")

    # Download data from source, add file to DVC and upload to S3.
    snap.create_snapshot(upload=upload)


if __name__ == "__main__":
    main()
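Because of the click --upload/--skip-upload option above, the snapshot can be refreshed locally without pushing to S3 with something like python snapshots/technology/2024-05-13/computer_memory_storage.py --skip-upload (the exact invocation depends on how the environment is set up).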
27 changes: 27 additions & 0 deletions snapshots/technology/2024-05-13/computer_memory_storage.xlsx.dvc
@@ -0,0 +1,27 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
  origin:
    # Data product / Snapshot
    title: Price and Performance Changes of Computer Technology with Time
    date_published: "2023-11-16"

    # Citation
    producer: John C. McCallum
    citation_full: |-
      John C. McCallum, Price and Performance Changes of Computer Technology with Time

    # Files
    url_main: https://jcmit.net/memoryprice.htm
    url_download: https://jcmit.net/MemDiskPrice-XL2010.xlsx
    date_accessed: 2024-05-13

    # License
    license:
      name: copyright 2001, 2022, 2023 John C. McCallum
      url: https://jcmit.net/memoryprice.htm

outs:
  - md5: 54b3fcc2931bb60e428dc1dcc8fea034
    size: 171246
    path: computer_memory_storage.xlsx
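The outs block pins the exact downloaded file by checksum. A minimal sketch (assuming the .xlsx has already been downloaded to the current directory) of how that md5 can be verified by hand:

import hashlib

# Assumed local location of the downloaded snapshot file.
path = "computer_memory_storage.xlsx"

with open(path, "rb") as f:
    md5 = hashlib.md5(f.read()).hexdigest()

print(md5 == "54b3fcc2931bb60e428dc1dcc8fea034")  # True if the file matches the pinned snapshot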
