Merge pull request #2642 from owid/update-comp-mem-storage

Update Price and Performance Changes of Computer Technology with Time

edomt authored May 13, 2024
2 parents 7c0025a + cff1a2b commit 7054e8b

Showing 7 changed files with 263 additions and 0 deletions.
8 changes: 8 additions & 0 deletions dag/main.yml
@@ -605,6 +605,14 @@ steps:
  data://grapher/technology/2023-11-28/dna_sequencing:
    - data://garden/technology/2023-11-28/dna_sequencing

  # John C. McCallum, Price and Performance Changes of Computer Technology with Time
  data://meadow/technology/2024-05-13/computer_memory_storage:
    - snapshot://technology/2024-05-13/computer_memory_storage.xlsx
  data://garden/technology/2024-05-13/computer_memory_storage:
    - data://meadow/technology/2024-05-13/computer_memory_storage
  data://grapher/technology/2024-05-13/computer_memory_storage:
    - data://garden/technology/2024-05-13/computer_memory_storage

  # European Social Survey - Trust questions
  data://meadow/ess/2023-08-02/ess_trust:
    - snapshot://ess/2023-08-02/ess_trust.csv
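The three computer_memory_storage entries added above chain the new dataset through the standard snapshot → meadow → garden → grapher pipeline. As a quick illustration (not part of the PR), the chain can be read straight out of dag/main.yml with PyYAML; the walk_chain helper below is hypothetical:

import yaml  # assumes PyYAML is available

with open("dag/main.yml") as f:
    dag = yaml.safe_load(f)["steps"]


def walk_chain(step: str) -> list[str]:
    """Hypothetical helper: follow each step's first dependency down to the snapshot."""
    chain = [step]
    while step in dag and dag[step]:
        step = dag[step][0]
        chain.append(step)
    return chain


print(walk_chain("data://grapher/technology/2024-05-13/computer_memory_storage"))
# Expected, given the entries above:
# grapher -> garden -> meadow -> snapshot://technology/2024-05-13/computer_memory_storage.xlsx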
41 changes: 41 additions & 0 deletions etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
@@ -0,0 +1,41 @@
definitions:
  common:
    presentation:
      topic_tags:
        - Technological Change
    processing_level: minor

dataset:
  update_period_days: 365


tables:
  computer_memory_storage:
    variables:
      ddrives:
        title: Historical price of disk drives
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        description_from_producer: "In general, these are the lowest priced disk drives for which I found prices at the time. The floppy drives are not the lowest price per capacity. Floppies are included because they set a low unit price, making disk drives accessible to the masses."
        display:
          numDecimalPlaces: 0
      flash:
        title: Historical price of flash memory
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices."
        display:
          numDecimalPlaces: 0
      memory:
        title: Historical price of memory
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        display:
          numDecimalPlaces: 0
      ssd:
        title: Historical price of solid-state drives
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices."
        display:
          numDecimalPlaces: 0
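As a sanity check on this metadata (not part of the PR), the file can be loaded with PyYAML to confirm all four variables share the same unit; the file path below is an assumption based on the garden step location:

import yaml

META_PATH = "etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml"  # assumed path

with open(META_PATH) as f:
    meta = yaml.safe_load(f)

variables = meta["tables"]["computer_memory_storage"]["variables"]
for name, var in variables.items():
    # All four series are expressed in current US$ per terabyte.
    assert var["unit"] == "current US$ per terabyte", name
    assert var["short_unit"] == "$/TB", name

print(sorted(variables))  # ['ddrives', 'flash', 'memory', 'ssd']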
33 changes: 33 additions & 0 deletions etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,33 @@
"""Load a meadow dataset and create a garden dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    #
    # Load inputs.
    #
    # Load meadow dataset.
    ds_meadow = paths.load_dataset("computer_memory_storage")

    # Read table from meadow dataset.
    tb = ds_meadow["computer_memory_storage"].reset_index()

    #
    # Process data.
    #
    tb = tb.format(["country", "year"])

    #
    # Save outputs.
    #
    # Create a new garden dataset with the same metadata as the meadow dataset.
    ds_garden = create_dataset(
        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
    )

    # Save changes in the new garden dataset.
    ds_garden.save()
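After this step runs, the garden output can be inspected directly with the owid.catalog API; a rough sketch, where the local data path is an assumption that depends on how the ETL data directory is configured:

from owid.catalog import Dataset  # owid-catalog, used throughout the ETL

# Assumed local output location; adjust to your ETL data directory.
ds = Dataset("data/garden/technology/2024-05-13/computer_memory_storage")
tb = ds["computer_memory_storage"]

# The table is indexed by (country, year); values are prices in current US$ per terabyte.
print(tb.columns.tolist())  # expected: ['ddrives', 'flash', 'memory', 'ssd']
print(tb.reset_index().tail())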
28 changes: 28 additions & 0 deletions etl/steps/data/grapher/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,28 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    #
    # Load inputs.
    #
    # Load garden dataset.
    ds_garden = paths.load_dataset("computer_memory_storage")

    # Read table from garden dataset.
    tb = ds_garden["computer_memory_storage"]

    #
    # Save outputs.
    #
    # Create a new grapher dataset with the same metadata as the garden dataset.
    ds_grapher = create_dataset(
        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
    )

    # Save changes in the new grapher dataset.
    ds_grapher.save()
102 changes: 102 additions & 0 deletions etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,102 @@
"""Load a snapshot and create a meadow dataset."""

import pandas as pd

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

PARSING_INSTRUCTIONS = {
    "MEMORY": {"skiprows": 4, "usecols": [0, 1]},
    "DDRIVES": {"skiprows": 4, "usecols": [1, 2, 3, 4]},
    "SSD": {"skiprows": 4, "usecols": [1, 3]},
    "FLASH": {"skiprows": 4, "usecols": [1, 5]},
}


def read_sheet(snapshot, sheet_name):
    """Read a sheet from a snapshot."""
    tb = snapshot.read_excel(sheet_name=sheet_name, header=None, **PARSING_INSTRUCTIONS[sheet_name])

    # If sheet_name = "DDRIVES", the price column is based on column 4, filled in with values
    # from columns 2 and 3 when empty.
    if sheet_name == "DDRIVES":
        tb[4] = tb[4].fillna(tb[2])
        tb[4] = tb[4].fillna(tb[3])
        # Drop columns 2 and 3
        tb = tb.drop(columns=[2, 3])

    tb.columns = ["year", "price"]
    tb["type"] = sheet_name.lower()
    return tb


def clean_data(tb):
    # Remove NA years and prices.
    tb = tb.dropna(subset=["year", "price"])

    # Convert year to integer.
    tb["year"] = tb["year"].astype(int)

    # Convert price to float.
    tb["price"] = tb["price"].astype(float)

    # Keep the cheapest price per year.
    tb = tb.groupby(["year", "type"]).min().reset_index()

    # Sort by year.
    tb = tb.sort_values(["year", "type"])

    # For each type, keep the cheapest value seen so far over time, using a cumulative minimum.
    tb["price"] = tb.groupby("type")["price"].cummin()

    # Convert prices to $/TB instead of $/MB.
    tb["price"] = tb.price.mul(1000000).round(2)

    # Add country World.
    tb["country"] = "World"

    return tb


def reshape_data(tb):
    # Move type to columns.
    tb = tb.pivot(index=["country", "year"], columns="type", values="price").reset_index()
    return tb


def run(dest_dir: str) -> None:
    #
    # Load inputs.
    #
    # Retrieve snapshot.
    snap = paths.load_snapshot("computer_memory_storage.xlsx")

    # Load data from snapshot.
    data = []
    for sheet_name in PARSING_INSTRUCTIONS.keys():
        data.append(read_sheet(snap, sheet_name))
    tb = pd.concat(data)

    #
    # Process data.
    #
    tb = clean_data(tb)
    tb = reshape_data(tb)
    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
    tb = tb.format(["country", "year"])
    tb.metadata.short_name = paths.short_name

    # Ensure metadata is correctly associated.
    for column in tb.columns:
        tb[column].metadata.origins = [snap.metadata.origin]

    #
    # Save outputs.
    #
    # Create a new meadow dataset with the same metadata as the snapshot.
    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)

    # Save changes in the new meadow dataset.
    ds_meadow.save()
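To make the cheapest-per-year, cumulative-minimum and $/MB → $/TB logic in clean_data and reshape_data concrete, here is a toy pandas-only rerun on made-up numbers (the prices below are illustrative, not values from the source spreadsheet):

import pandas as pd

# Toy input in the shape produced by read_sheet(): one row per quoted price, in $/MB.
raw = pd.DataFrame(
    {
        "year": [2004, 2004, 2005, 2006],
        "price": [0.60, 0.45, 0.50, 0.30],  # illustrative $/MB quotes
        "type": ["ssd", "ssd", "ssd", "ssd"],
    }
)

tb = raw.dropna(subset=["year", "price"])
tb = tb.groupby(["year", "type"]).min().reset_index()  # cheapest quote per year
tb = tb.sort_values(["year", "type"])
tb["price"] = tb.groupby("type")["price"].cummin()  # never above an earlier year's price
tb["price"] = tb["price"].mul(1_000_000).round(2)  # $/MB -> $/TB
tb["country"] = "World"

wide = tb.pivot(index=["country", "year"], columns="type", values="price").reset_index()
print(wide)
# One row per (country, year); the 'ssd' column reads 450000.0, 450000.0, 300000.0 ($/TB).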
24 changes: 24 additions & 0 deletions snapshots/technology/2024-05-13/computer_memory_storage.py
@@ -0,0 +1,24 @@
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click

from etl.snapshot import Snapshot

# Version for current snapshot dataset.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    # Create a new snapshot.
    snap = Snapshot(f"technology/{SNAPSHOT_VERSION}/computer_memory_storage.xlsx")

    # Download data from source, add file to DVC and upload to S3.
    snap.create_snapshot(upload=upload)


if __name__ == "__main__":
    main()
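Because of the click --upload/--skip-upload option above, the snapshot can be refreshed locally without pushing to S3 with something like python snapshots/technology/2024-05-13/computer_memory_storage.py --skip-upload (the exact invocation depends on how the environment is set up).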
27 changes: 27 additions & 0 deletions snapshots/technology/2024-05-13/computer_memory_storage.xlsx.dvc
@@ -0,0 +1,27 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
  origin:
    # Data product / Snapshot
    title: Price and Performance Changes of Computer Technology with Time
    date_published: "2023-11-16"

    # Citation
    producer: John C. McCallum
    citation_full: |-
      John C. McCallum, Price and Performance Changes of Computer Technology with Time

    # Files
    url_main: https://jcmit.net/memoryprice.htm
    url_download: https://jcmit.net/MemDiskPrice-XL2010.xlsx
    date_accessed: 2024-05-13

    # License
    license:
      name: copyright 2001, 2022, 2023 John C. McCallum
      url: https://jcmit.net/memoryprice.htm

outs:
  - md5: 54b3fcc2931bb60e428dc1dcc8fea034
    size: 171246
    path: computer_memory_storage.xlsx
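The outs block pins the exact downloaded file by checksum. A minimal sketch (assuming the .xlsx has already been downloaded to the current directory) of how that md5 can be verified by hand:

import hashlib

# Assumed local location of the downloaded snapshot file.
path = "computer_memory_storage.xlsx"

with open(path, "rb") as f:
    md5 = hashlib.md5(f.read()).hexdigest()

print(md5 == "54b3fcc2931bb60e428dc1dcc8fea034")  # True if the file matches the pinned snapshot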
