-
-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2642 from owid/update-comp-mem-storage
Update Price and Performance Changes of Computer Technology with Time
- Loading branch information
Showing
7 changed files
with
263 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Garden-step metadata for the computer memory/storage price dataset.
definitions:
  common:
    presentation:
      topic_tags:
        - Technological Change
    processing_level: minor

dataset:
  update_period_days: 365

tables:
  computer_memory_storage:
    variables:
      ddrives:
        title: Historical price of disk drives
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        description_from_producer: "In general, these are the lowest priced disk drives for which I found prices at the time. The floppy drives are not the lowest price per capacity. Floppies are included because they set a low unit price, making disk drives accessible to the masses."
        display:
          numDecimalPlaces: 0
      flash:
        title: Historical price of flash memory
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices."
        display:
          numDecimalPlaces: 0
      memory:
        # NOTE(review): no description_from_producer here, unlike the other
        # three variables — confirm this omission is intentional.
        title: Historical price of memory
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        display:
          numDecimalPlaces: 0
      ssd:
        title: Historical price of solid-state drives
        unit: "current US$ per terabyte"
        short_unit: "$/TB"
        # NOTE(review): this producer description is identical to the one under
        # `flash` — looks like a copy-paste; confirm the producer's SSD text.
        description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices."
        display:
          numDecimalPlaces: 0
33 changes: 33 additions & 0 deletions
33
etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
"""Load a meadow dataset and create a garden dataset.""" | ||
|
||
from etl.helpers import PathFinder, create_dataset | ||
|
||
# Get paths and naming conventions for current step.
paths = PathFinder(__file__)  # derives channel/namespace/version/name from this file's location
|
||
|
||
def run(dest_dir: str) -> None:
    """Create the garden dataset from its meadow counterpart.

    Reads the meadow table, re-applies the standard formatting, and writes the
    result to `dest_dir` with metadata inherited from the meadow dataset.
    """
    # Read the meadow dataset and pull out its single table.
    meadow_ds = paths.load_dataset("computer_memory_storage")
    table = meadow_ds["computer_memory_storage"].reset_index()

    # Re-apply standard formatting: snake-case columns, (country, year) index, sorting.
    table = table.format(["country", "year"])

    # Write the garden dataset, inheriting metadata from the meadow dataset.
    garden_ds = create_dataset(
        dest_dir,
        tables=[table],
        check_variables_metadata=True,
        default_metadata=meadow_ds.metadata,
    )
    garden_ds.save()
28 changes: 28 additions & 0 deletions
28
etl/steps/data/grapher/technology/2024-05-13/computer_memory_storage.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
"""Load a garden dataset and create a grapher dataset.""" | ||
|
||
from etl.helpers import PathFinder, create_dataset | ||
|
||
# Get paths and naming conventions for current step.
paths = PathFinder(__file__)  # derives channel/namespace/version/name from this file's location
|
||
|
||
def run(dest_dir: str) -> None:
    """Create the grapher dataset from its garden counterpart.

    Passes the garden table through unchanged and writes it to `dest_dir` with
    metadata inherited from the garden dataset.
    """
    # Read the garden dataset and take its single table (index kept as-is).
    garden_ds = paths.load_dataset("computer_memory_storage")
    table = garden_ds["computer_memory_storage"]

    # Write the grapher dataset, inheriting metadata from the garden dataset.
    grapher_ds = create_dataset(
        dest_dir,
        tables=[table],
        check_variables_metadata=True,
        default_metadata=garden_ds.metadata,
    )
    grapher_ds.save()
102 changes: 102 additions & 0 deletions
102
etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
"""Load a snapshot and create a meadow dataset.""" | ||
|
||
import pandas as pd | ||
|
||
from etl.helpers import PathFinder, create_dataset | ||
|
||
# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

# Per-sheet `read_excel` options: rows to skip and which 0-based columns to
# load from each tab of the source workbook.
# NOTE(review): for DDRIVES, columns 2-4 are alternative price columns that
# `read_sheet` later coalesces into one; the remaining column is the year.
# Confirm column positions against the downloaded spreadsheet when updating.
PARSING_INSTRUCTIONS = {
    "MEMORY": {"skiprows": 4, "usecols": [0, 1]},
    "DDRIVES": {"skiprows": 4, "usecols": [1, 2, 3, 4]},
    "SSD": {"skiprows": 4, "usecols": [1, 3]},
    "FLASH": {"skiprows": 4, "usecols": [1, 5]},
}
|
||
|
||
def read_sheet(snapshot, sheet_name):
    """Load one sheet of the snapshot workbook as a (year, price, type) table.

    `sheet_name` must be a key of PARSING_INSTRUCTIONS; the `type` column is
    the lower-cased sheet name.
    """
    sheet = snapshot.read_excel(sheet_name=sheet_name, header=None, **PARSING_INSTRUCTIONS[sheet_name])

    # The DDRIVES sheet spreads prices over three columns: column 4 is the
    # preferred value, with gaps back-filled from column 2 and then column 3.
    if sheet_name == "DDRIVES":
        sheet[4] = sheet[4].fillna(sheet[2]).fillna(sheet[3])
        sheet = sheet.drop(columns=[2, 3])

    sheet.columns = ["year", "price"]
    sheet["type"] = sheet_name.lower()
    return sheet
|
||
|
||
def clean_data(tb):
    """Clean raw (year, price, type) rows and convert prices to US$ per terabyte.

    Drops incomplete rows, keeps the cheapest price per (year, type), enforces a
    non-increasing price series per type via a running minimum, and tags every
    row with country "World".
    """
    # Rows without a year or a price carry no information.
    tb = tb.dropna(subset=["year", "price"])

    # Coerce dtypes: integer years, float prices.
    tb = tb.assign(year=tb["year"].astype(int), price=tb["price"].astype(float))

    # Within each (year, type) pair keep only the cheapest observation.
    tb = tb.groupby(["year", "type"]).min().reset_index()

    # Chronological order, then a running minimum per type so the series never
    # goes back up when a later year only has pricier observations.
    tb = tb.sort_values(["year", "type"])
    tb["price"] = tb.groupby("type")["price"].cummin()

    # Source prices are $/MB; scale to $/TB and round to cents.
    tb["price"] = (tb["price"] * 1_000_000).round(2)

    # Single global series.
    tb["country"] = "World"

    return tb
|
||
|
||
def reshape_data(tb):
    """Spread the long (type, price) pairs into one price column per type."""
    wide = tb.pivot(index=["country", "year"], columns="type", values="price")
    return wide.reset_index()
|
||
|
||
def run(dest_dir: str) -> None:
    """Build the meadow dataset from the computer memory/storage snapshot.

    Reads every configured sheet of the snapshot workbook, cleans and pivots
    the data to a wide (one column per type) table, and writes it to
    `dest_dir` with metadata inherited from the snapshot.
    """
    # Retrieve the snapshot and read every configured sheet.
    snap = paths.load_snapshot("computer_memory_storage.xlsx")
    frames = [read_sheet(snap, name) for name in PARSING_INSTRUCTIONS]
    tb = pd.concat(frames)

    # Clean, pivot to wide format, then apply standard formatting
    # (snake-case columns, (country, year) index, sorting).
    tb = clean_data(tb)
    tb = reshape_data(tb)
    tb = tb.format(["country", "year"])
    tb.metadata.short_name = paths.short_name

    # Attach the snapshot origin to every column so variable metadata is complete.
    for column in tb.columns:
        tb[column].metadata.origins = [snap.metadata.origin]

    # Write the meadow dataset, inheriting metadata from the snapshot.
    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
    ds_meadow.save()
24 changes: 24 additions & 0 deletions
24
snapshots/technology/2024-05-13/computer_memory_storage.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
"""Script to create a snapshot of dataset.""" | ||
|
||
from pathlib import Path | ||
|
||
import click | ||
|
||
from etl.snapshot import Snapshot | ||
|
||
# Version for current snapshot dataset.
# Taken from this file's parent directory name (e.g. "2024-05-13").
SNAPSHOT_VERSION = Path(__file__).parent.name
|
||
|
||
@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    """Download the source spreadsheet and register it as a snapshot."""
    # Instantiate the snapshot for this dataset version.
    snapshot = Snapshot(f"technology/{SNAPSHOT_VERSION}/computer_memory_storage.xlsx")

    # Fetch the file, add it to DVC, and (optionally) upload it to S3.
    snapshot.create_snapshot(upload=upload)


if __name__ == "__main__":
    main()
27 changes: 27 additions & 0 deletions
27
snapshots/technology/2024-05-13/computer_memory_storage.xlsx.dvc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
  origin:
    # Data product / Snapshot
    title: Price and Performance Changes of Computer Technology with Time
    date_published: "2023-11-16"

    # Citation
    producer: John C. McCallum
    citation_full: |-
      John C. McCallum, Price and Performance Changes of Computer Technology with Time

    # Files
    url_main: https://jcmit.net/memoryprice.htm
    url_download: https://jcmit.net/MemDiskPrice-XL2010.xlsx
    date_accessed: 2024-05-13

    # License
    license:
      name: copyright 2001, 2022, 2023 John C. McCallum
      url: https://jcmit.net/memoryprice.htm

# DVC tracking of the downloaded file; regenerated when the snapshot is
# re-created — do not edit the hash/size by hand.
outs:
  - md5: 54b3fcc2931bb60e428dc1dcc8fea034
    size: 171246
    path: computer_memory_storage.xlsx