From 22c050d7683c64622f94f1cc5c926a922f75196f Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Mon, 13 May 2024 14:39:36 +0200 Subject: [PATCH 1/9] computer_memory_storage: create snapshot --- .../2024-05-13/computer_memory_storage.py | 24 +++++++++++++++++ .../computer_memory_storage.xlsx.dvc | 27 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 snapshots/technology/2024-05-13/computer_memory_storage.py create mode 100644 snapshots/technology/2024-05-13/computer_memory_storage.xlsx.dvc diff --git a/snapshots/technology/2024-05-13/computer_memory_storage.py b/snapshots/technology/2024-05-13/computer_memory_storage.py new file mode 100644 index 00000000000..90b0c5d5120 --- /dev/null +++ b/snapshots/technology/2024-05-13/computer_memory_storage.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"technology/{SNAPSHOT_VERSION}/computer_memory_storage.xlsx") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/technology/2024-05-13/computer_memory_storage.xlsx.dvc b/snapshots/technology/2024-05-13/computer_memory_storage.xlsx.dvc new file mode 100644 index 00000000000..591c582310b --- /dev/null +++ b/snapshots/technology/2024-05-13/computer_memory_storage.xlsx.dvc @@ -0,0 +1,27 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Price and Performance Changes of Computer Technology with Time + date_published: "2023-11-16" + + # Citation + producer: John C. McCallum + citation_full: |- + John C. McCallum, Price and Performance Changes of Computer Technology with Time + + # Files + url_main: https://jcmit.net/memoryprice.htm + url_download: https://jcmit.net/MemDiskPrice-XL2010.xlsx + date_accessed: 2024-05-13 + + # License + license: + name: copyright 2001, 2022, 2023 John C. McCallum + url: https://jcmit.net/memoryprice.htm + +outs: + - md5: 54b3fcc2931bb60e428dc1dcc8fea034 + size: 171246 + path: computer_memory_storage.xlsx From a7dbf19f835fc3e6087ed1afdf3ef3eaaa00e402 Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Mon, 13 May 2024 15:17:34 +0200 Subject: [PATCH 2/9] computer_memory_storage: add meadow --- .../2024-05-13/computer_memory_storage.py | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py diff --git a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py new file mode 100644 index 00000000000..ff262d557c3 --- /dev/null +++ b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py @@ -0,0 +1,94 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset +from owid.catalog import Table + +import pandas as pd + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +PARSING_INSTRUCTIONS = { + "MEMORY": {"skiprows": 4, "usecols": [0, 1]}, + "DDRIVES": {"skiprows": 4, "usecols": [1, 4]}, + "SSD": {"skiprows": 4, "usecols": [1, 3]}, + "FLASH": {"skiprows": 4, "usecols": [1, 5]}, +} + + +def read_sheet(snapshot, sheet_name): + """Read a sheet from a snapshot.""" + tb = snapshot.read_excel(sheet_name=sheet_name, header=None, **PARSING_INSTRUCTIONS[sheet_name]) + tb.columns = ["year", "price"] + tb["type"] = sheet_name.lower() + return tb + + +def clean_data(tb): + # Remove NA years + tb = tb.dropna(subset=["year"]) + + # Convert year to integer + tb["year"] = tb["year"].astype(int) + + # Convert price to float + tb["price"] = tb["price"].astype(float) + + # Keep cheapest price per year + tb = tb.groupby(["year", "type"]).min().reset_index() + + # Sort by year + tb = tb.sort_values(["year", "type"]) + + # For each type, keep cheapest value over time use cummin + tb["price"] = tb.groupby("type")["price"].cummin() + + # Convert prices to $/TB instead of $/MB + tb["price"] = tb.price.mul(1000000).round(2) + + # Add country World + tb["country"] = "World" + + return tb + + +def reshape_data(tb): + # Move type to columns + tb = tb.pivot(index=["country", "year"], columns="type", values="price").reset_index() + return tb + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("computer_memory_storage.xlsx") + + # Load data from snapshot. + data = [] + for sheet_name in PARSING_INSTRUCTIONS.keys(): + data.append(read_sheet(snap, sheet_name)) + tb = pd.concat(data) + + # + # Process data. + # + tb = clean_data(tb) + tb = reshape_data(tb) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year"]) + tb.metadata.short_name = paths.short_name + + # Ensure metadata is correctly associated. + for column in tb.columns: + tb[column].metadata.origins = [snap.metadata.origin] + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() From eac93a60e92cf4cca40274eff600e99806139649 Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Mon, 13 May 2024 15:17:42 +0200 Subject: [PATCH 3/9] computer_memory_storage: add garden --- .../computer_memory_storage.meta.yml | 41 +++++++++++++++++++ .../2024-05-13/computer_memory_storage.py | 33 +++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml create mode 100644 etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.py diff --git a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml new file mode 100644 index 00000000000..7a9761c2255 --- /dev/null +++ b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml @@ -0,0 +1,41 @@ +definitions: + common: + presentation: + topic_tags: + - Technological Change + processing_level: major + +dataset: + update_period_days: 365 + + +tables: + computer_memory_storage: + variables: + ddrives: + title: Historical price of memory + unit: "current US$ per terabyte" + short_unit: "$/TB" + description_from_producer: "In general, these are the lowest priced disk drives for which I found prices at the time. The floppy drives are not the lowest price per capacity. Floppies are included because they set a low unit price, making disk drives accessible to the masses." + display: + numDecimalPlaces: 0 + flash: + title: Historical price of memory + unit: "current US$ per terabyte" + short_unit: "$/TB" + description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices." + display: + numDecimalPlaces: 0 + memory: + title: Historical price of memory + unit: "current US$ per terabyte" + short_unit: "$/TB" + display: + numDecimalPlaces: 0 + ssd: + title: Historical price of memory + unit: "current US$ per terabyte" + short_unit: "$/TB" + description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices." + display: + numDecimalPlaces: 0 diff --git a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.py new file mode 100644 index 00000000000..1e1c015cf02 --- /dev/null +++ b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.py @@ -0,0 +1,33 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("computer_memory_storage") + + # Read table from meadow dataset. + tb = ds_meadow["computer_memory_storage"].reset_index() + + # + # Process data. + # + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() From b5b1f1afeb15b42474d46e86ac51db6febdc5d98 Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Mon, 13 May 2024 15:17:46 +0200 Subject: [PATCH 4/9] computer_memory_storage: add grapher --- .../2024-05-13/computer_memory_storage.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 etl/steps/data/grapher/technology/2024-05-13/computer_memory_storage.py diff --git a/etl/steps/data/grapher/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/grapher/technology/2024-05-13/computer_memory_storage.py new file mode 100644 index 00000000000..c5dce9b4af0 --- /dev/null +++ b/etl/steps/data/grapher/technology/2024-05-13/computer_memory_storage.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("computer_memory_storage") + + # Read table from garden dataset. + tb = ds_garden["computer_memory_storage"] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() From 3c48a5d4bd0c4caeb2af18b0ea1e4c72c48d361f Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Mon, 13 May 2024 15:17:50 +0200 Subject: [PATCH 5/9] computer_memory_storage: add to DAG --- dag/main.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dag/main.yml b/dag/main.yml index 70d73615809..a111642604c 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -605,6 +605,14 @@ steps: data://grapher/technology/2023-11-28/dna_sequencing: - data://garden/technology/2023-11-28/dna_sequencing + # John C. McCallum, Price and Performance Changes of Computer Technology with Time + data://meadow/technology/2024-05-13/computer_memory_storage: + - snapshot://technology/2024-05-13/computer_memory_storage.xlsx + data://garden/technology/2024-05-13/computer_memory_storage: + - data://meadow/technology/2024-05-13/computer_memory_storage + data://grapher/technology/2024-05-13/computer_memory_storage: + - data://garden/technology/2024-05-13/computer_memory_storage + # European Social Survey - Trust questions data://meadow/ess/2023-08-02/ess_trust: - snapshot://ess/2023-08-02/ess_trust.csv From 0f406459285428015cf5099aba18240926dd9c3b Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Mon, 13 May 2024 15:22:20 +0200 Subject: [PATCH 6/9] computer_memory_storage: fix indicator titles --- .../technology/2024-05-13/computer_memory_storage.meta.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml index 7a9761c2255..d2b77bfea42 100644 --- a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml +++ b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml @@ -13,14 +13,14 @@ tables: computer_memory_storage: variables: ddrives: - title: Historical price of memory + title: Historical price of disk drives unit: "current US$ per terabyte" short_unit: "$/TB" description_from_producer: "In general, these are the lowest priced disk drives for which I found prices at the time. The floppy drives are not the lowest price per capacity. Floppies are included because they set a low unit price, making disk drives accessible to the masses." display: numDecimalPlaces: 0 flash: - title: Historical price of memory + title: Historical price of flash memory unit: "current US$ per terabyte" short_unit: "$/TB" description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices." @@ -33,7 +33,7 @@ tables: display: numDecimalPlaces: 0 ssd: - title: Historical price of memory + title: Historical price of solid-state drives unit: "current US$ per terabyte" short_unit: "$/TB" description_from_producer: "Flash memory provides a solid state alternative to disk drives, although at a greater cost per megabyte. In general, these are the lowest priced USB flash memory devices I could find. I have stopped recording separate flash prices, since SSDs are now the best source of flash memory prices." From c8dc7a9229d7743484ee0cf6bcba0016f134bfcd Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Mon, 13 May 2024 15:22:52 +0200 Subject: [PATCH 7/9] computer_memory_storage: change to minor processing --- .../technology/2024-05-13/computer_memory_storage.meta.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml index d2b77bfea42..520017bb2fd 100644 --- a/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml +++ b/etl/steps/data/garden/technology/2024-05-13/computer_memory_storage.meta.yml @@ -3,7 +3,7 @@ definitions: presentation: topic_tags: - Technological Change - processing_level: major + processing_level: minor dataset: update_period_days: 365 From 02f86e395818bf4593493dc83d3b14591d50da5a Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Mon, 13 May 2024 15:24:44 +0200 Subject: [PATCH 8/9] computer_memory_storage: make lint --- .../meadow/technology/2024-05-13/computer_memory_storage.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py index ff262d557c3..a4891187b70 100644 --- a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py +++ b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py @@ -1,10 +1,9 @@ """Load a snapshot and create a meadow dataset.""" -from etl.helpers import PathFinder, create_dataset -from owid.catalog import Table - import pandas as pd +from etl.helpers import PathFinder, create_dataset + # Get paths and naming conventions for current step. paths = PathFinder(__file__) From cff1a2b0a6f523c1ebc465446129ea83ebf8e93d Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Mon, 13 May 2024 15:42:18 +0200 Subject: [PATCH 9/9] computer_memory_storage: fix missing values in ddrives --- .../2024-05-13/computer_memory_storage.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py index a4891187b70..c23fe64b7ca 100644 --- a/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py +++ b/etl/steps/data/meadow/technology/2024-05-13/computer_memory_storage.py @@ -9,7 +9,7 @@ PARSING_INSTRUCTIONS = { "MEMORY": {"skiprows": 4, "usecols": [0, 1]}, - "DDRIVES": {"skiprows": 4, "usecols": [1, 4]}, + "DDRIVES": {"skiprows": 4, "usecols": [1, 2, 3, 4]}, "SSD": {"skiprows": 4, "usecols": [1, 3]}, "FLASH": {"skiprows": 4, "usecols": [1, 5]}, } @@ -18,14 +18,23 @@ def read_sheet(snapshot, sheet_name): """Read a sheet from a snapshot.""" tb = snapshot.read_excel(sheet_name=sheet_name, header=None, **PARSING_INSTRUCTIONS[sheet_name]) + + # If sheet_name = "DDRIVES", the price column is based on column 4, filled in with values + # from columns 2 and 3 when empty. + if sheet_name == "DDRIVES": + tb[4] = tb[4].fillna(tb[2]) + tb[4] = tb[4].fillna(tb[3]) + # Drop columns 2 and 3 + tb = tb.drop(columns=[2, 3]) + tb.columns = ["year", "price"] tb["type"] = sheet_name.lower() return tb def clean_data(tb): - # Remove NA years - tb = tb.dropna(subset=["year"]) + # Remove NA years and prices + tb = tb.dropna(subset=["year", "price"]) # Convert year to integer tb["year"] = tb["year"].astype(int)