class H5CoroArrMean(H5Test):
    """Benchmark that reads one variable from every granule with h5coro and averages it."""

    @timer_decorator
    def run(self, dataset="/gt1l/heights", variable="h_ph"):
        """
        Return the mean of ``dataset/variable`` over every file in ``self.files``.

        :param dataset: HDF5 group that contains the variable
        :param variable: name of the variable inside ``dataset`` to average
        :return: ``np.mean`` of the values read from all files (NaN if nothing was read)
        """
        path = f"{dataset}/{variable}"
        # Seed with an empty float64 chunk so the result matches the previous
        # np.insert-based accumulation: float64 values, NaN when nothing is read.
        chunks = [np.empty(0, dtype=np.float64)]

        for file in self.files:
            # h5coro is given "bucket/key", i.e. the URL without its scheme.
            resource = file.replace("s3://", "")
            if file.startswith("s3://nasa-cryo-persistent/"):
                h5obj = h5coro.H5Coro(resource, s3driver.S3Driver)
            else:
                # NOTE(review): the "annon" key is kept exactly as written; confirm
                # against the h5coro S3Driver that this is the spelling it expects.
                h5obj = h5coro.H5Coro(
                    resource,
                    s3driver.S3Driver,
                    credentials={"annon": True},
                )
            ds = h5obj.readDatasets(datasets=[path], block=True)
            chunks.append(np.asarray(ds[path][:], dtype=np.float64).ravel())

        # One concatenation at the end instead of re-copying the array per file.
        return np.mean(np.concatenate(chunks))
class H5pyArrSubsetMean(H5Test):
    """Benchmark that averages a variable over the photons inside a region, using h5py."""

    def __init__(self, data_format, geometry=None, files=None, store_results=True):
        """
        geometry : path to geojson file containing geometry
        files : granule URLs forwarded to H5Test (which requires at least one)
        store_results : forwarded to H5Test; whether results are persisted
        """
        # H5Test.__init__ rejects an empty file list, so the files must be
        # forwarded instead of relying on the base-class default.
        super().__init__(
            data_format,
            files=files if files is not None else [],
            store_results=store_results,
        )
        self.bounds = get_subset_region(geometry)

    @timer_decorator
    @fsspec_logging_decorator
    def run(self, io_params=None, dataset="/gt1l/heights", variable="h_ph"):
        """
        Return the mean of ``dataset/variable`` restricted to ``self.bounds``.

        :param io_params: optional dict with "fsspec_params" (kwargs for
            ``self.s3_fs.open``) and/or "h5py_params" (kwargs for ``h5py.File``)
        :param dataset: HDF5 group holding ``lat_ph``, ``lon_ph`` and the variable
        :param variable: name of the variable inside ``dataset`` to average
        :return: ``np.mean`` of the subset values from all files
        """
        io_params = io_params or {}
        # Both keys are optional; default to "no extra arguments".
        fsspec_params = io_params.get("fsspec_params", {})
        h5py_params = io_params.get("h5py_params", {})

        # Empty float64 seed keeps the float64/NaN-on-empty behaviour of the
        # previous np.insert accumulation.
        chunks = [np.empty(0, dtype=np.float64)]
        for file in self.files:
            # Open the remote file in its own context so it is closed as well,
            # not only the h5py handle layered on top of it.
            with self.s3_fs.open(file, "rb", **fsspec_params) as fo:
                with h5py.File(fo, **h5py_params) as f:
                    lat = f[f"{dataset}/lat_ph"][:]
                    lon = f[f"{dataset}/lon_ph"][:]

                    idx_start, idx_end = get_subset_indices(lat, lon, self.bounds)

                    # Leaving this code here so that we can create a DataFrame or
                    # Dataset at a later date. Suggest creating dict which can be
                    # passed to xarray or (geo)pandas
                    # lat[idx_start:idx_end])
                    # lon[idx_start:idx_end])
                    data = f[f"{dataset}/{variable}"][idx_start:idx_end]
            chunks.append(np.asarray(data, dtype=np.float64).ravel())

        return np.mean(np.concatenate(chunks))
def timer_decorator(func):
    """
    A decorator to measure the execution time of the wrapped function.

    The wrapper returns ``(result, {"execution_time": seconds})``. When
    ``self.store_results`` is truthy it also calls ``self.store`` with the
    elapsed time, the benchmark value and a timestamped CSV file name.
    """
    # Local import: only this decorator needs functools.
    import functools

    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        tstamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")

        # perf_counter is monotonic, so the duration is immune to clock changes.
        start_time = time.perf_counter()
        result = func(self, *args, **kwargs)
        execution_time = time.perf_counter() - start_time

        # Remember the I/O parameters of this run. A positional first argument
        # only counts when it is a dict: some run() methods take the dataset
        # path first, and that must not be recorded as runtime parameters.
        if "io_params" in kwargs:
            self.runtime_params = kwargs["io_params"]
        elif args and isinstance(args[0], dict):
            self.runtime_params = args[0]

        # Call the store method here
        if self.store_results:
            func_result = result
            # A (value, info-dict) pair comes from an inner decorator such as
            # fsspec_logging_decorator; store only the value. Any other list,
            # dict or tuple is a genuine result and is stored untouched.
            if (
                isinstance(result, tuple)
                and len(result) == 2
                and isinstance(result[1], dict)
            ):
                func_result = result[0]
            results_key = f"{tstamp}_{self.name}_{self.data_format}_results.csv"
            self.store(
                run_time=execution_time, result=func_result, file_name=results_key
            )
        return result, {"execution_time": execution_time}

    return wrapper
+ data_format: str, + files=[], + store_results=True, + ): self.name = self.__class__.__name__ + self.io_stats = None + self.runtime_params = None + self.log_filename = "" self.data_format = data_format - if files: + if len(files) > 0: self.files = files else: - self.files = S3Links().get_links_by_format(data_format) - self.s3_client = boto3.client('s3') # Ensure AWS credentials are configured - self.s3_fs = s3fs.S3FileSystem(anon=False) + raise ValueError("We need at least 1 ATL03 granule URL hosted in S3") + self.store_results = store_results - self.bucket = "nasa-cryo-scratch" - self.results_directory = "h5cloud/benchmark_results" + + if files[0].startswith("s3://nasa-cryo-persistent"): + self.s3_client = boto3.client("s3") # + self.annon_access = False + self.results_bucket = "s3://nasa-cryo-persistent/" + self.results_directory = "h5cloud/benchmark_results" + self.results_store_type = "S3" + else: + self.annon_access = True + self.results_path = "results" + pathlib.Path(f"./{self.results_path}").mkdir(exist_ok=True) + self.results_store_type = "Local" + + self.s3_fs = s3fs.S3FileSystem(anon=self.annon_access) + self.file_sizes = [self.s3_fs.info(file)["size"] for file in self.files] @timer_decorator - def run(self): + def run(self, io_params, dataset, variable): raise NotImplementedError("The run method has not been implemented") - def store(self, run_time: float, result: str, bucket: str, s3_key: str): + def store(self, run_time: float, result: str, file_name: str): """ Store test results to an S3 bucket as a CSV file. 
- :param run_time: The runtime of the test :param result: The result of the test - :param bucket: The name of the S3 bucket where the CSV will be uploaded - :param s3_key: The S3 key (filename) where the CSV will be stored + :param file_name: file to store the results """ # Create a CSV in-memory csv_buffer = StringIO() csv_writer = csv.writer(csv_buffer) - csv_writer.writerow(['Name', 'Data Format', 'Run Time', 'Result']) # Headers - csv_writer.writerow([self.name, self.data_format, run_time, result]) + if self.io_stats: # if we are using the fsspec logger decorator + csv_writer.writerow( + [ + "Name", + "Data Format", + "Run Time", + "Result", + "Runtime Params", + "Access Log", + "Total Bytes Tranferred", + "Total Requests", + "Average Request Size", + ] + ) # Headers + csv_writer.writerow( + [ + self.name, + self.data_format, + run_time, + result, + self.runtime_params, + self.log_filename, + self.io_stats["total_reqs_bytes"], + self.io_stats["total_reqs"], + self.io_stats["avg_req_size"], + ] + ) + else: + csv_writer.writerow( + [ + "Name", + "Data Format", + "Run Time", + "Result", + ] + ) # Headers + csv_writer.writerow( + [ + self.name, + self.data_format, + run_time, + result, + ] + ) # Reset the buffer's position to the beginning csv_buffer.seek(0) # Upload the CSV to S3 - self.s3_client.put_object(Bucket=bucket, Key=s3_key, Body=csv_buffer.getvalue()) + if self.results_store_type == "S3": + # assumes s3 can write to bucket + self.s3_client.put_object( + Bucket=self.results_bucket, + Key=f"{self.results_directory}/{file_name}", + Body=csv_buffer.getvalue(), + ) + else: + with open(f"{self.results_path}/{file_name}", "w", newline="") as csv_file: + csv_file.write(csv_buffer.getvalue()) + ## Example subclass # class SampleTest(H5Test): diff --git a/h5tests/single-test.ipynb b/h5tests/single-test.ipynb index 1ea5439..cd7a3e0 100644 --- a/h5tests/single-test.ipynb +++ b/h5tests/single-test.ipynb @@ -1,22 +1,27 @@ { "cells": [ + { + "cell_type": "markdown", + 
"id": "a1039b23-f008-4740-adbd-bafb8eaccfd2", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Testing access time on ICESat-2 ATL03 HDF5 files in AWS S3.\n", + "\n", + "This notebook runs a single test from the different access patterns and stores the results in `results/` and `logs/`\n", + "If we use files in the CryoCloud the results will be send to the S3 bucket `s3://nasa-cryo-persistent/h5cloud/benchmark_results/`\n" + ] + }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 1, "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload \n", @@ -25,25 +30,54 @@ "import os\n", "current = os.path.abspath('..')\n", "sys.path.append(current)\n", - "from h5tests.xarray_arr_len import XarrayArrLen\n", - "from helpers.links import S3Links" + "from xarray_arr_mean import XarrayArrMean\n", + "import pandas as pd\n", + "\n", + "benchmarks = []" ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 2, "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", "metadata": { "tags": [] }, "outputs": [], "source": [ - "test = XarrayArrLen('kerchunk-repacked', store_results=False)" + "granules = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20200922221235_13680801_006_02.h5\",\n", + " # \"s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20191225111315_13680501_006_01.h5\",\n", + "]\n", + "\n", + "# We create the test cases for each kind of granule.\n", + "xarray_test = XarrayArrMean('atl03-xarray-original', files=granules, store_results=True)" + ] + }, + { + "cell_type": "markdown", + "id": "33dcde98-df71-4e49-b051-f67c865981c6", + "metadata": { + "tags": [], + "user_expressions": [] + 
}, + "source": [ + "### Benchmarking access patterns \n", + "\n", + "```python\n", + "io_params ={\n", + " \"fsspec_params\": {}, # if we use fsspec we can pass io params here\n", + " \"h5py_params\" : {} # if we use h5py we can pass io params here\n", + "}\n", + "```\n", + "\n", + "Accessing ATL03 with Xarray takes considerably longer than using h5py directly; this is mainly due to the decoding and metadata that Xarray uses to represent the data.\n", + "Using Xarray it takes approx ~10 minutes per granule out of region (6+ GB granules) and ~2 minutes per granule in-region (6+ GB granules) when we access the non-optimized granules.\n" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 3, "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", "metadata": { "tags": [] @@ -51,17 +85,163 @@ "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
libraryformatmeantimetotal_requested_bytestotal_requestsavg_req_size
0xarrayoriginal18.12813670.1682850852992382813284
\n", + "
" + ], "text/plain": [ - "(338294671, 69.48884391784668)" + " library format mean time total_requested_bytes \\\n", + "0 xarray original 18.128136 70.16828 50852992 \n", + "\n", + " total_requests avg_req_size \n", + "0 3828 13284 " ] }, - "execution_count": 75, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "test.run()" + "# we don't need this when using the original granules.\n", + "io_params ={\n", + " \"fsspec_params\": {\n", + " # \"cache_type\": \"blockcache\",\n", + " # \"block_size\": 8*1024*1024\n", + " },\n", + " \"h5py_params\" : {\n", + "# \"driver_kwds\": {\n", + "# \"page_buf_size\": 64*1024*1024,\n", + "# \"rdcc_nbytes\": 8*1024*1024\n", + "# }\n", + "\n", + " }\n", + "}\n", + "\n", + "# this info gets stored in logs and csv files as usual but we want to plot them here too.\n", + "execution_info, execution_time = xarray_test.run(io_params)\n", + "\n", + "io_stats = execution_info[1][\"io_stats\"]\n", + "\n", + "benchmarks.append({\"library\": \"xarray\",\n", + " \"format\": \"original\",\n", + " \"mean\": execution_info[0],\n", + " \"time\": execution_time[\"execution_time\"],\n", + " \"total_requested_bytes\": io_stats[\"total_reqs_bytes\"],\n", + " \"total_requests\": io_stats[\"total_reqs\"],\n", + " \"avg_req_size\": io_stats[\"avg_req_size\"]})\n", + "\n", + "df = pd.DataFrame.from_dict(benchmarks)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "3d41bada-8735-4973-8536-bd9050b2e31f", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Plotting the resuls\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3ff4c22f-7f77-4c69-a84c-f13b0fbba1f2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
# run() is wrapped by timer_decorator, which the module's h5test import omits.
from h5test import timer_decorator


class XarrayArrMean(H5Test):
    """Benchmark that averages a variable with xarray (kerchunk references or h5netcdf)."""

    def open_reference_ds(self, file: str, dataset: str):
        """
        Open one kerchunk reference file as an xarray Dataset.

        :param file: location of the reference file
        :param dataset: group inside the referenced store to open
        :return: the Dataset returned by ``xr.open_dataset`` for that group
        """
        fs = fsspec.filesystem(
            "reference",
            fo=file,
            remote_protocol="s3",
            # H5Test stores the flag as `annon_access` (double n).
            remote_options=dict(anon=self.annon_access),
            skip_instance_cache=True,
        )
        return xr.open_dataset(
            fs.get_mapper(""), engine="zarr", consolidated=False, group=dataset
        )

    @timer_decorator
    @fsspec_logging_decorator
    def run(self, io_params=None, dataset="/gt1l/heights", variable="h_ph"):
        """
        Return the mean of ``variable`` in group ``dataset`` over ``self.files``.

        :param io_params: optional dict with "fsspec_params" (kwargs for
            ``self.s3_fs.open``) and/or "h5py_params" (extra kwargs forwarded to
            ``xr.open_mfdataset``); only used on the non-kerchunk path
        :param dataset: group to open in each file
        :param variable: name of the variable to average
        :return: the mean value
        """
        if "kerchunk" in self.data_format:
            datasets_ref = [
                self.open_reference_ds(file, dataset) for file in self.files
            ]
            h_ph_values = []
            for ds in datasets_ref:
                h_ph_values = np.append(h_ph_values, ds[variable].values)
            return np.mean(h_ph_values)

        io_params = io_params or {}
        # Both keys are optional; default to "no extra arguments".
        fsspec_params = io_params.get("fsspec_params", {})
        h5py_params = io_params.get("h5py_params", {})

        s3_fileset = [self.s3_fs.open(file, **fsspec_params) for file in self.files]
        xrds = xr.open_mfdataset(
            s3_fileset,
            group=dataset,
            combine="by_coords",
            engine="h5netcdf",
            **h5py_params
        )
        h_ph_values = xrds[variable]
        return float(np.mean(h_ph_values).values)
"s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20181120182818_08110112_006_02.fgb", + "ATL03_20190219140808_08110212_006_02.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20190219140808_08110212_006_02.fgb", + "ATL03_20200217204710_08110612_006_01.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20200217204710_08110612_006_01.fgb", + "ATL03_20211114142614_08111312_006_01.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20211114142614_08111312_006_01.fgb", + "ATL03_20230211164520_08111812_006_01.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20230211164520_08111812_006_01.fgb" + }, + "flatgeobuf_no_sindex": { + "ATL03_20181120182818_08110112_006_02_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20181120182818_08110112_006_02_no_sindex.fgb", + "ATL03_20190219140808_08110212_006_02_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20190219140808_08110212_006_02_no_sindex.fgb", + "ATL03_20200217204710_08110612_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20200217204710_08110612_006_01_no_sindex.fgb", + "ATL03_20211114142614_08111312_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20211114142614_08111312_006_01_no_sindex.fgb", + "ATL03_20230211164520_08111812_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20230211164520_08111812_006_01_no_sindex.fgb" + }, + "geoparquet": { + "ATL03_20181120182818_08110112_006_02.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20181120182818_08110112_006_02.h5.gpq", + "ATL03_20190219140808_08110212_006_02.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20190219140808_08110212_006_02.h5.gpq", + "ATL03_20200217204710_08110612_006_01.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20200217204710_08110612_006_01.h5.gpq", + "ATL03_20211114142614_08111312_006_01.h5.gpq": 
"s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20211114142614_08111312_006_01.h5.gpq", + "ATL03_20230211164520_08111812_006_01.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20230211164520_08111812_006_01.h5.gpq", + "['ATL03_20200217204710_08110612_006_01.h5'].gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/['ATL03_20200217204710_08110612_006_01.h5'].gpq" + }, + + "h5repack": { + "ATL03_20181120182818_08110112_006_02_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5", + "ATL03_20190219140808_08110212_006_02_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5", + "ATL03_20200217204710_08110612_006_01_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5", + "ATL03_20211114142614_08111312_006_01_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5", + "ATL03_20230211164520_08111812_006_01_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5" + }, + "kerchunk-original": { + "original_ATL03_20181120182818_08110112_006_02.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20181120182818_08110112_006_02.json", + "original_ATL03_20190219140808_08110212_006_02.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20190219140808_08110212_006_02.json", + "original_ATL03_20200217204710_08110612_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20200217204710_08110612_006_01.json", + "original_ATL03_20211114142614_08111312_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20211114142614_08111312_006_01.json", + "original_ATL03_20230211164520_08111812_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20230211164520_08111812_006_01.json" + }, + "kerchunk-repacked": { + 
"h5repack_ATL03_20181120182818_08110112_006_02_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20181120182818_08110112_006_02_repacked.json", + "h5repack_ATL03_20190219140808_08110212_006_02_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20190219140808_08110212_006_02_repacked.json", + "h5repack_ATL03_20200217204710_08110612_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20200217204710_08110612_006_01_repacked.json", + "h5repack_ATL03_20211114142614_08111312_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20211114142614_08111312_006_01_repacked.json", + "h5repack_ATL03_20230211164520_08111812_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20230211164520_08111812_006_01_repacked.json" + }, + "original": { + "ATL03_20181120182818_08110112_006_02.h5": "s3://nasa-cryo-permanent/h5cloud/original/ATL03_20181120182818_08110112_006_02.h5", + "ATL03_20190219140808_08110212_006_02.h5": "s3://nasa-cryo-permanent/h5cloud/original/ATL03_20190219140808_08110212_006_02.h5", + "ATL03_20200217204710_08110612_006_01.h5": "s3://nasa-cryo-permanent/h5cloud/original/ATL03_20200217204710_08110612_006_01.h5", + "ATL03_20211114142614_08111312_006_01.h5": "s3://nasa-cryo-permanent/h5cloud/original/ATL03_20211114142614_08111312_006_01.h5", + "ATL03_20230211164520_08111812_006_01.h5": "s3://nasa-cryo-permanent/h5cloud/original/ATL03_20230211164520_08111812_006_01.h5" + } +} \ No newline at end of file diff --git a/helpers/links.py b/helpers/links.py index 5590042..e212ef0 100644 --- a/helpers/links.py +++ b/helpers/links.py @@ -4,8 +4,7 @@ import s3fs -S3LINK = "s3://nasa-cryo-scratch/h5cloud/" -S3FILELINKS = Path("../helpers/s3filelinks.json") +S3LINK = "s3://nasa-cryo-permanent/h5cloud/" class S3Links: @@ -41,9 +40,11 @@ class S3Links: 'h5cloud/original/ATL03_20181120182818_08110112_006_02.h5' """ - def 
__init__(self): - self.json_file = S3FILELINKS - self.table = load_s3testfile(S3FILELINKS) + def __init__(self, file="../helpers/s3filelinks.json"): + self.S3FILELINKS = Path(file) + + self.json_file = self.S3FILELINKS + self.table = load_s3testfile(self.S3FILELINKS) self.formats = list(self.table.keys()) def get_links_by_format(self, file_format): @@ -86,9 +87,9 @@ def update_links(self, write_to_file=True): print("Differences between self.table and S3 buckets: updating self.table") self.table = filelinks self.formats = list(self.table.keys()) - response = input(f"Update {S3FILELINKS} (y or n)?") + response = input(f"Update {self.S3FILELINKS} (y or n)?") if response.lower() == "y": - print(f"Updating {S3FILELINKS}") + print(f"Updating {self.S3FILELINKS}") write_s3links(filelinks) diff --git a/helpers/s3filelinks.json b/helpers/s3filelinks.json index 2a4ab87..2818b4b 100644 --- a/helpers/s3filelinks.json +++ b/helpers/s3filelinks.json @@ -1,52 +1,67 @@ { "flatgeobuf": { - "ATL03_20181120182818_08110112_006_02.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20181120182818_08110112_006_02.fgb", - "ATL03_20190219140808_08110212_006_02.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20190219140808_08110212_006_02.fgb", - "ATL03_20200217204710_08110612_006_01.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20200217204710_08110612_006_01.fgb", - "ATL03_20211114142614_08111312_006_01.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20211114142614_08111312_006_01.fgb", - "ATL03_20230211164520_08111812_006_01.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf/ATL03_20230211164520_08111812_006_01.fgb" + }, "flatgeobuf_no_sindex": { - "ATL03_20181120182818_08110112_006_02_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20181120182818_08110112_006_02_no_sindex.fgb", - "ATL03_20190219140808_08110212_006_02_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20190219140808_08110212_006_02_no_sindex.fgb", 
- "ATL03_20200217204710_08110612_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20200217204710_08110612_006_01_no_sindex.fgb", - "ATL03_20211114142614_08111312_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20211114142614_08111312_006_01_no_sindex.fgb", - "ATL03_20230211164520_08111812_006_01_no_sindex.fgb": "s3://nasa-cryo-scratch/h5cloud/flatgeobuf_no_sindex/ATL03_20230211164520_08111812_006_01_no_sindex.fgb" + }, "geoparquet": { - "ATL03_20181120182818_08110112_006_02.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20181120182818_08110112_006_02.h5.gpq", - "ATL03_20190219140808_08110212_006_02.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20190219140808_08110212_006_02.h5.gpq", - "ATL03_20200217204710_08110612_006_01.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20200217204710_08110612_006_01.h5.gpq", - "ATL03_20211114142614_08111312_006_01.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20211114142614_08111312_006_01.h5.gpq", - "ATL03_20230211164520_08111812_006_01.h5.gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/ATL03_20230211164520_08111812_006_01.h5.gpq", - "['ATL03_20200217204710_08110612_006_01.h5'].gpq": "s3://nasa-cryo-scratch/h5cloud/geoparquet/['ATL03_20200217204710_08110612_006_01.h5'].gpq" - }, - "h5repack": { - "ATL03_20181120182818_08110112_006_02_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5", - "ATL03_20190219140808_08110212_006_02_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5", - "ATL03_20200217204710_08110612_006_01_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5", - "ATL03_20211114142614_08111312_006_01_repacked.h5": "s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5", - "ATL03_20230211164520_08111812_006_01_repacked.h5": 
"s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5" - }, - "kerchunk-original": { - "original_ATL03_20181120182818_08110112_006_02.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20181120182818_08110112_006_02.json", - "original_ATL03_20190219140808_08110212_006_02.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20190219140808_08110212_006_02.json", - "original_ATL03_20200217204710_08110612_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20200217204710_08110612_006_01.json", - "original_ATL03_20211114142614_08111312_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20211114142614_08111312_006_01.json", - "original_ATL03_20230211164520_08111812_006_01.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-original/original_ATL03_20230211164520_08111812_006_01.json" - }, - "kerchunk-repacked": { - "h5repack_ATL03_20181120182818_08110112_006_02_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20181120182818_08110112_006_02_repacked.json", - "h5repack_ATL03_20190219140808_08110212_006_02_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20190219140808_08110212_006_02_repacked.json", - "h5repack_ATL03_20200217204710_08110612_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20200217204710_08110612_006_01_repacked.json", - "h5repack_ATL03_20211114142614_08111312_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20211114142614_08111312_006_01_repacked.json", - "h5repack_ATL03_20230211164520_08111812_006_01_repacked.json": "s3://nasa-cryo-scratch/h5cloud/kerchunk-repacked/h5repack_ATL03_20230211164520_08111812_006_01_repacked.json" - }, - "original": { - "ATL03_20181120182818_08110112_006_02.h5": "s3://nasa-cryo-scratch/h5cloud/original/ATL03_20181120182818_08110112_006_02.h5", - 
"ATL03_20190219140808_08110212_006_02.h5": "s3://nasa-cryo-scratch/h5cloud/original/ATL03_20190219140808_08110212_006_02.h5", - "ATL03_20200217204710_08110612_006_01.h5": "s3://nasa-cryo-scratch/h5cloud/original/ATL03_20200217204710_08110612_006_01.h5", - "ATL03_20211114142614_08111312_006_01.h5": "s3://nasa-cryo-scratch/h5cloud/original/ATL03_20211114142614_08111312_006_01.h5", - "ATL03_20230211164520_08111812_006_01.h5": "s3://nasa-cryo-scratch/h5cloud/original/ATL03_20230211164520_08111812_006_01.h5" + + }, + "atl03-bigsize-original": { + "ATL03_20181120182818_08110112_006_02.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5", + "ATL03_20190219140808_08110212_006_02.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5", + "ATL03_20200217204710_08110612_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20200217204710_08110612_006_01.h5", + "ATL03_20211114142614_08111312_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20211114142614_08111312_006_01.h5", + "ATL03_20230211164520_08111812_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20230211164520_08111812_006_01.h5" + }, + "atl03-bigsize-h5repack": { + "ATL03_20181120182818_08110112_006_02_repacked.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5", + "ATL03_20190219140808_08110212_006_02_repacked.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5", + "ATL03_20200217204710_08110612_006_01_repacked.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20200217204710_08110612_006_01_repacked.h5", + "ATL03_20211114142614_08111312_006_01_repacked.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20211114142614_08111312_006_01_repacked.h5", + "ATL03_20230211164520_08111812_006_01_repacked.h5": 
"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20230211164520_08111812_006_01_repacked.h5" + }, + "atl03-kerchunk-bigsize-original": { + "atl03_ATL03_20190219140808_08110212_006_02.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20190219140808_08110212_006_02.json", + "atl03_ATL03_20230211164520_08111812_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20230211164520_08111812_006_01.json", + "atl03_ATL03_20200217204710_08110612_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20200217204710_08110612_006_01.json", + "atl03_ATL03_20181120182818_08110112_006_02.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json", + "atl03_ATL03_20211114142614_08111312_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20211114142614_08111312_006_01.json" + }, + "atl03-kerchunk-bigsize-repacked": { + "atl03_ATL03_20181120182818_08110112_006_02_repacked.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json", + "atl03_ATL03_20190219140808_08110212_006_02_repacked.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20190219140808_08110212_006_02_repacked.json", + "atl03_ATL03_20211114142614_08111312_006_01_repacked.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20211114142614_08111312_006_01_repacked.json", + "atl03_ATL03_20200217204710_08110612_006_01_repacked.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20200217204710_08110612_006_01_repacked.json", + "atl03_ATL03_20230211164520_08111812_006_01_repacked.json": "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20230211164520_08111812_006_01_repacked.json" + }, + "atl03-midsize-original": { + 
"ATL03_20191225111315_13680501_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20191225111315_13680501_006_01.h5", + "ATL03_20200922221235_13680801_006_02.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20200922221235_13680801_006_02.h5", + "ATL03_20220620155150_13681501_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20220620155150_13681501_006_01.h5", + "ATL03_20220919113142_13681601_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20220919113142_13681601_006_01.h5", + "ATL03_20230618223036_13681901_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5" + }, + "atl03-midsize-h5repack":{ + "ATL03_20191225111315_13680501_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20191225111315_13680501_006_01.h5", + "ATL03_20200922221235_13680801_006_02.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20200922221235_13680801_006_02.h5", + "ATL03_20220620155150_13681501_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20220620155150_13681501_006_01.h5", + "ATL03_20220919113142_13681601_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20220919113142_13681601_006_01.h5", + "ATL03_20230618223036_13681901_006_01.h5": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5" + }, + "atl03-kerchunk-midsize-original": { + "atl03_ATL03_20220919113142_13681601_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/kerchunk/atl03_ATL03_20220919113142_13681601_006_01.json", + "atl03_ATL03_20191225111315_13680501_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/kerchunk/atl03_ATL03_20191225111315_13680501_006_01.json", + "atl03_ATL03_20220620155150_13681501_006_01.json": 
"s3://nasa-cryo-persistent/h5cloud/atl03/average/original/kerchunk/atl03_ATL03_20220620155150_13681501_006_01.json", + "atl03_ATL03_20200922221235_13680801_006_02.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/kerchunk/atl03_ATL03_20200922221235_13680801_006_02.json", + "atl03_ATL03_20230618223036_13681901_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/kerchunk/atl03_ATL03_20230618223036_13681901_006_01.json" + }, + "atl03-kerchunk-midsize-repacked": { + "atl03_ATL03_20220620155150_13681501_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/kerchunk/atl03_ATL03_20220620155150_13681501_006_01.json", + "atl03_ATL03_20191225111315_13680501_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/kerchunk/atl03_ATL03_20191225111315_13680501_006_01.json", + "atl03_ATL03_20220919113142_13681601_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/kerchunk/atl03_ATL03_20220919113142_13681601_006_01.json", + "atl03_ATL03_20200922221235_13680801_006_02.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/kerchunk/atl03_ATL03_20200922221235_13680801_006_02.json", + "atl03_ATL03_20230618223036_13681901_006_01.json": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/kerchunk/atl03_ATL03_20230618223036_13681901_006_01.json" } } \ No newline at end of file diff --git a/helpers/s3itslive.json b/helpers/s3itslive.json new file mode 100644 index 0000000..4f0a16d --- /dev/null +++ b/helpers/s3itslive.json @@ -0,0 +1,50 @@ +{ + "flatgeobuf": { + + }, + "flatgeobuf_no_sindex": { + + }, + "geoparquet": { + + }, + "atl03-bigsize-original": { + "ATL03_20181120182818_08110112_006_02.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5", + "ATL03_20190219140808_08110212_006_02.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5", + 
"ATL03_20200217204710_08110612_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20200217204710_08110612_006_01.h5", + "ATL03_20211114142614_08111312_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20211114142614_08111312_006_01.h5", + "ATL03_20230211164520_08111812_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20230211164520_08111812_006_01.h5" + }, + "atl03-bigsize-h5repack": { + "ATL03_20181120182818_08110112_006_02_repacked.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5", + "ATL03_20190219140808_08110212_006_02_repacked.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5", + "ATL03_20200217204710_08110612_006_01_repacked.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20200217204710_08110612_006_01_repacked.h5", + "ATL03_20211114142614_08111312_006_01_repacked.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20211114142614_08111312_006_01_repacked.h5", + "ATL03_20230211164520_08111812_006_01_repacked.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20230211164520_08111812_006_01_repacked.h5" + }, + "atl03-kerchunk-bigsize-original": { + + }, + "atl03-kerchunk-bigsize-repacked": { + }, + "atl03-midsize-original": { + "ATL03_20191225111315_13680501_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20191225111315_13680501_006_01.h5", + "ATL03_20200922221235_13680801_006_02.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20200922221235_13680801_006_02.h5", + "ATL03_20220620155150_13681501_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20220620155150_13681501_006_01.h5", + "ATL03_20220919113142_13681601_006_01.h5": 
"s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20220919113142_13681601_006_01.h5", + "ATL03_20230618223036_13681901_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5" + }, + "atl03-midsize-h5repack":{ + "ATL03_20191225111315_13680501_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20191225111315_13680501_006_01.h5", + "ATL03_20200922221235_13680801_006_02.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20200922221235_13680801_006_02.h5", + "ATL03_20220620155150_13681501_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20220620155150_13681501_006_01.h5", + "ATL03_20220919113142_13681601_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20220919113142_13681601_006_01.h5", + "ATL03_20230618223036_13681901_006_01.h5": "s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5" + }, + "atl03-kerchunk-midsize-original": { + + }, + "atl03-kerchunk-midsize-repacked": { + + } +} \ No newline at end of file diff --git a/notebooks/access_time.summary.png b/notebooks/access_time.summary.png new file mode 100644 index 0000000..b8c3e4e Binary files /dev/null and b/notebooks/access_time.summary.png differ diff --git a/notebooks/benchmarks.csv b/notebooks/benchmarks.csv new file mode 100644 index 0000000..bda4468 --- /dev/null +++ b/notebooks/benchmarks.csv @@ -0,0 +1,47 @@ +,tool,dataset,cloud-aware,format,file,time,mean,size,product +0,h5py,ATL03-1GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5,2.843794107437134,386.06738,1GB,ATL03 +1,h5py,ATL03-1GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5,4.157144546508789,386.06738,1GB,ATL03 
+2,h5py,ATL03-7GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5,6.9494102001190186,1035.1631,7GB,ATL03 +3,h5py,ATL03-7GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5,13.6586012840271,1035.1631,7GB,ATL03 +4,h5py,ATL03-2GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5,1.4053022861480713,2049.7554,2GB,ATL03 +5,h5py,ATL03-2GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5,1.0851728916168213,2049.7554,2GB,ATL03 +6,kerchunk,ATL03-7GB-kerchunk,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json,10.746918678283691," +array(1035.1631, dtype=float32)",7GB,ATL03 +7,kerchunk,ATL03-7GB-kerchunk,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json,8.8134024143219," +array(1035.1631, dtype=float32)",7GB,ATL03 +8,xarray,ATL03-1GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5,46.50308704376221," +array(386.06738, dtype=float32)",1GB,ATL03 +9,xarray,ATL03-1GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5,10.25867509841919," +array(386.06738, dtype=float32)",1GB,ATL03 +10,xarray,ATL03-7GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5,62.89623713493347," +array(1035.1631, dtype=float32)",7GB,ATL03 +11,xarray,ATL03-7GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5,81.67518210411072," +array(1035.1631, dtype=float32)",7GB,ATL03 
+12,xarray,ATL03-2GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5,47.506706953048706," +array(2049.7554, dtype=float32)",2GB,ATL03 +13,xarray,ATL03-2GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5,18.109654188156128," +array(2049.7554, dtype=float32)",2GB,ATL03 +14,h5coro,ATL03-1GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5,4.562052011489868,386.06738,1GB,ATL03 +15,h5coro,ATL03-1GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5,4.286046743392944,386.06738,1GB,ATL03 +16,h5coro,ATL03-7GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5,14.072925567626953,1035.1631,7GB,ATL03 +17,h5coro,ATL03-7GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5,11.79448390007019,1035.1631,7GB,ATL03 +18,h5coro,ATL03-2GB,no,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5,3.1101267337799072,2049.7554,2GB,ATL03 +19,h5coro,ATL03-2GB,no,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5,1.8120653629302979,2049.7554,2GB,ATL03 +20,h5py,ATL03-1GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5,1.8618409633636475,386.06738,1GB,ATL03 +21,h5py,ATL03-1GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5,1.9302234649658203,386.06738,1GB,ATL03 +22,h5py,ATL03-7GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5,6.602761507034302,1035.1631,7GB,ATL03 
+23,h5py,ATL03-7GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5,5.758350849151611,1035.1631,7GB,ATL03 +24,h5py,ATL03-2GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5,1.2604756355285645,2049.7554,2GB,ATL03 +25,h5py,ATL03-2GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5,0.8633284568786621,2049.7554,2GB,ATL03 +26,xarray,ATL03-1GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5,42.18248891830444," +array(386.06738, dtype=float32)",1GB,ATL03 +27,xarray,ATL03-1GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5,2.5429904460906982," +array(386.06738, dtype=float32)",1GB,ATL03 +28,xarray,ATL03-7GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5,48.71459078788757," +array(1035.1631, dtype=float32)",7GB,ATL03 +29,xarray,ATL03-7GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5,6.6719231605529785," +array(1035.1631, dtype=float32)",7GB,ATL03 +30,xarray,ATL03-2GB,yes,original,s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5,40.31614112854004," +array(2049.7554, dtype=float32)",2GB,ATL03 +31,xarray,ATL03-2GB,yes,optimized,s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5,2.156572103500366," +array(2049.7554, dtype=float32)",2GB,ATL03 diff --git a/notebooks/01_data-selection.ipynb b/notebooks/data-wrangling/01_data-selection.ipynb similarity index 100% rename from notebooks/01_data-selection.ipynb rename to notebooks/data-wrangling/01_data-selection.ipynb diff --git a/notebooks/arr_mean_bar_plot.png 
b/notebooks/data-wrangling/arr_mean_bar_plot.png similarity index 100% rename from notebooks/arr_mean_bar_plot.png rename to notebooks/data-wrangling/arr_mean_bar_plot.png diff --git a/notebooks/benchmark-h5repack.ipynb b/notebooks/data-wrangling/benchmark-h5repack.ipynb similarity index 100% rename from notebooks/benchmark-h5repack.ipynb rename to notebooks/data-wrangling/benchmark-h5repack.ipynb diff --git a/notebooks/benchmark-small-file-h5repack.ipynb b/notebooks/data-wrangling/benchmark-small-file-h5repack.ipynb similarity index 100% rename from notebooks/benchmark-small-file-h5repack.ipynb rename to notebooks/data-wrangling/benchmark-small-file-h5repack.ipynb diff --git a/notebooks/benchmarks-outline.ipynb b/notebooks/data-wrangling/benchmarks-outline.ipynb similarity index 100% rename from notebooks/benchmarks-outline.ipynb rename to notebooks/data-wrangling/benchmarks-outline.ipynb diff --git a/notebooks/data-wrangling/cloud-optimized-hdf5.ipynb b/notebooks/data-wrangling/cloud-optimized-hdf5.ipynb new file mode 100644 index 0000000..98eaa91 --- /dev/null +++ b/notebooks/data-wrangling/cloud-optimized-hdf5.ipynb @@ -0,0 +1,1062 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "11f9a9cb-c049-461e-8578-7090a644508e", + "metadata": {}, + "source": [ + "# Cloud Optimized HDF: or How I Learned to Stop Worrying and Love the Format\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "6332a484-8fd6-4448-827f-aa48e6322f8f", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "2d37475f-42b0-4105-b34c-529f627d9066", + "metadata": {}, + "source": [ + "## The big ol list of \"ifs\"\n", + "\n", + "* We use the most recent versions of h5py, xarray and fsspec\n", + "* We create the HDF5 files with [cloud optimized flags](https://www.youtube.com/watch?v=rcS5vt-mKok)\n", + " * if the files are out there we can repack them, consolidating the metadata and perhaps increasing the chunk sizes\n", + "* We know how to
\"tweak the knobs\" (or have a fair understanding of what the I/O libraries are doing)." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "736bb5fb-c5cd-42bf-be4e-6b81ae6eb865", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xarray v2024.1.1\n", + "h5py v3.10.0\n", + "s3fs v2023.12.2\n" + ] + } + ], + "source": [ + "import xarray as xr\n", + "import h5py\n", + "import s3fs\n", + "\n", + "fs = s3fs.S3FileSystem(anon=True)\n", + "\n", + "for library in (xr, h5py, s3fs):\n", + " print(f'{library.__name__} v{library.__version__}')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "78d6697b-9f84-4edf-b426-fde27560bc68", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ETag': '\"237bbd5828745b9e1a1e0ba88486e43c-835\"',\n", + " 'LastModified': datetime.datetime(2024, 1, 29, 4, 48, 24, tzinfo=tzutc()),\n", + " 'size': 6997123664,\n", + " 'name': 'its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5',\n", + " 'type': 'file',\n", + " 'StorageClass': 'INTELLIGENT_TIERING',\n", + " 'VersionId': None,\n", + " 'ContentType': 'application/x-hdf5'}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# a \"big\" ATL03 file from the ICESat-2 mission\n", + "original_granule = \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\"\n", + "# the same \"big\" ATL03 file from the ICESat-2 mission, metadata consolidated in 8MB-size pages.\n", + "cloud_optimized = \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\"\n", + "\n", + "fs.info(original_granule)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e94bb01e-a325-4ab3-8f6a-ac5799d14f02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ETag':
'\"08af0688f787f10eee1ccfb13f7eb66d-836\"',\n", + " 'LastModified': datetime.datetime(2024, 1, 29, 4, 52, 44, tzinfo=tzutc()),\n", + " 'size': 7008000000,\n", + " 'name': 'its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5',\n", + " 'type': 'file',\n", + " 'StorageClass': 'INTELLIGENT_TIERING',\n", + " 'VersionId': None,\n", + " 'ContentType': 'application/x-hdf5'}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fs.info(cloud_optimized)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec2bce8f-bcf4-4982-8556-d3a71209af74", + "metadata": {}, + "outputs": [], + "source": [ + "# don't even try this out of region (us-west-2) will take forever, forever >= 30 minutes\n", + "ds = xr.open_dataset(fs.open(original_granule),\n", + " group=\"/gt1l/heights\",\n", + " engine=\"h5netcdf\")\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9b5701b-6a8b-41ac-a56a-34a4f42125e1", + "metadata": {}, + "outputs": [], + "source": [ + "# again... don't even try this out of region (us-west-2) will take forever, forever >= 30 minutes\n", + "ds = xr.open_dataset(fs.open(cloud_optimized),\n", + " group=\"/gt1l/heights\",\n", + " engine=\"h5netcdf\")\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "0def8b43-7616-4e01-a502-3f44811ae47e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4.16 s, sys: 3.04 s, total: 7.2 s\n", + "Wall time: 20.6 s\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:         (delta_time: 73765028, ds_surf_type: 5)\n",
+       "Coordinates:\n",
+       "  * delta_time      (delta_time) datetime64[ns] 2019-02-19T14:08:08.557345384...\n",
+       "    lat_ph          (delta_time) float64 ...\n",
+       "    lon_ph          (delta_time) float64 ...\n",
+       "Dimensions without coordinates: ds_surf_type\n",
+       "Data variables:\n",
+       "    dist_ph_across  (delta_time) float32 ...\n",
+       "    dist_ph_along   (delta_time) float32 ...\n",
+       "    h_ph            (delta_time) float32 ...\n",
+       "    pce_mframe_cnt  (delta_time) uint32 ...\n",
+       "    ph_id_channel   (delta_time) uint8 ...\n",
+       "    ph_id_count     (delta_time) uint8 ...\n",
+       "    ph_id_pulse     (delta_time) uint8 ...\n",
+       "    quality_ph      (delta_time) int8 ...\n",
+       "    signal_conf_ph  (delta_time, ds_surf_type) int8 ...\n",
+       "    weight_ph       (delta_time) uint8 ...\n",
+       "Attributes:\n",
+       "    Description:  Contains arrays of the parameters for each received photon.\n",
+       "    data_rate:    Data are stored at the photon detection rate.
" + ], + "text/plain": [ + "\n", + "Dimensions: (delta_time: 73765028, ds_surf_type: 5)\n", + "Coordinates:\n", + " * delta_time (delta_time) datetime64[ns] 2019-02-19T14:08:08.557345384...\n", + " lat_ph (delta_time) float64 ...\n", + " lon_ph (delta_time) float64 ...\n", + "Dimensions without coordinates: ds_surf_type\n", + "Data variables:\n", + " dist_ph_across (delta_time) float32 ...\n", + " dist_ph_along (delta_time) float32 ...\n", + " h_ph (delta_time) float32 ...\n", + " pce_mframe_cnt (delta_time) uint32 ...\n", + " ph_id_channel (delta_time) uint8 ...\n", + " ph_id_count (delta_time) uint8 ...\n", + " ph_id_pulse (delta_time) uint8 ...\n", + " quality_ph (delta_time) int8 ...\n", + " signal_conf_ph (delta_time, ds_surf_type) int8 ...\n", + " weight_ph (delta_time) uint8 ...\n", + "Attributes:\n", + " Description: Contains arrays of the parameters for each received photon.\n", + " data_rate: Data are stored at the photon detection rate." + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "# this one is different! 
you can try this at home (cloud optimized HDF5!)\n", + "\n", + "io_params ={\n", + " \"fsspec_params\": {\n", + " # \"skip_instance_cache\": True\n", + " \"cache_type\": \"blockcache\", # or \"first\" with enough space\n", + " \"block_size\": 8*1024*1024 # could be bigger\n", + " },\n", + " \"h5py_params\" : {\n", + " \"driver_kwds\": { # only recent versions of xarray and h5netcdf allow this correctly\n", + " \"page_buf_size\": 32*1024*1024, # this one only works in repacked files\n", + " \"rdcc_nbytes\": 8*1024*1024 # this one is to read the chunks \n", + " }\n", + "\n", + " }\n", + "}\n", + "ds = xr.open_dataset(fs.open(cloud_optimized, **io_params[\"fsspec_params\"]),\n", + " group=\"/gt1l/heights\",\n", + " engine=\"h5netcdf\",\n", + " **io_params[\"h5py_params\"])\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "da959721-2f9d-4151-b361-6f9f38fa5b8c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 11 s, sys: 2.02 s, total: 13 s\n", + "Wall time: 1min 25s\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'h_ph' ()>\n",
+       "array(1031.6101, dtype=float32)
" + ], + "text/plain": [ + "\n", + "array(1031.6101, dtype=float32)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "# takes about ~2 minutes\n", + "ds.h_ph.mean()" + ] + }, + { + "cell_type": "markdown", + "id": "35caf411-afe7-44f7-9264-5e7b892456d0", + "metadata": {}, + "source": [ + "
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/convert_h5dataframe2flatgeobuf.ipynb b/notebooks/data-wrangling/convert_h5dataframe2flatgeobuf.ipynb similarity index 100% rename from notebooks/convert_h5dataframe2flatgeobuf.ipynb rename to notebooks/data-wrangling/convert_h5dataframe2flatgeobuf.ipynb diff --git a/notebooks/example-list-test-files.ipynb b/notebooks/data-wrangling/example-list-test-files.ipynb similarity index 100% rename from notebooks/example-list-test-files.ipynb rename to notebooks/data-wrangling/example-list-test-files.ipynb diff --git a/notebooks/format-preprocessing-times.ipynb b/notebooks/data-wrangling/format-preprocessing-times.ipynb similarity index 100% rename from notebooks/format-preprocessing-times.ipynb rename to notebooks/data-wrangling/format-preprocessing-times.ipynb diff --git a/notebooks/fsspec-logs.ipynb b/notebooks/data-wrangling/fsspec-logs.ipynb similarity index 99% rename from notebooks/fsspec-logs.ipynb rename to notebooks/data-wrangling/fsspec-logs.ipynb index 96bfe63..87d7f05 100644 --- a/notebooks/fsspec-logs.ipynb +++ b/notebooks/data-wrangling/fsspec-logs.ipynb @@ -317,7 +317,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/h5coro_benchmarks.ipynb b/notebooks/data-wrangling/h5coro_benchmarks.ipynb similarity index 100% rename from notebooks/h5coro_benchmarks.ipynb rename to notebooks/data-wrangling/h5coro_benchmarks.ipynb diff --git 
from dataclasses import dataclass
import io
import re


@dataclass(frozen=True)
class ByteRange:
    """Byte range of one read request against a file of known size.

    ``end`` is treated as inclusive: ``size`` is ``end - start + 1``.

    Raises:
        ValueError: If ``start`` is negative, ``end`` or ``filesize`` is not
            strictly positive, ``end`` exceeds ``filesize``, or ``start``
            exceeds ``end``.
    """

    # Offset of the first byte of the request.
    start: int
    # Offset of the last byte of the request (inclusive, see ``size``).
    end: int
    # Total size in bytes of the file the request was made against.
    filesize: int

    def __post_init__(self):
        if self.start < 0 or self.end <= 0 or self.filesize <= 0:
            raise ValueError('Start, end, and file size values must be positive integers')
        elif self.end > self.filesize:
            raise ValueError('End value must be smaller or equal to file size')
        elif self.start > self.end:
            raise ValueError('Start value must be smaller or equal to end value')

    @property
    def size(self):
        """Number of bytes covered by the range."""
        return self.end - self.start + 1

    def __len__(self):
        return self.size


def parse_fsspec_log(content: bytes) -> list[ByteRange]:
    """Extract the byte ranges recorded in an fsspec read log.

    The log is consumed in three consecutive phases, each continuing from
    the line where the previous one stopped:

    1. skip lines until one starts with ``read: 0 - `` (the HEAD line);
    2. skip lines until one starts with ``FileSize: <n>`` and remember ``n``;
    3. turn every remaining line containing ``read: <start> - <end>`` into
       a :class:`ByteRange`.

    The HEAD line itself is not reported as a range.

    Args:
        content: Raw bytes of the log file.

    Returns:
        The byte ranges in the order they appear in the log; empty when no
        range line follows the file-size line.

    Raises:
        RuntimeError: If the HEAD line or the file-size line is missing.
        ValueError: If a logged range fails :class:`ByteRange` validation.
    """
    head_line = re.compile(r'read: 0 - ')
    fsize_line = re.compile(r'FileSize: ([0-9]+)')
    # Named groups are required: the loop below reads them by name.  The
    # original pattern had none, so the first matching line raised
    # ``IndexError: no such group``.  The line carries no size field, hence
    # there is nothing to cross-check ``ByteRange.size`` against.
    range_line = re.compile(r'read: (?P<start>\d+) - (?P<end>\d+)')

    ranges = []
    with io.TextIOWrapper(io.BytesIO(content)) as logtxt:
        # Phase 1: locate the HEAD line.
        for line in logtxt:
            if head_line.match(line):
                break
        else:
            raise RuntimeError('HEAD line not found in the log file')

        # Phase 2: locate the file size, reported after the HEAD line.
        for line in logtxt:
            match = fsize_line.match(line)
            if match:
                fsize = int(match.group(1))
                break
        else:
            raise RuntimeError('FILESIZE line not found in the log file')

        # Phase 3: every remaining read line becomes a ByteRange.
        for line in logtxt:
            match = range_line.search(line)
            if match:
                # Not named ``range``: that would shadow the builtin.
                byte_range = ByteRange(start=int(match.group('start')),
                                       end=int(match.group('end')),
                                       filesize=fsize)
                ranges.append(byte_range)

    return ranges
file parser:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4e4e020-c058-4a6c-b2dd-ed12962ae785", + "metadata": {}, + "outputs": [], + "source": [ + "def parse_ros3vfd_log(content: bytes) -> list[ByteRange]:\n", + " head_line = re.compile('HEAD: Bytes 0 - ')\n", + " fsize_line = re.compile('FILESIZE: ([0-9]+)')\n", + " range_line = re.compile('GET: Bytes (?P<start>[0-9]+) - (?P<end>[0-9]+), Request Size: (?P<size>[0-9]+)')\n", + "\n", + " ranges = list()\n", + " with io.TextIOWrapper(io.BytesIO(content)) as logtxt:\n", + " for line in logtxt:\n", + " if head_line.match(line):\n", + " break\n", + " else:\n", + " raise RuntimeError('HEAD line not found in the log file')\n", + "\n", + " for line in logtxt:\n", + " match = fsize_line.match(line)\n", + " if match:\n", + " fsize = int(match.group(1))\n", + " break\n", + " else:\n", + " raise RuntimeError('FILESIZE line not found in the log file')\n", + "\n", + " for line in logtxt:\n", + " match = range_line.search(line)\n", + " if match:\n", + " range = ByteRange(start=int(match.group('start')), \n", + " end=int(match.group('end')),\n", + " filesize=fsize)\n", + " if range.size != int(match.group('size')):\n", + " raise ValueError(f'Reported size different for {match.group()}')\n", + " ranges.append(range)\n", + " \n", + " return ranges" + ] + }, + { + "cell_type": "markdown", + "id": "502ba537-da68-4bc2-95bc-ccf73b1f3322", + "metadata": {}, + "source": [ + "## Dashboard\n", + "\n", + "Function for generating log stats and plots:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8738d59c-3ff6-4d83-9ef2-4eeeb93ee851", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_ros3vfd_log(from_file):\n", + " if from_file is None:\n", + " return\n", + " elif len(from_file) == 0:\n", + " return [pn.pane.Alert('ros3vfd log file empty.', alert_type='danger')]\n", + " try:\n", + " ranges = parse_ros3vfd_log(from_file)\n", + " except Exception as e:\n", + " return [pn.pane.Alert(f'Error: 
{str(e)}', alert_type='danger')]\n", + " if len(ranges) == 0:\n", + " return [pn.pane.Alert('No range `GET` info found.', alert_type='info')]\n", + " start = np.fromiter([r.start for r in ranges], dtype=np.uint64)\n", + " end = np.fromiter([r.end for r in ranges], dtype=np.uint64)\n", + " req_no = np.arange(len(ranges)) + 1\n", + " sizes = np.fromiter([r.size for r in ranges], np.uint64)\n", + " info = pn.pane.Markdown(f\"\"\"\n", + "# ros3vfd Log Information\n", + "\n", + "Log size: {len(from_file):,} bytes\n", + "\n", + "HDF5 file size: {ranges[0].filesize:,} bytes\n", + "\n", + "Number of range _GET_ requests: {len(ranges):,}\n", + "\n", + "Overall range _GET_ requests stats:\n", + "\n", + "* Smallest: {np.min(sizes):,} bytes
\n", + "* Median: {int(np.median(sizes)):,} bytes
\n", + "* Largest: {np.max(sizes):,} bytes\n", + "\n", + "Maximum file byte read: {end.max():,}\n", + "\n", + "Total of file content read: {sizes.sum():,} bytes\n", + "\n", + "Percentage of content read to file size: {100 * (sizes.sum() / ranges[0].filesize) :.2f} %\n", + "\"\"\")\n", + " data = dict(start=start, end=end, start_event=req_no, end_event=req_no)\n", + " max_offset_range = min(16_000_000, np.max(end))\n", + " req_range = np.where(end <= max_offset_range)[0]\n", + " if req_range.size == 0:\n", + " max_req_range = req_no[-1]\n", + " else:\n", + " max_req_range = req_no[np.where(end <= max_offset_range)[0][-1]] + 1\n", + " ros3plt = hv.Segments(\n", + " data, \n", + " [\n", + " hv.Dimension('start', label='File offset', range=(0, max_offset_range)),\n", + " hv.Dimension('start_event', label='Req. No.', range=(0, max_req_range)), \n", + " 'end', \n", + " 'end_event'\n", + " ]\n", + " )\n", + " hvrtip = HoverTool(\n", + " tooltips = [\n", + " ('req no', '@start_event'),\n", + " ('start byte', '@start'),\n", + " ('end byte', '@end')\n", + " ]\n", + " )\n", + " ros3plt.opts(width=700, height=600, invert_axes=True, color='blue', \n", + " line_width=3, tools=[hvrtip])\n", + " size_hist = hv.Histogram(np.histogram(sizes, bins=512))\n", + " size_hist.opts(color='blue', line_color=None, tools=['hover'],\n", + " xlabel='Size (bytes)', ylabel='Number of requests')\n", + " \n", + " return [pn.Row(info, size_hist), ros3plt]" + ] + }, + { + "cell_type": "markdown", + "id": "d1578b9a-1a8f-43cb-9902-96e21c83cf3a", + "metadata": {}, + "source": [ + "### Dashboard Components" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "553c70f5-6f78-4f28-9d6b-7998cbbc7ec2", + "metadata": {}, + "outputs": [], + "source": [ + "log_file = pn.widgets.FileInput()\n", + "upld_form = pn.Row(\n", + " pn.pane.Markdown('Please select a ros3vfd log file (limit 10MB):'),\n", + " log_file\n", + ")\n", + "res = pn.Column()\n", + "app = pn.WidgetBox(upld_form, res)" + ] + }, + 
{ + "cell_type": "markdown", + "id": "15126686-d315-4ae4-8666-054dd6127ba5", + "metadata": {}, + "source": [ + "Callback function for interactive log processing invocation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "322e3372-be21-4d65-b97e-9db5a552aaf0", + "metadata": {}, + "outputs": [], + "source": [ + "def callback(value):\n", + " res.objects = plot_ros3vfd_log(value)" + ] + }, + { + "cell_type": "markdown", + "id": "dd2b2c83-1d2e-4db4-a23f-db9ad5d6f84d", + "metadata": {}, + "source": [ + "Register callback with the appropriate dashboard object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "580279df-8b79-426f-bb22-002f117f200a", + "metadata": {}, + "outputs": [], + "source": [ + "log_file.param.watch_values(callback, ['value']);" + ] + }, + { + "cell_type": "markdown", + "id": "a3f35ce7-0698-4b02-a95e-75107138a29a", + "metadata": {}, + "source": [ + "Run the dashboard:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56c63583-922a-4b67-8cb0-8f3751ed7c28", + "metadata": {}, + "outputs": [], + "source": [ + "app.servable()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/run-tests.ipynb b/notebooks/data-wrangling/run-tests.ipynb similarity index 100% rename from notebooks/run-tests.ipynb rename to notebooks/data-wrangling/run-tests.ipynb diff --git a/notebooks/sliderule2geoparquet.ipynb b/notebooks/data-wrangling/sliderule2geoparquet.ipynb similarity index 100% rename from notebooks/sliderule2geoparquet.ipynb rename to notebooks/data-wrangling/sliderule2geoparquet.ipynb diff 
--git a/notebooks/xarray-h5coro-backend.ipynb b/notebooks/data-wrangling/xarray-h5coro-backend.ipynb similarity index 100% rename from notebooks/xarray-h5coro-backend.ipynb rename to notebooks/data-wrangling/xarray-h5coro-backend.ipynb diff --git a/notebooks/plot_benchmark_results.ipynb b/notebooks/plot_benchmark_results.ipynb new file mode 100644 index 0000000..a2195b3 --- /dev/null +++ b/notebooks/plot_benchmark_results.ipynb @@ -0,0 +1,449 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e2d85d97-bd23-4af1-b302-1ab55921a30b", + "metadata": { + "user_expressions": [] + }, + "source": [ + "# Plot Benchmarking Results\n", + "\n", + "Plots the results in `benchmarks.csv`" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a7c05ac8-7256-42f7-a351-4b498da62ffc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import re\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "id": "7804d7bc-a01e-46bb-807b-fb5081616d8f", + "metadata": { + "user_expressions": [] + }, + "source": [ + "## Read `benchmarks.csv`\n", + "\n", + "This file is generated using [portable-full-comparison.ipynb](https://hub.cryointhecloud.com/hub/user-redirect/lab/tree/h5cloud/notebooks/portable-full-comparison.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1fa7d33b-ef5f-419d-a0a6-8213e075ede5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tooldatasetcloud-awareformatfiletimemeansizeproduct
0h5pyATL03-1GBnooriginals3://nasa-cryo-persistent/h5cloud/atl03/averag...2.843794386.067381GBATL03
1h5pyATL03-1GBnooptimizeds3://nasa-cryo-persistent/h5cloud/atl03/averag...4.157145386.067381GBATL03
2h5pyATL03-7GBnooriginals3://nasa-cryo-persistent/h5cloud/atl03/big/or...6.9494101035.16317GBATL03
3h5pyATL03-7GBnooptimizeds3://nasa-cryo-persistent/h5cloud/atl03/big/re...13.6586011035.16317GBATL03
4h5pyATL03-2GBnooriginals3://nasa-cryo-persistent/h5cloud/atl03/big/or...1.4053022049.75542GBATL03
\n", + "
" + ], + "text/plain": [ + " tool dataset cloud-aware format \\\n", + "0 h5py ATL03-1GB no original \n", + "1 h5py ATL03-1GB no optimized \n", + "2 h5py ATL03-7GB no original \n", + "3 h5py ATL03-7GB no optimized \n", + "4 h5py ATL03-2GB no original \n", + "\n", + " file time mean \\\n", + "0 s3://nasa-cryo-persistent/h5cloud/atl03/averag... 2.843794 386.06738 \n", + "1 s3://nasa-cryo-persistent/h5cloud/atl03/averag... 4.157145 386.06738 \n", + "2 s3://nasa-cryo-persistent/h5cloud/atl03/big/or... 6.949410 1035.1631 \n", + "3 s3://nasa-cryo-persistent/h5cloud/atl03/big/re... 13.658601 1035.1631 \n", + "4 s3://nasa-cryo-persistent/h5cloud/atl03/big/or... 1.405302 2049.7554 \n", + "\n", + " size product \n", + "0 1GB ATL03 \n", + "1 1GB ATL03 \n", + "2 7GB ATL03 \n", + "3 7GB ATL03 \n", + "4 2GB ATL03 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"benchmarks.csv\", index_col=0)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "8e4e2660-437b-48f7-896f-795e93ae1644", + "metadata": { + "user_expressions": [] + }, + "source": [ + "## Reformat data for plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5c69cca4-430f-4552-b9e7-88bd07deea33", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
formatoptimizedoriginal
cloud-awarenoyesnoyes
toolsize
h5coro1GB4.286047NaN4.562052NaN
2GB1.812065NaN3.110127NaN
7GB11.794484NaN14.072926NaN
h5py1GB4.1571451.9302232.8437941.861841
2GB1.0851730.8633281.4053021.260476
7GB13.6586015.7583516.9494106.602762
kerchunk7GB8.813402NaN10.746919NaN
xarray1GB10.2586752.54299046.50308742.182489
2GB18.1096542.15657247.50670740.316141
7GB81.6751826.67192362.89623748.714591
\n", + "
" + ], + "text/plain": [ + "format optimized original \n", + "cloud-aware no yes no yes\n", + "tool size \n", + "h5coro 1GB 4.286047 NaN 4.562052 NaN\n", + " 2GB 1.812065 NaN 3.110127 NaN\n", + " 7GB 11.794484 NaN 14.072926 NaN\n", + "h5py 1GB 4.157145 1.930223 2.843794 1.861841\n", + " 2GB 1.085173 0.863328 1.405302 1.260476\n", + " 7GB 13.658601 5.758351 6.949410 6.602762\n", + "kerchunk 7GB 8.813402 NaN 10.746919 NaN\n", + "xarray 1GB 10.258675 2.542990 46.503087 42.182489\n", + " 2GB 18.109654 2.156572 47.506707 40.316141\n", + " 7GB 81.675182 6.671923 62.896237 48.714591" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_df = df.pivot_table(index=[\"tool\", \"size\"], columns=[\"format\", \"cloud-aware\"], values=\"time\", aggfunc=\"mean\")\n", + "pivot_df" + ] + }, + { + "cell_type": "markdown", + "id": "97f6a3dc-e5eb-46e7-ac46-611a08143d07", + "metadata": { + "user_expressions": [] + }, + "source": [ + "## Plot results" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "33602bbd-41c0-4133-bab5-76f45e9fe1a5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Set seaborn plot style\n", + "sns.set_style(\"darkgrid\", rc={'axes.facecolor': '0.9'})\n", + "\n", + "tool_order = [\"h5py\", \"xarray\", \"h5coro\", \"kerchunk\"]\n", + "# Create figure and axis to \"contain\" plot - allows customization via ax object\n", + "fig, ax = plt.subplots(figsize=(15,6), layout=\"constrained\")\n", + "\n", + "# Plot results\n", + "pivot_df.loc[tool_order,:].plot(kind=\"bar\", ax=ax, \n", + " color=[\"tab:cyan\", \"tab:blue\", \"tab:pink\", \"tab:red\"],\n", + " xlabel=\"\", fontsize=15);\n", + "ax.legend(labels = [\"Optimized\", \"Optimized with informed io parameters\", \"Original\", \"Original with informed io parameters\"], fontsize=15)\n", + "ax.set_ylabel(\"Time (s)\", fontsize=20)\n", + "\n", + "## Make two level axis\n", + "\n", + "# helper to create axis labels\n", + "def parse_text(s):\n", + " return re.sub(r\"[()]\", \"\", s).split(\", \")\n", + "\n", + "# Retrieve and parse axis labels and position\n", + "tool, size, x, y = map(np.array, zip(*[(*parse_text(l.get_text()), *l.get_position()) for l in ax.get_xticklabels()]))\n", + "# Make labels and x-positions for seconary axis\n", + "sec_x, sec_label = zip(*[(x[tool == tool_name].mean(), \"\\n\"+tool_name) for tool_name in np.unique(tool)])\n", + "# Assign ticks and labels\n", + "ax.set_xticks(x, size, rotation=0);\n", + "sec = ax.secondary_xaxis(location=0);\n", + "sec.set_xticks(sec_x, sec_label, fontsize=18);\n", + "sec.tick_params(length=0)\n", + "\n", + "sepa_x = np.array([x[tool == tool_name].min()-0.5 for tool_name in np.unique(tool)] + [x.max()+0.5])\n", + "[ax.axvline(xs, c='k', ymin=-.1, clip_on=False, zorder=3) for xs in sepa_x];\n", + "\n", + "# Uncomment to save figure\n", + "# fig.savefig(\"access_time.summary.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "815347b5-f23c-4d25-9104-42523e9de093", + "metadata": {}, + "outputs": [], + "source": [] 
+ } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/portable-full-comparison.ipynb b/notebooks/portable-full-comparison.ipynb new file mode 100644 index 0000000..400a027 --- /dev/null +++ b/notebooks/portable-full-comparison.ipynb @@ -0,0 +1,799 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6c9b37e2-2daa-4283-a228-ea581498de0c", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## AB testing access time for ICESat-2 ATL03 HDF5 files in the cloud.\n", + "\n", + "This notebook requires that we have 2 versions of the same file:\n", + " * Original A: The original file with no modifications on an S3 location.\n", + " * Test Case B: A modified version of the original file to test for metadata consolidation, rechunking and other strategies to speed up access to the data in the file.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aaca84b1-46e9-4b41-a494-24da3a368f38", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!mamba uninstall -y h5coro \n", + "%pip install git+https://github.com/ICESat2-SlideRule/h5coro.git" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b78fb94-10ae-48cb-8e30-521b2c8b7822", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import xarray as xr\n", + "import h5py\n", + "import fsspec\n", + "import logging\n", + "import re\n", + "import time\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "from h5coro import h5coro, s3driver, filedriver\n", + "driver = 
s3driver.S3Driver\n", + "\n", + "logger = logging.getLogger('fsspec')\n", + "logger.setLevel(logging.DEBUG)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "431d900d-0656-4b75-af6b-82f0f171d5f8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for library in (xr, h5py, fsspec, h5coro):\n", + " print(f'{library.__name__} v{library.__version__}')" + ] + }, + { + "cell_type": "markdown", + "id": "7998cd99-6034-4a1b-9ae5-d651bc265bff", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "For listing files in CryoCloud\n", + "\n", + "```bash\n", + "aws s3 ls s3://nasa-cryo-persistent/h5cloud/ --recursive\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9850faac-f534-4bc2-9214-c8dababe0f52", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "test_dict = {\n", + " \"ATL03-1GB\": {\n", + " \"links\": {\n", + " \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5\",\n", + " \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5\"\n", + " },\n", + " \"group\": \"/gt1l/heights\",\n", + " \"variable\": \"h_ph\",\n", + " \"processing\": [\n", + " \"h5repack -S PAGE -G 8000000\"\n", + " ]\n", + " },\n", + " \"ATL03-7GB\": {\n", + " \"links\": {\n", + " \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", + " \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", + " },\n", + " \"group\": \"/gt1l/heights\",\n", + " \"variable\": \"h_ph\",\n", + " \"processing\": [\n", + " \"h5repack -S PAGE -G 8000000\"\n", + " ]\n", + " },\n", + " \"ATL03-7GB-kerchunk\": {\n", + " \"links\": {\n", + " \"original\": 
\"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/kerchunk/atl03_ATL03_20181120182818_08110112_006_02.json\",\n", + " \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/kerchunk/atl03_ATL03_20181120182818_08110112_006_02_repacked.json\",\n", + " },\n", + " \"group\": \"/gt1l/heights\",\n", + " \"variable\": \"h_ph\",\n", + " \"processing\": [\n", + " \"h5repack -S PAGE -G 8000000\"\n", + " ]\n", + " }, \n", + " \"ATL03-2GB\": {\n", + " \"links\": {\n", + " \"original\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20210402143840_01341107_006_02.h5\",\n", + " \"optimized\": \"s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20210402143840_01341107_006_02_repacked.h5\",\n", + " },\n", + " \"group\": \"/gt1l/heights\",\n", + " \"variable\": \"h_ph\",\n", + " \"processing\": [\n", + " \"h5repack -S PAGE -G 8000000\"\n", + " ]\n", + " }\n", + "}\n", + "\n", + "def kerchunk_result(file: str, dataset: str, variable: str):\n", + " fs = fsspec.filesystem(\n", + " \"reference\",\n", + " fo=file,\n", + " remote_protocol=\"s3\",\n", + " remote_options=dict(anon=False),\n", + " skip_instance_cache=True,\n", + " )\n", + " ds = xr.open_dataset(\n", + " fs.get_mapper(\"\"), engine=\"zarr\", consolidated=False, group=dataset\n", + " )\n", + " return ds[variable].mean()\n", + "\n", + "# This will use the embedded credentials in the hub to access the s3://nasa-cryo-persistent bucket\n", + "fs = fsspec.filesystem('s3')\n" + ] + }, + { + "cell_type": "markdown", + "id": "4d166627-6144-40bf-884d-2188e5c764ba", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## [h5coro](https://github.com/ICESat2-SlideRule/h5coro/)\n", + "\n", + "**h5coro** is optimized for reading HDF5 data in high-latency high-throughput environments. 
It accomplishes this through a few key design decisions:\n", + "* __All reads are concurrent.__ Each dataset and/or attribute read by **h5coro** is performed in its own thread.\n", + "* __Intelligent range gets__ are used to read as many dataset chunks as possible in each read operation. This drastically reduces the number of HTTP requests to S3 and means there is no longer a need to re-chunk the data (it actually works better on smaller chunk sizes due to the granularity of the request).\n", + "* __Block caching__ is used to minimize the number of GET requests made to S3. S3 has a large first-byte latency (we've measured it at ~60ms on our systems), which means there is a large penalty for each read operation performed. **h5coro** performs all reads to S3 as large block reads and then maintains data in a local cache for access to smaller amounts of data within those blocks.\n", + "* __The system is serverless__ and does not depend on any external services to read the data. This means it scales naturally as the user application scales, and it reduces overall system complexity.\n", + "* __No metadata repository is needed.__ The structure of the file is cached as it is read so that successive reads to other datasets in the same file will not have to re-read and re-build the directory structure of the file.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efe41d4a-1947-438b-a3c3-7ab954d75e13", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "h5coro_beanchmarks = []\n", + "\n", + "for key, dataset in test_dict.items():\n", + " for k, link in dataset[\"links\"].items():\n", + " print (f\"Processing: {link}\")\n", + " if \"kerchunk\" in link:\n", + " continue\n", + " group = dataset[\"group\"]\n", + " variable = dataset['variable'] \n", + " final_h5coro_array = []\n", + " start = time.time()\n", + " if link.startswith(\"s3://nasa-cryo-persistent/\"):\n", + " h5obj = h5coro.H5Coro(link.replace(\"s3://\", \"\"), 
s3driver.S3Driver)\n", + " else:\n", + " h5obj = h5coro.H5Coro(link.replace(\"s3://\", \"\"), s3driver.S3Driver, credentials={\"annon\": True})\n", + " ds = h5obj.readDatasets(datasets=[f'{group}/{variable}'], block=True)\n", + " data = ds[f'{group}/{variable}'][:]\n", + " data_mean = np.mean(data)\n", + " elapsed = time.time() - start\n", + " \n", + " h5coro_beanchmarks.append({\"tool\": \"h5coro\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"no\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean})\n", + "\n", + "\n", + "df = pd.DataFrame.from_dict(h5coro_beanchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n", + "\n", + "# Plotting\n", + "pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "plt.title('h5coro cloud optimized HDF5 performance')\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=90)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8f0ba64d-d89c-4879-b965-f00d70956360", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "### Xarray + kerchunk, out of the box performance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff56958f-8c1d-4fd7-b885-6efb81af8da7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# this is going to keep our numbers without modifying the i/o parameters\n", + "regular_xarray_benchmarks = []\n", + "kerchunk_benchmarks = []\n", + "\n", + "for key, dataset in test_dict.items():\n", + " for k, link in dataset[\"links\"].items():\n", + " print (f\"Processing: {link}\")\n", + " try:\n", + " log_filename = f\"logs/fsspec-xarray-{key}-{k}-default.log\"\n", + " \n", + " # Create a new FileHandler for each iteration\n", + " file_handler = logging.FileHandler(log_filename)\n", + " file_handler.setLevel(logging.DEBUG)\n", + "\n", + " # Add the handler to the root logger\n", + " logging.getLogger().addHandler(file_handler)\n", + " \n", + " start = time.time()\n", + " if \"kerchunk\" in link:\n", + " data_mean = kerchunk_result(link, dataset[\"group\"], dataset[\"variable\"])\n", + " elapsed = time.time() - start\n", + " kerchunk_benchmarks.append(\n", + " {\"tool\": \"kerchunk\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"no\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean}) \n", + " else:\n", + " ds = xr.open_dataset(fs.open(link, mode='rb'), group=dataset[\"group\"], engine=\"h5netcdf\", decode_cf=False)\n", + " data_mean = ds[dataset[\"variable\"]].mean() \n", + " elapsed = time.time() - start\n", + " regular_xarray_benchmarks.append(\n", + " {\"tool\": \"xarray\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"no\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean}) \n", + " \n", + " logging.getLogger().removeHandler(file_handler)\n", + " file_handler.close()\n", + "\n", + " except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "92a8e67d-026e-4c6b-aa7d-b19dc10f4afd", + "metadata": { + "tags": [], + "user_expressions": 
[] + }, + "source": [ + "### Plotting Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "149d5972-c5b9-4f29-979a-cf46c9654a06", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(kerchunk_benchmarks + regular_xarray_benchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n", + "\n", + "# Plotting\n", + "pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "\n", + "plt.title(\"Out of the box I/O parameters\", fontsize=10)\n", + "plt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\n", + "\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=90)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "fa6ac2b9-989c-4246-bb89-b54b711dd695", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## h5py out of the box performance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98c29558-de50-44af-87e9-074092fcd0ac", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "regular_h5py_benchmarks = []\n", + "\n", + "for key, dataset in test_dict.items():\n", + " for k, link in dataset[\"links\"].items():\n", + " try:\n", + " if \"kerchunk\" in link:\n", + " continue \n", + " print (f\"Processing: {link}\")\n", + " log_filename = f\"logs/fsspec-h5py-{key}-{k}_default.log\"\n", + " \n", + " # Create a new FileHandler for each iteration\n", + " file_handler = logging.FileHandler(log_filename)\n", + " file_handler.setLevel(logging.DEBUG)\n", + "\n", + " # Add the handler to the root logger\n", + " logging.getLogger().addHandler(file_handler)\n", + " # this is mostly IO so no perf_counter is needed\n", + " start = time.time()\n", + " with h5py.File(fs.open(link, mode=\"rb\")) as f:\n", + " path = f\"{dataset['group']}/{dataset['variable']}\"\n", + " data = f[path][:]\n", + " data_mean = data.mean()\n", + " elapsed = time.time() - start\n", + " regular_h5py_benchmarks.append(\n", + " {\"tool\": \"h5py\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"no\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean})\n", + "\n", + " logging.getLogger().removeHandler(file_handler) \n", + " file_handler.close()\n", + " \n", + " except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "f4232e98-1159-45eb-ba11-0f0dbb905d83", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "### Plotting Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8fa6dca-f408-4298-beca-f2839d4c3b67", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(regular_h5py_benchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n", + "\n", + "# 
Plotting\n", + "pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "plt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\n", + "plt.title(\"Out of the box I/O parameters\", fontsize=10)\n", + "\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=45)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "b20b2032-9ab4-46e1-b1f8-2e62b656a265", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Aggregated plot by tool and different file sizes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64bcc5de-aae3-46aa-9474-1c90b9ff20a9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(regular_h5py_benchmarks + kerchunk_benchmarks + regular_xarray_benchmarks + h5coro_beanchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n", + "\n", + "# Plotting\n", + "pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "plt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\n", + "plt.title(\"Out of the box I/O parameters\", fontsize=10)\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=90)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "0ea67b0b-5e7f-4d1f-bca9-1f3cae7fe309", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Now let's run the tests with \"informed\" parameters: I/O settings that align with the cloud-optimized granules' chunking strategy and consolidated metadata.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8151834b-0b57-4a3d-98b5-8cfaffa37dc4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "optimized_h5py_benchmarks = []\n", + "optimized_xarray_benchmarks = []\n", + "\n", 
+ "for key, dataset in test_dict.items():\n", + " for k, link in dataset[\"links\"].items():\n", + " print(f\"Processing: {link}\")\n", + " try:\n", + " log_filename = f\"logs/fsspec-xarray-{key}-{k}.log\"\n", + " \n", + " # Create a new FileHandler for each iteration\n", + " file_handler = logging.FileHandler(log_filename)\n", + " file_handler.setLevel(logging.DEBUG)\n", + "\n", + " # Add the handler to the root logger\n", + " logging.getLogger().addHandler(file_handler)\n", + " \n", + " io_params = {\n", + " \"fsspec_params\": {},\n", + " \"h5py_params\": {}\n", + " }\n", + " \n", + " if \"repacked\" in link: \n", + " io_params ={\n", + " \"fsspec_params\": {\n", + " \"cache_type\": \"blockcache\",\n", + " \"block_size\": 8*1024*1024\n", + " },\n", + " \"h5py_params\" : {\n", + " \"driver_kwds\": {\n", + " \"page_buf_size\": 64*1024*1024,\n", + " \"rdcc_nbytes\": 8*1024*1024\n", + " }\n", + "\n", + " }\n", + " }\n", + "\n", + " if \"kerchunk\" in link:\n", + " continue\n", + " \n", + " start = time.time()\n", + " ds = xr.open_dataset(fs.open(link, mode='rb', **io_params[\"fsspec_params\"]), group=dataset[\"group\"], engine=\"h5netcdf\", decode_cf=False)\n", + " data_mean = ds[dataset[\"variable\"]].mean()\n", + " elapsed = time.time() - start\n", + " optimized_xarray_benchmarks.append(\n", + " {\"tool\": \"xarray\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"yes\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean})\n", + " \n", + " logging.getLogger().removeHandler(file_handler)\n", + " file_handler.close()\n", + "\n", + " except Exception as e:\n", + " print(e)\n", + " \n", + "for key, dataset in test_dict.items():\n", + " for k, link in dataset[\"links\"].items():\n", + " try:\n", + " if \"kerchunk\" in link:\n", + " continue \n", + " print (f\"Processing: {link}\")\n", + " log_filename = f\"logs/fsspec-h5py-{key}-{k}_default.log\"\n", + " \n", + " # Create a new FileHandler for each iteration\n", + 
" file_handler = logging.FileHandler(log_filename)\n", + " file_handler.setLevel(logging.DEBUG)\n", + "\n", + " # Add the handler to the root logger\n", + " logging.getLogger().addHandler(file_handler)\n", + " # this is mostly IO so no perf_counter is needed\n", + " start = time.time()\n", + " io_params = {\n", + " \"fsspec_params\": {},\n", + " \"h5py_params\": {}\n", + " }\n", + " \n", + " if \"repacked\" in link: \n", + " io_params ={\n", + " \"fsspec_params\": {\n", + " \"cache_type\": \"blockcache\",\n", + " \"block_size\": 8*1024*1024\n", + " },\n", + " \"h5py_params\" : {\n", + " \"page_buf_size\": 64*1024*1024,\n", + " \"rdcc_nbytes\": 8*1024*1024\n", + " }\n", + " } \n", + " with h5py.File(fs.open(link, mode=\"rb\", **io_params[\"fsspec_params\"]), **io_params[\"h5py_params\"]) as f:\n", + " path = f\"{dataset['group']}/{dataset['variable']}\"\n", + " data = f[path][:]\n", + " data_mean = data.mean()\n", + " elapsed = time.time() - start\n", + " optimized_h5py_benchmarks.append(\n", + " {\"tool\": \"h5py\",\n", + " \"dataset\": key,\n", + " \"cloud-aware\": \"yes\",\n", + " \"format\": k,\n", + " \"file\": link,\n", + " \"time\": elapsed,\n", + " \"mean\": data_mean})\n", + "\n", + " logging.getLogger().removeHandler(file_handler) \n", + " file_handler.close()\n", + " \n", + "\n", + " except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "04414c2e-0666-4701-8ecc-7842727ede22", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Plotting results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2db2535a-8d3a-4e65-b21c-8db6b48074c8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(optimized_h5py_benchmarks+h5coro_beanchmarks+optimized_xarray_benchmarks+kerchunk_benchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['tool','dataset'], columns=['format'], values='time', aggfunc='mean')\n", + "\n", + "# Plotting\n", + 
"pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "\n", + "plt.suptitle('Cloud-optimized HDF5 performance (less is better)', fontsize=14)\n", + "plt.title(\"Informed I/O parameters\", fontsize=10)\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=90)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "ea0db03e-5653-4908-ada1-16d723666e18", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Ploting tool specific performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47444e8a-6d59-42c2-baff-a3c85c447eb2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(regular_xarray_benchmarks+optimized_xarray_benchmarks)\n", + "\n", + "pivot_df = df.pivot_table(index=['dataset','cloud-aware'], columns=['format'], values='time', aggfunc='mean')\n", + "\n", + "# Plotting\n", + "pivot_df.plot(kind='bar', figsize=(10, 6))\n", + "plt.title('Xarray \"Cloud-Aware\" Access Pattern Performance (less is better)')\n", + "plt.xlabel('Tool')\n", + "plt.ylabel('Mean Time')\n", + "plt.xticks(rotation=90)\n", + "plt.legend(title='Format')\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8395f794-0ea7-4c26-8f64-0d2f9659d841", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## Make one comparison plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbe17a07-22e3-4b99-a50a-d3183425d15c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(regular_h5py_benchmarks + \n", + " kerchunk_benchmarks + \n", + " regular_xarray_benchmarks + \n", + " h5coro_beanchmarks + \n", + " optimized_h5py_benchmarks + \n", + " optimized_xarray_benchmarks)\n", + "df[\"size\"] = df.dataset.str.extract(r\"-(\\dGB)\")\n", + "df[\"product\"] = 
df.dataset.str.extract(r\"(ATL\\d{2})\")\n", + "df.to_csv(\"benchmarks.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90486527-a1f2-4a92-bee6-1b2f934aa24d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pivot_df = df.pivot_table(index=[\"tool\", \"size\"], columns=[\"format\", \"cloud-aware\"], values=\"time\", aggfunc=\"mean\")\n", + "pivot_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88badbc0-a277-4aee-9236-d74327032d0d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "\n", + "sns.set_style(\"darkgrid\", rc={'axes.facecolor': '0.9'})\n", + "# sns.set_palette(\"bright\", 4)\n", + "\n", + "fig, ax = plt.subplots(figsize=(15,6), layout=\"constrained\")\n", + "\n", + "pivot_df.plot(kind=\"bar\", ax=ax, \n", + " color=[\"tab:cyan\", \"tab:blue\", \"tab:pink\", \"tab:red\"],\n", + " xlabel=\"\", fontsize=15);\n", + "ax.legend(labels = [\"Optimized\", \"Optimized with informed io parameters\", \"Original\", \"Original with informed io parameters\"], fontsize=15)\n", + "ax.set_ylabel(\"Time (s)\", fontsize=20)\n", + "\n", + "# Make two level axis\n", + "def parse_text(s):\n", + " return re.sub(r\"[()]\", \"\", s).split(\", \")\n", + "\n", + "# Retrieve and parse axis labels and position\n", + "tool, size, x, y = map(np.array, zip(*[(*parse_text(l.get_text()), *l.get_position()) for l in ax.get_xticklabels()]))\n", + "# Make labels and x-positions for seconary axis\n", + "sec_x, sec_label = zip(*[(x[tool == tool_name].mean(), \"\\n\"+tool_name) for tool_name in np.unique(tool)])\n", + "# Assign ticks and labels\n", + "ax.set_xticks(x, size, rotation=0);\n", + "sec = ax.secondary_xaxis(location=0);\n", + "sec.set_xticks(sec_x, sec_label, fontsize=18);\n", + "sec.tick_params(length=0)\n", + "\n", + "sepa_x = np.array([x[tool == tool_name].min()-0.5 for tool_name in np.unique(tool)] + [x.max()+0.5])\n", + "[ax.axvline(xs, c='k', ymin=-.1, 
clip_on=False, zorder=3) for xs in sepa_x];\n", + "\n", + "# Use plot_benchmark_results.ipynb to generate saveable png" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8dd30a5-952e-428f-b908-9897fac81aa7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae595359-3e8a-4072-89b4-bd2e52d9ec12", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/portable-h5coro-test.ipynb b/notebooks/portable-h5coro-test.ipynb new file mode 100644 index 0000000..df2394f --- /dev/null +++ b/notebooks/portable-h5coro-test.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload \n", + "\n", + "import sys\n", + "import os\n", + "classes_path = os.path.abspath('../h5tests/')\n", + "sys.path.append(classes_path)\n", + "from h5coro_arr_mean import H5CoroArrMean\n", + "import pandas as pd\n", + "\n", + "benchmarks = []" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "original_granules = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", 
+ "]\n", + "h5coro_original = H5CoroArrMean('atl03-bigsize-original', files=original_granules, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "ename": "FatalError", + "evalue": "invalid credential keys provided, looking for: aws_access_key_id, aws_secret_access_key, and aws_session_token", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFatalError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# takes about ~30 seconds per granule out of region (6+ GB granules)\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mh5coro_original\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m benchmarks\u001b[38;5;241m.\u001b[39mappend({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlibrary\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mh5coro\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 4\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mformat\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moriginal\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmean\u001b[39m\u001b[38;5;124m\"\u001b[39m: results[\u001b[38;5;241m0\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal_requests\u001b[39m\u001b[38;5;124m\"\u001b[39m: results[\u001b[38;5;241m3\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal_reqs\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 9\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mavg_req_size\u001b[39m\u001b[38;5;124m\"\u001b[39m: results[\u001b[38;5;241m3\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mavg_req_size\u001b[39m\u001b[38;5;124m\"\u001b[39m]})\n\u001b[1;32m 10\u001b[0m benchmarks\n", + "File \u001b[0;32m~/work/openscapes/h5cloud/h5tests/h5test.py:95\u001b[0m, in \u001b[0;36mtimer_decorator..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 93\u001b[0m __setup_logging(\u001b[38;5;28mself\u001b[39m, tstamp)\n\u001b[1;32m 94\u001b[0m start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m---> 95\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m end_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlogs_regex:\n", + "File \u001b[0;32m~/work/openscapes/h5cloud/h5tests/h5coro_arr_mean.py:30\u001b[0m, in \u001b[0;36mH5CoroArrMean.run\u001b[0;34m(self, dataset, variable)\u001b[0m\n\u001b[1;32m 27\u001b[0m credentials \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mregion_name\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mus-west-2\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 28\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124manon\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28;01mTrue\u001b[39;00m}\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfiles:\n\u001b[0;32m---> 30\u001b[0m h5obj \u001b[38;5;241m=\u001b[39m \u001b[43mh5coro\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mH5Coro\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreplace\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ms3://\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43ms3driver\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mS3Driver\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcredentials\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcredentials\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 31\u001b[0m output \u001b[38;5;241m=\u001b[39m h5obj\u001b[38;5;241m.\u001b[39mreadDatasets(datasets\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mgroup\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvariable\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m], block\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 32\u001b[0m data \u001b[38;5;241m=\u001b[39m h5obj[\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mgroup\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvariable\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mvalues\n", + "File \u001b[0;32m~/.pyenv/versions/mambaforge/envs/h5cloud/lib/python3.12/site-packages/h5coro/h5coro.py:2020\u001b[0m, in \u001b[0;36mH5Coro.__init__\u001b[0;34m(self, resource, driver_class, credentials, datasets, block)\u001b[0m\n\u001b[1;32m 2018\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, resource, driver_class, credentials\u001b[38;5;241m=\u001b[39m{}, datasets\u001b[38;5;241m=\u001b[39m[], block\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[1;32m 2019\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresource \u001b[38;5;241m=\u001b[39m resource\n\u001b[0;32m-> 2020\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdriver \u001b[38;5;241m=\u001b[39m \u001b[43mdriver_class\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcredentials\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2022\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcache \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 2023\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetaDataTable \u001b[38;5;241m=\u001b[39m {}\n", + "File \u001b[0;32m~/.pyenv/versions/mambaforge/envs/h5cloud/lib/python3.12/site-packages/h5coro/s3driver.py:43\u001b[0m, in \u001b[0;36mS3Driver.__init__\u001b[0;34m(self, resource, credentials)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession \u001b[38;5;241m=\u001b[39m boto3\u001b[38;5;241m.\u001b[39mSession()\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 43\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m FatalError(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minvalid credential keys provided, looking for: aws_access_key_id, aws_secret_access_key, and aws_session_token\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# open resource\u001b[39;00m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39mresource(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124ms3\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;241m.\u001b[39mObject(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresourcePath[\u001b[38;5;241m0\u001b[39m], \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresourcePath[\u001b[38;5;241m1\u001b[39m:]))\n", + "\u001b[0;31mFatalError\u001b[0m: invalid credential keys provided, looking for: aws_access_key_id, aws_secret_access_key, and aws_session_token" + ] + } + ], + "source": [ + "# takes about ~30 seconds per granule out of region (6+ GB granules)\n", + "results = h5coro_original.run()\n", + "benchmarks.append({\"library\": \"h5coro\",\n", + " \"format\": \"original\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64c4584f-c527-44bb-8c05-68a96820d1ff", + "metadata": {}, + "outputs": [], + "source": [ + "cloud_optimized_granules = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", + "]\n", + "h5py_cloud = H5pyArrMean('atl03-bigsize-repacked', files=cloud_optimized_granules, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d30d92b-4192-4da1-8b60-41cc94ca2db1", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(benchmarks)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "3ff4c22f-7f77-4c69-a84c-f13b0fbba1f2", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "\n", + "for name, group in df.groupby(['library', 'format']):\n", + " library, format = name\n", + " x = f'{library}, {format}'\n", + " y = group['time'].mean()\n", + " ax.bar(f'{library}, {format}', group['time'].mean(), label=f'{library}, {format}', align='center')\n", + " ax.text(x, y + 0.05, f'{group[\"time\"].mean():.2f}', ha='center', va='bottom', color='black', fontsize=12)\n", + " ax.text(x, y - (y/2) - 10, f'Total Requests: {group[\"total_requests\"].mean()}', ha='center', va='bottom', color='black', fontsize=8)\n", + " ax.text(x, y - (y/2.5), f'Total Req Bytes (MB): {round(group[\"total_requested_bytes\"].mean() / (1024*1024) , 2)}', ha='center', va='bottom', color='black', fontsize=8)\n", + "\n", + "# Set labels and title\n", + "ax.set_xlabel('Access Pattern')\n", + "ax.set_ylabel('Time in Seconds')\n", + "ax.set_title(f'mean() on photon data for runs on ATL03, less is better ')\n", + "\n", + "# Rotate x-axis labels for better readability\n", + "plt.xticks(rotation=45, ha='right')\n", + "\n", + "# # Show legend\n", + "# ax.legend()\n", + "\n", + "# Show the plot\n", + "with plt.xkcd():\n", + " # This figure will be in XKCD-style\n", + " fig1 = plt.figure()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/portable-h5py-test.ipynb b/notebooks/portable-h5py-test.ipynb new file mode 100644 index 0000000..05c60b8 --- /dev/null +++ 
b/notebooks/portable-h5py-test.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload \n", + "\n", + "import sys\n", + "import os\n", + "classes_path = os.path.abspath('../h5tests/')\n", + "sys.path.append(classes_path)\n", + "from h5py_arr_mean import H5pyArrMean\n", + "import pandas as pd\n", + "\n", + "benchmarks = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "original_granules = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/average/original/ATL03_20191225111315_13680501_006_01.h5\",\n", + " # \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", + "]\n", + "h5py_original = H5pyArrMean('atl03-bigsize-original', files=original_granules, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# takes about ~30 seconds per granule out of region (6+ GB granules)\n", + "io_params ={\n", + " \"fsspec_params\": {},\n", + " \"h5py_params\" : {}\n", + "}\n", + "\n", + "results = h5py_original.run(io_params)\n", + "\n", + "benchmarks.append({\"library\": \"h5py\",\n", + " \"format\": \"original\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64c4584f-c527-44bb-8c05-68a96820d1ff", + "metadata": {}, + "outputs": [], + "source": [ + 
"cloud_optimized_granules = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/average/repacked/ATL03_20191225111315_13680501_006_01.h5\",\n", + " # \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", + "]\n", + "h5py_cloud = H5pyArrMean('atl03-bigsize-repacked', files=cloud_optimized_granules, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfd4e404-0412-4d2f-8eba-ca39a670e369", + "metadata": {}, + "outputs": [], + "source": [ + "# takes about ~30 seconds per granule out of region\n", + "io_params ={\n", + " \"fsspec_params\": {\n", + " # \"skip_instance_cache\": True\n", + " \"cache_type\": \"first\",\n", + " \"block_size\": 8*1024*1024\n", + " },\n", + " \"h5py_params\" : {\n", + " \"page_buf_size\": 32*1024*1024,\n", + " \"rdcc_nbytes\": 1024*1024\n", + " }\n", + "}\n", + "\n", + "results = h5py_cloud.run(io_params)\n", + "\n", + "benchmarks.append({\"library\": \"h5py\",\n", + " \"format\": \"cloud\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3059ebd8-b110-49c5-9250-2a2cd009338f", + "metadata": {}, + "outputs": [], + "source": [ + "for run in range(5):\n", + " results = h5py_cloud.run(io_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d30d92b-4192-4da1-8b60-41cc94ca2db1", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(benchmarks)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ff4c22f-7f77-4c69-a84c-f13b0fbba1f2", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 
6))\n", + "\n", + "for name, group in df.groupby(['library', 'format']):\n", + " library, format = name\n", + " x = f'{library}, {format}'\n", + " y = group['time'].mean()\n", + " ax.bar(f'{library}, {format}', group['time'].mean(), label=f'{library}, {format}', align='center')\n", + " ax.text(x, y + 0.05, f'{group[\"time\"].mean():.2f}', ha='center', va='bottom', color='black', fontsize=12)\n", + " ax.text(x, y - (y/2) - 10, f'Total Requests: {group[\"total_requests\"].mean()}', ha='center', va='bottom', color='black', fontsize=8)\n", + " ax.text(x, y - (y/2.5), f'Total Req Bytes (MB): {round(group[\"total_requested_bytes\"].mean() / (1024*1024) , 2)}', ha='center', va='bottom', color='black', fontsize=8)\n", + "\n", + "# Set labels and title\n", + "ax.set_xlabel('Access Pattern')\n", + "ax.set_ylabel('Time in Seconds')\n", + "ax.set_title(f'mean() on photon data for runs on ATL03, less is better ')\n", + "\n", + "# Rotate x-axis labels for better readability\n", + "plt.xticks(rotation=45, ha='right')\n", + "\n", + "# # Show legend\n", + "# ax.legend()\n", + "\n", + "# Show the plot\n", + "with plt.xkcd():\n", + " # This figure will be in XKCD-style\n", + " fig1 = plt.figure()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/portable-xarray-test.ipynb b/notebooks/portable-xarray-test.ipynb new file mode 100644 index 0000000..4e67969 --- /dev/null +++ b/notebooks/portable-xarray-test.ipynb @@ -0,0 +1,305 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "48daa283-8e1e-46e3-b4ce-1a0271b86d37", + "metadata": { + "tags": [] + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n", + "xarray v2024.1.1\n", + "h5py v3.10.0\n", + "s3fs v2024.2.0\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload \n", + "\n", + "import sys\n", + "import os\n", + "classes_path = os.path.abspath('../h5tests/')\n", + "sys.path.append(classes_path)\n", + "from xarray_arr_mean import XarrayArrMean\n", + "import pandas as pd\n", + "\n", + "import xarray as xr\n", + "import h5py\n", + "import s3fs\n", + "\n", + "benchmarks = []\n", + "\n", + "for library in (xr, h5py, s3fs):\n", + " print(f'{library.__name__} v{library.__version__}')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d6ce77fd-f9cd-48b1-94cd-1fe57f52e11f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "files = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5\",\n", + "]\n", + "xarray_original = XarrayArrMean('atl03-bigsize-original', files=files, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "60eeeb1b-9531-4fec-a847-3ca5304c4685", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'library': 'xarray',\n", + " 'format': 'cloud',\n", + " 'mean': 1032.984130859375,\n", + " 'time': 176.90762186050415,\n", + " 'total_requested_bytes': 720001152,\n", + " 'total_requests': 100,\n", + " 'avg_req_size': 7200011},\n", + " {'library': 'xarray',\n", + " 'format': 'original',\n", + " 'mean': 1032.984130859375,\n", + " 'time': 1456.8166418075562,\n", + " 'total_requested_bytes': 438520591,\n", + " 'total_requests': 26988,\n", + " 'avg_req_size': 16248}]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "# don't even try this out of region...\n", + "# takes about ~10 minutes per granule out of region (6+ GB granules)\n", + "io_params ={\n", + " \"fsspec_params\": {},\n", + " \"h5py_params\" : {}\n", + "}\n", + "results = xarray_original.run(io_params)\n", + "benchmarks.append({\"library\": \"xarray\",\n", + " \"format\": \"original\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "64c4584f-c527-44bb-8c05-68a96820d1ff", + "metadata": {}, + "outputs": [], + "source": [ + "files = [\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5\",\n", + " \"s3://its-live-data/cloud-experiments/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5\",\n", + "]\n", + "xarray_cloud = XarrayArrMean('atl03-bigsize-repacked', files=files, store_results=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "dfd4e404-0412-4d2f-8eba-ca39a670e369", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'library': 'xarray',\n", + " 'format': 'cloud',\n", + " 'mean': 1032.984130859375,\n", + " 'time': 176.90762186050415,\n", + " 'total_requested_bytes': 720001152,\n", + " 'total_requests': 100,\n", + " 'avg_req_size': 7200011}]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# takes about ~90 seconds per granule out of region\n", + "io_params ={\n", + " \"fsspec_params\": {\n", + " # \"skip_instance_cache\": True\n", + " \"cache_type\": \"blockcache\",\n", + " \"block_size\": 8*1024*1024\n", + " },\n", + " \"h5py_params\" : {\n", + " \"driver_kwds\": {\n", + " 
\"page_buf_size\": 32*1024*1024,\n", + " \"rdcc_nbytes\": 8*1024*1024\n", + " }\n", + "\n", + " }\n", + "}\n", + "\n", + "results = xarray_cloud.run(io_params)\n", + "\n", + "benchmarks.append({\"library\": \"xarray\",\n", + " \"format\": \"cloud\",\n", + " \"mean\": results[0],\n", + " \"time\": results[1],\n", + " \"total_requested_bytes\": results[3][\"total_reqs_bytes\"],\n", + " \"total_requests\": results[3][\"total_reqs\"],\n", + " \"avg_req_size\": results[3][\"avg_req_size\"]})\n", + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9d30d92b-4192-4da1-8b60-41cc94ca2db1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame.from_dict(benchmarks)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "3ff4c22f-7f77-4c69-a84c-f13b0fbba1f2", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Build the whole chart inside the xkcd context: the xkcd rcParams only\n", + "# apply to figures/artists created while the context manager is active.\n", + "# (Previously the chart was drawn first and an empty xkcd figure was created afterwards.)\n", + "with plt.xkcd():\n", + "    fig, ax = plt.subplots(figsize=(10, 6))\n", + "\n", + "    # One bar per (library, format) pair, using the mean over all recorded runs\n", + "    for (library, fmt), group in df.groupby(['library', 'format']):\n", + "        label = f'{library}, {fmt}'\n", + "        mean_time = group['time'].mean()\n", + "        mean_requests = group['total_requests'].mean()\n", + "        mean_req_mb = group['total_requested_bytes'].mean() / (1024 * 1024)\n", + "\n", + "        ax.bar(label, mean_time, label=label, align='center')\n", + "        # Mean time just above the bar top, request statistics further down\n", + "        ax.text(label, mean_time + 0.05, f'{mean_time:.2f}', ha='center', va='bottom', color='black', fontsize=12)\n", + "        ax.text(label, mean_time - (mean_time / 2) - 10, f'Total Requests: {mean_requests}', ha='center', va='bottom', color='black', fontsize=8)\n", + "        ax.text(label, mean_time - (mean_time / 2.5), f'Total Req Bytes (MB): {round(mean_req_mb, 2)}', ha='center', va='bottom', color='black', fontsize=8)\n", + "\n", + "    # Set labels and title\n", + "    ax.set_xlabel('Access Pattern')\n", + "    ax.set_ylabel('Time in Seconds')\n", + "    ax.set_title('mean() on photon data for runs on ATL03, less is better ')\n", + "\n", + "    # Rotate x-axis labels for better readability\n", + "    plt.xticks(rotation=45, ha='right')\n", + "\n", + "    # # Show legend\n", + "    # ax.legend()\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}