ICESAT-2HackWeek · betolink · Jan 29, 2024 · Jan 30, 2024 · Feb 6, 2024 · Feb 6, 2024
diff --git a/.gitignore b/.gitignore
@@ -109,3 +109,7 @@ venv.bak/
 *.hdf5
 *.nc
 *.tif
+
+*.log
+notebooks/logs
+notebooks/results
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,23 @@
+name: h5cloud
+channels:
+  - conda-forge
+dependencies:
+  - jupyterlab
+  - boto3
+  - tqdm
+  - matplotlib-base
+  - pandas
+  - numpy
+  - s3fs
+  - xarray
+  - dask
+  - distributed
+  - geopandas
+  - h5py>=3.10
+  - zarr
+  - kerchunk
+  - h5netcdf
+  - pip
+  - pip:
+    - git+https://github.com/betolink/filesystem_spec.git
+    - git+https://github.com/ICESat2-SlideRule/h5coro.git
diff --git a/h5tests/h5coro_arr_mean.py b/h5tests/h5coro_arr_mean.py
@@ -1,27 +1,40 @@
-from .h5test import H5Test, timer_decorator
-import numpy as np
 import subprocess
 
+import numpy as np
+from h5test import H5Test, timer_decorator
+
 try:
     import h5coro
 except:
-    completed_process = subprocess.run([
-        'mamba', 'install', '-c', 'conda-forge', 'h5coro', '--yes'
-    ])
+    completed_process = subprocess.run(
+        ["pip", "install", "git+https://github.com/ICESat2-SlideRule/h5coro.git@main"]
+    )
     import h5coro
 
-from h5coro import h5coro, s3driver, filedriver
-h5coro.config(errorChecking=True, verbose=False, enableAttributes=False)
-
+from h5coro import h5coro, s3driver
+
+driver = s3driver.S3Driver
+
+
 class H5CoroArrMean(H5Test):
     @timer_decorator
-    def run(self):
-        group = '/gt1l/heights'
-        variable = 'h_ph'        
+    def run(self, dataset="/gt1l/heights", variable="h_ph"):
+        """Return the mean of `dataset/variable` over self.files, read via h5coro."""
+        group = dataset
         final_h5coro_array = []
+
         for file in self.files:
-            h5obj = h5coro.H5Coro(file.replace("s3://", ""), s3driver.S3Driver)
-            output = h5obj.readDatasets(datasets=[f'{group}/{variable}'], block=True)
-            data = h5obj[f'{group}/{variable}'].values
-            final_h5coro_array = np.insert(final_h5coro_array, len(final_h5coro_array), data, axis=None)
+            # Strip the scheme once; both branches hand H5Coro the bare bucket/key path.
+            path = file.replace("s3://", "")
+            if file.startswith("s3://nasa-cryo-persistent/"):
+                h5obj = h5coro.H5Coro(path, s3driver.S3Driver)
+            else:
+                h5obj = h5coro.H5Coro(
+                    path, s3driver.S3Driver, credentials={"annon": True}
+                )
+            ds = h5obj.readDatasets(datasets=[f"{group}/{variable}"], block=True)
+            data = ds[f"{group}/{variable}"][:]
+            final_h5coro_array = np.insert(
+                final_h5coro_array, len(final_h5coro_array), data, axis=None
+            )
         return np.mean(final_h5coro_array)
diff --git a/h5tests/h5py_arr_mean.py b/h5tests/h5py_arr_mean.py
@@ -1,21 +1,26 @@
-from .h5test import H5Test, timer_decorator
 import h5py
 import numpy as np
+from h5test import H5Test, fsspec_logging_decorator, timer_decorator
+
 
 class H5pyArrMean(H5Test):
     @timer_decorator
-    def run(self):
-        final_h5py_array = []  
-        # TODO: Do we need to make this configurable or consistent?
-        group = '/gt1l/heights'
-        variable = 'h_ph'        
+    @fsspec_logging_decorator
+    def run(self, io_params={}, dataset="/gt1l/heights", variable="h_ph"):
+        """Return the mean of `dataset/variable` over self.files, read via h5py."""
+        # Optional kwargs for s3_fs.open() and h5py.File(); empty when not supplied.
+        fsspec_params = io_params.get("fsspec_params", {})
+        h5py_params = io_params.get("h5py_params", {})
+        # Record each object's size as reported by the filesystem.
+        self.file_sizes = [self.s3_fs.info(path)["size"] for path in self.files]
+        target = f"{dataset}/{variable}"
+        final_h5py_array = []
         for file in self.files:
-            with h5py.File(self.s3_fs.open(file, 'rb')) as f:
-                data = f[f'{group}/{variable}'][:]
-                # Need to test if using concatenate is faster
-                final_h5py_array = np.insert(
-                    final_h5py_array,
-                    len(final_h5py_array),
-                    data, axis=None
-                )
+            with self.s3_fs.open(file, mode="rb", **fsspec_params) as remote:
+                print("h5py params: ", h5py_params)
+                with h5py.File(remote, **h5py_params) as h5:
+                    values = h5[target][:]
+                    final_h5py_array = np.insert(
+                        final_h5py_array, len(final_h5py_array), values, axis=None
+                    )
         return np.mean(final_h5py_array)
diff --git a/h5tests/h5py_arr_subset_mean.py b/h5tests/h5py_arr_subset_mean.py
@@ -1,49 +1,51 @@
 import os
 import sys
 
-from .h5test import H5Test, timer_decorator
 import h5py
 import numpy as np
+from h5test import H5Test, fsspec_logging_decorator, timer_decorator
 
-current = os.path.abspath('..')
+current = os.path.abspath("..")
 sys.path.append(current)
-from helpers.geospatial import get_subset_region, get_subset_indices
+from helpers.geospatial import get_subset_indices, get_subset_region
+
 
 class H5pyArrSubsetMean(H5Test):
-
     def __init__(self, data_format, geometry=None):
         """
         geometry : path to geojson file containing geometry
                    **Could be list containing [lonmin, lonmax, latmin, latmax]**
         """
         super().__init__(data_format)
         self.bounds = get_subset_region(geometry)
-        
+
     @timer_decorator
-    def run(self):
-        final_h5py_array = []  
+    @fsspec_logging_decorator
+    def run(self, io_params={}, dataset="/gt1l/heights", variable="h_ph"):
+        final_h5py_array = []
         # TODO: Do we need to make this configurable or consistent?
-        group = '/gt1l/heights'
-        variable = 'h_ph'        
+        # Fall back to empty kwargs so run() works when io_params lacks either key.
+        fsspec_params = io_params.get("fsspec_params", {})
+        h5py_params = io_params.get("h5py_params", {})
         for file in self.files:
-            with h5py.File(self.s3_fs.open(file, 'rb')) as f:
-
-                lat = f[f'{group}/lat_ph'][:]
-                lon = f[f'{group}/lon_ph'][:]
-
+            # One `with` for both handles so the S3 file object is closed as well.
+            with self.s3_fs.open(file, "rb", **fsspec_params) as fo, h5py.File(
+                fo, **h5py_params
+            ) as f:
+                lat = f[f"{dataset}/lat_ph"][:]
+                lon = f[f"{dataset}/lon_ph"][:]
+
                 idx_start, idx_end = get_subset_indices(lat, lon, self.bounds)
-                
+
                 # Leaving this code here so that we can create a DataFrame or
-                # Dataset at a later date.  Suggest creating dict which can be 
+                # Dataset at a later date.  Suggest creating dict which can be
                 # passsed to xarray or (geo)pandas
                 # lat[idx_start:idx_end])
                 # lon[idx_start:idx_end])
 
-                data = f[f'{group}/{variable}'][idx_start:idx_end]
+                data = f[f"{dataset}/{variable}"][idx_start:idx_end]
                 # Need to test if using concatenate is faster
                 final_h5py_array = np.insert(
-                    final_h5py_array,
-                    len(final_h5py_array),
-                    data, axis=None
+                    final_h5py_array, len(final_h5py_array), data, axis=None
                 )
-        return np.mean(final_h5py_array)    
+        return np.mean(final_h5py_array)