Merge pull request #8 from NeurodataWithoutBorders/jfm
Initial package
magland authored Mar 15, 2024
2 parents 487b098 + 721f489 commit 4472e7d
Showing 29 changed files with 2,453 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .flake8
@@ -0,0 +1,2 @@
[flake8]
ignore = E501
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,3 +1,7 @@
+*.zarr.json
+
+.coverage
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
… (unchanged lines below not shown)
17 changes: 17 additions & 0 deletions .vscode/tasks.json
@@ -0,0 +1,17 @@
{
// See https://go.microsoft.com/fwlink/?LinkId=733558
// for the documentation about the tasks.json format
// The "bash -ic" is needed here so that our ~/.bashrc gets sourced. See: https://github.com/microsoft/vscode/issues/29412
"version": "2.0.0",
"tasks": [
{
"label": "Test",
"type": "shell",
"command": "bash -ic .vscode/tasks/test.sh",
"presentation": {
"clear": true
},
"detail": "Run tests"
}
]
}
7 changes: 7 additions & 0 deletions .vscode/tasks/test.sh
@@ -0,0 +1,7 @@
#!/bin/bash
set -ex  # echo each command and exit on first error

# black --check .
cd scratch/dev1
pyright
pytest --cov=lindi --cov-report=xml --cov-report=term tests/
3 changes: 2 additions & 1 deletion README.md
@@ -1,2 +1,3 @@
 # lindi
-Linked Neurodata Interface (LINDI) - cloud-friendly access to NWB data
+
+Linked Data Interface (LINDI) - cloud-friendly access to NWB data
40 changes: 40 additions & 0 deletions devel/demonstrate_slow_get_chunk_info.py
@@ -0,0 +1,40 @@
import numpy as np
import h5py
import remfile


# https://neurosift.app/?p=/nwb&dandisetId=000776&dandisetVersion=draft&url=https://api.dandiarchive.org/api/assets/54895119-f739-4544-973e-a9341a5c66ad/download/
h5_url = "https://api.dandiarchive.org/api/assets/54895119-f739-4544-973e-a9341a5c66ad/download/"


def demonstrate_slow_get_chunk_info():
    # Open the remote file using remfile. We use the verbose option to see
    # the download activity. Don't be confused when remfile reports
    # "loading 2 chunks" - those are remfile's download chunks, not HDF5
    # dataset chunks.
remf = remfile.File(h5_url, verbose=True)

h5f = h5py.File(remf, "r")
dset = h5f["/acquisition/CalciumImageSeries/data"]
assert isinstance(dset, h5py.Dataset)
shape = dset.shape
chunk_shape = dset.chunks
assert chunk_shape is not None
print(f"shape: {shape}") # (128000, 212, 322, 2)
print(f"chunk_shape: {chunk_shape}") # (3, 53, 81, 1)
chunk_coord_shape = [
(shape[i] + chunk_shape[i] - 1) // chunk_shape[i] for i in range(len(shape))
]
print(f"chunk_coord_shape: {chunk_coord_shape}") # [42667, 4, 4, 2]
num_chunks = np.prod(chunk_coord_shape)
print(f"Number of chunks: {num_chunks}") # 1365344 - around 1.3 million

dsid = dset.id
    print(
        "Getting chunk info for chunk 0 (this takes a very long time, "
        "presumably because it iterates through all the chunks)"
    )
info = dsid.get_chunk_info(0)
print(info)


if __name__ == "__main__":
demonstrate_slow_get_chunk_info()
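
If the goal is to enumerate chunk locations, a faster route may be h5py's chunk iterator, which visits every stored chunk in a single pass instead of doing a (potentially linear-time) lookup per index. A minimal sketch, assuming h5py >= 3.8 built against HDF5 >= 1.12.3 (chunk_iter is unavailable otherwise); the helper name is ours, and it reuses the same dataset as the script above:

import h5py
import remfile

h5_url = "https://api.dandiarchive.org/api/assets/54895119-f739-4544-973e-a9341a5c66ad/download/"


def enumerate_chunks_quickly():
    remf = remfile.File(h5_url, verbose=True)
    h5f = h5py.File(remf, "r")
    dset = h5f["/acquisition/CalciumImageSeries/data"]
    assert isinstance(dset, h5py.Dataset)
    infos = []
    # chunk_iter calls the callback once per stored chunk with a StoreInfo
    # carrying chunk_offset, filter_mask, byte_offset, and size.
    dset.id.chunk_iter(infos.append)
    print(f"Collected info for {len(infos)} chunks")
    print(infos[0])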
117 changes: 117 additions & 0 deletions devel/old_tests/old_tests.py
@@ -0,0 +1,117 @@
import json
import tempfile
import numpy as np
import h5py
import zarr
import kerchunk.hdf # type: ignore
from lindi import LindiH5Store
from fsspec.implementations.reference import ReferenceFileSystem


def test_scalar_dataset():
for val in ["abc", b"abc", 1, 3.6]:
print(f"Testing scalar {val} of type {type(val)}")
with tempfile.TemporaryDirectory() as tmpdir:
filename = f"{tmpdir}/test.h5"
with h5py.File(filename, "w") as f:
f.create_dataset("X", data=val)
zarr_kerchunk, store_kerchunk = _get_kerchunk_zarr(filename)
val_kerchunk = zarr_kerchunk["X"][0]
zarr_lindi, store_lindi = _get_lindi_zarr(filename)
try:
val_lindi = zarr_lindi["X"][0]
if val_kerchunk != val:
print(f"WARNING: val_kerchunk={val_kerchunk} != val={val}")
if val_lindi != val:
print(f"WARNING: val_lindi={val_lindi} != val={val}")
if type(val_kerchunk) is not type(val):
print(
"WARNING: type mismatch for kerchunk:",
type(val),
type(val_kerchunk),
)
if type(val_lindi) is not type(val):
print("WARNING: type mismatch for lindi:", type(val), type(val_lindi))
print("")
x = store_lindi.to_reference_file_system() # noqa: F841
finally:
store_lindi.close()


def test_numpy_array():
print("Testing numpy array")
X1 = (np.arange(60).reshape(3, 20), (3, 7))
X2 = (np.arange(60).reshape(3, 20), None)
for array, chunks in [X1, X2]:
with tempfile.TemporaryDirectory() as tmpdir:
filename = f"{tmpdir}/test.h5"
with h5py.File(filename, "w") as f:
f.create_dataset("X", data=array, chunks=chunks)
zarr_kerchunk, store_kerchunk = _get_kerchunk_zarr(filename)
array_kerchunk = zarr_kerchunk["X"][:]
assert isinstance(array_kerchunk, np.ndarray)
zarr_lindi, store_lindi = _get_lindi_zarr(filename)
array_lindi = zarr_lindi["X"][:]
assert isinstance(array_lindi, np.ndarray)
if not np.array_equal(array_kerchunk, array):
print("WARNING: array_kerchunk does not match array")
print(array_kerchunk)
print(array)
if not np.array_equal(array_lindi, array):
print("WARNING: array_lindi does not match array")
print(array_lindi)
print(array)
x = store_lindi.to_reference_file_system() # noqa: F841


def test_numpy_array_of_strings():
print("Testing numpy array of strings")
with tempfile.TemporaryDirectory() as tmpdir:
filename = f"{tmpdir}/test.h5"
with h5py.File(filename, "w") as f:
f.create_dataset("X", data=["abc", "def", "ghi"])
zarr_kerchunk, store_kerchunk = _get_kerchunk_zarr(filename)
array_kerchunk = zarr_kerchunk["X"][:]
assert isinstance(array_kerchunk, np.ndarray)
zarr_lindi, store_lindi = _get_lindi_zarr(filename)
array_lindi = zarr_lindi["X"][:]
assert isinstance(array_lindi, np.ndarray)
if not np.array_equal(array_kerchunk, ["abc", "def", "ghi"]):
print("WARNING: array_kerchunk does not match array")
print(array_kerchunk)
print(["abc", "def", "ghi"])
if not np.array_equal(array_lindi, ["abc", "def", "ghi"]):
print("WARNING: array_lindi does not match array")
print(array_lindi)
print(["abc", "def", "ghi"])
x = store_lindi.to_reference_file_system() # noqa: F841


def _get_lindi_zarr(filename):
store = LindiH5Store.from_file(filename, url='.') # use url='.' so that a reference file system can be created
root = zarr.open(store)
return root, store


def _get_kerchunk_zarr(filename):
with h5py.File(filename, "r") as f:
h5chunks = kerchunk.hdf.SingleHdf5ToZarr(
f,
url=filename,
hdmf_mode=True,
num_chunks_per_dataset_threshold=1000,
max_num_items=1000,
)
a = h5chunks.translate()
with open("test_example.zarr.json", "w") as store:
json.dump(a, store, indent=2)
fs = ReferenceFileSystem(a)
store0 = fs.get_mapper(root="/", check=False)
root = zarr.open(store0)
return root, store0


if __name__ == "__main__":
test_scalar_dataset()
test_numpy_array()
test_numpy_array_of_strings()
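
For reference, the lindi store's to_reference_file_system() output can be persisted and reopened the same way _get_kerchunk_zarr handles the kerchunk dict. A minimal sketch, assuming the returned object is a kerchunk-style reference dict (which is what its use above suggests); the helper name is hypothetical:

import json
import zarr
from fsspec.implementations.reference import ReferenceFileSystem
from lindi import LindiH5Store


def _roundtrip_lindi_rfs(filename):
    # Build the store with url='.' so a reference file system can be created
    store = LindiH5Store.from_file(filename, url=".")
    try:
        rfs = store.to_reference_file_system()  # assumed: kerchunk-style dict
    finally:
        store.close()
    # Persist, then reopen through fsspec, mirroring _get_kerchunk_zarr above
    with open("test_roundtrip.zarr.json", "w") as f:
        json.dump(rfs, f, indent=2)
    fs = ReferenceFileSystem(rfs)
    root = zarr.open(fs.get_mapper(root="/", check=False))
    return root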
34 changes: 34 additions & 0 deletions devel/old_tests/test_lindi_client.py
@@ -0,0 +1,34 @@
from lindi import LindiClient, LindiGroup, LindiDataset


def test_lindi_client():
client = LindiClient.from_file("example_0.zarr.json")

for k, v in client.attrs.items():
print(f"{k}: {v}")

for k in client.keys():
print(k)

acquisition = client["acquisition"]
assert isinstance(acquisition, LindiGroup)
for k in acquisition.keys():
print(k)

x = client["acquisition/ElectricalSeriesAp"]["data"]
assert isinstance(x, LindiDataset)

print(x.shape)
print(x[:5])

general = client["general"]
assert isinstance(general, LindiGroup)
for k in general.keys():
a = general[k]
if isinstance(a, LindiDataset):
print(f"{k}: {a.shape}")
print(a[()])


if __name__ == "__main__":
test_lindi_client()
… (the remaining 21 changed files are not shown)
