Merge pull request #8 from NeurodataWithoutBorders/jfm
Initial package
magland authored Mar 15, 2024
2 parents 487b098 + 721f489 commit 4472e7d
Showing 29 changed files with 2,453 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .flake8
@@ -0,0 +1,2 @@
[flake8]
ignore = E501
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,3 +1,7 @@
+*.zarr.json
+
+.coverage
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
… (unchanged lines below not shown)
17 changes: 17 additions & 0 deletions .vscode/tasks.json
@@ -0,0 +1,17 @@
{
// See https://go.microsoft.com/fwlink/?LinkId=733558
// for the documentation about the tasks.json format
// The "bash -ic" is needed here so that our ~/.bashrc gets sourced. See: https://github.com/microsoft/vscode/issues/29412
"version": "2.0.0",
"tasks": [
{
"label": "Test",
"type": "shell",
"command": "bash -ic .vscode/tasks/test.sh",
"presentation": {
"clear": true
},
"detail": "Run tests"
}
]
}
7 changes: 7 additions & 0 deletions .vscode/tasks/test.sh
@@ -0,0 +1,7 @@
#!/bin/bash
set -ex  # echo each command and exit on first error

# black --check .
cd scratch/dev1
pyright
pytest --cov=lindi --cov-report=xml --cov-report=term tests/
3 changes: 2 additions & 1 deletion README.md
@@ -1,2 +1,3 @@
 # lindi
-Linked Neurodata Interface (LINDI) - cloud-friendly access to NWB data
+
+Linked Data Interface (LINDI) - cloud-friendly access to NWB data
40 changes: 40 additions & 0 deletions devel/demonstrate_slow_get_chunk_info.py
@@ -0,0 +1,40 @@
import numpy as np
import h5py
import remfile


# https://neurosift.app/?p=/nwb&dandisetId=000776&dandisetVersion=draft&url=https://api.dandiarchive.org/api/assets/54895119-f739-4544-973e-a9341a5c66ad/download/
h5_url = "https://api.dandiarchive.org/api/assets/54895119-f739-4544-973e-a9341a5c66ad/download/"


def demonstrate_slow_get_chunk_info():
    # Open the remote file using remfile. We use the verbose option to see
    # the download activity. Don't be confused when remfile reports
    # "loading 2 chunks" - those are remfile's download chunks, not HDF5
    # dataset chunks.
remf = remfile.File(h5_url, verbose=True)

h5f = h5py.File(remf, "r")
dset = h5f["/acquisition/CalciumImageSeries/data"]
assert isinstance(dset, h5py.Dataset)
shape = dset.shape
chunk_shape = dset.chunks
assert chunk_shape is not None
print(f"shape: {shape}") # (128000, 212, 322, 2)
print(f"chunk_shape: {chunk_shape}") # (3, 53, 81, 1)
chunk_coord_shape = [
(shape[i] + chunk_shape[i] - 1) // chunk_shape[i] for i in range(len(shape))
]
print(f"chunk_coord_shape: {chunk_coord_shape}") # [42667, 4, 4, 2]
num_chunks = np.prod(chunk_coord_shape)
print(f"Number of chunks: {num_chunks}") # 1365344 - around 1.3 million

dsid = dset.id
    print(
        "Getting chunk info for chunk 0 (this takes a very long time, "
        "presumably because it iterates through all the chunks)"
    )
info = dsid.get_chunk_info(0)
print(info)


if __name__ == "__main__":
demonstrate_slow_get_chunk_info()
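
If the goal is to enumerate chunk locations, a faster route may be h5py's chunk iterator, which visits every stored chunk in a single pass instead of doing a (potentially linear-time) lookup per index. A minimal sketch, assuming h5py >= 3.8 built against HDF5 >= 1.12.3 (chunk_iter is unavailable otherwise); the helper name is ours, and it reuses the same dataset as the script above:

import h5py
import remfile

h5_url = "https://api.dandiarchive.org/api/assets/54895119-f739-4544-973e-a9341a5c66ad/download/"


def enumerate_chunks_quickly():
    remf = remfile.File(h5_url, verbose=True)
    h5f = h5py.File(remf, "r")
    dset = h5f["/acquisition/CalciumImageSeries/data"]
    assert isinstance(dset, h5py.Dataset)
    infos = []
    # chunk_iter calls the callback once per stored chunk with a StoreInfo
    # carrying chunk_offset, filter_mask, byte_offset, and size.
    dset.id.chunk_iter(infos.append)
    print(f"Collected info for {len(infos)} chunks")
    print(infos[0])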
117 changes: 117 additions & 0 deletions devel/old_tests/old_tests.py
@@ -0,0 +1,117 @@
import json
import tempfile
import numpy as np
import h5py
import zarr
import kerchunk.hdf # type: ignore
from lindi import LindiH5Store
from fsspec.implementations.reference import ReferenceFileSystem


def test_scalar_dataset():
for val in ["abc", b"abc", 1, 3.6]:
print(f"Testing scalar {val} of type {type(val)}")
with tempfile.TemporaryDirectory() as tmpdir:
filename = f"{tmpdir}/test.h5"
with h5py.File(filename, "w") as f:
f.create_dataset("X", data=val)
zarr_kerchunk, store_kerchunk = _get_kerchunk_zarr(filename)
val_kerchunk = zarr_kerchunk["X"][0]
zarr_lindi, store_lindi = _get_lindi_zarr(filename)
try:
val_lindi = zarr_lindi["X"][0]
if val_kerchunk != val:
print(f"WARNING: val_kerchunk={val_kerchunk} != val={val}")
if val_lindi != val:
print(f"WARNING: val_lindi={val_lindi} != val={val}")
if type(val_kerchunk) is not type(val):
print(
"WARNING: type mismatch for kerchunk:",
type(val),
type(val_kerchunk),
)
if type(val_lindi) is not type(val):
print("WARNING: type mismatch for lindi:", type(val), type(val_lindi))
print("")
x = store_lindi.to_reference_file_system() # noqa: F841
finally:
store_lindi.close()


def test_numpy_array():
print("Testing numpy array")
X1 = (np.arange(60).reshape(3, 20), (3, 7))
X2 = (np.arange(60).reshape(3, 20), None)
for array, chunks in [X1, X2]:
with tempfile.TemporaryDirectory() as tmpdir:
filename = f"{tmpdir}/test.h5"
with h5py.File(filename, "w") as f:
f.create_dataset("X", data=array, chunks=chunks)
zarr_kerchunk, store_kerchunk = _get_kerchunk_zarr(filename)
array_kerchunk = zarr_kerchunk["X"][:]
assert isinstance(array_kerchunk, np.ndarray)
zarr_lindi, store_lindi = _get_lindi_zarr(filename)
array_lindi = zarr_lindi["X"][:]
assert isinstance(array_lindi, np.ndarray)
if not np.array_equal(array_kerchunk, array):
print("WARNING: array_kerchunk does not match array")
print(array_kerchunk)
print(array)
if not np.array_equal(array_lindi, array):
print("WARNING: array_lindi does not match array")
print(array_lindi)
print(array)
x = store_lindi.to_reference_file_system() # noqa: F841


def test_numpy_array_of_strings():
print("Testing numpy array of strings")
with tempfile.TemporaryDirectory() as tmpdir:
filename = f"{tmpdir}/test.h5"
with h5py.File(filename, "w") as f:
f.create_dataset("X", data=["abc", "def", "ghi"])
zarr_kerchunk, store_kerchunk = _get_kerchunk_zarr(filename)
array_kerchunk = zarr_kerchunk["X"][:]
assert isinstance(array_kerchunk, np.ndarray)
zarr_lindi, store_lindi = _get_lindi_zarr(filename)
array_lindi = zarr_lindi["X"][:]
assert isinstance(array_lindi, np.ndarray)
if not np.array_equal(array_kerchunk, ["abc", "def", "ghi"]):
print("WARNING: array_kerchunk does not match array")
print(array_kerchunk)
print(["abc", "def", "ghi"])
if not np.array_equal(array_lindi, ["abc", "def", "ghi"]):
print("WARNING: array_lindi does not match array")
print(array_lindi)
print(["abc", "def", "ghi"])
x = store_lindi.to_reference_file_system() # noqa: F841


def _get_lindi_zarr(filename):
store = LindiH5Store.from_file(filename, url='.') # use url='.' so that a reference file system can be created
root = zarr.open(store)
return root, store


def _get_kerchunk_zarr(filename):
with h5py.File(filename, "r") as f:
h5chunks = kerchunk.hdf.SingleHdf5ToZarr(
f,
url=filename,
hdmf_mode=True,
num_chunks_per_dataset_threshold=1000,
max_num_items=1000,
)
a = h5chunks.translate()
with open("test_example.zarr.json", "w") as store:
json.dump(a, store, indent=2)
fs = ReferenceFileSystem(a)
store0 = fs.get_mapper(root="/", check=False)
root = zarr.open(store0)
return root, store0


if __name__ == "__main__":
test_scalar_dataset()
test_numpy_array()
test_numpy_array_of_strings()
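
For reference, the lindi store's to_reference_file_system() output can be persisted and reopened the same way _get_kerchunk_zarr handles the kerchunk dict. A minimal sketch, assuming the returned object is a kerchunk-style reference dict (which is what its use above suggests); the helper name is hypothetical:

import json
import zarr
from fsspec.implementations.reference import ReferenceFileSystem
from lindi import LindiH5Store


def _roundtrip_lindi_rfs(filename):
    # Build the store with url='.' so a reference file system can be created
    store = LindiH5Store.from_file(filename, url=".")
    try:
        rfs = store.to_reference_file_system()  # assumed: kerchunk-style dict
    finally:
        store.close()
    # Persist, then reopen through fsspec, mirroring _get_kerchunk_zarr above
    with open("test_roundtrip.zarr.json", "w") as f:
        json.dump(rfs, f, indent=2)
    fs = ReferenceFileSystem(rfs)
    root = zarr.open(fs.get_mapper(root="/", check=False))
    return root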
34 changes: 34 additions & 0 deletions devel/old_tests/test_lindi_client.py
@@ -0,0 +1,34 @@
from lindi import LindiClient, LindiGroup, LindiDataset


def test_lindi_client():
client = LindiClient.from_file("example_0.zarr.json")

for k, v in client.attrs.items():
print(f"{k}: {v}")

for k in client.keys():
print(k)

acquisition = client["acquisition"]
assert isinstance(acquisition, LindiGroup)
for k in acquisition.keys():
print(k)

x = client["acquisition/ElectricalSeriesAp"]["data"]
assert isinstance(x, LindiDataset)

print(x.shape)
print(x[:5])

general = client["general"]
assert isinstance(general, LindiGroup)
for k in general.keys():
a = general[k]
if isinstance(a, LindiDataset):
print(f"{k}: {a.shape}")
print(a[()])


if __name__ == "__main__":
test_lindi_client()
… (the remaining 21 changed files are not shown)
