# Patch summary (free text ahead of the first "diff --git" header).
#
# dandi/files/zarr.py
#   * Drops the import of get_checksum from dandischema.digests.zarr.
#   * The nested dirstat() helper now imports checksum_zarr_dir from
#     dandi.support.digests and calls it as
#     checksum_zarr_dir(file_info, dir_info) to produce the value handed to
#     Digest.dandi_zarr(); files come first, directories second, the same
#     order the old get_checksum() call used.
#   * The locals dir_md5s/file_md5s are renamed to dir_info/file_info; each
#     maps an entry name to a (digest, size) pair.
#
# dandi/support/digests.py
#   * Imports ZarrChecksum and ZarrChecksumManifest from
#     zarr_checksum.checksum.
#   * Adds checksum_zarr_dir(files, directories), which wraps every
#     (name, (digest, size)) item in a ZarrChecksum, collects them in a
#     ZarrChecksumManifest, and returns manifest.generate_digest().digest.
#
# dandi/support/tests/test_digests.py
#   * Adds "from __future__ import annotations" and "import pytest".
#   * Adds test_checksum_zarr_dir, parametrized over seven
#     (files, directories, expected checksum) cases, including an empty
#     directory and a case whose directory entries are given in
#     non-alphabetical order.
#
# NOTE(review): line breaks and leading indentation were reconstructed from
# a whitespace-collapsed copy of this patch. The number of lines in every
# hunk was checked against its @@ header, but the exact leading whitespace
# of context lines should be confirmed against the target files before the
# patch is applied.
diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py
index bcb5dba7f..9b7e82cd1 100644
--- a/dandi/files/zarr.py
+++ b/dandi/files/zarr.py
@@ -12,7 +12,6 @@
 from time import sleep
 from typing import Any
 
-from dandischema.digests.zarr import get_checksum
 from dandischema.models import BareAsset, DigestType
 import requests
 from zarr_checksum.tree import ZarrChecksumTree
@@ -159,25 +158,25 @@ def stat(self) -> ZarrStat:
 
         def dirstat(dirpath: LocalZarrEntry) -> ZarrStat:
             # Avoid heavy import by importing within function:
-            from dandi.support.digests import md5file_nocache
+            from dandi.support.digests import checksum_zarr_dir, md5file_nocache
 
             size = 0
-            dir_md5s = {}
-            file_md5s = {}
+            dir_info = {}
+            file_info = {}
             files = []
             for p in dirpath.iterdir():
                 if p.is_dir():
                     st = dirstat(p)
                     size += st.size
-                    dir_md5s[p.name] = (st.digest.value, st.size)
+                    dir_info[p.name] = (st.digest.value, st.size)
                     files.extend(st.files)
                 else:
                     size += p.size
-                    file_md5s[p.name] = (md5file_nocache(p.filepath), p.size)
+                    file_info[p.name] = (md5file_nocache(p.filepath), p.size)
                     files.append(p)
             return ZarrStat(
                 size=size,
-                digest=Digest.dandi_zarr(get_checksum(file_md5s, dir_md5s)),
+                digest=Digest.dandi_zarr(checksum_zarr_dir(file_info, dir_info)),
                 files=files,
             )
 
diff --git a/dandi/support/digests.py b/dandi/support/digests.py
index ad43c18be..7a69a1629 100644
--- a/dandi/support/digests.py
+++ b/dandi/support/digests.py
@@ -24,6 +24,7 @@
 
 from dandischema.digests.dandietag import DandiETag
 from fscacher import PersistentCache
+from zarr_checksum.checksum import ZarrChecksum, ZarrChecksumManifest
 from zarr_checksum.tree import ZarrChecksumTree
 
 from .threaded_walk import threaded_walk
@@ -134,3 +135,31 @@ def md5file_nocache(filepath: str | Path) -> str:
     present in Zarrs
     """
     return Digester(["md5"])(filepath)["md5"]
+
+
+def checksum_zarr_dir(
+    files: dict[str, tuple[str, int]], directories: dict[str, tuple[str, int]]
+) -> str:
+    """
+    Calculate the Zarr checksum of a directory only from information about the
+    files and subdirectories immediately within it.
+
+    :param files:
+        A mapping from names of files in the directory to pairs of their MD5
+        digests and sizes
+    :param directories:
+        A mapping from names of subdirectories in the directory to pairs of
+        their Zarr checksums and the sum of the sizes of all files recursively
+        within them
+    """
+    manifest = ZarrChecksumManifest(
+        files=[
+            ZarrChecksum(digest=digest, name=name, size=size)
+            for name, (digest, size) in files.items()
+        ],
+        directories=[
+            ZarrChecksum(digest=digest, name=name, size=size)
+            for name, (digest, size) in directories.items()
+        ],
+    )
+    return manifest.generate_digest().digest
diff --git a/dandi/support/tests/test_digests.py b/dandi/support/tests/test_digests.py
index d382b6cb7..af37214ea 100644
--- a/dandi/support/tests/test_digests.py
+++ b/dandi/support/tests/test_digests.py
@@ -7,12 +7,15 @@
 #
 # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
 
+from __future__ import annotations
+
 from pathlib import Path
 
+import pytest
 from pytest_mock import MockerFixture
 
 from .. import digests
-from ..digests import Digester, get_zarr_checksum
+from ..digests import Digester, checksum_zarr_dir, get_zarr_checksum
 
 
 def test_digester(tmp_path):
@@ -101,3 +104,56 @@ def test_get_zarr_checksum(mocker: MockerFixture, tmp_path: Path) -> None:
         == "f77f4c5b277575f781c19ba91422f0c5-8--197"
     )
     spy.assert_called_once_with(sub2 / "file7.txt")
+
+
+@pytest.mark.parametrize(
+    "files,directories,checksum",
+    [
+        ({}, {}, "481a2f77ab786a0f45aafd5db0971caa-0--0"),
+        (
+            {"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1)},
+            {},
+            "f21b9b4bf53d7ce1167bcfae76371e59-1--1",
+        ),
+        (
+            {},
+            {"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", 1)},
+            "ea8b8290b69b96422a3ed1cca0390f21-1--1",
+        ),
+        (
+            {
+                "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1),
+                "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", 2),
+            },
+            {},
+            "4e67de4393d14c1e9c472438f0f1f8b1-2--3",
+        ),
+        (
+            {},
+            {
+                "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", 1),
+                "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", 2),
+            },
+            "859ca1926affe9c7d0424030f26fbd89-2--3",
+        ),
+        (
+            {},
+            {
+                "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--1", 1),
+                "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--2", 2),
+            },
+            "8f8361a286c9a7c3fbfd464e33989037-2--3",
+        ),
+        (
+            {"baz": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1)},
+            {"bar": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", 2)},
+            "3cb139f47d3a3580388f41956c15f55e-2--3",
+        ),
+    ],
+)
+def test_checksum_zarr_dir(
+    files: dict[str, tuple[str, int]],
+    directories: dict[str, tuple[str, int]],
+    checksum: str,
+) -> None:
+    assert checksum_zarr_dir(files=files, directories=directories) == checksum