Skip to content

Commit

Permalink
index: collect: add update flag
Browse files Browse the repository at this point in the history
Allows recollecting with `update=True` without dropping `hash_info` when an entry's metadata has changed.
  • Loading branch information
efiop committed Sep 25, 2022
1 parent 2412902 commit 1dbbc6d
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 13 deletions.
2 changes: 1 addition & 1 deletion src/dvc_data/hashfile/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,6 @@ def tree_from_index(
assert entry.meta and entry.hash_info
tree_key = key[len(prefix) :]
tree.add(tree_key, entry.meta, entry.hash_info)
tree_meta.size += tree_meta.size or 0
tree_meta.size += entry.meta.size or 0
tree_meta.nfiles += 1
return tree_meta, tree
36 changes: 24 additions & 12 deletions src/dvc_data/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,28 +226,40 @@ def node_factory(_, key, children, *args):
return self.traverse(node_factory, prefix=root_key)


def _collect_dir(index, prefix, entry, path, fs):
def _collect_dir(index, prefix, prefix_entry, path, fs, update=False):
dir_meta = Meta(nfiles=0, size=0, isdir=True)

for root, dnames, fnames in fs.walk(path):
sub_prefix = fs.path.relparts(root, path) if root != path else ()
for name in chain(dnames, fnames):
key = (*prefix, *sub_prefix, name)
entry_path = fs.path.join(root, name)
entry = index.get(key)
if entry is None:
entry = DataIndexEntry()
index[key] = entry

entry.fs = fs
entry.path = entry_path
entry.cache = prefix_entry.cache
entry.remote = prefix_entry.remote

# TODO: localfs.walk doesn't currently support detail=True,
# so we have to call fs.info() manually
meta = Meta.from_info(
fs.info(entry_path, refresh=True), fs.protocol
)
index[key] = DataIndexEntry(
meta=meta,
fs=fs,
path=entry_path,
cache=entry.cache,
remote=entry.remote,
)
if entry.meta != meta and not update:
entry.hash_info = None

entry.meta = meta
dir_meta.nfiles += 1
dir_meta.size += meta.size

return dir_meta


def collect(index, path, fs):
def collect(index, path, fs, update=False):
# NOTE: converting to list to avoid iterating and modifying the dict at the
# same time.
items = list(index.iteritems(shallow=True))
Expand All @@ -257,16 +269,16 @@ def collect(index, path, fs):
info = fs.info(entry_path, refresh=True)

fs_meta = Meta.from_info(info, fs.protocol)
if entry.meta != fs_meta:
entry.meta = Meta.from_info(info, fs.protocol)
if entry.meta != fs_meta and not update:
entry.hash_info = None
entry.meta = fs_meta
entry.fs = fs
entry.path = entry_path

if info["type"] == "file":
continue

_collect_dir(index, key, entry, entry_path, fs)
entry.meta = _collect_dir(index, key, entry, entry_path, fs)


def md5(index):
Expand Down

0 comments on commit 1dbbc6d

Please sign in to comment.