From 4d5a0163eaaa1bdaa8c99a4fedcf6e15686b05c9 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Wed, 27 Sep 2023 03:37:54 +0300 Subject: [PATCH] diff: handle granular targets --- dvc/repo/diff.py | 20 ++++++++++-- dvc/repo/index.py | 72 +++++++++++++++++++++++++++++------------ pyproject.toml | 2 +- tests/func/test_diff.py | 56 ++++++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 25 deletions(-) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index d49b7e3ac9..9a98d11211 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -22,7 +22,7 @@ def _hash(entry): return None -def _diff(old, new, with_missing=False): +def _diff(old, new, data_keys, with_missing=False): from dvc_data.index.diff import ADD, DELETE, MODIFY, RENAME from dvc_data.index.diff import diff as idiff @@ -34,12 +34,23 @@ def _diff(old, new, with_missing=False): "not in cache": [], } + def meta_cmp_key(meta): + if not meta: + return meta + return meta.isdir + for change in idiff( old, new, with_renames=True, - hash_only=True, + meta_cmp_key=meta_cmp_key, + roots=data_keys, ): + if (change.old and change.old.isdir and not change.old.hash_info) or ( + change.new and change.new.isdir and not change.new.hash_info + ): + continue + if change.typ == ADD: ret["added"].append( { @@ -116,6 +127,7 @@ def diff( b_rev = "workspace" with_missing = True + data_keys = set() for rev in self.brancher(revs=[a_rev, b_rev]): if rev == "workspace" and b_rev != "workspace": # brancher always returns workspace, but we only need to compute @@ -132,6 +144,8 @@ def onerror(target, _exc): recursive=recursive, ) + data_keys.update(view.data_keys.get("repo", set())) + if rev == "workspace": from .index import build_data_index @@ -165,4 +179,4 @@ def onerror(target, _exc): new = indexes[b_rev] with ui.status("Calculating diff"): - return _diff(old, new, with_missing=with_missing) + return _diff(old, new, data_keys, with_missing=with_missing) diff --git a/dvc/repo/index.py b/dvc/repo/index.py index 4532d24ca6..a32a9c737a 100644 --- a/dvc/repo/index.py +++ b/dvc/repo/index.py @@ -1,5 +1,6 @@ import logging import time +from collections import defaultdict from functools import partial from typing import ( TYPE_CHECKING, @@ -319,6 +320,22 @@ def outs(self) -> Iterator["Output"]: for stage in self.stages: yield from stage.outs + @cached_property + def out_data_keys(self) -> Dict[str, Set["DataIndexKey"]]: + by_workspace: Dict[str, Set["DataIndexKey"]] = defaultdict(set) + + by_workspace["repo"] = set() + by_workspace["local"] = set() + + for out in self.outs: + if not out.use_cache: + continue + + ws, key = out.index_key + by_workspace[ws].add(key) + + return dict(by_workspace) + @property def decorated_outs(self) -> Iterator["Output"]: for output in self.outs: @@ -359,8 +376,6 @@ def _plot_sources(self) -> List[str]: @cached_property def data_keys(self) -> Dict[str, Set["DataIndexKey"]]: - from collections import defaultdict - by_workspace: Dict[str, Set["DataIndexKey"]] = defaultdict(set) by_workspace["repo"] = set() @@ -377,8 +392,6 @@ def data_keys(self) -> Dict[str, Set["DataIndexKey"]]: @cached_property def metric_keys(self) -> Dict[str, Set["DataIndexKey"]]: - from collections import defaultdict - from .metrics.show import _collect_top_level_metrics by_workspace: Dict[str, Set["DataIndexKey"]] = defaultdict(set) @@ -400,8 +413,6 @@ def metric_keys(self) -> Dict[str, Set["DataIndexKey"]]: @cached_property def plot_keys(self) -> Dict[str, Set["DataIndexKey"]]: - from collections import defaultdict - by_workspace: Dict[str, Set["DataIndexKey"]] = defaultdict(set) by_workspace["repo"] = set() @@ -520,8 +531,6 @@ def used_objs( jobs: Optional[int] = None, push: bool = False, ) -> "ObjectContainer": - from collections import defaultdict - used: "ObjectContainer" = defaultdict(set) pairs = self.collect_targets(targets, recursive=recursive, with_deps=with_deps) for stage, filter_info in pairs: @@ -640,9 +649,23 @@ def outs(self) -> Iterator["Output"]: yield from {out for (out, _) in self._filtered_outs} @cached_property - def _data_prefixes(self) -> Dict[str, "_DataPrefixes"]: - from collections import defaultdict + def out_data_keys(self) -> Dict[str, Set["DataIndexKey"]]: + by_workspace: Dict[str, Set["DataIndexKey"]] = defaultdict(set) + by_workspace["repo"] = set() + by_workspace["local"] = set() + + for out in self.outs: + if not out.use_cache: + continue + + ws, key = out.index_key + by_workspace[ws].add(key) + + return dict(by_workspace) + + @cached_property + def _data_prefixes(self) -> Dict[str, "_DataPrefixes"]: prefixes: Dict[str, "_DataPrefixes"] = defaultdict( lambda: _DataPrefixes(set(), set()) ) @@ -660,8 +683,6 @@ def _data_prefixes(self) -> Dict[str, "_DataPrefixes"]: @cached_property def data_keys(self) -> Dict[str, Set["DataIndexKey"]]: - from collections import defaultdict - ret: Dict[str, Set["DataIndexKey"]] = defaultdict(set) for out, filter_info in self._filtered_outs: @@ -714,7 +735,7 @@ def key_filter(workspace: str, key: "DataIndexKey"): return data -def build_data_index( # noqa: C901 +def build_data_index( # noqa: C901, PLR0912 index: Union["Index", "IndexView"], path: str, fs: "FileSystem", @@ -777,14 +798,6 @@ def build_data_index( # noqa: C901 data.add(entry) callback.relative_update(1) - if compute_hash: - tree_meta, tree = build_tree(data, key, name=hash_name) - out_entry.meta = tree_meta - out_entry.hash_info = tree.hash_info - out_entry.loaded = True - data.add(out_entry) - callback.relative_update(1) - for key in parents: parent_path = fs.path.join(path, *key) if not fs.exists(parent_path): @@ -793,6 +806,23 @@ def build_data_index( # noqa: C901 data.add(direntry) callback.relative_update(1) + if compute_hash: + out_keys = index.out_data_keys.get(workspace, set()) + data_keys = index.data_keys.get(workspace, set()) + for key in data_keys.intersection(out_keys): + hash_name = _get_entry_hash_name(index, workspace, key) + + out_entry = data.get(key) + if not out_entry or not out_entry.isdir: + continue + + tree_meta, tree = build_tree(data, key, name=hash_name) + out_entry.meta = tree_meta + out_entry.hash_info = tree.hash_info + out_entry.loaded = True + data.add(out_entry) + callback.relative_update(1) + return data diff --git a/pyproject.toml b/pyproject.toml index 3cf023b62e..b4e4e30421 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "configobj>=5.0.6", "distro>=1.3", "dpath<3,>=2.1.0", - "dvc-data>=2.16.3,<2.17.0", + "dvc-data>=2.17.1,<2.18.0", "dvc-http>=2.29.0", "dvc-render>=0.3.1,<1", "dvc-studio-client>=0.13.0,<1", diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index b1b09d9936..2c818c8dfd 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -668,3 +668,59 @@ def test_rename_multiple_files_same_hashes(tmp_dir, scm, dvc): }, ], } + + +def test_diff_granular(tmp_dir, dvc, scm): + tmp_dir.gen( + { + "dir": { + "data": { + "subdir": {"subfoo": "subfoo", "subbar": "subbar"}, + "foo": "foo", + "bar": "bar", + }, + }, + } + ) + + dvc.add(os.path.join("dir", "data")) + scm.add(os.path.join("dir", "data.dvc")) + scm.add(os.path.join("dir", ".gitignore")) + scm.commit("data") + + assert dvc.diff() == {} + + (tmp_dir / "dir" / "data" / "subdir" / "new").write_text("new") + + assert dvc.diff() == { + "added": [ + { + "hash": "22af645d1859cb5ca6da0c484f1f37ea", + "path": os.path.join("dir", "data", "subdir", "new"), + } + ], + "deleted": [], + "modified": [ + { + "hash": { + "new": "efa5b20d5f935dcc5555b26db6e19b76.dir", + "old": "1aca2c799df82929bbdd976557975546.dir", + }, + "path": os.path.join("dir", "data", ""), + } + ], + "not in cache": [], + "renamed": [], + } + assert dvc.diff(targets=[os.path.join("dir", "data", "subdir")]) == { + "added": [ + { + "hash": "22af645d1859cb5ca6da0c484f1f37ea", + "path": os.path.join("dir", "data", "subdir", "new"), + } + ], + "deleted": [], + "modified": [], + "not in cache": [], + "renamed": [], + }