Skip to content

Commit

Permalink
Force MaybeDependency to have a Dependency OR list[Problem], no…
Browse files Browse the repository at this point in the history
…t neither nor both (#3635)

## Changes
Force `MaybeDependency` to have a `Dependency` OR `list[Problem]`, not
neither nor both. This enforcement triggered to handle the known
libraries during import registration.

### Linked issues
<!-- DOC: Link issue with a keyword: close, closes, closed, fix, fixes,
fixed, resolve, resolves, resolved. See
https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword
-->

Resolves #3585
Breaks up #3626
Progresses #1527

### Functionality

- [x] modified code linting related logic

### Tests

- [x] added and modified unit tests
  • Loading branch information
JCZuurmond authored Feb 13, 2025
1 parent 54d37bc commit 9083d6f
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 30 deletions.
8 changes: 3 additions & 5 deletions src/databricks/labs/ucx/source_code/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
MaybeDependency,
StubContainer,
)
from databricks.labs.ucx.source_code.known import KnownList
from databricks.labs.ucx.source_code.known import KnownDependency, KnownList
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.linters.python import PythonCodeAnalyzer

Expand Down Expand Up @@ -181,10 +181,8 @@ def _resolve_allow_list(self, name: str) -> MaybeDependency | None:
if not compatibility.known:
logger.debug(f"Resolving unknown import: {name}")
return None
if not compatibility.problems:
return MaybeDependency(None, [])
# TODO move to linter, see https://github.com/databrickslabs/ucx/issues/1527
return MaybeDependency(None, compatibility.problems)
dependency = KnownDependency(name, compatibility.problems)
return MaybeDependency(dependency, [])

def _resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency | None:
if not name:
Expand Down
44 changes: 36 additions & 8 deletions src/databricks/labs/ucx/source_code/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,19 @@ def register_notebook(self, path: Path, inherit_context: bool) -> list[Dependenc
def register_import(self, name: str) -> list[DependencyProblem]:
if not name:
return [DependencyProblem('import-empty', 'Empty import name')]
maybe = self._resolver.resolve_import(self.path_lookup, name)
if not maybe.dependency:
return maybe.problems
maybe_graph = self.register_dependency(maybe.dependency)
maybe_dependency = self._resolver.resolve_import(self.path_lookup, name)
if maybe_dependency.problems:
return maybe_dependency.problems
assert maybe_dependency.dependency
maybe_graph = self.register_dependency(maybe_dependency.dependency)
return maybe_graph.problems

def register_file(self, path: Path) -> list[DependencyProblem]:
maybe = self._resolver.resolve_file(self.path_lookup, path)
if not maybe.dependency:
return maybe.problems
maybe_graph = self.register_dependency(maybe.dependency)
maybe_dependency = self._resolver.resolve_file(self.path_lookup, path)
if maybe_dependency.problems:
return maybe_dependency.problems
assert maybe_dependency.dependency
maybe_graph = self.register_dependency(maybe_dependency.dependency)
return maybe_graph.problems

def register_dependency(self, dependency: Dependency) -> MaybeGraph:
Expand Down Expand Up @@ -398,8 +400,34 @@ def resolve_file(self, path_lookup, path: Path) -> MaybeDependency:

@dataclass
class MaybeDependency:
"""A class:`Dependency` or a :class:`Failure`.
The `MaybeDependency` is designed to either contain a `dependency` OR
`problems`, never both or neither. Typically, a `Dependency` is
constructed by a resolver yielding a `MaybeDependency` with
`list[Problems]` if the dependency could NOT be resolved, otherwise it
yields the `Dependency`, resulting in code that looks like:
``` python
maybe_dependency = resolver.resolve_import(path_lookup, module_name)
if maybe_dependency.problems:
# Handle failure and return early
assert maybe_dependency.dependency, "Dependency should be given when no problems are given."
# Use dependency
```
"""

dependency: Dependency | None
"""The dependency"""

problems: list[DependencyProblem]
"""The problems during constructing the dependency"""

def __post_init__(self):
if not self.dependency and not self.problems:
raise ValueError(f"Dependency or problems should be given: {self}")
if self.dependency and self.problems:
raise ValueError(f"Dependency and problems should not be both given: {self}")


class DependencyResolver:
Expand Down
49 changes: 48 additions & 1 deletion src/databricks/labs/ucx/source_code/known.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@

from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
from databricks.labs.ucx.source_code.base import CurrentSessionState
from databricks.labs.ucx.source_code.graph import Dependency, DependencyProblem
from databricks.labs.ucx.source_code.graph import (
Dependency,
DependencyGraph,
DependencyLoader,
DependencyProblem,
SourceContainer,
)
from databricks.labs.ucx.source_code.path_lookup import PathLookup

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -255,6 +261,47 @@ def __repr__(self):
return f"<DistInfoPackage {self._path}>"


class KnownContainer(SourceContainer):
"""A container for known libraries."""

def __init__(self, path: Path, problems: list[DependencyProblem]):
super().__init__()
self._path = path
self._problems = problems

def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
"""Return the known problems."""
_ = parent
return self._problems


class KnownLoader(DependencyLoader):
"""Always load as `KnownContainer`.
This loader is used in combination with the KnownList to load known dependencies and their known problems.
"""

def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> KnownContainer:
"""Load the dependency."""
_ = path_lookup
if not isinstance(dependency, KnownDependency):
raise RuntimeError("Only KnownDependency is supported")
# Known library paths do not need to be resolved
return KnownContainer(dependency.path, dependency.problems)


class KnownDependency(Dependency):
"""A dependency for known libraries, see :class:KnownList."""

def __init__(self, module_name: str, problems: list[DependencyProblem]):
known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json"
# Note that Github does not support navigating JSON files, hence the #<module_name> does nothing.
# https://docs.github.com/en/repositories/working-with-files/using-files/navigating-code-on-github
super().__init__(KnownLoader(), Path(f"{known_url}#{module_name}"), inherits_context=False)
self._module_name = module_name
self.problems = problems


if __name__ == "__main__":
logger = get_logger(__file__) # this only works for __main__
KnownList.rebuild(Path.cwd())
6 changes: 3 additions & 3 deletions tests/unit/source_code/test_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,9 @@ def test_dependency_resolver_visits_file_dependencies(simple_dependency_resolver
assert maybe.graph.all_relative_names() == {"root5.py", "leaf4.py"}


def test_dependency_resolver_skips_builtin_dependencies(simple_dependency_resolver) -> None:
def test_dependency_resolver_skips_standard_library_dependencies(simple_dependency_resolver) -> None:
maybe = simple_dependency_resolver.build_local_file_dependency_graph(
Path("python_builtins.py"), CurrentSessionState()
Path("standard_libraries.py"), CurrentSessionState()
)
assert not maybe.failed
graph = maybe.graph
Expand All @@ -123,7 +123,7 @@ def test_dependency_resolver_skips_builtin_dependencies(simple_dependency_resolv

def test_dependency_resolver_ignores_known_dependencies(simple_dependency_resolver) -> None:
maybe = simple_dependency_resolver.build_local_file_dependency_graph(
Path("python_builtins.py"), CurrentSessionState()
Path("standard_libraries.py"), CurrentSessionState()
)
assert maybe.graph
graph = maybe.graph
Expand Down
13 changes: 13 additions & 0 deletions tests/unit/source_code/test_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
Dependency,
DependencyGraph,
DependencyProblem,
MaybeDependency,
InheritedContext,
)
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
Expand Down Expand Up @@ -201,3 +202,15 @@ def test_dependency_problem_as_located_advice_has_path_missing_by_default() -> N
problem = DependencyProblem("code", "message")
located_advice = problem.as_located_advice()
assert located_advice.has_missing_path()


def test_maybe_dependency_raises_value_error_if_no_dependency_nor_problems() -> None:
with pytest.raises(ValueError, match="Dependency or problems should be given: *"):
MaybeDependency(None, [])


def test_maybe_dependency_raises_value_error_if_dependency_and_problems() -> None:
dependency = Dependency(FileLoader(), Path("test"))
problem = DependencyProblem("code", "message")
with pytest.raises(ValueError, match="Dependency and problems should not be both given: *"):
MaybeDependency(dependency, [problem])
37 changes: 36 additions & 1 deletion tests/unit/source_code/test_known.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@
from pathlib import Path
from typing import cast
from unittest import mock
from unittest.mock import create_autospec

import pytest

from databricks.labs.ucx.source_code.known import KnownList
from databricks.labs.ucx.source_code.base import CurrentSessionState
from databricks.labs.ucx.source_code.graph import DependencyProblem, DependencyGraph

from databricks.labs.ucx.source_code.known import KnownList, KnownDependency, KnownContainer, KnownLoader
from databricks.labs.ucx.source_code.path_lookup import PathLookup


Expand Down Expand Up @@ -95,3 +99,34 @@ def analyze_cachetools_dist_info(cls):
return

TestKnownList.analyze_cachetools_dist_info()


@pytest.mark.parametrize("problems", [[], [DependencyProblem("test", "test")]])
def test_known_container_loads_problems_during_dependency_graph_building(
simple_dependency_resolver, problems: list[DependencyProblem]
) -> None:
path_lookup = create_autospec(PathLookup)
dependency = KnownDependency("test", problems)
graph = DependencyGraph(dependency, None, simple_dependency_resolver, path_lookup, CurrentSessionState())
container = KnownContainer(Path("test.py"), problems)
assert container.build_dependency_graph(graph) == problems
path_lookup.assert_not_called()


@pytest.mark.parametrize("problems", [[], [DependencyProblem("test", "test")]])
def test_known_loader_loads_known_container_with_problems(
simple_dependency_resolver, problems: list[DependencyProblem]
) -> None:
path_lookup = create_autospec(PathLookup)
loader = KnownLoader()
dependency = KnownDependency("test", problems)
graph = DependencyGraph(dependency, None, simple_dependency_resolver, path_lookup, CurrentSessionState())
container = loader.load_dependency(path_lookup, dependency)
assert container.build_dependency_graph(graph) == problems
path_lookup.resolve.assert_not_called()


@pytest.mark.parametrize("problems", [[], [DependencyProblem("test", "test")]])
def test_known_dependency_has_problems(problems: list[DependencyProblem]) -> None:
dependency = KnownDependency("test", problems)
assert dependency.problems == problems
8 changes: 4 additions & 4 deletions tests/unit/source_code/test_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,30 +227,30 @@ def test_notebook_builds_python_dependency_graph(mock_path_lookup) -> None:

def test_notebook_builds_python_dependency_graph_with_loop(mock_path_lookup) -> None:
path = "run_notebooks.py"
expected_path = {mock_path_lookup.cwd / path for path in (path, "leaf1.py", "leaf2.py", "leaf3.py")}
resolver = dependency_resolver(mock_path_lookup)
maybe = resolver.resolve_notebook(mock_path_lookup, Path(path), False)
assert maybe.dependency is not None
graph = DependencyGraph(maybe.dependency, None, resolver, mock_path_lookup, CurrentSessionState())
container = maybe.dependency.load(mock_path_lookup)
assert container is not None
container.build_dependency_graph(graph)
expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"]
all_paths = set(d.path for d in graph.all_dependencies)
assert all_paths == {mock_path_lookup.cwd / path for path in expected_paths}
assert not expected_path - all_paths


def test_notebook_builds_python_dependency_graph_with_fstring_loop(mock_path_lookup) -> None:
path = "run_notebooks_with_fstring.py"
expected_path = {mock_path_lookup.cwd / path for path in (path, "leaf1.py", "leaf2.py", "leaf3.py")}
resolver = dependency_resolver(mock_path_lookup)
maybe = resolver.resolve_notebook(mock_path_lookup, Path(path), False)
assert maybe.dependency is not None
graph = DependencyGraph(maybe.dependency, None, resolver, mock_path_lookup, CurrentSessionState())
container = maybe.dependency.load(mock_path_lookup)
assert container is not None
container.build_dependency_graph(graph)
expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"]
all_paths = set(d.path for d in graph.all_dependencies)
assert all_paths == {mock_path_lookup.cwd / path for path in expected_paths}
assert not expected_path - all_paths


def test_detects_multiple_calls_to_dbutils_notebook_run_in_python_code() -> None:
Expand Down
12 changes: 6 additions & 6 deletions tests/unit/source_code/test_path_lookup_simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@
],
3,
),
(["simulate-sys-path", "via-sys-path", "run_notebook_1.py"], 1),
(["simulate-sys-path", "via-sys-path", "run_notebook_2.py"], 1),
(["simulate-sys-path", "via-sys-path", "run_notebook_4.py"], 2),
(["simulate-sys-path", "via-sys-path", "run_notebook_1.py"], 2),
(["simulate-sys-path", "via-sys-path", "run_notebook_2.py"], 2),
(["simulate-sys-path", "via-sys-path", "run_notebook_4.py"], 3),
],
)
def test_locates_notebooks(source: list[str], expected: int, mock_path_lookup) -> None:
Expand All @@ -60,8 +60,8 @@ def test_locates_notebooks(source: list[str], expected: int, mock_path_lookup) -
"source, expected",
[
(["simulate-sys-path", "siblings", "sibling1_file.py"], 2),
(["simulate-sys-path", "via-sys-path", "import_file_1.py"], 2),
(["simulate-sys-path", "via-sys-path", "import_file_2.py"], 2),
(["simulate-sys-path", "via-sys-path", "import_file_1.py"], 3),
(["simulate-sys-path", "via-sys-path", "import_file_2.py"], 3),
],
)
def test_locates_files(source: list[str], expected: int) -> None:
Expand Down Expand Up @@ -117,7 +117,7 @@ def test_locates_notebooks_with_absolute_path() -> None:
assert not maybe.problems
assert maybe.graph is not None
all_paths = [d.path for d in maybe.graph.all_dependencies]
assert len(all_paths) == 2
assert len(all_paths) == 3


def test_locates_files_with_absolute_path() -> None:
Expand Down
10 changes: 8 additions & 2 deletions tests/unit/source_code/test_s3fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@
def test_detect_s3fs_import(
empty_index, source: str, expected: list[DependencyProblem], tmp_path, mock_path_lookup
) -> None:
known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json"
module_name = "s3fs.subpackage" if "subpackage" in source else "s3fs"
expected_path = Path(f"{known_url}#{module_name}")

sample = tmp_path / "test_detect_s3fs_import.py"
sample.write_text(source)
mock_path_lookup.append_path(tmp_path)
Expand All @@ -127,7 +131,7 @@ def test_detect_s3fs_import(
pip_resolver, notebook_resolver, import_resolver, import_resolver, mock_path_lookup
)
maybe = dependency_resolver.build_local_file_dependency_graph(sample, CurrentSessionState())
assert maybe.problems == [dataclasses.replace(_, source_path=sample) for _ in expected]
assert maybe.problems == [dataclasses.replace(_, source_path=expected_path) for _ in expected]


@pytest.mark.parametrize(
Expand All @@ -151,6 +155,8 @@ def test_detect_s3fs_import(
def test_detect_s3fs_import_in_dependencies(
empty_index, expected: list[DependencyProblem], mock_path_lookup, mock_notebook_resolver
) -> None:
known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json"
expected_path = Path(f"{known_url}#s3fs")
file_loader = FileLoader()
import_resolver = ImportFileResolver(file_loader)
pip_resolver = PythonLibraryResolver()
Expand All @@ -159,4 +165,4 @@ def test_detect_s3fs_import_in_dependencies(
)
sample = mock_path_lookup.cwd / "root9.py"
maybe = dependency_resolver.build_local_file_dependency_graph(sample, CurrentSessionState())
assert maybe.problems == expected
assert maybe.problems == [dataclasses.replace(p, source_path=expected_path) for p in expected]

0 comments on commit 9083d6f

Please sign in to comment.