From 7403ac85f98cf106800fdfdf81699e45da20a415 Mon Sep 17 00:00:00 2001 From: Cor Date: Thu, 13 Feb 2025 11:58:36 +0100 Subject: [PATCH] Force `MaybeDependency` to have a `Dependency` OR `list[Problem]`, not neither nor both (#3635) ## Changes Force `MaybeDependency` to have a `Dependency` OR `list[Problem]`, not neither nor both. This enforcement triggered to handle the known libraries during import registration. ### Linked issues Resolves #3585 Breaks up #3626 Progresses #1527 ### Functionality - [x] modified code linting related logic ### Tests - [x] added and modified unit tests --------- Co-authored-by: Liran Bareket --- src/databricks/labs/ucx/source_code/files.py | 13 +- src/databricks/labs/ucx/source_code/graph.py | 44 ++++- src/databricks/labs/ucx/source_code/known.py | 49 +++++- .../labs/ucx/source_code/linters/files.py | 6 + .../labs/ucx/source_code/linters/folders.py | 1 + .../labs/ucx/source_code/python_libraries.py | 9 +- tests/unit/source_code/linters/test_files.py | 46 +---- .../unit/source_code/linters/test_folders.py | 144 ++++++++++++++++ ...thon_builtins.py => standard_libraries.py} | 0 tests/unit/source_code/test_dependencies.py | 6 +- tests/unit/source_code/test_files.py | 41 ++++- tests/unit/source_code/test_graph.py | 13 ++ tests/unit/source_code/test_known.py | 26 ++- tests/unit/source_code/test_notebook.py | 8 +- .../test_path_lookup_simulation.py | 12 +- tests/unit/source_code/test_s3fs.py | 162 ------------------ 16 files changed, 338 insertions(+), 242 deletions(-) create mode 100644 tests/unit/source_code/linters/test_folders.py rename tests/unit/source_code/samples/{python_builtins.py => standard_libraries.py} (100%) diff --git a/src/databricks/labs/ucx/source_code/files.py b/src/databricks/labs/ucx/source_code/files.py index 3dc313cec2..b7689c0884 100644 --- a/src/databricks/labs/ucx/source_code/files.py +++ b/src/databricks/labs/ucx/source_code/files.py @@ -25,7 +25,7 @@ MaybeDependency, StubContainer, ) -from databricks.labs.ucx.source_code.known import KnownList +from databricks.labs.ucx.source_code.known import KnownDependency, KnownList from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.linters.python import PythonCodeAnalyzer @@ -168,6 +168,11 @@ def resolve_file(self, path_lookup, path: Path) -> MaybeDependency: return MaybeDependency(None, [problem]) def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency: + """Resolve an import by name. + + 1. Check the known modules. + 2. Check the import on the path lookup. + """ maybe = self._resolve_allow_list(name) if maybe is not None: return maybe @@ -181,10 +186,8 @@ def _resolve_allow_list(self, name: str) -> MaybeDependency | None: if not compatibility.known: logger.debug(f"Resolving unknown import: {name}") return None - if not compatibility.problems: - return MaybeDependency(None, []) - # TODO move to linter, see https://github.com/databrickslabs/ucx/issues/1527 - return MaybeDependency(None, compatibility.problems) + dependency = KnownDependency(name, compatibility.problems) + return MaybeDependency(dependency, []) def _resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency | None: if not name: diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index e062365d6c..4db027b9df 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -66,17 +66,19 @@ def register_notebook(self, path: Path, inherit_context: bool) -> list[Dependenc def register_import(self, name: str) -> list[DependencyProblem]: if not name: return [DependencyProblem('import-empty', 'Empty import name')] - maybe = self._resolver.resolve_import(self.path_lookup, name) - if not maybe.dependency: - return maybe.problems - maybe_graph = self.register_dependency(maybe.dependency) + maybe_dependency = self._resolver.resolve_import(self.path_lookup, name) + if maybe_dependency.problems: + return maybe_dependency.problems + assert maybe_dependency.dependency + maybe_graph = self.register_dependency(maybe_dependency.dependency) return maybe_graph.problems def register_file(self, path: Path) -> list[DependencyProblem]: - maybe = self._resolver.resolve_file(self.path_lookup, path) - if not maybe.dependency: - return maybe.problems - maybe_graph = self.register_dependency(maybe.dependency) + maybe_dependency = self._resolver.resolve_file(self.path_lookup, path) + if maybe_dependency.problems: + return maybe_dependency.problems + assert maybe_dependency.dependency + maybe_graph = self.register_dependency(maybe_dependency.dependency) return maybe_graph.problems def register_dependency(self, dependency: Dependency) -> MaybeGraph: @@ -398,8 +400,34 @@ def resolve_file(self, path_lookup, path: Path) -> MaybeDependency: @dataclass class MaybeDependency: + """A class:`Dependency` or a :class:`Failure`. + + The `MaybeDependency` is designed to either contain a `dependency` OR + `problems`, never both or neither. Typically, a `Dependency` is + constructed by a resolver yielding a `MaybeDependency` with + `list[Problems]` if the dependency could NOT be resolved, otherwise it + yields the `Dependency`, resulting in code that looks like: + + ``` python + maybe_dependency = resolver.resolve_import(path_lookup, module_name) + if maybe_dependency.problems: + # Handle failure and return early + assert maybe_dependency.dependency, "Dependency should be given when no problems are given." + # Use dependency + ``` + """ + dependency: Dependency | None + """The dependency""" + problems: list[DependencyProblem] + """The problems during constructing the dependency""" + + def __post_init__(self): + if not self.dependency and not self.problems: + raise ValueError(f"Dependency or problems should be given: {self}") + if self.dependency and self.problems: + raise ValueError(f"Dependency and problems should not be both given: {self}") class DependencyResolver: diff --git a/src/databricks/labs/ucx/source_code/known.py b/src/databricks/labs/ucx/source_code/known.py index 54929532bf..0f1e24aba7 100644 --- a/src/databricks/labs/ucx/source_code/known.py +++ b/src/databricks/labs/ucx/source_code/known.py @@ -15,8 +15,12 @@ from databricks.labs.blueprint.entrypoint import get_logger from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex -from databricks.labs.ucx.source_code.base import CurrentSessionState -from databricks.labs.ucx.source_code.graph import Dependency, DependencyProblem +from databricks.labs.ucx.source_code.base import Advice, CurrentSessionState +from databricks.labs.ucx.source_code.graph import ( + Dependency, + DependencyLoader, + StubContainer, +) from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.getLogger(__name__) @@ -47,7 +51,7 @@ @dataclass class Compatibility: known: bool - problems: list[DependencyProblem] + problems: list[KnownProblem] @dataclass(unsafe_hash=True, frozen=True, eq=True, order=True) @@ -58,6 +62,10 @@ class KnownProblem: def as_dict(self): return {'code': self.code, 'message': self.message} + def as_advice(self) -> Advice: + # TODO: Pass on the complete Advice (https://github.com/databrickslabs/ucx/issues/3625) + return Advice(self.code, self.message, -1, -1, -1, -1) + UNKNOWN = Compatibility(False, []) _DEFAULT_ENCODING = sys.getdefaultencoding() @@ -70,10 +78,10 @@ def __init__(self): known = self._get_known() for distribution_name, modules in known.items(): specific_modules_first = sorted(modules.items(), key=lambda x: x[0], reverse=True) - for module_ref, problems in specific_modules_first: - module_problems = [DependencyProblem(**_) for _ in problems] - self._module_problems[module_ref] = module_problems - self._library_problems[distribution_name].extend(module_problems) + for module_ref, raw_problems in specific_modules_first: + problems = [KnownProblem(**_) for _ in raw_problems] + self._module_problems[module_ref] = problems + self._library_problems[distribution_name].extend(problems) for name in sys.stdlib_module_names: self._module_problems[name] = [] @@ -255,6 +263,33 @@ def __repr__(self): return f"" +class KnownLoader(DependencyLoader): + """Always load as `StubContainer`. + + This loader is used in combination with the KnownList to load known dependencies and their known problems. + """ + + def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> StubContainer: + """Load the dependency.""" + _ = path_lookup + if not isinstance(dependency, KnownDependency): + raise RuntimeError("Only KnownDependency is supported") + # Known library paths do not need to be resolved + return StubContainer(dependency.path) + + +class KnownDependency(Dependency): + """A dependency for known libraries, see :class:KnownList.""" + + def __init__(self, module_name: str, problems: list[KnownProblem]): + known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json" + # Note that Github does not support navigating JSON files, hence the # does nothing. + # https://docs.github.com/en/repositories/working-with-files/using-files/navigating-code-on-github + super().__init__(KnownLoader(), Path(f"{known_url}#{module_name}"), inherits_context=False) + self._module_name = module_name + self.problems = problems + + if __name__ == "__main__": logger = get_logger(__file__) # this only works for __main__ KnownList.rebuild(Path.cwd()) diff --git a/src/databricks/labs/ucx/source_code/linters/files.py b/src/databricks/labs/ucx/source_code/linters/files.py index 10b7899e84..77a023ac60 100644 --- a/src/databricks/labs/ucx/source_code/linters/files.py +++ b/src/databricks/labs/ucx/source_code/linters/files.py @@ -16,6 +16,7 @@ ) from databricks.labs.ucx.source_code.files import LocalFile from databricks.labs.ucx.source_code.graph import Dependency +from databricks.labs.ucx.source_code.known import KnownDependency from databricks.labs.ucx.source_code.linters.base import PythonLinter from databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.linters.imports import SysPathChange, UnresolvedPath @@ -228,6 +229,11 @@ def __init__( def lint(self) -> Iterable[Advice]: """Lint the file.""" + if isinstance(self._dependency, KnownDependency): + # TODO: Pass on the right advice type (https://github.com/databrickslabs/ucx/issues/3625) + advices = [problem.as_advice().as_advisory() for problem in self._dependency.problems] + yield from advices + return source_container = self._dependency.load(self._path_lookup) if not source_container: # The linter only reports **linting** errors, not loading errors diff --git a/src/databricks/labs/ucx/source_code/linters/folders.py b/src/databricks/labs/ucx/source_code/linters/folders.py index eea3a5178c..11e3d32220 100644 --- a/src/databricks/labs/ucx/source_code/linters/folders.py +++ b/src/databricks/labs/ucx/source_code/linters/folders.py @@ -79,6 +79,7 @@ def lint_path(self, path: Path) -> Iterable[LocatedAdvice]: problems = container.build_dependency_graph(graph) for problem in problems: yield problem.as_located_advice() + return walker = LintingWalker(graph, self._path_lookup, self._context_factory) yield from walker diff --git a/src/databricks/labs/ucx/source_code/python_libraries.py b/src/databricks/labs/ucx/source_code/python_libraries.py index 83ad53ee77..41ead804f7 100644 --- a/src/databricks/labs/ucx/source_code/python_libraries.py +++ b/src/databricks/labs/ucx/source_code/python_libraries.py @@ -35,12 +35,17 @@ def register_library(self, path_lookup: PathLookup, *libraries: str) -> list[Dep """We delegate to pip to install the library and augment the path look-up to resolve the library at import. This gives us the flexibility to install any library that is not in the allow-list, and we don't have to bother about parsing cross-version dependencies in our code.""" + known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json" if len(libraries) == 0: return [] if len(libraries) == 1: # Multiple libraries might be installation flags - compatibility = self._allow_list.distribution_compatibility(libraries[0]) + library = libraries[0] + compatibility = self._allow_list.distribution_compatibility(library) if compatibility.known: - return compatibility.problems + # TODO: Pass in the line number and column number https://github.com/databrickslabs/ucx/issues/3625 + path = Path(f"{known_url}#{library}") + problems = [DependencyProblem(p.code, p.message, path) for p in compatibility.problems] + return problems return self._install_library(path_lookup, *libraries) @cached_property diff --git a/tests/unit/source_code/linters/test_files.py b/tests/unit/source_code/linters/test_files.py index d7ef38f0a0..e5595e453d 100644 --- a/tests/unit/source_code/linters/test_files.py +++ b/tests/unit/source_code/linters/test_files.py @@ -3,6 +3,8 @@ import pytest from databricks.labs.blueprint.tui import MockPrompts + + from databricks.sdk.service.workspace import Language from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex @@ -19,7 +21,7 @@ from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver -from tests.unit import locate_site_packages, _samples_path +from tests.unit import locate_site_packages def test_file_linter_lints_file() -> None: @@ -155,48 +157,6 @@ def test_notebook_migrator_supported_language_no_diagnostics(mock_path_lookup) - assert not migrator.apply(path) -@pytest.fixture() -def local_code_linter(mock_path_lookup, migration_index): - notebook_loader = NotebookLoader() - file_loader = FileLoader() - folder_loader = FolderLoader(notebook_loader, file_loader) - pip_resolver = PythonLibraryResolver() - session_state = CurrentSessionState() - import_file_resolver = ImportFileResolver(file_loader) - resolver = DependencyResolver( - pip_resolver, - NotebookResolver(NotebookLoader()), - import_file_resolver, - import_file_resolver, - mock_path_lookup, - ) - return LocalCodeLinter( - notebook_loader, - file_loader, - folder_loader, - mock_path_lookup, - session_state, - resolver, - lambda: LinterContext(migration_index), - ) - - -def test_linter_walks_directory(mock_path_lookup, local_code_linter) -> None: - mock_path_lookup.append_path(Path(_samples_path(SourceContainer))) - path = Path(__file__).parent / "../samples" / "simulate-sys-path" - advices = list(local_code_linter.lint_path(path)) - assert not advices - assert len(mock_path_lookup.successfully_resolved_paths) > 10 - - -def test_linter_lints_children_in_context(mock_path_lookup, local_code_linter) -> None: - mock_path_lookup.append_path(Path(_samples_path(SourceContainer))) - path = Path(__file__).parent.parent / "samples" / "parent-child-context" - advices = list(local_code_linter.lint_path(path)) - assert not advices - assert mock_path_lookup.successfully_resolved_paths == {path, Path("parent.py"), Path("child.py")} - - def test_triple_dot_import() -> None: file_resolver = ImportFileResolver(FileLoader()) path_lookup = create_autospec(PathLookup) diff --git a/tests/unit/source_code/linters/test_folders.py b/tests/unit/source_code/linters/test_folders.py new file mode 100644 index 0000000000..51d9f63612 --- /dev/null +++ b/tests/unit/source_code/linters/test_folders.py @@ -0,0 +1,144 @@ +from pathlib import Path + +import pytest + +from databricks.labs.ucx.source_code.base import Advisory, CurrentSessionState, LocatedAdvice +from databricks.labs.ucx.source_code.files import FileLoader, ImportFileResolver +from databricks.labs.ucx.source_code.folders import FolderLoader +from databricks.labs.ucx.source_code.graph import DependencyResolver, SourceContainer +from databricks.labs.ucx.source_code.linters.context import LinterContext +from databricks.labs.ucx.source_code.linters.folders import LocalCodeLinter +from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver +from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver +from tests.unit import _samples_path + + +@pytest.fixture() +def local_code_linter(mock_path_lookup, migration_index): + notebook_loader = NotebookLoader() + file_loader = FileLoader() + folder_loader = FolderLoader(notebook_loader, file_loader) + pip_resolver = PythonLibraryResolver() + session_state = CurrentSessionState() + import_file_resolver = ImportFileResolver(file_loader) + resolver = DependencyResolver( + pip_resolver, + NotebookResolver(NotebookLoader()), + import_file_resolver, + import_file_resolver, + mock_path_lookup, + ) + return LocalCodeLinter( + notebook_loader, + file_loader, + folder_loader, + mock_path_lookup, + session_state, + resolver, + lambda: LinterContext(migration_index), + ) + + +def test_local_code_linter_walks_directory(mock_path_lookup, local_code_linter) -> None: + # TODO remove sample paths and clean up test when the paths is no longer needed + mock_path_lookup.append_path(Path(_samples_path(SourceContainer))) + path = Path(__file__).parent / "../samples" / "simulate-sys-path" + advices = list(local_code_linter.lint_path(path)) + assert len(mock_path_lookup.successfully_resolved_paths) > 10 + assert not advices + + +def test_local_code_linter_lints_children_in_context(mock_path_lookup, local_code_linter) -> None: + # TODO remove sample paths and clean up test when the paths is no longer needed + mock_path_lookup.append_path(Path(_samples_path(SourceContainer))) + path = Path(__file__).parent.parent / "samples" / "parent-child-context" + advices = list(local_code_linter.lint_path(path)) + assert mock_path_lookup.successfully_resolved_paths == {path, Path("parent.py"), Path("child.py")} + assert not advices + + +def test_local_code_linter_lints_import_from_known_list(tmp_path, mock_path_lookup, local_code_linter) -> None: + expected_failures = [ + Advisory( + "jvm-access-in-shared-clusters", "Cannot access Spark Driver JVM on UC Shared Clusters", -1, -1, -1, -1 + ), + Advisory( + "legacy-context-in-shared-clusters", + "sc is not supported on UC Shared Clusters. Rewrite it using spark", + -1, + -1, + -1, + -1, + ), + ] + ucx_known_url = "https:/github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json" + expected_path = Path(f"{ucx_known_url}#pyspark.sql.functions") + expected_located_advices = [LocatedAdvice(failure, expected_path) for failure in expected_failures] + + content = "import pyspark.sql.functions" # Has known issues + path = tmp_path / "file.py" + path.write_text(content) + located_advices = list(local_code_linter.lint_path(path)) + + assert located_advices == expected_located_advices + + +def test_local_code_linter_lints_known_s3fs_problems(local_code_linter, mock_path_lookup) -> None: + known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json" + expected = Advisory( + "direct-filesystem-access", + "S3fs library assumes AWS IAM Instance Profile to work with S3, " + "which is not compatible with Databricks Unity Catalog, that " + "routes access through Storage Credentials.", + -1, + -1, + -1, + -1, + ) + path = mock_path_lookup.resolve(Path("leaf9.py")) + located_advices = list(local_code_linter.lint_path(path)) + assert located_advices == [LocatedAdvice(expected, Path(known_url + "#s3fs"))] + + +S3FS_DEPRECATION_MESSAGE = ( + 'S3fs library assumes AWS IAM Instance Profile to work with ' + 'S3, which is not compatible with Databricks Unity Catalog, ' + 'that routes access through Storage Credentials.' +) + + +@pytest.mark.parametrize( + "source_code, advice", + [ + ("import s3fs", Advisory('direct-filesystem-access', S3FS_DEPRECATION_MESSAGE, -1, -1, -1, -1)), + ("from s3fs import something", Advisory('direct-filesystem-access', S3FS_DEPRECATION_MESSAGE, -1, -1, -1, -1)), + ("import certifi", None), + ("from certifi import core", None), + ("import s3fs, certifi", Advisory('direct-filesystem-access', S3FS_DEPRECATION_MESSAGE, -1, -1, -1, -1)), + ("from certifi import core, s3fs", None), + ( + "def func():\n import s3fs", + Advisory('direct-filesystem-access', S3FS_DEPRECATION_MESSAGE, -1, -1, -1, -1), + ), + ("import s3fs as s", Advisory('direct-filesystem-access', S3FS_DEPRECATION_MESSAGE, -1, -1, -1, -1)), + ( + "from s3fs.subpackage import something", + Advisory('direct-filesystem-access', S3FS_DEPRECATION_MESSAGE, -1, -1, -1, -1), + ), + ("", None), + ], +) +def test_local_code_linter_lints_known_s3fs_problems_from_source_code( + tmp_path, + mock_path_lookup, + local_code_linter, + source_code: str, + advice: Advisory | None, +) -> None: + known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json" + module_name = "s3fs.subpackage" if "subpackage" in source_code else "s3fs" + expected = [LocatedAdvice(advice, Path(f"{known_url}#{module_name}"))] if advice else [] + path = tmp_path / "file.py" + path.write_text(source_code) + located_advices = list(local_code_linter.lint_path(path)) + assert located_advices == expected diff --git a/tests/unit/source_code/samples/python_builtins.py b/tests/unit/source_code/samples/standard_libraries.py similarity index 100% rename from tests/unit/source_code/samples/python_builtins.py rename to tests/unit/source_code/samples/standard_libraries.py diff --git a/tests/unit/source_code/test_dependencies.py b/tests/unit/source_code/test_dependencies.py index da69ffbd65..35fe7968b0 100644 --- a/tests/unit/source_code/test_dependencies.py +++ b/tests/unit/source_code/test_dependencies.py @@ -109,9 +109,9 @@ def test_dependency_resolver_visits_file_dependencies(simple_dependency_resolver assert maybe.graph.all_relative_names() == {"root5.py", "leaf4.py"} -def test_dependency_resolver_skips_builtin_dependencies(simple_dependency_resolver) -> None: +def test_dependency_resolver_skips_standard_library_dependencies(simple_dependency_resolver) -> None: maybe = simple_dependency_resolver.build_local_file_dependency_graph( - Path("python_builtins.py"), CurrentSessionState() + Path("standard_libraries.py"), CurrentSessionState() ) assert not maybe.failed graph = maybe.graph @@ -123,7 +123,7 @@ def test_dependency_resolver_skips_builtin_dependencies(simple_dependency_resolv def test_dependency_resolver_ignores_known_dependencies(simple_dependency_resolver) -> None: maybe = simple_dependency_resolver.build_local_file_dependency_graph( - Path("python_builtins.py"), CurrentSessionState() + Path("standard_libraries.py"), CurrentSessionState() ) assert maybe.graph graph = maybe.graph diff --git a/tests/unit/source_code/test_files.py b/tests/unit/source_code/test_files.py index 71c24e8790..cccfbfed3a 100644 --- a/tests/unit/source_code/test_files.py +++ b/tests/unit/source_code/test_files.py @@ -7,7 +7,8 @@ from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyProblem, StubContainer -from databricks.labs.ucx.source_code.files import FileLoader, LocalFile +from databricks.labs.ucx.source_code.files import FileLoader, ImportFileResolver, LocalFile +from databricks.labs.ucx.source_code.known import KnownDependency from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -269,3 +270,41 @@ def test_file_loader_ignores_path_by_loading_it_as_stub_container(tmp_path) -> N assert isinstance(stub, StubContainer) path_lookup.resolve.assert_called_once_with(path) + + +def test_import_resolver_resolves_import_from_known_list_without_problems() -> None: + import_file_resolver = ImportFileResolver(FileLoader()) + path_lookup = create_autospec(PathLookup) + + maybe_dependency = import_file_resolver.resolve_import(path_lookup, "aiohttp") + assert not maybe_dependency.problems + assert isinstance(maybe_dependency.dependency, KnownDependency) + assert not maybe_dependency.dependency.problems + path_lookup.resolve.assert_not_called() + + # Regression checks for KnownContainer to not yield the known problems + # The known problems should be surfaced during linting + graph = create_autospec(DependencyGraph) + container = maybe_dependency.dependency.load(path_lookup) + assert container + assert not container.build_dependency_graph(graph) + graph.assert_not_called() + + +def test_import_resolver_resolves_import_from_known_list_with_problems() -> None: + import_file_resolver = ImportFileResolver(FileLoader()) + path_lookup = create_autospec(PathLookup) + + maybe_dependency = import_file_resolver.resolve_import(path_lookup, "pyspark.sql.functions") + assert not maybe_dependency.problems # Problems are not surfaced during import resolving + assert isinstance(maybe_dependency.dependency, KnownDependency) + assert maybe_dependency.dependency.problems + path_lookup.resolve.assert_not_called() + + # Regression checks for KnownContainer to not yield the known problems + # The known problems should be surfaced during linting + graph = create_autospec(DependencyGraph) + container = maybe_dependency.dependency.load(path_lookup) + assert container + assert not container.build_dependency_graph(graph) + graph.assert_not_called() diff --git a/tests/unit/source_code/test_graph.py b/tests/unit/source_code/test_graph.py index e73422e821..3ebeaf6e1d 100644 --- a/tests/unit/source_code/test_graph.py +++ b/tests/unit/source_code/test_graph.py @@ -9,6 +9,7 @@ Dependency, DependencyGraph, DependencyProblem, + MaybeDependency, InheritedContext, ) from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader @@ -201,3 +202,15 @@ def test_dependency_problem_as_located_advice_has_path_missing_by_default() -> N problem = DependencyProblem("code", "message") located_advice = problem.as_located_advice() assert located_advice.has_missing_path() + + +def test_maybe_dependency_raises_value_error_if_no_dependency_nor_problems() -> None: + with pytest.raises(ValueError, match="Dependency or problems should be given: *"): + MaybeDependency(None, []) + + +def test_maybe_dependency_raises_value_error_if_dependency_and_problems() -> None: + dependency = Dependency(FileLoader(), Path("test")) + problem = DependencyProblem("code", "message") + with pytest.raises(ValueError, match="Dependency and problems should not be both given: *"): + MaybeDependency(dependency, [problem]) diff --git a/tests/unit/source_code/test_known.py b/tests/unit/source_code/test_known.py index ac9b1d28d9..bb866dcece 100644 --- a/tests/unit/source_code/test_known.py +++ b/tests/unit/source_code/test_known.py @@ -2,10 +2,14 @@ from pathlib import Path from typing import cast from unittest import mock +from unittest.mock import create_autospec import pytest -from databricks.labs.ucx.source_code.known import KnownList +from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.graph import DependencyGraph + +from databricks.labs.ucx.source_code.known import KnownList, KnownDependency, KnownLoader, KnownProblem from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -95,3 +99,23 @@ def analyze_cachetools_dist_info(cls): return TestKnownList.analyze_cachetools_dist_info() + + +@pytest.mark.parametrize("problems", [[], [KnownProblem("test", "test")]]) +def test_known_loader_loads_known_container_without_problems( + simple_dependency_resolver, problems: list[KnownProblem] +) -> None: + """The known problems are surfaced during linting not dependency graph building.""" + path_lookup = create_autospec(PathLookup) + loader = KnownLoader() + dependency = KnownDependency("test", problems) + graph = DependencyGraph(dependency, None, simple_dependency_resolver, path_lookup, CurrentSessionState()) + container = loader.load_dependency(path_lookup, dependency) + assert not container.build_dependency_graph(graph) + path_lookup.resolve.assert_not_called() + + +@pytest.mark.parametrize("problems", [[], [KnownProblem("test", "test")]]) +def test_known_dependency_has_problems(problems: list[KnownProblem]) -> None: + dependency = KnownDependency("test", problems) + assert dependency.problems == problems diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index 31eb78cde2..db12d80ae0 100644 --- a/tests/unit/source_code/test_notebook.py +++ b/tests/unit/source_code/test_notebook.py @@ -227,6 +227,7 @@ def test_notebook_builds_python_dependency_graph(mock_path_lookup) -> None: def test_notebook_builds_python_dependency_graph_with_loop(mock_path_lookup) -> None: path = "run_notebooks.py" + expected_path = {mock_path_lookup.cwd / path for path in (path, "leaf1.py", "leaf2.py", "leaf3.py")} resolver = dependency_resolver(mock_path_lookup) maybe = resolver.resolve_notebook(mock_path_lookup, Path(path), False) assert maybe.dependency is not None @@ -234,13 +235,13 @@ def test_notebook_builds_python_dependency_graph_with_loop(mock_path_lookup) -> container = maybe.dependency.load(mock_path_lookup) assert container is not None container.build_dependency_graph(graph) - expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"] all_paths = set(d.path for d in graph.all_dependencies) - assert all_paths == {mock_path_lookup.cwd / path for path in expected_paths} + assert not expected_path - all_paths def test_notebook_builds_python_dependency_graph_with_fstring_loop(mock_path_lookup) -> None: path = "run_notebooks_with_fstring.py" + expected_path = {mock_path_lookup.cwd / path for path in (path, "leaf1.py", "leaf2.py", "leaf3.py")} resolver = dependency_resolver(mock_path_lookup) maybe = resolver.resolve_notebook(mock_path_lookup, Path(path), False) assert maybe.dependency is not None @@ -248,9 +249,8 @@ def test_notebook_builds_python_dependency_graph_with_fstring_loop(mock_path_loo container = maybe.dependency.load(mock_path_lookup) assert container is not None container.build_dependency_graph(graph) - expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"] all_paths = set(d.path for d in graph.all_dependencies) - assert all_paths == {mock_path_lookup.cwd / path for path in expected_paths} + assert not expected_path - all_paths def test_detects_multiple_calls_to_dbutils_notebook_run_in_python_code() -> None: diff --git a/tests/unit/source_code/test_path_lookup_simulation.py b/tests/unit/source_code/test_path_lookup_simulation.py index b042a1efcd..c5764effd3 100644 --- a/tests/unit/source_code/test_path_lookup_simulation.py +++ b/tests/unit/source_code/test_path_lookup_simulation.py @@ -32,9 +32,9 @@ ], 3, ), - (["simulate-sys-path", "via-sys-path", "run_notebook_1.py"], 1), - (["simulate-sys-path", "via-sys-path", "run_notebook_2.py"], 1), - (["simulate-sys-path", "via-sys-path", "run_notebook_4.py"], 2), + (["simulate-sys-path", "via-sys-path", "run_notebook_1.py"], 2), + (["simulate-sys-path", "via-sys-path", "run_notebook_2.py"], 2), + (["simulate-sys-path", "via-sys-path", "run_notebook_4.py"], 3), ], ) def test_locates_notebooks(source: list[str], expected: int, mock_path_lookup) -> None: @@ -60,8 +60,8 @@ def test_locates_notebooks(source: list[str], expected: int, mock_path_lookup) - "source, expected", [ (["simulate-sys-path", "siblings", "sibling1_file.py"], 2), - (["simulate-sys-path", "via-sys-path", "import_file_1.py"], 2), - (["simulate-sys-path", "via-sys-path", "import_file_2.py"], 2), + (["simulate-sys-path", "via-sys-path", "import_file_1.py"], 3), + (["simulate-sys-path", "via-sys-path", "import_file_2.py"], 3), ], ) def test_locates_files(source: list[str], expected: int) -> None: @@ -117,7 +117,7 @@ def test_locates_notebooks_with_absolute_path() -> None: assert not maybe.problems assert maybe.graph is not None all_paths = [d.path for d in maybe.graph.all_dependencies] - assert len(all_paths) == 2 + assert len(all_paths) == 3 def test_locates_files_with_absolute_path() -> None: diff --git a/tests/unit/source_code/test_s3fs.py b/tests/unit/source_code/test_s3fs.py index 86575a2ba7..e69de29bb2 100644 --- a/tests/unit/source_code/test_s3fs.py +++ b/tests/unit/source_code/test_s3fs.py @@ -1,162 +0,0 @@ -import dataclasses -from pathlib import Path - -import pytest - -from databricks.labs.ucx.source_code.base import CurrentSessionState -from databricks.labs.ucx.source_code.graph import ( - DependencyResolver, - DependencyProblem, -) -from databricks.labs.ucx.source_code.files import FileLoader, ImportFileResolver -from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver -from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver - -S3FS_DEPRECATION_MESSAGE = ( - 'S3fs library assumes AWS IAM Instance Profile to work with ' - 'S3, which is not compatible with Databricks Unity Catalog, ' - 'that routes access through Storage Credentials.' -) - - -@pytest.mark.parametrize( - "source, expected", - [ - ( - "import s3fs", - [ - DependencyProblem( - code='direct-filesystem-access', - message=S3FS_DEPRECATION_MESSAGE, - source_path=Path('path.py'), - start_line=0, - start_col=0, - end_line=0, - end_col=11, - ) - ], - ), - ( - "from s3fs import something", - [ - DependencyProblem( - code='direct-filesystem-access', - message=S3FS_DEPRECATION_MESSAGE, - source_path=Path('path.py'), - start_line=0, - start_col=0, - end_line=0, - end_col=26, - ) - ], - ), - ("import certifi", []), - ("from certifi import core", []), - ( - "import s3fs, certifi", - [ - DependencyProblem( - code='direct-filesystem-access', - message=S3FS_DEPRECATION_MESSAGE, - source_path=Path('path.py'), - start_line=0, - start_col=0, - end_line=0, - end_col=20, - ) - ], - ), - ("from certifi import core, s3fs", []), - ( - "def func():\n import s3fs", - [ - DependencyProblem( - code='direct-filesystem-access', - message=S3FS_DEPRECATION_MESSAGE, - source_path=Path('path.py'), - start_line=1, - start_col=4, - end_line=1, - end_col=15, - ) - ], - ), - ( - "import s3fs as s", - [ - DependencyProblem( - code='direct-filesystem-access', - message=S3FS_DEPRECATION_MESSAGE, - source_path=Path('path.py'), - start_line=0, - start_col=0, - end_line=0, - end_col=16, - ) - ], - ), - ( - "from s3fs.subpackage import something", - [ - DependencyProblem( - code='direct-filesystem-access', - message=S3FS_DEPRECATION_MESSAGE, - source_path=Path('path.py'), - start_line=0, - start_col=0, - end_line=0, - end_col=37, - ) - ], - ), - ("", []), - ], -) -def test_detect_s3fs_import( - empty_index, source: str, expected: list[DependencyProblem], tmp_path, mock_path_lookup -) -> None: - sample = tmp_path / "test_detect_s3fs_import.py" - sample.write_text(source) - mock_path_lookup.append_path(tmp_path) - notebook_loader = NotebookLoader() - file_loader = FileLoader() - notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(file_loader) - pip_resolver = PythonLibraryResolver() - dependency_resolver = DependencyResolver( - pip_resolver, notebook_resolver, import_resolver, import_resolver, mock_path_lookup - ) - maybe = dependency_resolver.build_local_file_dependency_graph(sample, CurrentSessionState()) - assert maybe.problems == [dataclasses.replace(_, source_path=sample) for _ in expected] - - -@pytest.mark.parametrize( - "expected", - ( - [ - DependencyProblem( - code='direct-filesystem-access', - message='S3fs library assumes AWS IAM Instance Profile to work with ' - 'S3, which is not compatible with Databricks Unity Catalog, ' - 'that routes access through Storage Credentials.', - source_path=Path('leaf9.py'), - start_line=0, - start_col=0, - end_line=0, - end_col=12, - ), - ], - ), -) -def test_detect_s3fs_import_in_dependencies( - empty_index, expected: list[DependencyProblem], mock_path_lookup, mock_notebook_resolver -) -> None: - file_loader = FileLoader() - import_resolver = ImportFileResolver(file_loader) - pip_resolver = PythonLibraryResolver() - dependency_resolver = DependencyResolver( - pip_resolver, mock_notebook_resolver, import_resolver, import_resolver, mock_path_lookup - ) - sample = mock_path_lookup.cwd / "root9.py" - maybe = dependency_resolver.build_local_file_dependency_graph(sample, CurrentSessionState()) - assert maybe.problems == expected