From 74413f981e3022424f70bf82ec17e5e11be87239 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 13 Feb 2025 14:12:11 +0100 Subject: [PATCH 01/33] Introduce PythonFixer --- .../labs/ucx/source_code/linters/base.py | 22 ++++++++ tests/unit/source_code/linters/test_base.py | 56 +++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 tests/unit/source_code/linters/test_base.py diff --git a/src/databricks/labs/ucx/source_code/linters/base.py b/src/databricks/labs/ucx/source_code/linters/base.py index 0841648bad..07ef0cdcba 100644 --- a/src/databricks/labs/ucx/source_code/linters/base.py +++ b/src/databricks/labs/ucx/source_code/linters/base.py @@ -114,6 +114,28 @@ def lint(self, code: str) -> Iterable[Advice]: def lint_tree(self, tree: Tree) -> Iterable[Advice]: ... +class PythonFixer(Fixer): + """Fix python source code.""" + + def apply(self, code: str) -> str: + """Apply the changes to Python source code. + + The source code is parsed into an AST tree, and the fixes are applied + to the tree. + """ + maybe_tree = MaybeTree.from_source_code(code) + if maybe_tree.failure: + # Fixing does not yield parse failures, linting does + logger.warning(f"Parsing source code resulted in failure `{maybe_tree.failure}`: {code}") + return code + assert maybe_tree.tree is not None + return self.apply_tree(maybe_tree.tree) + + @abstractmethod + def apply_tree(self, tree: Tree) -> str: + """Apply the fixes to the AST tree.""" + + class DfsaPyCollector(DfsaCollector, ABC): def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]: diff --git a/tests/unit/source_code/linters/test_base.py b/tests/unit/source_code/linters/test_base.py new file mode 100644 index 0000000000..e7f65b5455 --- /dev/null +++ b/tests/unit/source_code/linters/test_base.py @@ -0,0 +1,56 @@ +from databricks.labs.ucx.source_code.linters.base import PythonFixer +from databricks.labs.ucx.source_code.python.python_ast import Tree + + +def test_python_fixer_has_dummy_code() -> None: + class _PythonFixer(PythonFixer): + + @property + def diagnostic_code(self) -> str: + """Dummy diagnostic code""" + return "dummy" + + def apply_tree(self, tree: Tree) -> str: + """Dummy fixer""" + _ = tree + return "" + + fixer = _PythonFixer() + assert fixer.diagnostic_code == "dummy" + + +def test_python_fixer_applies_valid_python() -> None: + class _PythonFixer(PythonFixer): + + @property + def diagnostic_code(self) -> str: + """Dummy diagnostic code""" + return "dummy" + + def apply_tree(self, tree: Tree) -> str: + """Dummy fixer""" + _ = tree + return "fixed" + + fixer = _PythonFixer() + assert fixer.diagnostic_code == "dummy" + assert fixer.apply("print(1)") == "fixed" + + +def test_python_fixer_applies_invalid_python() -> None: + """Cannot fix invalid Python, thus nothing should happen""" + + class _PythonFixer(PythonFixer): + + @property + def diagnostic_code(self) -> str: + """Dummy diagnostic code""" + return "dummy" + + def apply_tree(self, tree: Tree) -> str: + """Dummy fixer""" + _ = tree + return "fixed" + + fixer = _PythonFixer() + assert fixer.apply("print(1") == "print(1" # closing parenthesis is missing on purpose From 4ddbc5197b53f9f2d65340bfba09ea03d3031714 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 13 Feb 2025 14:51:10 +0100 Subject: [PATCH 02/33] Change Notebook.parse to classmethod --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py 
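[Editorial note: a minimal, hypothetical sketch of a concrete fixer built on the `PythonFixer` base class introduced in PATCH 01 above. `RenameFooFixer` and the `foo`/`bar` names are invented for illustration and are not part of the patch series; at this point in the series `apply_tree` still returns a string (a later patch changes it to return the `Tree` so fixers can be chained).]

```python
# Illustrative sketch only; not part of the patches.
from astroid import Call, Name  # astroid nodes, as used elsewhere in ucx

from databricks.labs.ucx.source_code.linters.base import PythonFixer
from databricks.labs.ucx.source_code.python.python_ast import Tree


class RenameFooFixer(PythonFixer):
    """Hypothetical fixer: rewrite calls of `foo(...)` to `bar(...)`."""

    @property
    def diagnostic_code(self) -> str:
        return "rename-foo"  # invented diagnostic code

    def apply_tree(self, tree: Tree) -> str:
        for node in tree.walk():
            if isinstance(node, Call) and isinstance(node.func, Name) and node.func.name == "foo":
                node.func.name = "bar"  # mutate the AST in place
        return tree.node.as_string()  # unparse the fixed tree back to source


# The inherited `apply` parses the source, delegates to `apply_tree`, and
# returns the code unchanged when parsing fails.
fixed = RenameFooFixer().apply("foo(1)\n")
```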
b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 5e3de710b0..6153cdf411 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -24,13 +24,13 @@ class Notebook(SourceContainer): - @staticmethod - def parse(path: Path, source: str, default_language: Language) -> Notebook: + @classmethod + def parse(cls, path: Path, source: str, default_language: Language) -> Notebook: default_cell_language = CellLanguage.of_language(default_language) cells = default_cell_language.extract_cells(source) if cells is None: raise ValueError(f"Could not parse Notebook: {path}") - return Notebook(path, source, default_language, cells, source.endswith('\n')) + return cls(path, source, default_language, cells, source.endswith('\n')) def __init__(self, path: Path, source: str, language: Language, cells: list[Cell], ends_with_lf: bool): self._path = path From 926de3b3b5a7201a5253e59d6dc62f55f8ba7001 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 13 Feb 2025 14:51:37 +0100 Subject: [PATCH 03/33] Rename Notebook._source to ._original_code like LocalFile --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 6153cdf411..a1c93d63f0 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -24,6 +24,10 @@ class Notebook(SourceContainer): + TODO: + Let `Notebook` inherit from `LocalFile` + """ + @classmethod def parse(cls, path: Path, source: str, default_language: Language) -> Notebook: default_cell_language = CellLanguage.of_language(default_language) @@ -34,7 +38,7 @@ def parse(cls, path: Path, source: str, default_language: Language) -> Notebook: def __init__(self, path: Path, source: str, language: Language, cells: list[Cell], ends_with_lf: bool): self._path = path - self._source = source + self._original_code = source self._language = language self._cells = cells self._ends_with_lf = ends_with_lf @@ -49,7 +53,7 @@ def cells(self) -> list[Cell]: @property def original_code(self) -> str: - return self._source + return self._original_code def to_migrated_code(self) -> str: default_language = CellLanguage.of_language(self._language) From b6f2746c379d50bf98f6e0c9f4f86850c6ff4405 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 13 Feb 2025 14:52:20 +0100 Subject: [PATCH 04/33] Remove NotebookMigrator --- .../labs/ucx/source_code/linters/files.py | 34 ------------------- tests/unit/source_code/linters/test_files.py | 16 +-------- 2 files changed, 1 insertion(+), 49 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/files.py b/src/databricks/labs/ucx/source_code/linters/files.py index 77a023ac60..86097d41f0 100644 --- a/src/databricks/labs/ucx/source_code/linters/files.py +++ b/src/databricks/labs/ucx/source_code/linters/files.py @@ -277,37 +277,3 @@ def __init__(self, languages: LinterContext): # TODO: move languages to `apply` self._languages = languages - def revert(self, path: Path) -> bool: - backup_path = path.with_suffix(".bak") - if not backup_path.exists(): - return False - return path.write_text(backup_path.read_text()) > 0 - - def apply(self, path: Path) -> bool: - if not path.exists(): - return False - dependency = Dependency(NotebookLoader(), path) - # TODO: the interface for this method has to be changed - 
lookup = PathLookup.from_sys_path(Path.cwd()) - container = dependency.load(lookup) - assert isinstance(container, Notebook) - return self._apply(container) - - def _apply(self, notebook: Notebook) -> bool: - changed = False - for cell in notebook.cells: - # %run is not a supported language, so this needs to come first - if isinstance(cell, RunCell): - # TODO migration data, see https://github.com/databrickslabs/ucx/issues/1327 - continue - if not self._languages.is_supported(cell.language.language): - continue - migrated_code = self._languages.apply_fixes(cell.language.language, cell.original_code) - if migrated_code != cell.original_code: - cell.migrated_code = migrated_code - changed = True - if changed: - # TODO https://github.com/databrickslabs/ucx/issues/1327 store 'migrated' status - notebook.path.replace(notebook.path.with_suffix(".bak")) - notebook.path.write_text(notebook.to_migrated_code()) - return changed diff --git a/tests/unit/source_code/linters/test_files.py b/tests/unit/source_code/linters/test_files.py index eaa781247e..e61ea35914 100644 --- a/tests/unit/source_code/linters/test_files.py +++ b/tests/unit/source_code/linters/test_files.py @@ -13,7 +13,7 @@ from databricks.labs.ucx.source_code.graph import Dependency, DependencyResolver, SourceContainer from databricks.labs.ucx.source_code.linters.base import PythonLinter from databricks.labs.ucx.source_code.linters.context import LinterContext -from databricks.labs.ucx.source_code.linters.files import FileLinter, NotebookLinter, NotebookMigrator +from databricks.labs.ucx.source_code.linters.files import FileLinter, NotebookLinter from databricks.labs.ucx.source_code.linters.folders import LocalCodeLinter from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver from databricks.labs.ucx.source_code.notebooks.sources import Notebook @@ -142,20 +142,6 @@ def test_file_linter_applies_migrated(tmp_path, mock_path_lookup, migration_inde assert path.read_text().rstrip() == "df = spark.read.table('brand.new.stuff')" -def test_notebook_migrator_ignores_unsupported_extensions() -> None: - languages = LinterContext(TableMigrationIndex([])) - migrator = NotebookMigrator(languages) - path = Path('unsupported.ext') - assert not migrator.apply(path) - - -def test_notebook_migrator_supported_language_no_diagnostics(mock_path_lookup) -> None: - languages = LinterContext(TableMigrationIndex([])) - migrator = NotebookMigrator(languages) - path = mock_path_lookup.resolve(Path("root1.run.py")) - assert not migrator.apply(path) - - def test_triple_dot_import() -> None: file_resolver = ImportFileResolver(FileLoader()) path_lookup = create_autospec(PathLookup) From 3a060309904d00b71c855a8137596d0cd401ce06 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 13 Feb 2025 14:52:53 +0100 Subject: [PATCH 05/33] Rename migrated_code similar to LocalFile --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 5 ++++- tests/unit/source_code/test_notebook.py | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index a1c93d63f0..8602f45114 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -23,6 +23,7 @@ class Notebook(SourceContainer): + """ TODO: Let `Notebook` inherit from `LocalFile` @@ -55,7 +56,9 @@ def cells(self) -> list[Cell]: def original_code(self) -> str: return self._original_code - 
def to_migrated_code(self) -> str: + @property + def migrated_code(self) -> str: + """Format the migrated code by chaining the migrated cells.""" default_language = CellLanguage.of_language(self._language) header = f"{default_language.comment_prefix} {NOTEBOOK_HEADER}" sources = [header] diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index db12d80ae0..7fdb296c02 100644 --- a/tests/unit/source_code/test_notebook.py +++ b/tests/unit/source_code/test_notebook.py @@ -101,9 +101,8 @@ def test_notebook_rebuilds_same_code(source: tuple[str, Language, list[str]]) -> assert len(sources) == 1 notebook = Notebook.parse(Path(path), sources[0], source[1]) assert notebook is not None - new_source = notebook.to_migrated_code() # ignore trailing whitespaces - actual_purified = re.sub(r'\s+$', '', new_source, flags=re.MULTILINE) + actual_purified = re.sub(r'\s+$', '', notebook.migrated_code, flags=re.MULTILINE) expected_purified = re.sub(r'\s+$', '', sources[0], flags=re.MULTILINE) assert actual_purified == expected_purified From e9bdcd7b59333a438aeb1976042775ac7f978de3 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 13 Feb 2025 14:53:16 +0100 Subject: [PATCH 06/33] Add backup method to Notebook like LocalFile --- .../labs/ucx/source_code/notebooks/sources.py | 34 ++++++ .../source_code/notebooks/test_sources.py | 100 ++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 tests/unit/source_code/notebooks/test_sources.py diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 8602f45114..fac7c012fd 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -6,6 +6,7 @@ from databricks.sdk.service.workspace import Language +from databricks.labs.ucx.source_code.base import back_up_path, safe_write_text, revert_back_up_path from databricks.labs.ucx.source_code.graph import ( SourceContainer, DependencyGraph, @@ -75,6 +76,39 @@ def migrated_code(self) -> str: sources.append('') # following join will append lf return '\n'.join(sources) + def _safe_write_text(self, contents: str) -> int | None: + """Write content to the local file.""" + return safe_write_text(self._path, contents) + + def _back_up_path(self) -> Path | None: + """Back up the original file.""" + return back_up_path(self._path) + + def back_up_original_and_flush_migrated_code(self) -> int | None: + """Back up the original notebook and flush the migrated code to the file. + + This is a single method to avoid overwriting the original file without a backup. + + Returns : + int : The number of characters written. If None, nothing is written to the file. + + TODO: + Let `Notebook` inherit from `LocalFile` and reuse implementation of + `back_up_original_and_flush_migrated_code`. 
+ """ + if self.original_code == self.migrated_code: + # Avoiding unnecessary back up and flush + return len(self.migrated_code) + backed_up_path = self._back_up_path() + if not backed_up_path: + # Failed to back up the original file, avoid overwriting existing file + return None + number_of_characters_written = self._safe_write_text(self.migrated_code) + if number_of_characters_written is None: + # Failed to overwrite original file, clean up by reverting backup + revert_back_up_path(self._path) + return number_of_characters_written + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: """Check for any problems with dependencies of the cells in this notebook. diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py new file mode 100644 index 0000000000..4087f2430a --- /dev/null +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -0,0 +1,100 @@ +from databricks.sdk.service.workspace import Language + +from databricks.labs.ucx.source_code.notebooks.sources import Notebook + + +def test_notebook_flush_migrated_code(tmp_path) -> None: + """Happy path of flushing and backing up fixed code""" + source_code = "# Databricks notebook source\nprint(1)" + migrated_code = "# Databricks notebook source\nprint(2)" + + path = tmp_path / "test.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + notebook.cells[0].migrated_code = "print(2)" + + number_of_characters_written = notebook.back_up_original_and_flush_migrated_code() + + assert number_of_characters_written == len(migrated_code) + assert notebook.original_code == source_code + assert notebook.migrated_code == migrated_code + assert path.with_suffix(".py.bak").read_text() == source_code + assert path.read_text() == migrated_code + + +def test_notebook_flush_migrated_code_with_empty_cell_contents(tmp_path) -> None: + """Verify the method handles a cell without contents.""" + source_code = "# Databricks notebook source\nprint(1)" + migrated_code = "# Databricks notebook source\n" + + path = tmp_path / "test.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + notebook.cells[0].migrated_code = "" + + number_of_characters_written = notebook.back_up_original_and_flush_migrated_code() + + assert number_of_characters_written == len(migrated_code) + assert notebook.original_code == source_code + assert notebook.migrated_code == migrated_code + assert path.with_suffix(".py.bak").read_text() == source_code + assert path.read_text() == migrated_code + + +def test_notebook_flush_non_migrated_code(tmp_path) -> None: + """No-op in case the code is not migrated""" + source_code = "# Databricks notebook source\nprint(1)" + + path = tmp_path / "test.py" + path.write_text(source_code) + notebook = Notebook.parse(path, "print(1)", Language.PYTHON) + + number_of_characters_written = notebook.back_up_original_and_flush_migrated_code() + + assert number_of_characters_written == len(source_code) + assert notebook.original_code == source_code + assert notebook.migrated_code == source_code + assert not path.with_suffix(".py.bak").is_file() + assert path.read_text() == source_code + + +def test_notebook_does_not_flush_migrated_code_when_backup_fails(tmp_path) -> None: + """If backup fails the method should not flush the migrated code""" + + class _Notebook(Notebook): + def _back_up_path(self) -> None: + # Simulate an error, back_up_path handles the error, no return signals an error + 
pass + + source_code = "# Databricks notebook source\nprint(1)" + path = tmp_path / "test.py" + path.write_text(source_code) + notebook = _Notebook.parse(path, source_code, Language.PYTHON) + notebook.cells[0].migrated_code = "print(2)" + + number_of_characters_written = notebook.back_up_original_and_flush_migrated_code() + + assert number_of_characters_written is None + assert not path.with_suffix(".py.bak").is_file() + assert path.read_text() == source_code + + +def test_notebook_flush_migrated_code_with_error(tmp_path) -> None: + """If flush fails, the method should revert the backup""" + + class _Notebook(Notebook): + def _safe_write_text(self, contents: str) -> None: + # Simulate an error, safe_write_text handles the error, no returns signals an error + _ = contents + + source_code = "# Databricks notebook source\nprint(1)" + path = tmp_path / "test.py" + path.write_text(source_code) + notebook = _Notebook.parse(path, source_code, Language.PYTHON) + notebook.cells[0].migrated_code = "print(2)" + + number_of_characters_written = notebook.back_up_original_and_flush_migrated_code() + + assert number_of_characters_written is None + assert not path.with_suffix(".py.bak").is_file() + assert path.read_text() == source_code From e29a5c1358a70eb413f2293be628093873b0b4ac Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 13 Feb 2025 14:54:24 +0100 Subject: [PATCH 07/33] Introduce migrating notebook --- .../labs/ucx/source_code/linters/files.py | 45 +++++++++++++++---- tests/unit/source_code/linters/test_files.py | 15 +++++++ 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/files.py b/src/databricks/labs/ucx/source_code/linters/files.py index 86097d41f0..294d1104dd 100644 --- a/src/databricks/labs/ucx/source_code/linters/files.py +++ b/src/databricks/labs/ucx/source_code/linters/files.py @@ -17,7 +17,7 @@ from databricks.labs.ucx.source_code.files import LocalFile from databricks.labs.ucx.source_code.graph import Dependency from databricks.labs.ucx.source_code.known import KnownDependency -from databricks.labs.ucx.source_code.linters.base import PythonLinter +from databricks.labs.ucx.source_code.linters.base import PythonFixer, PythonLinter from databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.linters.imports import SysPathChange, UnresolvedPath from databricks.labs.ucx.source_code.notebooks.cells import ( @@ -26,7 +26,6 @@ RunCell, RunCommand, ) -from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader from databricks.labs.ucx.source_code.notebooks.magic import MagicLine from databricks.labs.ucx.source_code.notebooks.sources import Notebook from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -76,6 +75,36 @@ def lint(self) -> Iterable[Advice]: ) return + def apply(self) -> None: + """Apply changes to the notebook.""" + maybe_tree = self._parse_notebook(self._notebook, parent_tree=self._parent_tree) + if maybe_tree and maybe_tree.failure: + logger.warning("Failed to parse the notebook, run linter for more details.") + return + for cell in self._notebook.cells: + try: + linter = self._context.linter(cell.language.language) + except ValueError: # Language is not supported (yet) + continue + if isinstance(cell, PythonCell): + linter = cast(PythonLinter, linter) + tree = self._python_tree_cache[(self._notebook.path, cell)] + advices = linter.lint_tree(tree) + else: + advices = linter.lint(cell.original_code) + for advice in advices: + fixer = 
self._context.fixer(cell.language.language, advice.code) + if fixer is None: + continue + if isinstance(cell, PythonCell): + fixer = cast(PythonFixer, fixer) + fixed_code = fixer.apply_tree(tree) + else: + fixed_code = fixer.apply(cell.original_code) + cell.migrated_code = fixed_code + self._notebook.back_up_original_and_flush_migrated_code() + return + def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree) -> MaybeTree | None: """Parse a notebook by parsing its cells. @@ -264,6 +293,8 @@ def apply(self) -> None: source_container = self._dependency.load(self._path_lookup) if isinstance(source_container, LocalFile): self._apply_file(source_container) + elif isinstance(source_container, Notebook): + self._apply_notebook(source_container) def _apply_file(self, local_file: LocalFile) -> None: """Apply changes to a local file.""" @@ -271,9 +302,7 @@ def _apply_file(self, local_file: LocalFile) -> None: local_file.migrated_code = fixed_code local_file.back_up_original_and_flush_migrated_code() - -class NotebookMigrator: - def __init__(self, languages: LinterContext): - # TODO: move languages to `apply` - self._languages = languages - + def _apply_notebook(self, notebook: Notebook) -> None: + """Apply changes to a notebook.""" + notebook_linter = NotebookLinter(notebook, self._path_lookup, self._context, self._inherited_tree) + notebook_linter.apply() diff --git a/tests/unit/source_code/linters/test_files.py b/tests/unit/source_code/linters/test_files.py index e61ea35914..7817b47a0f 100644 --- a/tests/unit/source_code/linters/test_files.py +++ b/tests/unit/source_code/linters/test_files.py @@ -252,6 +252,21 @@ def test_notebook_linter_lints_source_yielding_parse_failure(migration_index, mo ] +def test_notebook_linter_applies_migrated_table_with_rename(tmp_path, migration_index, mock_path_lookup) -> None: + """The table should be migrated to the new table name as defined in the migration index.""" + path = tmp_path / "notebook.py" + source = "# Databricks notebook source\nspark.table('old.things')" + path.write_text(source) + notebook = Notebook.parse(path, source, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) + + linter.apply() + + contents = path.read_text() + assert "old.things" not in contents + assert "brand.new.stuff" in contents + + @pytest.mark.xfail(reason="https://github.com/databrickslabs/ucx/issues/3556") def test_notebook_linter_lints_source_yielding_parse_failures(migration_index, mock_path_lookup) -> None: source = """ From 4ca2e0f2160113f294e02067a74ac6d6270dc8cd Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 13 Feb 2025 17:35:32 +0100 Subject: [PATCH 08/33] Let apply tree return Tree Explain why the PythonFixer works like this --- src/databricks/labs/ucx/source_code/linters/base.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/base.py b/src/databricks/labs/ucx/source_code/linters/base.py index 07ef0cdcba..28bbfa1872 100644 --- a/src/databricks/labs/ucx/source_code/linters/base.py +++ b/src/databricks/labs/ucx/source_code/linters/base.py @@ -129,11 +129,18 @@ def apply(self, code: str) -> str: logger.warning(f"Parsing source code resulted in failure `{maybe_tree.failure}`: {code}") return code assert maybe_tree.tree is not None - return self.apply_tree(maybe_tree.tree) + tree = self.apply_tree(maybe_tree.tree) + return tree.node.as_string() @abstractmethod - def apply_tree(self, tree: Tree) -> str: - """Apply the fixes to the 
AST tree.""" + def apply_tree(self, tree: Tree) -> Tree: + """Apply the fixes to the AST tree. + + For Python, the fixes are applied to a Tree so that we + - Can chain multiple fixers without transpiling back and forth between + source code and AST tree + - Can extend the tree with (brought into scope) nodes, e.g. to add imports + """ class DfsaPyCollector(DfsaCollector, ABC): From 9cb3a6b853b274ffa806388ea370dd636087f6eb Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 13 Feb 2025 17:38:27 +0100 Subject: [PATCH 09/33] Rewrite python fixer applies to apply_tree --- .../labs/ucx/source_code/linters/pyspark.py | 31 ++++++------------- tests/unit/source_code/linters/test_base.py | 15 ++++----- 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 7c0478c3ea..63d2156efb 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -20,6 +20,7 @@ from databricks.labs.ucx.source_code.linters.base import ( SqlLinter, Fixer, + PythonFixer, PythonLinter, DfsaPyCollector, TablePyCollector, @@ -33,7 +34,6 @@ from databricks.labs.ucx.source_code.linters.from_table import FromTableSqlLinter from databricks.labs.ucx.source_code.python.python_ast import ( MatchingVisitor, - MaybeTree, Tree, TreeHelper, ) @@ -393,7 +393,7 @@ def matchers(self) -> dict[str, _TableNameMatcher]: return self._matchers -class SparkTableNamePyLinter(PythonLinter, Fixer, TablePyCollector): +class SparkTableNamePyLinter(PythonLinter, PythonFixer, TablePyCollector): """Linter for table name references in PySpark Examples: @@ -427,21 +427,15 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: assert isinstance(node, Call) yield from matcher.lint(self._from_table, self._index, self._session_state, node) - def apply(self, code: str) -> str: - maybe_tree = MaybeTree.from_source_code(code) - if not maybe_tree.tree: - assert maybe_tree.failure is not None - logger.warning(maybe_tree.failure.message) - return code - tree = maybe_tree.tree - # we won't be doing it like this in production, but for the sake of the example + def apply_tree(self, tree: Tree) -> Tree: + """Apply the fixes to the AST tree.""" for node in tree.walk(): matcher = self._find_matcher(node) if matcher is None: continue assert isinstance(node, Call) matcher.apply(self._from_table, self._index, node) - return tree.node.as_string() + return tree def _find_matcher(self, node: NodeNG) -> _TableNameMatcher | None: if not isinstance(node, Call): @@ -476,7 +470,7 @@ def _visit_call_nodes(cls, tree: Tree) -> Iterable[tuple[Call, NodeNG]]: yield call_node, query -class _SparkSqlPyLinter(_SparkSqlAnalyzer, PythonLinter, Fixer): +class _SparkSqlPyLinter(_SparkSqlAnalyzer, PythonLinter, PythonFixer): """Linter for SparkSQL used within PySpark.""" def __init__(self, sql_linter: SqlLinter, sql_fixer: Fixer | None): @@ -503,15 +497,10 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: code = self.diagnostic_code yield dataclasses.replace(advice.replace_from_node(call_node), code=code) - def apply(self, code: str) -> str: + def apply_tree(self, tree: Tree) -> Tree: + """Apply the fixes to the AST tree.""" if not self._sql_fixer: - return code - maybe_tree = MaybeTree.from_source_code(code) - if maybe_tree.failure: - logger.warning(maybe_tree.failure.message) - return code - assert maybe_tree.tree is not None - tree = maybe_tree.tree + return tree for _call_node, query in 
self._visit_call_nodes(tree): if not isinstance(query, Const) or not isinstance(query.value, str): continue @@ -519,7 +508,7 @@ def apply(self, code: str) -> str: # this requires changing 'apply' API in order to check advice fragment location new_query = self._sql_fixer.apply(query.value) query.value = new_query - return tree.node.as_string() + return tree class FromTableSqlPyLinter(_SparkSqlPyLinter): diff --git a/tests/unit/source_code/linters/test_base.py b/tests/unit/source_code/linters/test_base.py index e7f65b5455..1d1890b21f 100644 --- a/tests/unit/source_code/linters/test_base.py +++ b/tests/unit/source_code/linters/test_base.py @@ -10,10 +10,9 @@ def diagnostic_code(self) -> str: """Dummy diagnostic code""" return "dummy" - def apply_tree(self, tree: Tree) -> str: + def apply_tree(self, tree: Tree) -> Tree: """Dummy fixer""" - _ = tree - return "" + return tree fixer = _PythonFixer() assert fixer.diagnostic_code == "dummy" @@ -27,10 +26,9 @@ def diagnostic_code(self) -> str: """Dummy diagnostic code""" return "dummy" - def apply_tree(self, tree: Tree) -> str: + def apply_tree(self, tree: Tree) -> Tree: """Dummy fixer""" - _ = tree - return "fixed" + return tree fixer = _PythonFixer() assert fixer.diagnostic_code == "dummy" @@ -47,10 +45,9 @@ def diagnostic_code(self) -> str: """Dummy diagnostic code""" return "dummy" - def apply_tree(self, tree: Tree) -> str: + def apply_tree(self, tree: Tree) -> Tree: """Dummy fixer""" - _ = tree - return "fixed" + return tree fixer = _PythonFixer() assert fixer.apply("print(1") == "print(1" # closing parenthesis is missing on purpose From 7ddae4c652b022eaea2f0ea9db90e070db464c29 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 14 Feb 2025 08:21:46 +0100 Subject: [PATCH 10/33] Fix fixer apply test --- tests/unit/source_code/linters/test_base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit/source_code/linters/test_base.py b/tests/unit/source_code/linters/test_base.py index 1d1890b21f..c1f1036d20 100644 --- a/tests/unit/source_code/linters/test_base.py +++ b/tests/unit/source_code/linters/test_base.py @@ -31,8 +31,7 @@ def apply_tree(self, tree: Tree) -> Tree: return tree fixer = _PythonFixer() - assert fixer.diagnostic_code == "dummy" - assert fixer.apply("print(1)") == "fixed" + assert fixer.apply("print(1)\n") == "print(1)\n\n" # Format introduces EOF newline def test_python_fixer_applies_invalid_python() -> None: From 9ea071c9e96e58699ef6184fa10b40540800b0c3 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 14 Feb 2025 08:23:05 +0100 Subject: [PATCH 11/33] Pass source code in test --- tests/unit/source_code/notebooks/test_sources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 4087f2430a..3e68d8ba15 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -47,7 +47,7 @@ def test_notebook_flush_non_migrated_code(tmp_path) -> None: path = tmp_path / "test.py" path.write_text(source_code) - notebook = Notebook.parse(path, "print(1)", Language.PYTHON) + notebook = Notebook.parse(path, source_code, Language.PYTHON) number_of_characters_written = notebook.back_up_original_and_flush_migrated_code() From 79f907a53a3b8a28e12df6c2d9a15cba7f1583cc Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 14 Feb 2025 08:25:35 +0100 Subject: [PATCH 12/33] Implement NotebookLinter.apply --- 
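[Editorial note: a sketch, not part of the patches, illustrating the rationale documented in PATCH 08's `apply_tree` docstring: because each fixer now returns the `Tree`, several fixers can be chained on a single parsed tree and the source is unparsed once, instead of round-tripping source -> AST -> source per fixer. `apply_fixers` is a hypothetical helper; `NotebookLinter.apply` in the following patch does the equivalent per notebook cell via the linter context.]

```python
# Illustrative sketch only; not part of the patches.
from databricks.labs.ucx.source_code.linters.base import PythonFixer
from databricks.labs.ucx.source_code.python.python_ast import MaybeTree


def apply_fixers(code: str, fixers: list[PythonFixer]) -> str:
    maybe_tree = MaybeTree.from_source_code(code)
    if maybe_tree.failure or maybe_tree.tree is None:
        return code  # unparsable code is left untouched, mirroring PythonFixer.apply
    tree = maybe_tree.tree
    for fixer in fixers:
        tree = fixer.apply_tree(tree)  # each fixer mutates/extends the same tree
    return tree.node.as_string()  # unparse once at the end
```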
.../labs/ucx/source_code/linters/files.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/files.py b/src/databricks/labs/ucx/source_code/linters/files.py index 294d1104dd..5de418751d 100644 --- a/src/databricks/labs/ucx/source_code/linters/files.py +++ b/src/databricks/labs/ucx/source_code/linters/files.py @@ -86,22 +86,26 @@ def apply(self) -> None: linter = self._context.linter(cell.language.language) except ValueError: # Language is not supported (yet) continue - if isinstance(cell, PythonCell): - linter = cast(PythonLinter, linter) + is_python_cell = isinstance(cell, PythonCell) + tree: Tree | None = None # For Python fixing + if is_python_cell: tree = self._python_tree_cache[(self._notebook.path, cell)] - advices = linter.lint_tree(tree) + advices = cast(PythonLinter, linter).lint_tree(tree) else: advices = linter.lint(cell.original_code) + fixed_code = cell.original_code # For default fixing for advice in advices: fixer = self._context.fixer(cell.language.language, advice.code) - if fixer is None: + if not fixer: continue - if isinstance(cell, PythonCell): - fixer = cast(PythonFixer, fixer) - fixed_code = fixer.apply_tree(tree) + if is_python_cell and tree: + # By calling `apply_tree` directly, we chain fixes on the same tree + tree = cast(PythonFixer, fixer).apply_tree(tree) else: - fixed_code = fixer.apply(cell.original_code) - cell.migrated_code = fixed_code + fixed_code = fixer.apply(fixed_code) + if tree: + fixed_code = tree.node.as_string() + cell.migrated_code = fixed_code self._notebook.back_up_original_and_flush_migrated_code() return From de2361b553f558a5d309b6118906128fe0a84c5c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 14 Feb 2025 09:24:02 +0100 Subject: [PATCH 13/33] Simplify code --- .../labs/ucx/source_code/linters/files.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/files.py b/src/databricks/labs/ucx/source_code/linters/files.py index 5de418751d..1aa77992c5 100644 --- a/src/databricks/labs/ucx/source_code/linters/files.py +++ b/src/databricks/labs/ucx/source_code/linters/files.py @@ -86,26 +86,23 @@ def apply(self) -> None: linter = self._context.linter(cell.language.language) except ValueError: # Language is not supported (yet) continue + fixed_code = cell.original_code # For default fixing + tree = self._python_tree_cache.get((self._notebook.path, cell)) # For Python fixing is_python_cell = isinstance(cell, PythonCell) - tree: Tree | None = None # For Python fixing - if is_python_cell: - tree = self._python_tree_cache[(self._notebook.path, cell)] + if is_python_cell and tree: advices = cast(PythonLinter, linter).lint_tree(tree) else: advices = linter.lint(cell.original_code) - fixed_code = cell.original_code # For default fixing for advice in advices: fixer = self._context.fixer(cell.language.language, advice.code) if not fixer: continue if is_python_cell and tree: - # By calling `apply_tree` directly, we chain fixes on the same tree + # By calling `apply_tree` instead of `apply`, we chain fixes on the same tree tree = cast(PythonFixer, fixer).apply_tree(tree) else: fixed_code = fixer.apply(fixed_code) - if tree: - fixed_code = tree.node.as_string() - cell.migrated_code = fixed_code + cell.migrated_code = tree.node.as_string() if tree else fixed_code self._notebook.back_up_original_and_flush_migrated_code() return From d6e66412b3e6c0eeaa4b26b1af49f1d1611dce26 Mon Sep 17 00:00:00 2001 From: Cor 
Zuurmond Date: Fri, 14 Feb 2025 10:03:30 +0100 Subject: [PATCH 14/33] Format --- src/databricks/labs/ucx/source_code/linters/files.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/linters/files.py b/src/databricks/labs/ucx/source_code/linters/files.py index 1aa77992c5..cd8b1f72dc 100644 --- a/src/databricks/labs/ucx/source_code/linters/files.py +++ b/src/databricks/labs/ucx/source_code/linters/files.py @@ -41,7 +41,11 @@ class NotebookLinter: """ def __init__( - self, notebook: Notebook, path_lookup: PathLookup, context: LinterContext, parent_tree: Tree | None = None + self, + notebook: Notebook, + path_lookup: PathLookup, + context: LinterContext, + parent_tree: Tree | None = None, ): self._context: LinterContext = context self._path_lookup = path_lookup From e9b4543000f8f8ad42001ddfd81cc52a0110d8e0 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 14 Feb 2025 10:03:47 +0100 Subject: [PATCH 15/33] Add notebook fixing tests https://github.com/databrickslabs/ucx/issues/3692 --- tests/unit/source_code/linters/test_files.py | 119 +++++++++++++++---- 1 file changed, 97 insertions(+), 22 deletions(-) diff --git a/tests/unit/source_code/linters/test_files.py b/tests/unit/source_code/linters/test_files.py index 7817b47a0f..e6697b75b4 100644 --- a/tests/unit/source_code/linters/test_files.py +++ b/tests/unit/source_code/linters/test_files.py @@ -302,18 +302,36 @@ def test_notebook_linter_lints_source_yielding_parse_failures(migration_index, m ] -def test_notebook_linter_lints_migrated_table(migration_index, mock_path_lookup) -> None: +def test_notebook_linter_lints_and_applies_migrated_table(tmp_path, migration_index, mock_path_lookup) -> None: """Regression test with the tests below.""" - source = """ + expected = """ # Databricks notebook source +table_name = 'brand.new.stuff' + -table_name = "old.things" # Migrated table according to the migration index # COMMAND ---------- spark.table(table_name) + + """.lstrip() - linter = _NotebookLinter.from_source_code(migration_index, mock_path_lookup, source, Language.PYTHON) + source_code = """ +# Databricks notebook source +table_name = 'old.things' + + + +# COMMAND ---------- + +spark.table(table_name) + + +""".lstrip() + path = tmp_path / "notebook.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) advices = list(linter.lint()) @@ -321,83 +339,140 @@ def test_notebook_linter_lints_migrated_table(migration_index, mock_path_lookup) assert advices[0] == Deprecation( code='table-migrated-to-uc-python', message='Table old.things is migrated to brand.new.stuff in Unity Catalog', - start_line=6, + start_line=7, start_col=0, - end_line=6, + end_line=7, end_col=23, ) + linter.apply() + + assert path.read_text() == expected + -def test_notebook_linter_lints_not_migrated_table(migration_index, mock_path_lookup) -> None: +def test_notebook_linter_lints_and_applies_not_migrated_table(tmp_path, migration_index, mock_path_lookup) -> None: """Regression test with the tests above and below.""" - source = """ + source_code = """ # Databricks notebook source +table_name = 'not_migrated.table' + -table_name = "not_migrated.table" # NOT a migrated table according to the migration index # COMMAND ---------- spark.table(table_name) + + """.lstrip() - linter = _NotebookLinter.from_source_code(migration_index, mock_path_lookup, source, Language.PYTHON) + path = tmp_path / 
"notebook.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) advices = list(linter.lint()) assert not [advice for advice in advices if advice.code == "table-migrated-to-uc"] + linter.apply() + + assert path.read_text() == source_code + -def test_notebook_linter_lints_migrated_table_with_rename(migration_index, mock_path_lookup) -> None: +def test_notebook_linter_lints_and_applies_migrated_table_with_rename( + tmp_path, migration_index, mock_path_lookup +) -> None: """The spark.table should read the table defined above the call not below. This is a regression test with the tests above and below. """ - source = """ + expected = """ +# Databricks notebook source +table_name = 'brand.new.stuff' + + + +# COMMAND ---------- + +spark.table(table_name) + + + +# COMMAND ---------- + +table_name = 'not_migrated.table' + + +""".lstrip() + source_code = """ # Databricks notebook source +table_name = 'old.things' + -table_name = "old.things" # Migrated table according to the migration index # COMMAND ---------- spark.table(table_name) + # COMMAND ---------- -table_name = "not_migrated.table" # NOT a migrated table according to the migration index +table_name = 'not_migrated.table' + + """.lstrip() - linter = _NotebookLinter.from_source_code(migration_index, mock_path_lookup, source, Language.PYTHON) + path = tmp_path / "notebook.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) first_advice = next(iter(linter.lint())) assert first_advice == Deprecation( code='table-migrated-to-uc-python', message='Table old.things is migrated to brand.new.stuff in Unity Catalog', - start_line=6, + start_line=7, start_col=0, - end_line=6, + end_line=7, end_col=23, ) + linter.apply() + + assert path.read_text() == expected + -def test_notebook_linter_lints_not_migrated_table_with_rename(migration_index, mock_path_lookup) -> None: +def test_notebook_linter_lints_not_migrated_table_with_rename(tmp_path, migration_index, mock_path_lookup) -> None: """The spark.table should read the table defined above the call not below. This is a regression test with the tests above. 
""" - source = """ + source_code = """ # Databricks notebook source +table_name = 'not_migrated.table' + -table_name = "not_migrated.table" # NOT a migrated table according to the migration index # COMMAND ---------- spark.table(table_name) + + # COMMAND ---------- -table_name = "old.things" # Migrated table according to the migration index +table_name = 'old.things' + + """.lstrip() - linter = _NotebookLinter.from_source_code(migration_index, mock_path_lookup, source, Language.PYTHON) + path = tmp_path / "notebook.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) advices = list(linter.lint()) assert not [advice for advice in advices if advice.code == "table-migrated-to-uc"] + + linter.apply() + + assert path.read_text() == source_code From fb4ded8ba813030dec6451ac3a7a19dd05b4903b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 14 Feb 2025 12:05:57 +0100 Subject: [PATCH 16/33] Warn about non-fixable Spark calls https://github.com/databrickslabs/ucx/issues/3695 --- .../labs/ucx/source_code/linters/pyspark.py | 17 +++++++++++++++-- tests/unit/source_code/linters/test_files.py | 2 ++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 63d2156efb..b829540a6b 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -1,5 +1,6 @@ import dataclasses import logging +import urllib.parse from abc import ABC, abstractmethod from collections.abc import Iterable, Iterator from dataclasses import dataclass @@ -170,8 +171,20 @@ def lint( def apply(self, from_table: FromTableSqlLinter, index: TableMigrationIndex, node: Call) -> None: table_arg = self._get_table_arg(node) - assert isinstance(table_arg, Const) - # TODO locate constant when value is inferred + if not isinstance(table_arg, Const): + # TODO: https://github.com/databrickslabs/ucx/issues/3695 + source_code = node.as_string() + new_issue_url = ( + "https://github.com/databrickslabs/ucx/issues/new?title=Autofix+the+following+Python+code" + "&labels=migrate/code,needs-triage&type=Feature" + "&body=%23+Desired+behaviour%0A%0AAutofix+following+Python+code" + "%0A%0A%60%60%60+python%0ATODO:+Add+relevant+source+code%0A" + f"{urllib.parse.quote_plus(source_code)}%0A%60%60%60" + ) + logger.warning( + f"Cannot fix the following Python code: {source_code}. 
Please report this issue at {new_issue_url}" + ) + return info = UsedTable.parse(table_arg.value, from_table.schema) dst = self._find_dest(index, info) if dst is not None: diff --git a/tests/unit/source_code/linters/test_files.py b/tests/unit/source_code/linters/test_files.py index e6697b75b4..649bc21471 100644 --- a/tests/unit/source_code/linters/test_files.py +++ b/tests/unit/source_code/linters/test_files.py @@ -302,6 +302,7 @@ def test_notebook_linter_lints_source_yielding_parse_failures(migration_index, m ] +@pytest.mark.skip(reason="https://github.com/databrickslabs/ucx/issues/3695") def test_notebook_linter_lints_and_applies_migrated_table(tmp_path, migration_index, mock_path_lookup) -> None: """Regression test with the tests below.""" expected = """ @@ -378,6 +379,7 @@ def test_notebook_linter_lints_and_applies_not_migrated_table(tmp_path, migratio assert path.read_text() == source_code +@pytest.mark.skip(reason="https://github.com/databrickslabs/ucx/issues/3695") def test_notebook_linter_lints_and_applies_migrated_table_with_rename( tmp_path, migration_index, mock_path_lookup ) -> None: From 6eb6e9ff04f8ba044ef9991661bffd90b68870dc Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 14 Feb 2025 12:18:28 +0100 Subject: [PATCH 17/33] Add docstring to notebook source container --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index fac7c012fd..da7cdcad48 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -24,7 +24,7 @@ class Notebook(SourceContainer): - """ + """A notebook source code container. 
TODO: Let `Notebook` inherit from `LocalFile` From e68c253abba1b78ce3a6d814f19ca98f460b2b00 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 14 Feb 2025 12:29:03 +0100 Subject: [PATCH 18/33] Add more notebook fixing tests --- tests/unit/source_code/linters/test_files.py | 31 ++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/unit/source_code/linters/test_files.py b/tests/unit/source_code/linters/test_files.py index 649bc21471..d8b2031852 100644 --- a/tests/unit/source_code/linters/test_files.py +++ b/tests/unit/source_code/linters/test_files.py @@ -267,6 +267,37 @@ def test_notebook_linter_applies_migrated_table_with_rename(tmp_path, migration_ assert "brand.new.stuff" in contents +def test_notebook_linter_applies_two_migrated_table_with_rename(tmp_path, migration_index, mock_path_lookup) -> None: + """The table should be migrated to the new table names as defined in the migration index.""" + path = tmp_path / "notebook.py" + source = "# Databricks notebook source\nspark.table('old.things')\nspark.table('other.matters')" + path.write_text(source) + notebook = Notebook.parse(path, source, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) + + linter.apply() + + contents = path.read_text() + assert "old.things" not in contents + assert "brand.new.stuff" in contents + assert "some.certain.issues" in contents + + +def test_notebook_linter_applies_no_op_for_non_migrated_table(tmp_path, migration_index, mock_path_lookup) -> None: + """The table should not be migrated to the new table name as defined in the migration index.""" + path = tmp_path / "notebook.py" + source = "# Databricks notebook source\nspark.table('not.migrated.table')" + path.write_text(source) + notebook = Notebook.parse(path, source, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) + + linter.apply() + + contents = path.read_text() + assert "not.migrated.table" in contents + assert "brand.new.stuff" not in contents + + @pytest.mark.xfail(reason="https://github.com/databrickslabs/ucx/issues/3556") def test_notebook_linter_lints_source_yielding_parse_failures(migration_index, mock_path_lookup) -> None: source = """ From 143a6a233b50446c072a36a1da3618f704f3df6a Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 17 Feb 2025 09:27:46 +0100 Subject: [PATCH 19/33] Move GITHUB_URL to global variable --- src/databricks/labs/ucx/__init__.py | 4 ++++ src/databricks/labs/ucx/install.py | 5 +++-- src/databricks/labs/ucx/source_code/known.py | 5 +++-- src/databricks/labs/ucx/source_code/linters/pyspark.py | 4 +++- .../labs/ucx/source_code/python/python_ast.py | 10 +++++----- .../labs/ucx/source_code/python_libraries.py | 5 ++--- 6 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/databricks/labs/ucx/__init__.py b/src/databricks/labs/ucx/__init__.py index 022d9c57a5..5f82609be8 100644 --- a/src/databricks/labs/ucx/__init__.py +++ b/src/databricks/labs/ucx/__init__.py @@ -19,3 +19,7 @@ # Add ucx/ for re-packaging of ucx, where product name is omitted ua.with_product("ucx", __version__) + + +DOCS_URL = "https://databrickslabs.github.io/ucx/docs/" +GITHUB_URL = "https://github.com/databrickslabs/ucx" diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index b291ab466b..b1e6d9486b 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -48,6 +48,7 @@ ) from databricks.sdk.useragent import with_extra +from databricks.labs.ucx 
import DOCS_URL, GITHUB_URL from databricks.labs.ucx.__about__ import __version__ from databricks.labs.ucx.assessment.azure import AzureServicePrincipalInfo from databricks.labs.ucx.assessment.clusters import ClusterInfo, PolicyInfo @@ -218,7 +219,7 @@ def run( if isinstance(err.__cause__, RequestsConnectionError): logger.warning( f"Cannot connect with {self.workspace_client.config.host} see " - f"https://github.com/databrickslabs/ucx#network-connectivity-issues for help: {err}" + f"{GITHUB_URL}#network-connectivity-issues for help: {err}" ) raise err return config @@ -573,7 +574,7 @@ def _create_database(self): if "Unable to load AWS credentials from any provider in the chain" in str(err): msg = ( "The UCX installation is configured to use external metastore. There is issue with the external metastore connectivity.\n" - "Please check the UCX installation instruction https://github.com/databrickslabs/ucx?tab=readme-ov-file#prerequisites" + f"Please check the UCX installation instruction {DOCS_URL}/installation " "and re-run installation.\n" "Please Follow the Below Command to uninstall and Install UCX\n" "UCX Uninstall: databricks labs uninstall ucx.\n" diff --git a/src/databricks/labs/ucx/source_code/known.py b/src/databricks/labs/ucx/source_code/known.py index 0f1e24aba7..5516f5c5d6 100644 --- a/src/databricks/labs/ucx/source_code/known.py +++ b/src/databricks/labs/ucx/source_code/known.py @@ -14,6 +14,7 @@ from databricks.labs.blueprint.entrypoint import get_logger +from databricks.labs.ucx import GITHUB_URL from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex from databricks.labs.ucx.source_code.base import Advice, CurrentSessionState from databricks.labs.ucx.source_code.graph import ( @@ -24,6 +25,7 @@ from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.getLogger(__name__) +KNOWN_URL = f"{GITHUB_URL}/blob/main/src/databricks/labs/ucx/source_code/known.json" """ Known libraries that are not in known.json @@ -282,10 +284,9 @@ class KnownDependency(Dependency): """A dependency for known libraries, see :class:KnownList.""" def __init__(self, module_name: str, problems: list[KnownProblem]): - known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json" # Note that Github does not support navigating JSON files, hence the # does nothing. 
# https://docs.github.com/en/repositories/working-with-files/using-files/navigating-code-on-github - super().__init__(KnownLoader(), Path(f"{known_url}#{module_name}"), inherits_context=False) + super().__init__(KnownLoader(), Path(f"{KNOWN_URL}#{module_name}"), inherits_context=False) self._module_name = module_name self.problems = problems diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index b829540a6b..6493d97aa5 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -7,6 +7,8 @@ from typing import TypeVar from astroid import Attribute, Call, Const, Name, NodeNG # type: ignore + +from databricks.labs.ucx import GITHUB_URL from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex, TableMigrationStatus from databricks.labs.ucx.source_code.base import ( Advice, @@ -175,7 +177,7 @@ def apply(self, from_table: FromTableSqlLinter, index: TableMigrationIndex, node # TODO: https://github.com/databrickslabs/ucx/issues/3695 source_code = node.as_string() new_issue_url = ( - "https://github.com/databrickslabs/ucx/issues/new?title=Autofix+the+following+Python+code" + f"{GITHUB_URL}/issues/new?title=Autofix+the+following+Python+code" "&labels=migrate/code,needs-triage&type=Feature" "&body=%23+Desired+behaviour%0A%0AAutofix+following+Python+code" "%0A%0A%60%60%60+python%0ATODO:+Add+relevant+source+code%0A" diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 1817fcf7e2..38180fccfd 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -29,9 +29,9 @@ ) from astroid.exceptions import AstroidSyntaxError # type: ignore -from databricks.labs.ucx.source_code.base import ( - Failure, -) +from databricks.labs.ucx import GITHUB_URL +from databricks.labs.ucx.source_code.base import Failure + logger = logging.getLogger(__name__) missing_handlers: set[str] = set() @@ -101,8 +101,8 @@ def _failure_from_exception(source_code: str, e: Exception) -> Failure: end_col=(e.error.end_offset or 2) - 1, ) new_issue_url = ( - "https://github.com/databrickslabs/ucx/issues/new?title=[BUG]:+Python+parse+error" - "&labels=migrate/code,needs-triage,bug" + f"{GITHUB_URL}/issues/new?title=Python+parse+error" + "&labels=migrate/code,needs-triage&type=Bug" "&body=%23+Current+behaviour%0A%0ACannot+parse+the+following+Python+code" f"%0A%0A%60%60%60+python%0A{urllib.parse.quote_plus(source_code)}%0A%60%60%60" ) diff --git a/src/databricks/labs/ucx/source_code/python_libraries.py b/src/databricks/labs/ucx/source_code/python_libraries.py index 41ead804f7..f6b8b82f25 100644 --- a/src/databricks/labs/ucx/source_code/python_libraries.py +++ b/src/databricks/labs/ucx/source_code/python_libraries.py @@ -12,7 +12,7 @@ from databricks.labs.ucx.framework.utils import run_command from databricks.labs.ucx.source_code.graph import DependencyProblem, LibraryResolver -from databricks.labs.ucx.source_code.known import KnownList +from databricks.labs.ucx.source_code.known import KNOWN_URL, KnownList from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -35,7 +35,6 @@ def register_library(self, path_lookup: PathLookup, *libraries: str) -> list[Dep """We delegate to pip to install the library and augment the path look-up to resolve the library at import. 
This gives us the flexibility to install any library that is not in the allow-list, and we don't have to bother about parsing cross-version dependencies in our code.""" - known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json" if len(libraries) == 0: return [] if len(libraries) == 1: # Multiple libraries might be installation flags @@ -43,7 +42,7 @@ def register_library(self, path_lookup: PathLookup, *libraries: str) -> list[Dep compatibility = self._allow_list.distribution_compatibility(library) if compatibility.known: # TODO: Pass in the line number and column number https://github.com/databrickslabs/ucx/issues/3625 - path = Path(f"{known_url}#{library}") + path = Path(f"{KNOWN_URL}#{library}") problems = [DependencyProblem(p.code, p.message, path) for p in compatibility.problems] return problems return self._install_library(path_lookup, *libraries) From 17b492f23ea2db21c5651ed7c3af72db6b7e4ef5 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 16:17:12 +0100 Subject: [PATCH 20/33] Add Github issue doc link --- src/databricks/labs/ucx/source_code/linters/pyspark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 6493d97aa5..1b482f7005 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -176,6 +176,7 @@ def apply(self, from_table: FromTableSqlLinter, index: TableMigrationIndex, node if not isinstance(table_arg, Const): # TODO: https://github.com/databrickslabs/ucx/issues/3695 source_code = node.as_string() + # https://docs.github.com/en/issues/tracking-your-work-with-issues/using-issues/creating-an-issue#creating-an-issue-from-a-url-query new_issue_url = ( f"{GITHUB_URL}/issues/new?title=Autofix+the+following+Python+code" "&labels=migrate/code,needs-triage&type=Feature" From 75c2103c45ee5b1f7d814f06124101edbb2518a9 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 16:21:48 +0100 Subject: [PATCH 21/33] Move Github urls to separate module to introduce method --- src/databricks/labs/ucx/__init__.py | 2 -- src/databricks/labs/ucx/github.py | 6 ++++++ src/databricks/labs/ucx/install.py | 2 +- src/databricks/labs/ucx/source_code/known.py | 2 +- src/databricks/labs/ucx/source_code/linters/pyspark.py | 2 +- src/databricks/labs/ucx/source_code/python/python_ast.py | 2 +- tests/unit/test_github.py | 5 +++++ 7 files changed, 15 insertions(+), 6 deletions(-) create mode 100644 src/databricks/labs/ucx/github.py create mode 100644 tests/unit/test_github.py diff --git a/src/databricks/labs/ucx/__init__.py b/src/databricks/labs/ucx/__init__.py index 5f82609be8..1b234bdcc2 100644 --- a/src/databricks/labs/ucx/__init__.py +++ b/src/databricks/labs/ucx/__init__.py @@ -21,5 +21,3 @@ ua.with_product("ucx", __version__) -DOCS_URL = "https://databrickslabs.github.io/ucx/docs/" -GITHUB_URL = "https://github.com/databrickslabs/ucx" diff --git a/src/databricks/labs/ucx/github.py b/src/databricks/labs/ucx/github.py new file mode 100644 index 0000000000..7150dbfff8 --- /dev/null +++ b/src/databricks/labs/ucx/github.py @@ -0,0 +1,6 @@ +DOCS_URL = "https://databrickslabs.github.io/ucx/docs/" +GITHUB_URL = "https://github.com/databrickslabs/ucx" + + +def construct_new_issue_url() -> str: + return f"{GITHUB_URL}/issues/new" diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index b1e6d9486b..8581350fde 
100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -48,7 +48,7 @@ ) from databricks.sdk.useragent import with_extra -from databricks.labs.ucx import DOCS_URL, GITHUB_URL +from databricks.labs.ucx.github import DOCS_URL, GITHUB_URL from databricks.labs.ucx.__about__ import __version__ from databricks.labs.ucx.assessment.azure import AzureServicePrincipalInfo from databricks.labs.ucx.assessment.clusters import ClusterInfo, PolicyInfo diff --git a/src/databricks/labs/ucx/source_code/known.py b/src/databricks/labs/ucx/source_code/known.py index 5516f5c5d6..fde858e513 100644 --- a/src/databricks/labs/ucx/source_code/known.py +++ b/src/databricks/labs/ucx/source_code/known.py @@ -14,7 +14,7 @@ from databricks.labs.blueprint.entrypoint import get_logger -from databricks.labs.ucx import GITHUB_URL +from databricks.labs.ucx.github import GITHUB_URL from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex from databricks.labs.ucx.source_code.base import Advice, CurrentSessionState from databricks.labs.ucx.source_code.graph import ( diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 1b482f7005..d4f927a643 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -8,7 +8,7 @@ from astroid import Attribute, Call, Const, Name, NodeNG # type: ignore -from databricks.labs.ucx import GITHUB_URL +from databricks.labs.ucx.github import GITHUB_URL from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex, TableMigrationStatus from databricks.labs.ucx.source_code.base import ( Advice, diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 38180fccfd..80ac2de44d 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -29,7 +29,7 @@ ) from astroid.exceptions import AstroidSyntaxError # type: ignore -from databricks.labs.ucx import GITHUB_URL +from databricks.labs.ucx.github import GITHUB_URL from databricks.labs.ucx.source_code.base import Failure diff --git a/tests/unit/test_github.py b/tests/unit/test_github.py new file mode 100644 index 0000000000..ac90e983ed --- /dev/null +++ b/tests/unit/test_github.py @@ -0,0 +1,5 @@ +from databricks.labs.ucx.github import GITHUB_URL, construct_new_issue_url + + +def test_construct_new_issue_url_starts_with_github_url() -> None: + assert construct_new_issue_url().startswith(GITHUB_URL) From 55b62c6b6f30efc74cc7d76b4ba7406df22cd16d Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 16:28:23 +0100 Subject: [PATCH 22/33] Construct issue url with type, title and body --- src/databricks/labs/ucx/__init__.py | 2 -- src/databricks/labs/ucx/github.py | 25 +++++++++++++++++++++++-- tests/unit/test_github.py | 5 +++-- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/__init__.py b/src/databricks/labs/ucx/__init__.py index 1b234bdcc2..022d9c57a5 100644 --- a/src/databricks/labs/ucx/__init__.py +++ b/src/databricks/labs/ucx/__init__.py @@ -19,5 +19,3 @@ # Add ucx/ for re-packaging of ucx, where product name is omitted ua.with_product("ucx", __version__) - - diff --git a/src/databricks/labs/ucx/github.py b/src/databricks/labs/ucx/github.py index 7150dbfff8..a71cf912b0 100644 --- a/src/databricks/labs/ucx/github.py +++ 
b/src/databricks/labs/ucx/github.py @@ -1,6 +1,27 @@ +from enum import Enum + +from typing import Literal + DOCS_URL = "https://databrickslabs.github.io/ucx/docs/" GITHUB_URL = "https://github.com/databrickslabs/ucx" -def construct_new_issue_url() -> str: - return f"{GITHUB_URL}/issues/new" +class IssueType(Enum): + """The issue type""" + + FEATURE = "Feature" + BUG = "Bug" + TASK = "Task" + + +def construct_new_issue_url( + issue_type: IssueType, + title: str, + body: str, +) -> str: + """Construct a new issue URL. + + References: + - https://docs.github.com/en/issues/tracking-your-work-with-issues/using-issues/creating-an-issue#creating-an-issue-from-a-url-query + """ + return f"{GITHUB_URL}/issues/new?type={issue_type.value}&title={title}&body={body}" diff --git a/tests/unit/test_github.py b/tests/unit/test_github.py index ac90e983ed..69a21afbc6 100644 --- a/tests/unit/test_github.py +++ b/tests/unit/test_github.py @@ -1,5 +1,6 @@ -from databricks.labs.ucx.github import GITHUB_URL, construct_new_issue_url +from databricks.labs.ucx.github import GITHUB_URL, IssueType, construct_new_issue_url def test_construct_new_issue_url_starts_with_github_url() -> None: - assert construct_new_issue_url().startswith(GITHUB_URL) + url = construct_new_issue_url(IssueType.FEATURE, "title", "body") + assert url.startswith(GITHUB_URL) From 0e1936d0b398e00812162f49fa060ba7659bf761 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 17:16:24 +0100 Subject: [PATCH 23/33] Test labels in url --- src/databricks/labs/ucx/github.py | 6 +++++- tests/unit/test_github.py | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/github.py b/src/databricks/labs/ucx/github.py index a71cf912b0..5802170f0f 100644 --- a/src/databricks/labs/ucx/github.py +++ b/src/databricks/labs/ucx/github.py @@ -18,10 +18,14 @@ def construct_new_issue_url( issue_type: IssueType, title: str, body: str, + *, + labels: set[str] | None = None, ) -> str: """Construct a new issue URL. 
References: - https://docs.github.com/en/issues/tracking-your-work-with-issues/using-issues/creating-an-issue#creating-an-issue-from-a-url-query """ - return f"{GITHUB_URL}/issues/new?type={issue_type.value}&title={title}&body={body}" + labels = labels or set() + labels.add("needs-triage") + return f"{GITHUB_URL}/issues/new?type={issue_type.value}&title={title}&body={body}&labels={','.join(labels)}" diff --git a/tests/unit/test_github.py b/tests/unit/test_github.py index 69a21afbc6..59a5e5c0a8 100644 --- a/tests/unit/test_github.py +++ b/tests/unit/test_github.py @@ -1,6 +1,23 @@ +import pytest + from databricks.labs.ucx.github import GITHUB_URL, IssueType, construct_new_issue_url def test_construct_new_issue_url_starts_with_github_url() -> None: + """Test that the URL starts with the GitHub URL.""" url = construct_new_issue_url(IssueType.FEATURE, "title", "body") assert url.startswith(GITHUB_URL) + + +def test_construct_new_issue_url_with_labels() -> None: + """Test that the URL contains the labels.""" + url = construct_new_issue_url(IssueType.FEATURE, "title", "body", labels={"label1", "label2"}) + assert "label1" in url + assert "label2" in url + + +@pytest.mark.parametrize("labels", [None, {}, {"label1", "label2"}]) +def test_construct_new_issue_url_always_has_needs_triage_label(labels: set[str] | None) -> None: + """Test that the URL always has the `needs-triage` label.""" + url = construct_new_issue_url(IssueType.FEATURE, "title", "body", labels=labels) + assert "needs-triage" in url From 03a5ba8cb17e5ee4c16a53163f6d3dacda93a043 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 17:16:51 +0100 Subject: [PATCH 24/33] Remove unused import --- src/databricks/labs/ucx/github.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/databricks/labs/ucx/github.py b/src/databricks/labs/ucx/github.py index 5802170f0f..5d4a5425ad 100644 --- a/src/databricks/labs/ucx/github.py +++ b/src/databricks/labs/ucx/github.py @@ -1,7 +1,5 @@ from enum import Enum -from typing import Literal - DOCS_URL = "https://databrickslabs.github.io/ucx/docs/" GITHUB_URL = "https://github.com/databrickslabs/ucx" From d725e63fcbc834defe9ee55307fb2f2303db67e8 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 17:19:46 +0100 Subject: [PATCH 25/33] Test url is properly encoded --- src/databricks/labs/ucx/github.py | 5 ++++- tests/unit/test_github.py | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/github.py b/src/databricks/labs/ucx/github.py index 5d4a5425ad..3106a357a4 100644 --- a/src/databricks/labs/ucx/github.py +++ b/src/databricks/labs/ucx/github.py @@ -1,4 +1,6 @@ from enum import Enum +import urllib.parse + DOCS_URL = "https://databrickslabs.github.io/ucx/docs/" GITHUB_URL = "https://github.com/databrickslabs/ucx" @@ -26,4 +28,5 @@ def construct_new_issue_url( """ labels = labels or set() labels.add("needs-triage") - return f"{GITHUB_URL}/issues/new?type={issue_type.value}&title={title}&body={body}&labels={','.join(labels)}" + query = urllib.parse.quote_plus(f"type={issue_type.value}&title={title}&body={body}&labels={','.join(labels)}") + return f"{GITHUB_URL}/issues/new?{query}" diff --git a/tests/unit/test_github.py b/tests/unit/test_github.py index 59a5e5c0a8..f18d863387 100644 --- a/tests/unit/test_github.py +++ b/tests/unit/test_github.py @@ -21,3 +21,9 @@ def test_construct_new_issue_url_always_has_needs_triage_label(labels: set[str] """Test that the URL always has the `needs-triage` label.""" url = 
construct_new_issue_url(IssueType.FEATURE, "title", "body", labels=labels) assert "needs-triage" in url + + +def test_construct_new_issue_url_makes_url_safe() -> None: + """Test that the URL is properly URL-encoded.""" + url = construct_new_issue_url(IssueType.FEATURE, "title", "body with spaces") + assert "body+with+spaces" in url From 19998adc67310328482d708c4714537c3e925246 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 17:29:20 +0100 Subject: [PATCH 26/33] Quote query values only --- src/databricks/labs/ucx/github.py | 8 +++++++- tests/unit/test_github.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/github.py b/src/databricks/labs/ucx/github.py index 3106a357a4..5db61fe257 100644 --- a/src/databricks/labs/ucx/github.py +++ b/src/databricks/labs/ucx/github.py @@ -28,5 +28,11 @@ def construct_new_issue_url( """ labels = labels or set() labels.add("needs-triage") - query = urllib.parse.quote_plus(f"type={issue_type.value}&title={title}&body={body}&labels={','.join(labels)}") + parameters = { + "type": issue_type.value, + "title": title, + "body": body, + "labels": ",".join(sorted(labels)), + } + query = "&".join(f"{key}={urllib.parse.quote_plus(value)}" for key, value in parameters.items()) return f"{GITHUB_URL}/issues/new?{query}" diff --git a/tests/unit/test_github.py b/tests/unit/test_github.py index f18d863387..4fdb2dfa18 100644 --- a/tests/unit/test_github.py +++ b/tests/unit/test_github.py @@ -27,3 +27,18 @@ def test_construct_new_issue_url_makes_url_safe() -> None: """Test that the URL is properly URL-encoded.""" url = construct_new_issue_url(IssueType.FEATURE, "title", "body with spaces") assert "body+with+spaces" in url + + +def test_construct_new_issue_url_advanced() -> None: + """Test that the URL is properly constructed with advanced parameters.""" + expected = ( + f"{GITHUB_URL}/issues/new?" 
+ "type=Feature" + "&title=Autofix+the+following+Python+code" + "&body=%23+Desired+behaviour%0A%0AAutofix+following+Python+code" + "%0A%0A%60%60%60+python%0ATODO%3A+Add+relevant+source+code%0A%60%60%60" + "&labels=migrate%2Fcode%2Cneeds-triage" + ) + body = "# Desired behaviour\n\nAutofix following Python code\n\n" "``` python\nTODO: Add relevant source code\n```" + url = construct_new_issue_url(IssueType.FEATURE, "Autofix the following Python code", body, labels={"migrate/code"}) + assert url == expected From 1849465c063e2b7a3ee860543c7f5802b949872b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 17:34:49 +0100 Subject: [PATCH 27/33] Use construct_new_issue_url to create issue url --- .../labs/ucx/source_code/linters/pyspark.py | 18 ++++++------------ .../labs/ucx/source_code/python/python_ast.py | 11 +++-------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index d4f927a643..af7590d40a 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -1,6 +1,5 @@ import dataclasses import logging -import urllib.parse from abc import ABC, abstractmethod from collections.abc import Iterable, Iterator from dataclasses import dataclass @@ -8,7 +7,7 @@ from astroid import Attribute, Call, Const, Name, NodeNG # type: ignore -from databricks.labs.ucx.github import GITHUB_URL +from databricks.labs.ucx.github import IssueType, construct_new_issue_url from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex, TableMigrationStatus from databricks.labs.ucx.source_code.base import ( Advice, @@ -176,17 +175,12 @@ def apply(self, from_table: FromTableSqlLinter, index: TableMigrationIndex, node if not isinstance(table_arg, Const): # TODO: https://github.com/databrickslabs/ucx/issues/3695 source_code = node.as_string() - # https://docs.github.com/en/issues/tracking-your-work-with-issues/using-issues/creating-an-issue#creating-an-issue-from-a-url-query - new_issue_url = ( - f"{GITHUB_URL}/issues/new?title=Autofix+the+following+Python+code" - "&labels=migrate/code,needs-triage&type=Feature" - "&body=%23+Desired+behaviour%0A%0AAutofix+following+Python+code" - "%0A%0A%60%60%60+python%0ATODO:+Add+relevant+source+code%0A" - f"{urllib.parse.quote_plus(source_code)}%0A%60%60%60" - ) - logger.warning( - f"Cannot fix the following Python code: {source_code}. Please report this issue at {new_issue_url}" + body = ( + "# Desired behaviour\n\nAutofix following Python code\n\n" + f"``` python\nTODO: Add relevant source code\n{source_code}\n```" ) + url = construct_new_issue_url(IssueType.FEATURE, "Autofix the following Python code", body) + logger.warning(f"Cannot fix the following Python code: {source_code}. 
Please report this issue at {url}") return info = UsedTable.parse(table_arg.value, from_table.schema) dst = self._find_dest(index, info) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 80ac2de44d..7057f49f6d 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -4,7 +4,6 @@ import logging import sys import re -import urllib.parse from abc import ABC from collections.abc import Iterable from dataclasses import dataclass @@ -29,7 +28,7 @@ ) from astroid.exceptions import AstroidSyntaxError # type: ignore -from databricks.labs.ucx.github import GITHUB_URL +from databricks.labs.ucx.github import IssueType, construct_new_issue_url from databricks.labs.ucx.source_code.base import Failure @@ -100,12 +99,8 @@ def _failure_from_exception(source_code: str, e: Exception) -> Failure: end_line=(e.error.end_lineno or 2) - 1, end_col=(e.error.end_offset or 2) - 1, ) - new_issue_url = ( - f"{GITHUB_URL}/issues/new?title=Python+parse+error" - "&labels=migrate/code,needs-triage&type=Bug" - "&body=%23+Current+behaviour%0A%0ACannot+parse+the+following+Python+code" - f"%0A%0A%60%60%60+python%0A{urllib.parse.quote_plus(source_code)}%0A%60%60%60" - ) + body = "# Desired behaviour\n\nCannot parse the follow Python code\n\n``` python\n{source_code}\n```" + new_issue_url = construct_new_issue_url(IssueType.BUG, "Python parse error", body) return Failure( code="python-parse-error", message=( From a8762c3cb8cae1d0516bba9d282db96ac9d6a1ce Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 17:35:50 +0100 Subject: [PATCH 28/33] Fix link to docs url --- src/databricks/labs/ucx/install.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index 8581350fde..d2da9015e2 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -48,7 +48,7 @@ ) from databricks.sdk.useragent import with_extra -from databricks.labs.ucx.github import DOCS_URL, GITHUB_URL +from databricks.labs.ucx.github import DOCS_URL from databricks.labs.ucx.__about__ import __version__ from databricks.labs.ucx.assessment.azure import AzureServicePrincipalInfo from databricks.labs.ucx.assessment.clusters import ClusterInfo, PolicyInfo @@ -219,7 +219,7 @@ def run( if isinstance(err.__cause__, RequestsConnectionError): logger.warning( f"Cannot connect with {self.workspace_client.config.host} see " - f"{GITHUB_URL}#network-connectivity-issues for help: {err}" + f"{DOCS_URL}#reference/common_challenges/#network-connectivity-issues for help: {err}" ) raise err return config From f7d7d961b8a42d26ebf7ad885a59af0584354428 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 17:37:18 +0100 Subject: [PATCH 29/33] Duplicate github url in unit tests --- tests/unit/test_github.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_github.py b/tests/unit/test_github.py index 4fdb2dfa18..3e8b91a8a6 100644 --- a/tests/unit/test_github.py +++ b/tests/unit/test_github.py @@ -6,7 +6,7 @@ def test_construct_new_issue_url_starts_with_github_url() -> None: """Test that the URL starts with the GitHub URL.""" url = construct_new_issue_url(IssueType.FEATURE, "title", "body") - assert url.startswith(GITHUB_URL) + assert url.startswith("https://github.com/databrickslabs/ucx") def test_construct_new_issue_url_with_labels() -> None: @@ 
-32,8 +32,8 @@ def test_construct_new_issue_url_makes_url_safe() -> None: def test_construct_new_issue_url_advanced() -> None: """Test that the URL is properly constructed with advanced parameters.""" expected = ( - f"{GITHUB_URL}/issues/new?" - "type=Feature" + f"https://github.com/databrickslabs/ucx/issues/new" + "?type=Feature" "&title=Autofix+the+following+Python+code" "&body=%23+Desired+behaviour%0A%0AAutofix+following+Python+code" "%0A%0A%60%60%60+python%0ATODO%3A+Add+relevant+source+code%0A%60%60%60" From d2d5469deba927874157c09133d8f04a1d0010c0 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 17:37:45 +0100 Subject: [PATCH 30/33] Remove redundant quotes --- tests/unit/test_github.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_github.py b/tests/unit/test_github.py index 3e8b91a8a6..d7d736d012 100644 --- a/tests/unit/test_github.py +++ b/tests/unit/test_github.py @@ -39,6 +39,6 @@ def test_construct_new_issue_url_advanced() -> None: "%0A%0A%60%60%60+python%0ATODO%3A+Add+relevant+source+code%0A%60%60%60" "&labels=migrate%2Fcode%2Cneeds-triage" ) - body = "# Desired behaviour\n\nAutofix following Python code\n\n" "``` python\nTODO: Add relevant source code\n```" + body = "# Desired behaviour\n\nAutofix following Python code\n\n``` python\nTODO: Add relevant source code\n```" url = construct_new_issue_url(IssueType.FEATURE, "Autofix the following Python code", body, labels={"migrate/code"}) assert url == expected From 6042657acddad82cc645cdccd65b6f743d8fa7df Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 19 Feb 2025 17:41:47 +0100 Subject: [PATCH 31/33] Remove redundant import --- tests/unit/test_github.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_github.py b/tests/unit/test_github.py index d7d736d012..c07fd7831c 100644 --- a/tests/unit/test_github.py +++ b/tests/unit/test_github.py @@ -1,6 +1,6 @@ import pytest -from databricks.labs.ucx.github import GITHUB_URL, IssueType, construct_new_issue_url +from databricks.labs.ucx.github import IssueType, construct_new_issue_url def test_construct_new_issue_url_starts_with_github_url() -> None: @@ -32,7 +32,7 @@ def test_construct_new_issue_url_makes_url_safe() -> None: def test_construct_new_issue_url_advanced() -> None: """Test that the URL is properly constructed with advanced parameters.""" expected = ( - f"https://github.com/databrickslabs/ucx/issues/new" + "https://github.com/databrickslabs/ucx/issues/new" "?type=Feature" "&title=Autofix+the+following+Python+code" "&body=%23+Desired+behaviour%0A%0AAutofix+following+Python+code" From 6cf289416f547d3d3c6d09f2c585a883a8078da4 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 20 Feb 2025 08:46:04 +0100 Subject: [PATCH 32/33] Copy fixed files to empty directory --- .../unit/source_code/linters/test_folders.py | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/tests/unit/source_code/linters/test_folders.py b/tests/unit/source_code/linters/test_folders.py index 1f6544f367..845c7c33b3 100644 --- a/tests/unit/source_code/linters/test_folders.py +++ b/tests/unit/source_code/linters/test_folders.py @@ -1,3 +1,4 @@ +import shutil from pathlib import Path import pytest @@ -43,8 +44,14 @@ def test_local_code_linter_lint_walks_directory(mock_path_lookup, local_code_lin assert not advices -def test_local_code_linter_apply_walks_directory(mock_path_lookup, local_code_linter) -> None: - advices = list(local_code_linter.apply(Path("simulate-sys-path/"))) +def 
test_local_code_linter_apply_walks_directory(tmp_path, mock_path_lookup, local_code_linter) -> None: + # Copy the simulate-sys-path directory to a temporary directory so that fixes are cleaned up after the test + source_path = mock_path_lookup.resolve(Path("simulate-sys-path/")) + destination_path = tmp_path / "simulate-sys-path" + copied_path = shutil.copytree(source_path, destination_path) + + advices = list(local_code_linter.apply(copied_path)) + assert len(mock_path_lookup.successfully_resolved_paths) > 10 assert not advices @@ -56,10 +63,22 @@ def test_local_code_linter_lints_child_in_context(mock_path_lookup, local_code_l assert not advices -def test_local_code_linter_applies_child_in_context(mock_path_lookup, local_code_linter) -> None: - expected = {Path("parent-child-context/parent.py"), Path("child.py")} - advices = list(local_code_linter.apply(Path("parent-child-context/parent.py"))) - assert not expected - mock_path_lookup.successfully_resolved_paths +def test_local_code_linter_applies_child_in_context(tmp_path, mock_path_lookup, local_code_linter) -> None: + # Copy the parent-child-context directory to a temporary directory so that fixes are cleaned up after the test + source_path = mock_path_lookup.resolve(Path("parent-child-context/")) + destination_path = tmp_path / "parent-child-context" + copied_path = shutil.copytree(source_path, destination_path) + + parent_path = copied_path / "parent.py" + child_path = copied_path / "child.py" + # 1./2. The full parent and child paths are expected to be resolved to read the notebooks + # 3. The relative child path is expected to be resolved as it is defined in the parent notebook + # 4. The parent-child-context directory is resolved at the top of this test + expected = {parent_path, child_path, Path("child.py"), Path("parent-child-context")} + + advices = list(local_code_linter.apply(parent_path)) + + assert mock_path_lookup.successfully_resolved_paths == expected assert not advices From bdbd8118aa5b8f61ff6017bd9ed6bd1ce99c592f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 20 Feb 2025 09:00:02 +0100 Subject: [PATCH 33/33] Rename test --- tests/integration/source_code/linters/test_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/source_code/linters/test_files.py b/tests/integration/source_code/linters/test_files.py index 0ae136b713..5990c80e8c 100644 --- a/tests/integration/source_code/linters/test_files.py +++ b/tests/integration/source_code/linters/test_files.py @@ -27,6 +27,6 @@ def test_local_code_linter_lints_ucx(local_checkout_context, ucx_source_path) -> assert len(problems) > 0, "Found no problems while linting ucx" -def test_local_code_migrator_fixes_ucx(local_checkout_context, ucx_source_path) -> None: +def test_local_code_linter_applies_ucx(local_checkout_context, ucx_source_path) -> None: problems = list(local_checkout_context.local_code_linter.apply(ucx_source_path)) assert len(problems) == 0, "Found no problems while fixing ucx"
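
Illustrative usage (not part of the patch series): the patches above replace hand-built query strings with the construct_new_issue_url helper from src/databricks/labs/ucx/github.py. A minimal sketch of how a caller such as a linter might use it is below; the title, body, and labels values are made-up examples, only the import and function signature come from the patches.

from databricks.labs.ucx.github import IssueType, construct_new_issue_url

# Example values only: a linter that cannot parse a snippet could build a body like this.
body = "# Current behaviour\n\nCannot parse the following Python code\n\n``` python\nprint(1\n```"
url = construct_new_issue_url(IssueType.BUG, "Python parse error", body, labels={"migrate/code"})
# The helper quotes each query value separately and always adds the `needs-triage` label, so the
# result looks like (body elided):
# https://github.com/databrickslabs/ucx/issues/new?type=Bug&title=Python+parse+error&body=...&labels=migrate%2Fcode%2Cneeds-triage
print(url)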