From 45b9cbe5779ec3c820228a948abba1da3f2ad6b0 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 15 Nov 2024 13:18:44 +0100 Subject: [PATCH 1/2] Copy changes from #3112 https://github.com/databrickslabs/ucx/pull/3112 --- ...> 01_00_percentage_migration_progress.sql} | 2 +- ..._01_percentage_udf_migration_progress.sql} | 0 ...2_percentage_grant_migration_progress.sql} | 0 ..._03_percentage_job_migration_progress.sql} | 0 ...percentage_cluster_migration_progress.sql} | 0 ...5_percentage_table_migration_progress.sql} | 2 +- .../01_06_percentage_used_table_progress.sql | 5 +++ .../01_07_count_direct_filesystem_access.sql | 7 +++ .../main/01_08_count_query_problem.sql | 6 +++ ...ercentage_pipeline_migration_progress.sql} | 0 ..._percentage_policy_migration_progress.sql} | 0 ..._11_distinct_failures_per_object_type.sql} | 2 +- ...2_4_migration_status_by_owner_overview.sql | 2 +- .../ucx/queries/progress/main/03_00_code.md | 8 ++++ ...ending_migration_data_asset_references.sql | 4 ++ ...ta_asset_references_by_owner_bar_graph.sql | 24 +++++++++++ .../03_03_migrated_data_asset_references.sql | 4 ++ ..._references_pending_migration_overview.sql | 20 +++++++++ ...ata_asset_references_pending_migration.sql | 43 +++++++++++++++++++ .../main/03_06_code_compatibility_issues.sql | 29 +++++++++++++ 20 files changed, 154 insertions(+), 4 deletions(-) rename src/databricks/labs/ucx/queries/progress/main/{01_0_percentage_migration_progress.sql => 01_00_percentage_migration_progress.sql} (55%) rename src/databricks/labs/ucx/queries/progress/main/{01_1_percentage_udf_migration_progress.sql => 01_01_percentage_udf_migration_progress.sql} (100%) rename src/databricks/labs/ucx/queries/progress/main/{01_2_percentage_grant_migration_progress.sql => 01_02_percentage_grant_migration_progress.sql} (100%) rename src/databricks/labs/ucx/queries/progress/main/{01_3_percentage_job_migration_progress.sql => 01_03_percentage_job_migration_progress.sql} (100%) rename 
src/databricks/labs/ucx/queries/progress/main/{01_4_percentage_cluster_migration_progress.sql => 01_04_percentage_cluster_migration_progress.sql} (100%) rename src/databricks/labs/ucx/queries/progress/main/{01_5_percentage_table_migration_progress.sql => 01_05_percentage_table_migration_progress.sql} (75%) create mode 100644 src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql rename src/databricks/labs/ucx/queries/progress/main/{01_6_percentage_pipeline_migration_progress.sql => 01_09_percentage_pipeline_migration_progress.sql} (100%) rename src/databricks/labs/ucx/queries/progress/main/{01_7_percentage_policy_migration_progress.sql => 01_10_percentage_policy_migration_progress.sql} (100%) rename src/databricks/labs/ucx/queries/progress/main/{01_8_distinct_failures_per_object_type.sql => 01_11_distinct_failures_per_object_type.sql} (67%) create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_00_code.md create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_02_data_asset_references_by_owner_bar_graph.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_04_data_asset_references_pending_migration_overview.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_06_code_compatibility_issues.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_0_percentage_migration_progress.sql 
b/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql similarity index 55% rename from src/databricks/labs/ucx/queries/progress/main/01_0_percentage_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql index d429af42cc..d5ca534978 100644 --- a/src/databricks/labs/ucx/queries/progress/main/01_0_percentage_migration_progress.sql +++ b/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql @@ -2,4 +2,4 @@ SELECT ROUND(100 * try_divide(COUNT_IF(SIZE(failures) = 0), COUNT(*)), 2) AS percentage FROM ucx_catalog.multiworkspace.objects_snapshot -WHERE object_type IN ('ClusterInfo', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'Table', 'Udf') +WHERE object_type IN ('ClusterInfo', 'DirectFsAccess', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'QueryProblem', 'Table', 'Udf', 'UsedTable') diff --git a/src/databricks/labs/ucx/queries/progress/main/01_1_percentage_udf_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_01_percentage_udf_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_1_percentage_udf_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_01_percentage_udf_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_2_percentage_grant_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_02_percentage_grant_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_2_percentage_grant_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_02_percentage_grant_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_3_percentage_job_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_03_percentage_job_migration_progress.sql similarity index 100% rename from 
src/databricks/labs/ucx/queries/progress/main/01_3_percentage_job_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_03_percentage_job_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_4_percentage_cluster_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_04_percentage_cluster_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_4_percentage_cluster_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_04_percentage_cluster_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_5_percentage_table_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_05_percentage_table_migration_progress.sql similarity index 75% rename from src/databricks/labs/ucx/queries/progress/main/01_5_percentage_table_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_05_percentage_table_migration_progress.sql index b3a2e4554e..120dbab112 100644 --- a/src/databricks/labs/ucx/queries/progress/main/01_5_percentage_table_migration_progress.sql +++ b/src/databricks/labs/ucx/queries/progress/main/01_05_percentage_table_migration_progress.sql @@ -1,4 +1,4 @@ -/* --title 'Table migration progress (%)' --width 2 */ +/* --title 'Table migration progress (%)' */ SELECT ROUND(100 * TRY_DIVIDE(COUNT_IF(SIZE(failures) = 0), COUNT(*)), 2) AS percentage FROM ucx_catalog.multiworkspace.objects_snapshot diff --git a/src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql new file mode 100644 index 0000000000..544062edc2 --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql @@ -0,0 +1,5 @@ +/* --title '"Table references in code" progress (%)' --description 'Tables referring UC over Hive 
metastore' */ +SELECT + ROUND(100 * TRY_DIVIDE(COUNT_IF(SIZE(failures) = 0), COUNT(*)), 2) AS percentage +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type = "UsedTable" diff --git a/src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql b/src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql new file mode 100644 index 0000000000..2a79b7d902 --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql @@ -0,0 +1,7 @@ +/* --title 'Direct filesystem access progress (#)' --description 'Unsupported in Unity Catalog' */ +SELECT COUNT(*) AS counter +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type = "DirectFsAccess" + -- Redundant filter as a direct filesystem access is a failure by definition (see description above), + -- however, filter is defined for explicitness and as this knowledge is not "known" to this query. + AND SIZE(failures) > 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql b/src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql new file mode 100644 index 0000000000..a70028dc6b --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql @@ -0,0 +1,6 @@ +/* --title 'Query problem progress (#)' */ +SELECT COUNT(*) AS counter +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type = "QueryProblem" + -- Redundant filter as a query problem is a failure by definition, however, filter is defined for explicitness + AND SIZE(failures) > 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/01_6_percentage_pipeline_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_09_percentage_pipeline_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_6_percentage_pipeline_migration_progress.sql rename to 
src/databricks/labs/ucx/queries/progress/main/01_09_percentage_pipeline_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_7_percentage_policy_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_10_percentage_policy_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_7_percentage_policy_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_10_percentage_policy_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_8_distinct_failures_per_object_type.sql b/src/databricks/labs/ucx/queries/progress/main/01_11_distinct_failures_per_object_type.sql similarity index 67% rename from src/databricks/labs/ucx/queries/progress/main/01_8_distinct_failures_per_object_type.sql rename to src/databricks/labs/ucx/queries/progress/main/01_11_distinct_failures_per_object_type.sql index 00a229d02f..75cb3bcaf6 100644 --- a/src/databricks/labs/ucx/queries/progress/main/01_8_distinct_failures_per_object_type.sql +++ b/src/databricks/labs/ucx/queries/progress/main/01_11_distinct_failures_per_object_type.sql @@ -2,7 +2,7 @@ with failures AS ( SELECT object_type, explode(failures) AS failure FROM ucx_catalog.multiworkspace.objects_snapshot - WHERE object_type IN ('ClusterInfo', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'Table', 'Udf') + WHERE object_type IN ('ClusterInfo', 'DirectFsAccess', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'QueryProblem', 'Table', 'Udf', 'UsedTable') ) SELECT diff --git a/src/databricks/labs/ucx/queries/progress/main/02_4_migration_status_by_owner_overview.sql b/src/databricks/labs/ucx/queries/progress/main/02_4_migration_status_by_owner_overview.sql index a9d7a7591f..c4ff69b267 100644 --- a/src/databricks/labs/ucx/queries/progress/main/02_4_migration_status_by_owner_overview.sql +++ b/src/databricks/labs/ucx/queries/progress/main/02_4_migration_status_by_owner_overview.sql @@ -1,6 
+1,6 @@ /* --title 'Overview' --description 'Tables and views migration' --width 5 */ WITH migration_statuses AS ( - SELECT * + SELECT owner, failures FROM ucx_catalog.multiworkspace.objects_snapshot WHERE object_type = 'Table' ) diff --git a/src/databricks/labs/ucx/queries/progress/main/03_00_code.md b/src/databricks/labs/ucx/queries/progress/main/03_00_code.md new file mode 100644 index 0000000000..ca3fd81e2a --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_00_code.md @@ -0,0 +1,8 @@ +# Code + +This section shows Unity Catalog compatibility issues found while linting code. There are two kinds of code changes to +perform: +- Data asset references, like references to Hive metastore tables and views or direct filesystem access (dfsa). These + references should be updated to refer to their Unity Catalog counterparts. +- Linting compatibility issues, like using RDDs or directly accessing the Spark context. These issues should be resolved + by following the instructions stated with the issue. 
diff --git a/src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql b/src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql new file mode 100644 index 0000000000..d6388b41b3 --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql @@ -0,0 +1,4 @@ +/* --title 'Pending migration' --description 'Total number of table, view and dfsa references' --height 6 */ +SELECT COUNT(*) AS count +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type IN ('DirectFsAccess', 'UsedTable') AND SIZE(failures) > 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/03_02_data_asset_references_by_owner_bar_graph.sql b/src/databricks/labs/ucx/queries/progress/main/03_02_data_asset_references_by_owner_bar_graph.sql new file mode 100644 index 0000000000..3910fc0b06 --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_02_data_asset_references_by_owner_bar_graph.sql @@ -0,0 +1,24 @@ +/* +--title 'Pending migration' +--description 'Tables, views and dfsa per owner' +--width 5 +--overrides '{"spec": { + "version": 3, + "widgetType": "bar", + "encodings": { + "x": {"fieldName": "owner", "scale": {"type": "categorical"}, "displayName": "owner"}, + "y": {"fieldName": "count", "scale": {"type": "quantitative"}, "displayName": "count"} + } +}}' +*/ +WITH owners_with_failures AS ( + SELECT owner + FROM ucx_catalog.multiworkspace.objects_snapshot + WHERE object_type IN ('DirectFsAccess', 'UsedTable') AND SIZE(failures) > 0 +) + +SELECT + owner, + COUNT(1) AS count +FROM owners_with_failures +GROUP BY owner diff --git a/src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql b/src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql new file mode 100644 index 0000000000..689e2bfaf0 --- /dev/null +++ 
b/src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql @@ -0,0 +1,4 @@ +/* --title 'Migrated' --description 'Total number of table, view and dfsa references' --height 6 */ +SELECT COUNT(*) AS count +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type IN ('DirectFsAccess', 'UsedTable') AND SIZE(failures) == 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/03_04_data_asset_references_pending_migration_overview.sql b/src/databricks/labs/ucx/queries/progress/main/03_04_data_asset_references_pending_migration_overview.sql new file mode 100644 index 0000000000..1b14d7185b --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_04_data_asset_references_pending_migration_overview.sql @@ -0,0 +1,20 @@ +/* --title 'Overview' --description 'Table, view and dfsa migration' --width 5 */ +WITH migration_statuses AS ( + SELECT owner, object_type, failures + FROM ucx_catalog.multiworkspace.objects_snapshot + WHERE object_type IN ('DirectFsAccess', 'UsedTable') +) + +SELECT + owner, + CASE + WHEN object_type = 'DirectFsAccess' THEN 'Direct filesystem access' + WHEN object_type = 'UsedTable' THEN 'Table or view reference' + ELSE object_type + END AS object_type, + DOUBLE(CEIL(100 * COUNT_IF(SIZE(failures) = 0) / SUM(COUNT(*)) OVER (PARTITION BY owner, object_type), 2)) AS percentage, + COUNT(*) AS total, + COUNT_IF(SIZE(failures) = 0) AS total_migrated, + COUNT_IF(SIZE(failures) > 0) AS total_not_migrated +FROM migration_statuses +GROUP BY owner, object_type diff --git a/src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql b/src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql new file mode 100644 index 0000000000..0db6a1bc8c --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql @@ -0,0 +1,43 @@ +/* +--title 'Data asset references' +--width 6 +--overrides 
'{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "workspace_id", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "workspace_id"}, + {"fieldName": "object_type", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "object_type"}, + {"fieldName": "object_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "object_id"}, + {"fieldName": "failure", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "failure"}, + {"fieldName": "is_read", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "is_read"}, + {"fieldName": "is_write", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "is_write"} + ]}, + "invisibleColumns": [ + {"name": "link", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "link"} + ] + }}' +*/ +SELECT + workspace_id, + owner, + CASE + WHEN object_type = 'DirectFsAccess' THEN 'Direct filesystem access' + WHEN object_type = 'UsedTable' THEN 'Table or view reference' + ELSE object_type + END AS object_type, + CASE + WHEN object_type = 'DirectFsAccess' THEN data.path + WHEN object_type = 'UsedTable' THEN CONCAT_WS('.', object_id) + ELSE CONCAT_WS('.', object_id) + END AS object_id, + EXPLODE(failures) AS failure, + CAST(data.is_read AS BOOLEAN) AS is_read, + CAST(data.is_write AS BOOLEAN) AS is_write, + -- Below are invisible column(s) used in links url templates + CASE + -- SQL queries do NOT point to the workspace, i.e. 
start with '/' + WHEN object_type = 'DirectFsAccess' AND SUBSTRING(data.source_id, 0, 1) != '/' THEN CONCAT('/sql/editor/', data.source_id) + ELSE CONCAT('/#workspace', data.source_id) + END AS link +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type IN ('DirectFsAccess', 'UsedTable') +ORDER BY workspace_id, owner, object_type, object_id diff --git a/src/databricks/labs/ucx/queries/progress/main/03_06_code_compatibility_issues.sql b/src/databricks/labs/ucx/queries/progress/main/03_06_code_compatibility_issues.sql new file mode 100644 index 0000000000..1c623da8fd --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_06_code_compatibility_issues.sql @@ -0,0 +1,29 @@ +/* +--title 'Code compatibility issues' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "workspace_id", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "workspace_id"}, + {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, + {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, + {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard"}, + {"fieldName": "query_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/editor/{{ query_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query"} + ]}, + "invisibleColumns": [ + {"name": "dashboard_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": 
"dashboard_id"}, + {"name": "query_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_id"} + ] + }}' +*/ +SELECT + workspace_id, + data.code, + data.message, + data.dashboard_name, + data.query_name, + -- Below are invisible columns used in links url templates + data.dashboard_id, + data.query_id +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type = 'QueryProblem' From 9acea863c9722102754e8e289341d891057005f9 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 15 Nov 2024 13:24:28 +0100 Subject: [PATCH 2/2] Add integration test --- .../queries/test_migration_progress.py | 283 +++++++++++++++++- 1 file changed, 273 insertions(+), 10 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index f3596a6777..10391c7936 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -16,7 +16,10 @@ from databricks.labs.ucx.hive_metastore.udfs import Udf from databricks.labs.ucx.progress.install import ProgressTrackingInstallation from databricks.labs.ucx.progress.workflow_runs import WorkflowRun +from databricks.labs.ucx.source_code.base import DirectFsAccess, LineageAtom from databricks.labs.ucx.source_code.jobs import JobProblem +from databricks.labs.ucx.source_code.queries import QueryProblem +from databricks.labs.ucx.source_code.used_table import UsedTable from ..conftest import MockRuntimeContext @@ -188,6 +191,110 @@ def policies() -> list[PolicyInfo]: return records +@pytest.fixture +def query_problems(make_dashboard, make_query) -> list[QueryProblem]: + dashboard, query = make_dashboard(), make_query() + records = [ + QueryProblem( + dashboard.id, + dashboard.parent, + dashboard.name, + query.id, + query.parent, + query.name, + 
"sql-parse-error", + "Could not parse SQL", + ) + ] + return records + + +@pytest.fixture +def dfsas(make_workspace_file, make_query) -> list[DirectFsAccess]: + workspace_file = make_workspace_file(content='df = spark.read.csv("dbfs://folder/file.csv")') + query = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`") + records = [ + DirectFsAccess( + path="dbfs://folder/file.csv", + is_read=False, + # Technically, the mocked code is reading the path, but marking it as write allows us to set the owner to + # the current user, which we can test below. + is_write=True, + source_id=str(workspace_file), + source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}), + LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"), + LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"), + LineageAtom(object_type="FILE", object_id=str(workspace_file)), + ], + assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), + assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), + ), + DirectFsAccess( + path="dbfs://folder/file.csv", + is_read=False, + # Technically, the mocked code is reading the path, but marking it as write allows us to set the owner to + # the current user, which we can test below. 
+ is_write=True, + source_id=query.id, + source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="DASHBOARD", object_id="my_dashboard_id", other={"name": "my_dashboard"}), + LineageAtom(object_type="QUERY", object_id=f"my_dashboard_id/{query.id}", other={"name": "my_query"}), + ], + assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), + assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), + ), + ] + return records + + +@pytest.fixture +def used_tables(make_workspace_file, make_table) -> list[UsedTable]: + table = make_table(catalog_name="hive_metastore") + workspace_file = make_workspace_file(content=f'df = spark.read.table("{table.full_name}")\ndisplay(df)') + records = [ + UsedTable( + catalog_name=table.catalog_name, # This table is pending migration + schema_name=table.schema_name, + table_name=table.name, + is_read=False, + # Technically, the mocked code is reading the table, but marking it as write allows us to set the owner to + # the current user, which we can test below. 
+ is_write=True, + source_id=str(workspace_file), + source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}), + LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"), + LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"), + LineageAtom(object_type="FILE", object_id=str(workspace_file)), + ], + assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), + assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), + ), + UsedTable( + catalog_name="catalog", # This table is migrated + schema_name="staff_db", + table_name="employees", + is_read=False, + is_write=True, + source_id=str(make_workspace_file()), + source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}), + LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"), + LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"), + LineageAtom(object_type="FILE", object_id="my file_path"), + ], + assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), + assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), + ), + ] + return records + + @pytest.fixture def catalog_populated( # pylint: disable=too-many-arguments runtime_ctx: MockRuntimeContext, @@ -201,6 +308,9 @@ def catalog_populated( # pylint: disable=too-many-arguments clusters: list[ClusterInfo], pipelines: list[PipelineInfo], policies: list[PolicyInfo], + query_problems: list[QueryProblem], + dfsas: list[DirectFsAccess], + used_tables: list[UsedTable], ): """Populate the UCX catalog with multiworkspace tables. 
@@ -235,6 +345,13 @@ def catalog_populated( # pylint: disable=too-many-arguments Grant, mode='overwrite', ) + # Persist UsedTable to match when looking for UsedTable ownership to tables + runtime_ctx.sql_backend.save_table( + f'hive_metastore.{runtime_ctx.inventory_database}.used_tables_in_paths', + used_tables, + UsedTable, + mode='overwrite', + ) for parent_run_id in range(1, 3): # No changes in progress between the two runs runtime_ctx = runtime_ctx.replace(parent_run_id=parent_run_id) runtime_ctx.tables_progress.append_inventory_snapshot(tables) @@ -252,6 +369,12 @@ def catalog_populated( # pylint: disable=too-many-arguments del runtime_ctx.pipelines_progress runtime_ctx.policies_progress.append_inventory_snapshot(policies) del runtime_ctx.policies_progress + runtime_ctx.query_problem_progress.append_inventory_snapshot(query_problems) + del runtime_ctx.query_problem_progress + runtime_ctx.direct_filesystem_access_progress.append_inventory_snapshot(dfsas) + del runtime_ctx.direct_filesystem_access_progress + runtime_ctx.used_table_progress.append_inventory_snapshot(used_tables) + del runtime_ctx.used_table_progress return runtime_ctx.ucx_catalog @@ -290,22 +413,30 @@ def test_migration_progress_dashboard( @pytest.mark.parametrize( "query_name, rows", [ - ("01_0_percentage_migration_progress", [Row(percentage=round(100 * 22 / 34, 2))]), - ("01_1_percentage_udf_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), - ("01_2_percentage_grant_migration_progress", [Row(percentage=round(100 * 12 / 13, 2))]), - ("01_3_percentage_job_migration_progress", [Row(percentage=round(100 * 1 / 3, 2))]), - ("01_4_percentage_cluster_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), - ("01_5_percentage_table_migration_progress", [Row(percentage=round(100 * 5 / 10, 2))]), - ("01_6_percentage_pipeline_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), - ("01_7_percentage_policy_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + 
("01_00_percentage_migration_progress", [Row(percentage=round(100 * 23 / 39, 2))]), + ("01_01_percentage_udf_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + ("01_02_percentage_grant_migration_progress", [Row(percentage=round(100 * 12 / 13, 2))]), + ("01_03_percentage_job_migration_progress", [Row(percentage=round(100 * 1 / 3, 2))]), + ("01_04_percentage_cluster_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + ("01_05_percentage_table_migration_progress", [Row(percentage=round(100 * 5 / 10, 2))]), + ("01_06_percentage_used_table_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + ("01_07_count_direct_filesystem_access", [Row(counter=2)]), + ("01_08_count_query_problem", [Row(counter=1)]), + ("01_09_percentage_pipeline_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + ("01_10_percentage_policy_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), ( - "01_8_distinct_failures_per_object_type", + "01_11_distinct_failures_per_object_type", [ Row( object_type="ClusterInfo", count=1, failure="Uses azure service principal credentials config in cluster", ), + Row( + object_type="DirectFsAccess", + count=2, + failure="Direct filesystem access is not supported in Unity Catalog", + ), Row( object_type="Grant", count=1, @@ -327,8 +458,10 @@ def test_migration_progress_dashboard( count=1, failure="Uses azure service principal credentials config in policy", ), + Row(object_type="QueryProblem", count=1, failure="[sql-parse-error] Could not parse SQL"), Row(object_type="Table", count=5, failure="Pending migration"), Row(object_type="Udf", count=1, failure="UDF not supported by UC"), + Row(object_type="UsedTable", count=1, failure="Pending migration"), ], ), ( @@ -351,9 +484,21 @@ def test_migration_progress_dashboard( Row(owner="Eric", percentage=round(100 * 1 / 1, 2), total=1, total_migrated=1, total_not_migrated=0), ], ), + ( + "03_01_pending_migration_data_asset_references", + [ + Row(count=3), + ], + ), + ( + 
"03_03_migrated_data_asset_references", + [ + Row(count=1), + ], + ), ], ) -def test_percentage_migration_progress( +def test_migration_progress_query( dashboard_metadata: DashboardMetadata, sql_backend: SqlBackend, query_name, @@ -363,3 +508,121 @@ def test_percentage_migration_progress( assert len(datasets) == 1, f"Missing query: {query_name}" query_results = list(sql_backend.fetch(datasets[0].query)) assert query_results == rows + + +def test_migration_progress_query_data_asset_references_by_owner_bar_graph( + ws: WorkspaceClient, + dashboard_metadata: DashboardMetadata, + sql_backend: SqlBackend, +) -> None: + """Separate test is required to set the owner of the used table at runtime""" + query_name = "03_02_data_asset_references_by_owner_bar_graph" + rows = [Row(owner=ws.current_user.me().user_name, count=1)] + datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] + assert len(datasets) == 1, f"Missing query: {query_name}" + query_results = list(sql_backend.fetch(datasets[0].query)) + assert query_results == rows + + +def test_migration_progress_query_data_asset_references_pending_migration_overview( + ws: WorkspaceClient, + dashboard_metadata: DashboardMetadata, + sql_backend: SqlBackend, +) -> None: + """Separate test is required to set the owner of the used table at runtime""" + query_name = "03_04_data_asset_references_pending_migration_overview" + current_user = ws.current_user.me().user_name + rows = [ + Row( + owner=current_user, + object_type="Direct filesystem access", + percentage=0, + total=2, + total_migrated=0, + total_not_migrated=2, + ), + Row( + owner=current_user, + object_type="Table or view reference", + percentage=50, + total=2, + total_migrated=1, + total_not_migrated=1, + ), + ] + datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] + assert len(datasets) == 1, f"Missing query: {query_name}" + query_results = list(sql_backend.fetch(datasets[0].query)) + assert query_results == rows 
+ + +def test_migration_progress_query_data_asset_references_pending_migration( + ws: WorkspaceClient, + dashboard_metadata: DashboardMetadata, + sql_backend: SqlBackend, + dfsas: list[DirectFsAccess], + used_tables: list[UsedTable], +) -> None: + """Separate test is required to set the dfsas and used table dynamically""" + query_name = "03_05_data_asset_references_pending_migration" + workspace_id = ws.get_workspace_id() + current_user = ws.current_user.me().user_name + rows = [] + for dfsa in dfsas: + link_prefix = "/sql/editor/" if dfsa.source_type == "QUERY" else "/#workspace" + row = Row( + workspace_id=workspace_id, + owner=current_user, + object_type="Direct filesystem access", + object_id=dfsa.path, + failure="Direct filesystem access is not supported in Unity Catalog", + is_read=False, + is_write=True, + link=f"{link_prefix}{dfsa.source_id}", + ) + rows.append(row) + for used_table in used_tables: + if used_table.catalog_name != "hive_metastore": + continue + row = Row( + workspace_id=workspace_id, + owner=current_user, + object_type="Table or view reference", + object_id=f"{used_table.catalog_name}.{used_table.schema_name}.{used_table.table_name}", + failure="Pending migration", + is_read=False, + is_write=True, + link=f"/#workspace{used_table.source_id}", + ) + rows.append(row) + datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] + assert len(datasets) == 1, f"Missing query: {query_name}" + query_results = list(sql_backend.fetch(datasets[0].query)) + assert query_results == rows + + +def test_migration_progress_code_compatibility_issues( + ws: WorkspaceClient, + dashboard_metadata: DashboardMetadata, + sql_backend: SqlBackend, + query_problems: list[QueryProblem], +) -> None: + """Separate test is required to set the dashboard and query id dynamically""" + query_name = "03_06_code_compatibility_issues" + workspace_id = ws.get_workspace_id() + rows = [] + for query_problem in query_problems: + row = Row( + 
workspace_id=workspace_id, + code="sql-parse-error", + message="Could not parse SQL", + dashboard_name=query_problem.dashboard_name, + query_name=query_problem.query_name, + dashboard_id=query_problem.dashboard_id, + query_id=query_problem.query_id, + ) + rows.append(row) + datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] + assert len(datasets) == 1, f"Missing query: {query_name}" + query_results = list(sql_backend.fetch(datasets[0].query)) + assert query_results == rows