Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect tables that are not present in the mapping file #2205

Merged
merged 18 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/databricks/labs/ucx/hive_metastore/table_migrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,15 @@ def __init__(
self._seen_tables: dict[str, str] = {}
self._principal_grants = principal_grants

def get_remaining_tables(self) -> list[Table]:
self.index_full_refresh()
table_rows = []
for crawled_table in self._tc.snapshot():
if not self._is_migrated(crawled_table.database, crawled_table.name):
table_rows.append(crawled_table)
aminmovahed-db marked this conversation as resolved.
Show resolved Hide resolved
logger.warning(f"remained-hive-metastore-table: {crawled_table.key}")
return table_rows

def index(self):
return self._migration_status_refresher.index()

Expand Down Expand Up @@ -490,3 +499,7 @@ def _sql_alter_from(self, table: Table, target_table_key: str, ws_id: int):
f"('upgraded_from' = '{source}'"
f" , '{table.UPGRADED_FROM_WS_PARAM}' = '{ws_id}');"
)

def _is_migrated(self, schema: str, table: str) -> bool:
aminmovahed-db marked this conversation as resolved.
Show resolved Hide resolved
index = self._migration_status_refresher.index()
aminmovahed-db marked this conversation as resolved.
Show resolved Hide resolved
return index.is_migrated(schema, table)
20 changes: 10 additions & 10 deletions src/databricks/labs/ucx/hive_metastore/workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ def migrate_views(self, ctx: RuntimeContext):
)

@job_task(job_cluster="table_migration", depends_on=[migrate_views])
def refresh_migration_status(self, ctx: RuntimeContext):
def update_migration_status(self, ctx: RuntimeContext):
"""Refresh the migration status to present it in the dashboard."""
ctx.tables_migrator.index_full_refresh()
ctx.tables_migrator.get_remaining_tables()


class MigrateHiveSerdeTablesInPlace(Workflow):
Expand Down Expand Up @@ -109,9 +109,9 @@ def migrate_views(self, ctx: RuntimeContext):
)

@job_task(job_cluster="table_migration", depends_on=[migrate_views])
def refresh_migration_status(self, ctx: RuntimeContext):
def update_migration_status(self, ctx: RuntimeContext):
"""Refresh the migration status to present it in the dashboard."""
ctx.tables_migrator.index_full_refresh()
ctx.tables_migrator.get_remaining_tables()


class MigrateExternalTablesCTAS(Workflow):
Expand Down Expand Up @@ -159,9 +159,9 @@ def migrate_views(self, ctx: RuntimeContext):
)

@job_task(job_cluster="table_migration", depends_on=[migrate_views])
def refresh_migration_status(self, ctx: RuntimeContext):
def update_migration_status(self, ctx: RuntimeContext):
"""Refresh the migration status to present it in the dashboard."""
ctx.tables_migrator.index_full_refresh()
ctx.tables_migrator.get_remaining_tables()


class ScanTablesInMounts(Workflow):
Expand All @@ -176,9 +176,9 @@ def scan_tables_in_mounts_experimental(self, ctx: RuntimeContext):
ctx.tables_in_mounts.snapshot()

@job_task(job_cluster="table_migration", depends_on=[scan_tables_in_mounts_experimental])
def refresh_migration_status(self, ctx: RuntimeContext):
def update_migration_status(self, ctx: RuntimeContext):
"""Refresh the migration status to present it in the dashboard."""
ctx.tables_migrator.index_full_refresh()
ctx.tables_migrator.get_remaining_tables()


class MigrateTablesInMounts(Workflow):
Expand All @@ -191,6 +191,6 @@ def migrate_tables_in_mounts_experimental(self, ctx: RuntimeContext):
ctx.tables_migrator.migrate_tables(what=What.TABLE_IN_MOUNT)

@job_task(job_cluster="table_migration", depends_on=[migrate_tables_in_mounts_experimental])
def refresh_migration_status(self, ctx: RuntimeContext):
def update_migration_status(self, ctx: RuntimeContext):
"""Refresh the migration status to present it in the dashboard."""
ctx.tables_migrator.index_full_refresh()
ctx.tables_migrator.get_remaining_tables()
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/* --title 'List of remaining tables in HMS' --type table --width 6 */
SELECT
SUBSTRING(message, LENGTH('remained-hive-metastore-table: ') + 1) AS message
FROM inventory.logs
WHERE
message LIKE 'remained-hive-metastore-table: %'
33 changes: 33 additions & 0 deletions tests/integration/hive_metastore/test_workflows.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from databricks.sdk.errors import NotFound
from databricks.labs.ucx.hive_metastore.tables import Table


@pytest.mark.parametrize(
Expand Down Expand Up @@ -97,3 +98,35 @@ def test_hiveserde_table_ctas_migration_job(ws, installation_ctx, prepare_tables
assert ws.tables.get(f"{dst_schema.catalog_name}.{dst_schema.name}.{table.name}").name
except NotFound:
assert False, f"{table.name} not found in {dst_schema.catalog_name}.{dst_schema.name}"


@pytest.mark.parametrize('prepare_tables_for_migration', ['regular'], indirect=True)
def test_table_migration_job_publishes_remianed_tables(
ws, installation_ctx, sql_backend, prepare_tables_for_migration, caplog
):
tables, dst_schema = prepare_tables_for_migration
aminmovahed-db marked this conversation as resolved.
Show resolved Hide resolved
installation_ctx.workspace_installation.run()
second_table = list(tables.values())[1]
table = Table(
"hive_metastore",
dst_schema.name,
second_table.name,
object_type="UNKNOWN",
table_format="UNKNOWN",
)
installation_ctx.table_mapping.skip_table_or_view(dst_schema.name, second_table.name, load_table=lambda *_: table)
installation_ctx.deployed_workflows.run_workflow("migrate-tables")
assert installation_ctx.deployed_workflows.validate_step("migrate-tables")

remained_tables = list(
sql_backend.fetch(
aminmovahed-db marked this conversation as resolved.
Show resolved Hide resolved
f"""
SELECT
SUBSTRING(message, LENGTH('remained-hive-metastore-table: ') + 1)
AS message
FROM {installation_ctx.inventory_database}.logs
WHERE message LIKE 'remained-hive-metastore-table: %'
"""
)
)
assert remained_tables[0].message == f'hive_metastore.{dst_schema.name}.{second_table.name}'
62 changes: 62 additions & 0 deletions tests/unit/hive_metastore/test_table_migrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1349,3 +1349,65 @@ def test_revert_migrated_tables_failed(caplog):
table_migrate = get_table_migrator(backend)
table_migrate.revert_migrated_tables(schema="test_schema1")
assert "Failed to revert table hive_metastore.test_schema1.test_table1: error" in caplog.text


def test_refresh_migration_status_published_remained_tables(caplog):
aminmovahed-db marked this conversation as resolved.
Show resolved Hide resolved
backend = MockBackend()
table_crawler = create_autospec(TablesCrawler)
grant_crawler = create_autospec(GrantsCrawler)
client = mock_workspace_client()
table_crawler.snapshot.return_value = [
Table(
object_type="EXTERNAL",
table_format="DELTA",
catalog="hive_metastore",
database="schema1",
name="table1",
location="s3://some_location/table1",
upgraded_to="ucx_default.db1_dst.dst_table1",
),
Table(
object_type="EXTERNAL",
table_format="DELTA",
catalog="hive_metastore",
database="schema1",
name="table2",
location="s3://some_location/table2",
upgraded_to="ucx_default.db1_dst.dst_table2",
),
Table(
object_type="EXTERNAL",
table_format="DELTA",
catalog="hive_metastore",
database="schema1",
name="table3",
location="s3://some_location/table3",
),
]
group_manager = GroupManager(backend, client, "inventory_database")
table_mapping = mock_table_mapping()
migration_status_refresher = create_autospec(MigrationStatusRefresher)
migration_index = MigrationIndex(
[
MigrationStatus("schema1", "table1", "ucx_default", "db1_dst", "dst_table1"),
MigrationStatus("schema1", "table2", "ucx_default", "db1_dst", "dst_table2"),
]
)
migration_status_refresher.index.return_value = migration_index
principal_grants = create_autospec(PrincipalACL)
table_migrate = TablesMigrator(
table_crawler,
grant_crawler,
client,
backend,
table_mapping,
group_manager,
migration_status_refresher,
principal_grants,
)
with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.hive_metastore"):
tables = table_migrate.get_remaining_tables()
assert 'remained-hive-metastore-table: hive_metastore.schema1.table3' in caplog.messages
assert len(tables) == 1 and tables[0].key == "hive_metastore.schema1.table3"
grant_crawler.assert_not_called()
principal_grants.assert_not_called()
6 changes: 2 additions & 4 deletions tests/unit/hive_metastore/test_workflows.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import pytest

from databricks.labs.ucx.hive_metastore.workflows import (
TableMigration,
MigrateExternalTablesCTAS,
Expand Down Expand Up @@ -64,9 +63,8 @@ def test_migrate_ctas_views(run_workflow):
MigrateTablesInMounts,
],
)
def test_refresh_migration_status_is_refreshed(run_workflow, workflow):
def test_update_migration_status(run_workflow, workflow):
"""Migration status is refreshed by deleting and showing new tables"""
ctx = run_workflow(getattr(workflow, "refresh_migration_status"))
ctx = run_workflow(getattr(workflow, "update_migration_status"))
assert "TRUNCATE TABLE hive_metastore.ucx.migration_status" in ctx.sql_backend.queries
assert "SHOW DATABASES" in ctx.sql_backend.queries
# No "SHOW TABLE FROM" query as table are not mocked
Loading