Add OpenLineage support to BigQueryToGCSOperator (apache#35660)
kacpermuda authored Nov 17, 2023
1 parent 02165c5 commit ce16963
Showing 5 changed files with 571 additions and 1 deletion.
75 changes: 74 additions & 1 deletion airflow/providers/google/cloud/transfers/bigquery_to_gcs.py
@@ -29,6 +29,7 @@
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook, BigQueryJob
from airflow.providers.google.cloud.links.bigquery import BigQueryTableLink
from airflow.providers.google.cloud.triggers.bigquery import BigQueryInsertJobTrigger
from airflow.utils.helpers import merge_dicts

if TYPE_CHECKING:
from google.api_core.retry import Retry
@@ -139,6 +140,8 @@ def __init__(
self.hook: BigQueryHook | None = None
self.deferrable = deferrable

self._job_id: str = ""

@staticmethod
def _handle_job_error(job: BigQueryJob | UnknownJob) -> None:
if job.error_result:
@@ -240,6 +243,7 @@ def execute(self, context: Context):
f"Or, if you want to reattach in this scenario add {job.state} to `reattach_states`"
)

self._job_id = job.job_id
conf = job.to_api_repr()["configuration"]["extract"]["sourceTable"]
dataset_id, project_id, table_id = conf["datasetId"], conf["projectId"], conf["tableId"]
BigQueryTableLink.persist(
@@ -255,7 +259,7 @@
timeout=self.execution_timeout,
trigger=BigQueryInsertJobTrigger(
conn_id=self.gcp_conn_id,
-                    job_id=job_id,
+                    job_id=self._job_id,
project_id=self.project_id or self.hook.project_id,
),
method_name="execute_complete",
@@ -276,3 +280,72 @@ def execute_complete(self, context: Context, event: dict[str, Any]):
self.task_id,
event["message"],
)

def get_openlineage_facets_on_complete(self, task_instance):
"""Implementing on_complete as we will include final BQ job id."""
from pathlib import Path

from openlineage.client.facet import (
ExternalQueryRunFacet,
SymlinksDatasetFacet,
SymlinksDatasetFacetIdentifiers,
)
from openlineage.client.run import Dataset

from airflow.providers.google.cloud.hooks.gcs import _parse_gcs_url
from airflow.providers.google.cloud.utils.openlineage import (
get_facets_from_bq_table,
get_identity_column_lineage_facet,
)
from airflow.providers.openlineage.extractors import OperatorLineage

table_object = self.hook.get_client(self.hook.project_id).get_table(self.source_project_dataset_table)

input_dataset = Dataset(
namespace="bigquery",
name=str(table_object.reference),
facets=get_facets_from_bq_table(table_object),
)

output_dataset_facets = {
"schema": input_dataset.facets["schema"],
"columnLineage": get_identity_column_lineage_facet(
field_names=[field.name for field in table_object.schema], input_datasets=[input_dataset]
),
}
output_datasets = []
for uri in sorted(self.destination_cloud_storage_uris):
bucket, blob = _parse_gcs_url(uri)
additional_facets = {}

if "*" in blob:
                # If a wildcard ("*") is used in the GCS path, we want the dataset name to be the
                # directory name, but we create a symlink to the full object path with the wildcard.
additional_facets = {
"symlink": SymlinksDatasetFacet(
identifiers=[
SymlinksDatasetFacetIdentifiers(
namespace=f"gs://{bucket}", name=blob, type="file"
)
]
),
}
blob = Path(blob).parent.as_posix()
if blob == ".":
                    # the blob path has no leading slash, but the root dataset name must be "/"
blob = "/"

dataset = Dataset(
namespace=f"gs://{bucket}",
name=blob,
facets=merge_dicts(output_dataset_facets, additional_facets),
)
output_datasets.append(dataset)

run_facets = {}
if self._job_id:
run_facets = {
"externalQuery": ExternalQueryRunFacet(externalQueryId=self._job_id, source="bigquery"),
}

return OperatorLineage(inputs=[input_dataset], outputs=output_datasets, run_facets=run_facets)
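
For context, a minimal sketch (not part of the commit; the URI is hypothetical) of how the wildcard handling above derives the output dataset name from a destination GCS path:

    from pathlib import Path

    uri = "gs://my-bucket/exports/part-*.csv"  # hypothetical destination URI
    bucket, blob = uri.removeprefix("gs://").split("/", 1)

    name = blob
    if "*" in blob:
        # the dataset is named after the parent directory; the operator above
        # additionally records the wildcard path itself as a symlink facet
        name = Path(blob).parent.as_posix()  # -> "exports"
        if name == ".":
            name = "/"  # a wildcard at the bucket root maps to the root dataset "/"

    print(f"namespace=gs://{bucket}, name={name}")  # namespace=gs://my-bucket, name=exports

Note that the externalQuery run facet is attached only when execute captured a job id, which is why the id is now stored on self._job_id rather than in a local variable.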
80 changes: 80 additions & 0 deletions airflow/providers/google/cloud/utils/openlineage.py
@@ -0,0 +1,80 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""This module contains code related to OpenLineage and lineage extraction."""
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from openlineage.client.facet import (
ColumnLineageDatasetFacet,
ColumnLineageDatasetFacetFieldsAdditional,
ColumnLineageDatasetFacetFieldsAdditionalInputFields,
DocumentationDatasetFacet,
SchemaDatasetFacet,
SchemaField,
)

if TYPE_CHECKING:
from google.cloud.bigquery.table import Table
from openlineage.client.run import Dataset


def get_facets_from_bq_table(table: Table) -> dict[Any, Any]:
"""Get facets from BigQuery table object."""
facets = {
"schema": SchemaDatasetFacet(
fields=[
SchemaField(name=field.name, type=field.field_type, description=field.description)
for field in table.schema
]
),
"documentation": DocumentationDatasetFacet(description=table.description or ""),
}

return facets


def get_identity_column_lineage_facet(
field_names: list[str],
input_datasets: list[Dataset],
) -> ColumnLineageDatasetFacet:
"""
Get column lineage facet.

    Simple lineage will be created, where each source column corresponds to a single destination column
    in each input dataset and no transformations are made.
"""
if field_names and not input_datasets:
        raise ValueError("When providing `field_names`, you must provide at least one `input_dataset`.")

column_lineage_facet = ColumnLineageDatasetFacet(
fields={
field: ColumnLineageDatasetFacetFieldsAdditional(
inputFields=[
ColumnLineageDatasetFacetFieldsAdditionalInputFields(
namespace=dataset.namespace, name=dataset.name, field=field
)
for dataset in input_datasets
],
transformationType="IDENTITY",
transformationDescription="identical",
)
for field in field_names
}
)
return column_lineage_facet
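
A short usage sketch (the table name and column names are hypothetical) showing what the identity lineage facet produced above contains:

    from openlineage.client.run import Dataset

    source = Dataset(namespace="bigquery", name="my-project.my_dataset.my_table")  # hypothetical
    facet = get_identity_column_lineage_facet(field_names=["id", "name"], input_datasets=[source])

    # Each destination column points back to the identically named source column:
    # facet.fields["id"].inputFields[0].field == "id"
    # facet.fields["id"].transformationType == "IDENTITY"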
1 change: 1 addition & 0 deletions docs/spelling_wordlist.txt
@@ -272,6 +272,7 @@ codepoints
Colour
colour
colours
ColumnLineageDatasetFacet
CommandType
comparator
compat