Skip to content

Commit

Permalink
resolve conflict
Browse files Browse the repository at this point in the history
  • Loading branch information
anshbansal committed Jan 15, 2025
2 parents f69ac8b + a3c7a33 commit 717c249
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 12 deletions.
8 changes: 1 addition & 7 deletions metadata-ingestion/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,7 @@ task modelDocUpload(type: Exec, dependsOn: [modelDocGen]) {


task lint(type: Exec, dependsOn: installDev) {
/*
The find/sed combo below is a temporary work-around for the following mypy issue with airflow 2.2.0:
"venv/lib/python3.8/site-packages/airflow/_vendor/connexion/spec.py:169: error: invalid syntax".
*/
commandLine 'bash', '-c',
"find ${venv_name}/lib -path *airflow/_vendor/connexion/spec.py -exec sed -i.bak -e '169,169s/ # type: List\\[str\\]//g' {} \\; && " +
"source ${venv_name}/bin/activate && set -x && " +
"black --check --diff src/ tests/ examples/ && " +
"isort --check --diff src/ tests/ examples/ && " +
Expand All @@ -124,8 +119,7 @@ task lintFix(type: Exec, dependsOn: installDev) {
"source ${venv_name}/bin/activate && set -x && " +
"black src/ tests/ examples/ && " +
"isort src/ tests/ examples/ && " +
"ruff check --fix src/ tests/ examples/ && " +
"mypy --show-traceback --show-error-codes src/ tests/ examples/"
"ruff check --fix src/ tests/ examples/"
}

def pytest_default_env = "PYTHONDEVMODE=1"
Expand Down
5 changes: 5 additions & 0 deletions metadata-ingestion/developing.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ cd metadata-ingestion-modules/gx-plugin
source venv/bin/activate
datahub version # should print "DataHub CLI version: unavailable (installed in develop mode)"
```

### (Optional) Set up your Python environment for developing on Dagster Plugin

From the repository root:
Expand All @@ -99,6 +100,7 @@ cd metadata-ingestion-modules/dagster-plugin
source venv/bin/activate
datahub version # should print "DataHub CLI version: unavailable (installed in develop mode)"
```

### Common setup issues

Common issues (click to expand):
Expand Down Expand Up @@ -188,6 +190,9 @@ mypy src/ tests/
or you can run from the root of the repository

```shell
./gradlew :metadata-ingestion:lint

# This will auto-fix some linting issues.
./gradlew :metadata-ingestion:lintFix
```

Expand Down
3 changes: 3 additions & 0 deletions metadata-ingestion/docs/sources/mssql/mssql_recipe.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ source:
username: user
password: pass

# Recommended to improve lineage quality. Lowercases URNs so that internal identifiers are matched case-insensitively.
convert_urns_to_lowercase: True

# Options
# Uncomment if you need to use encryption with pytds
# See https://python-tds.readthedocs.io/en/latest/pytds.html#pytds.connect
Expand Down
33 changes: 28 additions & 5 deletions metadata-ingestion/src/datahub/utilities/urns/_urn_base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import functools
import urllib.parse
from abc import abstractmethod
from typing import ClassVar, Dict, List, Optional, Type
from typing import ClassVar, Dict, List, Optional, Type, Union

from deprecated import deprecated
from typing_extensions import Self
Expand Down Expand Up @@ -86,12 +86,24 @@ def entity_ids(self) -> List[str]:
return self._entity_ids

@classmethod
def from_string(cls, urn_str: str) -> Self:
"""
Creates an Urn from its string representation.
def from_string(cls, urn_str: Union[str, "Urn"], /) -> Self:
"""Create an Urn from its string representation.
When called against the base Urn class, this method will return a more specific Urn type where possible.
>>> from datahub.metadata.urns import DatasetUrn, Urn
>>> urn_str = 'urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)'
>>> urn = Urn.from_string(urn_str)
>>> assert isinstance(urn, DatasetUrn)
When called against a specific Urn type (e.g. DatasetUrn.from_string), this method can
also be used for type narrowing.
>>> urn_str = 'urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)'
>>> assert DatasetUrn.from_string(urn_str)
Args:
urn_str: The string representation of the Urn.
urn_str: The string representation of the urn. Also accepts an existing Urn instance.
Returns:
Urn of the given string representation.
Expand All @@ -100,6 +112,17 @@ def from_string(cls, urn_str: str) -> Self:
InvalidUrnError: If the string representation is in invalid format.
"""

if isinstance(urn_str, Urn):
if issubclass(cls, _SpecificUrn) and isinstance(urn_str, cls):
# Fast path - we're already the right type.

# I'm not really sure why we need a type ignore here, but mypy doesn't really
# understand the isinstance check above.
return urn_str # type: ignore

# Fall through, so that we can convert a generic Urn to a specific Urn type.
urn_str = urn_str.urn()

# TODO: Add handling for url encoded urns e.g. urn%3A ...

if not urn_str.startswith("urn:li:"):
Expand Down
41 changes: 41 additions & 0 deletions metadata-ingestion/tests/unit/urns/test_urn.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@

import pytest

import datahub.utilities.urns._urn_base
from datahub.metadata.urns import (
CorpUserUrn,
DataPlatformUrn,
DatasetUrn,
SchemaFieldUrn,
Urn,
)
from datahub.testing.doctest import assert_doctest
from datahub.utilities.urns.error import InvalidUrnError

pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
Expand Down Expand Up @@ -87,6 +89,10 @@ def test_urn_type_dispatch_1() -> None:
with pytest.raises(InvalidUrnError, match="Passed an urn of type corpuser"):
DatasetUrn.from_string("urn:li:corpuser:foo")

urn2 = DatasetUrn.from_string(urn)
assert isinstance(urn2, DatasetUrn)
assert urn2 == urn


def test_urn_type_dispatch_2() -> None:
urn = "urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod),job_id)"
Expand All @@ -96,6 +102,41 @@ def test_urn_type_dispatch_2() -> None:
CorpUserUrn.from_string(urn)


def test_urn_type_dispatch_3() -> None:
    """Check that a generic Urn instance can be narrowed via from_string.

    Passing a generic dataset Urn to DatasetUrn.from_string should yield an
    equal DatasetUrn, while passing the same Urn to CorpUserUrn.from_string
    should raise InvalidUrnError.
    """
    # Build the urn through the base class constructor so it is a "generic" Urn.
    generic_urn = Urn("dataset", ["urn:li:dataPlatform:abc", "def", "PROD"])
    assert isinstance(generic_urn, Urn)

    narrowed_urn = DatasetUrn.from_string(generic_urn)
    assert isinstance(narrowed_urn, DatasetUrn)
    assert narrowed_urn == generic_urn

    expected_error = (
        "Passed an urn of type dataset to the from_string method of CorpUserUrn"
    )
    with pytest.raises(InvalidUrnError, match=expected_error):
        CorpUserUrn.from_string(generic_urn)


def test_urn_type_dispatch_4() -> None:
    """Check Urn.from_string for an entity type that has no specific Urn class.

    The parsed result should be a plain Urn (not a subclass), should round-trip
    back to the original string, and should be accepted again by
    Urn.from_string when passed as an Urn instance.
    """
    raw_urn = "urn:li:new_entity_type:(abc,def)"

    parsed = Urn.from_string(raw_urn)
    assert type(parsed) is Urn
    assert parsed == Urn("new_entity_type", ["abc", "def"])
    assert parsed.urn() == raw_urn

    # Feeding the Urn instance back in should give an equivalent plain Urn.
    reparsed = Urn.from_string(parsed)
    assert type(reparsed) is Urn
    assert reparsed == parsed
    assert reparsed.urn() == raw_urn


def test_urn_doctest() -> None:
    """Run assert_doctest against the datahub.utilities.urns._urn_base module."""
    module_under_test = datahub.utilities.urns._urn_base
    assert_doctest(module_under_test)


def _load_urns(file_name: pathlib.Path) -> List[str]:
urns = [
line.strip()
Expand Down

0 comments on commit 717c249

Please sign in to comment.