From cba73670941d9ec9f011ab7ac50d14f61b846e1d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sun, 14 Jul 2024 10:34:27 -0700 Subject: [PATCH] feat(geospatial): use geoarrow extension types when returning geometry columns as pyarrow (#9549) Co-authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com> --- ibis/formats/pyarrow.py | 71 +++++++++++++++++++++++++++++- ibis/formats/tests/test_pyarrow.py | 56 ++++++++++++++++++++++- poetry.lock | 18 +++++++- pyproject.toml | 4 +- requirements-dev.txt | 1 + 5 files changed, 144 insertions(+), 6 deletions(-) diff --git a/ibis/formats/pyarrow.py b/ibis/formats/pyarrow.py index 94610a533e14..ac7f9ab8b376 100644 --- a/ibis/formats/pyarrow.py +++ b/ibis/formats/pyarrow.py @@ -113,6 +113,54 @@ def to_ibis(cls, typ: pa.DataType, nullable=True) -> dt.DataType: return dt.Map(key_dtype, value_dtype, nullable=nullable) elif pa.types.is_dictionary(typ): return cls.to_ibis(typ.value_type) + elif ( + isinstance(typ, pa.ExtensionType) + and type(typ).__module__ == "geoarrow.types.type_pyarrow" + ): + from geoarrow import types as gat + + gat.type_pyarrow.register_extension_types() + + auth_code = None + if typ.crs is not None: + crs_dict = typ.crs.to_json_dict() + if "id" in crs_dict: + crs_id = crs_dict["id"] + if "authority" in crs_id and "code" in crs_id: + auth_code = (crs_id["authority"], crs_id["code"]) + + if typ.crs is not None and auth_code is None: + # It is possible to have PROJJSON that does not have an authority/code + # attached because the producer didn't have that information + # (e.g., because they were reading an older shapefile). In this case, + # pyproj can often guess the authority/code. 
+ import pyproj + + auth_code = pyproj.CRS(typ.crs.to_json()).to_authority() + if auth_code is None: + raise ValueError(f"Can't resolve SRID of crs {typ.crs}") + + if auth_code is None: + srid = None + elif auth_code == ("OGC", "CRS84"): + # OGC:CRS84 and EPSG:4326 are identical except for the order of + # coordinates (i.e., lon lat vs. lat lon) in their official definition. + # This axis ordering is ignored in all but the most obscure scenarios + # such that these are identical. OGC:CRS84 is more correct, but EPSG:4326 + # is more common. + srid = 4326 + else: + # This works because the two most common srid authorities are EPSG and ESRI + # and the "codes" are all integers and don't intersect with each other on + # purpose. This won't scale to something like OGC:CRS27 (not common). + srid = int(auth_code[1]) + + if typ.edge_type == gat.EdgeType.SPHERICAL: + geotype = "geography" + else: + geotype = "geometry" + + return dt.GeoSpatial(geotype, srid, nullable) else: return _from_pyarrow_types()[typ](nullable=nullable) @@ -175,7 +223,28 @@ def from_ibis(cls, dtype: dt.DataType) -> pa.DataType: ) return pa.map_(key_field, value_field, keys_sorted=False) elif dtype.is_geospatial(): - return pa.binary() + from geoarrow import types as gat + + # Resolve CRS + if dtype.srid is None: + crs = None + elif dtype.srid == 4326: + crs = gat.OGC_CRS84 + else: + import pyproj + + # Assume that these are EPSG codes. An srid is more accurately a key + # into a backend/connection-specific lookup table; however, most usage + # should work with this assumption. 
+ crs = pyproj.CRS(f"EPSG:{dtype.srid}") + + # Resolve edge type + if dtype.geotype == "geography": + edge_type = gat.EdgeType.SPHERICAL + else: + edge_type = gat.EdgeType.PLANAR + + return gat.wkb(crs=crs, edge_type=edge_type).to_pyarrow() else: try: return _to_pyarrow_types()[type(dtype)] diff --git a/ibis/formats/tests/test_pyarrow.py b/ibis/formats/tests/test_pyarrow.py index 015923af4ca5..00a57480a26e 100644 --- a/ibis/formats/tests/test_pyarrow.py +++ b/ibis/formats/tests/test_pyarrow.py @@ -188,5 +188,57 @@ def test_unknown_dtype_gets_converted_to_string(): pytest.param(dt.multipolygon, id="multipolygon"), ], ) -def test_geo_gets_converted_to_binary(ibis_type): - assert PyArrowType.from_ibis(ibis_type) == pa.binary() +def test_geo_gets_converted_to_geoarrow(ibis_type): + type_pyarrow = pytest.importorskip("geoarrow.types.type_pyarrow") + + assert isinstance( + PyArrowType.from_ibis(ibis_type), type_pyarrow.GeometryExtensionType + ) + + +def test_geoarrow_gets_converted_to_geo(): + gat = pytest.importorskip("geoarrow.types") + + pyarrow_type = gat.wkb().to_pyarrow() + ibis_type = PyArrowType.to_ibis(pyarrow_type) + assert ibis_type.is_geospatial() + assert ibis_type.geotype == "geometry" + assert ibis_type.srid is None + assert ibis_type.nullable is True + assert ibis_type.to_pyarrow() == pyarrow_type + + pyarrow_type = gat.wkb(edge_type=gat.EdgeType.SPHERICAL).to_pyarrow() + ibis_type = PyArrowType.to_ibis(pyarrow_type) + assert ibis_type.geotype == "geography" + assert ibis_type.to_pyarrow() == pyarrow_type + + ibis_type = PyArrowType.to_ibis(gat.wkb().to_pyarrow(), nullable=False) + assert ibis_type.nullable is False + + +def test_geoarrow_crs_gets_converted_to_geo(): + gat = pytest.importorskip("geoarrow.types") + pyproj = pytest.importorskip("pyproj") + + # Check the GeoArrow/GeoParquet standard representation of longitude/latitude + pyarrow_type = gat.wkb(crs=gat.OGC_CRS84).to_pyarrow() + ibis_type = PyArrowType.to_ibis(pyarrow_type) + assert 
ibis_type.srid == 4326 + assert ibis_type.to_pyarrow() == pyarrow_type + + # Check a standard representation of lon/lat that happens to be missing the + # explicit authority/code section of the PROJJSON (i.e., make pyproj guess + # the srid for us) + lonlat_crs = gat.OGC_CRS84.to_json_dict() + del lonlat_crs["id"] + pyarrow_type = gat.wkb(crs=lonlat_crs).to_pyarrow() + ibis_type = PyArrowType.to_ibis(pyarrow_type) + assert ibis_type.srid == 4326 + assert ibis_type.to_pyarrow() == pyarrow_type + + # Check a non-lon/lat CRS (e.g., UTM Zone 20N) + utm_20n = pyproj.CRS("EPSG:32620") + pyarrow_type = gat.wkb(crs=utm_20n).to_pyarrow() + ibis_type = PyArrowType.to_ibis(pyarrow_type) + assert ibis_type.srid == 32620 + assert ibis_type.to_pyarrow() == pyarrow_type diff --git a/poetry.lock b/poetry.lock index 12592b2bd259..9ae5afbf73d9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1887,6 +1887,20 @@ requests = "*" crc = ["crcmod"] gcsfuse = ["fusepy"] +[[package]] +name = "geoarrow-types" +version = "0.2.0" +description = "" +optional = true +python-versions = ">=3.8" +files = [ + {file = "geoarrow_types-0.2.0-py3-none-any.whl", hash = "sha256:b83bd7e4cee92356df1904bc681cd86938ae808778aef8f836d2dce8f234cb7e"}, + {file = "geoarrow_types-0.2.0.tar.gz", hash = "sha256:2dcb3db9c80b2079a7a61c3e74aa46904f1c899136735f1cacc015757707e924"}, +] + +[package.extras] +test = ["pyarrow", "pytest"] + [[package]] name = "geopandas" version = "1.0.1" @@ -7676,7 +7690,7 @@ duckdb = ["duckdb", "pyarrow", "pyarrow-hotfix"] examples = ["fsspec", "pins"] exasol = ["pyarrow", "pyarrow-hotfix", "pyexasol"] flink = ["pyarrow", "pyarrow-hotfix"] -geospatial = ["geopandas", "shapely"] +geospatial = ["geoarrow-types", "geopandas", "pyproj", "shapely"] impala = ["impyla", "pyarrow", "pyarrow-hotfix"] mssql = ["pyarrow", "pyarrow-hotfix", "pyodbc"] mysql = ["pyarrow", "pyarrow-hotfix", "pymysql"] @@ -7694,4 +7708,4 @@ visualization = ["graphviz"] [metadata] lock-version = "2.0" python-versions = "^3.10" 
-content-hash = "78c776cedc53934095a7266faddc2c32aa5b76a2b6285719df3055e58d9a67a3" +content-hash = "dfb7c483def7e2d0fded85c445aba1a47a725df201fd5429971124b986ff99c9" diff --git a/pyproject.toml b/pyproject.toml index f190c4e95b37..505b8bca7cf3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,8 @@ db-dtypes = { version = ">=0.3,<2", optional = true } deltalake = { version = ">=0.9.0,<1", optional = true } duckdb = { version = ">=0.8.1,<2", optional = true } geopandas = { version = ">=0.6,<2", optional = true } +geoarrow-types = { version = ">=0.2,<1", optional = true } +pyproj = { version = ">=3.3.0,<4", optional = true } google-cloud-bigquery = { version = ">=3,<4", optional = true } google-cloud-bigquery-storage = { version = ">=2,<3", optional = true } graphviz = { version = ">=0.16,<1", optional = true } @@ -173,7 +175,7 @@ visualization = ["graphviz"] decompiler = ["black"] deltalake = ["deltalake"] examples = ["pins", "fsspec"] -geospatial = ["geopandas", "shapely"] +geospatial = ["geoarrow-types", "geopandas", "pyproj", "shapely"] [tool.poetry.plugins."ibis.backends"] bigquery = "ibis.backends.bigquery" diff --git a/requirements-dev.txt b/requirements-dev.txt index 8dcc0da9c755..1134db74adb4 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -63,6 +63,7 @@ fqdn==1.5.1 ; python_version >= "3.10" and python_version < "3.13" frozenlist==1.4.1 ; python_version >= "3.10" and python_version < "4.0" fsspec==2024.6.1 ; python_version >= "3.10" and python_version < "4.0" gcsfs==2024.6.1 ; python_version >= "3.10" and python_version < "4.0" +geoarrow-types==0.2.0 ; python_version >= "3.10" and python_version < "4.0" geopandas==1.0.1 ; python_version >= "3.10" and python_version < "4.0" google-api-core==2.19.1 ; python_version >= "3.10" and python_version < "4.0" google-api-core[grpc]==2.19.1 ; python_version >= "3.10" and python_version < "4.0"