Skip to content

Commit

Permalink
feat(geospatial): use geoarrow extension types when returning geometr…
Browse files Browse the repository at this point in the history
…y columns as pyarrow (#9549)

Co-authored-by: Phillip Cloud <[email protected]>
  • Loading branch information
paleolimbot and cpcloud authored Jul 14, 2024
1 parent b54bcdb commit cba7367
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 6 deletions.
71 changes: 70 additions & 1 deletion ibis/formats/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,54 @@ def to_ibis(cls, typ: pa.DataType, nullable=True) -> dt.DataType:
return dt.Map(key_dtype, value_dtype, nullable=nullable)
elif pa.types.is_dictionary(typ):
return cls.to_ibis(typ.value_type)
elif (
isinstance(typ, pa.ExtensionType)
and type(typ).__module__ == "geoarrow.types.type_pyarrow"
):
from geoarrow import types as gat

gat.type_pyarrow.register_extension_types()

auth_code = None
if typ.crs is not None:
crs_dict = typ.crs.to_json_dict()
if "id" in crs_dict:
crs_id = crs_dict["id"]
if "authority" in crs_id and "code" in crs_id:
auth_code = (crs_id["authority"], crs_id["code"])

if typ.crs is not None and auth_code is None:
# It is possible to have PROJJSON that does not have an authority/code
# attached, for example because the producer didn't have that information
# (e.g., because they were reading an older shapefile). In this case,
# pyproj can often guess the authority/code.
import pyproj

auth_code = pyproj.CRS(typ.crs.to_json()).to_authority()
if auth_code is None:
raise ValueError(f"Can't resolve SRID of crs {typ.crs}")

if auth_code is None:
srid = None
elif auth_code == ("OGC", "CRS84"):
# OGC:CRS84 and EPSG:4326 are identical except for the order of
# coordinates (i.e., lon lat vs. lat lon) in their official definition.
# This axis ordering is ignored in all but the most obscure scenarios,
# so the two are treated as interchangeable here. OGC:CRS84 is more
# correct, but EPSG:4326 is more common.
srid = 4326
else:
# This works because the two most common srid authorities are EPSG and ESRI
# and the "codes" are all integers and don't intersect with each other on
# purpose. This won't scale to something like OGC:CRS27 (not common).
srid = int(auth_code[1])

if typ.edge_type == gat.EdgeType.SPHERICAL:
geotype = "geography"
else:
geotype = "geometry"

return dt.GeoSpatial(geotype, srid, nullable)
else:
return _from_pyarrow_types()[typ](nullable=nullable)

Expand Down Expand Up @@ -175,7 +223,28 @@ def from_ibis(cls, dtype: dt.DataType) -> pa.DataType:
)
return pa.map_(key_field, value_field, keys_sorted=False)
elif dtype.is_geospatial():
return pa.binary()
from geoarrow import types as gat

# Resolve CRS
if dtype.srid is None:
crs = None
elif dtype.srid == 4326:
crs = gat.OGC_CRS84
else:
import pyproj

# Assume that these are EPSG codes. An srid is more accurately a key
# into a backend/connection-specific lookup table; however, most usage
# should work with this assumption.
crs = pyproj.CRS(f"EPSG:{dtype.srid}")

# Resolve edge type
if dtype.geotype == "geography":
edge_type = gat.EdgeType.SPHERICAL
else:
edge_type = gat.EdgeType.PLANAR

return gat.wkb(crs=crs, edge_type=edge_type).to_pyarrow()
else:
try:
return _to_pyarrow_types()[type(dtype)]
Expand Down
56 changes: 54 additions & 2 deletions ibis/formats/tests/test_pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,5 +188,57 @@ def test_unknown_dtype_gets_converted_to_string():
pytest.param(dt.multipolygon, id="multipolygon"),
],
)
def test_geo_gets_converted_to_binary(ibis_type):
assert PyArrowType.from_ibis(ibis_type) == pa.binary()
def test_geo_gets_converted_to_geoarrow(ibis_type):
    """Check that a geospatial ibis dtype converts to a geoarrow extension type.

    The test is skipped when ``geoarrow.types.type_pyarrow`` cannot be imported.
    """
    geoarrow_pyarrow = pytest.importorskip("geoarrow.types.type_pyarrow")

    converted = PyArrowType.from_ibis(ibis_type)
    assert isinstance(converted, geoarrow_pyarrow.GeometryExtensionType)


def test_geoarrow_gets_converted_to_geo():
    """Check geoarrow WKB types convert to geospatial ibis dtypes and back.

    Covers the default (no CRS) type, the spherical-edge variant, and the
    ``nullable=False`` flag. Skipped when ``geoarrow.types`` is unavailable.
    """
    gat = pytest.importorskip("geoarrow.types")

    # Default WKB type: no CRS, non-spherical edges, nullable by default.
    default_wkb = gat.wkb().to_pyarrow()
    from_default = PyArrowType.to_ibis(default_wkb)
    assert from_default.is_geospatial()
    assert from_default.geotype == "geometry"
    assert from_default.srid is None
    assert from_default.nullable is True
    assert from_default.to_pyarrow() == default_wkb

    # Spherical edges are reported as the "geography" geotype.
    spherical_wkb = gat.wkb(edge_type=gat.EdgeType.SPHERICAL).to_pyarrow()
    from_spherical = PyArrowType.to_ibis(spherical_wkb)
    assert from_spherical.geotype == "geography"
    assert from_spherical.to_pyarrow() == spherical_wkb

    # The nullable argument is carried through to the resulting dtype.
    non_nullable = PyArrowType.to_ibis(gat.wkb().to_pyarrow(), nullable=False)
    assert non_nullable.nullable is False


def test_geoarrow_crs_gets_converted_to_geo():
    """Check that the CRS of a geoarrow type is reflected in the ibis srid.

    Skipped when either ``geoarrow.types`` or ``pyproj`` is unavailable.
    """
    gat = pytest.importorskip("geoarrow.types")
    pyproj = pytest.importorskip("pyproj")

    def check_srid_and_roundtrip(arrow_type, expected_srid):
        # Convert to ibis, verify the srid, then verify conversion back to
        # pyarrow compares equal to the type we started from.
        converted = PyArrowType.to_ibis(arrow_type)
        assert converted.srid == expected_srid
        assert converted.to_pyarrow() == arrow_type

    # Check the GeoArrow/GeoParquet standard representation of longitude/latitude
    check_srid_and_roundtrip(gat.wkb(crs=gat.OGC_CRS84).to_pyarrow(), 4326)

    # Check a standard representation of lon/lat that happens to be missing the
    # explicit authority/code section of the PROJJSON (i.e., make pyproj guess
    # the srid for us)
    crs_without_id = gat.OGC_CRS84.to_json_dict()
    del crs_without_id["id"]
    check_srid_and_roundtrip(gat.wkb(crs=crs_without_id).to_pyarrow(), 4326)

    # Check a non-lon/lat CRS (e.g., UTM Zone 20N)
    utm_zone_20n = pyproj.CRS("EPSG:32620")
    check_srid_and_roundtrip(gat.wkb(crs=utm_zone_20n).to_pyarrow(), 32620)
18 changes: 16 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ db-dtypes = { version = ">=0.3,<2", optional = true }
deltalake = { version = ">=0.9.0,<1", optional = true }
duckdb = { version = ">=0.8.1,<2", optional = true }
geopandas = { version = ">=0.6,<2", optional = true }
geoarrow-types = { version = ">=0.2,<1", optional = true }
pyproj = { version = ">=3.3.0,<4", optional = true }
google-cloud-bigquery = { version = ">=3,<4", optional = true }
google-cloud-bigquery-storage = { version = ">=2,<3", optional = true }
graphviz = { version = ">=0.16,<1", optional = true }
Expand Down Expand Up @@ -173,7 +175,7 @@ visualization = ["graphviz"]
decompiler = ["black"]
deltalake = ["deltalake"]
examples = ["pins", "fsspec"]
geospatial = ["geopandas", "shapely"]
geospatial = ["geoarrow-types", "geopandas", "pyproj", "shapely"]

[tool.poetry.plugins."ibis.backends"]
bigquery = "ibis.backends.bigquery"
Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ fqdn==1.5.1 ; python_version >= "3.10" and python_version < "3.13"
frozenlist==1.4.1 ; python_version >= "3.10" and python_version < "4.0"
fsspec==2024.6.1 ; python_version >= "3.10" and python_version < "4.0"
gcsfs==2024.6.1 ; python_version >= "3.10" and python_version < "4.0"
geoarrow-types==0.2.0 ; python_version >= "3.10" and python_version < "4.0"
geopandas==1.0.1 ; python_version >= "3.10" and python_version < "4.0"
google-api-core==2.19.1 ; python_version >= "3.10" and python_version < "4.0"
google-api-core[grpc]==2.19.1 ; python_version >= "3.10" and python_version < "4.0"
Expand Down

0 comments on commit cba7367

Please sign in to comment.