Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(geospatial): use geoarrow extension types when returning geometry columns as pyarrow #9549

Merged
merged 17 commits into from
Jul 14, 2024
Merged
71 changes: 70 additions & 1 deletion ibis/formats/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,54 @@
return dt.Map(key_dtype, value_dtype, nullable=nullable)
elif pa.types.is_dictionary(typ):
return cls.to_ibis(typ.value_type)
elif (
isinstance(typ, pa.ExtensionType)
and type(typ).__module__ == "geoarrow.types.type_pyarrow"
):
from geoarrow import types as gat

gat.type_pyarrow.register_extension_types()

auth_code = None
if typ.crs is not None:
crs_dict = typ.crs.to_json_dict()
if "id" in crs_dict:
crs_id = crs_dict["id"]
if "authority" in crs_id and "code" in crs_id:
auth_code = (crs_id["authority"], crs_id["code"])

if typ.crs is not None and auth_code is None:
# It is possible to have PROJJSON that does not have an authority/code
# attached, for example because the producer didn't have that information
# (e.g., because they were reading an older shapefile). In this case,
# pyproj can often guess the authority/code.
import pyproj

auth_code = pyproj.CRS(typ.crs.to_json()).to_authority()
if auth_code is None:
raise ValueError(f"Can't resolve SRID of crs {typ.crs}")

Check warning on line 141 in ibis/formats/pyarrow.py

View check run for this annotation

Codecov / codecov/patch

ibis/formats/pyarrow.py#L141

Added line #L141 was not covered by tests

if auth_code is None:
srid = None
elif auth_code == ("OGC", "CRS84"):
# OGC:CRS84 and EPSG:4326 are identical except for the order of
# coordinates (i.e., lon lat vs. lat lon) in their official definition.
# This axis ordering is ignored in all but the most obscure scenarios
# such that these are identical. OGC:CRS84 is more correct, but EPSG:4326
# is more common.
srid = 4326
else:
# This works because the two most common srid authorities are EPSG and ESRI
# and the "codes" are all integers and don't intersect with each other on
# purpose. This won't scale to something like OGC:CRS27 (not common).
srid = int(auth_code[1])

if typ.edge_type == gat.EdgeType.SPHERICAL:
geotype = "geography"
else:
geotype = "geometry"

return dt.GeoSpatial(geotype, srid, nullable)
else:
return _from_pyarrow_types()[typ](nullable=nullable)

Expand Down Expand Up @@ -175,7 +223,28 @@
)
return pa.map_(key_field, value_field, keys_sorted=False)
elif dtype.is_geospatial():
return pa.binary()
from geoarrow import types as gat

# Resolve CRS
if dtype.srid is None:
crs = None
elif dtype.srid == 4326:
crs = gat.OGC_CRS84
else:
import pyproj

# Assume that these are EPSG codes. An srid is more accurately a key
# into a backend/connection-specific lookup table; however, most usage
# should work with this assumption.
crs = pyproj.CRS(f"EPSG:{dtype.srid}")

# Resolve edge type
if dtype.geotype == "geography":
edge_type = gat.EdgeType.SPHERICAL
else:
edge_type = gat.EdgeType.PLANAR

return gat.wkb(crs=crs, edge_type=edge_type).to_pyarrow()
else:
try:
return _to_pyarrow_types()[type(dtype)]
Expand Down
56 changes: 54 additions & 2 deletions ibis/formats/tests/test_pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,5 +188,57 @@ def test_unknown_dtype_gets_converted_to_string():
pytest.param(dt.multipolygon, id="multipolygon"),
],
)
def test_geo_gets_converted_to_binary(ibis_type):
assert PyArrowType.from_ibis(ibis_type) == pa.binary()
def test_geo_gets_converted_to_geoarrow(ibis_type):
    """Check that a geospatial ibis dtype converts to a geoarrow extension type.

    Skipped when ``geoarrow.types.type_pyarrow`` cannot be imported.
    """
    geoarrow_pyarrow = pytest.importorskip("geoarrow.types.type_pyarrow")

    converted = PyArrowType.from_ibis(ibis_type)
    assert isinstance(converted, geoarrow_pyarrow.GeometryExtensionType)


def test_geoarrow_gets_converted_to_geo():
    """Round-trip CRS-less geoarrow WKB types through ibis geospatial dtypes.

    Skipped when ``geoarrow.types`` cannot be imported.
    """
    gat = pytest.importorskip("geoarrow.types")

    # No CRS, default edge type: expect a nullable geometry without an srid
    # that converts back to the very same pyarrow type.
    plain_wkb = gat.wkb().to_pyarrow()
    as_ibis = PyArrowType.to_ibis(plain_wkb)
    assert as_ibis.is_geospatial()
    assert as_ibis.geotype == "geometry"
    assert as_ibis.srid is None
    assert as_ibis.nullable is True
    assert as_ibis.to_pyarrow() == plain_wkb

    # Spherical edges are reported as geography and also round-trip.
    spherical_wkb = gat.wkb(edge_type=gat.EdgeType.SPHERICAL).to_pyarrow()
    as_ibis = PyArrowType.to_ibis(spherical_wkb)
    assert as_ibis.geotype == "geography"
    assert as_ibis.to_pyarrow() == spherical_wkb

    # The nullable argument is carried through to the resulting dtype.
    non_nullable = PyArrowType.to_ibis(gat.wkb().to_pyarrow(), nullable=False)
    assert non_nullable.nullable is False


def test_geoarrow_crs_gets_converted_to_geo():
    """Check srid resolution and round-tripping for geoarrow types with a CRS.

    Skipped when either ``geoarrow.types`` or ``pyproj`` cannot be imported.
    """
    gat = pytest.importorskip("geoarrow.types")
    pyproj = pytest.importorskip("pyproj")

    def check_roundtrip(crs, expected_srid):
        # Build the WKB extension type for this CRS, convert it to ibis, and
        # verify both the resolved srid and the conversion back to pyarrow.
        arrow_type = gat.wkb(crs=crs).to_pyarrow()
        dtype = PyArrowType.to_ibis(arrow_type)
        assert dtype.srid == expected_srid
        assert dtype.to_pyarrow() == arrow_type

    # The GeoArrow/GeoParquet standard representation of longitude/latitude.
    check_roundtrip(gat.OGC_CRS84, 4326)

    # A standard lon/lat representation whose PROJJSON lacks the explicit
    # authority/code section (i.e., pyproj has to guess the srid for us).
    projjson_without_id = gat.OGC_CRS84.to_json_dict()
    del projjson_without_id["id"]
    check_roundtrip(projjson_without_id, 4326)

    # A non-lon/lat CRS (e.g., UTM Zone 20N).
    check_roundtrip(pyproj.CRS("EPSG:32620"), 32620)
18 changes: 16 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ db-dtypes = { version = ">=0.3,<2", optional = true }
deltalake = { version = ">=0.9.0,<1", optional = true }
duckdb = { version = ">=0.8.1,<2", optional = true }
geopandas = { version = ">=0.6,<2", optional = true }
geoarrow-types = { version = ">=0.2,<1", optional = true }
pyproj = { version = ">=3.3.0,<4", optional = true }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to be sure, we are making pyproj an explicit dependency. I know it comes when installing geopandas, which is already a dependency. I guess explicit is better than implicit in this case.

Copy link
Member

@cpcloud cpcloud Jul 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's generally a bad idea to assume that a package will forever and always include a dependency that you require. If there's a top-level import for a package, it needs to be included as an explicit dependency.

google-cloud-bigquery = { version = ">=3,<4", optional = true }
google-cloud-bigquery-storage = { version = ">=2,<3", optional = true }
graphviz = { version = ">=0.16,<1", optional = true }
Expand Down Expand Up @@ -173,7 +175,7 @@ visualization = ["graphviz"]
decompiler = ["black"]
deltalake = ["deltalake"]
examples = ["pins", "fsspec"]
geospatial = ["geopandas", "shapely"]
geospatial = ["geoarrow-types", "geopandas", "pyproj", "shapely"]

[tool.poetry.plugins."ibis.backends"]
bigquery = "ibis.backends.bigquery"
Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ fqdn==1.5.1 ; python_version >= "3.10" and python_version < "3.13"
frozenlist==1.4.1 ; python_version >= "3.10" and python_version < "4.0"
fsspec==2024.6.1 ; python_version >= "3.10" and python_version < "4.0"
gcsfs==2024.6.1 ; python_version >= "3.10" and python_version < "4.0"
geoarrow-types==0.2.0 ; python_version >= "3.10" and python_version < "4.0"
geopandas==1.0.1 ; python_version >= "3.10" and python_version < "4.0"
google-api-core==2.19.1 ; python_version >= "3.10" and python_version < "4.0"
google-api-core[grpc]==2.19.1 ; python_version >= "3.10" and python_version < "4.0"
Expand Down