From 0af891cfa69affefa81d55eb32e4d25eeb10a68d Mon Sep 17 00:00:00 2001 From: JoJo10Smith Date: Fri, 27 Oct 2023 20:31:42 -0700 Subject: [PATCH 01/10] Adding initial commit for Geopandas FIle Reader and File Writer Class, Tests for that class, and additional requirements --- hamilton/plugins/geopandas_extensions.py | 139 ++++++++++++++++++++- requirements-test.txt | 4 + tests/plugins/test_geopandas_extensions.py | 33 +++++ 3 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 tests/plugins/test_geopandas_extensions.py diff --git a/hamilton/plugins/geopandas_extensions.py b/hamilton/plugins/geopandas_extensions.py index de6ee631b..5d4fdffde 100644 --- a/hamilton/plugins/geopandas_extensions.py +++ b/hamilton/plugins/geopandas_extensions.py @@ -1,15 +1,51 @@ -from typing import Any +import dataclasses +import os +from io import BufferedReader, BytesIO +from pathlib import Path +from typing import IO, Any, Collection, Dict, Optional, Tuple, Type, Union + +from pyproj import CRS +from shapely import ( + GeometryCollection, + LinearRing, + LineString, + MultiLineString, + MultiPoint, + MultiPolygon, + Point, + Polygon, +) try: import geopandas as gpd except ImportError: raise NotImplementedError("geopandas is not installed.") +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal + from hamilton import registry +from hamilton.io import utils +from hamilton.io.data_adapters import DataLoader, DataSaver DATAFRAME_TYPE = gpd.GeoDataFrame COLUMN_TYPE = gpd.GeoSeries +Geometry = Optional[ + Union[ + Point, + LineString, + LinearRing, + Polygon, + MultiPoint, + MultiLineString, + MultiPolygon, + GeometryCollection, + ] +] + @registry.get_column.register(gpd.GeoDataFrame) def get_column_geopandas(df: gpd.GeoDataFrame, column_name: str) -> gpd.GeoSeries: @@ -30,3 +66,104 @@ def register_types(): register_types() + + +@dataclasses.dataclass +class GeopandasFileWriter(DataSaver): + """ + Class that handles saving a GeoDataFrame to a file. + Maps to https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_file.html + """ + + filename: Union[str, os.PathLike, IO] + # kwargs + driver: Optional[str] = None + schema: Optional[Dict] = None + index: Optional[bool] = None + mode: str = "w" + crs: Optional[CRS] = None + engine: Literal["fiona", "pyogrio"] = None + + # TODO Allow additional arguments via kwargs + + @classmethod + def applicable_types(cls) -> Collection[Type]: + return [DATAFRAME_TYPE] + + def _get_saving_kwargs(self) -> Dict[str, Any]: + saving_kwargs = {} + if self.driver is not None: + saving_kwargs["driver"] = self.driver + if self.schema is not None: + saving_kwargs["schema"] = self.schema + if self.index is not None: + saving_kwargs["index"] = self.index + if self.mode is not None: + saving_kwargs["mode"] = self.mode + if self.crs is not None: + saving_kwargs["crs"] = self.crs + if self.engine is not None: + saving_kwargs["engine"] = self.engine + + return saving_kwargs + + def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]: + data.to_file(self.filename, **self._get_saving_kwargs()) + return utils.get_file_metadata(self.filename) + + @classmethod + def name(cls) -> str: + return ["shp", "shx", "dbf"] + + +@dataclasses.dataclass +class GeopandasFileReader(DataLoader): + """ + Class that handles reading files or URLs with Geopandas + Maps to https://geopandas.org/en/stable/docs/reference/api/geopandas.read_file.html + """ + + filename: Union[str, Path, BytesIO, BufferedReader] + # kwargs + bbox: Optional[Union[Tuple, DATAFRAME_TYPE, COLUMN_TYPE, Geometry]] = None + mask: Optional[Union[Dict, DATAFRAME_TYPE, COLUMN_TYPE, Geometry]] = None + rows: Optional[Union[int, slice]] = None + engine: Literal["fiona", "pyogrio"] = None + + # TODO: allow additional arguments via kwargs + @classmethod + def applicable_types(cls) -> Collection[Type]: + return [DATAFRAME_TYPE] + + def _get_loading_kwargs(self) -> Dict[str, Any]: + loading_kwargs = {} + + if self.bbox is not None: + loading_kwargs["bbox"] = self.bbox + if self.mask is not None: + loading_kwargs["mask"] = self.mask + if self.rows is not None: + loading_kwargs["rows"] = self.rows + if self.engine is not None: + loading_kwargs["engine"] = self.engine + + return loading_kwargs + + def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]: + gdf = gpd.read_file(self.filename, **self._get_loading_kwargs()) + metedata = utils.get_file_metadata(self.filename) + + return gdf, metedata + + @classmethod + def name(cls) -> str: + return ["shp", "shx", "dbf"] + + +def register_data_loaders(): + """Function to register the data loaders for this extension.""" + for loader in [ + GeopandasFileReader, + GeopandasFileWriter, + ]: + registry.register_adapter(loader) diff --git a/requirements-test.txt b/requirements-test.txt index dc8a2d368..74f7fdcfb 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,3 +1,5 @@ +geodatasets +geopandas graphviz lightgbm lxml @@ -6,9 +8,11 @@ matplotlib networkx polars pyarrow +pyproj pytest pytest-cov scikit-learn +shapely sqlalchemy==1.4.49; python_version == '3.7.*' sqlalchemy; python_version >= '3.8' xgboost diff --git a/tests/plugins/test_geopandas_extensions.py b/tests/plugins/test_geopandas_extensions.py new file mode 100644 index 000000000..338489d9c --- /dev/null +++ b/tests/plugins/test_geopandas_extensions.py @@ -0,0 +1,33 @@ +import pathlib +import geopandas as gpd +import geodatasets +import sys +from geopandas.testing import assert_geodataframe_equal + +from hamilton.plugins.pandas_extensions import ( + GeopandasFileReader, + GeopandasFileWriter, +) + +def test_geopandas_file(tmp_path: pathlib.Path) -> None: + # load in the example data + new_york_example_data = geopandas.read_file(geodatasets.get_path("nybb")) + + #write the data to a shapefile + file_path = tmp_path / "ShapeFileTest" + writer = GeopandasFileWriter(filename=file_path) + metadata = writer.save_data(data = new_york_example_data) + + #read in the multiple files that we output + dbf_file_path = tmp_path / "ShapeFileTest/ShapeFileTest.dbf" + shp_file_path = tmp_path / "ShapeFileTest/ShapeFileTest.shp" + shx_file_path = tmp_path / "ShapeFileTest/ShapeFileTest.shx" + + dbf_data, dbf_metadata = GeopandasFileReader(dbf_file_path) + shp_data, shp_metadata = GeopandasFileReader(shp_file_path) + shx_data, shx_metadata = GeopandasFileReader(shx_file_path) + + #check that each is the same as the original + assert_geodataframe_equal(new_york_example_data, dbf_data) + assert_geodataframe_equal(new_york_example_data, shp_data) + assert_geodataframe_equal(new_york_example_data, shx_data) \ No newline at end of file From 3e6ef5428a87a68f28e4cea9710db97c8d870c4a Mon Sep 17 00:00:00 2001 From: JoJo10Smith Date: Fri, 27 Oct 2023 20:58:30 -0700 Subject: [PATCH 02/10] Correcting naming error in testing file --- tests/plugins/test_geopandas_extensions.py | 26 ++++++++++------------ 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/tests/plugins/test_geopandas_extensions.py b/tests/plugins/test_geopandas_extensions.py index 338489d9c..57e8c2fbb 100644 --- a/tests/plugins/test_geopandas_extensions.py +++ b/tests/plugins/test_geopandas_extensions.py @@ -1,33 +1,31 @@ import pathlib -import geopandas as gpd + import geodatasets -import sys +import geopandas as gpd from geopandas.testing import assert_geodataframe_equal -from hamilton.plugins.pandas_extensions import ( - GeopandasFileReader, - GeopandasFileWriter, -) +from hamilton.plugins.geopandas_extensions import GeopandasFileReader, GeopandasFileWriter + def test_geopandas_file(tmp_path: pathlib.Path) -> None: # load in the example data - new_york_example_data = geopandas.read_file(geodatasets.get_path("nybb")) + new_york_example_data = gpd.read_file(geodatasets.get_path("nybb")) - #write the data to a shapefile + # write the data to a shapefile file_path = tmp_path / "ShapeFileTest" writer = GeopandasFileWriter(filename=file_path) - metadata = writer.save_data(data = new_york_example_data) + writer.save_data(data=new_york_example_data) - #read in the multiple files that we output - dbf_file_path = tmp_path / "ShapeFileTest/ShapeFileTest.dbf" - shp_file_path = tmp_path / "ShapeFileTest/ShapeFileTest.shp" + # read in the multiple files that we output + dbf_file_path = tmp_path / "ShapeFileTest/ShapeFileTest.dbf" + shp_file_path = tmp_path / "ShapeFileTest/ShapeFileTest.shp" shx_file_path = tmp_path / "ShapeFileTest/ShapeFileTest.shx" dbf_data, dbf_metadata = GeopandasFileReader(dbf_file_path) shp_data, shp_metadata = GeopandasFileReader(shp_file_path) shx_data, shx_metadata = GeopandasFileReader(shx_file_path) - #check that each is the same as the original + # check that each is the same as the original assert_geodataframe_equal(new_york_example_data, dbf_data) assert_geodataframe_equal(new_york_example_data, shp_data) - assert_geodataframe_equal(new_york_example_data, shx_data) \ No newline at end of file + assert_geodataframe_equal(new_york_example_data, shx_data) From 706635c4c83bf90804070a3ad2c7e65123e3ad3a Mon Sep 17 00:00:00 2001 From: JoJo10Smith Date: Fri, 27 Oct 2023 21:17:24 -0700 Subject: [PATCH 03/10] reformatting geopandas tests --- tests/plugins/test_geopandas_extensions.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/plugins/test_geopandas_extensions.py b/tests/plugins/test_geopandas_extensions.py index 57e8c2fbb..c3cdead85 100644 --- a/tests/plugins/test_geopandas_extensions.py +++ b/tests/plugins/test_geopandas_extensions.py @@ -21,9 +21,12 @@ def test_geopandas_file(tmp_path: pathlib.Path) -> None: shp_file_path = tmp_path / "ShapeFileTest/ShapeFileTest.shp" shx_file_path = tmp_path / "ShapeFileTest/ShapeFileTest.shx" - dbf_data, dbf_metadata = GeopandasFileReader(dbf_file_path) - shp_data, shp_metadata = GeopandasFileReader(shp_file_path) - shx_data, shx_metadata = GeopandasFileReader(shx_file_path) + dbf_reader = GeopandasFileReader(dbf_file_path) + dbf_data, dbf_metadata = dbf_reader.load_data(gpd.GeoDataFrame) + shp_reader = GeopandasFileReader(shp_file_path) + shp_data, shp_metadata = shp_reader.load_data(gpd.GeoDataFrame) + shx_reader = GeopandasFileReader(shx_file_path) + shx_data, shx_metadata = shx_reader.load_data(gpd.GeoDataFrame) # check that each is the same as the original assert_geodataframe_equal(new_york_example_data, dbf_data) From a6725dbfc9963c797b73d8856bbb816c9fcda6a5 Mon Sep 17 00:00:00 2001 From: JoJo10Smith Date: Sat, 28 Oct 2023 16:22:53 -0700 Subject: [PATCH 04/10] Adding Feather and Parquet Reader and Writer Class as well as tests for both --- hamilton/plugins/geopandas_extensions.py | 151 ++++++++++++++++++++- tests/plugins/test_geopandas_extensions.py | 47 ++++++- 2 files changed, 195 insertions(+), 3 deletions(-) diff --git a/hamilton/plugins/geopandas_extensions.py b/hamilton/plugins/geopandas_extensions.py index 5d4fdffde..d5e7d5c42 100644 --- a/hamilton/plugins/geopandas_extensions.py +++ b/hamilton/plugins/geopandas_extensions.py @@ -2,7 +2,7 @@ import os from io import BufferedReader, BytesIO from pathlib import Path -from typing import IO, Any, Collection, Dict, Optional, Tuple, Type, Union +from typing import IO, Any, Collection, Dict, List, Optional, Tuple, Type, Union from pyproj import CRS from shapely import ( @@ -149,7 +149,7 @@ def _get_loading_kwargs(self) -> Dict[str, Any]: return loading_kwargs - def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]: + def load_data(self, type: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]: gdf = gpd.read_file(self.filename, **self._get_loading_kwargs()) metedata = utils.get_file_metadata(self.filename) @@ -160,10 +160,157 @@ def name(cls) -> str: return ["shp", "shx", "dbf"] +@dataclasses.dataclass +class GeopandasParquetWriter(DataSaver): + """ + Class that handles writing a GeoDataFrame to Parquet File. + Maps to: https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_parquet.html + """ + + path: Union[str, Path] + # kwargs + index: Optional[bool] = None + compression: Literal["snappy", "gzip", "brotli", None] = "snappy" + schema_version = Optional[Literal["0.1.0", "0.4.0", None]] = None + # TO DO: allow additional arguments via the kwargs keyword + + @classmethod + def applicable_types(cls) -> Collection[Type]: + return DATAFRAME_TYPE + + def _get_saving_kwargs(self) -> Dict[str, Any]: + saving_kwargs = {} + if self.index is not None: + saving_kwargs["index"] = self.index + if self.compression is not None: + saving_kwargs["compression"] = self.compression + if self.schema_version is not None: + saving_kwargs["schema_version"] = self.schema_version + + def load_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]: + data.to_parquet(self.path, **self._get_saving_kwargs()) + return utils.get_file_metadata(self.path) + + @classmethod + def name(cls) -> str: + return "parquet" + + +@dataclasses.dataclass +class GeopandasParquetReader(DataLoader): + """ + Class that handles reading Parquet Files and outputs a GeoDataFrame. + Maps to: https://geopandas.org/en/stable/docs/reference/api/geopandas.read_parquet.html + """ + + path: Union[str, Path] + # kwargs + columns: Optional[List] = None + storage_options: Optional[Dict] = None + + # TO DO: allow additional arguments via kwargs + + @classmethod + def applicable_types(cls) -> Collection[Type]: + return DATAFRAME_TYPE + + def _get_loading_kwargs(self) -> Dict[str, Any]: + loading_kwargs = {} + if self.columns is not None: + loading_kwargs["columns"] = self.columns + if self.storage_options is not None: + loading_kwargs["storage_options"] = self.storage_options + + return loading_kwargs + + def load_data(self, type: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]: + gdf = gpd.read_parquet(self.path, **self._get_loading_kwargs()) + metadata = utils.get_file_metadata(self.path) + + return gdf, metadata + + @classmethod + def name(cls) -> str: + return "parquet" + + +@dataclasses.dataclass +class GeopandasFeatherWriter(DataSaver): + """ + Class that handles writing a GeoDataFrame to a Feather File. + Maps to: https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_feather.html + """ + + path: Union[str, Path] + # kwargs + index: Optional[bool] = None + compression: Optional[Literal["zstd", "lz4", "uncompressed"]] = None + schema_version = Optional[Literal["0.1.0", "0.4.0", None]] = None + # TO DO: allow additional arguments via the kwargs keyword + + @classmethod + def applicable_types(cls) -> Collection[Type]: + return DATAFRAME_TYPE + + def _get_saving_kwargs(self) -> Dict[str, Any]: + saving_kwargs = {} + if self.index is not None: + saving_kwargs["index"] = self.index + if self.compression is not None: + saving_kwargs["compression"] = self.compression + if self.schema_version is not None: + saving_kwargs["schema_version"] = self.schema_version + + def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]: + data.to_feather(self.path, **self._get_saving_kwargs()) + return utils.get_file_metadata(self.path) + + @classmethod + def name(cls) -> str: + return "feather" + + +@dataclasses.dataclass +class GeopandasFeatherReader(DataLoader): + """ + Class that handles reading Feather Files and outputs a GeoDataFrame. + Maps to: https://geopandas.org/en/stable/docs/reference/api/geopandas.read_feather.html + """ + + path: Union[str, Path] + # kwargs + columns: Optional[List] = None + + @classmethod + def applicable_types(cls) -> Collection[Type]: + return [DATAFRAME_TYPE] + + def _get_loading_kwargs(self) -> Dict[str, Any]: + loading_kwargs = {} + if self.columns is not None: + loading_kwargs["columns"] = self.columns + + return loading_kwargs + + def load_data(self, type: Type) -> Tuple[Any, Dict[str, Any]]: + gdf = gpd.read_feather(self.path, **self._get_loading_kwargs()) + metadata = utils.get_file_metadata(self.path) + + return gdf, metadata + + @classmethod + def name(cls) -> str: + return "feather" + + def register_data_loaders(): """Function to register the data loaders for this extension.""" for loader in [ GeopandasFileReader, GeopandasFileWriter, + GeopandasParquetReader, + GeopandasParquetWriter, + GeopandasFeatherReader, + GeopandasFeatherWriter, ]: registry.register_adapter(loader) diff --git a/tests/plugins/test_geopandas_extensions.py b/tests/plugins/test_geopandas_extensions.py index c3cdead85..6e63e103e 100644 --- a/tests/plugins/test_geopandas_extensions.py +++ b/tests/plugins/test_geopandas_extensions.py @@ -4,7 +4,14 @@ import geopandas as gpd from geopandas.testing import assert_geodataframe_equal -from hamilton.plugins.geopandas_extensions import GeopandasFileReader, GeopandasFileWriter +from hamilton.plugins.geopandas_extensions import ( + GeopandasFeatherReader, + GeopandasFeatherWriter, + GeopandasFileReader, + GeopandasFileWriter, + GeopandasParquetReader, + GeopandasParquetWriter, +) def test_geopandas_file(tmp_path: pathlib.Path) -> None: @@ -32,3 +39,41 @@ def test_geopandas_file(tmp_path: pathlib.Path) -> None: assert_geodataframe_equal(new_york_example_data, dbf_data) assert_geodataframe_equal(new_york_example_data, shp_data) assert_geodataframe_equal(new_york_example_data, shx_data) + + +def test_geopandas_parquet(tmp_path: pathlib.Path) -> None: + # load in the example data + new_york_example_data = gpd.read_file(geodatasets.get_path("nybb")) + + # write the data to a shapefile + file_path = tmp_path / "ParquetFileTest.parquet" + writer = GeopandasParquetWriter(path=file_path) + writer.save_data(data=new_york_example_data) + + # read in the multiple files that we output + parquet_file_path = file_path + + parquet_reader = GeopandasParquetReader(parquet_file_path) + parquet_data, parquet_metadata = parquet_reader.load_data(gpd.GeoDataFrame) + + # check that each is the same as the original + assert_geodataframe_equal(new_york_example_data, parquet_data) + + +def test_geopandas_feather(tmp_path: pathlib.Path) -> None: + # load in the example data + new_york_example_data = gpd.read_file(geodatasets.get_path("nybb")) + + # write the data to a shapefile + file_path = tmp_path / "FeatherFileTest.feather" + writer = GeopandasFeatherWriter(path=file_path) + writer.save_data(data=new_york_example_data) + + # read in the multiple files that we output + feather_file_path = file_path + + feather_reader = GeopandasFeatherReader(feather_file_path) + feather_data, feather_metadata = feather_reader.load_data(gpd.GeoDataFrame) + + # check that each is the same as the original + assert_geodataframe_equal(new_york_example_data, feather_data) From 933b3a4cc5b3e1b30753da709b7e3556f0302914 Mon Sep 17 00:00:00 2001 From: JoJo10Smith Date: Sat, 28 Oct 2023 16:34:47 -0700 Subject: [PATCH 05/10] Correcting the assignment error in the geopandas_extnesions.py file --- hamilton/plugins/geopandas_extensions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hamilton/plugins/geopandas_extensions.py b/hamilton/plugins/geopandas_extensions.py index d5e7d5c42..f3ec0d9f4 100644 --- a/hamilton/plugins/geopandas_extensions.py +++ b/hamilton/plugins/geopandas_extensions.py @@ -170,8 +170,8 @@ class GeopandasParquetWriter(DataSaver): path: Union[str, Path] # kwargs index: Optional[bool] = None - compression: Literal["snappy", "gzip", "brotli", None] = "snappy" - schema_version = Optional[Literal["0.1.0", "0.4.0", None]] = None + compression: Union[Literal["snappy", "gzip", "brotli"], None] = "snappy" + schema_version: Optional[Union[Literal["0.1.0", "0.4.0"], None]] = None # TO DO: allow additional arguments via the kwargs keyword @classmethod @@ -244,8 +244,8 @@ class GeopandasFeatherWriter(DataSaver): path: Union[str, Path] # kwargs index: Optional[bool] = None - compression: Optional[Literal["zstd", "lz4", "uncompressed"]] = None - schema_version = Optional[Literal["0.1.0", "0.4.0", None]] = None + compression: Optional[Union[Literal["zstd", "lz4", "uncompressed"], None]] = None + schema_version: Optional[Union[Literal["0.1.0", "0.4.0"], None]] = None # TO DO: allow additional arguments via the kwargs keyword @classmethod From bfff1725e4a8286e580c48384b69f6dd2ce14610 Mon Sep 17 00:00:00 2001 From: JoJo10Smith Date: Sat, 28 Oct 2023 16:56:16 -0700 Subject: [PATCH 06/10] fixing errors --- hamilton/plugins/geopandas_extensions.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hamilton/plugins/geopandas_extensions.py b/hamilton/plugins/geopandas_extensions.py index f3ec0d9f4..dda5d7550 100644 --- a/hamilton/plugins/geopandas_extensions.py +++ b/hamilton/plugins/geopandas_extensions.py @@ -187,7 +187,9 @@ def _get_saving_kwargs(self) -> Dict[str, Any]: if self.schema_version is not None: saving_kwargs["schema_version"] = self.schema_version - def load_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]: + return saving_kwargs + + def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]: data.to_parquet(self.path, **self._get_saving_kwargs()) return utils.get_file_metadata(self.path) @@ -261,6 +263,8 @@ def _get_saving_kwargs(self) -> Dict[str, Any]: if self.schema_version is not None: saving_kwargs["schema_version"] = self.schema_version + return saving_kwargs + def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]: data.to_feather(self.path, **self._get_saving_kwargs()) return utils.get_file_metadata(self.path) From fd30a7a9c1ba1266f83c262e090ffd757dc531e6 Mon Sep 17 00:00:00 2001 From: JoJo10Smith Date: Mon, 30 Oct 2023 17:37:32 -0700 Subject: [PATCH 07/10] Adding PostGIS read and write classes, added extra_kwargs logic to all other classes --- hamilton/plugins/geopandas_extensions.py | 143 +++++++++++++++++++-- requirements-test.txt | 3 +- tests/plugins/test_geopandas_extensions.py | 64 ++++++--- 3 files changed, 185 insertions(+), 25 deletions(-) diff --git a/hamilton/plugins/geopandas_extensions.py b/hamilton/plugins/geopandas_extensions.py index dda5d7550..ba849c552 100644 --- a/hamilton/plugins/geopandas_extensions.py +++ b/hamilton/plugins/geopandas_extensions.py @@ -4,6 +4,13 @@ from pathlib import Path from typing import IO, Any, Collection, Dict, List, Optional, Tuple, Type, Union +from sqlalchemy.engine import Connection, Engine + +try: + from collections.abc import Sequence +except ImportError: + from collections import Sequence + from pyproj import CRS from shapely import ( GeometryCollection, @@ -83,8 +90,8 @@ class GeopandasFileWriter(DataSaver): mode: str = "w" crs: Optional[CRS] = None engine: Literal["fiona", "pyogrio"] = None - - # TODO Allow additional arguments via kwargs + # additional kwargs not listed + extra_kwargs: Optional[Dict[str, Any]] = None @classmethod def applicable_types(cls) -> Collection[Type]: @@ -104,6 +111,8 @@ def _get_saving_kwargs(self) -> Dict[str, Any]: saving_kwargs["crs"] = self.crs if self.engine is not None: saving_kwargs["engine"] = self.engine + if self.extra_kwargs is not None: + saving_kwargs.update(self.extra_kwargs) return saving_kwargs @@ -129,6 +138,8 @@ class GeopandasFileReader(DataLoader): mask: Optional[Union[Dict, DATAFRAME_TYPE, COLUMN_TYPE, Geometry]] = None rows: Optional[Union[int, slice]] = None engine: Literal["fiona", "pyogrio"] = None + # additional kwargs not listed + extra_kwargs: Optional[Dict[str, Any]] = None # TODO: allow additional arguments via kwargs @classmethod @@ -146,10 +157,12 @@ def _get_loading_kwargs(self) -> Dict[str, Any]: loading_kwargs["rows"] = self.rows if self.engine is not None: loading_kwargs["engine"] = self.engine + if self.extra_kwargs is not None: + loading_kwargs.update(self.extra_kwargs) return loading_kwargs - def load_data(self, type: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]: + def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]: gdf = gpd.read_file(self.filename, **self._get_loading_kwargs()) metedata = utils.get_file_metadata(self.filename) @@ -172,7 +185,8 @@ class GeopandasParquetWriter(DataSaver): index: Optional[bool] = None compression: Union[Literal["snappy", "gzip", "brotli"], None] = "snappy" schema_version: Optional[Union[Literal["0.1.0", "0.4.0"], None]] = None - # TO DO: allow additional arguments via the kwargs keyword + # additional kwargs not listed + extra_kwargs: Optional[Dict[str, Any]] = None @classmethod def applicable_types(cls) -> Collection[Type]: @@ -186,6 +200,8 @@ def _get_saving_kwargs(self) -> Dict[str, Any]: saving_kwargs["compression"] = self.compression if self.schema_version is not None: saving_kwargs["schema_version"] = self.schema_version + if self.extra_kwargs is not None: + saving_kwargs.update(self.extra_kwargs) return saving_kwargs @@ -209,8 +225,8 @@ class GeopandasParquetReader(DataLoader): # kwargs columns: Optional[List] = None storage_options: Optional[Dict] = None - - # TO DO: allow additional arguments via kwargs + # additional kwargs not listed + extra_kwargs: Optional[Dict[str, Any]] = None @classmethod def applicable_types(cls) -> Collection[Type]: @@ -222,10 +238,12 @@ def _get_loading_kwargs(self) -> Dict[str, Any]: loading_kwargs["columns"] = self.columns if self.storage_options is not None: loading_kwargs["storage_options"] = self.storage_options + if self.extra_kwargs is not None: + loading_kwargs.update(self.extra_kwargs) return loading_kwargs - def load_data(self, type: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]: + def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]: gdf = gpd.read_parquet(self.path, **self._get_loading_kwargs()) metadata = utils.get_file_metadata(self.path) @@ -248,7 +266,8 @@ class GeopandasFeatherWriter(DataSaver): index: Optional[bool] = None compression: Optional[Union[Literal["zstd", "lz4", "uncompressed"], None]] = None schema_version: Optional[Union[Literal["0.1.0", "0.4.0"], None]] = None - # TO DO: allow additional arguments via the kwargs keyword + # additional kwargs not listed + extra_kwargs: Optional[Dict[str, Any]] = None @classmethod def applicable_types(cls) -> Collection[Type]: @@ -262,6 +281,8 @@ def _get_saving_kwargs(self) -> Dict[str, Any]: saving_kwargs["compression"] = self.compression if self.schema_version is not None: saving_kwargs["schema_version"] = self.schema_version + if self.extra_kwargs is not None: + saving_kwargs.update(self.extra_kwargs) return saving_kwargs @@ -284,6 +305,8 @@ class GeopandasFeatherReader(DataLoader): path: Union[str, Path] # kwargs columns: Optional[List] = None + # additional kwargs not listed + extra_kwargs: Optional[Dict[str, Any]] = None @classmethod def applicable_types(cls) -> Collection[Type]: @@ -293,10 +316,12 @@ def _get_loading_kwargs(self) -> Dict[str, Any]: loading_kwargs = {} if self.columns is not None: loading_kwargs["columns"] = self.columns + if self.extra_kwargs is not None: + loading_kwargs.update(self.extra_kwargs) return loading_kwargs - def load_data(self, type: Type) -> Tuple[Any, Dict[str, Any]]: + def load_data(self, type_: Type) -> Tuple[Any, Dict[str, Any]]: gdf = gpd.read_feather(self.path, **self._get_loading_kwargs()) metadata = utils.get_file_metadata(self.path) @@ -307,6 +332,104 @@ def name(cls) -> str: return "feather" +@dataclasses.dataclass +class GeopandasPostGISWriter(DataSaver): + """ + Class tha handles uploading a GeoDataFrame to a PostGIS database + Maps to: https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_postgis.html + """ + + name: str + con: Union[Connection, Engine] + # kwargs + if_exists: Literal["fail", "replace", "append"] = "fail" + schema: Optional[str] = None + index: bool = False + index_label: Optional[Union[str, Sequence]] = None + chunksize: Optional[int] = None + dtype: Optional[Dict] = None + + @classmethod + def applicable_types(cls) -> Collection[Type]: + return [DATAFRAME_TYPE] + + def _get_saving_kwargs(self): + saving_kwargs = {} + if self.if_exists is not None: + saving_kwargs["if_exists"] = self.if_exists + if self.schema is not None: + saving_kwargs["schema"] = self.schema + if self.index is not None: + saving_kwargs["index"] = self.index + if self.index_label is not None: + saving_kwargs["index_label"] = self.index_label + if self.chunksize is not None: + saving_kwargs["chunksize"] = self.chunksize + if self.dtype is not None: + saving_kwargs["dtype"] = self.dtype + + return saving_kwargs + + def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]: + results = data.to_postgis(self.name, self.con, **self._get_saving_kwargs()) + return utils.get_sql_metadata(self.name, results) + + @classmethod + def name(cls) -> str: + return "PostGIS" + + +@dataclasses.dataclass +class GeopandasPostGISReader(DataLoader): + """ + Class that handles reading from a PostGIS databse and outputs a GeoDataFrame + Maps to: https://geopandas.org/en/stable/docs/reference/api/geopandas.read_postgis.html + """ + + sql: str + con: Union[Engine, Connection] + # kwargs + geom_col: str = "geom" + crs: Optional[Union[Dict, str]] = None + chunksize: Optional[int] = None + index_col: Optional[Union[str, List[str]]] = None + coerce_float: bool = True + parse_dates: Optional[Union[List, Dict]] = None + params: Optional[Union[List, Tuple, Dict]] = None + + @classmethod + def applicable_types(cls) -> Collection[Type]: + return [DATAFRAME_TYPE] + + def _get_loading_kwargs(self): + loading_kwargs = {} + if self.geom_col is not None: + loading_kwargs["geom_col"] = self.geom_col + if self.crs is not None: + loading_kwargs["crs"] = self.crs + if self.chunksize is not None: + loading_kwargs["chunksize"] = self.chunksize + if self.index_col is not None: + loading_kwargs["index_col"] = self.index_col + if self.coerce_float is not None: + loading_kwargs["coerce_float"] = self.coerce_float + if self.parse_dates is not None: + loading_kwargs["parse_dates"] = self.parse_dates + if self.params is not None: + loading_kwargs["params"] = self.params + + return loading_kwargs + + def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]: + gdf = gpd.read_postgis(self.sql, self.con, **self._get_loading_kwargs()) + metadata = utils.get_sql_metadata(self.sql, gdf) + return gdf, metadata + + @classmethod + def name(cls) -> str: + return "PostGIS" + + def register_data_loaders(): """Function to register the data loaders for this extension.""" for loader in [ @@ -316,5 +439,7 @@ def register_data_loaders(): GeopandasParquetWriter, GeopandasFeatherReader, GeopandasFeatherWriter, + GeopandasPostGISReader, + GeopandasPostGISWriter, ]: registry.register_adapter(loader) diff --git a/requirements-test.txt b/requirements-test.txt index 74f7fdcfb..32fb92565 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,5 @@ -geodatasets +geoalchemy2 +geodatasets; python_version >= '3.8' geopandas graphviz lightgbm diff --git a/tests/plugins/test_geopandas_extensions.py b/tests/plugins/test_geopandas_extensions.py index 6e63e103e..d4bc2e2a8 100644 --- a/tests/plugins/test_geopandas_extensions.py +++ b/tests/plugins/test_geopandas_extensions.py @@ -1,8 +1,11 @@ import pathlib +import sqlite3 +from typing import Union -import geodatasets import geopandas as gpd +import pytest from geopandas.testing import assert_geodataframe_equal +from sqlalchemy import create_engine from hamilton.plugins.geopandas_extensions import ( GeopandasFeatherReader, @@ -11,17 +14,29 @@ GeopandasFileWriter, GeopandasParquetReader, GeopandasParquetWriter, + GeopandasPostGISReader, + GeopandasPostGISWriter, ) +try: + # load in the example data used for python >= 3.8 + import geodatasets + + example_data = gpd.read_file(geodatasets.get_path("nybb")) +except ImportError: + # create example for python 3.7 + from shapely.geometry import Point + + gdf = {"col1": ["name1", "name2"], "geometry": [Point(1, 2), Point(2, 1)]} + example_data = gpd.GeoDataFrame(gdf, crs="EPSG:4326") + def test_geopandas_file(tmp_path: pathlib.Path) -> None: - # load in the example data - new_york_example_data = gpd.read_file(geodatasets.get_path("nybb")) # write the data to a shapefile file_path = tmp_path / "ShapeFileTest" writer = GeopandasFileWriter(filename=file_path) - writer.save_data(data=new_york_example_data) + writer.save_data(data=example_data) # read in the multiple files that we output dbf_file_path = tmp_path / "ShapeFileTest/ShapeFileTest.dbf" @@ -36,19 +51,17 @@ def test_geopandas_file(tmp_path: pathlib.Path) -> None: shx_data, shx_metadata = shx_reader.load_data(gpd.GeoDataFrame) # check that each is the same as the original - assert_geodataframe_equal(new_york_example_data, dbf_data) - assert_geodataframe_equal(new_york_example_data, shp_data) - assert_geodataframe_equal(new_york_example_data, shx_data) + assert_geodataframe_equal(example_data, dbf_data) + assert_geodataframe_equal(example_data, shp_data) + assert_geodataframe_equal(example_data, shx_data) def test_geopandas_parquet(tmp_path: pathlib.Path) -> None: - # load in the example data - new_york_example_data = gpd.read_file(geodatasets.get_path("nybb")) # write the data to a shapefile file_path = tmp_path / "ParquetFileTest.parquet" writer = GeopandasParquetWriter(path=file_path) - writer.save_data(data=new_york_example_data) + writer.save_data(data=example_data) # read in the multiple files that we output parquet_file_path = file_path @@ -57,17 +70,15 @@ def test_geopandas_parquet(tmp_path: pathlib.Path) -> None: parquet_data, parquet_metadata = parquet_reader.load_data(gpd.GeoDataFrame) # check that each is the same as the original - assert_geodataframe_equal(new_york_example_data, parquet_data) + assert_geodataframe_equal(example_data, parquet_data) def test_geopandas_feather(tmp_path: pathlib.Path) -> None: - # load in the example data - new_york_example_data = gpd.read_file(geodatasets.get_path("nybb")) # write the data to a shapefile file_path = tmp_path / "FeatherFileTest.feather" writer = GeopandasFeatherWriter(path=file_path) - writer.save_data(data=new_york_example_data) + writer.save_data(data=example_data) # read in the multiple files that we output feather_file_path = file_path @@ -76,4 +87,27 @@ def test_geopandas_feather(tmp_path: pathlib.Path) -> None: feather_data, feather_metadata = feather_reader.load_data(gpd.GeoDataFrame) # check that each is the same as the original - assert_geodataframe_equal(new_york_example_data, feather_data) + assert_geodataframe_equal(example_data, feather_data) + + +@pytest.mark.parametrize( + "conn", + [ + sqlite3.connect(":memory:"), + create_engine("sqlite://"), + ], +) +def test_geopandas_postgis(conn: Union[str, sqlite3.Connection]) -> None: + # write the file to a test database + writer = GeopandasPostGISWriter(name="test_example", con=conn) + writer.save_data(example_data) + + # read the file + reader = GeopandasPostGISReader(sql="SELECT * FROM test_table", con=conn) + postgis_data, postgis_metadata = reader.load_data(gpd.GeoDataFrame) + + # check that each is the same as the original + assert_geodataframe_equal(example_data, postgis_data) + + if hasattr(conn, "close"): + conn.close() From 6fe775e1f602496349168162332b5336cac5da4b Mon Sep 17 00:00:00 2001 From: JoJo10Smith Date: Fri, 17 Nov 2023 15:51:42 -0800 Subject: [PATCH 08/10] Removing PostGIS reading and writing in both the geopandas_extensions.py and test_geopandas_extensions.py --- hamilton/plugins/geopandas_extensions.py | 107 --------------------- tests/plugins/test_geopandas_extensions.py | 29 ------ 2 files changed, 136 deletions(-) diff --git a/hamilton/plugins/geopandas_extensions.py b/hamilton/plugins/geopandas_extensions.py index ba849c552..bfa678ebd 100644 --- a/hamilton/plugins/geopandas_extensions.py +++ b/hamilton/plugins/geopandas_extensions.py @@ -4,13 +4,6 @@ from pathlib import Path from typing import IO, Any, Collection, Dict, List, Optional, Tuple, Type, Union -from sqlalchemy.engine import Connection, Engine - -try: - from collections.abc import Sequence -except ImportError: - from collections import Sequence - from pyproj import CRS from shapely import ( GeometryCollection, @@ -332,104 +325,6 @@ def name(cls) -> str: return "feather" -@dataclasses.dataclass -class GeopandasPostGISWriter(DataSaver): - """ - Class tha handles uploading a GeoDataFrame to a PostGIS database - Maps to: https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_postgis.html - """ - - name: str - con: Union[Connection, Engine] - # kwargs - if_exists: Literal["fail", "replace", "append"] = "fail" - schema: Optional[str] = None - index: bool = False - index_label: Optional[Union[str, Sequence]] = None - chunksize: Optional[int] = None - dtype: Optional[Dict] = None - - @classmethod - def applicable_types(cls) -> Collection[Type]: - return [DATAFRAME_TYPE] - - def _get_saving_kwargs(self): - saving_kwargs = {} - if self.if_exists is not None: - saving_kwargs["if_exists"] = self.if_exists - if self.schema is not None: - saving_kwargs["schema"] = self.schema - if self.index is not None: - saving_kwargs["index"] = self.index - if self.index_label is not None: - saving_kwargs["index_label"] = self.index_label - if self.chunksize is not None: - saving_kwargs["chunksize"] = self.chunksize - if self.dtype is not None: - saving_kwargs["dtype"] = self.dtype - - return saving_kwargs - - def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]: - results = data.to_postgis(self.name, self.con, **self._get_saving_kwargs()) - return utils.get_sql_metadata(self.name, results) - - @classmethod - def name(cls) -> str: - return "PostGIS" - - -@dataclasses.dataclass -class GeopandasPostGISReader(DataLoader): - """ - Class that handles reading from a PostGIS databse and outputs a GeoDataFrame - Maps to: https://geopandas.org/en/stable/docs/reference/api/geopandas.read_postgis.html - """ - - sql: str - con: Union[Engine, Connection] - # kwargs - geom_col: str = "geom" - crs: Optional[Union[Dict, str]] = None - chunksize: Optional[int] = None - index_col: Optional[Union[str, List[str]]] = None - coerce_float: bool = True - parse_dates: Optional[Union[List, Dict]] = None - params: Optional[Union[List, Tuple, Dict]] = None - - @classmethod - def applicable_types(cls) -> Collection[Type]: - return [DATAFRAME_TYPE] - - def _get_loading_kwargs(self): - loading_kwargs = {} - if self.geom_col is not None: - loading_kwargs["geom_col"] = self.geom_col - if self.crs is not None: - loading_kwargs["crs"] = self.crs - if self.chunksize is not None: - loading_kwargs["chunksize"] = self.chunksize - if self.index_col is not None: - loading_kwargs["index_col"] = self.index_col - if self.coerce_float is not None: - loading_kwargs["coerce_float"] = self.coerce_float - if self.parse_dates is not None: - loading_kwargs["parse_dates"] = self.parse_dates - if self.params is not None: - loading_kwargs["params"] = self.params - - return loading_kwargs - - def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]: - gdf = gpd.read_postgis(self.sql, self.con, **self._get_loading_kwargs()) - metadata = utils.get_sql_metadata(self.sql, gdf) - return gdf, metadata - - @classmethod - def name(cls) -> str: - return "PostGIS" - - def register_data_loaders(): """Function to register the data loaders for this extension.""" for loader in [ @@ -439,7 +334,5 @@ def register_data_loaders(): GeopandasParquetWriter, GeopandasFeatherReader, GeopandasFeatherWriter, - GeopandasPostGISReader, - GeopandasPostGISWriter, ]: registry.register_adapter(loader) diff --git a/tests/plugins/test_geopandas_extensions.py b/tests/plugins/test_geopandas_extensions.py index d4bc2e2a8..69445a723 100644 --- a/tests/plugins/test_geopandas_extensions.py +++ b/tests/plugins/test_geopandas_extensions.py @@ -1,11 +1,7 @@ import pathlib -import sqlite3 -from typing import Union import geopandas as gpd -import pytest from geopandas.testing import assert_geodataframe_equal -from sqlalchemy import create_engine from hamilton.plugins.geopandas_extensions import ( GeopandasFeatherReader, @@ -14,8 +10,6 @@ GeopandasFileWriter, GeopandasParquetReader, GeopandasParquetWriter, - GeopandasPostGISReader, - GeopandasPostGISWriter, ) try: @@ -88,26 +82,3 @@ def test_geopandas_feather(tmp_path: pathlib.Path) -> None: # check that each is the same as the original assert_geodataframe_equal(example_data, feather_data) - - -@pytest.mark.parametrize( - "conn", - [ - sqlite3.connect(":memory:"), - create_engine("sqlite://"), - ], -) -def test_geopandas_postgis(conn: Union[str, sqlite3.Connection]) -> None: - # write the file to a test database - writer = GeopandasPostGISWriter(name="test_example", con=conn) - writer.save_data(example_data) - - # read the file - reader = GeopandasPostGISReader(sql="SELECT * FROM test_table", con=conn) - postgis_data, postgis_metadata = reader.load_data(gpd.GeoDataFrame) - - # check that each is the same as the original - assert_geodataframe_equal(example_data, postgis_data) - - if hasattr(conn, "close"): - conn.close() From 34c7a9052b24edd23dd91fd17e98509acbf25815 Mon Sep 17 00:00:00 2001 From: JoJo10Smith Date: Fri, 17 Nov 2023 19:38:50 -0800 Subject: [PATCH 09/10] Adding examples to the commit and an h_geopandas.py file --- .../materialization/geometric_calculations.py | 30 + .../geopandas/materialization/notebook.ipynb | 514 ++++++++++++++++++ hamilton/plugins/h_geopandas.py | 53 ++ 3 files changed, 597 insertions(+) create mode 100644 examples/geopandas/materialization/geometric_calculations.py create mode 100644 examples/geopandas/materialization/notebook.ipynb create mode 100644 hamilton/plugins/h_geopandas.py diff --git a/examples/geopandas/materialization/geometric_calculations.py b/examples/geopandas/materialization/geometric_calculations.py new file mode 100644 index 000000000..7831a28f9 --- /dev/null +++ b/examples/geopandas/materialization/geometric_calculations.py @@ -0,0 +1,30 @@ +# Define your new Hamilton functions. +# The %%writefile magic command creates a new Python module with the functions below. +# We will import this later and pass it into our Driver. + +import geopandas +import geodatasets +from hamilton.function_modifiers import extract_columns + +@extract_columns("community", "POP2010", "geometry") +def base_df(base_df_location: str) -> geopandas.GeoDataFrame: + """Loads base dataframe of data. + + :param base_df_location: just showing that we could load this from a file... + :return: + """ + chicago = geopandas.read_file(geodatasets.get_path("geoda.chicago_commpop")) + return chicago + +# Look at `my_functions` to see how these functions connect. +def chicago_area(geometry: geopandas.GeoSeries) -> geopandas.GeoSeries: + """Get the area of the row using the geometry column""" + return 10000 * geometry.area + +def chicago_population(POP2010: geopandas.GeoSeries) -> geopandas.GeoSeries: + """ Get the population of the area of interest""" + return POP2010 + +def chicago_population_density(chicago_area: geopandas.GeoSeries, chicago_population: geopandas.GeoSeries) -> geopandas.GeoSeries: + """ Calculate the population density""" + return chicago_population/chicago_area diff --git a/examples/geopandas/materialization/notebook.ipynb b/examples/geopandas/materialization/notebook.ipynb new file mode 100644 index 000000000..bd4595319 --- /dev/null +++ b/examples/geopandas/materialization/notebook.ipynb @@ -0,0 +1,514 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import geodatasets\n", + "import geopandas\n", + "import sys \n", + "\n", + "# Add the hamilton module to your path - optinal\n", + "# project_dir = \"### ADD PATH HERE ###\"\n", + "# sys.path.append(project_dir)\n", + "\n", + "from hamilton import base, driver\n", + "#from hamilton.io.materialization import to\n", + "from hamilton.function_modifiers import extract_columns\n", + "from hamilton.plugins import h_geopandas" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "#load in the example dataset \n", + "#chicago = geopandas.read_file(geodatasets.get_path(\"geoda.chicago_commpop\"))\n", + "#groceries = geopandas.read_file(geodatasets.get_path(\"geoda.groceries\")).to_crs(chicago.crs)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# We use the autoreload extension that comes with ipython to automatically reload modules when\n", + "# the code in them changes.\n", + "\n", + "# import the jupyter extension\n", + "%load_ext autoreload\n", + "# set it to only reload the modules imported\n", + "%autoreload 1" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting geometric_calculations.py\n" + ] + } + ], + "source": [ + "%%writefile geometric_calculations.py\n", + "# Define your new Hamilton functions.\n", + "# The %%writefile magic command creates a new Python module with the functions below.\n", + "# We will import this later and pass it into our Driver.\n", + "\n", + "import geopandas\n", + "import geodatasets\n", + "from hamilton.function_modifiers import extract_columns\n", + "\n", + "@extract_columns(\"community\", \"POP2010\", \"geometry\")\n", + "def base_df(base_df_location: str) -> geopandas.GeoDataFrame:\n", + " \"\"\"Loads base dataframe of data.\n", + "\n", + " :param base_df_location: just showing that we could load this from a file...\n", + " :return:\n", + " \"\"\"\n", + " chicago = geopandas.read_file(geodatasets.get_path(\"geoda.chicago_commpop\"))\n", + " return chicago\n", + " \n", + "# Look at `my_functions` to see how these functions connect.\n", + "def chicago_area(geometry: geopandas.GeoSeries) -> geopandas.GeoSeries:\n", + " \"\"\"Get the area of the row using the geometry column\"\"\"\n", + " return 10000 * geometry.area \n", + "\n", + "def chicago_population(POP2010: geopandas.GeoSeries) -> geopandas.GeoSeries:\n", + " \"\"\" Get the population of the area of interest\"\"\"\n", + " return POP2010\n", + "\n", + "def chicago_population_density(chicago_area: geopandas.GeoSeries, chicago_population: geopandas.GeoSeries) -> geopandas.GeoSeries:\n", + " \"\"\" Calculate the population density\"\"\"\n", + " return chicago_population/chicago_area" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "%aimport geometric_calculations\n", + "#from hamilton import base \n", + "\n", + "config = {\n", + " \"base_df_location\": \"dummy_value\",\n", + "}\n", + "adapter = base.SimplePythonGraphAdapter(result_builder=h_geopandas.GeoPandasDataFrameResult())\n", + "dr = driver.Driver(config, geometric_calculations, adapter=adapter) " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "gdf_builder = base.PandasDataFrameResult()\n", + "dr = driver.Driver({}, geometric_calculations) # can pass in multiple modules" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# we need to specify what we want in the final dataframe. These can be string names, or function references.\n", + "output_columns = [\n", + " \"community\",\n", + " \"chicago_population\",\n", + " \"chicago_area\",\n", + " \"chicago_population_density\",\n", + " \"geometry\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " community chicago_population chicago_area \\\n", + "0 DOUGLAS 18238 4.632396 \n", + "1 OAKLAND 5918 1.702833 \n", + "2 FULLER PARK 2876 2.004699 \n", + "3 GRAND BOULEVARD 21929 4.881245 \n", + "4 KENWOOD 17841 2.926158 \n", + ".. ... ... ... \n", + "72 MOUNT GREENWOOD 19093 7.594687 \n", + "73 MORGAN PARK 22544 9.230990 \n", + "74 OHARE 12756 37.524990 \n", + "75 EDGEWATER 56521 4.890111 \n", + "76 EDISON PARK 11187 3.194218 \n", + "\n", + " chicago_population_density \\\n", + "0 3937.055140 \n", + "1 3475.385799 \n", + "2 1434.629223 \n", + "3 4492.501531 \n", + "4 6097.073294 \n", + ".. ... \n", + "72 2513.994352 \n", + "73 2442.208199 \n", + "74 339.933469 \n", + "75 11558.224160 \n", + "76 3502.265775 \n", + "\n", + " geometry \n", + "0 MULTIPOLYGON (((-87.60914 41.84469, -87.60915 ... \n", + "1 MULTIPOLYGON (((-87.59215 41.81693, -87.59231 ... \n", + "2 MULTIPOLYGON (((-87.62880 41.80189, -87.62879 ... \n", + "3 MULTIPOLYGON (((-87.60671 41.81681, -87.60670 ... \n", + "4 MULTIPOLYGON (((-87.59215 41.81693, -87.59215 ... \n", + ".. ... \n", + "72 MULTIPOLYGON (((-87.69646 41.70714, -87.69644 ... \n", + "73 MULTIPOLYGON (((-87.64215 41.68508, -87.64249 ... \n", + "74 MULTIPOLYGON (((-87.83658 41.98640, -87.83658 ... \n", + "75 MULTIPOLYGON (((-87.65456 41.99817, -87.65456 ... \n", + "76 MULTIPOLYGON (((-87.80676 42.00084, -87.80676 ... \n", + "\n", + "[77 rows x 5 columns]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jordan/Desktop/OpenSource/hamilton-js/examples/geopandas/materialization/geometric_calculations.py:22: UserWarning: Geometry is in a geographic CRS. Results from 'area' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.\n", + "\n", + " return 10000 * geometry.area\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "\n", + "base_df\n", + "\n", + "base_df\n", + "\n", + "\n", + "\n", + "community\n", + "\n", + "community\n", + "\n", + "\n", + "\n", + "base_df->community\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "geometry\n", + "\n", + "geometry\n", + "\n", + "\n", + "\n", + "base_df->geometry\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "POP2010\n", + "\n", + "POP2010\n", + "\n", + "\n", + "\n", + "base_df->POP2010\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "chicago_population\n", + "\n", + "chicago_population\n", + "\n", + "\n", + "\n", + "chicago_population_density\n", + "\n", + "chicago_population_density\n", + "\n", + "\n", + "\n", + "chicago_population->chicago_population_density\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "chicago_area\n", + "\n", + "chicago_area\n", + "\n", + "\n", + "\n", + "geometry->chicago_area\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "POP2010->chicago_population\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "chicago_area->chicago_population_density\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "base_df_location\n", + "\n", + "Input: base_df_location\n", + "\n", + "\n", + "\n", + "base_df_location->base_df\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Execute the driver.\n", + "\n", + "df = dr.execute(output_columns)\n", + "print(df)\n", + "\n", + "# To visualize do `pip install \"sf-hamilton[visualization]\"` if you want these to work\n", + "dr.visualize_execution(output_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "\n", + "community\n", + "\n", + "community\n", + "\n", + "\n", + "\n", + "geometry\n", + "\n", + "geometry\n", + "\n", + "\n", + "\n", + "base_df\n", + "\n", + "base_df\n", + "\n", + "\n", + "\n", + "base_df->community\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "base_df->geometry\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "POP2010\n", + "\n", + "POP2010\n", + "\n", + "\n", + "\n", + "base_df->POP2010\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "chicago_population\n", + "\n", + "chicago_population\n", + "\n", + "\n", + "\n", + "chicago_population_density\n", + "\n", + "chicago_population_density\n", + "\n", + "\n", + "\n", + "chicago_population->chicago_population_density\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "chicago_area\n", + "\n", + "chicago_area\n", + "\n", + "\n", + "\n", + "chicago_area->chicago_population_density\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "chicago_grocery_plot\n", + "\n", + "chicago_grocery_plot\n", + "\n", + "\n", + "\n", + "grocery_data\n", + "\n", + "Input: grocery_data\n", + "\n", + "\n", + "\n", + "grocery_data->chicago_grocery_plot\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "chicago_data\n", + "\n", + "Input: chicago_data\n", + "\n", + "\n", + "\n", + "chicago_data->chicago_population\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "chicago_data->chicago_area\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "chicago_data->chicago_grocery_plot\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "base_df_location\n", + "\n", + "Input: base_df_location\n", + "\n", + "\n", + "\n", + "base_df_location->base_df\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dr.display_all_functions()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/hamilton/plugins/h_geopandas.py b/hamilton/plugins/h_geopandas.py new file mode 100644 index 000000000..e8909d859 --- /dev/null +++ b/hamilton/plugins/h_geopandas.py @@ -0,0 +1,53 @@ +from typing import Any, Dict, Type, Union + +import geopandas as gpd + +from hamilton import base + + +class GeoPandasDataFrameResult(base.ResultMixin): + """A ResultBuilder that produces a geopandas dataframe. + + Use this when you want to create a geopandas dataframe from the outputs. Caveat: you need to ensure that the length + of the outputs is the same, otherwise you will get an error; mixed outputs aren't that well handled. + + To use: + + .. code-block:: python + + from hamilton import base, driver + from hamilton.plugins import geopandas_extensions + polars_builder = polars_extensions.GeopandasDataFrameResult() + adapter = base.SimplePythonGraphAdapter(geopandas_builder) + dr = driver.Driver(config, *modules, adapter=adapter) + df = dr.execute([...], inputs=...) # returns geopandas dataframe + + Note: this is just a first attempt at something for Geopandas. Think it should handle more? Come chat/open a PR! + """ + + def build_result( + self, **outputs: Dict[str, Union[gpd.GeoSeries, gpd.GeoDataFrame, Any]] + ) -> gpd.GeoDataFrame: + """This is the method that Hamilton will call to build the final result. It will pass in the results + of the requested outputs that you passed in to the execute() method. + + Note: this function could do smarter things; looking for contributions here! + + :param outputs: The results of the requested outputs. + :return: a geopandas DataFrame. + """ + if len(outputs) == 1: + (value,) = outputs.values() # this works because it's length 1. + if isinstance(value, gpd.GeoDataFrame): # it's a dataframe + return value + elif not isinstance(value, gpd.GeoSeries): # it's a single scalar/object + key, value = outputs.popitem() + return gpd.GeoDataFrame({key: [value]}) + else: # it's a series + return gpd.GeoDataFrame(outputs) + # TODO: check for length of outputs and determine what should + # happen for mixed outputs that include scalars for example. + return gpd.GeoDataFrame(outputs) + + def output_type(self) -> Type: + return gpd.GeoDataFrame From 9faee4e3f0de71d0168920513fb6184528e28c67 Mon Sep 17 00:00:00 2001 From: JoJo10Smith Date: Fri, 17 Nov 2023 19:53:01 -0800 Subject: [PATCH 10/10] Fixing pre commit issues with previous commit, no code or logic changes --- .../materialization/geometric_calculations.py | 21 ++++++++++++------- .../pandas/materialization/notebook.ipynb | 4 ++-- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/examples/geopandas/materialization/geometric_calculations.py b/examples/geopandas/materialization/geometric_calculations.py index 7831a28f9..94fd4c9f1 100644 --- a/examples/geopandas/materialization/geometric_calculations.py +++ b/examples/geopandas/materialization/geometric_calculations.py @@ -2,10 +2,12 @@ # The %%writefile magic command creates a new Python module with the functions below. # We will import this later and pass it into our Driver. -import geopandas import geodatasets +import geopandas + from hamilton.function_modifiers import extract_columns + @extract_columns("community", "POP2010", "geometry") def base_df(base_df_location: str) -> geopandas.GeoDataFrame: """Loads base dataframe of data. @@ -15,16 +17,21 @@ def base_df(base_df_location: str) -> geopandas.GeoDataFrame: """ chicago = geopandas.read_file(geodatasets.get_path("geoda.chicago_commpop")) return chicago - + + # Look at `my_functions` to see how these functions connect. def chicago_area(geometry: geopandas.GeoSeries) -> geopandas.GeoSeries: """Get the area of the row using the geometry column""" - return 10000 * geometry.area + return 10000 * geometry.area + def chicago_population(POP2010: geopandas.GeoSeries) -> geopandas.GeoSeries: - """ Get the population of the area of interest""" + """Get the population of the area of interest""" return POP2010 -def chicago_population_density(chicago_area: geopandas.GeoSeries, chicago_population: geopandas.GeoSeries) -> geopandas.GeoSeries: - """ Calculate the population density""" - return chicago_population/chicago_area + +def chicago_population_density( + chicago_area: geopandas.GeoSeries, chicago_population: geopandas.GeoSeries +) -> geopandas.GeoSeries: + """Calculate the population density""" + return chicago_population / chicago_area diff --git a/examples/pandas/materialization/notebook.ipynb b/examples/pandas/materialization/notebook.ipynb index b9327457d..672502a9a 100644 --- a/examples/pandas/materialization/notebook.ipynb +++ b/examples/pandas/materialization/notebook.ipynb @@ -139,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2023-09-17T05:43:33.932468Z", @@ -154,7 +154,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Note: Hamilton collects completely anonymous data about usage. This will help us improve Hamilton over time. See https://github.com/dagworks-inc/hamilton#usage-analytics--data-privacy for details.\n" + "UsageError: Line magic function `%aimport` not found.\n" ] } ],