forked from langchain-ai/langchain
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Geopandas.GeoDataFrame Document Loader (langchain-ai#3817)
Work in progress (WIP); not ready to merge. Adds Document Loader support for [geopandas.GeoDataFrame](https://geopandas.org/) objects. Example: - [x] Stub out the `GeoDataFrameLoader` class - [x] Stub out integration tests - [ ] Experiment with different geometry text representations - [ ] Verify that the CRS is successfully added to the metadata - [ ] Test the effectiveness of searches on geometries - [ ] Test with different geometry types (point, line, and polygon, including their multi-variants) - [ ] Add documentation --------- Co-authored-by: Lance Martin <[email protected]> Co-authored-by: Bagatur <[email protected]> Co-authored-by: Lance Martin <[email protected]>
- Loading branch information
1 parent
dfc533a
commit 9aef79c
Showing
6 changed files
with
493 additions
and
2 deletions.
There are no files selected for viewing
199 changes: 199 additions & 0 deletions
199
docs/extras/modules/data_connection/document_loaders/integrations/geopandas.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
"""Load from Dataframe object""" | ||
from typing import Any, Iterator, List | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.document_loaders.base import BaseLoader | ||
|
||
|
||
class GeoDataFrameLoader(BaseLoader):
    """Load the rows of a geopandas GeoDataFrame as documents.

    Each row becomes one ``Document``: the value found in
    ``page_content_column`` is converted with ``str()`` and used as the page
    content, and every other column of the row is stored in the metadata.
    """

    def __init__(self, data_frame: Any, page_content_column: str = "geometry"):
        """Initialize with a geopandas GeoDataFrame.

        Args:
            data_frame: geopandas GeoDataFrame object.
            page_content_column: Name of the column containing the page content.
                Defaults to "geometry".

        Raises:
            ValueError: If geopandas is not installed, if ``data_frame`` is not
                a ``GeoDataFrame``, or if ``page_content_column`` is not one of
                its columns.
        """

        try:
            import geopandas as gpd
        except ImportError as err:
            # ValueError is kept (rather than ImportError) so existing callers
            # that catch it keep working; the original error is chained so the
            # real cause stays visible in the traceback.
            raise ValueError(
                "geopandas package not found, please install it with "
                "`pip install geopandas`"
            ) from err

        if not isinstance(data_frame, gpd.GeoDataFrame):
            raise ValueError(
                f"Expected data_frame to be a gpd.GeoDataFrame, got {type(data_frame)}"
            )

        # Fail fast on a bad column name instead of surfacing a bare KeyError
        # part-way through iteration in lazy_load().
        if page_content_column not in data_frame.columns:
            raise ValueError(
                f"Expected data_frame to have a column named {page_content_column}"
            )

        self.data_frame = data_frame
        self.page_content_column = page_content_column

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one Document per row of the dataframe.

        Yields:
            A Document whose page content is the string form of the row's
            ``page_content_column`` value and whose metadata is a dict of the
            row's remaining columns.
        """

        for _, row in self.data_frame.iterrows():
            content = row[self.page_content_column]
            metadata = row.to_dict()
            # The content column is carried as page_content, so it is removed
            # from the metadata to avoid duplicating it.
            metadata.pop(self.page_content_column)
            # Enforce str since shapely geometry objects (the geometry type
            # used in GeoPandas) are not strings.
            yield Document(page_content=str(content), metadata=metadata)

    def load(self) -> List[Document]:
        """Load the full dataframe eagerly and return all documents as a list."""
        return list(self.lazy_load())
Oops, something went wrong.