Skip to content

Commit

Permalink
Add Geopandas.GeoDataFrame Document Loader (langchain-ai#3817)
Browse files Browse the repository at this point in the history
Work in Progress.
WIP
Not ready...

Adds Document Loader support for
[Geopandas.GeoDataFrames](https://geopandas.org/)

Example:
- [x] stub out `GeoDataFrameLoader` class
- [x] stub out integration tests
- [ ] Experiment with different geometry text representations
- [ ] Verify CRS is successfully added in metadata
- [ ] Test effectiveness of searches on geometries
- [ ] Test with different geometry types (point, line, and polygon,
including their multi-part variants).
- [ ] Add documentation

---------

Co-authored-by: Lance Martin <[email protected]>
Co-authored-by: Bagatur <[email protected]>
Co-authored-by: Lance Martin <[email protected]>
  • Loading branch information
4 people authored Jul 19, 2023
1 parent dfc533a commit 9aef79c
Show file tree
Hide file tree
Showing 6 changed files with 493 additions and 2 deletions.

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions langchain/document_loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
from langchain.document_loaders.figma import FigmaFileLoader
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
from langchain.document_loaders.gcs_file import GCSFileLoader
from langchain.document_loaders.geodataframe import GeoDataFrameLoader
from langchain.document_loaders.git import GitLoader
from langchain.document_loaders.gitbook import GitbookLoader
from langchain.document_loaders.github import GitHubIssuesLoader
Expand Down Expand Up @@ -200,6 +201,7 @@
"FileSystemBlobLoader",
"GCSDirectoryLoader",
"GCSFileLoader",
"GeoDataFrameLoader",
"GitHubIssuesLoader",
"GitLoader",
"GitbookLoader",
Expand Down
49 changes: 49 additions & 0 deletions langchain/document_loaders/geodataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Load from Dataframe object"""
from typing import Any, Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class GeoDataFrameLoader(BaseLoader):
    """Load a geopandas GeoDataFrame as one Document per row.

    For every row, the value stored in ``page_content_column`` is converted
    with ``str()`` and used as the page content; the remaining columns of the
    row are passed along as the Document metadata.
    """

    def __init__(self, data_frame: Any, page_content_column: str = "geometry"):
        """Initialize with geopandas GeoDataFrame.

        Args:
            data_frame: geopandas GeoDataFrame object.
            page_content_column: Name of the column containing the page content.
                Defaults to "geometry".

        Raises:
            ValueError: If geopandas is not installed, if ``data_frame`` is not
                a ``gpd.GeoDataFrame``, or if ``page_content_column`` is not
                one of its columns.
        """
        try:
            import geopandas as gpd
        except ImportError as exc:
            # Keep raising ValueError so callers that already catch it keep
            # working; chain the ImportError so the root cause stays visible.
            raise ValueError(
                "geopandas package not found, please install it with "
                "`pip install geopandas`"
            ) from exc

        if not isinstance(data_frame, gpd.GeoDataFrame):
            raise ValueError(
                f"Expected data_frame to be a gpd.GeoDataFrame, got {type(data_frame)}"
            )

        # Fail fast on a wrong column name instead of surfacing a bare
        # KeyError later, in the middle of iterating over the rows.
        if page_content_column not in data_frame.columns:
            raise ValueError(
                f"Expected data_frame to have a column named {page_content_column}"
            )

        self.data_frame = data_frame
        self.page_content_column = page_content_column

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one Document per row of the dataframe."""
        for _, row in self.data_frame.iterrows():
            text = row[self.page_content_column]
            # Everything except the content column is kept as metadata.
            metadata = row.to_dict()
            metadata.pop(self.page_content_column)
            # Enforce str since shapely geometry objects (the geometry type
            # used in GeoPandas) are not strings.
            yield Document(page_content=str(text), metadata=metadata)

    def load(self) -> List[Document]:
        """Load the full dataframe eagerly and return all Documents as a list."""
        return list(self.lazy_load())
Loading

0 comments on commit 9aef79c

Please sign in to comment.