Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

community: add init for unstructured file loader #29101

Merged
merged 3 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions libs/community/langchain_community/document_loaders/epub.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from typing import List
from pathlib import Path
from typing import Any, List, Union

from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
satisfies_min_unstructured_version,
validate_unstructured_version,
)


Expand Down Expand Up @@ -30,13 +31,25 @@ class UnstructuredEPubLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-epub
"""

def __init__(
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""Initialize the loader with the EPub file to partition.

Args:
file_path: The path to the EPub file to load. Either a ``str`` or a
    ``pathlib.Path``; it is converted to ``str`` before being passed
    to the parent class.
mode: The mode to use when loading the file. Can be one of "single",
    "multi", or "all". Default is "single".
    NOTE(review): the parent Unstructured loader is understood to
    accept "single", "elements" and "paged"; "multi"/"all" may be
    stale -- confirm against the parent class.
**unstructured_kwargs: Extra keyword arguments forwarded unchanged to
    the parent class; ``_get_elements`` later passes
    ``self.unstructured_kwargs`` on to ``partition_epub``.
"""
# Normalize to str so the parent loader always receives a plain string path.
file_path = str(file_path)
# Version gate for EPub partitioning (minimum "0.5.4"); the helper is defined
# in the unstructured loader module -- presumably it raises when the installed
# unstructured package is older, TODO confirm.
validate_unstructured_version("0.5.4")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

def _get_elements(self) -> List:
"""Partition the EPub file at ``self.file_path`` with unstructured.

Returns:
    The result of ``partition_epub`` called with ``self.file_path`` and
    ``self.unstructured_kwargs``.

Raises:
    ValueError: If ``satisfies_min_unstructured_version("0.5.4")`` is falsy.
"""
min_unstructured_version = "0.5.4"
# Refuse to continue when the installed unstructured does not meet the minimum.
if not satisfies_min_unstructured_version(min_unstructured_version):
raise ValueError(
"Partitioning epub files is only supported in "
f"unstructured>={min_unstructured_version}."
)
# Imported inside the method rather than at module level -- presumably so this
# module can be imported without unstructured installed.
from unstructured.partition.epub import partition_epub

return partition_epub(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type]
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def __init__(
for more info. Optional. Defaults to "single".
**unstructured_kwargs: Keyword arguments to pass to unstructured.
"""
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.6.7")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

Expand Down
20 changes: 19 additions & 1 deletion libs/community/langchain_community/document_loaders/image.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List
from pathlib import Path
from typing import Any, List, Union

from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

Expand Down Expand Up @@ -27,6 +28,23 @@ class UnstructuredImageLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-image
"""

def __init__(
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""Initialize the loader with the image file to partition.

Args:
file_path: The path to the Image file to load. Either a ``str`` or a
    ``pathlib.Path``; it is converted to ``str`` before being passed
    to the parent class.
mode: The mode to use when loading the file. Can be one of "single",
    "multi", or "all". Default is "single".
    NOTE(review): the parent Unstructured loader is understood to
    accept "single", "elements" and "paged"; "multi"/"all" may be
    stale -- confirm against the parent class.
**unstructured_kwargs: Extra keyword arguments forwarded unchanged to
    the parent class (presumably passed on to unstructured's image
    partitioning -- TODO confirm).
"""
# Normalize to str so the parent loader always receives a plain string path.
file_path = str(file_path)
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

def _get_elements(self) -> List:
from unstructured.partition.image import partition_image

Expand Down
38 changes: 24 additions & 14 deletions libs/community/langchain_community/document_loaders/markdown.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from typing import List
from pathlib import Path
from typing import Any, List, Union

from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
validate_unstructured_version,
)


class UnstructuredMarkdownLoader(UnstructuredFileLoader):
Expand Down Expand Up @@ -68,19 +72,25 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/core/partition.html#partition-md
""" # noqa: E501

def __init__(
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""Initialize the loader with the Markdown file to partition.

Args:
file_path: The path to the Markdown file to load. Either a ``str`` or
    a ``pathlib.Path``; it is converted to ``str`` before being passed
    to the parent class.
mode: The mode to use when loading the file. Can be one of "single",
    "multi", or "all". Default is "single".
    NOTE(review): the parent Unstructured loader is understood to
    accept "single", "elements" and "paged"; "multi"/"all" may be
    stale -- confirm against the parent class.
**unstructured_kwargs: Extra keyword arguments forwarded unchanged to
    the parent class; ``_get_elements`` later passes
    ``self.unstructured_kwargs`` on to ``partition_md``.
"""
# Normalize to str so the parent loader always receives a plain string path.
file_path = str(file_path)
# Version gate for Markdown partitioning (minimum "0.4.16"); the helper is
# defined in the unstructured loader module -- presumably it raises when the
# installed unstructured package is older, TODO confirm.
validate_unstructured_version("0.4.16")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

def _get_elements(self) -> List:
"""Partition the Markdown file at ``self.file_path`` with unstructured.

Returns:
    The result of ``partition_md`` called with ``self.file_path`` and
    ``self.unstructured_kwargs``.

Raises:
    ValueError: If the installed unstructured version, with any "-suffix"
        removed, compares lower than (0, 4, 16).
"""
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.partition.md import partition_md

# NOTE(MthwRobinson) - enables the loader to work when you're using pre-release
# versions of unstructured like 0.4.17-dev1
_unstructured_version = __unstructured_version__.split("-")[0]
# Turn "X.Y.Z" into a tuple of ints so the comparison below is numeric rather
# than lexicographic on strings.
unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])

if unstructured_version < (0, 4, 16):
raise ValueError(
f"You are on unstructured version {__unstructured_version__}. "
"Partitioning markdown files is only supported in unstructured>=0.4.16."
)

return partition_md(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type]
1 change: 1 addition & 0 deletions libs/community/langchain_community/document_loaders/odt.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(
"multi", or "all". Default is "single".
**unstructured_kwargs: Any kwargs to pass to the unstructured.
"""
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.6.3")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(
**unstructured_kwargs: Any additional keyword arguments to pass
to the unstructured.
"""
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.7.9")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

Expand Down
17 changes: 17 additions & 0 deletions libs/community/langchain_community/document_loaders/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,23 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
"""

def __init__(
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""

Args:
file_path: The path to the PDF file to load.
mode: The mode to use when loading the file. Can be one of "single",
"multi", or "all". Default is "single".
**unstructured_kwargs: Any kwargs to pass to the unstructured.
"""
file_path = str(file_path)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these casts necessary? Some loaders here appear to support Path:

file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = UnstructuredPDFLoader(file_path, mode="elements")

(Those integration tests are not run in CI but are intended to be run locally by developers.)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah just saw your comment.

super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

def _get_elements(self) -> list:
from unstructured.partition.pdf import partition_pdf

Expand Down
37 changes: 25 additions & 12 deletions libs/community/langchain_community/document_loaders/powerpoint.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import os
from typing import List
from pathlib import Path
from typing import Any, List, Union

from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
validate_unstructured_version,
)


class UnstructuredPowerPointLoader(UnstructuredFileLoader):
Expand Down Expand Up @@ -29,13 +33,26 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-pptx
"""

def __init__(
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""Initialize the loader with the PowerPoint file to partition.

Args:
file_path: The path to the PowerPoint file to load. Either a ``str``
    or a ``pathlib.Path``; it is converted to ``str`` before being
    passed to the parent class.
mode: The mode to use when loading the file. Can be one of "single",
    "multi", or "all". Default is "single".
    NOTE(review): the parent Unstructured loader is understood to
    accept "single", "elements" and "paged"; "multi"/"all" may be
    stale -- confirm against the parent class.
**unstructured_kwargs: Extra keyword arguments forwarded unchanged to
    the parent class (presumably passed on to unstructured's
    PowerPoint partitioning -- TODO confirm).
"""
# Normalize to str so the parent loader always receives a plain string path.
# No version check happens here; the .ppt-specific check lives in
# _get_elements, where the file type is known.
file_path = str(file_path)
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import FileType, detect_filetype

unstructured_version = tuple(
[int(x) for x in __unstructured_version__.split(".")]
)
# NOTE(MthwRobinson) - magic will raise an import error if the libmagic
# system dependency isn't installed. If it's not installed, we'll just
# check the file extension
Expand All @@ -47,12 +64,8 @@ def _get_elements(self) -> List:
_, extension = os.path.splitext(str(self.file_path))
is_ppt = extension == ".ppt"

if is_ppt and unstructured_version < (0, 4, 11):
raise ValueError(
f"You are on unstructured version {__unstructured_version__}. "
"Partitioning .ppt files is only supported in unstructured>=0.4.11. "
"Please upgrade the unstructured package and try again."
)
if is_ppt:
validate_unstructured_version("0.4.11")

if is_ppt:
from unstructured.partition.ppt import partition_ppt
Expand Down
1 change: 1 addition & 0 deletions libs/community/langchain_community/document_loaders/rst.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def __init__(
**unstructured_kwargs: Additional keyword arguments to pass
to unstructured.
"""
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.7.5")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

Expand Down
11 changes: 3 additions & 8 deletions libs/community/langchain_community/document_loaders/rtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
satisfies_min_unstructured_version,
validate_unstructured_version,
)


Expand Down Expand Up @@ -49,13 +49,8 @@ def __init__(
**unstructured_kwargs: Additional keyword arguments to pass
to unstructured.
"""
min_unstructured_version = "0.5.12"
if not satisfies_min_unstructured_version(min_unstructured_version):
raise ValueError(
"Partitioning rtf files is only supported in "
f"unstructured>={min_unstructured_version}."
)

file_path = str(file_path)
validate_unstructured_version("0.5.12")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

def _get_elements(self) -> List:
Expand Down
1 change: 1 addition & 0 deletions libs/community/langchain_community/document_loaders/tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(
mode: str = "single",
**unstructured_kwargs: Any,
):
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.7.6")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,17 @@
import tempfile
from abc import ABC
from pathlib import Path
from typing import List, Union
from typing import Any, List, Union
from urllib.parse import urlparse

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
validate_unstructured_version,
)


class Docx2txtLoader(BaseLoader, ABC):
Expand Down Expand Up @@ -92,13 +95,26 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-docx
"""

def __init__(
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""Initialize the loader with the Word document to partition.

Args:
file_path: The path to the Word file to load. Either a ``str`` or a
    ``pathlib.Path``; it is converted to ``str`` before being passed
    to the parent class.
mode: The mode to use when loading the file. Can be one of "single",
    "multi", or "all". Default is "single".
    NOTE(review): the parent Unstructured loader is understood to
    accept "single", "elements" and "paged"; "multi"/"all" may be
    stale -- confirm against the parent class.
**unstructured_kwargs: Extra keyword arguments forwarded unchanged to
    the parent class (presumably passed on to unstructured's Word
    partitioning -- TODO confirm).
"""
# Normalize to str so the parent loader always receives a plain string path.
# No version check happens here; the .doc-specific check lives in
# _get_elements, where the file type is known.
file_path = str(file_path)
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import FileType, detect_filetype

unstructured_version = tuple(
[int(x) for x in __unstructured_version__.split(".")]
)
# NOTE(MthwRobinson) - magic will raise an import error if the libmagic
# system dependency isn't installed. If it's not installed, we'll just
# check the file extension
Expand All @@ -110,12 +126,8 @@ def _get_elements(self) -> List:
_, extension = os.path.splitext(str(self.file_path))
is_doc = extension == ".doc"

if is_doc and unstructured_version < (0, 4, 11):
raise ValueError(
f"You are on unstructured version {__unstructured_version__}. "
"Partitioning .doc files is only supported in unstructured>=0.4.11. "
"Please upgrade the unstructured package and try again."
)
if is_doc:
validate_unstructured_version("0.4.11")

if is_doc:
from unstructured.partition.doc import partition_doc
Expand Down
Loading