Skip to content

Commit

Permalink
unstructured: review docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
anakin87 committed Mar 4, 2024
1 parent 7a1e118 commit 96e2741
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 26 deletions.
3 changes: 2 additions & 1 deletion integrations/unstructured/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ markers = [
module = [
"haystack.*",
"haystack_integrations.*",
"pytest.*"
"pytest.*",
"unstructured.*",
]
ignore_missing_imports = true
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
from haystack.utils import Secret
from tqdm import tqdm

from unstructured.documents.elements import Element # type: ignore[import]
from unstructured.partition.api import partition_via_api # type: ignore[import]
from unstructured.documents.elements import Element
from unstructured.partition.api import partition_via_api

logger = logging.getLogger(__name__)

Expand All @@ -24,7 +24,23 @@
@component
class UnstructuredFileConverter:
"""
Convert files to Haystack Documents using the Unstructured API (hosted or running locally).
A component for converting files to Haystack Documents using the Unstructured API (hosted or running locally).
For the supported file types and the specific API parameters, see
[Unstructured docs](https://unstructured-io.github.io/unstructured/api.html).
Usage example:
```python
from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter
# make sure to either set the environment variable UNSTRUCTURED_API_KEY
# or run the Unstructured API locally:
# docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest
# --port 8000 --host 0.0.0.0
converter = UnstructuredFileConverter()
documents = converter.run(paths=["a/file/path.pdf", "a/directory/path"])["documents"]
```
"""

def __init__(
Expand All @@ -39,22 +55,21 @@ def __init__(
progress_bar: bool = True, # noqa: FBT001, FBT002
):
"""
:param api_url: URL of the Unstructured API. Defaults to the hosted version.
If you run the API locally, specify the URL of your local API (e.g. http://localhost:8000/general/v0/general).
See https://unstructured-io.github.io/unstructured/api.html#using-the-api-locally for more information.
:param api_key: API key for the Unstructured API (https://unstructured.io/#get-api-key).
:param api_url: URL of the Unstructured API. Defaults to the URL of the hosted version.
If you run the API locally, specify the URL of your local API (e.g. `"http://localhost:8000/general/v0/general"`).
:param api_key: API key for the Unstructured API.
It can be explicitly passed or read from the environment variable `UNSTRUCTURED_API_KEY` (recommended).
If you run the API locally, it is not needed.
If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY.
:param document_creation_mode: How to create Haystack Documents from the elements returned by Unstructured.
- "one-doc-per-file": One Haystack Document per file. All elements are concatenated into one text field.
- "one-doc-per-page": One Haystack Document per page.
All elements on a page are concatenated into one text field.
- "one-doc-per-element": One Haystack Document per element.
Each element is converted to a Haystack Document.
`"one-doc-per-file"`: One Haystack Document per file. All elements are concatenated into one text field.
`"one-doc-per-page"`: One Haystack Document per page.
All elements on a page are concatenated into one text field.
`"one-doc-per-element"`: One Haystack Document per element. Each element is converted to a Haystack Document.
:param separator: Separator between elements when concatenating them into one text field.
:param unstructured_kwargs: Additional keyword arguments that are passed to the Unstructured API.
See https://unstructured-io.github.io/unstructured/api.html.
:param progress_bar: Show a progress bar for the conversion. Defaults to True.
:param unstructured_kwargs: Additional parameters that are passed to the Unstructured API.
For the available parameters, see
[Unstructured API docs](https://unstructured-io.github.io/unstructured/apis/api_parameters.html).
:param progress_bar: Whether to show a progress bar during the conversion.
"""

self.api_url = api_url
Expand All @@ -77,10 +92,12 @@ def __init__(

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""

# do not serialize api_key
return default_to_dict(
self,
api_url=self.api_url,
Expand All @@ -98,17 +115,21 @@ def run(
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
):
"""
Convert files to Haystack Documents using the Unstructured API (hosted or running locally).
Convert files to Haystack Documents using the Unstructured API.
:param paths: List of paths to convert. Paths can be files or directories.
If a path is a directory, all files in the directory are converted. Subdirectories are ignored.
:param meta: Optional metadata to attach to the Documents.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of paths, because the two lists will be zipped.
Please note that if the paths contain directories, meta can only be a single dictionary
(same metadata for all files).
Defaults to `None`.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of paths, because the two lists will be zipped.
Please note that if the paths contain directories, meta can only be a single dictionary
(same metadata for all files).
:returns: A dictionary with the following key:
- "documents": List of Haystack Documents.
:raises ValueError: If `meta` is a list and `paths` contains directories.
"""
paths_obj = [Path(path) for path in paths]
filepaths = [path for path in paths_obj if path.is_file()]
Expand Down

0 comments on commit 96e2741

Please sign in to comment.