diff --git a/integrations/unstructured/pyproject.toml b/integrations/unstructured/pyproject.toml index 298fdb993..5d14fcfe1 100644 --- a/integrations/unstructured/pyproject.toml +++ b/integrations/unstructured/pyproject.toml @@ -180,6 +180,7 @@ markers = [ module = [ "haystack.*", "haystack_integrations.*", - "pytest.*" + "pytest.*", + "unstructured.*", ] ignore_missing_imports = true diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index a4a132437..0eff7bc82 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -13,8 +13,8 @@ from haystack.utils import Secret from tqdm import tqdm -from unstructured.documents.elements import Element # type: ignore[import] -from unstructured.partition.api import partition_via_api # type: ignore[import] +from unstructured.documents.elements import Element +from unstructured.partition.api import partition_via_api logger = logging.getLogger(__name__) @@ -24,7 +24,23 @@ @component class UnstructuredFileConverter: """ - Convert files to Haystack Documents using the Unstructured API (hosted or running locally). + A component for converting files to Haystack Documents using the Unstructured API (hosted or running locally). + + For the supported file types and the specific API parameters, see + [Unstructured docs](https://unstructured-io.github.io/unstructured/api.html). 
+ + Usage example: + ```python + from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter + + # make sure to either set the environment variable UNSTRUCTURED_API_KEY + # or run the Unstructured API locally: + # docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest + # --port 8000 --host 0.0.0.0 + + converter = UnstructuredFileConverter() + documents = converter.run(paths = ["a/file/path.pdf", "a/directory/path"])["documents"] + ``` """ def __init__( @@ -39,22 +55,21 @@ def __init__( progress_bar: bool = True, # noqa: FBT001, FBT002 ): """ - :param api_url: URL of the Unstructured API. Defaults to the hosted version. - If you run the API locally, specify the URL of your local API (e.g. http://localhost:8000/general/v0/general). - See https://unstructured-io.github.io/unstructured/api.html#using-the-api-locally for more information. - :param api_key: API key for the Unstructured API (https://unstructured.io/#get-api-key). + :param api_url: URL of the Unstructured API. Defaults to the URL of the hosted version. + If you run the API locally, specify the URL of your local API (e.g. `"http://localhost:8000/general/v0/general"`). + :param api_key: API key for the Unstructured API. + It can be explicitly passed or read from the environment variable `UNSTRUCTURED_API_KEY` (recommended). If you run the API locally, it is not needed. - If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY. :param document_creation_mode: How to create Haystack Documents from the elements returned by Unstructured. - - "one-doc-per-file": One Haystack Document per file. All elements are concatenated into one text field. - - "one-doc-per-page": One Haystack Document per page. - All elements on a page are concatenated into one text field. - - "one-doc-per-element": One Haystack Document per element. - Each element is converted to a Haystack Document. 
+ `"one-doc-per-file"`: One Haystack Document per file. All elements are concatenated into one text field. + `"one-doc-per-page"`: One Haystack Document per page. + All elements on a page are concatenated into one text field. + `"one-doc-per-element"`: One Haystack Document per element. Each element is converted to a Haystack Document. :param separator: Separator between elements when concatenating them into one text field. - :param unstructured_kwargs: Additional keyword arguments that are passed to the Unstructured API. - See https://unstructured-io.github.io/unstructured/api.html. - :param progress_bar: Show a progress bar for the conversion. Defaults to True. + :param unstructured_kwargs: Additional parameters that are passed to the Unstructured API. + For the available parameters, see + [Unstructured API docs](https://unstructured-io.github.io/unstructured/apis/api_parameters.html). + :param progress_bar: Whether to show a progress bar during the conversion. """ self.api_url = api_url @@ -77,10 +92,12 @@ def __init__( def to_dict(self) -> Dict[str, Any]: """ - Serialize this component to a dictionary. + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. """ - # do not serialize api_key return default_to_dict( self, api_url=self.api_url, @@ -98,17 +115,21 @@ def run( meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, ): """ - Convert files to Haystack Documents using the Unstructured API (hosted or running locally). + Convert files to Haystack Documents using the Unstructured API. :param paths: List of paths to convert. Paths can be files or directories. If a path is a directory, all files in the directory are converted. Subdirectories are ignored. :param meta: Optional metadata to attach to the Documents. - This value can be either a list of dictionaries or a single dictionary. - If it's a single dictionary, its content is added to the metadata of all produced Documents. 
- If it's a list, the length of the list must match the number of paths, because the two lists will be zipped. - Please note that if the paths contain directories, meta can only be a single dictionary - (same metadata for all files). - Defaults to `None`. + This value can be either a list of dictionaries or a single dictionary. + If it's a single dictionary, its content is added to the metadata of all produced Documents. + If it's a list, the length of the list must match the number of paths, because the two lists will be zipped. + Please note that if the paths contain directories, `meta` can only be a single dictionary + (same metadata for all files). + + :returns: A dictionary with the following key: + - "documents": List of Haystack Documents. + + :raises ValueError: If `meta` is a list and `paths` contains directories. """ paths_obj = [Path(path) for path in paths] filepaths = [path for path in paths_obj if path.is_file()]