diff --git a/src/unstract/sdk/adapters/x2text/constants.py b/src/unstract/sdk/adapters/x2text/constants.py index 77cca1b..44418c5 100644 --- a/src/unstract/sdk/adapters/x2text/constants.py +++ b/src/unstract/sdk/adapters/x2text/constants.py @@ -5,3 +5,4 @@ class X2TextConstants: ENABLE_HIGHLIGHT = "enable_highlight" EXTRACTED_TEXT = "extracted_text" WHISPER_HASH = "whisper-hash" + WHISPER_HASH_V2 = "whisper_hash" diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json index 2dce2de..344adc9 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json @@ -18,7 +18,7 @@ "title": "URL", "format": "uri", "default": "https://llmwhisperer-api.unstract.com", - "description": "Provide the URL of the LLM Whisperer service." + "description": "Provide the URL of the LLM Whisperer service. Please note that this version of LLM Whisperer is deprecated." }, "unstract_key": { "type": "string", diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md new file mode 100644 index 0000000..57ea77b --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md @@ -0,0 +1,58 @@ +# Unstract LLM Whisperer v2 X2Text Adapter + +## Env variables + +The below env variables are resolved by LLM Whisperer adapter + +| Variable | Description | +| ---------------------------- | -------------------------------------------------------------------------------------------- | +| `ADAPTER_LLMW_POLL_INTERVAL` | Time in seconds to wait before polling LLMWhisperer's status API. Defaults to 30s | +| `ADAPTER_LLMW_MAX_POLLS` | Total number of times to poll the status API. 
Defaults to 30 |
+
+
+---
+id: llm_whisperer_apis_changelog
+---
+
+# Changelog
+
+## Version 2.0.0
+
+:::warning
+This version of the API is not backward compatible with the previous version.
+:::
+
+### API endpoint
+
+- The base URL for the **V2** APIs is `https://llmwhisperer-api.unstract.com/api/v2`
+
+### Global change in parameter naming
+
+- All use of `whisper-hash` as a parameter has been replaced with `whisper_hash` for consistency.
+
+### Whisper parameters
+
+#### Added
+- `mode` (str, optional): The processing mode.
+- `mark_vertical_lines` (bool, optional): Whether to reproduce vertical lines in the document.
+- `mark_horizontal_lines` (bool, optional): Whether to reproduce horizontal lines in the document.
+- `line_splitter_strategy` (str, optional): The line splitter strategy to use. An advanced option for customizing the line splitting process.
+- `lang` (str, optional): The language of the document.
+- `tag` (str, optional): A tag to associate with the document. Used for auditing and tracking purposes.
+- `file_name` (str, optional): The name of the file being processed. Used for auditing and tracking purposes.
+- `use_webhook` (str, optional): The name of the webhook to call after the document is processed.
+- `webhook_metadata` (str, optional): Metadata to send to the webhook after the document is processed.
+
+#### Removed
+- `timeout` (int, optional): The timeout for API requests. *There is no sync mode now. All requests are async.*
+- `force_text_processing` (bool, optional): Whether to force text processing. *This feature is removed*
+- `ocr_provider` (str, optional): The OCR provider to use. *This is superseded by `mode`*
+- `processing_mode` (str, optional): The processing mode. *This is superseded by `mode`*
+- `store_metadata_for_highlighting` (bool, optional): Whether to store metadata for highlighting. *Feature is removed. 
Data still available and set back when retrieve is called* + + +### New features + +#### Webhooks + +- Added support for webhooks. You can now register a webhook and use it to receive the processed document. diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml new file mode 100644 index 0000000..bf7ad3a --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["pdm-backend"] +build-backend = "pdm.backend" + + +[project] +name = "unstract-llm_whisperer-x2text-v2" +version = "0.0.1" +description = "V2 of LLMWhisperer X2Text Adapter" +authors = [ + {name = "Zipstack Inc.", email = "devsupport@zipstack.com"}, +] +dependencies = [ +] +requires-python = ">=3.9" +readme = "README.md" +classifiers = [ + "Programming Language :: Python" +] +license = {text = "MIT"} + +[tool.pdm.build] +includes = ["src"] +package-dir = "src" +# source-includes = ["tests"] diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py new file mode 100644 index 0000000..14240c6 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py @@ -0,0 +1,9 @@ +from .llm_whisperer_v2 import LLMWhispererV2 + +metadata = { + "name": LLMWhispererV2.__name__, + "version": "1.0.0", + "adapter": LLMWhispererV2, + "description": "LLMWhispererV2 X2Text adapter", + "is_active": True, +} diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py new file mode 100644 index 0000000..146b5ce --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py @@ -0,0 +1,103 @@ +import os +from enum import Enum + + +class Modes(Enum): + NATIVE_TEXT = "native_text" + LOW_COST = "low_cost" + HIGH_QUALITY = "high_quality" + FORM = "form" + 
+ +class OutputModes(Enum): + LAYOUT_PRESERVING = "layout_preserving" + TEXT = "text" + + +class HTTPMethod(Enum): + GET = "GET" + POST = "POST" + + +class WhispererHeader: + UNSTRACT_KEY = "unstract-key" + + +class WhispererEndpoint: + """Endpoints available at LLMWhisperer service.""" + + TEST_CONNECTION = "test-connection" + WHISPER = "whisper" + STATUS = "whisper-status" + RETRIEVE = "whisper-retrieve" + + +class WhispererEnv: + """Env variables for LLM whisperer. + + Can be used to alter behaviour at runtime. + + Attributes: + POLL_INTERVAL: Time in seconds to wait before polling + LLMWhisperer's status API. Defaults to 30s + MAX_POLLS: Total number of times to poll the status API. + Set to -1 to poll indefinitely. Defaults to -1 + """ + + POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL" + MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS" + + +class WhispererConfig: + """Dictionary keys used to configure LLMWhisperer service.""" + + URL = "url" + MODE = "mode" + OUTPUT_MODE = "output_mode" + UNSTRACT_KEY = "unstract_key" + MEDIAN_FILTER_SIZE = "median_filter_size" + GAUSSIAN_BLUR_RADIUS = "gaussian_blur_radius" + LINE_SPLITTER_TOLERANCE = "line_splitter_tolerance" + LINE_SPLITTER_STRATEGY = "line_splitter_strategy" + HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor" + PAGES_TO_EXTRACT = "pages_to_extract" + MARK_VERTICAL_LINES = "mark_vertical_lines" + MARK_HORIZONTAL_LINES = "mark_horizontal_lines" + PAGE_SEPARATOR = "page_seperator" + URL_IN_POST = "url_in_post" + TAG = "tag" + USE_WEBHOOK = "use_webhook" + WEBHOOK_METADATA = "webhook_metadata" + TEXT_ONLY = "text_only" + + +class WhisperStatus: + """Values returned / used by /whisper-status endpoint.""" + + PROCESSING = "processing" + PROCESSED = "processed" + DELIVERED = "delivered" + UNKNOWN = "unknown" + # Used for async processing + WHISPER_HASH = "whisper_hash" + STATUS = "status" + + +class WhispererDefaults: + """Defaults meant for LLM whisperer.""" + + MEDIAN_FILTER_SIZE = 0 + GAUSSIAN_BLUR_RADIUS = 0.0 + 
FORCE_TEXT_PROCESSING = False + LINE_SPLITTER_TOLERANCE = 0.75 + LINE_SPLITTER_STRATEGY = "left-priority" + HORIZONTAL_STRETCH_FACTOR = 1.0 + POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30)) + MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30)) + PAGES_TO_EXTRACT = "" + PAGE_SEPARATOR = "<<<" + MARK_VERTICAL_LINES = False + MARK_HORIZONTAL_LINES = False + URL_IN_POST = False + TAG = "default" + TEXT_ONLY = False diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py new file mode 100644 index 0000000..202ce64 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py @@ -0,0 +1,360 @@ +import json +import logging +import time +from pathlib import Path +from typing import Any, Optional + +import requests +from requests import Response +from requests.exceptions import ConnectionError, HTTPError, Timeout + +from unstract.sdk.adapters.exceptions import ExtractorError +from unstract.sdk.adapters.utils import AdapterUtils +from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.constants import ( + HTTPMethod, + Modes, + OutputModes, + WhispererConfig, + WhispererDefaults, + WhispererEndpoint, + WhispererHeader, + WhisperStatus, +) + +logger = logging.getLogger(__name__) + + +class LLMWhispererHelper: + + @staticmethod + def get_request_headers(config: dict[str, Any]) -> dict[str, Any]: + """Obtains the request headers to authenticate with LLM Whisperer. + + Returns: + str: Request headers + """ + return { + "accept": "application/json", + WhispererHeader.UNSTRACT_KEY: config.get(WhispererConfig.UNSTRACT_KEY), + } + + @staticmethod + def make_request( + config: dict[str, Any], + request_method: HTTPMethod, + request_endpoint: str, + headers: Optional[dict[str, Any]] = None, + params: Optional[dict[str, Any]] = None, + data: Optional[Any] = None, + ) -> Response: + """Makes a request to LLM whisperer service. 
+ + Args: + request_method (HTTPMethod): HTTPMethod to call. Can be GET or POST + request_endpoint (str): LLM whisperer endpoint to hit + headers (Optional[dict[str, Any]], optional): Headers to pass. + Defaults to None. + params (Optional[dict[str, Any]], optional): Query params to pass. + Defaults to None. + data (Optional[Any], optional): Data to pass in case of POST. + Defaults to None. + + Returns: + Response: Response from the request + """ + llm_whisperer_svc_url = ( + f"{config.get(WhispererConfig.URL)}" f"/api/v2/{request_endpoint}" + ) + if not headers: + headers = LLMWhispererHelper.get_request_headers(config=config) + + try: + response: Response + if request_method == HTTPMethod.GET: + response = requests.get( + url=llm_whisperer_svc_url, headers=headers, params=params + ) + elif request_method == HTTPMethod.POST: + response = requests.post( + url=llm_whisperer_svc_url, + headers=headers, + params=params, + data=data, + ) + else: + raise ExtractorError(f"Unsupported request method: {request_method}") + response.raise_for_status() + except ConnectionError as e: + logger.error(f"Adapter error: {e}") + raise ExtractorError( + "Unable to connect to LLM Whisperer service, please check the URL" + ) + except Timeout as e: + msg = "Request to LLM whisperer has timed out" + logger.error(f"{msg}: {e}") + raise ExtractorError(msg) + except HTTPError as e: + logger.error(f"Adapter error: {e}") + default_err = "Error while calling the LLM Whisperer service" + msg = AdapterUtils.get_msg_from_request_exc( + err=e, message_key="message", default_err=default_err + ) + raise ExtractorError(msg) + return response + + @staticmethod + def get_whisperer_params(config: dict[str, Any]) -> dict[str, Any]: + """Gets query params meant for /whisper endpoint. + + The params is filled based on the configuration passed. 
+
+        Returns:
+            dict[str, Any]: Query params
+        """
+        params = {
+            WhispererConfig.MODE: config.get(WhispererConfig.MODE, Modes.FORM.value),
+            WhispererConfig.OUTPUT_MODE: config.get(
+                WhispererConfig.OUTPUT_MODE, OutputModes.LAYOUT_PRESERVING.value
+            ),
+            WhispererConfig.LINE_SPLITTER_TOLERANCE: config.get(
+                WhispererConfig.LINE_SPLITTER_TOLERANCE,
+                WhispererDefaults.LINE_SPLITTER_TOLERANCE,
+            ),
+            WhispererConfig.LINE_SPLITTER_STRATEGY: config.get(
+                WhispererConfig.LINE_SPLITTER_STRATEGY,
+                WhispererDefaults.LINE_SPLITTER_STRATEGY,
+            ),
+            WhispererConfig.HORIZONTAL_STRETCH_FACTOR: config.get(
+                WhispererConfig.HORIZONTAL_STRETCH_FACTOR,
+                WhispererDefaults.HORIZONTAL_STRETCH_FACTOR,
+            ),
+            WhispererConfig.PAGES_TO_EXTRACT: config.get(
+                WhispererConfig.PAGES_TO_EXTRACT,
+                WhispererDefaults.PAGES_TO_EXTRACT,
+            ),
+            WhispererConfig.MARK_VERTICAL_LINES: config.get(
+                WhispererConfig.MARK_VERTICAL_LINES,
+                WhispererDefaults.MARK_VERTICAL_LINES,
+            ),
+            WhispererConfig.MARK_HORIZONTAL_LINES: config.get(
+                WhispererConfig.MARK_HORIZONTAL_LINES,
+                WhispererDefaults.MARK_HORIZONTAL_LINES,
+            ),
+            WhispererConfig.URL_IN_POST: WhispererDefaults.URL_IN_POST,
+            WhispererConfig.PAGE_SEPARATOR: config.get(
+                WhispererConfig.PAGE_SEPARATOR,
+                WhispererDefaults.PAGE_SEPARATOR,
+            ),
+            # Not providing default value to maintain legacy compatibility
+            # these are optional params and identifiers for audit
+            WhispererConfig.TAG: config.get(
+                WhispererConfig.TAG,
+                WhispererDefaults.TAG,
+            ),
+            WhispererConfig.USE_WEBHOOK: config.get(WhispererConfig.USE_WEBHOOK),
+            WhispererConfig.WEBHOOK_METADATA: config.get(
+                WhispererConfig.WEBHOOK_METADATA
+            ),
+        }
+        if params[WhispererConfig.MODE] == Modes.LOW_COST.value:
+            params.update(
+                {
+                    WhispererConfig.MEDIAN_FILTER_SIZE: config.get(
+                        WhispererConfig.MEDIAN_FILTER_SIZE,
+                        WhispererDefaults.MEDIAN_FILTER_SIZE,
+                    ),
+                    WhispererConfig.GAUSSIAN_BLUR_RADIUS: config.get(
+                        WhispererConfig.GAUSSIAN_BLUR_RADIUS,
+
WhispererDefaults.GAUSSIAN_BLUR_RADIUS, + ), + } + ) + return params + + @staticmethod + def check_status_until_ready( + config: dict[str, Any], + whisper_hash: str, + headers: dict[str, Any], + params: dict[str, Any], + ) -> WhisperStatus: + """Checks the extraction status by polling. + + Polls the /whisper-status endpoint in fixed intervals of + env: ADAPTER_LLMW_POLL_INTERVAL for a certain number of times + controlled by env: ADAPTER_LLMW_MAX_POLLS. + + Args: + whisper_hash (str): Identifier for the extraction, + returned by LLMWhisperer + headers (dict[str, Any]): Headers to pass for the status check + params (dict[str, Any]): Params to pass for the status check + + Returns: + WhisperStatus: Status of the extraction + """ + POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL + MAX_POLLS = WhispererDefaults.MAX_POLLS + request_count = 0 + + # Check status in fixed intervals upto max poll count. + while True: + request_count += 1 + logger.info( + f"Checking status with interval: {POLL_INTERVAL}s" + f", request count: {request_count} [max: {MAX_POLLS}]" + ) + status_response = LLMWhispererHelper.make_request( + config=config, + request_method=HTTPMethod.GET, + request_endpoint=WhispererEndpoint.STATUS, + headers=headers, + params=params, + ) + if status_response.status_code == 200: + status_data = status_response.json() + status = status_data.get(WhisperStatus.STATUS, WhisperStatus.UNKNOWN) + logger.info(f"Whisper status for {whisper_hash}: {status}") + if status in [WhisperStatus.PROCESSED, WhisperStatus.DELIVERED]: + break + else: + raise ExtractorError( + "Error checking LLMWhisperer status: " + f"{status_response.status_code} - {status_response.text}" + ) + + # Exit with error if max poll count is reached + if request_count >= MAX_POLLS: + raise ExtractorError( + "Unable to extract text after attempting" f" {request_count} times" + ) + time.sleep(POLL_INTERVAL) + + return status + + @staticmethod + def extract_async(config: dict[str, Any], whisper_hash: str) -> 
dict[Any, Any]: + """Makes an async extraction with LLMWhisperer. + + Polls and checks the status first before proceeding to retrieve once. + + Args: + whisper_hash (str): Identifier of the extraction + + Returns: + str: Extracted contents from the file + """ + logger.info(f"Extracting async for whisper hash: {whisper_hash}") + + headers: dict[str, Any] = LLMWhispererHelper.get_request_headers(config) + params = { + WhisperStatus.WHISPER_HASH: whisper_hash, + WhispererConfig.TEXT_ONLY: WhispererDefaults.TEXT_ONLY, + } + + # Polls in fixed intervals and checks status + LLMWhispererHelper.check_status_until_ready( + config=config, whisper_hash=whisper_hash, headers=headers, params=params + ) + + retrieve_response = LLMWhispererHelper.make_request( + config=config, + request_method=HTTPMethod.GET, + request_endpoint=WhispererEndpoint.RETRIEVE, + headers=headers, + params=params, + ) + if retrieve_response.status_code == 200: + return retrieve_response.json() + else: + raise ExtractorError( + "Error retrieving from LLMWhisperer: " + f"{retrieve_response.status_code} - {retrieve_response.text}" + ) + + @staticmethod + def send_whisper_request( + input_file_path: str, config: dict[str, Any] + ) -> requests.Response: + headers = LLMWhispererHelper.get_request_headers(config) + headers["Content-Type"] = "application/octet-stream" + params = LLMWhispererHelper.get_whisperer_params(config) + + response: requests.Response + try: + with open(input_file_path, "rb") as input_f: + response = LLMWhispererHelper.make_request( + config=config, + request_method=HTTPMethod.POST, + request_endpoint=WhispererEndpoint.WHISPER, + headers=headers, + params=params, + data=input_f.read(), + ) + except OSError as e: + logger.error(f"OS error while reading {input_file_path}: {e}") + raise ExtractorError(str(e)) + return response + + @staticmethod + def extract_text_from_response( + config: dict[str, Any], + output_file_path: Optional[str], + response_dict: dict[str, Any], + response: Response, 
+ ) -> str: + output_json = {} + if response.status_code == 200: + output_json = response.json() + elif response.status_code == 202: + whisper_hash = response_dict.get(WhisperStatus.WHISPER_HASH) + output_json = LLMWhispererHelper.extract_async( + config=config, whisper_hash=whisper_hash + ) + else: + raise ExtractorError("Couldn't extract text from file") + if output_file_path: + LLMWhispererHelper.write_output_to_file( + output_json=output_json, + output_file_path=Path(output_file_path), + ) + return output_json.get("result_text", "") + + @staticmethod + def write_output_to_file(output_json: dict, output_file_path: Path) -> None: + """Writes the extracted text and metadata to the specified output file + and metadata file. + + Args: + output_json (dict): The dictionary containing the extracted data, + with "text" as the key for the main content. + output_file_path (Path): The file path where the extracted text + should be written. + + Raises: + ExtractorError: If there is an error while writing the output file. 
+ """ + try: + text_output = output_json.get("result_text", "") + logger.info(f"Writing output to {output_file_path}") + output_file_path.write_text(text_output, encoding="utf-8") + except Exception as e: + logger.error(f"Error while writing {output_file_path}: {e}") + raise ExtractorError(str(e)) + try: + # Define the directory of the output file and metadata paths + output_dir = output_file_path.parent + metadata_dir = output_dir / "metadata" + metadata_file_name = output_file_path.with_suffix(".json").name + metadata_file_path = metadata_dir / metadata_file_name + # Ensure the metadata directory exists + metadata_dir.mkdir(parents=True, exist_ok=True) + # Remove the "result_text" key from the metadata + metadata = { + key: value for key, value in output_json.items() if key != "result_text" + } + metadata_json = json.dumps(metadata, ensure_ascii=False, indent=4) + logger.info(f"Writing metadata to {metadata_file_path}") + metadata_file_path.write_text(metadata_json, encoding="utf-8") + except Exception as e: + logger.warn(f"Error while writing metadata to {metadata_file_path}: {e}") diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py new file mode 100644 index 0000000..cbc0a85 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py @@ -0,0 +1,91 @@ +import json +import logging +import os +from typing import Any, Optional + +import requests + +from unstract.sdk.adapters.x2text.constants import X2TextConstants +from unstract.sdk.adapters.x2text.dto import ( + TextExtractionMetadata, + TextExtractionResult, +) +from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.constants import ( + HTTPMethod, + WhispererEndpoint, +) +from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.helper import LLMWhispererHelper +from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter + +logger = logging.getLogger(__name__) + 
+ +class LLMWhispererV2(X2TextAdapter): + def __init__(self, settings: dict[str, Any]): + super().__init__("LLMWhispererV2") + self.config = settings + + @staticmethod + def get_id() -> str: + return "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93" + + @staticmethod + def get_name() -> str: + return "LLMWhisperer V2" + + @staticmethod + def get_description() -> str: + return "LLMWhisperer V2 X2Text" + + @staticmethod + def get_icon() -> str: + return "/icons/adapter-icons/LLMWhispererV2.png" + + @staticmethod + def get_json_schema() -> str: + f = open(f"{os.path.dirname(__file__)}/static/json_schema.json") + schema = f.read() + f.close() + return schema + + def test_connection(self) -> bool: + LLMWhispererHelper.make_request( + config=self.config, + request_method=HTTPMethod.GET, + request_endpoint=WhispererEndpoint.TEST_CONNECTION, + ) + return True + + def process( + self, + input_file_path: str, + output_file_path: Optional[str] = None, + **kwargs: dict[Any, Any], + ) -> TextExtractionResult: + """Used to extract text from documents. + + Args: + input_file_path (str): Path to file that needs to be extracted + output_file_path (Optional[str], optional): File path to write + extracted text into, if None doesn't write to a file. + Defaults to None. 
+ + Returns: + str: Extracted text + """ + + response: requests.Response = LLMWhispererHelper.send_whisper_request( + input_file_path, self.config + ) + response_text = response.text + reponse_dict = json.loads(response_text) + metadata = TextExtractionMetadata( + whisper_hash=reponse_dict.get(X2TextConstants.WHISPER_HASH_V2, "") + ) + + return TextExtractionResult( + extracted_text=LLMWhispererHelper.extract_text_from_response( + self.config, output_file_path, reponse_dict, response + ), + extraction_metadata=metadata, + ) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json new file mode 100644 index 0000000..c5dfbd4 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json @@ -0,0 +1,144 @@ +{ + "title": "LLM Whisperer X2Text v2", + "type": "object", + "required": [ + "adapter_name", + "unstract_key", + "url" + ], + "properties": { + "adapter_name": { + "type": "string", + "title": "Name", + "default": "llm-whisperer-v2", + "description": "Provide a unique name for this adapter instance. Example: LLM Whisperer 1" + }, + "url": { + "type": "string", + "title": "URL", + "format": "uri", + "default": "https://llmwhisperer-api.unstract.com", + "description": "Provide the URL of the LLM Whisperer service." + }, + "unstract_key": { + "type": "string", + "title": "Unstract Key", + "format": "password", + "description": "API key obtained from the Unstract developer portal (https://us-central.unstract.com/llm-whisperer)" + }, + "mode": { + "type": "string", + "title": "Mode", + "enum": [ + "native_text", + "low_cost", + "high_quality", + "form" + ], + "default": "form", + "description": "Native text : Extract text from native text PDFs. (not scanned). 
Use this mode when: You have low latency requirement, All documents are PDFs, PDFs are native text PDFs, Cost sensitive application\n Low cost : Cost effective extraction. Use this mode when: High quality scanned PDFs, High quality scanned images, No handwritten documents \n High quality : High quality extraction. Use this mode when: Medium/low quality scanned PDFs, Medium/low quality scanned images, Handwritten documents \n Form: High quality extraction + Checkbox and Radio button detection. Use this mode when: Checkbox and radio button detection, Medium/low quality scanned PDFs, Medium/low quality scanned images, Handwritten documents." + }, + "output_mode": { + "type": "string", + "title": "Output Mode", + "enum": [ + "layout_preserving", + "text" + ], + "default": "layout_preserving", + "description": "The output format. Valid options are layout_preserving and text. Layout preserving mode tries to extract the text from the document as is, maintaining the structural layout of the document. This works very well for LLM consumption. Text (text) mode extracts the text from the document without applying any processing or intelligence. This mode is useful when the layout_preserving mode is not able to extract the text properly. This can happen if the document contains too many different fonts and font sizes." + }, + "line_splitter_tolerance": { + "type": "number", + "title": "Line Splitter Tolerance", + "default": 0.4, + "description": "Factor to decide when to move text to the next line when it is above or below the baseline. The default value of 0.4 signifies 40% of the average character height" + }, + "line_splitter_strategy": { + "type": "string", + "title": "Line Splitter Strategy", + "default":"left-priority", + "description": "An advanced option for customizing the line splitting process." 
+        },
+        "horizontal_stretch_factor": {
+            "type": "number",
+            "title": "Horizontal Stretch Factor",
+            "default": 1.0,
+            "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi column text merge with each other."
+        },
+        "pages_to_extract": {
+            "type": "string",
+            "title": "Page number(s) or range to extract",
+            "default": "",
+            "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$",
+            "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages."
+        },
+        "page_seperator": {
+            "type": "string",
+            "title": "Page separator",
+            "default": "<<<",
+            "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator."
+        },
+        "mark_vertical_lines": {
+            "type": "boolean",
+            "title": "Mark vertical lines",
+            "default": false,
+            "description": "States whether to reproduce vertical lines in the document."
+        },
+        "mark_horizontal_lines": {
+            "type": "boolean",
+            "title": "Mark horizontal lines",
+            "default": false,
+            "description": "States whether to reproduce horizontal lines in the document."
+        },
+        "tag": {
+            "type": "string",
+            "title": "Tag",
+            "default": "default",
+            "description": "Auditing feature. Set a value which will be associated with the invocation of the adapter. This can be used for cross referencing in usage reports."
+        },
+        "use_webhook": {
+            "type": "string",
+            "title": "Webhook",
+            "default": "",
+            "description": "The webhook's name which should be called after the conversion is complete. The name should have been registered earlier using the webhooks management endpoint"
+        },
+        "webhook_metadata": {
+            "type": "string",
+            "title": "Webhook Metadata",
+            "default": "",
+            "description": "Any metadata which should be sent to the webhook. 
This data is sent verbatim to the callback endpoint." + } + }, + "if": { + "anyOf": [ + { + "properties": { + "mode": { + "const": "low_cost" + } + } + } + ] + }, + "then": { + "properties": { + "median_filter_size": { + "type": "integer", + "title": "Median Filter Size", + "default": 0, + "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version." + }, + "gaussian_blur_radius": { + "type": "number", + "title": "Gaussian Blur Radius", + "default": 0.0, + "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version." + } + }, + "required": [ + "median_filter_size", + "gaussian_blur_radius" + ] + } +}