Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature/document_translation #9

Open
wants to merge 27 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
6a9f40b
First attempt at the document translation tool
DaleMG Sep 27, 2024
75ac62d
Updated code as per review feedback.
DaleMG Oct 6, 2024
f4d85a1
Updated based on the review
DaleMG Oct 7, 2024
841ce92
Made changes based off the 2nd review
DaleMG Oct 10, 2024
4ba0b27
Fixed linting and other build issues
DaleMG Oct 10, 2024
c0455ec
Made the review changes, added an example, and unit test
DaleMG Oct 15, 2024
360edf0
fixed linting issue
DaleMG Oct 15, 2024
48bf47f
fixed some errors
DaleMG Oct 15, 2024
82d80a9
fix pydantic errors
DaleMG Oct 15, 2024
ff2e6cc
fixed lint and extended tests
DaleMG Oct 15, 2024
0d52074
fixed the correct file for linting and edited the extended testing again
DaleMG Oct 15, 2024
f872943
Update extended_testing_deps.txt
DaleMG Oct 15, 2024
0312d93
Update extended_testing_deps.txt
DaleMG Oct 15, 2024
0adc2d7
Update dependencies in test
Sheepsta300 Oct 17, 2024
1e72212
Add unstructured to dependencies
Sheepsta300 Oct 18, 2024
2b9f562
Add pi_heif
Sheepsta300 Oct 18, 2024
b1f91d7
Remove added dependencies
Sheepsta300 Oct 18, 2024
6c1d55a
made the last change per review.
DaleMG Oct 18, 2024
74fd194
mocked unstructured
DaleMG Oct 18, 2024
8bccc64
File translate notebook with examples.
DaleMG Oct 18, 2024
1af3c97
made some comment changes
DaleMG Oct 18, 2024
11dd9b5
fix Issues with notebook and tests
DaleMG Oct 18, 2024
97b5ba7
trying to fix errors, hopefully it works
DaleMG Oct 18, 2024
028a7e1
fixed header typos
DaleMG Oct 18, 2024
317c923
Add changes to class
Sheepsta300 Oct 19, 2024
59f178e
Add changes
Sheepsta300 Oct 19, 2024
c9e0714
Add changes
Sheepsta300 Oct 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions libs/community/extended_testing_deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ arxiv>=1.4,<2
assemblyai>=0.17.0,<0.18
atlassian-python-api>=3.36.0,<4
azure-ai-documentintelligence>=1.0.0b1,<2
azure-ai-translation-text>=1.0.1,<2
azure-identity>=1.15.0,<2
azure-search-documents==11.4.0
beautifulsoup4>=4,<5
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
from __future__ import annotations

import logging
import os
from typing import Any, Dict, Optional

from langchain_core.callbacks import CallbackManagerForToolRun
from langchain_core.tools import BaseTool
from langchain_core.utils import get_from_dict_or_env
from pydantic import model_validator

from langchain_community.document_loaders import (
UnstructuredExcelLoader,
UnstructuredHTMLLoader,
UnstructuredPDFLoader,
UnstructuredPowerPointLoader,
UnstructuredWordDocumentLoader,
UnstructuredXMLLoader,
)

logger = logging.getLogger(__name__)


class AzureFileTranslateTool(BaseTool):
    """Translate the text of a document with the Azure Text Translation API.

    The tool extracts text from a file (``.txt`` files are read directly;
    PDF, DOCX, PPTX, XLSX, XML and HTML files go through the matching
    ``Unstructured*Loader``) and asks Azure to translate it into
    ``target_language``.

    The key, endpoint and region are looked up in the constructor arguments
    first and then in the ``AZURE_TRANSLATE_API_KEY``,
    ``AZURE_TRANSLATE_ENDPOINT`` and ``AZURE_REGION`` environment variables.
    """

    # Azure Translator API key (env fallback: AZURE_TRANSLATE_API_KEY).
    text_translation_key: str = ""
    # Azure Translator endpoint (env fallback: AZURE_TRANSLATE_ENDPOINT).
    text_translation_endpoint: str = ""
    # Language code handed to the service as the translation target.
    target_language: str = "en"
    # Azure resource region (env fallback: AZURE_REGION).
    region: str = ""
    # When non-empty, `_run` reads this path instead of its `query` argument.
    file_path: str = ""
    # TextTranslationClient instance; populated by `validate_environment`.
    translate_client: Any = None

    name: str = "azure_file_translation"
    description: str = """
    A Wrapper around Azure AI Services that can be used to
    translate a document into a specific language.
    It reads the text from a file, processes it,
    and then outputs with the desired language.
    """

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Resolve the Azure settings and build the translation client.

        Args:
            values: Raw constructor arguments for the tool.

        Returns:
            ``values`` with the resolved key, endpoint and region stored under
            their field names and a ``TextTranslationClient`` stored under
            ``"translate_client"``.

        Raises:
            ImportError: If ``azure-ai-translation-text`` is not installed.

        Note:
            ``get_from_dict_or_env`` is expected to raise when a setting is
            found neither in ``values`` nor in the environment -- confirm
            against the langchain_core documentation.
        """
        azure_translate_key = get_from_dict_or_env(
            values, "text_translation_key", "AZURE_TRANSLATE_API_KEY"
        )
        azure_translate_endpoint = get_from_dict_or_env(
            values, "text_translation_endpoint", "AZURE_TRANSLATE_ENDPOINT"
        )
        region = get_from_dict_or_env(values, "region", "AZURE_REGION")

        # Only the imports are guarded, so an error raised while constructing
        # the client is not misreported as a missing package.
        try:
            from azure.ai.translation.text import TextTranslationClient
            from azure.core.credentials import AzureKeyCredential
        except ImportError as e:
            raise ImportError(
                "azure-ai-translation-text is not installed. "
                "Run `pip install azure-ai-translation-text` to install."
            ) from e

        # Write the resolved settings back so the model fields reflect the
        # configuration actually in use, even when it came from the
        # environment rather than the constructor.
        values["text_translation_key"] = azure_translate_key
        values["text_translation_endpoint"] = azure_translate_endpoint
        values["region"] = region

        values["translate_client"] = TextTranslationClient(
            endpoint=azure_translate_endpoint,
            credential=AzureKeyCredential(azure_translate_key),
            region=region,
        )

        return values

    def _read_text_from_file(self, file_path: str) -> str:
        """Read and return the text of ``file_path``.

        Supported extensions (case-insensitive): ``.txt``, ``.pdf``,
        ``.docx``, ``.pptx``, ``.xlsx``, ``.xml`` and ``.html``.

        Args:
            file_path: Path to the input file.

        Returns:
            For ``.txt`` files, the stripped file content. For every other
            supported type, the ``page_content`` of each loaded document
            joined with single spaces.

        Raises:
            ValueError: If the file extension is not supported.
        """
        file_extension = os.path.splitext(file_path)[1].lower()

        # Plain text needs no loader.
        if file_extension == ".txt":
            return self._read_text(file_path)

        # Map file extensions to loader classes
        loader_map = {
            ".pdf": UnstructuredPDFLoader,
            ".docx": UnstructuredWordDocumentLoader,
            ".pptx": UnstructuredPowerPointLoader,
            ".xlsx": UnstructuredExcelLoader,
            ".xml": UnstructuredXMLLoader,
            ".html": UnstructuredHTMLLoader,
        }

        loader_class = loader_map.get(file_extension)
        if loader_class is None:
            raise ValueError(f"Unsupported file type: {file_extension}")

        documents = loader_class(file_path).load()
        return " ".join(doc.page_content for doc in documents)

    def _read_text(self, file_path: str) -> str:
        """Return the UTF-8 content of a plain text file, stripped."""
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read().strip()

    def _translate_text(self, text: str, target_language: Optional[str] = None) -> str:
        """Translate ``text`` with the Azure Text Translation API.

        Args:
            text: The text to be translated.
            target_language: Language code to translate into. Defaults to
                ``self.target_language`` when ``None``.

        Returns:
            The text of the first translation in the response, or an empty
            string when the response carries no translations.

        Raises:
            RuntimeError: If the translation request or the handling of its
                response fails; the original exception is chained.
        """
        if target_language is None:
            target_language = self.target_language

        # Imported lazily because azure-ai-translation-text is an optional
        # dependency. No ImportError guard is needed here: the package was
        # already imported by `validate_environment` when the tool was built.
        from azure.ai.translation.text.models import InputTextItem

        try:
            request_body = [InputTextItem(text=text)]
            response = self.translate_client.translate(
                body=request_body, to_language=[target_language]
            )

            translations = response[0]["translations"]
            if translations:
                return translations[0]["text"]
            return ""
        except Exception as e:
            raise RuntimeError(f"An error occurred during translation: {e}") from e

    def _run(
        self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Read a file and return its translated text.

        Args:
            query: Path of the file to translate; used only when
                ``self.file_path`` is empty.
            run_manager: Optional callback manager; not used by this tool.

        Returns:
            The translated text.

        Raises:
            RuntimeError: If reading or translating fails; the original
                exception is chained.
        """
        try:
            text = self._read_text_from_file(self.file_path or query)
            return self._translate_text(text)
        except Exception as e:
            raise RuntimeError(
                f"Error while running AzureFileTranslateTool: {e}"
            ) from e
Binary file added libs/community/tests/examples/test_azure.pdf
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from pathlib import Path
from typing import Any

import pytest

from langchain_community.tools.azure_ai_services.azure_file_translation import (
AzureFileTranslateTool,
)

_THIS_DIR = Path(__file__).parents[3]

_EXAMPLES_DIR = _THIS_DIR / "examples"
AZURE_PDF = _EXAMPLES_DIR / "test_azure.pdf"


@pytest.mark.requires("azure-ai-translation-text")
def test_tool_initialization(mocker: Any) -> None:
    """The tool keeps the settings it was built with and the patched client."""
    mocker.patch("azure.core.credentials.AzureKeyCredential", autospec=True)

    # Patch the client class so no real Azure client is created.
    fake_client = mocker.Mock()
    mocker.patch(
        "azure.ai.translation.text.TextTranslationClient",
        return_value=fake_client,
    )

    settings = {
        "text_translation_key": "key",
        "text_translation_endpoint": "endpoint",
        "region": "westus2",
    }

    tool = AzureFileTranslateTool(translate_client=fake_client, **settings)

    assert tool.text_translation_key == settings["text_translation_key"]
    assert tool.text_translation_endpoint == settings["text_translation_endpoint"]
    assert tool.region == settings["region"]
    assert tool.translate_client == fake_client


@pytest.mark.requires("azure-ai-translation-text")
def test_translation_with_file(mocker: Any) -> None:
    """Running the tool on the sample PDF returns the mocked translation."""
    mocker.patch("azure.core.credentials.AzureKeyCredential", autospec=True)

    # Patch the client class so the tool talks to a mock instead of Azure.
    fake_client = mocker.Mock()
    mocker.patch(
        "azure.ai.translation.text.TextTranslationClient",
        return_value=fake_client,
    )

    tool = AzureFileTranslateTool(
        text_translation_key="key",
        text_translation_endpoint="endpoint",
        region="westus2",
        translate_client=fake_client,
    )

    # Canned service response carrying a single Spanish translation.
    expected_output = "Hola, mi nombre es Azure"
    fake_client.translate.return_value = [
        {
            "detectedLanguage": {"language": "en", "score": 1.0},
            "translations": [{"text": "Hola, mi nombre es Azure", "to": "es"}],
        }
    ]

    result = tool._run(str(AZURE_PDF))

    assert result == expected_output


@pytest.mark.requires("azure-ai-translation-text")
def test_translation_with_no_file(mocker: Any) -> None:
    """Running the tool with an empty path raises the tool's RuntimeError."""
    key = "key"
    endpoint = "endpoint"
    region = "westus2"

    mocker.patch("azure.core.credentials.AzureKeyCredential", autospec=True)

    mock_translate_client = mocker.Mock()
    mocker.patch(
        "azure.ai.translation.text.TextTranslationClient",
        return_value=mock_translate_client,
    )

    tool = AzureFileTranslateTool(
        text_translation_key=key,
        text_translation_endpoint=endpoint,
        region=region,
        translate_client=mock_translate_client,
    )

    file_input: str = ""
    expected_output = "Error while running AzureFileTranslateTool"

    # pytest.raises fails the test when no exception is raised at all, which
    # a try/except that merely records str(e) does not check explicitly.
    with pytest.raises(RuntimeError, match=expected_output):
        tool._run(file_input)
Loading