Refactor Text2Speech: Keeping speech in memory #11
base: master
Changes from 13 commits
New file: langchain.tools.audio_utils

@@ -0,0 +1,18 @@
+import tempfile
+from pathlib import Path
+
+
+def save_audio(audio: bytes) -> str:
+    """Save audio to a temporary file and return the path."""
+    with tempfile.NamedTemporaryFile(mode="bx", suffix=".wav", delete=False) as f:
+        f.write(audio)
+    return f.name
+
+
+def load_audio(audio_file_path: str) -> bytes:
+    """Load audio from a file into bytes."""
+    if Path(audio_file_path).exists():
+        with open(audio_file_path, mode="rb") as f:
+            audio = f.read()
+        return audio
+    raise FileNotFoundError(f"File {audio_file_path} not found.")
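For context, a minimal round-trip usage of these two helpers; the placeholder bytes and variable names below are illustrative only:

from langchain.tools.audio_utils import load_audio, save_audio

# Placeholder: in practice `audio` would be the bytes returned by a TTS backend.
audio = b"RIFF....WAVEfmt "  # illustrative stand-in for real WAV bytes

path = save_audio(audio)      # written to a temporary .wav file, path returned
restored = load_audio(path)   # read the same bytes back into memory
assert restored == audio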
@@ -1,14 +1,28 @@ | ||
from __future__ import annotations | ||
|
||
import logging | ||
import tempfile | ||
from typing import Any, Dict, Optional | ||
|
||
from IPython import display | ||
|
||
from langchain.callbacks.manager import CallbackManagerForToolRun | ||
from langchain.pydantic_v1 import root_validator | ||
from langchain.tools.audio_utils import load_audio, save_audio | ||
from langchain.tools.base import BaseTool | ||
from langchain.utils import get_from_dict_or_env | ||
|
||
|
||
def _import_azure_speech() -> Any: | ||
try: | ||
import azure.cognitiveservices.speech as speechsdk | ||
except ImportError: | ||
raise ImportError( | ||
"azure.cognitiveservices.speech is not installed. " | ||
"Run `pip install azure-cognitiveservices-speech` to install." | ||
) | ||
return speechsdk | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
|
@@ -33,6 +47,7 @@ class AzureCogsText2SpeechTool(BaseTool): | |
@root_validator(pre=True) | ||
def validate_environment(cls, values: Dict) -> Dict: | ||
"""Validate that api key and endpoint exists in environment.""" | ||
speechsdk = _import_azure_speech() | ||
azure_cogs_key = get_from_dict_or_env( | ||
values, "azure_cogs_key", "AZURE_COGS_KEY" | ||
) | ||
|
@@ -41,40 +56,21 @@ def validate_environment(cls, values: Dict) -> Dict: | |
values, "azure_cogs_region", "AZURE_COGS_REGION" | ||
) | ||
|
||
try: | ||
import azure.cognitiveservices.speech as speechsdk | ||
|
||
values["speech_config"] = speechsdk.SpeechConfig( | ||
subscription=azure_cogs_key, region=azure_cogs_region | ||
) | ||
except ImportError: | ||
raise ImportError( | ||
"azure-cognitiveservices-speech is not installed. " | ||
"Run `pip install azure-cognitiveservices-speech` to install." | ||
) | ||
|
||
values["speech_config"] = speechsdk.SpeechConfig( | ||
subscription=azure_cogs_key, region=azure_cogs_region | ||
) | ||
return values | ||
|
||
def _text2speech(self, text: str, speech_language: str) -> str: | ||
try: | ||
import azure.cognitiveservices.speech as speechsdk | ||
except ImportError: | ||
pass | ||
|
||
def _text2speech(self, text: str, speech_language: str) -> bytes: | ||
speechsdk = _import_azure_speech() | ||
self.speech_config.speech_synthesis_language = speech_language | ||
speech_synthesizer = speechsdk.SpeechSynthesizer( | ||
speech_config=self.speech_config, audio_config=None | ||
) | ||
result = speech_synthesizer.speak_text(text) | ||
|
||
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: | ||
stream = speechsdk.AudioDataStream(result) | ||
with tempfile.NamedTemporaryFile( | ||
mode="wb", suffix=".wav", delete=False | ||
) as f: | ||
stream.save_to_wav_file(f.name) | ||
|
||
return f.name | ||
return result.audio_data | ||
|
||
elif result.reason == speechsdk.ResultReason.Canceled: | ||
cancellation_details = result.cancellation_details | ||
|
@@ -84,10 +80,10 @@ def _text2speech(self, text: str, speech_language: str) -> str: | |
f"Speech synthesis error: {cancellation_details.error_details}" | ||
) | ||
|
||
return "Speech synthesis canceled." | ||
raise RuntimeError("Speech synthesis canceled.") | ||
|
||
else: | ||
return f"Speech synthesis failed: {result.reason}" | ||
raise RuntimeError(f"Speech synthesis failed: {result.reason}") | ||
Review comment (on the line above): Aren't you breaking agents by those changes?

     def _run(
         self,

@@ -96,7 +92,24 @@ def _run(
     ) -> str:
         """Use the tool."""
         try:
-            speech_file = self._text2speech(query, self.speech_language)
-            return speech_file
+            speech = self._text2speech(query, self.speech_language)
+            self.play(speech)
+            return "Speech has been generated"
         except Exception as e:
             raise RuntimeError(f"Error while running AzureCogsText2SpeechTool: {e}")

+    def play(self, speech: bytes) -> None:
+        """Play the speech."""
+        audio = display.Audio(speech)
+        display.display(audio)
+
+    def generate_and_save(self, query: str) -> str:
+        """Save the text as speech to a temporary file."""
+        speech = self._text2speech(query, self.speech_language)
+        path = save_audio(speech)
+        return path
+
+    def load_and_play(self, path: str) -> None:
+        """Load the text as speech from a temporary file and play it."""
+        speech = load_audio(path)
+        self.play(speech)
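A rough usage sketch of the refactored Azure tool, assuming AZURE_COGS_KEY and AZURE_COGS_REGION are set in the environment; the import path is assumed, not shown in this diff:

from langchain.tools import AzureCogsText2SpeechTool  # import path assumed

tts = AzureCogsText2SpeechTool()  # credentials read from AZURE_COGS_KEY / AZURE_COGS_REGION

# In-memory path: synthesize and play inline via IPython display, no temp file involved.
tts.run("Hello from the refactored tool")

# Optional persistence through the new audio_utils helpers.
path = tts.generate_and_save("Hello again")  # temporary .wav path
tts.load_and_play(path)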
ElevenLabsText2SpeechTool

@@ -1,9 +1,9 @@
-import tempfile
 from enum import Enum
 from typing import Any, Dict, Optional, Union

 from langchain.callbacks.manager import CallbackManagerForToolRun
 from langchain.pydantic_v1 import root_validator
+from langchain.tools.audio_utils import load_audio, save_audio
 from langchain.tools.base import BaseTool
 from langchain.utils import get_from_dict_or_env

@@ -56,20 +56,14 @@ def _run(
         elevenlabs = _import_elevenlabs()
         try:
             speech = elevenlabs.generate(text=query, model=self.model)
-            with tempfile.NamedTemporaryFile(
-                mode="bx", suffix=".wav", delete=False
-            ) as f:
-                f.write(speech)
-            return f.name
+            self.play(speech)
+            return "Speech has been generated"
Review comment (on the lines above): I think the best usage would be to have a single tool which can have different implementations provided, similar to PythonREPLTool.
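A possible shape for that suggestion, sketched only for discussion: a single tool that takes an injected synthesis callable, loosely analogous to how PythonREPLTool wraps a REPL object. Every name below (Text2SpeechTool, the synthesize field) is hypothetical and not part of langchain.

from typing import Callable, Optional

from langchain.callbacks.manager import CallbackManagerForToolRun
from langchain.tools.audio_utils import save_audio
from langchain.tools.base import BaseTool


class Text2SpeechTool(BaseTool):
    """Hypothetical single tool; the provider-specific synthesis function is injected."""

    name: str = "text2speech"
    description: str = "Convert text to speech."
    synthesize: Callable[[str], bytes]  # e.g. a bound Azure or ElevenLabs function

    def _run(
        self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        speech = self.synthesize(query)  # the provider decides how the bytes are produced
        return save_audio(speech)        # or keep in memory / play, as in this PR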
         except Exception as e:
             raise RuntimeError(f"Error while running ElevenLabsText2SpeechTool: {e}")

-    def play(self, speech_file: str) -> None:
+    def play(self, speech: bytes) -> None:
         """Play the text as speech."""
         elevenlabs = _import_elevenlabs()
-        with open(speech_file, mode="rb") as f:
-            speech = f.read()
-
         elevenlabs.play(speech)

     def stream_speech(self, query: str) -> None:

@@ -78,3 +72,15 @@ def stream_speech(self, query: str) -> None:
         elevenlabs = _import_elevenlabs()
         speech_stream = elevenlabs.generate(text=query, model=self.model, stream=True)
         elevenlabs.stream(speech_stream)
+
+    def generate_and_save(self, query: str) -> str:
+        """Save the text as speech to a temporary file."""
+        elevenlabs = _import_elevenlabs()
+        speech = elevenlabs.generate(text=query, model=self.model)
+        path = save_audio(speech)
+        return path
+
+    def load_and_play(self, path: str) -> None:
+        """Load the text as speech from a temporary file and play it."""
+        speech = load_audio(path)
+        self.play(speech)
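And a similarly rough usage sketch for the ElevenLabs tool, assuming the elevenlabs package is installed and its API key is configured in the environment; the import path is assumed, not shown in this diff:

from langchain.tools import ElevenLabsText2SpeechTool  # import path assumed

tts = ElevenLabsText2SpeechTool()

tts.run("Hello world")                        # synthesize and play directly from memory
path = tts.generate_and_save("Hello world")   # or persist to a temporary .wav
tts.load_and_play(path)
tts.stream_speech("Hello world")              # streaming playback stays unchanged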
Review comment: It seems bizarre to me to have this function use NamedTemporaryFile; why can it not be a user-provided path?
On the other hand, I generally think that langchain likely does not need save/load audio functionality; it is not core and it is a maintenance burden. Can we drop it?
Also, it is in the wrong place; it should be moved to utilities.
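One way the helper could honor a caller-supplied path while keeping the temporary-file fallback; this is only a sketch of the reviewer's suggestion, not code from the PR:

import tempfile
from pathlib import Path
from typing import Optional


def save_audio(audio: bytes, path: Optional[str] = None) -> str:
    """Write audio bytes to `path` if given, otherwise to a temporary .wav file."""
    if path is not None:
        Path(path).write_bytes(audio)
        return path
    with tempfile.NamedTemporaryFile(mode="bx", suffix=".wav", delete=False) as f:
        f.write(audio)
    return f.name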
Review comment: For example, it only works with WAV files; anyone serious about playing sounds should use a dedicated library that can handle many different formats.
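For illustration, a format-agnostic library such as pydub (assumed acceptable as an example here; it relies on ffmpeg for non-WAV codecs) can load, convert, and play many formats:

from pydub import AudioSegment
from pydub.playback import play

segment = AudioSegment.from_file("speech.mp3")  # format detected from the file, not limited to .wav
segment.export("speech.wav", format="wav")      # convert between containers when needed
play(segment)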
Review comment: I would leave those functions in the docs to show how to do it, rather than necessarily making them langchain functions; but maybe they are used by some agents, so they may need to stay...