Adds ElevenLabs streaming support

2023-03-28 15:57:53 -07:00 · 2023-03-28 15:57:53 -07:00 · 19461eaba4
commit 19461eaba4
parent a9424ffaeb
5 changed files with 99 additions and 20 deletions
--- a/vocode/streaming/synthesizer/eleven_labs_synthesizer.py
+++ b/vocode/streaming/synthesizer/eleven_labs_synthesizer.py
@ -1,10 +1,16 @@
+import io
 from typing import Any, Optional
 import requests
-from vocode import getenv
+from pydub import AudioSegment

+from vocode.streaming.models.audio_encoding import AudioEncoding
+from vocode.streaming.utils import convert_wav
+
+from vocode import getenv
 from vocode.streaming.synthesizer.base_synthesizer import (
    BaseSynthesizer,
    SynthesisResult,
+    encode_as_wav,
 )
 from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
 from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
@ -13,7 +19,6 @@ from vocode.streaming.models.message import BaseMessage

 ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
 ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
-OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C"


 class ElevenLabsSynthesizer(BaseSynthesizer):
@ -29,19 +34,48 @@ class ElevenLabsSynthesizer(BaseSynthesizer):
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
-        url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}/stream"
+        url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}"
        headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id}
        body = {
            "text": message.text,
        }
-        response = requests.post(url, headers=headers, json=body)
+        response = requests.post(url, headers=headers, json=body, timeout=5)

-        def chunk_generator(response):
-            for chunk in response.iter_content(chunk_size=chunk_size):
-                yield SynthesisResult.ChunkResult(chunk, len(chunk) != chunk_size)
+        audio_segment: AudioSegment = AudioSegment.from_mp3(
+            io.BytesIO(response.content)
+        )

-        assert (
-            not self.synthesizer_config.should_encode_as_wav
-        ), "ElevenLabs does not support WAV encoding"
-        # return chunk_generator(response), lambda seconds: self.get_message_cutoff_from_voice_speed(message, seconds, self.words_per_minute)
-        return SynthesisResult(chunk_generator(response), lambda seconds: message.text)
+        output_bytes_io = io.BytesIO()
+        audio_segment.export(output_bytes_io, format="wav")
+
+        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
+            output_bytes = convert_wav(
+                output_bytes_io,
+                output_sample_rate=self.synthesizer_config.sampling_rate,
+                output_encoding=AudioEncoding.LINEAR16,
+            )
+        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
+            output_bytes = convert_wav(
+                output_bytes_io,
+                output_sample_rate=self.synthesizer_config.sampling_rate,
+                output_encoding=AudioEncoding.MULAW,
+            )
+
+        if self.synthesizer_config.should_encode_as_wav:
+            output_bytes = encode_as_wav(output_bytes)
+
+        def chunk_generator(output_bytes):
+            # Yield chunk_size-byte slices; the flag is True only on the last.
+            # ">=" so an exact-multiple length still flags its final chunk.
+            total = len(output_bytes)
+            for i in range(0, total, chunk_size):
+                yield SynthesisResult.ChunkResult(
+                    output_bytes[i : i + chunk_size], i + chunk_size >= total
+                )
+
+        return SynthesisResult(
+            chunk_generator(output_bytes),
+            lambda seconds: self.get_message_cutoff_from_total_response_length(
+                message, seconds, len(output_bytes)
+            ),
+        )