Adds ElevenLabs streaming support

This commit is contained in:
Ajay Raj 2023-03-28 15:57:53 -07:00
commit 19461eaba4
5 changed files with 99 additions and 20 deletions

View file

@ -5,6 +5,7 @@ from dotenv import load_dotenv
load_dotenv() load_dotenv()
import vocode
from vocode.streaming.hosted_streaming_conversation import HostedStreamingConversation from vocode.streaming.hosted_streaming_conversation import HostedStreamingConversation
from vocode.streaming.streaming_conversation import StreamingConversation from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.helpers import create_microphone_input_and_speaker_output from vocode.helpers import create_microphone_input_and_speaker_output
@ -23,12 +24,15 @@ from vocode.streaming.models.agent import (
ChatGPTAgentConfig, ChatGPTAgentConfig,
) )
from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
ElevenLabsSynthesizerConfig,
RimeSynthesizerConfig,
)
logging.basicConfig() logging.basicConfig()
logging.root.setLevel(logging.INFO) logging.root.setLevel(logging.INFO)
if __name__ == "__main__": if __name__ == "__main__":
microphone_input, speaker_output = create_microphone_input_and_speaker_output( microphone_input, speaker_output = create_microphone_input_and_speaker_output(
streaming=True, use_default_devices=False streaming=True, use_default_devices=False
@ -44,10 +48,12 @@ if __name__ == "__main__":
agent_config=ChatGPTAgentConfig( agent_config=ChatGPTAgentConfig(
initial_message=BaseMessage(text="Hello!"), initial_message=BaseMessage(text="Hello!"),
prompt_preamble="The AI is having a pleasant conversation about life", prompt_preamble="The AI is having a pleasant conversation about life",
generate_responses=True, generate_responses=False,
cut_off_response=CutOffResponse(), cut_off_response=CutOffResponse(),
), ),
synthesizer_config=AzureSynthesizerConfig.from_output_device(speaker_output), synthesizer_config=ElevenLabsSynthesizerConfig.from_output_device(
speaker_output, api_key=vocode.getenv("ELEVEN_LABS_API_KEY")
),
) )
signal.signal(signal.SIGINT, lambda _0, _1: conversation.deactivate()) signal.signal(signal.SIGINT, lambda _0, _1: conversation.deactivate())
asyncio.run(conversation.start()) asyncio.run(conversation.start())

View file

@ -3,6 +3,8 @@ import logging
import signal import signal
from dotenv import load_dotenv from dotenv import load_dotenv
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
load_dotenv() load_dotenv()
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
@ -26,6 +28,7 @@ from vocode.streaming.models.agent import (
from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import ( from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig, AzureSynthesizerConfig,
ElevenLabsSynthesizerConfig,
GoogleSynthesizerConfig, GoogleSynthesizerConfig,
RimeSynthesizerConfig, RimeSynthesizerConfig,
) )
@ -60,8 +63,10 @@ hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus,
cut_off_response=CutOffResponse(), cut_off_response=CutOffResponse(),
) )
), ),
synthesizer=AzureSynthesizer( synthesizer=ElevenLabsSynthesizer(
AzureSynthesizerConfig.from_output_device(speaker_output) ElevenLabsSynthesizerConfig.from_output_device(
speaker_output, api_key=vocode.getenv("ELEVEN_LABS_API_KEY")
)
), ),
logger=logger, logger=logger,
) )

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "vocode" name = "vocode"
version = "0.1.59" version = "0.1.60"
description = "The all-in-one voice SDK" description = "The all-in-one voice SDK"
authors = ["Ajay Raj <ajay@vocode.dev>"] authors = ["Ajay Raj <ajay@vocode.dev>"]
license = "MIT License" license = "MIT License"

View file

@ -112,9 +112,43 @@ class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
pass pass
ELEVEN_LABS_ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS): class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
api_key: str api_key: str
voice_id: Optional[str] = None voice_id: Optional[str] = ELEVEN_LABS_ADAM_VOICE_ID
@validator("voice_id")
def default_voice_id(cls, voice_id):
    """Fall back to the default Adam voice when voice_id is None.

    The field default only applies when the argument is omitted; the
    from_output_device / from_telephone_output_device constructors pass
    voice_id=None through explicitly, so the fallback must also happen
    here. (Renamed from `set_name`, which was misleading — this validator
    defaults `voice_id`, not a name; the validator method name is not part
    of the public interface.)
    """
    return voice_id or ELEVEN_LABS_ADAM_VOICE_ID
@classmethod
def from_output_device(
    cls,
    output_device: BaseOutputDevice,
    api_key: str,
    voice_id: Optional[str] = None,
):
    """Build a synthesizer config whose audio format matches *output_device*.

    Copies the device's sampling rate and audio encoding so synthesized
    audio can be played back without resampling; voice_id may be left as
    None to use the configured default voice.
    """
    audio_params = {
        "sampling_rate": output_device.sampling_rate,
        "audio_encoding": output_device.audio_encoding,
    }
    return cls(api_key=api_key, voice_id=voice_id, **audio_params)
@classmethod
def from_telephone_output_device(
    cls,
    api_key: str,
    voice_id: Optional[str] = None,
):
    """Build a synthesizer config using the module's telephone defaults.

    Uses DEFAULT_SAMPLING_RATE / DEFAULT_AUDIO_ENCODING (defined elsewhere
    in this module) instead of probing a local output device; voice_id may
    be None to fall back to the default voice.
    """
    telephone_params = {
        "sampling_rate": DEFAULT_SAMPLING_RATE,
        "audio_encoding": DEFAULT_AUDIO_ENCODING,
    }
    return cls(api_key=api_key, voice_id=voice_id, **telephone_params)
class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME): class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):

View file

@ -1,10 +1,16 @@
import io
from typing import Any, Optional from typing import Any, Optional
import requests import requests
from vocode import getenv from pydub import AudioSegment
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.utils import convert_wav
from vocode import getenv
from vocode.streaming.synthesizer.base_synthesizer import ( from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer, BaseSynthesizer,
SynthesisResult, SynthesisResult,
encode_as_wav,
) )
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
@ -13,7 +19,6 @@ from vocode.streaming.models.message import BaseMessage
ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/" ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB" ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C"
class ElevenLabsSynthesizer(BaseSynthesizer): class ElevenLabsSynthesizer(BaseSynthesizer):
@ -29,19 +34,48 @@ class ElevenLabsSynthesizer(BaseSynthesizer):
chunk_size: int, chunk_size: int,
bot_sentiment: Optional[BotSentiment] = None, bot_sentiment: Optional[BotSentiment] = None,
) -> SynthesisResult: ) -> SynthesisResult:
url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}/stream" url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}"
headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id} headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id}
body = { body = {
"text": message.text, "text": message.text,
} }
response = requests.post(url, headers=headers, json=body) response = requests.post(url, headers=headers, json=body, timeout=5)
def chunk_generator(response): audio_segment: AudioSegment = AudioSegment.from_mp3(
for chunk in response.iter_content(chunk_size=chunk_size): io.BytesIO(response.content)
yield SynthesisResult.ChunkResult(chunk, len(chunk) != chunk_size) )
assert ( output_bytes_io = io.BytesIO()
not self.synthesizer_config.should_encode_as_wav audio_segment.export(output_bytes_io, format="wav")
), "ElevenLabs does not support WAV encoding"
# return chunk_generator(response), lambda seconds: self.get_message_cutoff_from_voice_speed(message, seconds, self.words_per_minute) if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
return SynthesisResult(chunk_generator(response), lambda seconds: message.text) output_bytes = convert_wav(
output_bytes_io,
output_sample_rate=self.synthesizer_config.sampling_rate,
output_encoding=AudioEncoding.LINEAR16,
)
elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
output_bytes = convert_wav(
output_bytes_io,
output_sample_rate=self.synthesizer_config.sampling_rate,
output_encoding=AudioEncoding.MULAW,
)
if self.synthesizer_config.should_encode_as_wav:
output_bytes = encode_as_wav(output_bytes)
def chunk_generator(output_bytes):
    """Yield fixed-size ChunkResults; the boolean flags the final chunk.

    Uses `>=` so that when len(output_bytes) is an exact multiple of
    chunk_size the last (full-sized) chunk is still marked final — the
    original `>` comparison left no chunk flagged True in that case,
    so consumers waiting on the end-of-stream marker would hang.
    `chunk_size` is captured from the enclosing create_speech scope.
    """
    total = len(output_bytes)
    for i in range(0, total, chunk_size):
        is_last = i + chunk_size >= total
        yield SynthesisResult.ChunkResult(
            output_bytes[i:] if is_last else output_bytes[i : i + chunk_size],
            is_last,
        )
return SynthesisResult(
chunk_generator(output_bytes),
lambda seconds: self.get_message_cutoff_from_total_response_length(
message, seconds, len(output_bytes)
),
)