diff --git a/examples/hosted_streaming_conversation.py b/examples/hosted_streaming_conversation.py index 4a119bc..040d26c 100644 --- a/examples/hosted_streaming_conversation.py +++ b/examples/hosted_streaming_conversation.py @@ -5,6 +5,7 @@ from dotenv import load_dotenv load_dotenv() +import vocode from vocode.streaming.hosted_streaming_conversation import HostedStreamingConversation from vocode.streaming.streaming_conversation import StreamingConversation from vocode.helpers import create_microphone_input_and_speaker_output @@ -23,12 +24,15 @@ from vocode.streaming.models.agent import ( ChatGPTAgentConfig, ) from vocode.streaming.models.message import BaseMessage -from vocode.streaming.models.synthesizer import AzureSynthesizerConfig +from vocode.streaming.models.synthesizer import ( + AzureSynthesizerConfig, + ElevenLabsSynthesizerConfig, + RimeSynthesizerConfig, +) logging.basicConfig() logging.root.setLevel(logging.INFO) - if __name__ == "__main__": microphone_input, speaker_output = create_microphone_input_and_speaker_output( streaming=True, use_default_devices=False @@ -44,10 +48,12 @@ if __name__ == "__main__": agent_config=ChatGPTAgentConfig( initial_message=BaseMessage(text="Hello!"), prompt_preamble="The AI is having a pleasant conversation about life", - generate_responses=True, + generate_responses=False, cut_off_response=CutOffResponse(), ), - synthesizer_config=AzureSynthesizerConfig.from_output_device(speaker_output), + synthesizer_config=ElevenLabsSynthesizerConfig.from_output_device( + speaker_output, api_key=vocode.getenv("ELEVEN_LABS_API_KEY") + ), ) signal.signal(signal.SIGINT, lambda _0, _1: conversation.deactivate()) asyncio.run(conversation.start()) diff --git a/examples/streaming_conversation.py b/examples/streaming_conversation.py index 7fee974..4fc10df 100644 --- a/examples/streaming_conversation.py +++ b/examples/streaming_conversation.py @@ -3,6 +3,8 @@ import logging import signal from dotenv import load_dotenv +from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer + load_dotenv() from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent @@ -26,6 +28,7 @@ from vocode.streaming.models.agent import ( from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.synthesizer import ( AzureSynthesizerConfig, + ElevenLabsSynthesizerConfig, GoogleSynthesizerConfig, RimeSynthesizerConfig, ) @@ -60,8 +63,10 @@ hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, cut_off_response=CutOffResponse(), ) ), - synthesizer=AzureSynthesizer( - AzureSynthesizerConfig.from_output_device(speaker_output) + synthesizer=ElevenLabsSynthesizer( + ElevenLabsSynthesizerConfig.from_output_device( + speaker_output, api_key=vocode.getenv("ELEVEN_LABS_API_KEY") + ) ), logger=logger, ) diff --git a/pyproject.toml b/pyproject.toml index cc6f886..0296a73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "vocode" -version = "0.1.59" +version = "0.1.60" description = "The all-in-one voice SDK" authors = ["Ajay Raj "] license = "MIT License" diff --git a/vocode/streaming/models/synthesizer.py b/vocode/streaming/models/synthesizer.py index e05f6a4..660cd7d 100644 --- a/vocode/streaming/models/synthesizer.py +++ b/vocode/streaming/models/synthesizer.py @@ -112,9 +112,43 @@ class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE): pass +ELEVEN_LABS_ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB" + + class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS): api_key: str - voice_id: Optional[str] = None + voice_id: Optional[str] = ELEVEN_LABS_ADAM_VOICE_ID + + @validator("voice_id") + def set_name(cls, voice_id): + return voice_id or ELEVEN_LABS_ADAM_VOICE_ID + + @classmethod + def from_output_device( + cls, + output_device: BaseOutputDevice, + api_key: str, + voice_id: Optional[str] = None, + ): + return cls( + sampling_rate=output_device.sampling_rate, + audio_encoding=output_device.audio_encoding, + api_key=api_key, + voice_id=voice_id, + ) + + @classmethod + def from_telephone_output_device( + cls, + api_key: str, + voice_id: Optional[str] = None, + ): + return cls( + sampling_rate=DEFAULT_SAMPLING_RATE, + audio_encoding=DEFAULT_AUDIO_ENCODING, + api_key=api_key, + voice_id=voice_id, + ) class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME): diff --git a/vocode/streaming/synthesizer/eleven_labs_synthesizer.py b/vocode/streaming/synthesizer/eleven_labs_synthesizer.py index 610232a..6fca8e7 100644 --- a/vocode/streaming/synthesizer/eleven_labs_synthesizer.py +++ b/vocode/streaming/synthesizer/eleven_labs_synthesizer.py @@ -1,10 +1,16 @@ +import io from typing import Any, Optional import requests -from vocode import getenv +from pydub import AudioSegment +from vocode.streaming.models.audio_encoding import AudioEncoding +from vocode.streaming.utils import convert_wav + +from vocode import getenv from vocode.streaming.synthesizer.base_synthesizer import ( BaseSynthesizer, SynthesisResult, + encode_as_wav, ) from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment @@ -13,7 +19,6 @@ from vocode.streaming.models.message import BaseMessage ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/" ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB" -OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C" class ElevenLabsSynthesizer(BaseSynthesizer): @@ -29,19 +34,48 @@ class ElevenLabsSynthesizer(BaseSynthesizer): chunk_size: int, bot_sentiment: Optional[BotSentiment] = None, ) -> SynthesisResult: - url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}/stream" + url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}" headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id} body = { "text": message.text, } - response = requests.post(url, headers=headers, json=body) + response = requests.post(url, headers=headers, json=body, timeout=5) - def chunk_generator(response): - for chunk in response.iter_content(chunk_size=chunk_size): - yield SynthesisResult.ChunkResult(chunk, len(chunk) != chunk_size) + audio_segment: AudioSegment = AudioSegment.from_mp3( + io.BytesIO(response.content) + ) - assert ( - not self.synthesizer_config.should_encode_as_wav - ), "ElevenLabs does not support WAV encoding" - # return chunk_generator(response), lambda seconds: self.get_message_cutoff_from_voice_speed(message, seconds, self.words_per_minute) - return SynthesisResult(chunk_generator(response), lambda seconds: message.text) + output_bytes_io = io.BytesIO() + audio_segment.export(output_bytes_io, format="wav") + + if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16: + output_bytes = convert_wav( + output_bytes_io, + output_sample_rate=self.synthesizer_config.sampling_rate, + output_encoding=AudioEncoding.LINEAR16, + ) + elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW: + output_bytes = convert_wav( + output_bytes_io, + output_sample_rate=self.synthesizer_config.sampling_rate, + output_encoding=AudioEncoding.MULAW, + ) + + if self.synthesizer_config.should_encode_as_wav: + output_bytes = encode_as_wav(output_bytes) + + def chunk_generator(output_bytes): + for i in range(0, len(output_bytes), chunk_size): + if i + chunk_size > len(output_bytes): + yield SynthesisResult.ChunkResult(output_bytes[i:], True) + else: + yield SynthesisResult.ChunkResult( + output_bytes[i : i + chunk_size], False + ) + + return SynthesisResult( + chunk_generator(output_bytes), + lambda seconds: self.get_message_cutoff_from_total_response_length( + message, seconds, len(output_bytes) + ), + )