Adds ElevenLabs streaming support

This commit is contained in:
Ajay Raj 2023-03-28 15:57:53 -07:00
commit 19461eaba4
5 changed files with 99 additions and 20 deletions

View file

@ -5,6 +5,7 @@ from dotenv import load_dotenv
load_dotenv() load_dotenv()
import vocode
from vocode.streaming.hosted_streaming_conversation import HostedStreamingConversation from vocode.streaming.hosted_streaming_conversation import HostedStreamingConversation
from vocode.streaming.streaming_conversation import StreamingConversation from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.helpers import create_microphone_input_and_speaker_output from vocode.helpers import create_microphone_input_and_speaker_output
@ -23,12 +24,15 @@ from vocode.streaming.models.agent import (
ChatGPTAgentConfig, ChatGPTAgentConfig,
) )
from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
ElevenLabsSynthesizerConfig,
RimeSynthesizerConfig,
)
logging.basicConfig() logging.basicConfig()
logging.root.setLevel(logging.INFO) logging.root.setLevel(logging.INFO)
if __name__ == "__main__": if __name__ == "__main__":
microphone_input, speaker_output = create_microphone_input_and_speaker_output( microphone_input, speaker_output = create_microphone_input_and_speaker_output(
streaming=True, use_default_devices=False streaming=True, use_default_devices=False
@ -44,10 +48,12 @@ if __name__ == "__main__":
agent_config=ChatGPTAgentConfig( agent_config=ChatGPTAgentConfig(
initial_message=BaseMessage(text="Hello!"), initial_message=BaseMessage(text="Hello!"),
prompt_preamble="The AI is having a pleasant conversation about life", prompt_preamble="The AI is having a pleasant conversation about life",
generate_responses=True, generate_responses=False,
cut_off_response=CutOffResponse(), cut_off_response=CutOffResponse(),
), ),
synthesizer_config=AzureSynthesizerConfig.from_output_device(speaker_output), synthesizer_config=ElevenLabsSynthesizerConfig.from_output_device(
speaker_output, api_key=vocode.getenv("ELEVEN_LABS_API_KEY")
),
) )
signal.signal(signal.SIGINT, lambda _0, _1: conversation.deactivate()) signal.signal(signal.SIGINT, lambda _0, _1: conversation.deactivate())
asyncio.run(conversation.start()) asyncio.run(conversation.start())

View file

@ -3,6 +3,8 @@ import logging
import signal import signal
from dotenv import load_dotenv from dotenv import load_dotenv
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
load_dotenv() load_dotenv()
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
@ -26,6 +28,7 @@ from vocode.streaming.models.agent import (
from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import ( from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig, AzureSynthesizerConfig,
ElevenLabsSynthesizerConfig,
GoogleSynthesizerConfig, GoogleSynthesizerConfig,
RimeSynthesizerConfig, RimeSynthesizerConfig,
) )
@ -60,8 +63,10 @@ hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus,
cut_off_response=CutOffResponse(), cut_off_response=CutOffResponse(),
) )
), ),
synthesizer=AzureSynthesizer( synthesizer=ElevenLabsSynthesizer(
AzureSynthesizerConfig.from_output_device(speaker_output) ElevenLabsSynthesizerConfig.from_output_device(
speaker_output, api_key=vocode.getenv("ELEVEN_LABS_API_KEY")
)
), ),
logger=logger, logger=logger,
) )

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "vocode" name = "vocode"
version = "0.1.59" version = "0.1.60"
description = "The all-in-one voice SDK" description = "The all-in-one voice SDK"
authors = ["Ajay Raj <ajay@vocode.dev>"] authors = ["Ajay Raj <ajay@vocode.dev>"]
license = "MIT License" license = "MIT License"

View file

@ -112,9 +112,43 @@ class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
pass pass
ELEVEN_LABS_ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS): class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
api_key: str api_key: str
voice_id: Optional[str] = None voice_id: Optional[str] = ELEVEN_LABS_ADAM_VOICE_ID
@validator("voice_id")
def default_voice_id(cls, voice_id):
    """Fall back to the default Adam voice when voice_id is None.

    The field default only applies when the argument is omitted; the
    from_output_device / from_telephone_output_device constructors pass
    voice_id=None through explicitly, so the fallback must also happen
    here. (Renamed from `set_name`, which was misleading — this validator
    defaults `voice_id`, not a name; the validator method name is not part
    of the public interface.)
    """
    return voice_id or ELEVEN_LABS_ADAM_VOICE_ID
@classmethod
def from_output_device(
    cls,
    output_device: BaseOutputDevice,
    api_key: str,
    voice_id: Optional[str] = None,
):
    """Build a synthesizer config whose audio format matches *output_device*.

    Copies the device's sampling rate and audio encoding so synthesized
    audio can be played back without resampling; voice_id may be left as
    None to use the configured default voice.
    """
    audio_params = {
        "sampling_rate": output_device.sampling_rate,
        "audio_encoding": output_device.audio_encoding,
    }
    return cls(api_key=api_key, voice_id=voice_id, **audio_params)
@classmethod
def from_telephone_output_device(
    cls,
    api_key: str,
    voice_id: Optional[str] = None,
):
    """Build a synthesizer config using the module's telephone defaults.

    Uses DEFAULT_SAMPLING_RATE / DEFAULT_AUDIO_ENCODING (defined elsewhere
    in this module) instead of probing a local output device; voice_id may
    be None to fall back to the default voice.
    """
    telephone_params = {
        "sampling_rate": DEFAULT_SAMPLING_RATE,
        "audio_encoding": DEFAULT_AUDIO_ENCODING,
    }
    return cls(api_key=api_key, voice_id=voice_id, **telephone_params)
class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME): class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):

View file

@ -1,10 +1,16 @@
import io
from typing import Any, Optional from typing import Any, Optional
import requests import requests
from vocode import getenv from pydub import AudioSegment
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.utils import convert_wav
from vocode import getenv
from vocode.streaming.synthesizer.base_synthesizer import ( from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer, BaseSynthesizer,
SynthesisResult, SynthesisResult,
encode_as_wav,
) )
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
@ -13,7 +19,6 @@ from vocode.streaming.models.message import BaseMessage
ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/" ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB" ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C"
class ElevenLabsSynthesizer(BaseSynthesizer): class ElevenLabsSynthesizer(BaseSynthesizer):
@ -29,19 +34,48 @@ class ElevenLabsSynthesizer(BaseSynthesizer):
chunk_size: int, chunk_size: int,
bot_sentiment: Optional[BotSentiment] = None, bot_sentiment: Optional[BotSentiment] = None,
) -> SynthesisResult: ) -> SynthesisResult:
url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}/stream" url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}"
headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id} headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id}
body = { body = {
"text": message.text, "text": message.text,
} }
response = requests.post(url, headers=headers, json=body) response = requests.post(url, headers=headers, json=body, timeout=5)
def chunk_generator(response): audio_segment: AudioSegment = AudioSegment.from_mp3(
for chunk in response.iter_content(chunk_size=chunk_size): io.BytesIO(response.content)
yield SynthesisResult.ChunkResult(chunk, len(chunk) != chunk_size) )
assert ( output_bytes_io = io.BytesIO()
not self.synthesizer_config.should_encode_as_wav audio_segment.export(output_bytes_io, format="wav")
), "ElevenLabs does not support WAV encoding"
# return chunk_generator(response), lambda seconds: self.get_message_cutoff_from_voice_speed(message, seconds, self.words_per_minute) if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
return SynthesisResult(chunk_generator(response), lambda seconds: message.text) output_bytes = convert_wav(
output_bytes_io,
output_sample_rate=self.synthesizer_config.sampling_rate,
output_encoding=AudioEncoding.LINEAR16,
)
elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
output_bytes = convert_wav(
output_bytes_io,
output_sample_rate=self.synthesizer_config.sampling_rate,
output_encoding=AudioEncoding.MULAW,
)
if self.synthesizer_config.should_encode_as_wav:
output_bytes = encode_as_wav(output_bytes)
def chunk_generator(output_bytes):
    """Yield fixed-size ChunkResults; the boolean flags the final chunk.

    Uses `>=` so that when len(output_bytes) is an exact multiple of
    chunk_size the last (full-sized) chunk is still marked final — the
    original `>` comparison left no chunk flagged True in that case,
    so consumers waiting on the end-of-stream marker would hang.
    `chunk_size` is captured from the enclosing create_speech scope.
    """
    total = len(output_bytes)
    for i in range(0, total, chunk_size):
        is_last = i + chunk_size >= total
        yield SynthesisResult.ChunkResult(
            output_bytes[i:] if is_last else output_bytes[i : i + chunk_size],
            is_last,
        )
return SynthesisResult(
chunk_generator(output_bytes),
lambda seconds: self.get_message_cutoff_from_total_response_length(
message, seconds, len(output_bytes)
),
)