Adds ElevenLabs streaming support
This commit is contained in:
parent
a9424ffaeb
commit
19461eaba4
5 changed files with 99 additions and 20 deletions
|
|
@ -5,6 +5,7 @@ from dotenv import load_dotenv
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
import vocode
|
||||||
from vocode.streaming.hosted_streaming_conversation import HostedStreamingConversation
|
from vocode.streaming.hosted_streaming_conversation import HostedStreamingConversation
|
||||||
from vocode.streaming.streaming_conversation import StreamingConversation
|
from vocode.streaming.streaming_conversation import StreamingConversation
|
||||||
from vocode.helpers import create_microphone_input_and_speaker_output
|
from vocode.helpers import create_microphone_input_and_speaker_output
|
||||||
|
|
@ -23,12 +24,15 @@ from vocode.streaming.models.agent import (
|
||||||
ChatGPTAgentConfig,
|
ChatGPTAgentConfig,
|
||||||
)
|
)
|
||||||
from vocode.streaming.models.message import BaseMessage
|
from vocode.streaming.models.message import BaseMessage
|
||||||
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
|
from vocode.streaming.models.synthesizer import (
|
||||||
|
AzureSynthesizerConfig,
|
||||||
|
ElevenLabsSynthesizerConfig,
|
||||||
|
RimeSynthesizerConfig,
|
||||||
|
)
|
||||||
|
|
||||||
logging.basicConfig()
|
logging.basicConfig()
|
||||||
logging.root.setLevel(logging.INFO)
|
logging.root.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
microphone_input, speaker_output = create_microphone_input_and_speaker_output(
|
microphone_input, speaker_output = create_microphone_input_and_speaker_output(
|
||||||
streaming=True, use_default_devices=False
|
streaming=True, use_default_devices=False
|
||||||
|
|
@ -44,10 +48,12 @@ if __name__ == "__main__":
|
||||||
agent_config=ChatGPTAgentConfig(
|
agent_config=ChatGPTAgentConfig(
|
||||||
initial_message=BaseMessage(text="Hello!"),
|
initial_message=BaseMessage(text="Hello!"),
|
||||||
prompt_preamble="The AI is having a pleasant conversation about life",
|
prompt_preamble="The AI is having a pleasant conversation about life",
|
||||||
generate_responses=True,
|
generate_responses=False,
|
||||||
cut_off_response=CutOffResponse(),
|
cut_off_response=CutOffResponse(),
|
||||||
),
|
),
|
||||||
synthesizer_config=AzureSynthesizerConfig.from_output_device(speaker_output),
|
synthesizer_config=ElevenLabsSynthesizerConfig.from_output_device(
|
||||||
|
speaker_output, api_key=vocode.getenv("ELEVEN_LABS_API_KEY")
|
||||||
|
),
|
||||||
)
|
)
|
||||||
signal.signal(signal.SIGINT, lambda _0, _1: conversation.deactivate())
|
signal.signal(signal.SIGINT, lambda _0, _1: conversation.deactivate())
|
||||||
asyncio.run(conversation.start())
|
asyncio.run(conversation.start())
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@ import logging
|
||||||
import signal
|
import signal
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
|
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
|
||||||
|
|
@ -26,6 +28,7 @@ from vocode.streaming.models.agent import (
|
||||||
from vocode.streaming.models.message import BaseMessage
|
from vocode.streaming.models.message import BaseMessage
|
||||||
from vocode.streaming.models.synthesizer import (
|
from vocode.streaming.models.synthesizer import (
|
||||||
AzureSynthesizerConfig,
|
AzureSynthesizerConfig,
|
||||||
|
ElevenLabsSynthesizerConfig,
|
||||||
GoogleSynthesizerConfig,
|
GoogleSynthesizerConfig,
|
||||||
RimeSynthesizerConfig,
|
RimeSynthesizerConfig,
|
||||||
)
|
)
|
||||||
|
|
@ -60,8 +63,10 @@ hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus,
|
||||||
cut_off_response=CutOffResponse(),
|
cut_off_response=CutOffResponse(),
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
synthesizer=AzureSynthesizer(
|
synthesizer=ElevenLabsSynthesizer(
|
||||||
AzureSynthesizerConfig.from_output_device(speaker_output)
|
ElevenLabsSynthesizerConfig.from_output_device(
|
||||||
|
speaker_output, api_key=vocode.getenv("ELEVEN_LABS_API_KEY")
|
||||||
|
)
|
||||||
),
|
),
|
||||||
logger=logger,
|
logger=logger,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "vocode"
|
name = "vocode"
|
||||||
version = "0.1.59"
|
version = "0.1.60"
|
||||||
description = "The all-in-one voice SDK"
|
description = "The all-in-one voice SDK"
|
||||||
authors = ["Ajay Raj <ajay@vocode.dev>"]
|
authors = ["Ajay Raj <ajay@vocode.dev>"]
|
||||||
license = "MIT License"
|
license = "MIT License"
|
||||||
|
|
|
||||||
|
|
@ -112,9 +112,43 @@ class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
ELEVEN_LABS_ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
|
||||||
|
|
||||||
|
|
||||||
class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
|
class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
|
||||||
api_key: str
|
api_key: str
|
||||||
voice_id: Optional[str] = None
|
voice_id: Optional[str] = ELEVEN_LABS_ADAM_VOICE_ID
|
||||||
|
|
||||||
|
@validator("voice_id")
|
||||||
|
def set_name(cls, voice_id):
|
||||||
|
return voice_id or ELEVEN_LABS_ADAM_VOICE_ID
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_output_device(
|
||||||
|
cls,
|
||||||
|
output_device: BaseOutputDevice,
|
||||||
|
api_key: str,
|
||||||
|
voice_id: Optional[str] = None,
|
||||||
|
):
|
||||||
|
return cls(
|
||||||
|
sampling_rate=output_device.sampling_rate,
|
||||||
|
audio_encoding=output_device.audio_encoding,
|
||||||
|
api_key=api_key,
|
||||||
|
voice_id=voice_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_telephone_output_device(
|
||||||
|
cls,
|
||||||
|
api_key: str,
|
||||||
|
voice_id: Optional[str] = None,
|
||||||
|
):
|
||||||
|
return cls(
|
||||||
|
sampling_rate=DEFAULT_SAMPLING_RATE,
|
||||||
|
audio_encoding=DEFAULT_AUDIO_ENCODING,
|
||||||
|
api_key=api_key,
|
||||||
|
voice_id=voice_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
|
class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,16 @@
|
||||||
|
import io
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
import requests
|
import requests
|
||||||
from vocode import getenv
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
from vocode.streaming.utils import convert_wav
|
||||||
|
|
||||||
|
from vocode import getenv
|
||||||
from vocode.streaming.synthesizer.base_synthesizer import (
|
from vocode.streaming.synthesizer.base_synthesizer import (
|
||||||
BaseSynthesizer,
|
BaseSynthesizer,
|
||||||
SynthesisResult,
|
SynthesisResult,
|
||||||
|
encode_as_wav,
|
||||||
)
|
)
|
||||||
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
|
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
|
||||||
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
|
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
|
||||||
|
|
@ -13,7 +19,6 @@ from vocode.streaming.models.message import BaseMessage
|
||||||
|
|
||||||
ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
|
ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
|
||||||
ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
|
ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
|
||||||
OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C"
|
|
||||||
|
|
||||||
|
|
||||||
class ElevenLabsSynthesizer(BaseSynthesizer):
|
class ElevenLabsSynthesizer(BaseSynthesizer):
|
||||||
|
|
@ -29,19 +34,48 @@ class ElevenLabsSynthesizer(BaseSynthesizer):
|
||||||
chunk_size: int,
|
chunk_size: int,
|
||||||
bot_sentiment: Optional[BotSentiment] = None,
|
bot_sentiment: Optional[BotSentiment] = None,
|
||||||
) -> SynthesisResult:
|
) -> SynthesisResult:
|
||||||
url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}/stream"
|
url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}"
|
||||||
headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id}
|
headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id}
|
||||||
body = {
|
body = {
|
||||||
"text": message.text,
|
"text": message.text,
|
||||||
}
|
}
|
||||||
response = requests.post(url, headers=headers, json=body)
|
response = requests.post(url, headers=headers, json=body, timeout=5)
|
||||||
|
|
||||||
def chunk_generator(response):
|
audio_segment: AudioSegment = AudioSegment.from_mp3(
|
||||||
for chunk in response.iter_content(chunk_size=chunk_size):
|
io.BytesIO(response.content)
|
||||||
yield SynthesisResult.ChunkResult(chunk, len(chunk) != chunk_size)
|
)
|
||||||
|
|
||||||
assert (
|
output_bytes_io = io.BytesIO()
|
||||||
not self.synthesizer_config.should_encode_as_wav
|
audio_segment.export(output_bytes_io, format="wav")
|
||||||
), "ElevenLabs does not support WAV encoding"
|
|
||||||
# return chunk_generator(response), lambda seconds: self.get_message_cutoff_from_voice_speed(message, seconds, self.words_per_minute)
|
if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
|
||||||
return SynthesisResult(chunk_generator(response), lambda seconds: message.text)
|
output_bytes = convert_wav(
|
||||||
|
output_bytes_io,
|
||||||
|
output_sample_rate=self.synthesizer_config.sampling_rate,
|
||||||
|
output_encoding=AudioEncoding.LINEAR16,
|
||||||
|
)
|
||||||
|
elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
|
||||||
|
output_bytes = convert_wav(
|
||||||
|
output_bytes_io,
|
||||||
|
output_sample_rate=self.synthesizer_config.sampling_rate,
|
||||||
|
output_encoding=AudioEncoding.MULAW,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.synthesizer_config.should_encode_as_wav:
|
||||||
|
output_bytes = encode_as_wav(output_bytes)
|
||||||
|
|
||||||
|
def chunk_generator(output_bytes):
|
||||||
|
for i in range(0, len(output_bytes), chunk_size):
|
||||||
|
if i + chunk_size > len(output_bytes):
|
||||||
|
yield SynthesisResult.ChunkResult(output_bytes[i:], True)
|
||||||
|
else:
|
||||||
|
yield SynthesisResult.ChunkResult(
|
||||||
|
output_bytes[i : i + chunk_size], False
|
||||||
|
)
|
||||||
|
|
||||||
|
return SynthesisResult(
|
||||||
|
chunk_generator(output_bytes),
|
||||||
|
lambda seconds: self.get_message_cutoff_from_total_response_length(
|
||||||
|
message, seconds, len(output_bytes)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue