81 lines
2.9 KiB
Python
81 lines
2.9 KiB
Python
import io
|
|
from typing import Any, Optional
|
|
import requests
|
|
from pydub import AudioSegment
|
|
|
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
|
from vocode.streaming.utils import convert_wav
|
|
|
|
from vocode import getenv
|
|
from vocode.streaming.synthesizer.base_synthesizer import (
|
|
BaseSynthesizer,
|
|
SynthesisResult,
|
|
encode_as_wav,
|
|
)
|
|
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
|
|
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
|
|
from vocode.streaming.models.message import BaseMessage
|
|
|
|
|
|
# Base URL that ElevenLabs endpoint paths (e.g. "text-to-speech/<voice_id>") are appended to.
ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
|
|
# Voice ID used when the synthesizer config does not provide one
# (presumably the ElevenLabs "Adam" voice, going by the name - confirm).
ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
|
|
|
|
|
|
class ElevenLabsSynthesizer(BaseSynthesizer):
    """Synthesizer that produces speech via the ElevenLabs text-to-speech HTTP API."""

    def __init__(self, config: ElevenLabsSynthesizerConfig):
        """Set up credentials and the voice to synthesize with.

        Args:
            config: Synthesizer configuration. ``config.voice_id`` selects the
                voice; when it is empty, ``ADAM_VOICE_ID`` is used instead.
        """
        super().__init__(config)
        self.api_key = getenv("ELEVEN_LABS_API_KEY")
        self.voice_id = config.voice_id or ADAM_VOICE_ID
        # Not read anywhere in this class; presumably consumed by the base
        # class when estimating message cutoffs - confirm.
        self.words_per_minute = 150

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Synthesize ``message.text`` and return it as a stream of audio chunks.

        The text is posted to the ElevenLabs text-to-speech endpoint, the
        response body is decoded as MP3, re-exported as WAV, passed through
        ``convert_wav`` with the configured sampling rate and audio encoding,
        and optionally wrapped by ``encode_as_wav``.

        Args:
            message: Message whose ``text`` is synthesized.
            chunk_size: Maximum number of bytes per yielded chunk.
            bot_sentiment: Accepted for interface compatibility; not used here.

        Returns:
            A ``SynthesisResult`` holding a generator of ``ChunkResult`` objects
            (the final chunk is flagged as last) and a callback that maps a
            number of seconds to a message cutoff.

        Raises:
            ValueError: If the configured audio encoding is neither LINEAR16
                nor MULAW.
            requests.HTTPError: If the ElevenLabs API answers with an error
                status.
        """
        target_encoding = self.synthesizer_config.audio_encoding
        # Reject unsupported encodings up front; otherwise no conversion branch
        # would run and the code below would fail on an unbound variable.
        if target_encoding not in (AudioEncoding.LINEAR16, AudioEncoding.MULAW):
            raise ValueError(f"Unsupported audio encoding: {target_encoding}")

        url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}"
        headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id}
        body = {
            "text": message.text,
        }
        response = requests.post(url, headers=headers, json=body, timeout=5)
        # Surface API failures here instead of handing an error payload to the
        # MP3 decoder, which would fail with an unrelated decoding error.
        response.raise_for_status()

        audio_segment: AudioSegment = AudioSegment.from_mp3(
            io.BytesIO(response.content)
        )

        output_bytes_io = io.BytesIO()
        audio_segment.export(output_bytes_io, format="wav")

        output_bytes = convert_wav(
            output_bytes_io,
            output_sample_rate=self.synthesizer_config.sampling_rate,
            output_encoding=target_encoding,
        )

        if self.synthesizer_config.should_encode_as_wav:
            output_bytes = encode_as_wav(output_bytes)

        def chunk_generator(output_bytes):
            total = len(output_bytes)
            for start in range(0, total, chunk_size):
                end = start + chunk_size
                # ">=" so the last chunk is flagged even when the audio length
                # is an exact multiple of chunk_size.
                if end >= total:
                    yield SynthesisResult.ChunkResult(output_bytes[start:], True)
                else:
                    yield SynthesisResult.ChunkResult(output_bytes[start:end], False)

        return SynthesisResult(
            chunk_generator(output_bytes),
            lambda seconds: self.get_message_cutoff_from_total_response_length(
                message, seconds, len(output_bytes)
            ),
        )
|