# vocode-python/vocode/streaming/synthesizer/eleven_labs_synthesizer.py

import io
from typing import Any, Optional
import requests
from pydub import AudioSegment

from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.utils import convert_wav

from vocode import getenv
from vocode.streaming.synthesizer.base_synthesizer import (
    BaseSynthesizer,
    SynthesisResult,
    encode_as_wav,
)
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.message import BaseMessage


# Root of the ElevenLabs HTTP API. Endpoint paths are appended to it by plain
# string concatenation, so the trailing slash must stay.
ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
# Voice id used when the synthesizer config does not provide one.
# NOTE(review): presumably the stock "Adam" voice, going by the name — confirm
# against the ElevenLabs voice list.
ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"


class ElevenLabsSynthesizer(BaseSynthesizer):
    """Synthesizer that turns message text into audio via the ElevenLabs HTTP API.

    The API key is looked up under ``ELEVEN_LABS_API_KEY`` through
    ``vocode.getenv``. The voice comes from the config and falls back to
    ``ADAM_VOICE_ID`` when the config does not name one.
    """

    def __init__(self, config: ElevenLabsSynthesizerConfig):
        """Store the API key and voice id used for every synthesis request.

        Args:
            config: Synthesizer configuration. ``voice_id`` is read here; the
                base class receives the whole config.
        """
        super().__init__(config)
        self.api_key = getenv("ELEVEN_LABS_API_KEY")
        self.voice_id = config.voice_id or ADAM_VOICE_ID
        # Not read anywhere in this class.
        # NOTE(review): presumably consumed by the base class when estimating
        # how much of a message was spoken — confirm in BaseSynthesizer.
        self.words_per_minute = 150

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Synthesize ``message.text`` and return it as fixed-size audio chunks.

        The whole clip is fetched from ElevenLabs as MP3 in one blocking
        request, decoded, converted to the configured encoding and sampling
        rate, and only then split into chunks.

        Args:
            message: Message whose ``text`` is sent for synthesis.
            chunk_size: Maximum number of bytes per yielded chunk.
            bot_sentiment: Accepted for interface compatibility; not used here.

        Returns:
            A ``SynthesisResult`` holding a generator of chunk results and a
            callback that maps elapsed seconds to a message cutoff.

        Raises:
            ValueError: If the configured audio encoding is neither LINEAR16
                nor MULAW.
            requests.HTTPError: If ElevenLabs answers with an error status.
        """
        audio_encoding = self.synthesizer_config.audio_encoding
        # Reject unsupported encodings before spending an API call; without
        # this check `output_bytes` would be left unassigned further down.
        if audio_encoding not in (AudioEncoding.LINEAR16, AudioEncoding.MULAW):
            raise ValueError(
                f"Unsupported audio encoding for ElevenLabs: {audio_encoding}"
            )

        url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}"
        headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id}
        body = {
            "text": message.text,
        }
        response = requests.post(url, headers=headers, json=body, timeout=5)
        # An error response carries no MP3 payload; fail here with the HTTP
        # status instead of letting the decoder choke on the error body.
        response.raise_for_status()

        audio_segment: AudioSegment = AudioSegment.from_mp3(
            io.BytesIO(response.content)
        )

        # Re-encode the decoded MP3 as WAV in memory for convert_wav.
        output_bytes_io = io.BytesIO()
        audio_segment.export(output_bytes_io, format="wav")

        output_bytes = convert_wav(
            output_bytes_io,
            output_sample_rate=self.synthesizer_config.sampling_rate,
            output_encoding=audio_encoding,
        )

        if self.synthesizer_config.should_encode_as_wav:
            output_bytes = encode_as_wav(output_bytes)

        def chunk_generator(output_bytes):
            total = len(output_bytes)
            for start in range(0, total, chunk_size):
                end = start + chunk_size
                # `>=` so the final chunk is flagged even when the audio
                # length is an exact multiple of chunk_size.
                yield SynthesisResult.ChunkResult(
                    output_bytes[start:end], end >= total
                )

        return SynthesisResult(
            chunk_generator(output_bytes),
            lambda seconds: self.get_message_cutoff_from_total_response_length(
                message, seconds, len(output_bytes)
            ),
        )