from enum import Enum
from typing import Optional, Union

from pydantic import BaseModel, validator

from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.telephony.constants import (
    DEFAULT_AUDIO_ENCODING,
    DEFAULT_SAMPLING_RATE,
)

from .model import TypedModel
from .audio_encoding import AudioEncoding


class SynthesizerType(str, Enum):
    """String tags identifying each synthesizer config class."""

    BASE = "synthesizer_base"
    AZURE = "synthesizer_azure"
    GOOGLE = "synthesizer_google"
    ELEVEN_LABS = "synthesizer_eleven_labs"
    RIME = "synthesizer_rime"


class TrackBotSentimentConfig(BaseModel):
    """Options for bot-sentiment tracking; holds the list of emotion labels."""

    emotions: list[str] = ["angry", "friendly", "sad", "whispering"]

    @validator("emotions")
    def emotions_must_not_be_empty(cls, v):
        """Reject an empty emotion list.

        Raises:
            ValueError: if ``v`` contains no entries.
        """
        if not v:
            raise ValueError("must have at least one emotion")
        return v


class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE):
    """Base synthesizer configuration shared by every provider config.

    ``sampling_rate`` and ``audio_encoding`` are required; the remaining
    fields default to ``False``.
    """

    sampling_rate: int
    audio_encoding: AudioEncoding
    should_encode_as_wav: bool = False
    # Either a plain on/off flag or a TrackBotSentimentConfig instance.
    track_bot_sentiment_in_voice: Union[bool, TrackBotSentimentConfig] = False

    @classmethod
    def from_output_device(cls, output_device: BaseOutputDevice):
        """Build a config using the sampling rate and encoding of ``output_device``."""
        return cls(
            sampling_rate=output_device.sampling_rate,
            audio_encoding=output_device.audio_encoding,
        )

    @classmethod
    def from_telephone_output_device(cls):
        """Build a config using the telephony default sampling rate and encoding."""
        return cls(
            sampling_rate=DEFAULT_SAMPLING_RATE,
            audio_encoding=DEFAULT_AUDIO_ENCODING,
        )


AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME = "en-US-AriaNeural"
AZURE_SYNTHESIZER_DEFAULT_PITCH = 0
AZURE_SYNTHESIZER_DEFAULT_RATE = 15


class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
    """Azure synthesizer configuration.

    ``voice_name``, ``pitch`` and ``rate`` may be passed as ``None``; the
    validators below then substitute the module-level Azure defaults.
    """

    voice_name: Optional[str] = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
    pitch: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_PITCH
    rate: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_RATE

    class Config:
        # Re-run validation (and therefore the default substitution below)
        # when a field is assigned after construction.
        validate_assignment = True

    @validator("voice_name")
    def set_name(cls, voice_name):
        """Fall back to the default voice when no (or an empty) name is given."""
        return voice_name or AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME

    @validator("pitch")
    def set_pitch(cls, pitch):
        """Fall back to the default pitch only when ``pitch`` is ``None``."""
        if pitch is None:
            return AZURE_SYNTHESIZER_DEFAULT_PITCH
        return pitch

    @validator("rate")
    def set_rate(cls, rate):
        """Fall back to the default rate only when ``rate`` is ``None``.

        An explicit ``0`` is kept as given; a truthiness test here would
        silently replace it with the non-zero default.
        """
        if rate is None:
            return AZURE_SYNTHESIZER_DEFAULT_RATE
        return rate

    @classmethod
    def from_output_device(
        cls,
        output_device: BaseOutputDevice,
        voice_name: Optional[str] = None,
        pitch: Optional[int] = None,
        rate: Optional[int] = None,
    ):
        """Build a config from ``output_device`` plus optional voice settings.

        Any of ``voice_name``, ``pitch`` or ``rate`` left as ``None`` is
        replaced by the corresponding Azure default during validation.
        """
        return cls(
            sampling_rate=output_device.sampling_rate,
            audio_encoding=output_device.audio_encoding,
            voice_name=voice_name,
            pitch=pitch,
            rate=rate,
        )

    @classmethod
    def from_telephone_output_device(
        cls,
        voice_name: Optional[str] = None,
        pitch: Optional[int] = None,
        rate: Optional[int] = None,
    ):
        """Build a config from the telephony defaults plus optional voice settings.

        Any of ``voice_name``, ``pitch`` or ``rate`` left as ``None`` is
        replaced by the corresponding Azure default during validation.
        """
        return cls(
            sampling_rate=DEFAULT_SAMPLING_RATE,
            audio_encoding=DEFAULT_AUDIO_ENCODING,
            voice_name=voice_name,
            pitch=pitch,
            rate=rate,
        )


class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
    """Google synthesizer configuration; adds no fields to the base config."""

    pass


class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
    """Eleven Labs synthesizer configuration: required API key, optional voice id."""

    api_key: str
    voice_id: Optional[str] = None


class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
    """Rime synthesizer configuration; requires a ``speaker``."""

    speaker: str

    @classmethod
    def from_output_device(
        cls,
        output_device: BaseOutputDevice,
        speaker: str,
    ):
        """Build a config from ``output_device`` for the given ``speaker``."""
        return cls(
            sampling_rate=output_device.sampling_rate,
            audio_encoding=output_device.audio_encoding,
            speaker=speaker,
        )

    @classmethod
    def from_telephone_output_device(
        cls,
        speaker: str,
    ):
        """Build a config from the telephony defaults for the given ``speaker``."""
        return cls(
            sampling_rate=DEFAULT_SAMPLING_RATE,
            audio_encoding=DEFAULT_AUDIO_ENCODING,
            speaker=speaker,
        )