178 lines
4.8 KiB
Python
178 lines
4.8 KiB
Python
from enum import Enum
|
|
from typing import Optional, Union
|
|
|
|
from pydantic import BaseModel, validator
|
|
|
|
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
|
|
from vocode.streaming.telephony.constants import (
|
|
DEFAULT_AUDIO_ENCODING,
|
|
DEFAULT_SAMPLING_RATE,
|
|
)
|
|
from .model import TypedModel
|
|
from .audio_encoding import AudioEncoding
|
|
|
|
|
|
class SynthesizerType(str, Enum):
    """String identifiers for the synthesizer config classes in this module.

    Members also subclass ``str``, so each one compares equal to its raw
    string value.
    """

    # Passed as ``type=`` by SynthesizerConfig, the common parent config.
    BASE = "synthesizer_base"

    # One identifier per concrete synthesizer config.
    AZURE = "synthesizer_azure"
    GOOGLE = "synthesizer_google"
    ELEVEN_LABS = "synthesizer_eleven_labs"
    RIME = "synthesizer_rime"
|
|
|
|
|
|
class TrackBotSentimentConfig(BaseModel):
    """Options for bot-sentiment tracking, expressed as a list of emotion labels."""

    # Emotion labels to consider; must contain at least one entry (see validator).
    emotions: list[str] = ["angry", "friendly", "sad", "whispering"]

    @validator("emotions")
    def emotions_must_not_be_empty(cls, v):
        """Reject an empty ``emotions`` list.

        Returns:
            The list unchanged when it has at least one entry.

        Raises:
            ValueError: If the list is empty.
        """
        if v:
            return v
        raise ValueError("must have at least one emotion")
|
|
|
|
|
|
class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE):
    """Base configuration shared by all synthesizer configs.

    Declared with ``type=SynthesizerType.BASE``; each subclass supplies its own
    ``SynthesizerType`` member.
    """

    # Audio format of the synthesized output.
    sampling_rate: int
    audio_encoding: AudioEncoding
    # Off by default. NOTE(review): presumably asks the synthesizer to wrap
    # its output as WAV - confirm against the synthesizer implementations.
    should_encode_as_wav: bool = False
    # Either a plain on/off flag or a TrackBotSentimentConfig carrying the
    # emotion list; disabled by default.
    track_bot_sentiment_in_voice: Union[bool, TrackBotSentimentConfig] = False

    @classmethod
    def from_output_device(cls, output_device: BaseOutputDevice):
        """Create a config that copies its audio format from ``output_device``.

        Args:
            output_device: Device whose ``sampling_rate`` and
                ``audio_encoding`` are reused.

        Returns:
            A new instance of ``cls``.
        """
        audio_format = {
            "sampling_rate": output_device.sampling_rate,
            "audio_encoding": output_device.audio_encoding,
        }
        return cls(**audio_format)

    @classmethod
    def from_telephone_output_device(cls):
        """Create a config using the telephony default sampling rate and encoding.

        Returns:
            A new instance of ``cls``.
        """
        telephony_format = {
            "sampling_rate": DEFAULT_SAMPLING_RATE,
            "audio_encoding": DEFAULT_AUDIO_ENCODING,
        }
        return cls(**telephony_format)
|
|
|
|
|
|
# Fallback values used by AzureSynthesizerConfig when a field is not provided.
AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME = "en-US-AriaNeural"
# NOTE(review): pitch and rate look like relative prosody adjustments; the
# units are not visible in this file - confirm against the Azure synthesizer.
AZURE_SYNTHESIZER_DEFAULT_PITCH = 0
AZURE_SYNTHESIZER_DEFAULT_RATE = 15
|
|
|
|
|
|
class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
    """Synthesizer configuration for Azure voices.

    ``voice_name``, ``pitch`` and ``rate`` may be omitted or passed as ``None``;
    the validators below then substitute the module-level Azure defaults.
    """

    voice_name: Optional[str] = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
    pitch: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_PITCH
    rate: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_RATE

    class Config:
        # Re-run the validators when a field is assigned after construction,
        # so assigning ``None`` later also falls back to the defaults.
        validate_assignment = True

    @validator("voice_name")
    def set_name(cls, voice_name):
        """Fall back to the default voice when the name is ``None`` or empty."""
        return voice_name or AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME

    @validator("pitch")
    def set_pitch(cls, pitch):
        """Fall back to the default pitch only when ``pitch`` is ``None``.

        An explicit ``0`` is preserved: a truthiness test here would replace it
        with the default if that default were ever non-zero.
        """
        if pitch is None:
            return AZURE_SYNTHESIZER_DEFAULT_PITCH
        return pitch

    @validator("rate")
    def set_rate(cls, rate):
        """Fall back to the default rate only when ``rate`` is ``None``.

        An explicit ``0`` is a legitimate caller choice and must not be
        silently replaced by the non-zero default.
        """
        if rate is None:
            return AZURE_SYNTHESIZER_DEFAULT_RATE
        return rate

    @classmethod
    def from_output_device(
        cls,
        output_device: BaseOutputDevice,
        voice_name: Optional[str] = None,
        pitch: Optional[int] = None,
        rate: Optional[int] = None,
    ):
        """Create a config that copies its audio format from ``output_device``.

        Args:
            output_device: Device whose ``sampling_rate`` and
                ``audio_encoding`` are reused.
            voice_name: Azure voice name; ``None`` selects the default.
            pitch: Pitch setting; ``None`` selects the default.
            rate: Rate setting; ``None`` selects the default.

        Returns:
            A new instance of ``cls``.
        """
        return cls(
            sampling_rate=output_device.sampling_rate,
            audio_encoding=output_device.audio_encoding,
            voice_name=voice_name,
            pitch=pitch,
            rate=rate,
        )

    @classmethod
    def from_telephone_output_device(
        cls,
        voice_name: Optional[str] = None,
        pitch: Optional[int] = None,
        rate: Optional[int] = None,
    ):
        """Create a config using the telephony default sampling rate and encoding.

        Args:
            voice_name: Azure voice name; ``None`` selects the default.
            pitch: Pitch setting; ``None`` selects the default.
            rate: Rate setting; ``None`` selects the default.

        Returns:
            A new instance of ``cls``.
        """
        return cls(
            sampling_rate=DEFAULT_SAMPLING_RATE,
            audio_encoding=DEFAULT_AUDIO_ENCODING,
            voice_name=voice_name,
            pitch=pitch,
            rate=rate,
        )
|
|
|
|
|
|
class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
    """Synthesizer configuration tagged ``SynthesizerType.GOOGLE``.

    Adds no fields or factories beyond those inherited from SynthesizerConfig.
    """
|
|
|
|
|
|
# Voice ID that ElevenLabsSynthesizerConfig falls back to when no voice_id is given.
ELEVEN_LABS_ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
|
|
|
|
|
|
class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
    """Synthesizer configuration tagged ``SynthesizerType.ELEVEN_LABS``.

    ``api_key`` is required. ``voice_id`` is optional and resolves to
    ``ELEVEN_LABS_ADAM_VOICE_ID`` when it is ``None`` or empty.
    """

    # Required credential. NOTE(review): presumably the ElevenLabs API key -
    # confirm where it is consumed.
    api_key: str
    voice_id: Optional[str] = ELEVEN_LABS_ADAM_VOICE_ID

    @validator("voice_id")
    def set_name(cls, voice_id):
        """Substitute the default voice ID for a ``None`` or empty ``voice_id``."""
        if not voice_id:
            return ELEVEN_LABS_ADAM_VOICE_ID
        return voice_id

    @classmethod
    def from_output_device(
        cls,
        output_device: BaseOutputDevice,
        api_key: str,
        voice_id: Optional[str] = None,
    ):
        """Create a config that copies its audio format from ``output_device``.

        Args:
            output_device: Device whose ``sampling_rate`` and
                ``audio_encoding`` are reused.
            api_key: Value stored in the ``api_key`` field.
            voice_id: Voice to use; ``None`` selects the default voice.

        Returns:
            A new instance of ``cls``.
        """
        fields = {
            "sampling_rate": output_device.sampling_rate,
            "audio_encoding": output_device.audio_encoding,
            "api_key": api_key,
            "voice_id": voice_id,
        }
        return cls(**fields)

    @classmethod
    def from_telephone_output_device(
        cls,
        api_key: str,
        voice_id: Optional[str] = None,
    ):
        """Create a config using the telephony default sampling rate and encoding.

        Args:
            api_key: Value stored in the ``api_key`` field.
            voice_id: Voice to use; ``None`` selects the default voice.

        Returns:
            A new instance of ``cls``.
        """
        fields = {
            "sampling_rate": DEFAULT_SAMPLING_RATE,
            "audio_encoding": DEFAULT_AUDIO_ENCODING,
            "api_key": api_key,
            "voice_id": voice_id,
        }
        return cls(**fields)
|
|
|
|
|
|
class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
    """Synthesizer configuration tagged ``SynthesizerType.RIME``.

    Adds one required field, ``speaker``, to the base config.
    """

    # Required; no default. NOTE(review): presumably the Rime voice/speaker
    # name - confirm against the Rime synthesizer.
    speaker: str

    @classmethod
    def from_output_device(
        cls,
        output_device: BaseOutputDevice,
        speaker: str,
    ):
        """Create a config that copies its audio format from ``output_device``.

        Args:
            output_device: Device whose ``sampling_rate`` and
                ``audio_encoding`` are reused.
            speaker: Value stored in the ``speaker`` field.

        Returns:
            A new instance of ``cls``.
        """
        fields = {
            "sampling_rate": output_device.sampling_rate,
            "audio_encoding": output_device.audio_encoding,
            "speaker": speaker,
        }
        return cls(**fields)

    @classmethod
    def from_telephone_output_device(
        cls,
        speaker: str,
    ):
        """Create a config using the telephony default sampling rate and encoding.

        Args:
            speaker: Value stored in the ``speaker`` field.

        Returns:
            A new instance of ``cls``.
        """
        fields = {
            "sampling_rate": DEFAULT_SAMPLING_RATE,
            "audio_encoding": DEFAULT_AUDIO_ENCODING,
            "speaker": speaker,
        }
        return cls(**fields)
|