open source

2023-03-28 00:15:34 -07:00 · 2023-03-28 00:15:34 -07:00 · a93bfc1ec9
commit a93bfc1ec9
parent 70b6e17c69
61 changed files with 4013 additions and 126 deletions
--- a/vocode/streaming/models/agent.py
+++ b/vocode/streaming/models/agent.py
@ -42,6 +42,7 @@ class AgentConfig(TypedModel, type=AgentType.BASE):
    initial_message: Optional[BaseMessage] = None
    generate_responses: bool = True
    allowed_idle_time_seconds: Optional[float] = None
+    allow_agent_to_be_cut_off: bool = True
    end_conversation_on_goodbye: bool = False
    send_filler_audio: Union[bool, FillerAudioConfig] = False

@ -59,6 +60,13 @@ class LLMAgentConfig(AgentConfig, type=AgentType.LLM):
    cut_off_response: Optional[CutOffResponse] = None


+class ChatGPTAlphaAgentConfig(AgentConfig, type=AgentType.CHAT_GPT_ALPHA):
+    """Agent configuration whose TypedModel type tag is AgentType.CHAT_GPT_ALPHA."""
+
+    # Required: declared without a default.
+    prompt_preamble: str
+    expected_first_prompt: Optional[str] = None
+    # Both defaults come from LLM_AGENT_DEFAULT_* constants defined outside this hunk.
+    temperature: float = LLM_AGENT_DEFAULT_TEMPERATURE
+    max_tokens: int = LLM_AGENT_DEFAULT_MAX_TOKENS
+
+
 class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT):
    prompt_preamble: str
    expected_first_prompt: Optional[str] = None
--- a/vocode/streaming/models/audio_encoding.py
+++ b/vocode/streaming/models/audio_encoding.py
@ -1,5 +1,6 @@
 from enum import Enum

+
 class AudioEncoding(str, Enum):
    LINEAR16 = "linear16"
-    MULAW = "mulaw"
+    MULAW = "mulaw"
--- a/vocode/streaming/models/model.py
+++ b/vocode/streaming/models/model.py
@ -1,17 +1,17 @@
 import pydantic

-class BaseModel(pydantic.BaseModel):

+class BaseModel(pydantic.BaseModel):
    def __init__(self, **data):
        for key, value in data.items():
            if isinstance(value, dict):
-                if 'type' in value:
+                if "type" in value:
                    data[key] = TypedModel.parse_obj(value)
        super().__init__(**data)

+
 # Adapted from https://github.com/pydantic/pydantic/discussions/3091
 class TypedModel(BaseModel):
-
    _subtypes_ = []

    def __init_subclass__(cls, type=None):
@ -22,31 +22,30 @@ class TypedModel(BaseModel):
        for t, cls in _cls._subtypes_:
            if t == type:
                return cls
-        raise ValueError(f'Unknown type {type}')
-    
+        raise ValueError(f"Unknown type {type}")
+
    @classmethod
    def get_type(_cls, cls_name):
        for t, cls in _cls._subtypes_:
            if cls.__name__ == cls_name:
                return t
-        raise ValueError(f'Unknown class {cls_name}')
-    
+        raise ValueError(f"Unknown class {cls_name}")
+
    @classmethod
    def parse_obj(cls, obj):
-        data_type = obj.get('type')
+        data_type = obj.get("type")
        if data_type is None:
-            raise ValueError(f'type is required for {cls.__name__}')
-    
+            raise ValueError(f"type is required for {cls.__name__}")
+
        sub = cls.get_cls(data_type)
        if sub is None:
-            raise ValueError(f'Unknown type {data_type}')
+            raise ValueError(f"Unknown type {data_type}")
        return sub(**obj)

    def _iter(self, **kwargs):
-        yield 'type', self.get_type(self.__class__.__name__)
+        yield "type", self.get_type(self.__class__.__name__)
        yield from super()._iter(**kwargs)

    @property
    def type(self):
        return self.get_type(self.__class__.__name__)
-
--- a/vocode/streaming/models/synthesizer.py
+++ b/vocode/streaming/models/synthesizer.py
@ -2,9 +2,14 @@ from enum import Enum
 from typing import Optional, Union

 from pydantic import BaseModel, validator
+
+from vocode.streaming.output_device.base_output_device import BaseOutputDevice
+from vocode.streaming.telephony.constants import (
+    DEFAULT_AUDIO_ENCODING,
+    DEFAULT_SAMPLING_RATE,
+)
 from .model import TypedModel
 from .audio_encoding import AudioEncoding
-from ..output_device.base_output_device import BaseOutputDevice


 class SynthesizerType(str, Enum):
@ -38,6 +43,13 @@ class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE):
            audio_encoding=output_device.audio_encoding,
        )

+    @classmethod
+    def from_telephone_output_device(cls):
+        """Build a config from the telephony default constants.
+
+        Unlike from_output_device, no device is inspected: the sampling rate
+        and audio encoding are taken from DEFAULT_SAMPLING_RATE and
+        DEFAULT_AUDIO_ENCODING (vocode.streaming.telephony.constants).
+        """
+        return cls(
+            sampling_rate=DEFAULT_SAMPLING_RATE,
+            audio_encoding=DEFAULT_AUDIO_ENCODING,
+        )
+

 AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME = "en-US-AriaNeural"
 AZURE_SYNTHESIZER_DEFAULT_PITCH = 0
@ -45,18 +57,32 @@ AZURE_SYNTHESIZER_DEFAULT_RATE = 15


 class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
-    voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
-    pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH
-    rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE
+    voice_name: Optional[str] = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
+    pitch: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_PITCH
+    rate: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_RATE
+
+    class Config:
+        validate_assignment = True
+
+    @validator("voice_name")
+    def set_name(cls, voice_name):
+        """Return voice_name, or AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME when it is falsy."""
+        # `or` treats both None and the empty string as "not provided".
+        return voice_name or AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
+
+    @validator("pitch")
+    def set_pitch(cls, pitch):
+        """Return pitch, or AZURE_SYNTHESIZER_DEFAULT_PITCH when it is None."""
+        # Explicit None check: a pitch of 0 is a real value and must never be
+        # treated as "missing", whatever the default constant is set to.
+        return AZURE_SYNTHESIZER_DEFAULT_PITCH if pitch is None else pitch
+
+    @validator("rate")
+    def set_rate(cls, rate):
+        """Return rate, or AZURE_SYNTHESIZER_DEFAULT_RATE when it is None."""
+        # Explicit None check: `rate or DEFAULT` replaced an explicit rate of 0
+        # with the default (15), so callers could not request a rate of 0.
+        return AZURE_SYNTHESIZER_DEFAULT_RATE if rate is None else rate

    @classmethod
    def from_output_device(
        cls,
        output_device: BaseOutputDevice,
-        voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME,
-        pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH,
-        rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE,
-        track_bot_sentiment_in_voice: Union[bool, TrackBotSentimentConfig] = False,
+        voice_name: Optional[str] = None,
+        pitch: Optional[int] = None,
+        rate: Optional[int] = None,
    ):
        return cls(
            sampling_rate=output_device.sampling_rate,
@ -64,16 +90,33 @@ class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
            voice_name=voice_name,
            pitch=pitch,
            rate=rate,
-            track_bot_sentiment_in_voice=track_bot_sentiment_in_voice,
        )

-    pass
+    @classmethod
+    def from_telephone_output_device(
+        cls,
+        voice_name: Optional[str] = None,
+        pitch: Optional[int] = None,
+        rate: Optional[int] = None,
+    ):
+        """Build an Azure synthesizer config from the telephony default constants.
+
+        sampling_rate and audio_encoding come from DEFAULT_SAMPLING_RATE and
+        DEFAULT_AUDIO_ENCODING; voice_name, pitch and rate are forwarded as given.
+        NOTE(review): arguments left as None are presumably replaced with the
+        AZURE_SYNTHESIZER_DEFAULT_* values by this class's validators - confirm.
+        """
+        return cls(
+            sampling_rate=DEFAULT_SAMPLING_RATE,
+            audio_encoding=DEFAULT_AUDIO_ENCODING,
+            voice_name=voice_name,
+            pitch=pitch,
+            rate=rate,
+        )


 class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
    pass


+class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
+    """Synthesizer configuration whose TypedModel type tag is SynthesizerType.ELEVEN_LABS."""
+
+    # Required: declared without a default.
+    api_key: str
+    voice_id: Optional[str] = None
+
+
 class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
    speaker: str

@ -88,3 +131,14 @@ class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
            audio_encoding=output_device.audio_encoding,
            speaker=speaker,
        )
+
+    @classmethod
+    def from_telephone_output_device(
+        cls,
+        speaker: str,
+    ):
+        """Build a Rime synthesizer config from the telephony default constants.
+
+        sampling_rate and audio_encoding come from DEFAULT_SAMPLING_RATE and
+        DEFAULT_AUDIO_ENCODING; speaker is forwarded unchanged.
+        """
+        return cls(
+            sampling_rate=DEFAULT_SAMPLING_RATE,
+            audio_encoding=DEFAULT_AUDIO_ENCODING,
+            speaker=speaker,
+        )
--- a/vocode/streaming/models/telephony.py
+++ b/vocode/streaming/models/telephony.py
@ -1,4 +1,5 @@
 from typing import Optional
+from vocode.streaming.models.audio_encoding import AudioEncoding
 from vocode.streaming.models.model import BaseModel
 from vocode.streaming.models.agent import AgentConfig
 from vocode.streaming.models.synthesizer import SynthesizerConfig
@ -19,6 +20,7 @@ class CreateInboundCall(BaseModel):
    agent_config: AgentConfig
    synthesizer_config: Optional[SynthesizerConfig] = None
    twilio_sid: str
+    conversation_id: Optional[str] = None
    twilio_config: Optional[TwilioConfig] = None


@ -48,3 +50,11 @@ class DialIntoZoomCall(BaseModel):
    synthesizer_config: Optional[SynthesizerConfig] = None
    conversation_id: Optional[str] = None
    twilio_config: Optional[TwilioConfig] = None
+
+
+class CallConfig(BaseModel):
+    """Bundles the transcriber, agent and synthesizer configs with the Twilio config and SID."""
+
+    transcriber_config: TranscriberConfig
+    agent_config: AgentConfig
+    synthesizer_config: SynthesizerConfig
+    # Explicit None default, matching the other Optional[TwilioConfig] fields
+    # in this file (CreateInboundCall, DialIntoZoomCall).
+    twilio_config: Optional[TwilioConfig] = None
+    twilio_sid: str
--- a/vocode/streaming/models/transcriber.py
+++ b/vocode/streaming/models/transcriber.py
@ -1,8 +1,11 @@
 from enum import Enum
 from typing import Optional

-from vocode.streaming.input_device.base_input_device import (
-    BaseInputDevice,
+from vocode.streaming.input_device.base_input_device import BaseInputDevice
+from vocode.streaming.telephony.constants import (
+    DEFAULT_AUDIO_ENCODING,
+    DEFAULT_CHUNK_SIZE,
+    DEFAULT_SAMPLING_RATE,
 )
 from .audio_encoding import AudioEncoding
 from .model import BaseModel, TypedModel
@ -54,11 +57,25 @@ class TranscriberConfig(TypedModel, type=TranscriberType.BASE):
            endpointing_config=endpointing_config,
        )

+    @classmethod
+    def from_telephone_input_device(
+        cls,
+        endpointing_config: Optional[EndpointingConfig] = None,
+    ):
+        """Build a transcriber config from the telephony default constants.
+
+        sampling_rate, audio_encoding and chunk_size come from
+        DEFAULT_SAMPLING_RATE, DEFAULT_AUDIO_ENCODING and DEFAULT_CHUNK_SIZE
+        (vocode.streaming.telephony.constants); endpointing_config is
+        forwarded unchanged.
+        """
+        return cls(
+            sampling_rate=DEFAULT_SAMPLING_RATE,
+            audio_encoding=DEFAULT_AUDIO_ENCODING,
+            chunk_size=DEFAULT_CHUNK_SIZE,
+            endpointing_config=endpointing_config,
+        )
+

 class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM):
    model: Optional[str] = None
+    tier: Optional[str] = None
    should_warmup_model: bool = False
    version: Optional[str] = None
+    downsampling: Optional[int] = None


 class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE):
--- a/vocode/streaming/models/websocket.py
+++ b/vocode/streaming/models/websocket.py
@ -6,33 +6,40 @@ from .transcriber import TranscriberConfig
 from .agent import AgentConfig
 from .synthesizer import SynthesizerConfig

-class WebSocketMessageType(str, Enum):
-    BASE = 'websocket_base'
-    START = 'websocket_start'
-    AUDIO = 'websocket_audio'
-    READY = 'websocket_ready'
-    STOP = 'websocket_stop'

-class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE): pass
+class WebSocketMessageType(str, Enum):
+    BASE = "websocket_base"
+    START = "websocket_start"
+    AUDIO = "websocket_audio"
+    READY = "websocket_ready"
+    STOP = "websocket_stop"
+
+
+class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE):
+    pass
+

 class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO):
    data: str

    @classmethod
    def from_bytes(cls, chunk: bytes):
-        return cls(data=base64.b64encode(chunk).decode('utf-8'))
+        return cls(data=base64.b64encode(chunk).decode("utf-8"))

    def get_bytes(self) -> bytes:
        return base64.b64decode(self.data)

+
 class StartMessage(WebSocketMessage, type=WebSocketMessageType.START):
    transcriber_config: TranscriberConfig
    agent_config: AgentConfig
    synthesizer_config: SynthesizerConfig
    conversation_id: Optional[str] = None

+
 class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY):
    pass

+
 class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP):
-    pass
+    pass