first pass at turn based conversation
This commit is contained in:
parent
d1118d375e
commit
518a0f2b53
40 changed files with 503 additions and 99 deletions
16
vocode/streaming/input_device/base_input_device.py
Normal file
16
vocode/streaming/input_device/base_input_device.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
import queue
from typing import Optional

from vocode.streaming.models.audio_encoding import AudioEncoding


class BaseInputDevice:
    """Abstract source of audio chunks (microphone, telephone, etc.).

    Subclasses fill ``self.queue`` with raw audio and implement
    ``get_audio`` to hand chunks to the consumer.
    """

    def __init__(
        self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int
    ):
        # Basic stream parameters shared by every input device.
        self.sampling_rate = sampling_rate
        self.audio_encoding = audio_encoding
        self.chunk_size = chunk_size
        # Buffer between the producer (device callback) and the consumer.
        self.queue = queue.Queue()

    def get_audio(self) -> Optional[bytes]:
        """Return the next audio chunk, or None if nothing is available."""
        raise NotImplementedError
|
||||
51
vocode/streaming/input_device/microphone_input.py
Normal file
51
vocode/streaming/input_device/microphone_input.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import queue
from typing import Optional

import numpy as np
import sounddevice as sd

from vocode.streaming.input_device.base_input_device import BaseInputDevice
from vocode.streaming.models.audio_encoding import AudioEncoding


class MicrophoneInput(BaseInputDevice):
    """Captures LINEAR16 audio from a sounddevice input device.

    The sounddevice callback pushes raw chunks onto ``self.queue``;
    ``get_audio`` drains them non-blockingly.
    """

    DEFAULT_SAMPLING_RATE = 44100
    DEFAULT_CHUNK_SIZE = 2048

    def __init__(
        self,
        device_info: dict,
        sampling_rate: Optional[int] = None,  # falls back to the device default
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        microphone_gain: int = 1,
    ):
        self.device_info = device_info
        sampling_rate = sampling_rate or (
            self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        # BaseInputDevice.__init__ creates self.queue; do NOT recreate it later,
        # and set all state the callback reads BEFORE starting the stream —
        # otherwise chunks delivered between start() and the assignments are
        # lost or hit a half-initialized object.
        super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size)
        self.microphone_gain = microphone_gain
        self.stream = sd.InputStream(
            dtype=np.int16,
            channels=1,
            samplerate=self.sampling_rate,
            blocksize=self.chunk_size,
            device=int(self.device_info["index"]),
            callback=self._stream_callback,
        )
        self.stream.start()

    def _stream_callback(self, in_data: np.ndarray, *_args):
        """sounddevice callback: apply gain and enqueue the chunk as bytes."""
        # BUG FIX: the original used `2 ^ gain`, which is XOR in Python
        # (2 ^ 2 == 0, silencing the input); exponentiation is `2 ** gain`.
        if self.microphone_gain > 1:
            in_data = in_data * (2 ** self.microphone_gain)
        else:
            in_data = in_data // (2 ** self.microphone_gain)
        audio_bytes = in_data.tobytes()
        self.queue.put_nowait(audio_bytes)

    def get_audio(self) -> Optional[bytes]:
        """Return the next queued chunk, or None when the queue is empty."""
        try:
            return self.queue.get_nowait()
        except queue.Empty:
            return None
|
||||
11
vocode/streaming/input_device/telephone_input.py
Normal file
11
vocode/streaming/input_device/telephone_input.py
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
from vocode.streaming.input_device.base_input_device import BaseInputDevice
from vocode.streaming.models.audio_encoding import AudioEncoding


class TelephoneInput(BaseInputDevice):
    """Input device preconfigured for telephony audio.

    Uses the standard telephony format: 8 kHz mu-law, delivered in
    160-sample (20 ms) chunks.
    """

    def __init__(self):
        super().__init__(
            sampling_rate=8000,
            audio_encoding=AudioEncoding.MULAW,
            chunk_size=160,
        )
|
||||
181
vocode/streaming/models/agent.py
Normal file
181
vocode/streaming/models/agent.py
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
from typing import Optional, Union
from enum import Enum

from pydantic import validator

from vocode.streaming.models.message import BaseMessage
from .model import TypedModel, BaseModel

FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS = 0.5
LLM_AGENT_DEFAULT_TEMPERATURE = 1.0
LLM_AGENT_DEFAULT_MAX_TOKENS = 256
LLM_AGENT_DEFAULT_MODEL_NAME = "text-curie-001"
CHAT_GPT_AGENT_DEFAULT_MODEL_NAME = "gpt-3.5-turbo"


class AgentType(str, Enum):
    """Discriminator values for the AgentConfig TypedModel hierarchy."""

    BASE = "agent_base"
    LLM = "agent_llm"
    CHAT_GPT_ALPHA = "agent_chat_gpt_alpha"
    CHAT_GPT = "agent_chat_gpt"
    ECHO = "agent_echo"
    INFORMATION_RETRIEVAL = "agent_information_retrieval"
    RESTFUL_USER_IMPLEMENTED = "agent_restful_user_implemented"
    WEBSOCKET_USER_IMPLEMENTED = "agent_websocket_user_implemented"


class FillerAudioConfig(BaseModel):
    """Settings for filler audio played while the agent is thinking."""

    silence_threshold_seconds: float = FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS
    use_phrases: bool = True
    use_typing_noise: bool = False

    @validator("use_typing_noise")
    def typing_noise_excludes_phrases(cls, v, values):
        """Enforce exactly one filler style: typing noise XOR phrases."""
        # NOTE(review): mutating `values` in a pydantic v1 validator may not
        # write back to the already-validated `use_phrases` field — confirm
        # against pydantic v1 semantics.
        if v and values.get("use_phrases"):
            values["use_phrases"] = False
        if not v and not values.get("use_phrases"):
            raise ValueError("must use either typing noise or phrases for filler audio")
        return v


class AgentConfig(TypedModel, type=AgentType.BASE):
    """Base configuration shared by every agent type."""

    initial_message: Optional[BaseMessage] = None
    generate_responses: bool = True
    allowed_idle_time_seconds: Optional[float] = None
    end_conversation_on_goodbye: bool = False
    send_filler_audio: Union[bool, FillerAudioConfig] = False


class CutOffResponse(BaseModel):
    """What the bot says when the human interrupts it mid-utterance."""

    messages: list[BaseMessage] = [BaseMessage(text="Sorry?")]


class LLMAgentConfig(AgentConfig, type=AgentType.LLM):
    """Configuration for a completion-style LLM agent."""

    prompt_preamble: str
    expected_first_prompt: Optional[str] = None
    model_name: str = LLM_AGENT_DEFAULT_MODEL_NAME
    temperature: float = LLM_AGENT_DEFAULT_TEMPERATURE
    max_tokens: int = LLM_AGENT_DEFAULT_MAX_TOKENS
    cut_off_response: Optional[CutOffResponse] = None


class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT):
    """Configuration for a chat-completions (ChatGPT) agent."""

    prompt_preamble: str
    expected_first_prompt: Optional[str] = None
    generate_responses: bool = False
    model_name: str = CHAT_GPT_AGENT_DEFAULT_MODEL_NAME
    temperature: float = LLM_AGENT_DEFAULT_TEMPERATURE
    max_tokens: int = LLM_AGENT_DEFAULT_MAX_TOKENS
    cut_off_response: Optional[CutOffResponse] = None


class InformationRetrievalAgentConfig(
    AgentConfig, type=AgentType.INFORMATION_RETRIEVAL
):
    """Agent that calls someone to collect a fixed set of fields."""

    recipient_descriptor: str
    caller_descriptor: str
    goal_description: str
    fields: list[str]
    # TODO: add fields for IVR, voicemail


class EchoAgentConfig(AgentConfig, type=AgentType.ECHO):
    """Trivial agent that repeats the user's input back; useful for testing."""

    pass


class RESTfulUserImplementedAgentConfig(
    AgentConfig, type=AgentType.RESTFUL_USER_IMPLEMENTED
):
    """Agent whose responses come from a user-hosted REST endpoint."""

    class EndpointConfig(BaseModel):
        url: str
        method: str = "POST"

    respond: EndpointConfig
    generate_responses: bool = False
    # generate_response: Optional[EndpointConfig]
    # update_last_bot_message_on_cut_off: Optional[EndpointConfig]


class RESTfulAgentInput(BaseModel):
    """Payload POSTed to a RESTful user-implemented agent."""

    conversation_id: str
    human_input: str


class RESTfulAgentOutputType(str, Enum):
    """Discriminator values for RESTfulAgentOutput subclasses."""

    BASE = "restful_agent_base"
    TEXT = "restful_agent_text"
    END = "restful_agent_end"


class RESTfulAgentOutput(TypedModel, type=RESTfulAgentOutputType.BASE):
    """Base response returned by a RESTful user-implemented agent."""

    pass


class RESTfulAgentText(RESTfulAgentOutput, type=RESTfulAgentOutputType.TEXT):
    """A text reply from the RESTful agent."""

    response: str


class RESTfulAgentEnd(RESTfulAgentOutput, type=RESTfulAgentOutputType.END):
    """Signals the RESTful agent wants to end the conversation."""

    pass


class WebSocketUserImplementedAgentConfig(
    AgentConfig, type=AgentType.WEBSOCKET_USER_IMPLEMENTED
):
    """Agent whose responses come from a user-hosted WebSocket route."""

    class RouteConfig(BaseModel):
        url: str

    respond: RouteConfig
    generate_responses: bool = False
    # generate_response: Optional[RouteConfig]
    # send_message_on_cut_off: bool = False


class WebSocketAgentMessageType(str, Enum):
    """Discriminator values for WebSocketAgentMessage subclasses."""

    BASE = "websocket_agent_base"
    START = "websocket_agent_start"
    TEXT = "websocket_agent_text"
    TEXT_END = "websocket_agent_text_end"
    READY = "websocket_agent_ready"
    STOP = "websocket_agent_stop"


class WebSocketAgentMessage(TypedModel, type=WebSocketAgentMessageType.BASE):
    """Base frame exchanged with a websocket user-implemented agent."""

    conversation_id: Optional[str] = None


class WebSocketAgentTextMessage(
    WebSocketAgentMessage, type=WebSocketAgentMessageType.TEXT
):
    """A text frame; the text travels in a nested payload object."""

    class Payload(BaseModel):
        text: str

    data: Payload

    @classmethod
    def from_text(cls, text: str, conversation_id: Optional[str] = None):
        """Convenience constructor wrapping plain text into the payload."""
        return cls(data=cls.Payload(text=text), conversation_id=conversation_id)


class WebSocketAgentStartMessage(
    WebSocketAgentMessage, type=WebSocketAgentMessageType.START
):
    pass


class WebSocketAgentReadyMessage(
    WebSocketAgentMessage, type=WebSocketAgentMessageType.READY
):
    pass


class WebSocketAgentStopMessage(
    WebSocketAgentMessage, type=WebSocketAgentMessageType.STOP
):
    pass


class WebSocketAgentTextEndMessage(
    WebSocketAgentMessage, type=WebSocketAgentMessageType.TEXT_END
):
    pass
|
||||
5
vocode/streaming/models/audio_encoding.py
Normal file
5
vocode/streaming/models/audio_encoding.py
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
from enum import Enum


class AudioEncoding(str, Enum):
    """Wire formats for raw audio used throughout the streaming pipeline."""

    # 16-bit signed little-endian PCM.
    LINEAR16 = "linear16"
    # 8-bit mu-law companded audio (telephony standard).
    MULAW = "mulaw"
|
||||
16
vocode/streaming/models/message.py
Normal file
16
vocode/streaming/models/message.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
from enum import Enum

from .model import TypedModel

# BUG FIX: `from enum import Enum` was imported twice; the duplicate is removed.


class MessageType(str, Enum):
    """Discriminator values for the BaseMessage TypedModel hierarchy."""

    BASE = "message_base"
    SSML = "message_ssml"


class BaseMessage(TypedModel, type=MessageType.BASE):
    """A plain-text message spoken or received by an agent."""

    text: str


class SSMLMessage(BaseMessage, type=MessageType.SSML):
    """A message carrying SSML markup alongside its plain text."""

    ssml: str
|
||||
52
vocode/streaming/models/model.py
Normal file
52
vocode/streaming/models/model.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import pydantic

# BaseModel: pydantic model that transparently revives nested TypedModel
# payloads — any dict field value carrying a 'type' key is re-parsed into
# the matching registered TypedModel subclass before normal validation.
class BaseModel(pydantic.BaseModel):

    def __init__(self, **data):
        for key, value in data.items():
            if isinstance(value, dict):
                # 'type' marks a serialized TypedModel; dispatch to its subclass.
                if 'type' in value:
                    data[key] = TypedModel.parse_obj(value)
        super().__init__(**data)

# Adapted from https://github.com/pydantic/pydantic/discussions/3091
# TypedModel: polymorphic base. Each subclass registers itself (with its
# `type` discriminator) via __init_subclass__, so serialized dicts can be
# parsed back into the right subclass, and serialization emits 'type' first.
class TypedModel(BaseModel):

    # Registry of [discriminator, subclass] pairs, shared by the whole hierarchy.
    _subtypes_ = []

    def __init_subclass__(cls, type=None):
        # `type` here is the class-keyword discriminator, not the builtin.
        cls._subtypes_.append([type, cls])

    @classmethod
    def get_cls(_cls, type):
        # Look up the subclass registered for this discriminator value.
        for t, cls in _cls._subtypes_:
            if t == type:
                return cls
        raise ValueError(f'Unknown type {type}')

    @classmethod
    def get_type(_cls, cls_name):
        # Reverse lookup: class name -> discriminator value.
        for t, cls in _cls._subtypes_:
            if cls.__name__ == cls_name:
                return t
        raise ValueError(f'Unknown class {cls_name}')

    @classmethod
    def parse_obj(cls, obj):
        # Polymorphic parse: route the dict to the subclass named by obj['type'].
        data_type = obj.get('type')
        if data_type is None:
            raise ValueError(f'type is required for {cls.__name__}')

        sub = cls.get_cls(data_type)
        # NOTE(review): get_cls raises rather than returning None, so this
        # branch looks unreachable — kept as defensive code.
        if sub is None:
            raise ValueError(f'Unknown type {data_type}')
        return sub(**obj)

    def _iter(self, **kwargs):
        # Prepend the discriminator so .dict()/.json() round-trip via parse_obj.
        yield 'type', self.get_type(self.__class__.__name__)
        yield from super()._iter(**kwargs)

    @property
    def type(self):
        # The discriminator value registered for this concrete subclass.
        return self.get_type(self.__class__.__name__)
|
||||
|
||||
73
vocode/streaming/models/synthesizer.py
Normal file
73
vocode/streaming/models/synthesizer.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
from enum import Enum
from typing import Optional, Union

from pydantic import BaseModel, validator
from .model import TypedModel
from .audio_encoding import AudioEncoding
from ..output_device.base_output_device import BaseOutputDevice


class SynthesizerType(str, Enum):
    """Discriminator values for the SynthesizerConfig TypedModel hierarchy."""

    BASE = "synthesizer_base"
    AZURE = "synthesizer_azure"
    GOOGLE = "synthesizer_google"
    # TODO(review): ELEVEN_LABS has no corresponding config class below —
    # confirm whether one is still pending.
    ELEVEN_LABS = "synthesizer_eleven_labs"


class TrackBotSentimentConfig(BaseModel):
    """Which voice emotions may be used when tracking bot sentiment."""

    emotions: list[str] = ["angry", "friendly", "sad", "whispering"]

    @validator("emotions")
    def emotions_must_not_be_empty(cls, v):
        if len(v) == 0:
            raise ValueError("must have at least one emotion")
        return v


class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE):
    """Base synthesizer settings; output format must match the output device."""

    sampling_rate: int
    audio_encoding: AudioEncoding
    should_encode_as_wav: bool = False
    track_bot_sentiment_in_voice: Union[bool, TrackBotSentimentConfig] = False

    @classmethod
    def from_output_device(cls, output_device: BaseOutputDevice):
        """Build a config whose audio format matches the given output device."""
        return cls(
            sampling_rate=output_device.sampling_rate,
            audio_encoding=output_device.audio_encoding,
        )


AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME = "en-US-AriaNeural"
AZURE_SYNTHESIZER_DEFAULT_PITCH = 0
AZURE_SYNTHESIZER_DEFAULT_RATE = 15


class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
    """Azure TTS settings (voice, pitch, speaking rate)."""

    voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
    pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH
    rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE

    @classmethod
    def from_output_device(
        cls,
        output_device: BaseOutputDevice,
        voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME,
        pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH,
        rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE,
        track_bot_sentiment_in_voice: Union[bool, TrackBotSentimentConfig] = False,
    ):
        """Build an Azure config matching the output device's audio format."""
        return cls(
            sampling_rate=output_device.sampling_rate,
            audio_encoding=output_device.audio_encoding,
            voice_name=voice_name,
            pitch=pitch,
            rate=rate,
            track_bot_sentiment_in_voice=track_bot_sentiment_in_voice,
        )

    # CLEANUP: removed a stray dead `pass` statement that followed this method.


class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
    """Google TTS settings; currently inherits all fields from the base."""

    pass
|
||||
50
vocode/streaming/models/telephony.py
Normal file
50
vocode/streaming/models/telephony.py
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
from typing import Optional

from vocode.streaming.models.model import BaseModel
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig


class TwilioConfig(BaseModel):
    """Credentials for a user-supplied Twilio account."""

    account_sid: str
    auth_token: str


class CallEntity(BaseModel):
    """One party on a call, identified by phone number."""

    phone_number: str


class CreateInboundCall(BaseModel):
    """Request body for answering an inbound Twilio call."""

    transcriber_config: Optional[TranscriberConfig] = None
    agent_config: AgentConfig
    synthesizer_config: Optional[SynthesizerConfig] = None
    twilio_sid: str
    twilio_config: Optional[TwilioConfig] = None


class EndOutboundCall(BaseModel):
    """Request body for hanging up an in-progress outbound call."""

    call_id: str
    twilio_config: Optional[TwilioConfig] = None


class CreateOutboundCall(BaseModel):
    """Request body for placing an outbound call."""

    recipient: CallEntity
    caller: CallEntity
    transcriber_config: Optional[TranscriberConfig] = None
    agent_config: AgentConfig
    synthesizer_config: Optional[SynthesizerConfig] = None
    conversation_id: Optional[str] = None
    twilio_config: Optional[TwilioConfig] = None
    # TODO add IVR/etc.


class DialIntoZoomCall(BaseModel):
    """Request body for dialing the agent into a Zoom meeting."""

    recipient: CallEntity
    caller: CallEntity
    zoom_meeting_id: str
    # NOTE(review): no default, so pydantic treats this as required-but-nullable.
    zoom_meeting_password: Optional[str]
    transcriber_config: Optional[TranscriberConfig] = None
    agent_config: AgentConfig
    synthesizer_config: Optional[SynthesizerConfig] = None
    conversation_id: Optional[str] = None
    twilio_config: Optional[TwilioConfig] = None
|
||||
70
vocode/streaming/models/transcriber.py
Normal file
70
vocode/streaming/models/transcriber.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
from enum import Enum
from typing import Optional

from vocode.streaming.input_device.base_input_device import (
    BaseInputDevice,
)
from .audio_encoding import AudioEncoding
from .model import BaseModel, TypedModel


class TranscriberType(str, Enum):
    """Discriminator values for the TranscriberConfig TypedModel hierarchy."""

    BASE = "transcriber_base"
    DEEPGRAM = "transcriber_deepgram"
    GOOGLE = "transcriber_google"
    ASSEMBLY_AI = "transcriber_assembly_ai"


class EndpointingType(str, Enum):
    """Discriminator values for endpointing (end-of-utterance) strategies."""

    BASE = "endpointing_base"
    TIME_BASED = "endpointing_time_based"
    PUNCTUATION_BASED = "endpointing_punctuation_based"


class EndpointingConfig(TypedModel, type=EndpointingType.BASE):
    """Base endpointing strategy; concrete strategies add their parameters."""

    pass


class TimeEndpointingConfig(EndpointingConfig, type=EndpointingType.TIME_BASED):
    """End the utterance after a fixed stretch of silence."""

    time_cutoff_seconds: float = 0.4


class PunctuationEndpointingConfig(
    EndpointingConfig, type=EndpointingType.PUNCTUATION_BASED
):
    """End the utterance on punctuation, with a silence-based fallback."""

    time_cutoff_seconds: float = 0.4


class TranscriberConfig(TypedModel, type=TranscriberType.BASE):
    """Base transcriber settings; input format must match the input device."""

    sampling_rate: int
    audio_encoding: AudioEncoding
    chunk_size: int
    endpointing_config: Optional[EndpointingConfig] = None

    @classmethod
    def from_input_device(
        cls,
        input_device: BaseInputDevice,
        endpointing_config: Optional[EndpointingConfig] = None,
    ):
        """Build a config whose audio format matches the given input device."""
        return cls(
            sampling_rate=input_device.sampling_rate,
            audio_encoding=input_device.audio_encoding,
            chunk_size=input_device.chunk_size,
            endpointing_config=endpointing_config,
        )


class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM):
    """Deepgram-specific transcriber settings."""

    model: Optional[str] = None
    should_warmup_model: bool = False
    version: Optional[str] = None


class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE):
    """Google-specific transcriber settings."""

    model: Optional[str] = None
    should_warmup_model: bool = False


class AssemblyAITranscriberConfig(TranscriberConfig, type=TranscriberType.ASSEMBLY_AI):
    """AssemblyAI-specific transcriber settings."""

    should_warmup_model: bool = False
|
||||
38
vocode/streaming/models/websocket.py
Normal file
38
vocode/streaming/models/websocket.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
import base64
from enum import Enum
from typing import Optional

from .model import TypedModel
from .transcriber import TranscriberConfig
from .agent import AgentConfig
from .synthesizer import SynthesizerConfig


class WebSocketMessageType(str, Enum):
    """Discriminator values for frames on the conversation websocket."""

    BASE = "websocket_base"
    START = "websocket_start"
    AUDIO = "websocket_audio"
    READY = "websocket_ready"
    STOP = "websocket_stop"


class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE):
    """Base frame exchanged over the conversation websocket."""

    pass


class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO):
    """An audio chunk, carried as base64 text so it survives JSON framing."""

    data: str

    @classmethod
    def from_bytes(cls, chunk: bytes):
        """Wrap raw audio bytes in a base64-encoded message."""
        return cls(data=base64.b64encode(chunk).decode("utf-8"))

    def get_bytes(self) -> bytes:
        """Decode the payload back into raw audio bytes."""
        return base64.b64decode(self.data)


class StartMessage(WebSocketMessage, type=WebSocketMessageType.START):
    """Opens a conversation: carries the full pipeline configuration."""

    transcriber_config: TranscriberConfig
    agent_config: AgentConfig
    synthesizer_config: SynthesizerConfig
    conversation_id: Optional[str] = None


class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY):
    """Server acknowledgement that it is ready to receive audio."""

    pass


class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP):
    """Client signal that no more audio will be sent."""

    pass
|
||||
13
vocode/streaming/output_device/base_output_device.py
Normal file
13
vocode/streaming/output_device/base_output_device.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
from vocode.streaming.models.audio_encoding import AudioEncoding


class BaseOutputDevice:
    """Abstract sink for synthesized audio (speaker, telephone, etc.)."""

    def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding):
        self.sampling_rate = sampling_rate
        self.audio_encoding = audio_encoding

    async def send_async(self, chunk):
        """Play or transmit one chunk of raw audio bytes."""
        # BUG FIX: the original raised `NotImplemented` (the binary-operator
        # sentinel, not an exception), which raises TypeError at runtime;
        # NotImplementedError is the correct abstract-method signal.
        raise NotImplementedError

    async def maybe_send_mark_async(self, message):
        """Optional hook for devices (e.g. Twilio) that support mark events."""
        pass
|
||||
34
vocode/streaming/output_device/speaker_output.py
Normal file
34
vocode/streaming/output_device/speaker_output.py
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
import sounddevice as sd
import numpy as np

from .base_output_device import BaseOutputDevice
from vocode.streaming.models.audio_encoding import AudioEncoding


class SpeakerOutput(BaseOutputDevice):
    """Plays LINEAR16 audio through a sounddevice output device."""

    DEFAULT_SAMPLING_RATE = 44100

    def __init__(
        self,
        device_info: dict,
        sampling_rate: int = None,  # falls back to the device default
        audio_encoding: AudioEncoding = AudioEncoding.LINEAR16,
    ):
        self.device_info = device_info
        if not sampling_rate:
            sampling_rate = int(
                self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
            )
        super().__init__(sampling_rate, audio_encoding)
        self.stream = sd.OutputStream(
            channels=1,
            samplerate=self.sampling_rate,
            dtype=np.int16,
            device=int(self.device_info["index"]),
        )
        self.stream.start()

    async def send_async(self, chunk):
        """Write one chunk of 16-bit PCM bytes to the speaker stream."""
        self.stream.write(np.frombuffer(chunk, dtype=np.int16))

    def terminate(self):
        """Close the underlying sounddevice stream."""
        self.stream.close()
|
||||
7
vocode/streaming/output_device/telephone_output.py
Normal file
7
vocode/streaming/output_device/telephone_output.py
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
from .base_output_device import BaseOutputDevice
from vocode.streaming.models.audio_encoding import AudioEncoding


class TelephoneOutput(BaseOutputDevice):
    """Output device preconfigured for telephony audio: 8 kHz mu-law."""

    def __init__(self):
        super().__init__(
            sampling_rate=8000,
            audio_encoding=AudioEncoding.MULAW,
        )
|
||||
106
vocode/streaming/streaming_conversation.py
Normal file
106
vocode/streaming/streaming_conversation.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
import websockets
from websockets.exceptions import ConnectionClosedOK
from websockets.client import WebSocketClientProtocol
import asyncio
from dotenv import load_dotenv
import os
import logging
import threading
import queue
import vocode
from vocode.streaming.input_device.base_input_device import (
    BaseInputDevice,
)
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.websocket import (
    ReadyMessage,
    AudioMessage,
    StartMessage,
    StopMessage,
)

load_dotenv()


class StreamingConversation:
    """Client for a hosted streaming conversation.

    Streams microphone (or other input-device) audio to the vocode
    websocket backend and plays returned audio on the output device.
    Audio playback runs on a separate thread with its own event loop so
    blocking queue reads never stall the websocket send/receive tasks.
    """

    def __init__(
        self,
        input_device: BaseInputDevice,
        output_device: BaseOutputDevice,
        transcriber_config: TranscriberConfig,
        agent_config: AgentConfig,
        synthesizer_config: SynthesizerConfig,
        id: str = None,  # NOTE(review): shadows builtin `id`; optional conversation id
    ):
        self.id = id
        self.input_device = input_device
        self.output_device = output_device
        self.transcriber_config = transcriber_config
        self.agent_config = agent_config
        self.synthesizer_config = synthesizer_config
        self.logger = logging.getLogger(__name__)
        # Set once the server's ReadyMessage arrives; gates the sender loop.
        self.receiver_ready = False
        # Cleared by deactivate() to stop the sender and playback loops.
        self.active = True
        # NOTE(review): output_loop appears unused — play_audio() creates its
        # own fresh event loop; confirm before removing.
        self.output_loop = asyncio.new_event_loop()
        # Thread-safe hand-off from the websocket receiver to the playback thread.
        self.output_audio_queue = queue.Queue()
        self.vocode_websocket_url = f"wss://{vocode.base_url}/conversation"

    async def wait_for_ready(self):
        # Poll until the receiver task has seen the server's ReadyMessage.
        while not self.receiver_ready:
            await asyncio.sleep(0.1)
        return True

    def deactivate(self):
        # Stop the sender and playback loops on their next iteration.
        self.active = False

    def play_audio(self):
        """Thread target: drain queued audio chunks into the output device."""

        async def run():
            while self.active:
                try:
                    # Blocking get with timeout so deactivate() is noticed
                    # even when no audio is arriving.
                    audio = self.output_audio_queue.get(timeout=5)
                    await self.output_device.send_async(audio)
                except queue.Empty:
                    continue

        # Dedicated loop: send_async is a coroutine but this runs off the
        # main thread, so it cannot share the websocket's loop.
        loop = asyncio.new_event_loop()
        loop.run_until_complete(run())

    async def start(self):
        """Open the websocket and run the sender/receiver until deactivated."""
        async with websockets.connect(
            f"{self.vocode_websocket_url}?key={vocode.api_key}"
        ) as ws:

            async def sender(ws: WebSocketClientProtocol):
                # Handshake: send config, then wait for the server's ready ack.
                start_message = StartMessage(
                    transcriber_config=self.transcriber_config,
                    agent_config=self.agent_config,
                    synthesizer_config=self.synthesizer_config,
                    conversation_id=self.id,
                )
                await ws.send(start_message.json())
                await self.wait_for_ready()
                self.logger.info("Listening...press Ctrl+C to stop")
                while self.active:
                    data = self.input_device.get_audio()
                    if data:
                        try:
                            await ws.send(AudioMessage.from_bytes(data).json())
                        except ConnectionClosedOK:
                            # Server closed cleanly; wind everything down.
                            self.deactivate()
                            return
                    # Yield to the receiver task between chunks.
                    await asyncio.sleep(0)
                await ws.send(StopMessage().json())

            async def receiver(ws: WebSocketClientProtocol):
                # First frame must be the ReadyMessage; unblocks the sender.
                ReadyMessage.parse_raw(await ws.recv())
                self.receiver_ready = True
                # Every subsequent frame is synthesized audio to play.
                async for msg in ws:
                    audio_message = AudioMessage.parse_raw(msg)
                    self.output_audio_queue.put_nowait(audio_message.get_bytes())

            output_thread = threading.Thread(target=self.play_audio)
            output_thread.start()
            return await asyncio.gather(sender(ws), receiver(ws))
|
||||
62
vocode/streaming/telephony/inbound_call_server.py
Normal file
62
vocode/streaming/telephony/inbound_call_server.py
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
from fastapi import FastAPI, Response, Form
from typing import Optional
import requests
import uvicorn

import vocode
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.telephony import (
    CreateInboundCall,
    TwilioConfig,
)
# BUG FIX: TwilioConfig was listed twice in the import above.


class InboundCallServer:
    """FastAPI server that answers Twilio webhooks for inbound calls.

    Forwards each inbound call to the vocode backend and relays the TwiML
    response back to Twilio.
    """

    def __init__(
        self,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        response_on_rate_limit: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        self.agent_config = agent_config
        self.transcriber_config = transcriber_config
        self.synthesizer_config = synthesizer_config
        self.app = FastAPI()
        # Twilio is configured to POST its call webhook to /vocode.
        self.app.post("/vocode")(self.handle_call)
        self.response_on_rate_limit = (
            response_on_rate_limit
            or "The line is really busy right now, check back later!"
        )
        self.twilio_config = twilio_config
        self.vocode_inbound_call_url = f"https://{vocode.base_url}/create_inbound_call"

    async def handle_call(self, twilio_sid: str = Form(alias="CallSid")):
        """Handle Twilio's inbound-call webhook; returns TwiML.

        Raises AssertionError if the backend returns an unexpected error.
        """
        response = requests.post(
            self.vocode_inbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=CreateInboundCall(
                agent_config=self.agent_config,
                twilio_sid=twilio_sid,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        # Rate-limited: speak an apology instead of starting the conversation.
        if response.status_code == 429:
            return Response(
                f"<Response><Say>{self.response_on_rate_limit}</Say></Response>",
                media_type="application/xml",
            )
        assert response.ok, response.text
        # Backend returns the TwiML that connects the call to the conversation.
        return Response(
            response.text,
            media_type="application/xml",
        )

    def run(self, host="localhost", port=3000):
        """Run the webhook server (blocking)."""
        uvicorn.run(self.app, host=host, port=port)
|
||||
68
vocode/streaming/telephony/outbound_call.py
Normal file
68
vocode/streaming/telephony/outbound_call.py
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
from typing import Optional
import requests

import vocode
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from ..models.telephony import (
    CallEntity,
    CreateOutboundCall,
    EndOutboundCall,
    TwilioConfig,
)


class OutboundCall:
    """Places and ends an outbound phone call via the vocode backend."""

    def __init__(
        self,
        recipient: CallEntity,
        caller: CallEntity,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        self.recipient = recipient
        self.caller = caller
        self.agent_config = agent_config
        self.transcriber_config = transcriber_config
        self.synthesizer_config = synthesizer_config
        self.conversation_id = conversation_id
        self.twilio_config = twilio_config
        self.vocode_create_outbound_call_url = (
            f"https://{vocode.base_url}/create_outbound_call"
        )
        self.vocode_end_outbound_call_url = (
            f"https://{vocode.base_url}/end_outbound_call"
        )

    def start(self) -> str:
        """Place the call and return the backend-assigned conversation id.

        Raises AssertionError if the backend rejects the request.
        """
        response = requests.post(
            self.vocode_create_outbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=CreateOutboundCall(
                recipient=self.recipient,
                caller=self.caller,
                agent_config=self.agent_config,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                conversation_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok, response.text
        data = response.json()
        self.conversation_id = data["id"]
        # BUG FIX: the method was annotated `-> str` but returned None.
        return self.conversation_id

    def end(self) -> None:
        """End the call; a 404 (already ended/unknown call) is tolerated.

        Raises AssertionError on any other backend error.
        """
        # BUG FIX: annotation corrected from `-> str`; this method returns nothing.
        response = requests.post(
            self.vocode_end_outbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=EndOutboundCall(
                call_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok or response.status_code == 404, response.text
|
||||
60
vocode/streaming/telephony/zoom_dial_in.py
Normal file
60
vocode/streaming/telephony/zoom_dial_in.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
from typing import Optional
import requests

import vocode
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.telephony.outbound_call import OutboundCall
from vocode.streaming.models.telephony import (
    CallEntity,
    DialIntoZoomCall,
    TwilioConfig,
)


class ZoomDialIn(OutboundCall):
    """Outbound call that dials the agent into a Zoom meeting."""

    def __init__(
        self,
        recipient: CallEntity,
        caller: CallEntity,
        zoom_meeting_id: str,
        zoom_meeting_password: str,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        super().__init__(
            recipient=recipient,
            caller=caller,
            agent_config=agent_config,
            transcriber_config=transcriber_config,
            synthesizer_config=synthesizer_config,
            conversation_id=conversation_id,
            twilio_config=twilio_config,
        )
        self.zoom_meeting_id = zoom_meeting_id
        self.zoom_meeting_password = zoom_meeting_password
        self.vocode_zoom_dial_in_url = f"https://{vocode.base_url}/dial_into_zoom_call"

    def start(self) -> str:
        """Dial into the Zoom meeting and return the conversation id.

        Raises AssertionError if the backend rejects the request.
        """
        response = requests.post(
            self.vocode_zoom_dial_in_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=DialIntoZoomCall(
                recipient=self.recipient,
                caller=self.caller,
                zoom_meeting_id=self.zoom_meeting_id,
                zoom_meeting_password=self.zoom_meeting_password,
                agent_config=self.agent_config,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                conversation_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok, response.text
        data = response.json()
        self.conversation_id = data["id"]
        # BUG FIX: the method was annotated `-> str` but returned None.
        return self.conversation_id
|
||||
# ---------------------------------------------------------------------------
# new file: vocode/streaming/user_implemented_agent/base_agent.py (10 lines)
# ---------------------------------------------------------------------------
|
|||
from fastapi import FastAPI
|
||||
import uvicorn
|
||||
|
||||
|
||||
class BaseAgent:
    """Minimal HTTP host for a user-implemented agent.

    Owns a FastAPI application that subclasses attach their routes to,
    and exposes a blocking ``run`` that serves it with uvicorn.
    """

    def __init__(self):
        # Subclasses register their endpoints (REST or WebSocket) on this app.
        self.app = FastAPI()

    def run(self, host="localhost", port=3000):
        """Serve the agent's app on the given host/port (blocks until shutdown)."""
        application = self.app
        uvicorn.run(application, host=host, port=port)
|
||||
# ---------------------------------------------------------------------------
# new file: vocode/streaming/user_implemented_agent/restful_agent.py (19 lines)
# ---------------------------------------------------------------------------
|
|||
from .base_agent import BaseAgent
|
||||
from ..models.agent import RESTfulAgentInput, RESTfulAgentOutput, RESTfulAgentText, RESTfulAgentEnd
|
||||
from pydantic import BaseModel
|
||||
from typing import Union
|
||||
from fastapi import APIRouter
|
||||
|
||||
class RESTfulAgent(BaseAgent):
    """User-implemented agent served over a single REST endpoint.

    Registers ``POST /respond`` on the base app; subclasses implement
    :meth:`respond` to produce the agent's reply.
    """

    def __init__(self):
        super().__init__()
        # Wire the POST /respond route to the handler defined below.
        self.app.post("/respond")(self.respond_rest)

    async def respond(self, human_input, conversation_id) -> RESTfulAgentOutput:
        """Produce the agent's reply for one turn; implemented by subclasses."""
        raise NotImplementedError

    async def respond_rest(
        self, request: RESTfulAgentInput
    ) -> Union[RESTfulAgentText, RESTfulAgentEnd]:
        """FastAPI handler: unpack the request body and delegate to respond()."""
        return await self.respond(request.human_input, request.conversation_id)
|
||||
|
||||
# ---------------------------------------------------------------------------
# new file: vocode/streaming/user_implemented_agent/websocket_agent.py (56 lines)
# ---------------------------------------------------------------------------
|
|||
from .base_agent import BaseAgent
|
||||
import uuid
|
||||
import typing
|
||||
from typing import AsyncGenerator, Union, Optional
|
||||
from fastapi import WebSocket
|
||||
from ..models.agent import (
|
||||
WebSocketAgentStartMessage,
|
||||
WebSocketAgentReadyMessage,
|
||||
WebSocketAgentTextEndMessage,
|
||||
WebSocketAgentTextMessage,
|
||||
WebSocketAgentStopMessage,
|
||||
WebSocketAgentMessage,
|
||||
WebSocketAgentMessageType,
|
||||
)
|
||||
|
||||
|
||||
class WebSocketAgent(BaseAgent):
    """User-implemented agent served over a WebSocket endpoint.

    Registers ``/respond`` as a WebSocket route on the base app. Depending
    on ``generate_responses``, each incoming text message is answered
    either with a single reply (:meth:`respond`) or with a stream of
    replies (:meth:`generate_response`).
    """

    def __init__(self, generate_responses: bool = False):
        super().__init__()
        self.generate_responses = generate_responses
        self.app.websocket("/respond")(self.respond_websocket)

    async def respond(
        self, human_input: str, conversation_id: Optional[str] = None
    ) -> Union[WebSocketAgentTextMessage, WebSocketAgentStopMessage]:
        """Produce a single reply for one turn; implemented by subclasses."""
        raise NotImplementedError

    async def generate_response(
        self, human_input: str, conversation_id: Optional[str] = None
    ) -> AsyncGenerator[
        Union[WebSocketAgentTextMessage, WebSocketAgentTextEndMessage], None
    ]:
        """Yield a stream of replies for one turn; implemented by subclasses."""
        raise NotImplementedError

    async def respond_websocket(self, websocket: WebSocket):
        """WebSocket handler implementing the agent conversation protocol.

        Protocol: accept the connection, consume the client's start
        message, acknowledge with a ready message, then loop — each text
        message is answered via respond()/generate_response() until a
        STOP message arrives, after which the socket is closed.
        """
        await websocket.accept()
        # Consume (and validate) the client's start message before acking.
        WebSocketAgentStartMessage.parse_obj(await websocket.receive_json())
        await websocket.send_text(WebSocketAgentReadyMessage().json())
        while True:
            incoming: WebSocketAgentMessage = WebSocketAgentMessage.parse_obj(
                await websocket.receive_json()
            )
            if incoming.type == WebSocketAgentMessageType.STOP:
                break
            text_message = typing.cast(WebSocketAgentTextMessage, incoming)
            if self.generate_responses:
                # Streaming mode: forward each generated chunk as it arrives.
                async for reply in self.generate_response(
                    text_message.data.text, text_message.conversation_id
                ):
                    await websocket.send_text(reply.json())
            else:
                # Single-reply mode: one response per incoming message.
                reply = await self.respond(
                    text_message.data.text, text_message.conversation_id
                )
                await websocket.send_text(reply.json())
        await websocket.close()
|
||||
# (end of commit diff)