open source

This commit is contained in:
Ajay Raj 2023-03-28 00:15:34 -07:00
commit a93bfc1ec9
61 changed files with 4013 additions and 126 deletions

2
.gitignore vendored
View file

@ -3,3 +3,5 @@ __pycache__/
.env .env
.DS_Store .DS_Store
dist/ dist/
credentials.json
*.npy

View file

@ -1,4 +1,4 @@
from vocode.streaming.telephony.inbound_call_server import InboundCallServer from vocode.streaming.telephony.hosted.inbound_call_server import InboundCallServer
from vocode.streaming.models.agent import EchoAgentConfig from vocode.streaming.models.agent import EchoAgentConfig
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -1,6 +1,6 @@
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.output_device.telephone_output import TelephoneOutput from vocode.streaming.output_device.telephone_output import TelephoneOutput
from vocode.streaming.telephony.outbound_call import OutboundCall from vocode.streaming.telephony.hosted.outbound_call import OutboundCall
from vocode.streaming.models.telephony import CallEntity from vocode.streaming.models.telephony import CallEntity
from vocode.streaming.models.agent import ( from vocode.streaming.models.agent import (
EchoAgentConfig, EchoAgentConfig,
@ -8,7 +8,7 @@ from vocode.streaming.models.agent import (
WebSocketUserImplementedAgentConfig, WebSocketUserImplementedAgentConfig,
) )
from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.message import BaseMessage
from vocode.streaming.telephony.zoom_dial_in import ZoomDialIn from vocode.streaming.telephony.hosted.zoom_dial_in import ZoomDialIn
if __name__ == "__main__": if __name__ == "__main__":
call = ZoomDialIn( call = ZoomDialIn(

View file

@ -3,6 +3,7 @@ import logging
import signal import signal
from dotenv import load_dotenv from dotenv import load_dotenv
import os import os
from vocode.streaming.hosted_streaming_conversation import HostedStreamingConversation
from vocode.streaming.streaming_conversation import StreamingConversation from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.helpers import create_microphone_input_and_speaker_output from vocode.helpers import create_microphone_input_and_speaker_output
from vocode.streaming.models.transcriber import ( from vocode.streaming.models.transcriber import (
@ -22,7 +23,6 @@ from vocode.streaming.models.agent import (
) )
from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.user_implemented_agent.restful_agent import RESTfulAgent
import vocode import vocode
load_dotenv() load_dotenv()
@ -37,7 +37,7 @@ if __name__ == "__main__":
streaming=True, use_default_devices=False streaming=True, use_default_devices=False
) )
conversation = StreamingConversation( conversation = HostedStreamingConversation(
input_device=microphone_input, input_device=microphone_input,
output_device=speaker_output, output_device=speaker_output,
transcriber_config=DeepgramTranscriberConfig.from_input_device( transcriber_config=DeepgramTranscriberConfig.from_input_device(

View file

@ -0,0 +1,79 @@
import asyncio
import logging
import signal
from dotenv import load_dotenv
import os
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.helpers import create_microphone_input_and_speaker_output
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
GoogleTranscriberConfig,
)
from vocode.streaming.models.agent import (
ChatGPTAgentConfig,
CutOffResponse,
FillerAudioConfig,
RESTfulUserImplementedAgentConfig,
WebSocketUserImplementedAgentConfig,
EchoAgentConfig,
LLMAgentConfig,
ChatGPTAgentConfig,
)
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
GoogleSynthesizerConfig,
RimeSynthesizerConfig,
)
import vocode
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
load_dotenv()
# Authenticate against the hosted Vocode API before any conversation starts.
vocode.api_key = os.getenv("VOCODE_API_KEY")
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
async def main():
    """Run a local mic-to-speaker streaming conversation with a ChatGPT agent.

    Wires a Deepgram transcriber, a ChatGPT agent, and an Azure synthesizer
    into a StreamingConversation, then pumps microphone audio into it until
    the conversation ends or the user presses Ctrl+C.
    """
    microphone_input, speaker_output = create_microphone_input_and_speaker_output(
        streaming=True, use_default_devices=False
    )
    conversation = StreamingConversation(
        output_device=speaker_output,
        transcriber=DeepgramTranscriber(
            DeepgramTranscriberConfig.from_input_device(
                microphone_input, endpointing_config=PunctuationEndpointingConfig()
            )
        ),
        agent=ChatGPTAgent(
            ChatGPTAgentConfig(
                initial_message=BaseMessage(text="What up"),
                prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like
hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""",
                generate_responses=True,
                cut_off_response=CutOffResponse(),
            )
        ),
        synthesizer=AzureSynthesizer(
            AzureSynthesizerConfig.from_output_device(speaker_output),
        ),
        logger=logger,
    )
    await conversation.start()
    print("Conversation started, press Ctrl+C to end")
    # Ctrl+C triggers a clean shutdown instead of killing the process.
    signal.signal(signal.SIGINT, lambda _0, _1: conversation.terminate())
    while conversation.is_active():
        chunk = microphone_input.get_audio()
        if chunk:
            conversation.receive_audio(chunk)
        # Yield to the event loop so transcription/synthesis tasks can run.
        await asyncio.sleep(0)
if __name__ == "__main__":
    asyncio.run(main())

69
examples/telephony_app.py Normal file
View file

@ -0,0 +1,69 @@
import logging
from fastapi import FastAPI
import os
from dotenv import load_dotenv
load_dotenv()
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.models.agent import ChatGPTAgentConfig
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.telephony import TwilioConfig
from vocode.streaming.telephony.config_manager.redis_config_manager import (
RedisConfigManager,
)
from vocode.streaming.telephony.conversation.outbound_call import OutboundCall
from vocode.streaming.telephony.server.base import InboundCallConfig, TelephonyServer
app = FastAPI(docs_url=None)

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Shared store of per-call state so the webhook handlers can look up config.
config_manager = RedisConfigManager()

# Publicly reachable host Twilio uses for webhooks/websockets. ngrok hostnames
# are ephemeral, so prefer the environment over a hard-coded value; the old
# literal remains as the fallback for backward compatibility.
BASE_URL = os.getenv("BASE_URL", "59b8e140372d.ngrok.app")

telephony_server = TelephonyServer(
    base_url=BASE_URL,
    config_manager=config_manager,
    inbound_call_configs=[
        InboundCallConfig(
            url="/inbound_call",
            agent_config=ChatGPTAgentConfig(
                initial_message=BaseMessage(text="What up"),
                prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like
hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""",
                generate_responses=True,
            ),
            twilio_config=TwilioConfig(
                account_sid=os.getenv("TWILIO_ACCOUNT_SID"),
                auth_token=os.getenv("TWILIO_AUTH_TOKEN"),
            ),
        )
    ],
    logger=logger,
)
app.include_router(telephony_server.get_router())

# Example of placing an outbound call through the same server; uncomment to use.
# outbound_call = OutboundCall(
#     base_url=BASE_URL,
#     to_phone="+14088926228",
#     from_phone="+14086600744",
#     config_manager=config_manager,
#     agent_config=ChatGPTAgentConfig(
#         initial_message=BaseMessage(text="What up"),
#         prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like
# hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""",
#         generate_responses=True,
#     ),
#     twilio_config=TwilioConfig(
#         account_sid=os.getenv("TWILIO_ACCOUNT_SID"),
#         auth_token=os.getenv("TWILIO_AUTH_TOKEN"),
#     ),
#     logger=logger,
# )
# outbound_call.start()

View file

@ -4,6 +4,8 @@ anyio==3.6.2
async-timeout==4.0.2 async-timeout==4.0.2
attrs==22.2.0 attrs==22.2.0
azure-cognitiveservices-speech==1.25.0 azure-cognitiveservices-speech==1.25.0
black==23.1.0
cachetools==5.3.0
certifi==2022.12.7 certifi==2022.12.7
cffi==1.15.1 cffi==1.15.1
charset-normalizer==3.0.1 charset-normalizer==3.0.1
@ -12,32 +14,50 @@ dataclasses-json==0.5.7
decorator==5.1.1 decorator==5.1.1
fastapi==0.92.0 fastapi==0.92.0
frozenlist==1.3.3 frozenlist==1.3.3
google-api-core==2.11.0
google-auth==2.16.3
google-cloud-speech==2.17.3
google-cloud-texttospeech==2.14.1
googleapis-common-protos==1.59.0
grpcio==1.51.3
grpcio-status==1.51.3
h11==0.14.0 h11==0.14.0
idna==3.4 idna==3.4
Jinja2==3.1.2
joblib==1.2.0
langchain==0.0.117 langchain==0.0.117
MarkupSafe==2.1.2
marshmallow==3.19.0 marshmallow==3.19.0
marshmallow-enum==1.5.1 marshmallow-enum==1.5.1
mccabe==0.7.0 mccabe==0.7.0
multidict==6.0.4 multidict==6.0.4
mypy-extensions==1.0.0 mypy-extensions==1.0.0
nltk==3.8.1
numpy==1.24.2 numpy==1.24.2
openai==0.27.2 openai==0.27.2
packaging==23.0 packaging==23.0
pathspec==0.11.0 pathspec==0.11.0
platformdirs==3.1.0 platformdirs==3.1.0
ply==3.11 ply==3.11
proto-plus==1.22.2
protobuf==4.22.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
PyAudio==0.2.13 PyAudio==0.2.13
pycodestyle==2.10.0 pycodestyle==2.10.0
pycparser==2.21 pycparser==2.21
pydantic>=1.9.0 pydantic==1.10.7
pyflakes>=2.5.0
pydub==0.25.1 pydub==0.25.1
pyflakes==3.0.1
PyJWT==2.6.0 PyJWT==2.6.0
python-dotenv==0.21.1 python-dotenv==0.21.1
python-multipart==0.0.6 python-multipart==0.0.6
pytz==2022.7.1 pytz==2022.7.1
PyYAML==6.0 PyYAML==6.0
redis==4.5.3
regex==2023.3.23
requests==2.28.2 requests==2.28.2
rsa==4.9
six==1.16.0 six==1.16.0
sniffio==1.3.0 sniffio==1.3.0
sounddevice==0.4.6 sounddevice==0.4.6
@ -46,8 +66,9 @@ starlette==0.25.0
tenacity==8.2.2 tenacity==8.2.2
tomli==2.0.1 tomli==2.0.1
tqdm==4.65.0 tqdm==4.65.0
twilio==7.17.0
typing-inspect==0.8.0 typing-inspect==0.8.0
typing_extensions>=3.10.0.2 typing_extensions==4.5.0
urllib3==1.26.14 urllib3==1.26.14
uvicorn==0.20.0 uvicorn==0.20.0
websockets==10.4 websockets==10.4

View file

@ -0,0 +1,44 @@
import random
from typing import Generator, Optional
from vocode.streaming.models.agent import (
AgentConfig,
ChatGPTAgentConfig,
LLMAgentConfig,
)
class BaseAgent:
    """Abstract base class for conversation agents.

    Subclasses implement either respond() (one-shot replies) or
    generate_response() (streamed, sentence-at-a-time replies).
    """

    def __init__(self, agent_config: AgentConfig):
        self.agent_config = agent_config

    def get_agent_config(self) -> AgentConfig:
        """Return the config this agent was constructed with."""
        return self.agent_config

    def start(self):
        """Hook for subclasses that need setup work; no-op by default."""
        pass

    def respond(
        self, human_input, is_interrupt: bool = False
    ) -> tuple[Optional[str], bool]:
        """Return (reply text, should_stop) for a single human utterance."""
        raise NotImplementedError

    def generate_response(
        self, human_input, is_interrupt: bool = False
    ) -> Generator[str, None, None]:
        """Returns a generator that yields a sentence at a time."""
        raise NotImplementedError

    def update_last_bot_message_on_cut_off(self, message: str):
        """Updates the last bot message in the conversation history when the human cuts off the bot's response."""
        pass

    def get_cut_off_response(self) -> Optional[str]:
        """Pick a random canned cut-off response, or None when none configured.

        Fix: the original dereferenced agent_config.cut_off_response.messages
        unconditionally, raising AttributeError when cut_off_response is None
        (it is an Optional field on both config types).
        """
        assert isinstance(self.agent_config, LLMAgentConfig) or isinstance(
            self.agent_config, ChatGPTAgentConfig
        )
        cut_off_response = self.agent_config.cut_off_response
        if cut_off_response is None:
            return None
        on_cut_off_messages = cut_off_response.messages
        if on_cut_off_messages:
            return random.choice(on_cut_off_messages).text
        return None

    def terminate(self):
        """Hook for subclasses that need teardown work; no-op by default."""
        pass

View file

@ -0,0 +1,50 @@
from typing import Optional
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from pydantic import BaseModel
TEMPLATE = """
Read the following conversation classify the final emotion of the Bot as one of [{emotions}].
Output the degree of emotion as a value between 0 and 1 in the format EMOTION,DEGREE: ex. {example_emotion},0.5
<start>
{{transcript}}
<end>
"""
class BotSentiment(BaseModel):
    """Parsed sentiment result for the bot's last utterance."""

    # Detected emotion label (lowercased), or None when the model output was
    # malformed or outside the allowed vocabulary.
    emotion: Optional[str] = None
    # Strength of the emotion in [0, 1]; 0.0 means no/unknown sentiment.
    degree: float = 0.0
class BotSentimentAnalyser:
    """Classifies the bot's final emotion in a transcript via an LLM prompt.

    The prompt constrains the model to a fixed emotion vocabulary and asks for
    "EMOTION,DEGREE" output, which analyse() parses defensively.
    """

    def __init__(self, emotions: list[str], model_name: str = "text-davinci-003"):
        self.model_name = model_name
        self.llm = OpenAI(
            model_name=self.model_name,
        )
        assert len(emotions) > 0
        # Normalize the vocabulary once so later comparisons are case-insensitive.
        self.emotions = [emotion.lower() for emotion in emotions]
        self.prompt = PromptTemplate(
            input_variables=["transcript"],
            template=TEMPLATE.format(
                emotions=",".join(self.emotions), example_emotion=self.emotions[0]
            ),
        )

    def analyse(self, transcript: str) -> BotSentiment:
        """Run the sentiment prompt and parse 'EMOTION,DEGREE' into a BotSentiment."""
        completion = self.llm(self.prompt.format(transcript=transcript)).strip()
        pieces = completion.split(",")
        if len(pieces) != 2:
            # Malformed model output: report neutral.
            return BotSentiment(emotion=None, degree=0.0)
        label_raw, degree_raw = pieces
        label = label_raw.strip().lower()
        if label not in self.emotions:
            # Model invented a label outside the allowed vocabulary.
            return BotSentiment(emotion=None, degree=0.0)
        try:
            return BotSentiment(emotion=label, degree=float(degree_raw.strip()))
        except ValueError:
            # Unparseable degree: keep the label with a middling default.
            return BotSentiment(emotion=label, degree=0.5)

View file

@ -0,0 +1,158 @@
import os
import random
import time
from langchain.prompts import (
ChatPromptTemplate,
MessagesPlaceholder,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.chains import ConversationChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAIChat
from langchain.memory import ConversationBufferMemory
from langchain.schema import ChatMessage, AIMessage
import openai
import json
from typing import Generator, Optional
from dotenv import load_dotenv
from typing import Generator
import logging
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.models.agent import ChatGPTAgentConfig
from vocode.streaming.utils.sse_client import SSEClient
from vocode.streaming.agent.utils import stream_llm_response
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")
class ChatGPTAgent(BaseAgent):
    """Agent backed by OpenAI's chat-completions API via langchain.

    Two modes: respond() does a single blocking completion through a
    ConversationChain; generate_response() streams tokens over SSE and yields
    sentence-sized chunks while mutating memory incrementally.
    """

    def __init__(self, agent_config: ChatGPTAgentConfig, logger: Optional[logging.Logger] = None):
        super().__init__(agent_config)
        self.agent_config = agent_config
        self.logger = logger or logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)
        # System preamble + running history + latest human input.
        self.prompt = ChatPromptTemplate.from_messages(
            [
                SystemMessagePromptTemplate.from_template(agent_config.prompt_preamble),
                MessagesPlaceholder(variable_name="history"),
                HumanMessagePromptTemplate.from_template("{input}"),
            ]
        )
        self.memory = ConversationBufferMemory(return_messages=True)
        # Seed memory with the configured initial bot message, if any.
        if agent_config.initial_message:
            if (
                agent_config.generate_responses
            ):  # we use ChatMessages for memory when we generate responses
                self.memory.chat_memory.messages.append(
                    ChatMessage(
                        content=agent_config.initial_message.text, role="assistant"
                    )
                )
            else:
                self.memory.chat_memory.add_ai_message(
                    agent_config.initial_message.text
                )
        self.llm = ChatOpenAI(
            model_name=self.agent_config.model_name,
            temperature=self.agent_config.temperature,
            max_tokens=self.agent_config.max_tokens,
        )
        self.conversation = ConversationChain(
            memory=self.memory, prompt=self.prompt, llm=self.llm
        )
        # Optionally pre-compute the first reply so the first turn is instant.
        self.first_response = (
            self.create_first_response(agent_config.expected_first_prompt)
            if agent_config.expected_first_prompt
            else None
        )
        self.is_first_response = True

    def create_first_response(self, first_prompt):
        """Run the chain once to pre-compute a reply to the expected first prompt."""
        return self.conversation.predict(input=first_prompt)

    def respond(self, human_input, is_interrupt: bool = False) -> tuple[str, bool]:
        """Return (reply, should_stop=False) for one human utterance (blocking)."""
        if is_interrupt and self.agent_config.cut_off_response:
            # Human interrupted: record a canned cut-off reply instead of
            # querying the model.
            cut_off_response = self.get_cut_off_response()
            self.memory.chat_memory.add_user_message(human_input)
            self.memory.chat_memory.add_ai_message(cut_off_response)
            return cut_off_response, False
        self.logger.debug("LLM responding to human input")
        if self.is_first_response and self.first_response:
            self.logger.debug("First response is cached")
            self.is_first_response = False
            text = self.first_response
        else:
            text = self.conversation.predict(input=human_input)
        self.logger.debug(f"LLM response: {text}")
        return text, False

    def generate_response(
        self, human_input, is_interrupt: bool = False
    ) -> Generator[str, None, None]:
        """Stream the reply sentence-by-sentence via the OpenAI SSE endpoint."""
        self.memory.chat_memory.messages.append(
            ChatMessage(role="user", content=human_input)
        )
        if is_interrupt and self.agent_config.cut_off_response:
            cut_off_response = self.get_cut_off_response()
            self.memory.chat_memory.messages.append(
                ChatMessage(role="assistant", content=cut_off_response)
            )
            yield cut_off_response
            return
        # Full chat transcript: system preamble followed by stored history.
        prompt_messages = [
            ChatMessage(role="system", content=self.agent_config.prompt_preamble)
        ] + self.memory.chat_memory.messages
        # Raw SSE call (bypassing langchain) so tokens arrive incrementally.
        messages = SSEClient(
            "POST",
            "https://api.openai.com/v1/chat/completions",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
            },
            json={
                "model": self.agent_config.model_name,
                "messages": [
                    prompt_message.dict(include={"content": True, "role": True})
                    for prompt_message in prompt_messages
                ],
                "max_tokens": 256,
                "temperature": 1.0,
                "stream": True,
            },
        )
        # Append the bot message up-front and mutate it as chunks arrive, so
        # memory stays accurate even if the stream is cut off mid-reply.
        bot_memory_message = ChatMessage(role="assistant", content="")
        self.memory.chat_memory.messages.append(bot_memory_message)
        for message in stream_llm_response(
            map(lambda event: json.loads(event.data), messages),
            get_text=lambda choice: choice.get("delta", {}).get("content"),
        ):
            bot_memory_message.content = f"{bot_memory_message.content} {message}"
            yield message

    def update_last_bot_message_on_cut_off(self, message: str):
        """Replace the newest assistant message with what was actually spoken."""
        # Scan history backwards for the most recent assistant-authored message
        # (ChatMessage in streaming mode, AIMessage otherwise).
        for memory_message in self.memory.chat_memory.messages[::-1]:
            if (
                isinstance(memory_message, ChatMessage)
                and memory_message.role == "assistant"
            ) or isinstance(memory_message, AIMessage):
                memory_message.content = message
                return
if __name__ == "__main__":
agent = ChatGPTAgent(
ChatGPTAgentConfig(
model_name="gpt-4",
prompt_preamble="The assistant is having a pleasant conversation about life. If the user hasn't completed their thought, the assistant responds with 'PASS'",
)
)
while True:
# response = agent.respond(input("Human: "))[0]
# print(f"AI: {response}")
for response in agent.generate_response(input("Human: ")):
print(f"AI: {response}")

View file

@ -0,0 +1,13 @@
from typing import Generator
from vocode.streaming.agent.base_agent import BaseAgent
class EchoAgent(BaseAgent):
    """Debug agent that simply repeats whatever the human said."""

    def respond(self, human_input, is_interrupt: bool = False) -> tuple[str, bool]:
        # Never requests conversation termination, hence the False flag.
        return (human_input, False)

    def generate_response(self, human_input, is_interrupt: bool = False) -> Generator:
        # One-chunk "stream": the entire echo at once.
        yield from (human_input,)

    def update_last_bot_message_on_cut_off(self, message: str):
        # Stateless agent: nothing to rewrite on cut-off.
        pass

View file

@ -0,0 +1,32 @@
import logging
from typing import List
from langchain import OpenAI
from vocode.streaming.agent.llm_agent import LLMAgent
from ..models.agent import InformationRetrievalAgentConfig, LLMAgentConfig
class InformationRetrievalAgent(LLMAgent):
    """LLM agent specialized for IVR navigation / information-retrieval calls.

    Builds an LLM prompt preamble from the goal/recipient/fields in the
    InformationRetrievalAgentConfig and delegates the conversation loop to
    LLMAgent.
    """

    def __init__(
        self,
        agent_config: InformationRetrievalAgentConfig,
        logger: logging.Logger = None,
    ):
        # Fix: logger now defaults to None (LLMAgent falls back to a module
        # logger), matching LLMAgent's signature; the factory constructs this
        # agent without supplying a logger, which previously raised TypeError.
        prompt_preamble = f"""
        The AI is a friendly phone bot built for information retrieval. It understands IVR navigation and chooses which numbers to press based on the intended goal and the options provided.
        Once it reaches the human, it verifies the identity of the person it is trying to reach and states its purpose. If it needs to be transferred, then the AI asks to speak to the intended recipient of the phone call.
        Here is the context for the call:
        Intended goal: { agent_config.goal_description }
        Intended recipient: { agent_config.recipient_descriptor }
        Information to be collected: { agent_config.fields }
        Information to provide to the person who answers the phone: this is a robot calling on behalf of { agent_config.caller_descriptor }
        The AI begins the call by introducing itself and who it represents.
        """
        # NOTE(review): the retrieval-specific config is replaced by a plain
        # LLMAgentConfig, so its fields survive only inside the prompt text.
        agent_config = LLMAgentConfig(
            prompt_preamble=prompt_preamble,
        )
        super().__init__(agent_config, logger=logger)
        # Pin a specific completion model, overriding LLMAgent's default llm.
        self.llm = OpenAI(model_name="text-davinci-003", temperature=1)

View file

@ -0,0 +1,139 @@
import re
from typing import Optional
from dotenv import load_dotenv
from langchain import OpenAI
from langchain.llms import OpenAIChat
from typing import Generator
import logging
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.agent.utils import stream_llm_response
from vocode.streaming.models.agent import LLMAgentConfig
load_dotenv()
class LLMAgent(BaseAgent):
    """Agent backed by an OpenAI text-completion model using a plain-text
    "Human:/AI:" transcript prompt."""

    SENTENCE_ENDINGS = [".", "!", "?"]
    DEFAULT_PROMPT_TEMPLATE = "{history}\nHuman: {human_input}\nAI:"

    def __init__(
        self,
        agent_config: LLMAgentConfig,
        logger: Optional[logging.Logger] = None,
        sender="AI",
        recipient="Human",
    ):
        super().__init__(agent_config)
        self.agent_config = agent_config
        self.prompt_template = (
            f"{agent_config.prompt_preamble}\n\n{self.DEFAULT_PROMPT_TEMPLATE}"
        )
        self.initial_bot_message = (
            agent_config.initial_message.text if agent_config.initial_message else None
        )
        self.logger = logger or logging.getLogger(__name__)
        self.sender = sender
        self.recipient = recipient
        # Transcript memory: one "Human: .../AI: ..." string per turn.
        self.memory = (
            [f"AI: {agent_config.initial_message.text}"]
            if agent_config.initial_message
            else []
        )
        self.llm = OpenAI(
            model_name=self.agent_config.model_name,
            temperature=self.agent_config.temperature,
            max_tokens=self.agent_config.max_tokens,
        )
        # Stop generation as soon as the model starts writing the human's turn.
        self.stop_tokens = [f"{recipient}:"]
        # Optionally pre-compute the reply to the expected first prompt so the
        # first real turn is instant.
        self.first_response = (
            self.llm(
                self.prompt_template.format(
                    history="", human_input=agent_config.expected_first_prompt
                ),
                stop=self.stop_tokens,
            ).strip()
            if agent_config.expected_first_prompt
            else None
        )
        self.is_first_response = True

    def create_prompt(self, human_input):
        """Build the completion prompt from the last 5 transcript turns."""
        history = "\n".join(self.memory[-5:])
        return self.prompt_template.format(history=history, human_input=human_input)

    def get_memory_entry(self, human_input, response):
        """Format one transcript turn as a single memory string."""
        return f"{self.recipient}: {human_input}\n{self.sender}: {response}"

    def respond(self, human_input, is_interrupt: bool = False) -> tuple[str, bool]:
        """Return (reply, should_stop=False) for one human utterance (blocking)."""
        if is_interrupt and self.agent_config.cut_off_response:
            # Human interrupted: record a canned cut-off reply instead of
            # querying the model.
            cut_off_response = self.get_cut_off_response()
            self.memory.append(self.get_memory_entry(human_input, cut_off_response))
            return cut_off_response, False
        self.logger.debug("LLM responding to human input")
        if self.is_first_response and self.first_response:
            self.logger.debug("First response is cached")
            self.is_first_response = False
            response = self.first_response
        else:
            response = self.llm(self.create_prompt(human_input), stop=self.stop_tokens)
        # Strip any leaked "AI:" speaker prefix from the completion.
        response = response.replace(f"{self.sender}:", "")
        self.memory.append(self.get_memory_entry(human_input, response))
        self.logger.debug(f"LLM response: {response}")
        return response, False

    def generate_response(self, human_input, is_interrupt: bool = False) -> Generator:
        """Stream the reply sentence-by-sentence, updating memory as it grows."""
        self.logger.debug("LLM generating response to human input")
        if is_interrupt and self.agent_config.cut_off_response:
            cut_off_response = self.get_cut_off_response()
            self.memory.append(self.get_memory_entry(human_input, cut_off_response))
            yield cut_off_response
            return
        # Placeholder turn; rewritten below as sentences stream in.
        self.memory.append(self.get_memory_entry(human_input, ""))
        if self.is_first_response and self.first_response:
            self.logger.debug("First response is cached")
            self.is_first_response = False
            sentences = [self.first_response]
        else:
            self.logger.debug("Creating LLM prompt")
            prompt = self.create_prompt(human_input)
            self.logger.debug("Streaming LLM response")
            sentences = stream_llm_response(
                map(
                    lambda resp: resp.to_dict(),
                    self.llm.stream(prompt, stop=self.stop_tokens),
                )
            )
        response_buffer = ""
        for sentence in sentences:
            sentence = sentence.replace(f"{self.sender}:", "")
            # Collapse leading whitespace to a single space.
            sentence = re.sub(r"^\s+(.*)", r" \1", sentence)
            response_buffer += sentence
            self.memory[-1] = self.get_memory_entry(human_input, response_buffer)
            yield sentence

    def update_last_bot_message_on_cut_off(self, message: str):
        """Replace the AI half of the last turn with what was actually spoken."""
        last_message = self.memory[-1]
        # Keep the "Human: ..." first line; rewrite the "AI: ..." remainder.
        new_last_message = (
            last_message.split("\n", 1)[0] + f"\n{self.sender}: {message}"
        )
        self.memory[-1] = new_last_message
if __name__ == "__main__":
chat_responder = LLMAgent(
LLMAgentConfig(
prompt_preamble="""
The AI is having a pleasant conversation about life. If the human hasn't completed their thought, the AI responds with 'PASS'
{history}
Human: {human_input}
AI:""",
)
)
while True:
# response = chat_responder.respond(input("Human: "))[0]
for response in chat_responder.generate_response(input("Human: ")):
print(f"AI: {response}")

View file

@ -0,0 +1,25 @@
from typing import Generator
# Tokens that mark the end of a sentence-sized chunk worth yielding.
SENTENCE_ENDINGS = [".", "!", "?"]


def stream_llm_response(
    gen, get_text=lambda choice: choice.get("text"), sentence_endings=SENTENCE_ENDINGS
) -> Generator:
    """Re-chunk a stream of OpenAI completion responses into sentences.

    Args:
        gen: iterable of response dicts, each with a "choices" list.
        get_text: extracts the text token from a choice dict (override for
            chat-completions, where text lives under choice["delta"]["content"]).
        sentence_endings: tokens ending a sentence trigger a yield.

    Yields:
        Whitespace-stripped sentence chunks, in order.
    """
    buffer = ""
    for response in gen:
        choices = response.get("choices", [])
        if not choices:
            break
        choice = choices[0]
        # Fix: use .get() — chat-completion chunks may omit "finish_reason",
        # which previously raised KeyError.
        if choice.get("finish_reason"):
            break
        token = get_text(choice)
        if not token:
            continue
        buffer += token
        if any(token.endswith(ending) for ending in sentence_endings):
            yield buffer.strip()
            buffer = ""
    if buffer.strip():
        # Fix: strip the trailing remainder too, consistent with the
        # mid-stream yields above (previously yielded with leading space).
        yield buffer.strip()

View file

@ -0,0 +1,3 @@
# Presumably the duration (seconds) of each synthesized audio chunk sent to
# the output device — confirm against the streaming pipeline's usage.
TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS = 1
# Presumably extra scheduling slack (seconds) allowed per chunk — confirm.
PER_CHUNK_ALLOWANCE_SECONDS = 0.05
# Presumably seconds of inactivity before a conversation is considered idle —
# confirm against AgentConfig.allowed_idle_time_seconds handling.
ALLOWED_IDLE_TIME = 15

View file

@ -0,0 +1,58 @@
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.agent.echo_agent import EchoAgent
from vocode.streaming.agent.information_retrieval_agent import InformationRetrievalAgent
from vocode.streaming.agent.llm_agent import LLMAgent
from vocode.streaming.models.agent import AgentConfig, AgentType
from vocode.streaming.models.synthesizer import SynthesizerConfig, SynthesizerType
from vocode.streaming.models.transcriber import TranscriberConfig, TranscriberType
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
from vocode.streaming.transcriber.google_transcriber import GoogleTranscriber
def create_transcriber(transcriber_config: TranscriberConfig) -> BaseTranscriber:
    """Instantiate the transcriber implementation matching the config's type tag."""
    transcriber_classes = {
        TranscriberType.DEEPGRAM: DeepgramTranscriber,
        TranscriberType.GOOGLE: GoogleTranscriber,
        TranscriberType.ASSEMBLY_AI: AssemblyAITranscriber,
    }
    transcriber_class = transcriber_classes.get(transcriber_config.type)
    if transcriber_class is None:
        raise Exception("Invalid transcriber config")
    return transcriber_class(transcriber_config)
def create_agent(agent_config: AgentConfig) -> BaseAgent:
    """Instantiate the agent implementation matching the config's type tag."""
    agent_classes = {
        AgentType.LLM: LLMAgent,
        AgentType.CHAT_GPT: ChatGPTAgent,
        AgentType.ECHO: EchoAgent,
        AgentType.INFORMATION_RETRIEVAL: InformationRetrievalAgent,
    }
    agent_class = agent_classes.get(agent_config.type)
    if agent_class is None:
        raise Exception("Invalid agent config", agent_config.type)
    return agent_class(agent_config=agent_config)
def create_synthesizer(synthesizer_config: SynthesizerConfig) -> BaseSynthesizer:
    """Instantiate the synthesizer implementation matching the config's type tag."""
    if synthesizer_config.type == SynthesizerType.GOOGLE:
        return GoogleSynthesizer(synthesizer_config)
    if synthesizer_config.type == SynthesizerType.AZURE:
        return AzureSynthesizer(synthesizer_config)
    if synthesizer_config.type == SynthesizerType.ELEVEN_LABS:
        # Only forward voice_id when the config actually sets one, so the
        # synthesizer's own default applies otherwise.
        eleven_labs_kwargs = {}
        if synthesizer_config.voice_id:
            eleven_labs_kwargs["voice_id"] = synthesizer_config.voice_id
        return ElevenLabsSynthesizer(synthesizer_config, **eleven_labs_kwargs)
    if synthesizer_config.type == SynthesizerType.RIME:
        return RimeSynthesizer(synthesizer_config)
    raise Exception("Invalid synthesizer config")

View file

@ -0,0 +1,106 @@
import websockets
from websockets.exceptions import ConnectionClosedOK
from websockets.client import WebSocketClientProtocol
import asyncio
from dotenv import load_dotenv
import os
import logging
import threading
import queue
import vocode
from vocode.streaming.input_device.base_input_device import (
BaseInputDevice,
)
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.websocket import (
ReadyMessage,
AudioMessage,
StartMessage,
StopMessage,
)
load_dotenv()
class HostedStreamingConversation:
    """Client for the hosted Vocode conversation websocket.

    Streams local microphone audio up to the hosted service and plays the
    returned audio on the local output device via a background thread.
    """

    def __init__(
        self,
        input_device: BaseInputDevice,
        output_device: BaseOutputDevice,
        transcriber_config: TranscriberConfig,
        agent_config: AgentConfig,
        synthesizer_config: SynthesizerConfig,
        id: str = None,
    ):
        self.id = id
        self.input_device = input_device
        self.output_device = output_device
        self.transcriber_config = transcriber_config
        self.agent_config = agent_config
        self.synthesizer_config = synthesizer_config
        self.logger = logging.getLogger(__name__)
        # Set once the server acknowledges the StartMessage with a ReadyMessage.
        self.receiver_ready = False
        self.active = True
        # NOTE(review): output_loop appears unused — play_audio() builds its
        # own event loop below; confirm before removing.
        self.output_loop = asyncio.new_event_loop()
        # Audio received from the server, pending playback on the output thread.
        self.output_audio_queue = queue.Queue()
        self.vocode_websocket_url = f"wss://{vocode.base_url}/conversation"

    async def wait_for_ready(self):
        """Poll until the receiver has seen the server's ReadyMessage."""
        while not self.receiver_ready:
            await asyncio.sleep(0.1)
        return True

    def deactivate(self):
        """Stop the send loop; the playback thread exits after its next timeout."""
        self.active = False

    def play_audio(self):
        """Playback-thread body: drain queued audio chunks to the output device."""
        async def run():
            while self.active:
                try:
                    # Timeout keeps the loop responsive to deactivate().
                    audio = self.output_audio_queue.get(timeout=5)
                    await self.output_device.send_async(audio)
                except queue.Empty:
                    continue
        # Runs on a plain thread, so it needs its own event loop.
        loop = asyncio.new_event_loop()
        loop.run_until_complete(run())

    async def start(self):
        """Open the websocket and run the send/receive loops until stopped."""
        async with websockets.connect(
            f"{self.vocode_websocket_url}?key={vocode.api_key}"
        ) as ws:
            async def sender(ws: WebSocketClientProtocol):
                # Announce configs, wait for the handshake, then forward
                # microphone audio until deactivated.
                start_message = StartMessage(
                    transcriber_config=self.transcriber_config,
                    agent_config=self.agent_config,
                    synthesizer_config=self.synthesizer_config,
                    conversation_id=self.id,
                )
                await ws.send(start_message.json())
                await self.wait_for_ready()
                self.logger.info("Listening...press Ctrl+C to stop")
                while self.active:
                    data = self.input_device.get_audio()
                    if data:
                        try:
                            await ws.send(AudioMessage.from_bytes(data).json())
                        except ConnectionClosedOK:
                            # Server closed cleanly; stop sending.
                            self.deactivate()
                            return
                    # Yield so receiver() can run on the same loop.
                    await asyncio.sleep(0)
                await ws.send(StopMessage().json())
            async def receiver(ws: WebSocketClientProtocol):
                # First frame is the ready handshake; everything after is audio.
                ReadyMessage.parse_raw(await ws.recv())
                self.receiver_ready = True
                async for msg in ws:
                    audio_message = AudioMessage.parse_raw(msg)
                    self.output_audio_queue.put_nowait(audio_message.get_bytes())
            # Playback runs on a separate thread so blocking queue reads don't
            # stall this event loop.
            output_thread = threading.Thread(target=self.play_audio)
            output_thread.start()
            return await asyncio.gather(sender(ws), receiver(ws))

View file

@ -42,6 +42,7 @@ class AgentConfig(TypedModel, type=AgentType.BASE):
initial_message: Optional[BaseMessage] = None initial_message: Optional[BaseMessage] = None
generate_responses: bool = True generate_responses: bool = True
allowed_idle_time_seconds: Optional[float] = None allowed_idle_time_seconds: Optional[float] = None
allow_agent_to_be_cut_off: bool = True
end_conversation_on_goodbye: bool = False end_conversation_on_goodbye: bool = False
send_filler_audio: Union[bool, FillerAudioConfig] = False send_filler_audio: Union[bool, FillerAudioConfig] = False
@ -59,6 +60,13 @@ class LLMAgentConfig(AgentConfig, type=AgentType.LLM):
cut_off_response: Optional[CutOffResponse] = None cut_off_response: Optional[CutOffResponse] = None
class ChatGPTAlphaAgentConfig(AgentConfig, type=AgentType.CHAT_GPT_ALPHA):
prompt_preamble: str
expected_first_prompt: Optional[str] = None
temperature: float = LLM_AGENT_DEFAULT_TEMPERATURE
max_tokens: int = LLM_AGENT_DEFAULT_MAX_TOKENS
class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT): class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT):
prompt_preamble: str prompt_preamble: str
expected_first_prompt: Optional[str] = None expected_first_prompt: Optional[str] = None

View file

@ -1,5 +1,6 @@
from enum import Enum from enum import Enum
class AudioEncoding(str, Enum): class AudioEncoding(str, Enum):
LINEAR16 = "linear16" LINEAR16 = "linear16"
MULAW = "mulaw" MULAW = "mulaw"

View file

@ -1,17 +1,17 @@
import pydantic import pydantic
class BaseModel(pydantic.BaseModel):
class BaseModel(pydantic.BaseModel):
def __init__(self, **data): def __init__(self, **data):
for key, value in data.items(): for key, value in data.items():
if isinstance(value, dict): if isinstance(value, dict):
if 'type' in value: if "type" in value:
data[key] = TypedModel.parse_obj(value) data[key] = TypedModel.parse_obj(value)
super().__init__(**data) super().__init__(**data)
# Adapted from https://github.com/pydantic/pydantic/discussions/3091 # Adapted from https://github.com/pydantic/pydantic/discussions/3091
class TypedModel(BaseModel): class TypedModel(BaseModel):
_subtypes_ = [] _subtypes_ = []
def __init_subclass__(cls, type=None): def __init_subclass__(cls, type=None):
@ -22,31 +22,30 @@ class TypedModel(BaseModel):
for t, cls in _cls._subtypes_: for t, cls in _cls._subtypes_:
if t == type: if t == type:
return cls return cls
raise ValueError(f'Unknown type {type}') raise ValueError(f"Unknown type {type}")
@classmethod @classmethod
def get_type(_cls, cls_name): def get_type(_cls, cls_name):
for t, cls in _cls._subtypes_: for t, cls in _cls._subtypes_:
if cls.__name__ == cls_name: if cls.__name__ == cls_name:
return t return t
raise ValueError(f'Unknown class {cls_name}') raise ValueError(f"Unknown class {cls_name}")
@classmethod @classmethod
def parse_obj(cls, obj): def parse_obj(cls, obj):
data_type = obj.get('type') data_type = obj.get("type")
if data_type is None: if data_type is None:
raise ValueError(f'type is required for {cls.__name__}') raise ValueError(f"type is required for {cls.__name__}")
sub = cls.get_cls(data_type) sub = cls.get_cls(data_type)
if sub is None: if sub is None:
raise ValueError(f'Unknown type {data_type}') raise ValueError(f"Unknown type {data_type}")
return sub(**obj) return sub(**obj)
def _iter(self, **kwargs): def _iter(self, **kwargs):
yield 'type', self.get_type(self.__class__.__name__) yield "type", self.get_type(self.__class__.__name__)
yield from super()._iter(**kwargs) yield from super()._iter(**kwargs)
@property @property
def type(self): def type(self):
return self.get_type(self.__class__.__name__) return self.get_type(self.__class__.__name__)

View file

@ -2,9 +2,14 @@ from enum import Enum
from typing import Optional, Union from typing import Optional, Union
from pydantic import BaseModel, validator from pydantic import BaseModel, validator
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.telephony.constants import (
DEFAULT_AUDIO_ENCODING,
DEFAULT_SAMPLING_RATE,
)
from .model import TypedModel from .model import TypedModel
from .audio_encoding import AudioEncoding from .audio_encoding import AudioEncoding
from ..output_device.base_output_device import BaseOutputDevice
class SynthesizerType(str, Enum): class SynthesizerType(str, Enum):
@ -38,6 +43,13 @@ class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE):
audio_encoding=output_device.audio_encoding, audio_encoding=output_device.audio_encoding,
) )
@classmethod
def from_telephone_output_device(cls):
return cls(
sampling_rate=DEFAULT_SAMPLING_RATE,
audio_encoding=DEFAULT_AUDIO_ENCODING,
)
AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME = "en-US-AriaNeural" AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME = "en-US-AriaNeural"
AZURE_SYNTHESIZER_DEFAULT_PITCH = 0 AZURE_SYNTHESIZER_DEFAULT_PITCH = 0
@ -45,18 +57,32 @@ AZURE_SYNTHESIZER_DEFAULT_RATE = 15
class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE): class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME voice_name: Optional[str] = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH pitch: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_PITCH
rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE rate: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_RATE
class Config:
validate_assignment = True
@validator("voice_name")
def set_name(cls, voice_name):
return voice_name or AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
@validator("pitch")
def set_pitch(cls, pitch):
return pitch or AZURE_SYNTHESIZER_DEFAULT_PITCH
@validator("rate")
def set_rate(cls, rate):
return rate or AZURE_SYNTHESIZER_DEFAULT_RATE
@classmethod @classmethod
def from_output_device( def from_output_device(
cls, cls,
output_device: BaseOutputDevice, output_device: BaseOutputDevice,
voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME, voice_name: Optional[str] = None,
pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH, pitch: Optional[int] = None,
rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE, rate: Optional[int] = None,
track_bot_sentiment_in_voice: Union[bool, TrackBotSentimentConfig] = False,
): ):
return cls( return cls(
sampling_rate=output_device.sampling_rate, sampling_rate=output_device.sampling_rate,
@ -64,16 +90,33 @@ class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
voice_name=voice_name, voice_name=voice_name,
pitch=pitch, pitch=pitch,
rate=rate, rate=rate,
track_bot_sentiment_in_voice=track_bot_sentiment_in_voice,
) )
pass @classmethod
def from_telephone_output_device(
cls,
voice_name: Optional[str] = None,
pitch: Optional[int] = None,
rate: Optional[int] = None,
):
return cls(
sampling_rate=DEFAULT_SAMPLING_RATE,
audio_encoding=DEFAULT_AUDIO_ENCODING,
voice_name=voice_name,
pitch=pitch,
rate=rate,
)
class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE): class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
pass pass
class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
api_key: str
voice_id: Optional[str] = None
class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME): class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
speaker: str speaker: str
@ -88,3 +131,14 @@ class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
audio_encoding=output_device.audio_encoding, audio_encoding=output_device.audio_encoding,
speaker=speaker, speaker=speaker,
) )
@classmethod
def from_telephone_output_device(
cls,
speaker: str,
):
return cls(
sampling_rate=DEFAULT_SAMPLING_RATE,
audio_encoding=DEFAULT_AUDIO_ENCODING,
speaker=speaker,
)

View file

@ -1,4 +1,5 @@
from typing import Optional from typing import Optional
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.models.model import BaseModel from vocode.streaming.models.model import BaseModel
from vocode.streaming.models.agent import AgentConfig from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig from vocode.streaming.models.synthesizer import SynthesizerConfig
@ -19,6 +20,7 @@ class CreateInboundCall(BaseModel):
agent_config: AgentConfig agent_config: AgentConfig
synthesizer_config: Optional[SynthesizerConfig] = None synthesizer_config: Optional[SynthesizerConfig] = None
twilio_sid: str twilio_sid: str
conversation_id: Optional[str] = None
twilio_config: Optional[TwilioConfig] = None twilio_config: Optional[TwilioConfig] = None
@ -48,3 +50,11 @@ class DialIntoZoomCall(BaseModel):
synthesizer_config: Optional[SynthesizerConfig] = None synthesizer_config: Optional[SynthesizerConfig] = None
conversation_id: Optional[str] = None conversation_id: Optional[str] = None
twilio_config: Optional[TwilioConfig] = None twilio_config: Optional[TwilioConfig] = None
class CallConfig(BaseModel):
transcriber_config: TranscriberConfig
agent_config: AgentConfig
synthesizer_config: SynthesizerConfig
twilio_config: Optional[TwilioConfig]
twilio_sid: str

View file

@ -1,8 +1,11 @@
from enum import Enum from enum import Enum
from typing import Optional from typing import Optional
from vocode.streaming.input_device.base_input_device import ( from vocode.streaming.input_device.base_input_device import BaseInputDevice
BaseInputDevice, from vocode.streaming.telephony.constants import (
DEFAULT_AUDIO_ENCODING,
DEFAULT_CHUNK_SIZE,
DEFAULT_SAMPLING_RATE,
) )
from .audio_encoding import AudioEncoding from .audio_encoding import AudioEncoding
from .model import BaseModel, TypedModel from .model import BaseModel, TypedModel
@ -54,11 +57,25 @@ class TranscriberConfig(TypedModel, type=TranscriberType.BASE):
endpointing_config=endpointing_config, endpointing_config=endpointing_config,
) )
@classmethod
def from_telephone_input_device(
cls,
endpointing_config: Optional[EndpointingConfig] = None,
):
return cls(
sampling_rate=DEFAULT_SAMPLING_RATE,
audio_encoding=DEFAULT_AUDIO_ENCODING,
chunk_size=DEFAULT_CHUNK_SIZE,
endpointing_config=endpointing_config,
)
class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM): class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM):
model: Optional[str] = None model: Optional[str] = None
tier: Optional[str] = None
should_warmup_model: bool = False should_warmup_model: bool = False
version: Optional[str] = None version: Optional[str] = None
downsampling: Optional[int] = None
class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE): class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE):

View file

@ -6,33 +6,40 @@ from .transcriber import TranscriberConfig
from .agent import AgentConfig from .agent import AgentConfig
from .synthesizer import SynthesizerConfig from .synthesizer import SynthesizerConfig
class WebSocketMessageType(str, Enum):
BASE = 'websocket_base'
START = 'websocket_start'
AUDIO = 'websocket_audio'
READY = 'websocket_ready'
STOP = 'websocket_stop'
class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE): pass class WebSocketMessageType(str, Enum):
BASE = "websocket_base"
START = "websocket_start"
AUDIO = "websocket_audio"
READY = "websocket_ready"
STOP = "websocket_stop"
class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE):
pass
class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO): class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO):
data: str data: str
@classmethod @classmethod
def from_bytes(cls, chunk: bytes): def from_bytes(cls, chunk: bytes):
return cls(data=base64.b64encode(chunk).decode('utf-8')) return cls(data=base64.b64encode(chunk).decode("utf-8"))
def get_bytes(self) -> bytes: def get_bytes(self) -> bytes:
return base64.b64decode(self.data) return base64.b64decode(self.data)
class StartMessage(WebSocketMessage, type=WebSocketMessageType.START): class StartMessage(WebSocketMessage, type=WebSocketMessageType.START):
transcriber_config: TranscriberConfig transcriber_config: TranscriberConfig
agent_config: AgentConfig agent_config: AgentConfig
synthesizer_config: SynthesizerConfig synthesizer_config: SynthesizerConfig
conversation_id: Optional[str] = None conversation_id: Optional[str] = None
class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY): class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY):
pass pass
class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP): class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP):
pass pass

View file

@ -6,7 +6,7 @@ class BaseOutputDevice:
self.sampling_rate = sampling_rate self.sampling_rate = sampling_rate
self.audio_encoding = audio_encoding self.audio_encoding = audio_encoding
async def send_async(self, chunk): async def send_async(self, chunk: bytes):
raise NotImplemented raise NotImplemented
async def maybe_send_mark_async(self, message): async def maybe_send_mark_async(self, message):

View file

@ -0,0 +1,30 @@
import json
import base64
from fastapi import WebSocket
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
class TwilioOutputDevice(BaseOutputDevice):
def __init__(self, ws: WebSocket = None, stream_sid: str = None):
self.ws = ws
self.stream_sid = stream_sid
async def send_async(self, chunk: bytes):
twilio_message = {
"event": "media",
"streamSid": self.stream_sid,
"media": {"payload": base64.b64encode(chunk).decode("utf-8")},
}
await self.ws.send_text(json.dumps(twilio_message))
async def maybe_send_mark_async(self, message_sent):
mark_message = {
"event": "mark",
"streamSid": self.stream_sid,
"mark": {
"name": "Sent {}".format(message_sent),
},
}
await self.ws.send_text(json.dumps(mark_message))

View file

@ -1,26 +1,67 @@
import websockets
from websockets.exceptions import ConnectionClosedOK
from websockets.client import WebSocketClientProtocol
import asyncio import asyncio
from dotenv import load_dotenv from asyncio import Future
import os import queue
from typing import Callable, Awaitable, Optional, Any
import logging import logging
import threading import threading
import queue import time
import vocode import secrets
from vocode.streaming.input_device.base_input_device import ( import random
BaseInputDevice,
from dotenv import load_dotenv
from vocode.streaming.agent.bot_sentiment_analyser import (
BotSentiment,
BotSentimentAnalyser,
) )
from vocode.streaming.agent.information_retrieval_agent import InformationRetrievalAgent
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.output_device.base_output_device import BaseOutputDevice from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.models.transcriber import TranscriberConfig from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
from vocode.streaming.models.agent import AgentConfig from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber
from vocode.streaming.models.synthesizer import SynthesizerConfig from vocode.streaming.utils.goodbye_model import GoodbyeModel
from vocode.streaming.models.websocket import ( from vocode.streaming.utils.transcript import Transcript
ReadyMessage,
AudioMessage, from vocode.streaming.models.transcriber import (
StartMessage, TranscriberConfig,
StopMessage, TranscriberType,
) )
from vocode.streaming.models.agent import (
AgentConfig,
AgentType,
FillerAudioConfig,
FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS,
)
from vocode.streaming.models.synthesizer import (
SynthesizerConfig,
SynthesizerType,
TrackBotSentimentConfig,
)
from vocode.streaming.models.websocket import AudioMessage
from vocode.streaming.constants import (
TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS,
PER_CHUNK_ALLOWANCE_SECONDS,
ALLOWED_IDLE_TIME,
)
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
SynthesisResult,
FillerAudio,
)
from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.utils import (
create_conversation_id,
create_loop_in_thread,
get_chunk_size_per_second,
)
from vocode.streaming.transcriber.base_transcriber import (
Transcription,
BaseTranscriber,
)
from vocode.streaming.transcriber.google_transcriber import GoogleTranscriber
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
load_dotenv() load_dotenv()
@ -28,79 +69,468 @@ load_dotenv()
class StreamingConversation: class StreamingConversation:
def __init__( def __init__(
self, self,
input_device: BaseInputDevice,
output_device: BaseOutputDevice, output_device: BaseOutputDevice,
transcriber_config: TranscriberConfig, transcriber: BaseTranscriber,
agent_config: AgentConfig, agent: BaseAgent,
synthesizer_config: SynthesizerConfig, synthesizer: BaseSynthesizer,
id: str = None, conversation_id: str = None,
per_chunk_allowance_seconds: int = PER_CHUNK_ALLOWANCE_SECONDS,
logger: Optional[logging.Logger] = None,
): ):
self.id = id self.id = conversation_id or create_conversation_id()
self.input_device = input_device self.logger = logger or logging.getLogger(__name__)
self.output_device = output_device self.output_device = output_device
self.transcriber_config = transcriber_config self.transcriber = transcriber
self.agent_config = agent_config self.transcriber.set_on_response(self.on_transcription_response)
self.synthesizer_config = synthesizer_config self.transcriber_task = None
self.logger = logging.getLogger(__name__) self.agent = agent
self.receiver_ready = False self.synthesizer = synthesizer
self.active = True self.synthesizer_event_loop = asyncio.new_event_loop()
self.output_loop = asyncio.new_event_loop() self.synthesizer_thread = threading.Thread(
self.output_audio_queue = queue.Queue() name="synthesizer",
self.vocode_websocket_url = f"wss://{vocode.base_url}/conversation" target=create_loop_in_thread,
args=(self.synthesizer_event_loop,),
)
self.per_chunk_allowance_seconds = per_chunk_allowance_seconds
self.transcript = Transcript()
self.bot_sentiment = None
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
if isinstance(
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice,
bool,
):
self.track_bot_sentiment_config = TrackBotSentimentConfig()
else:
self.track_bot_sentiment_config = (
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice
)
self.bot_sentiment_analyser = BotSentimentAnalyser(
emotions=self.track_bot_sentiment_config.emotions
)
self.goodbye_model = GoodbyeModel()
async def wait_for_ready(self): self.is_human_speaking = False
while not self.receiver_ready:
await asyncio.sleep(0.1)
return True
def deactivate(self):
self.active = False self.active = False
self.current_synthesis_task = None
def play_audio(self): self.is_current_synthesis_interruptable = False
async def run(): self.stop_events: queue.Queue[threading.Event] = queue.Queue()
while self.active: self.last_action_timestamp = time.time()
try: self.check_for_idle_task = None
audio = self.output_audio_queue.get(timeout=5) self.track_bot_sentiment_task = None
await self.output_device.send_async(audio) self.should_wait_for_filler_audio_done_event = False
except queue.Empty: self.current_filler_audio_done_event: Optional[threading.Event] = None
continue self.current_filler_seconds_per_chunk: int = 0
self.current_transcription_is_interrupt: bool = False
loop = asyncio.new_event_loop()
loop.run_until_complete(run())
async def start(self): async def start(self):
async with websockets.connect( self.transcriber_task = asyncio.create_task(self.transcriber.run())
f"{self.vocode_websocket_url}?key={vocode.api_key}" is_ready = await self.transcriber.ready()
) as ws: if not is_ready:
raise Exception("Transcriber startup failed")
async def sender(ws: WebSocketClientProtocol): self.synthesizer_thread.start()
start_message = StartMessage( if self.agent.get_agent_config().send_filler_audio:
transcriber_config=self.transcriber_config, filler_audio_config = (
agent_config=self.agent_config, self.agent.get_agent_config().send_filler_audio
synthesizer_config=self.synthesizer_config, if isinstance(
conversation_id=self.id, self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
) )
await ws.send(start_message.json()) else FillerAudioConfig()
await self.wait_for_ready() )
self.logger.info("Listening...press Ctrl+C to stop") self.synthesizer.set_filler_audios(filler_audio_config)
while self.active: self.agent.start()
data = self.input_device.get_audio() if self.agent.get_agent_config().initial_message:
if data: self.transcript.add_bot_message(
try: self.agent.get_agent_config().initial_message.text
await ws.send(AudioMessage.from_bytes(data).json()) )
except ConnectionClosedOK: if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
self.deactivate() self.update_bot_sentiment()
self.send_message_to_stream_nonblocking(
self.agent.get_agent_config().initial_message, False
)
self.active = True
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
self.track_bot_sentiment_task = asyncio.create_task(
self.track_bot_sentiment()
)
self.check_for_idle_task = asyncio.create_task(self.check_for_idle())
async def check_for_idle(self):
while self.is_active():
if time.time() - self.last_action_timestamp > (
self.agent.get_agent_config().allowed_idle_time_seconds
or ALLOWED_IDLE_TIME
):
self.logger.debug("Conversation idle for too long, terminating")
self.mark_terminated()
return return
await asyncio.sleep(15)
async def track_bot_sentiment(self):
prev_transcript = None
while self.is_active():
await asyncio.sleep(1)
if self.transcript.to_string() != prev_transcript:
self.update_bot_sentiment()
prev_transcript = self.transcript.to_string()
def update_bot_sentiment(self):
new_bot_sentiment = self.bot_sentiment_analyser.analyse(
self.transcript.to_string()
)
if new_bot_sentiment.emotion:
self.logger.debug("Bot sentiment: %s", new_bot_sentiment)
self.bot_sentiment = new_bot_sentiment
def receive_audio(self, chunk: bytes):
self.transcriber.send_audio(chunk)
async def send_messages_to_stream_async(
self,
messages,
should_allow_human_to_cut_off_bot: bool,
wait_for_filler_audio: bool = False,
) -> tuple[str, bool]:
messages_queue = queue.Queue()
messages_done = threading.Event()
speech_cut_off = threading.Event()
seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
chunk_size = (
get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
* seconds_per_chunk
)
async def send_to_call():
response_buffer = ""
cut_off = False
self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
while True:
try:
message: BaseMessage = messages_queue.get_nowait()
except queue.Empty:
if messages_done.is_set():
break
else:
await asyncio.sleep(0) await asyncio.sleep(0)
await ws.send(StopMessage().json()) continue
async def receiver(ws: WebSocketClientProtocol): stop_event = self.enqueue_stop_event()
ReadyMessage.parse_raw(await ws.recv()) synthesis_result = self.synthesizer.create_speech(
self.receiver_ready = True message, chunk_size, bot_sentiment=self.bot_sentiment
async for msg in ws: )
audio_message = AudioMessage.parse_raw(msg) message_sent, cut_off = await self.send_speech_to_output(
self.output_audio_queue.put_nowait(audio_message.get_bytes()) message.text,
synthesis_result,
stop_event,
seconds_per_chunk,
)
self.logger.debug("Message sent: {}".format(message_sent))
response_buffer = f"{response_buffer} {message_sent}"
if cut_off:
speech_cut_off.set()
break
await asyncio.sleep(0)
if cut_off:
self.agent.update_last_bot_message_on_cut_off(response_buffer)
self.transcript.add_bot_message(response_buffer)
return response_buffer, cut_off
output_thread = threading.Thread(target=self.play_audio) asyncio.run_coroutine_threadsafe(send_to_call(), self.synthesizer_event_loop)
output_thread.start()
return await asyncio.gather(sender(ws), receiver(ws)) messages_generated = 0
for i, message in enumerate(messages):
messages_generated += 1
if i == 0:
if wait_for_filler_audio:
self.interrupt_all_synthesis()
self.wait_for_filler_audio_to_finish()
if speech_cut_off.is_set():
break
messages_queue.put_nowait(BaseMessage(text=message))
await asyncio.sleep(0)
if messages_generated == 0:
self.logger.debug("Agent generated no messages")
if wait_for_filler_audio:
self.interrupt_all_synthesis()
messages_done.set()
def send_message_to_stream_nonblocking(
self,
message: BaseMessage,
should_allow_human_to_cut_off_bot: bool,
):
asyncio.run_coroutine_threadsafe(
self.send_message_to_stream_async(
message,
self.agent.get_agent_config().allow_agent_to_be_cut_off,
),
self.synthesizer_event_loop,
)
async def send_message_to_stream_async(
self,
message: BaseMessage,
should_allow_human_to_cut_off_bot: bool,
) -> tuple[str, bool]:
self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
stop_event = self.enqueue_stop_event()
self.logger.debug("Synthesizing speech for message")
seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
chunk_size = (
get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
* seconds_per_chunk
)
synthesis_result = self.synthesizer.create_speech(
message, chunk_size, bot_sentiment=self.bot_sentiment
)
message_sent, cut_off = await self.send_speech_to_output(
message.text,
synthesis_result,
stop_event,
seconds_per_chunk,
)
self.logger.debug("Message sent: {}".format(message_sent))
if cut_off:
self.agent.update_last_bot_message_on_cut_off(message_sent)
self.transcript.add_bot_message(message_sent)
return message_sent, cut_off
def warmup_synthesizer(self):
self.synthesizer.ready_synthesizer()
# returns an estimate of what was sent up to, and a flag if the message was cut off
async def send_speech_to_output(
self,
message,
synthesis_result: SynthesisResult,
stop_event: threading.Event,
seconds_per_chunk: int,
is_filler_audio: bool = False,
):
message_sent = message
cut_off = False
chunk_size = seconds_per_chunk * get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
for i, chunk_result in enumerate(synthesis_result.chunk_generator):
start_time = time.time()
speech_length_seconds = seconds_per_chunk * (
len(chunk_result.chunk) / chunk_size
)
if stop_event.is_set():
seconds = i * seconds_per_chunk
self.logger.debug(
"Interrupted, stopping text to speech after {} chunks".format(i)
)
message_sent = f"{synthesis_result.get_message_up_to(seconds)}-"
cut_off = True
break
if i == 0:
if is_filler_audio:
self.should_wait_for_filler_audio_done_event = True
await self.output_device.send_async(chunk_result.chunk)
end_time = time.time()
await asyncio.sleep(
max(
speech_length_seconds
- (end_time - start_time)
- self.per_chunk_allowance_seconds,
0,
)
)
self.logger.debug(
"Sent chunk {} with size {}".format(i, len(chunk_result.chunk))
)
self.last_action_timestamp = time.time()
# clears it off the stop events queue
if not stop_event.is_set():
stop_event.set()
return message_sent, cut_off
async def on_transcription_response(self, transcription: Transcription):
self.last_action_timestamp = time.time()
if transcription.is_final:
self.logger.debug(
"Got transcription: {}, confidence: {}".format(
transcription.message, transcription.confidence
)
)
if not self.is_human_speaking:
# send interrupt
self.current_transcription_is_interrupt = False
if self.is_current_synthesis_interruptable:
self.logger.debug("sending interrupt")
self.current_transcription_is_interrupt = self.interrupt_all_synthesis()
self.logger.debug("Human started speaking")
transcription.is_interrupt = self.current_transcription_is_interrupt
self.is_human_speaking = not transcription.is_final
return await self.handle_transcription(transcription)
def enqueue_stop_event(self):
stop_event = threading.Event()
self.stop_events.put_nowait(stop_event)
return stop_event
def interrupt_all_synthesis(self):
"""Returns true if any synthesis was interrupted"""
num_interrupts = 0
while True:
try:
stop_event = self.stop_events.get_nowait()
if not stop_event.is_set():
self.logger.debug("Interrupting synthesis")
stop_event.set()
num_interrupts += 1
except queue.Empty:
break
return num_interrupts > 0
async def send_filler_audio_to_output(
self,
filler_audio: FillerAudio,
stop_event: threading.Event,
done_event: threading.Event,
):
filler_synthesis_result = filler_audio.create_synthesis_result()
self.is_current_synthesis_interruptable = filler_audio.is_interruptable
if isinstance(
self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
):
silence_threshold = (
self.agent.get_agent_config().send_filler_audio.silence_threshold_seconds
)
else:
silence_threshold = FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS
await asyncio.sleep(silence_threshold)
self.logger.debug("Sending filler audio to output")
await self.send_speech_to_output(
filler_audio.message.text,
filler_synthesis_result,
stop_event,
filler_audio.seconds_per_chunk,
is_filler_audio=True,
)
done_event.set()
def wait_for_filler_audio_to_finish(self):
if not self.should_wait_for_filler_audio_done_event:
self.logger.debug(
"Not waiting for filler audio to finish since we didn't send any chunks"
)
return
self.should_wait_for_filler_audio_done_event = False
if (
self.current_filler_audio_done_event
and not self.current_filler_audio_done_event.is_set()
):
self.logger.debug("Waiting for filler audio to finish")
# this should guarantee that filler audio finishes, since it has to be on its last chunk
if not self.current_filler_audio_done_event.wait(
self.current_filler_seconds_per_chunk
):
self.logger.debug("Filler audio did not finish")
async def handle_transcription(self, transcription: Transcription):
if transcription.is_final:
self.transcript.add_human_message(transcription.message)
goodbye_detected_task = None
if self.agent.get_agent_config().end_conversation_on_goodbye:
goodbye_detected_task = asyncio.create_task(
self.goodbye_model.is_goodbye(transcription.message)
)
if self.agent.get_agent_config().send_filler_audio:
self.logger.debug("Sending filler audio")
if self.synthesizer.filler_audios:
filler_audio = random.choice(self.synthesizer.filler_audios)
self.logger.debug(f"Chose {filler_audio.message.text}")
self.current_filler_audio_done_event = threading.Event()
self.current_filler_seconds_per_chunk = (
filler_audio.seconds_per_chunk
)
stop_event = self.enqueue_stop_event()
asyncio.run_coroutine_threadsafe(
self.send_filler_audio_to_output(
filler_audio,
stop_event,
done_event=self.current_filler_audio_done_event,
),
self.synthesizer_event_loop,
)
else:
self.logger.debug("No filler audio available for synthesizer")
self.logger.debug("Generating response for transcription")
if self.agent.get_agent_config().generate_responses:
responses = self.agent.generate_response(
transcription.message, is_interrupt=transcription.is_interrupt
)
await self.send_messages_to_stream_async(
responses,
self.agent.get_agent_config().allow_agent_to_be_cut_off,
wait_for_filler_audio=self.agent.get_agent_config().send_filler_audio,
)
else:
response, should_stop = self.agent.respond(
transcription.message, is_interrupt=transcription.is_interrupt
)
if self.agent.get_agent_config().send_filler_audio:
self.interrupt_all_synthesis()
self.wait_for_filler_audio_to_finish()
if should_stop:
self.logger.debug("Agent requested to stop")
self.mark_terminated()
return
if response:
self.send_message_to_stream_nonblocking(
BaseMessage(text=response),
self.agent.get_agent_config().allow_agent_to_be_cut_off,
)
else:
self.logger.debug("No response generated")
if goodbye_detected_task:
try:
goodbye_detected = await asyncio.wait_for(
goodbye_detected_task, 0.1
)
if goodbye_detected:
self.logger.debug("Goodbye detected, ending conversation")
self.mark_terminated()
return
except asyncio.TimeoutError:
self.logger.debug("Goodbye detection timed out")
def mark_terminated(self):
self.active = False
# must be called from the main thread
def terminate(self):
self.mark_terminated()
if self.check_for_idle_task:
self.logger.debug("Terminating check_for_idle Task")
self.check_for_idle_task.cancel()
if self.track_bot_sentiment_task:
self.logger.debug("Terminating track_bot_sentiment Task")
self.track_bot_sentiment_task.cancel()
self.logger.debug("Terminating agent")
self.agent.terminate()
self.logger.debug("Terminating speech transcriber")
self.transcriber.terminate()
self.logger.debug("Terminating synthesizer event loop")
self.synthesizer_event_loop.call_soon_threadsafe(
self.synthesizer_event_loop.stop
)
self.logger.debug("Terminating synthesizer thread")
if self.synthesizer_thread.is_alive():
self.synthesizer_thread.join()
self.logger.debug("Terminating transcriber task")
self.transcriber_task.cancel()
self.logger.debug("Successfully terminated")
def is_active(self):
return self.active

View file

@ -0,0 +1,250 @@
import logging
import os
import re
from typing import Any, Optional
from xml.etree import ElementTree
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.message import BaseMessage, SSMLMessage
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
SynthesisResult,
FILLER_PHRASES,
FILLER_AUDIO_PATH,
FillerAudio,
encode_as_wav,
)
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.models.audio_encoding import AudioEncoding
load_dotenv()  # pulls AZURE_SPEECH_KEY / AZURE_SPEECH_REGION from a local .env

# XML namespaces used when building SSML documents for Azure TTS.
# NOTE(review): the W3C SSML namespace is conventionally http:// (not
# https://) — confirm the Azure service accepts this https form.
NAMESPACES = {
    "mstts": "https://www.w3.org/2001/mstts",
    "": "https://www.w3.org/2001/10/synthesis",
}

# Register the namespaces so ElementTree serializes without ns0:/ns1: prefixes.
ElementTree.register_namespace("", NAMESPACES.get(""))
ElementTree.register_namespace("mstts", NAMESPACES.get("mstts"))
class WordBoundaryEventPool:
    """Collects word-boundary events fired during synthesis so that playback
    progress (in seconds) can later be mapped back to a position in the text.
    """

    def __init__(self):
        # Each entry: text, text_offset (chars into the SSML), audio_offset
        # (seconds into the audio), boundary_type.
        self.events = []

    def add(self, event):
        self.events.append(
            {
                "text": event.text,
                "text_offset": event.text_offset,
                # The SDK reports audio offsets in 100-nanosecond ticks;
                # convert to seconds (+5000 rounds to the nearest half-ms).
                "audio_offset": (event.audio_offset + 5000) / (10000 * 1000),
                # BUG FIX: key was misspelled "boudary_type" (never read back
                # anywhere, so the rename is safe).
                "boundary_type": event.boundary_type,
            }
        )

    def get_events_sorted(self):
        """Return events ordered by their position in the audio stream."""
        return sorted(self.events, key=lambda event: event["audio_offset"])
class AzureSynthesizer(BaseSynthesizer):
    """Speech synthesizer backed by Azure Cognitive Services TTS.

    Streams raw (headerless) audio chunks and records word-boundary events so
    playback progress can be mapped back to the source text.
    """

    # Milliseconds trimmed from the start of generated filler audio.
    OFFSET_MS = 100

    def __init__(
        self,
        synthesizer_config: AzureSynthesizerConfig,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__(synthesizer_config)
        self.synthesizer_config = synthesizer_config
        # Instantiates a client
        speech_config = speechsdk.SpeechConfig(
            subscription=os.environ.get("AZURE_SPEECH_KEY"),
            region=os.environ.get("AZURE_SPEECH_REGION"),
        )
        # Pick the raw output format matching the configured encoding/sample rate.
        # NOTE(review): the 44100/48000 branches are plain `if` while
        # 24000/16000/8000 form an if/elif chain — confirm the mixed chaining
        # is intentional.
        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
            if self.synthesizer_config.sampling_rate == 44100:
                speech_config.set_speech_synthesis_output_format(
                    speechsdk.SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm
                )
            if self.synthesizer_config.sampling_rate == 48000:
                speech_config.set_speech_synthesis_output_format(
                    speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm
                )
            if self.synthesizer_config.sampling_rate == 24000:
                speech_config.set_speech_synthesis_output_format(
                    speechsdk.SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm
                )
            elif self.synthesizer_config.sampling_rate == 16000:
                speech_config.set_speech_synthesis_output_format(
                    speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
                )
            elif self.synthesizer_config.sampling_rate == 8000:
                speech_config.set_speech_synthesis_output_format(
                    speechsdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm
                )
        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Raw8Khz8BitMonoMULaw
            )
        # audio_config=None: audio is pulled via streams, not played to a device.
        self.synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=None
        )
        self.voice_name = self.synthesizer_config.voice_name
        self.pitch = self.synthesizer_config.pitch
        self.rate = self.synthesizer_config.rate
        self.logger = logger or logging.getLogger(__name__)

    def get_phrase_filler_audios(self) -> list[FillerAudio]:
        """Synthesize (or load from the on-disk cache) audio for each filler phrase."""
        filler_phrase_audios = []
        for filler_phrase in FILLER_PHRASES:
            # The cache key captures every knob that affects the rendered audio.
            cache_key = "-".join(
                (
                    str(filler_phrase.text),
                    str(self.synthesizer_config.type),
                    str(self.synthesizer_config.audio_encoding),
                    str(self.synthesizer_config.sampling_rate),
                    str(self.voice_name),
                    str(self.pitch),
                    str(self.rate),
                )
            )
            filler_audio_path = os.path.join(FILLER_AUDIO_PATH, f"{cache_key}.bytes")
            if os.path.exists(filler_audio_path):
                audio_data = open(filler_audio_path, "rb").read()
            else:
                self.logger.debug(f"Generating filler audio for {filler_phrase.text}")
                ssml = self.create_ssml(filler_phrase.text)
                result = self.synthesizer.speak_ssml(ssml)
                # Trim OFFSET_MS worth of leading samples (skips initial silence).
                offset = self.synthesizer_config.sampling_rate * self.OFFSET_MS // 1000
                audio_data = result.audio_data[offset:]
                with open(filler_audio_path, "wb") as f:
                    f.write(audio_data)
            filler_phrase_audios.append(
                FillerAudio(
                    filler_phrase,
                    audio_data,
                    self.synthesizer_config,
                )
            )
        return filler_phrase_audios

    def add_marks(self, message: str, index=0) -> str:
        """Insert an SSML <mark/> tag before each punctuation run in `message`."""
        search_result = re.search(r"([\.\,\:\;\-\—]+)", message)
        if search_result is None:
            return message
        start, end = search_result.span()
        with_mark = message[:start] + f'<mark name="{index}" />' + message[start:end]
        rest = message[end:]
        # Drop trailing punctuation from the remainder before recursing.
        rest_stripped = re.sub(r"^(.+)([\.\,\:\;\-\—]+)$", r"\1", rest)
        if len(rest_stripped) == 0:
            return with_mark
        return with_mark + self.add_marks(rest_stripped, index + 1)

    def word_boundary_cb(self, evt, pool):
        # Callback connected to the SDK's synthesis_word_boundary signal.
        pool.add(evt)

    def create_ssml(
        self, message: str, bot_sentiment: Optional[BotSentiment] = None
    ) -> str:
        """Build an SSML document for `message` with voice, prosody, and
        (optionally) an Azure express-as emotion style applied."""
        ssml_root = ElementTree.fromstring(
            '<speak version="1.0" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="en-US"></speak>'
        )
        voice = ElementTree.SubElement(ssml_root, "voice")
        voice.set("name", self.voice_name)
        voice_root = voice
        if bot_sentiment and bot_sentiment.emotion:
            styled = ElementTree.SubElement(
                voice, "{%s}express-as" % NAMESPACES.get("mstts")
            )
            styled.set("style", bot_sentiment.emotion)
            styled.set(
                "styledegree", str(bot_sentiment.degree * 2)
            )  # Azure specific, it's a scale of 0-2
            voice_root = styled
        prosody = ElementTree.SubElement(voice_root, "prosody")
        prosody.set("pitch", f"{self.pitch}%")
        prosody.set("rate", f"{self.rate}%")
        prosody.text = message.strip()
        return ElementTree.tostring(ssml_root, encoding="unicode")

    def synthesize_ssml(self, ssml: str) -> speechsdk.AudioDataStream:
        # Kick off asynchronous synthesis and wrap the result in a pull stream.
        # (Return annotation fixed: this returns only the stream, not a tuple.)
        result = self.synthesizer.start_speaking_ssml_async(ssml).get()
        return speechsdk.AudioDataStream(result)

    def ready_synthesizer(self):
        # Pre-open the service connection to reduce first-chunk latency.
        connection = speechsdk.Connection.from_speech_synthesizer(self.synthesizer)
        connection.open(True)

    # given the number of seconds the message was allowed to go until, where did we get in the message?
    def get_message_up_to(
        self,
        message: str,
        ssml: str,
        seconds: int,
        word_boundary_event_pool: WordBoundaryEventPool,
    ) -> str:
        events = word_boundary_event_pool.get_events_sorted()
        for event in events:
            if event["audio_offset"] > seconds:
                # Cut the SSML at the first word past the cutoff, then strip
                # everything up to the last closing tag.
                ssml_fragment = ssml[: event["text_offset"]]
                return ssml_fragment.split(">")[-1]
        return message

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Stream synthesized audio for `message` in `chunk_size` pieces."""
        # offset = int(self.OFFSET_MS * (self.synthesizer_config.sampling_rate / 1000))
        offset = 0  # leading-trim currently disabled (see commented line above)
        self.logger.debug(f"Synthesizing message: {message}")

        def chunk_generator(
            audio_data_stream: speechsdk.AudioDataStream, chunk_transform=lambda x: x
        ):
            # A read shorter than chunk_size marks the end of the stream.
            audio_buffer = bytes(chunk_size)
            filled_size = audio_data_stream.read_data(audio_buffer)
            if filled_size != chunk_size:
                yield SynthesisResult.ChunkResult(
                    chunk_transform(audio_buffer[offset:]), True
                )
                return
            else:
                yield SynthesisResult.ChunkResult(
                    chunk_transform(audio_buffer[offset:]), False
                )
            while True:
                filled_size = audio_data_stream.read_data(audio_buffer)
                if filled_size != chunk_size:
                    yield SynthesisResult.ChunkResult(
                        chunk_transform(audio_buffer[: filled_size - offset]), True
                    )
                    break
                yield SynthesisResult.ChunkResult(chunk_transform(audio_buffer), False)

        # Capture word boundaries so get_message_up_to can report progress.
        word_boundary_event_pool = WordBoundaryEventPool()
        self.synthesizer.synthesis_word_boundary.connect(
            lambda event: self.word_boundary_cb(event, word_boundary_event_pool)
        )
        ssml = (
            message.ssml
            if isinstance(message, SSMLMessage)
            else self.create_ssml(message.text, bot_sentiment=bot_sentiment)
        )
        audio_data_stream = self.synthesize_ssml(ssml)
        if self.synthesizer_config.should_encode_as_wav:
            output_generator = chunk_generator(
                audio_data_stream,
                lambda chunk: encode_as_wav(chunk, self.synthesizer_config),
            )
        else:
            output_generator = chunk_generator(audio_data_stream)
        return SynthesisResult(
            output_generator,
            lambda seconds: self.get_message_up_to(
                message, ssml, seconds, word_boundary_event_pool
            ),
        )

View file

@ -0,0 +1,169 @@
import os
from typing import Any, Generator, Callable, Optional
import math
import io
import wave
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.agent import FillerAudioConfig
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.utils import convert_wav, get_chunk_size_per_second
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.models.synthesizer import SynthesizerConfig
# Short acknowledgement phrases pre-synthesized as "filler" audio to play
# while the agent is still generating its real response.
FILLER_PHRASES = [
    BaseMessage(text="Um..."),
    BaseMessage(text="Uh..."),
    BaseMessage(text="Uh-huh..."),
    BaseMessage(text="Mm-hmm..."),
    BaseMessage(text="Hmm..."),
    BaseMessage(text="Okay..."),
    BaseMessage(text="Right..."),
    BaseMessage(text="Let me see..."),
]
# Directory where generated filler audio is cached between runs.
FILLER_AUDIO_PATH = os.path.join(os.path.dirname(__file__), "filler_audio")
TYPING_NOISE_PATH = "%s/typing-noise.wav" % FILLER_AUDIO_PATH
def encode_as_wav(chunk: bytes, synthesizer_config: SynthesizerConfig) -> bytes:
    """Wrap a raw 16-bit linear PCM chunk in a mono WAV container.

    Raises AssertionError if the config's encoding is not LINEAR16, since the
    fixed 2-byte sample width below only describes that encoding.
    """
    # Validate before touching the writer.
    assert synthesizer_config.audio_encoding == AudioEncoding.LINEAR16
    output_bytes_io = io.BytesIO()
    # Context manager guarantees the writer is closed (and the header
    # finalized) even if a write fails.
    with wave.open(output_bytes_io, "wb") as in_memory_wav:
        in_memory_wav.setnchannels(1)
        in_memory_wav.setsampwidth(2)  # 16-bit samples
        in_memory_wav.setframerate(synthesizer_config.sampling_rate)
        in_memory_wav.writeframes(chunk)
    output_bytes_io.seek(0)
    return output_bytes_io.read()
class SynthesisResult:
    """Streamed output of synthesizing a single message.

    Pairs a generator of audio chunks with a callback that, given a number of
    seconds of playback, reports how much of the message text was spoken.
    """

    class ChunkResult:
        """One audio chunk plus a flag marking the end of the stream."""

        def __init__(self, chunk: bytes, is_last_chunk: bool):
            self.chunk, self.is_last_chunk = chunk, is_last_chunk

    def __init__(
        self,
        chunk_generator: Generator[ChunkResult, None, None],
        get_message_up_to: Callable[[int], str],
    ):
        # Stored as-is; consumers iterate chunk_generator and call
        # get_message_up_to(seconds) after playback stops.
        self.chunk_generator = chunk_generator
        self.get_message_up_to = get_message_up_to
class FillerAudio:
    """Pre-rendered audio (e.g. "Um...") played while the agent is thinking."""

    def __init__(
        self,
        message: BaseMessage,
        audio_data: bytes,
        synthesizer_config: SynthesizerConfig,
        is_interruptable: bool = False,
        seconds_per_chunk: int = 1,
    ):
        self.message = message
        self.audio_data = audio_data
        self.synthesizer_config = synthesizer_config
        self.is_interruptable = is_interruptable
        self.seconds_per_chunk = seconds_per_chunk

    def create_synthesis_result(self) -> SynthesisResult:
        """Wrap the cached audio in a SynthesisResult, chunked for streaming."""
        chunk_size = (
            get_chunk_size_per_second(
                self.synthesizer_config.audio_encoding,
                self.synthesizer_config.sampling_rate,
            )
            * self.seconds_per_chunk
        )

        def chunk_generator(chunk_transform=lambda x: x):
            audio = self.audio_data
            for i in range(0, len(audio), chunk_size):
                # BUG FIX: mark the final chunk even when len(audio) is an
                # exact multiple of chunk_size — previously only short tails
                # were flagged, so consumers could never see is_last_chunk.
                yield SynthesisResult.ChunkResult(
                    chunk_transform(audio[i : i + chunk_size]),
                    i + chunk_size >= len(audio),
                )

        if self.synthesizer_config.should_encode_as_wav:
            output_generator = chunk_generator(
                lambda chunk: encode_as_wav(chunk, self.synthesizer_config)
            )
        else:
            output_generator = chunk_generator()
        # Filler audio is static, so any cutoff maps to the whole phrase text.
        return SynthesisResult(output_generator, lambda seconds: self.message.text)
class BaseSynthesizer:
    """Common interface and shared helpers for all speech synthesizers."""

    def __init__(self, synthesizer_config: SynthesizerConfig):
        self.synthesizer_config = synthesizer_config
        if synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            assert (
                synthesizer_config.sampling_rate == 8000
            ), "MuLaw encoding only supports 8kHz sampling rate"
        # Populated lazily by set_filler_audios().
        self.filler_audios: list[FillerAudio] = []

    def get_synthesizer_config(self) -> SynthesizerConfig:
        return self.synthesizer_config

    def get_typing_noise_filler_audio(self) -> FillerAudio:
        """Return a keyboard-noise filler converted to the configured format."""
        return FillerAudio(
            message=BaseMessage(text="<typing noise>"),
            audio_data=convert_wav(
                TYPING_NOISE_PATH,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=self.synthesizer_config.audio_encoding,
            ),
            synthesizer_config=self.synthesizer_config,
            is_interruptable=True,
            seconds_per_chunk=2,
        )

    def set_filler_audios(self, filler_audio_config: FillerAudioConfig):
        """Load filler audio per config: spoken phrases or typing noise."""
        if filler_audio_config.use_phrases:
            self.filler_audios = self.get_phrase_filler_audios()
        elif filler_audio_config.use_typing_noise:
            self.filler_audios = [self.get_typing_noise_filler_audio()]

    def get_phrase_filler_audios(self) -> list[FillerAudio]:
        # Subclasses that support spoken fillers override this.
        return []

    def ready_synthesizer(self):
        # Hook for warming up connections; no-op by default.
        pass

    # given the number of seconds the message was allowed to go until, where did we get in the message?
    def get_message_cutoff_from_total_response_length(
        self, message: BaseMessage, seconds: int, size_of_output: int
    ) -> str:
        """Estimate the spoken prefix of `message`, assuming uniform
        seconds-per-character across the synthesized audio."""
        # BUG FIX: empty text or zero-length audio previously raised
        # ZeroDivisionError; there is nothing to cut off in either case.
        if not message.text or not size_of_output:
            return message.text
        estimated_output_seconds = (
            size_of_output / self.synthesizer_config.sampling_rate
        )
        estimated_output_seconds_per_char = estimated_output_seconds / len(message.text)
        return message.text[: int(seconds / estimated_output_seconds_per_char)]

    def get_message_cutoff_from_voice_speed(
        self, message: BaseMessage, seconds: int, words_per_minute: int
    ) -> str:
        """Estimate the spoken prefix of `message` from an assumed speaking rate."""
        words_per_second = words_per_minute / 60
        estimated_words_spoken = math.floor(words_per_second * seconds)
        tokens = word_tokenize(message.text)
        return TreebankWordDetokenizer().detokenize(tokens[:estimated_words_spoken])

    def get_maybe_cached_synthesis_result(
        self, message: BaseMessage, chunk_size: int
    ) -> Optional[SynthesisResult]:
        # No caching by default.
        return None

    # returns a chunk generator and a thunk that can tell you what part of the message was read given the number of seconds spoken
    # chunk generator must return tuple (bytes of size chunk_size, flag if it is the last chunk)
    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        raise NotImplementedError

View file

@ -0,0 +1,50 @@
from typing import Any, Optional
import os
from dotenv import load_dotenv
import requests
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
SynthesisResult,
)
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.message import BaseMessage
load_dotenv()  # pulls ELEVEN_LABS_API_KEY from a local .env
ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY")
ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
# Stock ElevenLabs voice IDs; ADAM is the default voice.
ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C"
class ElevenLabsSynthesizer(BaseSynthesizer):
    """Synthesizes speech via the ElevenLabs streaming text-to-speech API."""

    def __init__(self, config: ElevenLabsSynthesizerConfig):
        super().__init__(config)
        self.api_key = config.api_key
        self.voice_id = config.voice_id or ADAM_VOICE_ID
        # Rough speaking speed, usable for message-cutoff estimates.
        self.words_per_minute = 150

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Request streamed audio for `message` and yield it in `chunk_size` pieces."""
        # BUG FIX: validate the config BEFORE spending a network round-trip
        # (this assert previously ran after the request was already made).
        assert (
            not self.synthesizer_config.should_encode_as_wav
        ), "ElevenLabs does not support WAV encoding"
        url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}/stream"
        headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id}
        body = {
            "text": message.text,
        }
        response = requests.post(url, headers=headers, json=body)
        # BUG FIX: surface API errors instead of streaming an error payload
        # back to the caller as if it were audio.
        assert response.ok, response.text

        def chunk_generator(response):
            for chunk in response.iter_content(chunk_size=chunk_size):
                # NOTE(review): a short chunk flags end-of-stream; if the
                # content length is an exact multiple of chunk_size no chunk
                # is ever flagged — confirm consumers tolerate this.
                yield SynthesisResult.ChunkResult(chunk, len(chunk) != chunk_size)

        return SynthesisResult(chunk_generator(response), lambda seconds: message.text)

View file

@ -0,0 +1,110 @@
import io
import wave
from typing import Any, Optional
from dotenv import load_dotenv
from google.cloud import texttospeech_v1beta1 as tts
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
SynthesisResult,
encode_as_wav,
)
from vocode.streaming.models.synthesizer import GoogleSynthesizerConfig
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.utils import convert_wav
load_dotenv()
class GoogleSynthesizer(BaseSynthesizer):
    """Synthesizes speech via Google Cloud Text-to-Speech, then converts the
    result to the configured output sample rate and encoding."""

    # Seconds trimmed from both ends of the returned audio (drops padding).
    OFFSET_SECONDS = 0.5

    def __init__(self, synthesizer_config: GoogleSynthesizerConfig):
        super().__init__(synthesizer_config)
        # Instantiates a client
        self.client = tts.TextToSpeechClient()
        # Build the voice request, select the language code ("en-US") and the ssml
        # voice gender ("neutral")
        self.voice = tts.VoiceSelectionParams(
            language_code="en-US", name="en-US-Neural2-I"
        )
        # Select the type of audio file you want returned
        self.audio_config = tts.AudioConfig(
            audio_encoding=tts.AudioEncoding.LINEAR16,
            sample_rate_hertz=24000,
            speaking_rate=1.2,
            pitch=0,
            effects_profile_id=["telephony-class-application"],
        )

    def synthesize(self, message: str) -> tts.SynthesizeSpeechResponse:
        """Perform a single blocking TTS request for `message`."""
        synthesis_input = tts.SynthesisInput(text=message)
        # Perform the text-to-speech request on the text input with the selected
        # voice parameters and audio file type
        return self.client.synthesize_speech(
            request=tts.SynthesizeSpeechRequest(
                input=synthesis_input,
                voice=self.voice,
                audio_config=self.audio_config,
                enable_time_pointing=[
                    tts.SynthesizeSpeechRequest.TimepointType.SSML_MARK
                ],
            )
        )

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Synthesize `message` and stream it as `chunk_size` pieces."""
        response = self.synthesize(message.text)
        output_sample_rate = response.audio_config.sample_rate_hertz
        real_offset = int(GoogleSynthesizer.OFFSET_SECONDS * output_sample_rate)
        # Re-wrap the trimmed PCM in a WAV container so convert_wav can parse it.
        output_bytes_io = io.BytesIO()
        with wave.open(output_bytes_io, "wb") as in_memory_wav:
            in_memory_wav.setnchannels(1)
            in_memory_wav.setsampwidth(2)  # 16-bit PCM
            in_memory_wav.setframerate(output_sample_rate)
            in_memory_wav.writeframes(response.audio_content[real_offset:-real_offset])
        output_bytes_io.seek(0)
        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
            output_bytes = convert_wav(
                output_bytes_io,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.LINEAR16,
            )
        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            output_bytes = convert_wav(
                output_bytes_io,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.MULAW,
            )
        else:
            # Previously fell through with output_bytes unbound (NameError).
            raise ValueError(
                f"Unsupported audio encoding: {self.synthesizer_config.audio_encoding}"
            )
        if self.synthesizer_config.should_encode_as_wav:
            # BUG FIX: encode_as_wav requires the synthesizer config as its
            # second argument (was called with only one, raising TypeError).
            output_bytes = encode_as_wav(output_bytes, self.synthesizer_config)

        def chunk_generator(output_bytes):
            for i in range(0, len(output_bytes), chunk_size):
                if i + chunk_size > len(output_bytes):
                    yield SynthesisResult.ChunkResult(output_bytes[i:], True)
                else:
                    yield SynthesisResult.ChunkResult(
                        output_bytes[i : i + chunk_size], False
                    )

        return SynthesisResult(
            chunk_generator(output_bytes),
            lambda seconds: self.get_message_cutoff_from_total_response_length(
                message, seconds, len(output_bytes)
            ),
        )

View file

@ -0,0 +1,78 @@
import audioop
import base64
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.models.message import BaseMessage
from .base_synthesizer import BaseSynthesizer, SynthesisResult, encode_as_wav
from typing import Any, Optional
import os
import io
import wave
from dotenv import load_dotenv
import requests
from ..utils import convert_linear_audio, convert_wav
from ..models.synthesizer import ElevenLabsSynthesizerConfig, RimeSynthesizerConfig
load_dotenv()  # pulls Rime credentials/endpoint from a local .env
RIME_API_KEY = os.getenv("RIME_API_KEY")
RIME_BASE_URL = os.getenv("RIME_BASE_URL")
class RimeSynthesizer(BaseSynthesizer):
    """Synthesizes speech via the Rime HTTP API (which returns base64 WAV)."""

    def __init__(self, config: RimeSynthesizerConfig):
        super().__init__(config)
        self.speaker = config.speaker

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Request audio for `message` from Rime and stream it in `chunk_size` pieces."""
        url = RIME_BASE_URL
        headers = {"Authorization": f"Bearer {RIME_API_KEY}"}
        body = {"inputs": {"text": message.text, "speaker": self.speaker}}
        response = requests.post(url, headers=headers, json=body)
        assert response.ok, response.text
        data = response.json().get("data")
        assert data
        audio_file = io.BytesIO(base64.b64decode(data))
        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
            output_bytes = convert_wav(
                audio_file,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.LINEAR16,
            )
        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            output_bytes = convert_wav(
                audio_file,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.MULAW,
            )
        else:
            # Previously fell through with output_bytes unbound (NameError).
            raise ValueError(
                f"Unsupported audio encoding: {self.synthesizer_config.audio_encoding}"
            )

        def chunk_generator(audio, chunk_transform=lambda x: x):
            # A short final chunk carries the end-of-stream flag.
            for i in range(0, len(audio), chunk_size):
                chunk = audio[i : i + chunk_size]
                yield SynthesisResult.ChunkResult(
                    chunk_transform(chunk), len(chunk) != chunk_size
                )

        if self.synthesizer_config.should_encode_as_wav:
            # BUG FIX: encode_as_wav takes (chunk, synthesizer_config); passing
            # it bare as chunk_transform raised TypeError on the first chunk.
            output_generator = chunk_generator(
                output_bytes,
                chunk_transform=lambda chunk: encode_as_wav(
                    chunk, self.synthesizer_config
                ),
            )
        else:
            output_generator = chunk_generator(output_bytes)
        return SynthesisResult(
            output_generator,
            lambda seconds: self.get_message_cutoff_from_total_response_length(
                message, seconds, len(output_bytes)
            ),
        )
View file

View file

@ -0,0 +1,17 @@
import logging
import os
from typing import Optional
from redis import Redis
from vocode.streaming.models.telephony import CallConfig
class BaseConfigManager:
    """Abstract store mapping a conversation id to its CallConfig.

    Subclasses provide the persistence backend (e.g. Redis).
    """

    def save_config(self, conversation_id: str, config: CallConfig):
        """Persist `config` under `conversation_id`."""
        raise NotImplementedError

    def get_config(self, conversation_id) -> Optional[CallConfig]:
        """Return the stored CallConfig, or None if absent."""
        raise NotImplementedError

    def delete_config(self, conversation_id):
        """Remove any stored config for `conversation_id`."""
        raise NotImplementedError

View file

@ -0,0 +1,34 @@
import logging
import os
from typing import Optional
from redis import Redis
from vocode.streaming.models.telephony import CallConfig
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
class RedisConfigManager(BaseConfigManager):
    """Config manager that persists CallConfigs in Redis, keyed by conversation id."""

    def __init__(self, logger: Optional[logging.Logger] = None):
        # Connection details come from REDISHOST/REDISPORT (default localhost:6379).
        self.redis = Redis(
            host=os.environ.get("REDISHOST", "localhost"),
            port=int(os.environ.get("REDISPORT", 6379)),
            db=0,
            decode_responses=True,
        )
        self.logger = logger or logging.getLogger(__name__)

    def save_config(self, conversation_id: str, config: CallConfig):
        self.logger.debug(f"Saving config for {conversation_id}")
        self.redis.set(conversation_id, config.json())

    def get_config(self, conversation_id) -> Optional[CallConfig]:
        self.logger.debug(f"Getting config for {conversation_id}")
        raw_config = self.redis.get(conversation_id)
        if raw_config:
            # BUG FIX: reuse the value already fetched instead of issuing a
            # second (racy, redundant) Redis round-trip.
            return CallConfig.parse_raw(raw_config)
        return None

    def delete_config(self, conversation_id):
        self.logger.debug(f"Deleting config for {conversation_id}")
        self.redis.delete(conversation_id)

View file

@ -0,0 +1,5 @@
from vocode.streaming.models.audio_encoding import AudioEncoding
# Defaults matching Twilio media streams: 8 kHz mu-law audio.
DEFAULT_SAMPLING_RATE = 8000
DEFAULT_AUDIO_ENCODING = AudioEncoding.MULAW
# 20 frames of 160 bytes; at 8 kHz mu-law (1 byte/sample) this is 0.4s of audio.
DEFAULT_CHUNK_SIZE = 20 * 160

View file

@ -0,0 +1,170 @@
from fastapi import WebSocket
import base64
from enum import Enum
import json
import logging
from typing import Optional
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.factory import (
create_agent,
create_synthesizer,
create_transcriber,
)
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.streaming.models.telephony import CallConfig, TwilioConfig
from vocode.streaming.output_device.twilio_output_device import TwilioOutputDevice
from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
)
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
)
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
from vocode.streaming.telephony.twilio import create_twilio_client
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
class PhoneCallAction(Enum):
    """Control-flow signal returned by the websocket message handlers."""

    # Tells the receive loop to break and tear the call down.
    CLOSE_WEBSOCKET = 1
class Call(StreamingConversation):
    """A StreamingConversation bound to a live Twilio phone call, fed by a
    Twilio media-stream websocket (8 kHz mu-law audio both ways)."""

    def __init__(
        self,
        base_url: str,
        config_manager: BaseConfigManager,
        agent: BaseAgent,
        twilio_config: TwilioConfig,
        transcriber: Optional[BaseTranscriber] = None,
        synthesizer: Optional[BaseSynthesizer] = None,
        twilio_sid=None,
        conversation_id: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.base_url = base_url
        self.config_manager = config_manager
        self.output_device = TwilioOutputDevice()
        self.twilio_config = twilio_config
        self.twilio_client = create_twilio_client(twilio_config)
        # Defaults match Twilio media streams: 8kHz mu-law in/out.
        super().__init__(
            self.output_device,
            transcriber
            or DeepgramTranscriber(
                DeepgramTranscriberConfig(
                    sampling_rate=8000,
                    audio_encoding=AudioEncoding.MULAW,
                    # NOTE(review): CHUNK_SIZE is presumably defined on
                    # StreamingConversation — confirm.
                    chunk_size=self.CHUNK_SIZE,
                    model="voicemail",
                    endpointing_config=PunctuationEndpointingConfig(),
                ),
                logger=logger,
            ),
            agent,
            synthesizer
            or AzureSynthesizer(
                AzureSynthesizerConfig(
                    sampling_rate=8000, audio_encoding=AudioEncoding.MULAW
                )
            ),
            conversation_id=conversation_id,
            per_chunk_allowance_seconds=0.01,
            logger=logger,
        )
        self.twilio_sid = twilio_sid
        # Timestamp (ms) of the most recent media frame; used to detect gaps.
        self.latest_media_timestamp = 0

    @staticmethod
    def from_call_config(
        base_url: str,
        call_config: CallConfig,
        config_manager: BaseConfigManager,
        conversation_id: str,
        logger: logging.Logger,
    ):
        """Rehydrate a Call from a CallConfig persisted by the config manager."""
        return Call(
            base_url=base_url,
            logger=logger,
            config_manager=config_manager,
            agent=create_agent(call_config.agent_config),
            transcriber=create_transcriber(call_config.transcriber_config),
            synthesizer=create_synthesizer(call_config.synthesizer_config),
            twilio_config=call_config.twilio_config,
            twilio_sid=call_config.twilio_sid,
            conversation_id=conversation_id,
        )

    async def attach_ws_and_start(self, ws: WebSocket):
        """Bind the Twilio media-stream websocket and run the conversation
        until the call ends, then tear everything down."""
        self.logger.debug("Trying to attach WS to outbound call")
        self.output_device.ws = ws
        self.logger.debug("Attached WS to outbound call")
        twilio_call = self.twilio_client.calls(self.twilio_sid).fetch()
        # Hang up immediately if answering-machine detection flagged the call.
        if twilio_call.answered_by in ("machine_start", "fax"):
            self.logger.info(f"Call answered by {twilio_call.answered_by}")
            twilio_call.update(status="completed")
        else:
            await self.wait_for_twilio_start(ws)
            await super().start()
            while self.active:
                message = await ws.receive_text()
                response = await self.handle_ws_message(message)
                if response == PhoneCallAction.CLOSE_WEBSOCKET:
                    break
        self.tear_down()

    async def wait_for_twilio_start(self, ws: WebSocket):
        """Consume websocket messages until Twilio's 'start' event arrives,
        recording the stream SID needed to send audio back."""
        while True:
            message = await ws.receive_text()
            if not message:
                continue
            data = json.loads(message)
            if data["event"] == "start":
                self.logger.debug(
                    f"Media WS: Received event '{data['event']}': {message}"
                )
                self.output_device.stream_sid = data["start"]["streamSid"]
                break

    async def handle_ws_message(self, message) -> Optional[PhoneCallAction]:
        """Process one media-stream message.

        Returns CLOSE_WEBSOCKET on a 'stop' event or a None message;
        returns None (keep going) for 'media' events.
        """
        if message is None:
            return PhoneCallAction.CLOSE_WEBSOCKET
        data = json.loads(message)
        if data["event"] == "media":
            media = data["media"]
            chunk = base64.b64decode(media["payload"])
            # Frames carry millisecond timestamps; if more than one 20ms frame
            # was dropped, pad the gap with silence (8 bytes/ms at 8kHz mu-law).
            if self.latest_media_timestamp + 20 < int(media["timestamp"]):
                bytes_to_fill = 8 * (
                    int(media["timestamp"]) - (self.latest_media_timestamp + 20)
                )
                self.logger.debug(f"Filling {bytes_to_fill} bytes of silence")
                # NOTE: 0xff is silence for mulaw audio
                self.receive_audio(b"\xff" * bytes_to_fill)
            self.latest_media_timestamp = int(media["timestamp"])
            self.receive_audio(chunk)
        elif data["event"] == "stop":
            self.logger.debug(f"Media WS: Received event 'stop': {message}")
            self.logger.debug("Stopping...")
            return PhoneCallAction.CLOSE_WEBSOCKET

    def end_twilio_call(self) -> bool:
        """Ask Twilio to complete (hang up) the call; True if it reports completed."""
        response = self.twilio_client.calls(self.twilio_sid).update(status="completed")
        return response.status == "completed"

    def mark_terminated(self):
        # Besides flagging the conversation inactive, hang up the Twilio call
        # and drop the persisted config (self.id presumably comes from
        # StreamingConversation — confirm).
        super().mark_terminated()
        self.end_twilio_call()
        self.config_manager.delete_config(self.id)

    def tear_down(self):
        self.terminate()

View file

@ -0,0 +1,110 @@
import logging
from typing import Optional
from twilio.rest import Client
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
SynthesizerConfig,
)
from vocode.streaming.models.telephony import CallConfig, TwilioConfig
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
TranscriberConfig,
)
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
from vocode.streaming.telephony.constants import (
DEFAULT_AUDIO_ENCODING,
DEFAULT_CHUNK_SIZE,
DEFAULT_SAMPLING_RATE,
)
from vocode.streaming.telephony.twilio import create_twilio_client
from vocode.streaming.utils import create_conversation_id
class OutboundCall:
    """Creates and manages a Twilio outbound phone call driven by a Vocode agent."""

    def __init__(
        self,
        base_url: str,
        to_phone: str,
        from_phone: str,
        config_manager: BaseConfigManager,
        agent_config: AgentConfig,
        twilio_config: TwilioConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.base_url = base_url
        self.to_phone = to_phone
        self.from_phone = from_phone
        self.config_manager = config_manager
        self.agent_config = agent_config
        # Defaults match Twilio media streams (8kHz mu-law).
        self.transcriber_config = transcriber_config or DeepgramTranscriberConfig(
            sampling_rate=DEFAULT_SAMPLING_RATE,
            audio_encoding=DEFAULT_AUDIO_ENCODING,
            chunk_size=DEFAULT_CHUNK_SIZE,
            model="voicemail",
            endpointing_config=PunctuationEndpointingConfig(),
        )
        self.synthesizer_config = synthesizer_config or AzureSynthesizerConfig(
            sampling_rate=DEFAULT_SAMPLING_RATE, audio_encoding=DEFAULT_AUDIO_ENCODING
        )
        self.conversation_id = conversation_id or create_conversation_id()
        # BUG FIX: fall back to a module logger; previously a None logger
        # crashed start() at self.logger.debug(...).
        self.logger = logger or logging.getLogger(__name__)
        self.twilio_config = twilio_config
        self.twilio_client = create_twilio_client(twilio_config)
        # Set by start() once the call has been placed.
        self.twilio_sid = None

    def create_twilio_call(
        self, to_phone: str, from_phone: str, digits: str = ""
    ) -> str:
        """Place the call via Twilio, pointing it at our TwiML endpoint.

        Returns the Twilio call SID; `digits` are DTMF tones sent after connect.
        """
        twilio_call = self.twilio_client.calls.create(
            url=f"https://{self.base_url}/twiml/initiate_call/{self.conversation_id}",
            to=to_phone,
            from_=from_phone,
            send_digits=digits,
        )
        return twilio_call.sid

    def validate_outbound_call(
        self,
        to_phone: str,
        from_phone: str,
        mobile_only: bool = True,
    ):
        """Sanity-check the destination number; with mobile_only, use Twilio
        Lookup line-type intelligence to require a mobile line.

        Raises ValueError when the number is invalid or not a mobile phone.
        """
        if len(to_phone) < 8:
            raise ValueError("Invalid 'to' phone")
        if not mobile_only:
            return
        line_type_intelligence = (
            self.twilio_client.lookups.v2.phone_numbers(to_phone)
            .fetch(fields="line_type_intelligence")
            .line_type_intelligence
        )
        if not line_type_intelligence or (
            line_type_intelligence and line_type_intelligence["type"] != "mobile"
        ):
            raise ValueError("Can only call mobile phones")

    def start(self):
        """Validate, place the Twilio call, and persist the CallConfig so the
        webhook handler can rehydrate the conversation."""
        self.logger.debug("Starting outbound call")
        self.validate_outbound_call(self.to_phone, self.from_phone)
        self.twilio_sid = self.create_twilio_call(self.to_phone, self.from_phone)
        call_config = CallConfig(
            transcriber_config=self.transcriber_config,
            agent_config=self.agent_config,
            synthesizer_config=self.synthesizer_config,
            twilio_config=self.twilio_config,
            twilio_sid=self.twilio_sid,
        )
        self.config_manager.save_config(self.conversation_id, call_config)

    def end(self):
        """Hang up the active call; True when Twilio reports it completed."""
        response = self.twilio_client.calls(self.twilio_sid).update(status="completed")
        return response.status == "completed"

View file

@ -0,0 +1,73 @@
import logging
from typing import Optional
from twilio.rest import Client
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.telephony import CallConfig, TwilioConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
from vocode.streaming.telephony.conversation.outbound_call import OutboundCall
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
from vocode.streaming.utils import create_conversation_id
class ZoomDialIn(OutboundCall):
    """An OutboundCall that dials into a Zoom meeting via its phone bridge,
    entering the meeting id (and optional password) as DTMF digits."""

    def __init__(
        self,
        base_url: str,
        zoom_number: str,
        zoom_meeting_id: str,
        zoom_meeting_password: Optional[str],
        from_phone: str,
        config_manager: BaseConfigManager,
        twilio_config: TwilioConfig,
        agent_config: AgentConfig,
        transcriber_config: TranscriberConfig,
        synthesizer_config: SynthesizerConfig,
        conversation_id: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__(
            base_url=base_url,
            to_phone=zoom_number,
            from_phone=from_phone,
            config_manager=config_manager,
            transcriber_config=transcriber_config,
            agent_config=agent_config,
            synthesizer_config=synthesizer_config,
            twilio_config=twilio_config,
            conversation_id=conversation_id,
            logger=logger,
        )
        self.zoom_number = zoom_number
        self.zoom_meeting_id = zoom_meeting_id
        self.zoom_meeting_password = zoom_meeting_password
        self.from_phone = from_phone

    def start(self):
        """Validate the bridge number, dial in with join digits, and persist
        the call config for the webhook handler."""
        self.validate_outbound_call(
            self.zoom_number,
            self.from_phone,
            mobile_only=False,  # Zoom bridge numbers are not mobile lines
        )
        # In Twilio send_digits, each 'w' waits 0.5s: pause, meeting id, '#',
        # then optionally '*password#'.
        digits = f"ww{self.zoom_meeting_id}#"
        if self.zoom_meeting_password:
            digits += f"wwww*{self.zoom_meeting_password}#"
        self.logger.debug("Sending digits %s to the call", digits)
        # BUG FIX: store the SID on self (was a local variable), so the
        # inherited end() can actually hang up this call.
        self.twilio_sid = self.create_twilio_call(
            self.zoom_number,
            self.from_phone,
            digits=digits,
        )
        call_config = CallConfig(
            transcriber_config=self.transcriber_config,
            agent_config=self.agent_config,
            synthesizer_config=self.synthesizer_config,
            twilio_config=self.twilio_config,
            twilio_sid=self.twilio_sid,
        )
        self.config_manager.save_config(self.conversation_id, call_config)

View file

@ -0,0 +1,62 @@
from fastapi import FastAPI, Response, Form
from typing import Optional
import requests
import uvicorn
import vocode
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.telephony import (
CreateInboundCall,
TwilioConfig,
TwilioConfig,
)
class InboundCallServer:
    """A FastAPI app that forwards Twilio inbound-call webhooks to Vocode.

    Exposes POST /vocode, which relays the incoming CallSid (together with
    the configured agent/transcriber/synthesizer settings) to Vocode's hosted
    create_inbound_call endpoint and echoes the returned TwiML back to Twilio.
    """

    def __init__(
        self,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        response_on_rate_limit: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        self.agent_config = agent_config
        self.transcriber_config = transcriber_config
        self.synthesizer_config = synthesizer_config
        self.app = FastAPI()
        self.app.post("/vocode")(self.handle_call)
        # Spoken to the caller when the hosted backend returns HTTP 429.
        self.response_on_rate_limit = (
            response_on_rate_limit
            or "The line is really busy right now, check back later!"
        )
        self.twilio_config = twilio_config
        self.vocode_inbound_call_url = f"https://{vocode.base_url}/create_inbound_call"

    def handle_call(self, twilio_sid: str = Form(alias="CallSid")):
        """Relay the inbound call to Vocode and return Twilio-consumable TwiML.

        Deliberately a sync (non-async) endpoint: the blocking requests.post
        call would stall the event loop inside an `async def`; FastAPI runs
        plain `def` endpoints in a worker threadpool instead.
        """
        response = requests.post(
            self.vocode_inbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=CreateInboundCall(
                agent_config=self.agent_config,
                twilio_sid=twilio_sid,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        if response.status_code == 429:
            # Rate limited: speak the fallback message instead of connecting.
            return Response(
                f"<Response><Say>{self.response_on_rate_limit}</Say></Response>",
                media_type="application/xml",
            )
        assert response.ok, response.text
        # The hosted backend replies with TwiML; pass it straight through.
        return Response(
            response.text,
            media_type="application/xml",
        )

    def run(self, host="localhost", port=3000):
        """Serve the app with uvicorn (blocking)."""
        uvicorn.run(self.app, host=host, port=port)

View file

@ -0,0 +1,45 @@
from typing import Optional, Union
from vocode.streaming.models.telephony import TwilioConfig
from vocode.streaming.telephony.hosted.inbound_call_server import InboundCallServer
from vocode.streaming.models.agent import (
RESTfulAgentEnd,
RESTfulAgentInput,
RESTfulAgentText,
RESTfulUserImplementedAgentConfig,
)
from vocode.streaming.models.transcriber import (
TranscriberConfig,
)
from vocode.streaming.models.synthesizer import SynthesizerConfig
class InboundCallUserAgentServer(InboundCallServer):
    """Inbound-call server whose agent responses come from user code.

    Subclasses implement respond(); the /respond route exposes it over REST so
    the hosted Vocode backend can drive the conversation turn by turn.
    """

    def __init__(
        self,
        agent_config: RESTfulUserImplementedAgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        response_on_rate_limit: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        # Validate before the base class does any work. An `assert` would be
        # stripped under `python -O`, so raise explicitly instead.
        if not isinstance(agent_config, RESTfulUserImplementedAgentConfig):
            raise TypeError(
                "agent_config must be a RESTfulUserImplementedAgentConfig"
            )
        super().__init__(
            agent_config=agent_config,
            transcriber_config=transcriber_config,
            synthesizer_config=synthesizer_config,
            response_on_rate_limit=response_on_rate_limit,
            twilio_config=twilio_config,
        )
        self.app.post("/respond")(self.respond_rest)

    async def respond(
        self, human_input, conversation_id
    ) -> Union[RESTfulAgentText, RESTfulAgentEnd]:
        """Override with the user's agent logic; return text or an end signal."""
        raise NotImplementedError

    async def respond_rest(
        self, request: RESTfulAgentInput
    ) -> Union[RESTfulAgentText, RESTfulAgentEnd]:
        """REST wrapper that unpacks the request and delegates to respond()."""
        return await self.respond(request.human_input, request.conversation_id)

View file

@ -0,0 +1,68 @@
from typing import Optional
import requests
import vocode
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.telephony import (
CallEntity,
CreateOutboundCall,
EndOutboundCall,
TwilioConfig,
)
class OutboundCall:
    """Client for Vocode's hosted outbound-call API.

    start() asks the hosted backend to place a call from `caller` to
    `recipient`; end() tears the call down again.
    """

    def __init__(
        self,
        recipient: CallEntity,
        caller: CallEntity,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        self.recipient = recipient
        self.caller = caller
        self.agent_config = agent_config
        self.transcriber_config = transcriber_config
        self.synthesizer_config = synthesizer_config
        self.conversation_id = conversation_id
        self.twilio_config = twilio_config
        self.vocode_create_outbound_call_url = (
            f"https://{vocode.base_url}/create_outbound_call"
        )
        self.vocode_end_outbound_call_url = (
            f"https://{vocode.base_url}/end_outbound_call"
        )

    def start(self) -> None:
        """Place the call via the hosted API and record its conversation id.

        (Was annotated `-> str`, but nothing is returned: the id is stored on
        self.conversation_id instead.)
        """
        response = requests.post(
            self.vocode_create_outbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=CreateOutboundCall(
                recipient=self.recipient,
                caller=self.caller,
                agent_config=self.agent_config,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                conversation_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok, response.text
        data = response.json()
        self.conversation_id = data["id"]

    def end(self) -> None:
        """End the call via the hosted API; a 404 (call already gone) is tolerated."""
        response = requests.post(
            self.vocode_end_outbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=EndOutboundCall(
                call_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok or response.status_code == 404, response.text

View file

@ -0,0 +1,60 @@
from typing import Optional
import requests
import vocode
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.telephony.hosted.outbound_call import OutboundCall
from vocode.streaming.models.telephony import (
CallEntity,
DialIntoZoomCall,
TwilioConfig,
)
class ZoomDialIn(OutboundCall):
    """Hosted outbound call that dials into a Zoom meeting via Vocode's API."""

    def __init__(
        self,
        recipient: CallEntity,
        caller: CallEntity,
        zoom_meeting_id: str,
        zoom_meeting_password: str,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        super().__init__(
            recipient=recipient,
            caller=caller,
            agent_config=agent_config,
            transcriber_config=transcriber_config,
            synthesizer_config=synthesizer_config,
            conversation_id=conversation_id,
            twilio_config=twilio_config,
        )
        self.zoom_meeting_id = zoom_meeting_id
        self.zoom_meeting_password = zoom_meeting_password
        self.vocode_zoom_dial_in_url = f"https://{vocode.base_url}/dial_into_zoom_call"

    def start(self) -> None:
        """Ask the hosted API to dial into the meeting and record the call id.

        (Was annotated `-> str`, but nothing is returned: the id is stored on
        self.conversation_id instead.)
        """
        response = requests.post(
            self.vocode_zoom_dial_in_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=DialIntoZoomCall(
                recipient=self.recipient,
                caller=self.caller,
                zoom_meeting_id=self.zoom_meeting_id,
                zoom_meeting_password=self.zoom_meeting_password,
                agent_config=self.agent_config,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                conversation_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok, response.text
        data = response.json()
        self.conversation_id = data["id"]

View file

@ -0,0 +1,143 @@
import logging
from typing import Optional
from fastapi import APIRouter, Form, Response
from pydantic import BaseModel
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
SynthesizerConfig,
)
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
TranscriberConfig,
)
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
from vocode.streaming.telephony.constants import (
DEFAULT_AUDIO_ENCODING,
DEFAULT_CHUNK_SIZE,
DEFAULT_SAMPLING_RATE,
)
from vocode.streaming.telephony.server.router.calls import CallsRouter
from vocode.streaming.telephony.server.router.twiml import TwiMLRouter
from vocode.streaming.models.telephony import (
CallConfig,
CallEntity,
CreateOutboundCall,
CreateInboundCall,
DialIntoZoomCall,
EndOutboundCall,
TwilioConfig,
)
from twilio.rest import Client
from vocode.streaming.telephony.conversation.call import Call
from vocode.streaming.telephony.templates import Templater
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
from vocode.streaming.utils import create_conversation_id
class InboundCallConfig(BaseModel):
    """Declarative route config: serve inbound-call TwiML at `url` with these settings."""

    url: str  # FastAPI route path (e.g. "/inbound_call") that Twilio will POST to
    agent_config: AgentConfig
    twilio_config: TwilioConfig
    transcriber_config: Optional[TranscriberConfig] = None  # None -> default chosen in create_inbound_route
    synthesizer_config: Optional[SynthesizerConfig] = None  # None -> default chosen in create_inbound_route
class TelephonyServer:
    """Self-hosted telephony server.

    Wires the call-websocket router, the TwiML router, and any statically
    configured inbound-call endpoints onto a single APIRouter, and provides
    an endpoint-style helper to end outbound calls.
    """

    def __init__(
        self,
        base_url: str,
        config_manager: BaseConfigManager,
        inbound_call_configs: list[InboundCallConfig] = [],
        logger: Optional[logging.Logger] = None,
    ):
        # NOTE: the mutable default [] is shared across instances; it is only
        # read here, so it is harmless as long as callers never mutate it.
        self.base_url = base_url
        self.logger = logger or logging.getLogger(__name__)
        self.router = APIRouter()
        self.config_manager = config_manager
        self.templater = Templater()
        self.router.include_router(
            CallsRouter(
                base_url=base_url,
                templater=self.templater,
                config_manager=self.config_manager,
                logger=self.logger,
            ).get_router()
        )
        self.router.include_router(
            TwiMLRouter(
                base_url=base_url, templater=self.templater, logger=self.logger
            ).get_router()
        )
        for config in inbound_call_configs:
            self.router.add_api_route(
                config.url,
                self.create_inbound_route(
                    agent_config=config.agent_config,
                    twilio_config=config.twilio_config,
                    transcriber_config=config.transcriber_config,
                    synthesizer_config=config.synthesizer_config,
                ),
                methods=["POST"],
            )
            # Use self.logger: the raw `logger` parameter may be None and
            # previously crashed here with an AttributeError.
            self.logger.info(
                f"Set up inbound call TwiML at https://{base_url}{config.url}"
            )

    def create_inbound_route(
        self,
        agent_config: AgentConfig,
        twilio_config: TwilioConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
    ):
        """Build a POST handler that registers a new call and returns its TwiML."""

        def route(twilio_sid: str = Form(alias="CallSid")) -> Response:
            # Fall back to sane telephony defaults when no configs are given.
            call_config = CallConfig(
                transcriber_config=transcriber_config
                or DeepgramTranscriberConfig(
                    sampling_rate=DEFAULT_SAMPLING_RATE,
                    audio_encoding=DEFAULT_AUDIO_ENCODING,
                    chunk_size=DEFAULT_CHUNK_SIZE,
                    model="voicemail",
                    endpointing_config=PunctuationEndpointingConfig(),
                ),
                agent_config=agent_config,
                synthesizer_config=synthesizer_config
                or AzureSynthesizerConfig(
                    sampling_rate=DEFAULT_SAMPLING_RATE,
                    audio_encoding=DEFAULT_AUDIO_ENCODING,
                ),
                twilio_config=twilio_config,
                twilio_sid=twilio_sid,
            )
            conversation_id = create_conversation_id()
            self.config_manager.save_config(conversation_id, call_config)
            return self.templater.get_connection_twiml(
                base_url=self.base_url, call_id=conversation_id
            )

        return route

    async def end_outbound_call(self, conversation_id: str):
        """End the Twilio call backing `conversation_id`; raises if unknown."""
        # TODO validation via twilio_client
        call_config = self.config_manager.get_config(conversation_id)
        if not call_config:
            raise ValueError("Call not found")
        call = Call.from_call_config(
            self.base_url,
            call_config,
            self.config_manager,
            conversation_id,
            self.logger,
        )
        call.end_twilio_call()
        return {"id": call.id}

    def get_router(self) -> APIRouter:
        """Return the assembled APIRouter for inclusion in a FastAPI app."""
        return self.router

View file

@ -0,0 +1,45 @@
from typing import Optional
import logging
from fastapi import APIRouter, HTTPException, WebSocket
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
from vocode.streaming.telephony.conversation.call import Call
from vocode.streaming.telephony.templates import Templater
class CallsRouter:
    """Router exposing the websocket endpoint that carries a call's audio stream."""

    def __init__(
        self,
        base_url: str,
        templater: Templater,
        config_manager: BaseConfigManager,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__()
        self.logger = logger or logging.getLogger(__name__)
        self.base_url = base_url
        self.config_manager = config_manager
        self.templater = templater
        self.router = APIRouter()
        self.router.websocket("/connect_call/{id}")(self.connect_call)

    async def connect_call(self, websocket: WebSocket, id: str):
        """Accept Twilio's media websocket, run the call, then delete its config."""
        await websocket.accept()
        self.logger.debug("Phone WS connection opened for chat {}".format(id))
        config = self.config_manager.get_config(id)
        if not config:
            raise HTTPException(status_code=400, detail="No active phone call")
        active_call: Call = Call.from_call_config(
            self.base_url, config, self.config_manager, id, self.logger
        )
        await active_call.attach_ws_and_start(websocket)
        self.config_manager.delete_config(active_call.id)
        self.logger.debug("Phone WS connection closed for chat {}".format(id))

    def get_router(self) -> APIRouter:
        """Return the underlying APIRouter for inclusion in an app."""
        return self.router

View file

@ -0,0 +1,29 @@
import logging
from typing import Optional
from fastapi import APIRouter
from vocode.streaming.telephony.templates import Templater
class TwiMLRouter:
    """Router serving the TwiML that tells Twilio to stream a call to us."""

    def __init__(
        self,
        base_url: str,
        templater: Templater,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__()
        self.logger = logger or logging.getLogger(__name__)
        self.templater = templater
        self.base_url = base_url
        self.router = APIRouter()
        self.router.add_api_route(
            "/twiml/initiate_call/{id}", self.call_twiml, methods=["POST"]
        )

    def call_twiml(self, id: str):
        """Return the <Connect><Stream> TwiML for the call with this id."""
        return self.templater.get_connection_twiml(base_url=self.base_url, call_id=id)

    def get_router(self) -> APIRouter:
        """Return the underlying APIRouter for inclusion in an app."""
        return self.router

View file

@ -0,0 +1,20 @@
import os
from jinja2 import Environment, FileSystemLoader
from fastapi import Response
class Templater:
    """Renders the Jinja templates (TwiML documents) shipped with this module."""

    def __init__(self):
        # Templates live in the "templates/" directory next to this file.
        self.templates = Environment(
            loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates/")
        )

    def render_template(self, template_name: str, **kwargs):
        """Render `template_name` with the given variables and return the text."""
        return self.templates.get_template(template_name).render(**kwargs)

    def get_connection_twiml(self, call_id: str, base_url: str):
        """Build the <Connect><Stream> TwiML response pointing at our websocket."""
        body = self.render_template("connect_call.xml", base_url=base_url, id=call_id)
        return Response(body, media_type="application/xml")

View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Connect>
<Stream url="wss://{{ base_url }}/connect_call/{{ id }}" />
</Connect>
</Response>

View file

@ -0,0 +1,12 @@
import os
from typing import Optional
from dotenv import load_dotenv
from twilio.rest import Client
from vocode.streaming.models.telephony import TwilioConfig
load_dotenv()
def create_twilio_client(twilio_config: TwilioConfig):
    """Build a Twilio REST client from the config's account SID and auth token."""
    account_sid = twilio_config.account_sid
    auth_token = twilio_config.auth_token
    return Client(account_sid, auth_token)

View file

@ -0,0 +1,101 @@
import asyncio
import json
import logging
import os
from dotenv import load_dotenv
import websockets
from urllib.parse import urlencode
from vocode.streaming.models.transcriber import AssemblyAITranscriberConfig
from vocode.streaming.models.websocket import AudioMessage
from vocode.streaming.transcriber.base_transcriber import (
BaseTranscriber,
Transcription,
)
from vocode.streaming.models.audio_encoding import AudioEncoding
load_dotenv()
ASSEMBLY_AI_API_KEY = os.environ.get("ASSEMBLY_AI_API_KEY")
ASSEMBLY_AI_URL = "wss://api.assemblyai.com/v2/realtime/ws"
class AssemblyAITranscriber(BaseTranscriber):
    """Streams audio to AssemblyAI's realtime websocket API and emits Transcriptions."""

    def __init__(
        self,
        transcriber_config: AssemblyAITranscriberConfig,
        logger: logging.Logger = None,
    ):
        super().__init__(transcriber_config)
        self._ended = False
        self.is_ready = False
        self.logger = logger or logging.getLogger(__name__)
        # Feature parity with the other transcribers is incomplete: neither
        # model warmup nor endpointing is supported for AssemblyAI yet.
        if self.transcriber_config.should_warmup_model:
            raise Exception("AssemblyAI model warmup not supported yet")
        elif self.transcriber_config.endpointing_config:
            raise Exception("Assembly AI endpointing config not supported yet")

    async def ready(self):
        """Always ready: no warmup handshake is performed for AssemblyAI."""
        # while not self.warmed_up:
        #     await asyncio.sleep(0.1)
        # return self.is_ready
        return True

    async def run(self):
        """Entry point: run the websocket session until terminated."""
        await self.process()

    def send_audio(self, chunk):
        """Queue raw audio bytes for the sender coroutine.

        NOTE(review): audio_queue is created inside process(), so calling this
        before run()/process() raises AttributeError — confirm caller ordering.
        """
        self.audio_queue.put_nowait(chunk)

    def terminate(self):
        """Queue the session-terminate message and stop both coroutines."""
        terminate_msg = json.dumps({"terminate_session": True})
        self.audio_queue.put_nowait(terminate_msg)
        self._ended = True

    def get_assembly_ai_url(self):
        """Realtime endpoint URL parameterized with this transcriber's sample rate."""
        return ASSEMBLY_AI_URL + f"?sample_rate={self.transcriber_config.sampling_rate}"

    async def process(self):
        """Open the websocket and pump audio out / transcripts in concurrently."""
        self.audio_queue = asyncio.Queue()
        URL = self.get_assembly_ai_url()
        async with websockets.connect(
            URL,
            extra_headers=(("Authorization", ASSEMBLY_AI_API_KEY),),
            ping_interval=5,
            ping_timeout=20,
        ) as ws:
            # Give the connection a beat to settle before streaming audio.
            await asyncio.sleep(0.1)

            async def sender(ws):  # sends audio to websocket
                while not self._ended:
                    try:
                        # 5s without audio ends the sender loop.
                        data = await asyncio.wait_for(self.audio_queue.get(), 5)
                    except asyncio.exceptions.TimeoutError:
                        break
                    await ws.send(
                        json.dumps({"audio_data": AudioMessage.from_bytes(data).data})
                    )
                self.logger.debug("Terminating AssemblyAI transcriber sender")

            async def receiver(ws):
                while not self._ended:
                    try:
                        result_str = await ws.recv()
                    except websockets.exceptions.ConnectionClosedError as e:
                        self.logger.debug(e)
                        break
                    except Exception as e:
                        # Any non-close failure is treated as fatal here.
                        assert False, "Not a websocket 4008 error"
                    data = json.loads(result_str)
                    # AssemblyAI marks completed utterances as "FinalTranscript".
                    is_final = (
                        "message_type" in data
                        and data["message_type"] == "FinalTranscript"
                    )
                    if "text" in data and data["text"]:
                        await self.on_response(
                            Transcription(data["text"], data["confidence"], is_final)
                        )

            await asyncio.gather(sender(ws), receiver(ws))

View file

@ -0,0 +1,59 @@
from dotenv import load_dotenv
from typing import Callable, Optional, Awaitable
from vocode.streaming.utils import convert_wav
from vocode.streaming.models.transcriber import EndpointingConfig, TranscriberConfig
load_dotenv()
class Transcription:
    """A single transcription result produced by a speech-to-text provider."""

    def __init__(
        self,
        message: str,
        confidence: float,
        is_final: bool,
        is_interrupt: bool = False,
    ):
        # is_interrupt marks transcriptions that arrived while the bot was speaking.
        self.is_interrupt = is_interrupt
        self.is_final = is_final
        self.confidence = confidence
        self.message = message

    def __str__(self):
        """Human-readable summary (is_interrupt intentionally omitted)."""
        return "Transcription({}, {}, {})".format(
            self.message, self.confidence, self.is_final
        )
class BaseTranscriber:
    """Abstract base for streaming transcribers.

    Concrete subclasses push Transcription objects to the awaitable callback
    registered via set_on_response(). The base implementations of ready/run/
    send_audio/terminate are no-ops to be overridden as needed.
    """

    def __init__(
        self,
        transcriber_config: TranscriberConfig,
    ):
        self.transcriber_config = transcriber_config
        # Awaitable callback invoked with each Transcription; set later via
        # set_on_response().
        self.on_response: Optional[Callable[[Transcription], Awaitable]] = None

    def get_transcriber_config(self) -> TranscriberConfig:
        """Return the config this transcriber was constructed with."""
        return self.transcriber_config

    def set_on_response(self, on_response: Callable[[Transcription], Awaitable]):
        """Register the awaitable callback that receives Transcription results."""
        self.on_response = on_response

    def get_warmup_bytes(self, file: str = "convo/audio/ajay.wav") -> bytes:
        """Load warmup audio and convert it to this transcriber's rate/encoding.

        The previously hard-coded sample path is kept as the default for
        backward compatibility; pass `file` to warm up with different audio.
        """
        sampling_rate = self.transcriber_config.sampling_rate
        return convert_wav(
            file,
            sampling_rate,
            self.transcriber_config.audio_encoding,
        )

    async def ready(self):
        """Whether the transcriber is ready to receive audio; override as needed."""
        return True

    async def run(self):
        """Main processing loop; no-op in the base class."""
        pass

    def send_audio(self, chunk):
        """Feed a chunk of raw audio; no-op in the base class."""
        pass

    def terminate(self):
        """Shut down the transcriber; no-op in the base class."""
        pass

View file

@ -0,0 +1,230 @@
import asyncio
import json
import logging
import os
from dotenv import load_dotenv
import websockets
from websockets.client import WebSocketClientProtocol
import audioop
from urllib.parse import urlencode
from vocode.streaming.transcriber.base_transcriber import (
BaseTranscriber,
Transcription,
)
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
EndpointingConfig,
EndpointingType,
)
from vocode.streaming.models.audio_encoding import AudioEncoding
load_dotenv()
DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY")
PUNCTUATION_TERMINATORS = [".", "!", "?"]
NUM_RESTARTS = 5
class DeepgramTranscriber(BaseTranscriber):
    """Streams audio to Deepgram's realtime websocket API and emits Transcriptions.

    Endpointing (deciding that the speaker has finished) is driven by
    transcriber_config.endpointing_config: time-based, punctuation-based, or —
    when no config is given — Deepgram's own speech_final flag.
    """

    def __init__(
        self,
        transcriber_config: DeepgramTranscriberConfig,
        logger: logging.Logger = None,
    ):
        super().__init__(transcriber_config)
        self.transcriber_config = transcriber_config
        self._ended = False
        self.warmed_up = False
        self.is_ready = False
        self.logger = logger or logging.getLogger(__name__)

    def create_warmup_chunks(self):
        """Split the warmup audio into chunk_size pieces (the tail is discarded)."""
        warmup_chunks = []
        warmup_bytes = self.get_warmup_bytes()
        chunk_size = self.transcriber_config.chunk_size
        for i in range(len(warmup_bytes) // chunk_size):
            warmup_chunks.append(warmup_bytes[i * chunk_size : (i + 1) * chunk_size])
        return warmup_chunks

    async def ready(self):
        """Block until warmup completes, then report readiness."""
        while not self.warmed_up:
            await asyncio.sleep(0.1)
        return self.is_ready

    async def run(self):
        """Run the websocket session, reconnecting up to NUM_RESTARTS times."""
        # warmup_chunks = await self.create_warmup_chunks()
        restarts = 0
        while not self._ended and restarts < NUM_RESTARTS:
            await self.process(self.transcriber_config.should_warmup_model)
            restarts += 1
            self.logger.debug(
                "Deepgram connection died, restarting, num_restarts: %s", restarts
            )

    def send_audio(self, chunk):
        """Queue an audio chunk, downsampling LINEAR16 first when configured.

        NOTE(review): audio_queue is created in process(), so calling this
        before run()/process() raises AttributeError — confirm caller ordering.
        """
        if (
            self.transcriber_config.downsampling
            and self.transcriber_config.audio_encoding == AudioEncoding.LINEAR16
        ):
            # ratecv from (sampling_rate * downsampling) down to sampling_rate.
            # State is not carried across chunks (None), so chunk boundaries may
            # have slight resampling artifacts.
            chunk, _ = audioop.ratecv(
                chunk,
                2,
                1,
                self.transcriber_config.sampling_rate
                * self.transcriber_config.downsampling,
                self.transcriber_config.sampling_rate,
                None,
            )
        self.audio_queue.put_nowait(chunk)

    def terminate(self):
        """Ask Deepgram to close the stream and stop the send/receive loops."""
        terminate_msg = json.dumps({"type": "CloseStream"})
        self.audio_queue.put_nowait(terminate_msg)
        self._ended = True

    def get_deepgram_url(self):
        """Build the wss:// listen URL with encoding, model, tier and endpointing params.

        NOTE(review): `encoding` is only assigned for LINEAR16/MULAW — any other
        audio encoding raises UnboundLocalError here.
        """
        if self.transcriber_config.audio_encoding == AudioEncoding.LINEAR16:
            encoding = "linear16"
        elif self.transcriber_config.audio_encoding == AudioEncoding.MULAW:
            encoding = "mulaw"
        url_params = {
            "encoding": encoding,
            "sample_rate": self.transcriber_config.sampling_rate,
            "channels": 1,
            "interim_results": "true",
        }
        extra_params = {}
        if self.transcriber_config.model:
            extra_params["model"] = self.transcriber_config.model
        if self.transcriber_config.tier:
            extra_params["tier"] = self.transcriber_config.tier
        if self.transcriber_config.version:
            extra_params["version"] = self.transcriber_config.version
        if (
            self.transcriber_config.endpointing_config
            and self.transcriber_config.endpointing_config.type
            == EndpointingType.PUNCTUATION_BASED
        ):
            # Punctuation-based endpointing needs Deepgram to punctuate output.
            extra_params["punctuate"] = "true"
        url_params.update(extra_params)
        return f"wss://api.deepgram.com/v1/listen?{urlencode(url_params)}"

    def is_speech_final(
        self, current_buffer: str, deepgram_response: dict, time_silent: float
    ):
        """Decide whether the utterance buffered so far should be flushed as final."""
        transcript = deepgram_response["channel"]["alternatives"][0]["transcript"]
        # if it is not time based, then return true if speech is final and there is a transcript
        if not self.transcriber_config.endpointing_config:
            return transcript and deepgram_response["speech_final"]
        elif (
            self.transcriber_config.endpointing_config.type
            == EndpointingType.TIME_BASED
        ):
            # if it is time based, then return true if there is no transcript
            # and there is some speech to send
            # and the time_silent is greater than the cutoff
            return (
                not transcript
                and current_buffer
                and (time_silent + deepgram_response["duration"])
                > self.transcriber_config.endpointing_config.time_cutoff_seconds
            )
        elif (
            self.transcriber_config.endpointing_config.type
            == EndpointingType.PUNCTUATION_BASED
        ):
            # Final either when Deepgram says so AND the transcript ends with
            # terminal punctuation, or on a time-based fallback during silence.
            return (
                transcript
                and deepgram_response["speech_final"]
                and transcript.strip()[-1] in PUNCTUATION_TERMINATORS
            ) or (
                not transcript
                and current_buffer
                and (time_silent + deepgram_response["duration"])
                > self.transcriber_config.endpointing_config.time_cutoff_seconds
            )
        raise Exception("Endpointing config not supported")

    def calculate_time_silent(self, data: dict):
        """Seconds of trailing silence in this response (duration if no words)."""
        end = data["start"] + data["duration"]
        words = data["channel"]["alternatives"][0]["words"]
        if words:
            return end - words[-1]["end"]
        return data["duration"]

    async def process(self, warmup=True):
        """Run one websocket session: optional warmup, then concurrent send/receive."""
        extra_headers = {"Authorization": f"Token {DEEPGRAM_API_KEY}"}
        self.audio_queue = asyncio.Queue()
        async with websockets.connect(
            self.get_deepgram_url(), extra_headers=extra_headers
        ) as ws:

            async def warmup_sender(ws: WebSocketClientProtocol):
                # Streams sample audio through the model, then flags readiness;
                # receiver drops transcripts until warmed_up is set.
                if warmup:
                    warmup_chunks = self.create_warmup_chunks()
                    for chunk in warmup_chunks:
                        await ws.send(chunk)
                    await asyncio.sleep(5)
                self.warmed_up = True
                self.is_ready = True

            async def sender(ws: WebSocketClientProtocol):  # sends audio to websocket
                while not self._ended:
                    try:
                        # 5s without audio ends the sender loop.
                        data = await asyncio.wait_for(self.audio_queue.get(), 5)
                    except asyncio.exceptions.TimeoutError:
                        break
                    await ws.send(data)
                self.logger.debug("Terminating Deepgram transcriber sender")

            async def receiver(ws: WebSocketClientProtocol):
                # buffer accumulates interim text until an endpoint is detected;
                # time_silent tracks trailing silence for time-based endpointing.
                buffer = ""
                time_silent = 0
                while not self._ended:
                    try:
                        msg = await ws.recv()
                    except Exception as e:
                        self.logger.debug(f"Got error {e} in Deepgram receiver")
                        break
                    data = json.loads(msg)
                    if (
                        not "is_final" in data
                    ):  # means we've finished receiving transcriptions
                        break
                    is_final = data["is_final"]
                    speech_final = self.is_speech_final(buffer, data, time_silent)
                    top_choice = data["channel"]["alternatives"][0]
                    confidence = top_choice["confidence"]
                    if (
                        top_choice["transcript"]
                        and confidence > 0.0
                        and self.warmed_up
                        and is_final
                    ):
                        # Finalized fragment: append to the utterance buffer.
                        buffer = f"{buffer} {top_choice['transcript']}"
                    if speech_final:
                        # Utterance complete: flush the buffer as final.
                        await self.on_response(Transcription(buffer, confidence, True))
                        buffer = ""
                        time_silent = 0
                    elif (
                        top_choice["transcript"] and confidence > 0.0 and self.warmed_up
                    ):
                        # Interim result: surface the buffer as non-final.
                        await self.on_response(
                            Transcription(
                                buffer,
                                confidence,
                                False,
                            )
                        )
                        time_silent = self.calculate_time_silent(data)
                    else:
                        # Silence: keep accumulating its duration.
                        time_silent += data["duration"]
                self.logger.debug("Terminating Deepgram transcriber receiver")

            await asyncio.gather(warmup_sender(ws), sender(ws), receiver(ws))

View file

@ -0,0 +1,145 @@
import asyncio
import time
import queue
from google.cloud import speech
import threading
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.transcriber.base_transcriber import (
BaseTranscriber,
Transcription,
)
from vocode.streaming.models.transcriber import GoogleTranscriberConfig
from vocode.streaming.utils import create_loop_in_thread
class GoogleTranscriber(BaseTranscriber):
    """Streams audio to Google Cloud Speech-to-Text from a dedicated thread.

    The Google streaming client is blocking, so processing runs on its own
    thread (with a private event loop driven by create_loop_in_thread); audio
    is handed over through a thread-safe queue.
    """

    def __init__(self, transcriber_config: GoogleTranscriberConfig):
        super().__init__(transcriber_config)
        self._queue = queue.Queue()
        self._ended = False
        self.google_streaming_config = self.create_google_streaming_config()
        self.client = speech.SpeechClient()
        self.warmed_up = False
        self.is_ready = False
        if self.transcriber_config.endpointing_config:
            raise Exception("Google endpointing config not supported yet")
        # process() runs on this dedicated thread/loop; started from run().
        self.event_loop = asyncio.new_event_loop()
        self.thread = threading.Thread(
            name="google_transcriber",
            target=create_loop_in_thread,
            args=(self.event_loop, self.process()),
        )

    def create_google_streaming_config(self):
        """Translate our transcriber config into Google's streaming config.

        NOTE(review): encodings other than LINEAR16/MULAW leave
        google_audio_encoding unbound and raise UnboundLocalError below.
        """
        extra_params = {}
        if self.transcriber_config.model:
            extra_params["model"] = self.transcriber_config.model
            extra_params["use_enhanced"] = True
        if self.transcriber_config.audio_encoding == AudioEncoding.LINEAR16:
            google_audio_encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
        elif self.transcriber_config.audio_encoding == AudioEncoding.MULAW:
            google_audio_encoding = speech.RecognitionConfig.AudioEncoding.MULAW
        return speech.StreamingRecognitionConfig(
            config=speech.RecognitionConfig(
                encoding=google_audio_encoding,
                sample_rate_hertz=self.transcriber_config.sampling_rate,
                language_code="en-US",
                **extra_params
            ),
            interim_results=True,
        )

    async def ready(self):
        """Wait for warmup (if enabled), then report readiness."""
        if not self.transcriber_config.should_warmup_model:
            return True
        while not self.warmed_up:
            await asyncio.sleep(0.1)
        return self.is_ready

    def warmup(self):
        """Prime the model by streaming the warmup audio once (blocking)."""
        warmup_bytes = self.get_warmup_bytes()

        def stream():
            # One-second chunks: sampling_rate samples * 2 bytes per sample.
            chunk_size = self.transcriber_config.sampling_rate * 2
            for i in range(len(warmup_bytes) // chunk_size):
                yield speech.StreamingRecognizeRequest(
                    audio_content=warmup_bytes[i * chunk_size : (i + 1) * chunk_size]
                )
                time.sleep(0.01)

        # Drain the responses; we only care about exercising the model.
        for _ in self.client.streaming_recognize(
            self.google_streaming_config, stream()
        ):
            pass
        self.warmed_up = True
        self.is_ready = True

    async def run(self):
        """Start the background transcription thread."""
        self.thread.start()

    async def process(self):
        """Thread body: optional warmup, then stream queued audio to Google."""
        if self.transcriber_config.should_warmup_model:
            self.warmup()
        stream = self.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in stream
        )
        responses = self.client.streaming_recognize(
            self.google_streaming_config, requests
        )
        await self.process_responses_loop(responses)

    def terminate(self):
        """Signal the generator/response loops to stop."""
        self._ended = True

    def send_audio(self, chunk: bytes):
        """Hand an audio chunk to the background thread (non-blocking put)."""
        self._queue.put(chunk, block=False)

    async def process_responses_loop(self, responses):
        """Forward each Google response until the stream ends or we terminate."""
        for response in responses:
            await self._on_response(response)
            if self._ended:
                break

    async def _on_response(self, response):
        """Convert a Google response's top alternative into a Transcription."""
        if not response.results:
            return
        result = response.results[0]
        if not result.alternatives:
            return
        top_choice = result.alternatives[0]
        message = top_choice.transcript
        confidence = top_choice.confidence
        return await self.on_response(
            Transcription(message, confidence, result.is_final)
        )

    def generator(self):
        """Yield concatenated buffered audio chunks until terminated."""
        while not self._ended:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._queue.get()
            if chunk is None:
                return
            data = [chunk]
            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._queue.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break
            yield b"".join(data)

View file

@ -0,0 +1,63 @@
import asyncio
import audioop
import secrets
from typing import Any
import wave
from ..models.audio_encoding import AudioEncoding
def create_loop_in_thread(loop: asyncio.AbstractEventLoop, long_running_task=None):
    """Install `loop` as this thread's event loop and drive it.

    If `long_running_task` is given, run the loop until that awaitable
    completes; otherwise run the loop forever. Intended as a Thread target.
    """
    asyncio.set_event_loop(loop)
    if long_running_task is None:
        loop.run_forever()
    else:
        loop.run_until_complete(long_running_task)
def convert_linear_audio(
    raw_wav: bytes,
    input_sample_rate=24000,
    output_sample_rate=8000,
    output_encoding=AudioEncoding.LINEAR16,
    output_sample_width=2,
):
    """Resample 16-bit linear PCM and optionally transcode it to mulaw.

    Args:
        raw_wav: raw 16-bit little-endian PCM frames.
        input_sample_rate: sample rate of raw_wav.
        output_sample_rate: desired output sample rate.
        output_encoding: AudioEncoding.LINEAR16 or AudioEncoding.MULAW.
        output_sample_width: sample width in bytes used for mulaw conversion.

    Raises:
        Exception: if output_encoding is unsupported (previously this case
            silently returned None).
    """
    # downsample
    if input_sample_rate != output_sample_rate:
        raw_wav, _ = audioop.ratecv(
            raw_wav, 2, 1, input_sample_rate, output_sample_rate, None
        )
    if output_encoding == AudioEncoding.LINEAR16:
        return raw_wav
    elif output_encoding == AudioEncoding.MULAW:
        return audioop.lin2ulaw(raw_wav, output_sample_width)
    # Fail loudly instead of returning None, matching get_chunk_size_per_second.
    raise Exception("Unsupported audio encoding")
def convert_wav(
    file: Any,
    output_sample_rate=8000,
    output_encoding=AudioEncoding.LINEAR16,
):
    """Read a WAV file (path or file-like) and convert its frames to the
    requested sample rate and encoding via convert_linear_audio."""
    with wave.open(file, "rb") as wav:
        frames = wav.readframes(wav.getnframes())
        source_rate = wav.getframerate()
        source_width = wav.getsampwidth()
    return convert_linear_audio(
        frames,
        input_sample_rate=source_rate,
        output_sample_rate=output_sample_rate,
        output_encoding=output_encoding,
        output_sample_width=source_width,
    )
def get_chunk_size_per_second(audio_encoding: AudioEncoding, sampling_rate: int) -> int:
    """Bytes of audio per second: 2 bytes/sample for LINEAR16, 1 for MULAW."""
    if audio_encoding == AudioEncoding.MULAW:
        return sampling_rate
    if audio_encoding == AudioEncoding.LINEAR16:
        return sampling_rate * 2
    raise Exception("Unsupported audio encoding")
def create_conversation_id() -> str:
    """Generate a random, URL-safe identifier for a conversation."""
    token = secrets.token_urlsafe(16)
    return token

View file

@ -0,0 +1,102 @@
import os
import asyncio
import openai
from dotenv import load_dotenv
import numpy as np
import requests
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
PLATFORM = "pyq" if os.getenv("USE_PYQ_EMBEDDINGS", "false") == "true" else "openai"
SIMILARITY_THRESHOLD = 0.9
SIMILARITY_THRESHOLD_PYQ = 0.7
EMBEDDING_SIZE = 1536
PYQ_EMBEDDING_SIZE = 768
GOODBYE_PHRASES = [
"bye",
"goodbye",
"see you",
"see you later",
"talk to you later",
"talk to you soon",
"have a good day",
"have a good night",
]
PYQ_API_URL = "https://embeddings.pyqai.com"
class GoodbyeModel:
    """Detects whether an utterance is a goodbye via embedding similarity.

    Embeddings for GOODBYE_PHRASES are computed once per provider (OpenAI or
    Pyq) and cached on disk as .npy files under embeddings_cache_path.
    """

    def __init__(
        self,
        embeddings_cache_path=os.path.join(
            os.path.dirname(__file__), "goodbye_embeddings"
        ),
    ):
        # Build each cache with its own provider's embeddings. (Previously
        # both caches were created with the default PLATFORM, so one file
        # could end up holding the wrong provider's embeddings — the
        # dimensions don't even match: 1536 for OpenAI vs 768 for Pyq.)
        self.goodbye_embeddings = self.load_or_create_embeddings(
            f"{embeddings_cache_path}/goodbye_embeddings.npy", platform="openai"
        )
        self.goodbye_embeddings_pyq = self.load_or_create_embeddings(
            f"{embeddings_cache_path}/goodbye_embeddings_pyq.npy", platform="pyq"
        )

    def load_or_create_embeddings(self, path, platform=PLATFORM):
        """Load cached embeddings from `path`, creating and saving them if absent."""
        if os.path.exists(path):
            return np.load(path)
        else:
            embeddings = self.create_embeddings(platform=platform)
            np.save(path, embeddings)
            return embeddings

    def create_embeddings(self, platform=PLATFORM):
        """Embed every GOODBYE_PHRASES entry; returns a (dim, n_phrases) matrix."""
        print("Creating embeddings...")
        size = EMBEDDING_SIZE if platform == "openai" else PYQ_EMBEDDING_SIZE
        embeddings = np.empty((size, len(GOODBYE_PHRASES)))
        for i, goodbye_phrase in enumerate(GOODBYE_PHRASES):
            embeddings[:, i] = self.create_embedding(goodbye_phrase, platform=platform)
        return embeddings

    async def is_goodbye(self, text: str, platform=PLATFORM) -> bool:
        """True if `text` is (or is semantically close to) a goodbye phrase."""
        # Cheap literal check first: avoids an embedding round-trip.
        if "bye" in text.lower():
            return True
        embedding = self.create_embedding(text.strip().lower(), platform=platform)
        goodbye_embeddings = (
            self.goodbye_embeddings
            if platform == "openai"
            else self.goodbye_embeddings_pyq
        )
        threshold = (
            SIMILARITY_THRESHOLD if platform == "openai" else SIMILARITY_THRESHOLD_PYQ
        )
        # NOTE(review): this is a raw dot product; it equals cosine similarity
        # only if the provider returns unit-norm embeddings — confirm.
        similarity_results = embedding @ goodbye_embeddings
        return np.max(similarity_results) > threshold

    def create_embedding(self, text, platform=PLATFORM) -> np.ndarray:
        """Fetch a single embedding vector for `text` from the given provider."""
        if platform == "openai":
            return np.array(
                openai.Embedding.create(input=text, model="text-embedding-ada-002")[
                    "data"
                ][0]["embedding"]
            )
        elif platform == "pyq":
            return np.array(
                requests.post(
                    PYQ_API_URL,
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": os.getenv("PYQ_API_KEY"),
                    },
                    json={"input_sequence": [text], "account_id": "400"},
                ).json()["response"][0]
            )
if __name__ == "__main__":

    async def _repl():
        # Simple interactive loop for manually probing the goodbye detector.
        model = GoodbyeModel()
        while True:
            print(await model.is_goodbye(input("Text: ")))

    asyncio.run(_repl())

View file

@ -0,0 +1,236 @@
"""
A port of sseclient (https://pypi.org/project/sseclient/) that allows you to get server-side events with a POST request
Copyright (c) 2015 Brent Tubbs
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE."""
#
# Distributed under the terms of the MIT license.
#
from __future__ import unicode_literals
import codecs
import re
import time
import warnings
import six
import requests
# Version of the upstream sseclient this file was ported from.
__version__ = "0.0.27"
# Technically, we should support streams that mix line endings. This regex,
# however, assumes that a system will provide consistent line endings.
# A blank line (double line-ending) marks the end of one SSE event.
end_of_field = re.compile(r"\r\n\r\n|\r\r|\n\n")
class SSEClient(object):
    """Iterator over server-sent events from an HTTP(S) stream.

    Unlike upstream sseclient, the HTTP method is configurable, so SSE can
    be consumed from POST endpoints as well as GET.  Iterating yields
    ``Event`` objects; on a read error the client sleeps ``retry`` ms and
    transparently reconnects (resuming via the Last-Event-ID header).
    """

    def __init__(
        self,
        method,
        url,
        last_id=None,
        retry=3000,
        session=None,
        chunk_size=1024,
        **kwargs
    ):
        """Open the stream immediately.

        :param method: HTTP method to use (e.g. "GET" or "POST")
        :param url: SSE endpoint
        :param last_id: initial Last-Event-ID to resume from, if any
        :param retry: reconnect delay in milliseconds (server may override)
        :param session: optional requests.Session for connection reuse
        :param chunk_size: max bytes per raw read
        :param kwargs: forwarded to requests (headers, data, json, auth, ...)
        """
        self.url = url
        self.method = method
        self.last_id = last_id
        self.retry = retry
        self.chunk_size = chunk_size
        # Optional support for passing in a requests.Session()
        self.session = session
        # Any extra kwargs will be fed into the requests.get call later.
        self.requests_kwargs = kwargs
        # The SSE spec requires making requests with Cache-Control: nocache
        if "headers" not in self.requests_kwargs:
            self.requests_kwargs["headers"] = {}
        self.requests_kwargs["headers"]["Cache-Control"] = "no-cache"
        # The 'Accept' header is not required, but explicit > implicit
        self.requests_kwargs["headers"]["Accept"] = "text/event-stream"
        # Keep data here as it streams in
        self.buf = ""
        self._connect()

    def _connect(self):
        """(Re)establish the streaming HTTP request and reset the decoder."""
        # Resume from the last seen event so reconnects don't replay history.
        if self.last_id:
            self.requests_kwargs["headers"]["Last-Event-ID"] = self.last_id
        # Use session if set.  Otherwise fall back to requests module.
        requester = self.session or requests
        self.resp = requester.request(
            self.method, self.url, stream=True, **self.requests_kwargs
        )
        self.resp_iterator = self.iter_content()
        # Incremental decoder: raw chunks may split multi-byte characters,
        # so bytes are decoded statefully across chunk boundaries.
        encoding = self.resp.encoding or self.resp.apparent_encoding
        self.decoder = codecs.getincrementaldecoder(encoding)(errors="replace")
        # TODO: Ensure we're handling redirects.  Might also stick the 'origin'
        # attribute on Events like the Javascript spec requires.
        self.resp.raise_for_status()

    def iter_content(self):
        """Yield raw byte chunks from the response as they arrive."""
        def generate():
            while True:
                if (
                    hasattr(self.resp.raw, "_fp")
                    and hasattr(self.resp.raw._fp, "fp")
                    and hasattr(self.resp.raw._fp.fp, "read1")
                ):
                    # Reach into urllib3's private file object for read1(),
                    # which returns as soon as *any* data is available
                    # instead of waiting for a full chunk_size.
                    chunk = self.resp.raw._fp.fp.read1(self.chunk_size)
                else:
                    # _fp is not available, this means that we cannot use short
                    # reads and this will block until the full chunk size is
                    # actually read
                    chunk = self.resp.raw.read(self.chunk_size)
                if not chunk:
                    break
                yield chunk

        return generate()

    def _event_complete(self):
        """True once self.buf holds at least one full event (blank line seen)."""
        return re.search(end_of_field, self.buf) is not None

    def __iter__(self):
        return self

    def __next__(self):
        """Block until one complete event is buffered, then parse and return it."""
        while not self._event_complete():
            try:
                next_chunk = next(self.resp_iterator)
                if not next_chunk:
                    raise EOFError()
                self.buf += self.decoder.decode(next_chunk)
            except (
                StopIteration,
                requests.RequestException,
                EOFError,
                six.moves.http_client.IncompleteRead,
            ) as e:
                print(e)
                # Back off per the (possibly server-supplied) retry interval,
                # then reconnect and keep waiting for a complete event.
                time.sleep(self.retry / 1000.0)
                self._connect()
                # The SSE spec only supports resuming from a whole message, so
                # if we have half a message we should throw it out.
                head, sep, tail = self.buf.rpartition("\n")
                self.buf = head + sep
                continue
        # Split the complete event (up to the end_of_field) into event_string,
        # and retain anything after the current complete event in self.buf
        # for next time.
        (event_string, self.buf) = re.split(end_of_field, self.buf, maxsplit=1)
        msg = Event.parse(event_string)
        # If the server requests a specific retry delay, we need to honor it.
        if msg.retry:
            self.retry = msg.retry
        # last_id should only be set if included in the message.  It's not
        # forgotten if a message omits it.
        if msg.id:
            self.last_id = msg.id
        return msg

    # Python 2 iterator-protocol compatibility.
    if six.PY2:
        next = __next__
class Event(object):
    """One server-sent event: a data payload plus optional event name,
    id, and retry interval, per the SSE wire format."""

    sse_line_pattern = re.compile("(?P<name>[^:]*):?( ?(?P<value>.*))?")

    def __init__(self, data="", event="message", id=None, retry=None):
        assert isinstance(data, six.string_types), "Data must be text"
        self.data = data
        self.event = event
        self.id = id
        self.retry = retry

    def dump(self):
        """Serialize this event back to SSE wire format (trailing blank line
        included)."""
        out = []
        if self.id:
            out.append("id: %s" % self.id)
        # The default event name is implicit on the wire; skip it.
        if self.event != "message":
            out.append("event: %s" % self.event)
        if self.retry:
            out.append("retry: %s" % self.retry)
        for piece in self.data.split("\n"):
            out.append("data: %s" % piece)
        return "\n".join(out) + "\n\n"

    @classmethod
    def parse(cls, raw):
        """
        Given a possibly-multiline string representing an SSE message, parse it
        and return a Event object.
        """
        msg = cls()
        for line in raw.splitlines():
            match = cls.sse_line_pattern.match(line)
            if match is None:
                # Malformed line.  Discard but warn.
                warnings.warn('Invalid SSE line: "%s"' % line, SyntaxWarning)
                continue
            name = match.group("name")
            if not name:
                # Line began with ":" — an SSE comment; ignore it.
                continue
            value = match.group("value")
            if name == "data":
                # Multiple data lines accumulate, newline-joined.
                msg.data = value if not msg.data else "%s\n%s" % (msg.data, value)
            elif name == "event":
                msg.event = value
            elif name == "id":
                msg.id = value
            elif name == "retry":
                msg.retry = int(value)
        return msg

    def __str__(self):
        return self.data

View file

@ -0,0 +1,40 @@
import time
from pydantic import BaseModel, Field
from enum import Enum
class Sender(str, Enum):
    """Which side of the conversation produced a message."""

    HUMAN = "human"
    BOT = "bot"
class Message(BaseModel):
    """A single utterance in a conversation, attributed to a sender and
    stamped with the time it was recorded."""

    text: str
    sender: Sender
    timestamp: float

    def to_string(self, include_timestamp: bool = False) -> str:
        """Render as "SENDER: text", optionally with "(timestamp)" appended."""
        base = f"{self.sender.name}: {self.text}"
        if include_timestamp:
            return f"{base} ({self.timestamp})"
        return base
class Transcript(BaseModel):
    """An ordered log of conversation Messages plus the conversation's
    start time."""

    messages: list[Message] = []
    start_time: float = Field(default_factory=time.time)

    def to_string(self, include_timestamps: bool = False) -> str:
        """Render every message on its own line, in order."""
        rendered = [
            message.to_string(include_timestamp=include_timestamps)
            for message in self.messages
        ]
        return "\n".join(rendered)

    def add_human_message(self, text: str):
        """Append a human utterance stamped with the current time."""
        now = time.time()
        self.messages.append(Message(text=text, sender=Sender.HUMAN, timestamp=now))

    def add_bot_message(self, text: str):
        """Append a bot utterance stamped with the current time."""
        now = time.time()
        self.messages.append(Message(text=text, sender=Sender.BOT, timestamp=now))