open source

This commit is contained in:
Ajay Raj 2023-03-28 00:15:34 -07:00
commit a93bfc1ec9
61 changed files with 4013 additions and 126 deletions

2
.gitignore vendored
View file

@ -3,3 +3,5 @@ __pycache__/
.env .env
.DS_Store .DS_Store
dist/ dist/
credentials.json
*.npy

View file

@ -1,4 +1,4 @@
from vocode.streaming.telephony.inbound_call_server import InboundCallServer from vocode.streaming.telephony.hosted.inbound_call_server import InboundCallServer
from vocode.streaming.models.agent import EchoAgentConfig from vocode.streaming.models.agent import EchoAgentConfig
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -1,6 +1,6 @@
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.output_device.telephone_output import TelephoneOutput from vocode.streaming.output_device.telephone_output import TelephoneOutput
from vocode.streaming.telephony.outbound_call import OutboundCall from vocode.streaming.telephony.hosted.outbound_call import OutboundCall
from vocode.streaming.models.telephony import CallEntity from vocode.streaming.models.telephony import CallEntity
from vocode.streaming.models.agent import ( from vocode.streaming.models.agent import (
EchoAgentConfig, EchoAgentConfig,
@ -8,7 +8,7 @@ from vocode.streaming.models.agent import (
WebSocketUserImplementedAgentConfig, WebSocketUserImplementedAgentConfig,
) )
from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.message import BaseMessage
from vocode.streaming.telephony.zoom_dial_in import ZoomDialIn from vocode.streaming.telephony.hosted.zoom_dial_in import ZoomDialIn
if __name__ == "__main__": if __name__ == "__main__":
call = ZoomDialIn( call = ZoomDialIn(

View file

@ -3,6 +3,7 @@ import logging
import signal import signal
from dotenv import load_dotenv from dotenv import load_dotenv
import os import os
from vocode.streaming.hosted_streaming_conversation import HostedStreamingConversation
from vocode.streaming.streaming_conversation import StreamingConversation from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.helpers import create_microphone_input_and_speaker_output from vocode.helpers import create_microphone_input_and_speaker_output
from vocode.streaming.models.transcriber import ( from vocode.streaming.models.transcriber import (
@ -22,7 +23,6 @@ from vocode.streaming.models.agent import (
) )
from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.user_implemented_agent.restful_agent import RESTfulAgent
import vocode import vocode
load_dotenv() load_dotenv()
@ -37,7 +37,7 @@ if __name__ == "__main__":
streaming=True, use_default_devices=False streaming=True, use_default_devices=False
) )
conversation = StreamingConversation( conversation = HostedStreamingConversation(
input_device=microphone_input, input_device=microphone_input,
output_device=speaker_output, output_device=speaker_output,
transcriber_config=DeepgramTranscriberConfig.from_input_device( transcriber_config=DeepgramTranscriberConfig.from_input_device(

View file

@ -0,0 +1,79 @@
import asyncio
import logging
import signal
from dotenv import load_dotenv
import os
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.helpers import create_microphone_input_and_speaker_output
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
GoogleTranscriberConfig,
)
from vocode.streaming.models.agent import (
ChatGPTAgentConfig,
CutOffResponse,
FillerAudioConfig,
RESTfulUserImplementedAgentConfig,
WebSocketUserImplementedAgentConfig,
EchoAgentConfig,
LLMAgentConfig,
ChatGPTAgentConfig,
)
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
GoogleSynthesizerConfig,
RimeSynthesizerConfig,
)
import vocode
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
load_dotenv()
# Authenticate against the hosted Vocode API before any conversation starts.
vocode.api_key = os.getenv("VOCODE_API_KEY")
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
async def main():
    """Run a local mic-to-speaker streaming conversation with a ChatGPT agent.

    Wires a Deepgram transcriber, a ChatGPT agent, and an Azure synthesizer
    into a StreamingConversation, then pumps microphone audio into it until
    the conversation ends or the user presses Ctrl+C.
    """
    microphone_input, speaker_output = create_microphone_input_and_speaker_output(
        streaming=True, use_default_devices=False
    )
    conversation = StreamingConversation(
        output_device=speaker_output,
        transcriber=DeepgramTranscriber(
            DeepgramTranscriberConfig.from_input_device(
                microphone_input, endpointing_config=PunctuationEndpointingConfig()
            )
        ),
        agent=ChatGPTAgent(
            ChatGPTAgentConfig(
                initial_message=BaseMessage(text="What up"),
                prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like
hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""",
                generate_responses=True,
                cut_off_response=CutOffResponse(),
            )
        ),
        synthesizer=AzureSynthesizer(
            AzureSynthesizerConfig.from_output_device(speaker_output),
        ),
        logger=logger,
    )
    await conversation.start()
    print("Conversation started, press Ctrl+C to end")
    # Ctrl+C triggers a clean shutdown instead of killing the process.
    signal.signal(signal.SIGINT, lambda _0, _1: conversation.terminate())
    while conversation.is_active():
        chunk = microphone_input.get_audio()
        if chunk:
            conversation.receive_audio(chunk)
        # Yield to the event loop so transcription/synthesis tasks can run.
        await asyncio.sleep(0)
if __name__ == "__main__":
    asyncio.run(main())

69
examples/telephony_app.py Normal file
View file

@ -0,0 +1,69 @@
import logging
from fastapi import FastAPI
import os
from dotenv import load_dotenv
load_dotenv()
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.models.agent import ChatGPTAgentConfig
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.telephony import TwilioConfig
from vocode.streaming.telephony.config_manager.redis_config_manager import (
RedisConfigManager,
)
from vocode.streaming.telephony.conversation.outbound_call import OutboundCall
from vocode.streaming.telephony.server.base import InboundCallConfig, TelephonyServer
app = FastAPI(docs_url=None)

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Shared store of per-call state so the webhook handlers can look up config.
config_manager = RedisConfigManager()

# Publicly reachable host Twilio uses for webhooks/websockets. ngrok hostnames
# are ephemeral, so prefer the environment over a hard-coded value; the old
# literal remains as the fallback for backward compatibility.
BASE_URL = os.getenv("BASE_URL", "59b8e140372d.ngrok.app")

telephony_server = TelephonyServer(
    base_url=BASE_URL,
    config_manager=config_manager,
    inbound_call_configs=[
        InboundCallConfig(
            url="/inbound_call",
            agent_config=ChatGPTAgentConfig(
                initial_message=BaseMessage(text="What up"),
                prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like
hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""",
                generate_responses=True,
            ),
            twilio_config=TwilioConfig(
                account_sid=os.getenv("TWILIO_ACCOUNT_SID"),
                auth_token=os.getenv("TWILIO_AUTH_TOKEN"),
            ),
        )
    ],
    logger=logger,
)
app.include_router(telephony_server.get_router())

# Example of placing an outbound call through the same server; uncomment to use.
# outbound_call = OutboundCall(
#     base_url=BASE_URL,
#     to_phone="+14088926228",
#     from_phone="+14086600744",
#     config_manager=config_manager,
#     agent_config=ChatGPTAgentConfig(
#         initial_message=BaseMessage(text="What up"),
#         prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like
# hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""",
#         generate_responses=True,
#     ),
#     twilio_config=TwilioConfig(
#         account_sid=os.getenv("TWILIO_ACCOUNT_SID"),
#         auth_token=os.getenv("TWILIO_AUTH_TOKEN"),
#     ),
#     logger=logger,
# )
# outbound_call.start()

View file

@ -4,6 +4,8 @@ anyio==3.6.2
async-timeout==4.0.2 async-timeout==4.0.2
attrs==22.2.0 attrs==22.2.0
azure-cognitiveservices-speech==1.25.0 azure-cognitiveservices-speech==1.25.0
black==23.1.0
cachetools==5.3.0
certifi==2022.12.7 certifi==2022.12.7
cffi==1.15.1 cffi==1.15.1
charset-normalizer==3.0.1 charset-normalizer==3.0.1
@ -12,32 +14,50 @@ dataclasses-json==0.5.7
decorator==5.1.1 decorator==5.1.1
fastapi==0.92.0 fastapi==0.92.0
frozenlist==1.3.3 frozenlist==1.3.3
google-api-core==2.11.0
google-auth==2.16.3
google-cloud-speech==2.17.3
google-cloud-texttospeech==2.14.1
googleapis-common-protos==1.59.0
grpcio==1.51.3
grpcio-status==1.51.3
h11==0.14.0 h11==0.14.0
idna==3.4 idna==3.4
Jinja2==3.1.2
joblib==1.2.0
langchain==0.0.117 langchain==0.0.117
MarkupSafe==2.1.2
marshmallow==3.19.0 marshmallow==3.19.0
marshmallow-enum==1.5.1 marshmallow-enum==1.5.1
mccabe==0.7.0 mccabe==0.7.0
multidict==6.0.4 multidict==6.0.4
mypy-extensions==1.0.0 mypy-extensions==1.0.0
nltk==3.8.1
numpy==1.24.2 numpy==1.24.2
openai==0.27.2 openai==0.27.2
packaging==23.0 packaging==23.0
pathspec==0.11.0 pathspec==0.11.0
platformdirs==3.1.0 platformdirs==3.1.0
ply==3.11 ply==3.11
proto-plus==1.22.2
protobuf==4.22.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
PyAudio==0.2.13 PyAudio==0.2.13
pycodestyle==2.10.0 pycodestyle==2.10.0
pycparser==2.21 pycparser==2.21
pydantic>=1.9.0 pydantic==1.10.7
pyflakes>=2.5.0
pydub==0.25.1 pydub==0.25.1
pyflakes==3.0.1
PyJWT==2.6.0 PyJWT==2.6.0
python-dotenv==0.21.1 python-dotenv==0.21.1
python-multipart==0.0.6 python-multipart==0.0.6
pytz==2022.7.1 pytz==2022.7.1
PyYAML==6.0 PyYAML==6.0
redis==4.5.3
regex==2023.3.23
requests==2.28.2 requests==2.28.2
rsa==4.9
six==1.16.0 six==1.16.0
sniffio==1.3.0 sniffio==1.3.0
sounddevice==0.4.6 sounddevice==0.4.6
@ -46,8 +66,9 @@ starlette==0.25.0
tenacity==8.2.2 tenacity==8.2.2
tomli==2.0.1 tomli==2.0.1
tqdm==4.65.0 tqdm==4.65.0
twilio==7.17.0
typing-inspect==0.8.0 typing-inspect==0.8.0
typing_extensions>=3.10.0.2 typing_extensions==4.5.0
urllib3==1.26.14 urllib3==1.26.14
uvicorn==0.20.0 uvicorn==0.20.0
websockets==10.4 websockets==10.4

View file

@ -0,0 +1,44 @@
import random
from typing import Generator, Optional
from vocode.streaming.models.agent import (
AgentConfig,
ChatGPTAgentConfig,
LLMAgentConfig,
)
class BaseAgent:
    """Abstract base class for conversation agents.

    Subclasses implement either respond() (one-shot replies) or
    generate_response() (streamed, sentence-at-a-time replies).
    """

    def __init__(self, agent_config: AgentConfig):
        self.agent_config = agent_config

    def get_agent_config(self) -> AgentConfig:
        """Return the config this agent was constructed with."""
        return self.agent_config

    def start(self):
        """Hook for subclasses that need setup work; no-op by default."""
        pass

    def respond(
        self, human_input, is_interrupt: bool = False
    ) -> tuple[Optional[str], bool]:
        """Return (reply text, should_stop) for a single human utterance."""
        raise NotImplementedError

    def generate_response(
        self, human_input, is_interrupt: bool = False
    ) -> Generator[str, None, None]:
        """Returns a generator that yields a sentence at a time."""
        raise NotImplementedError

    def update_last_bot_message_on_cut_off(self, message: str):
        """Updates the last bot message in the conversation history when the human cuts off the bot's response."""
        pass

    def get_cut_off_response(self) -> Optional[str]:
        """Pick a random canned cut-off response, or None when none configured.

        Fix: the original dereferenced agent_config.cut_off_response.messages
        unconditionally, raising AttributeError when cut_off_response is None
        (it is an Optional field on both config types).
        """
        assert isinstance(self.agent_config, LLMAgentConfig) or isinstance(
            self.agent_config, ChatGPTAgentConfig
        )
        cut_off_response = self.agent_config.cut_off_response
        if cut_off_response is None:
            return None
        on_cut_off_messages = cut_off_response.messages
        if on_cut_off_messages:
            return random.choice(on_cut_off_messages).text
        return None

    def terminate(self):
        """Hook for subclasses that need teardown work; no-op by default."""
        pass

View file

@ -0,0 +1,50 @@
from typing import Optional
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from pydantic import BaseModel
TEMPLATE = """
Read the following conversation classify the final emotion of the Bot as one of [{emotions}].
Output the degree of emotion as a value between 0 and 1 in the format EMOTION,DEGREE: ex. {example_emotion},0.5
<start>
{{transcript}}
<end>
"""
class BotSentiment(BaseModel):
    """Parsed sentiment result for the bot's last utterance."""

    # Detected emotion label (lowercased), or None when the model output was
    # malformed or outside the allowed vocabulary.
    emotion: Optional[str] = None
    # Strength of the emotion in [0, 1]; 0.0 means no/unknown sentiment.
    degree: float = 0.0
class BotSentimentAnalyser:
    """Classifies the bot's final emotion in a transcript via an LLM prompt.

    The prompt constrains the model to a fixed emotion vocabulary and asks for
    "EMOTION,DEGREE" output, which analyse() parses defensively.
    """

    def __init__(self, emotions: list[str], model_name: str = "text-davinci-003"):
        self.model_name = model_name
        self.llm = OpenAI(
            model_name=self.model_name,
        )
        assert len(emotions) > 0
        # Normalize the vocabulary once so later comparisons are case-insensitive.
        self.emotions = [emotion.lower() for emotion in emotions]
        self.prompt = PromptTemplate(
            input_variables=["transcript"],
            template=TEMPLATE.format(
                emotions=",".join(self.emotions), example_emotion=self.emotions[0]
            ),
        )

    def analyse(self, transcript: str) -> BotSentiment:
        """Run the sentiment prompt and parse 'EMOTION,DEGREE' into a BotSentiment."""
        completion = self.llm(self.prompt.format(transcript=transcript)).strip()
        pieces = completion.split(",")
        if len(pieces) != 2:
            # Malformed model output: report neutral.
            return BotSentiment(emotion=None, degree=0.0)
        label_raw, degree_raw = pieces
        label = label_raw.strip().lower()
        if label not in self.emotions:
            # Model invented a label outside the allowed vocabulary.
            return BotSentiment(emotion=None, degree=0.0)
        try:
            return BotSentiment(emotion=label, degree=float(degree_raw.strip()))
        except ValueError:
            # Unparseable degree: keep the label with a middling default.
            return BotSentiment(emotion=label, degree=0.5)

View file

@ -0,0 +1,158 @@
import os
import random
import time
from langchain.prompts import (
ChatPromptTemplate,
MessagesPlaceholder,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.chains import ConversationChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAIChat
from langchain.memory import ConversationBufferMemory
from langchain.schema import ChatMessage, AIMessage
import openai
import json
from typing import Generator, Optional
from dotenv import load_dotenv
from typing import Generator
import logging
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.models.agent import ChatGPTAgentConfig
from vocode.streaming.utils.sse_client import SSEClient
from vocode.streaming.agent.utils import stream_llm_response
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")
class ChatGPTAgent(BaseAgent):
    """Agent backed by OpenAI's chat-completions API via langchain.

    Two modes: respond() does a single blocking completion through a
    ConversationChain; generate_response() streams tokens over SSE and yields
    sentence-sized chunks while mutating memory incrementally.
    """

    def __init__(self, agent_config: ChatGPTAgentConfig, logger: Optional[logging.Logger] = None):
        super().__init__(agent_config)
        self.agent_config = agent_config
        self.logger = logger or logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)
        # System preamble + running history + latest human input.
        self.prompt = ChatPromptTemplate.from_messages(
            [
                SystemMessagePromptTemplate.from_template(agent_config.prompt_preamble),
                MessagesPlaceholder(variable_name="history"),
                HumanMessagePromptTemplate.from_template("{input}"),
            ]
        )
        self.memory = ConversationBufferMemory(return_messages=True)
        # Seed memory with the configured initial bot message, if any.
        if agent_config.initial_message:
            if (
                agent_config.generate_responses
            ):  # we use ChatMessages for memory when we generate responses
                self.memory.chat_memory.messages.append(
                    ChatMessage(
                        content=agent_config.initial_message.text, role="assistant"
                    )
                )
            else:
                self.memory.chat_memory.add_ai_message(
                    agent_config.initial_message.text
                )
        self.llm = ChatOpenAI(
            model_name=self.agent_config.model_name,
            temperature=self.agent_config.temperature,
            max_tokens=self.agent_config.max_tokens,
        )
        self.conversation = ConversationChain(
            memory=self.memory, prompt=self.prompt, llm=self.llm
        )
        # Optionally pre-compute the first reply so the first turn is instant.
        self.first_response = (
            self.create_first_response(agent_config.expected_first_prompt)
            if agent_config.expected_first_prompt
            else None
        )
        self.is_first_response = True

    def create_first_response(self, first_prompt):
        """Run the chain once to pre-compute a reply to the expected first prompt."""
        return self.conversation.predict(input=first_prompt)

    def respond(self, human_input, is_interrupt: bool = False) -> tuple[str, bool]:
        """Return (reply, should_stop=False) for one human utterance (blocking)."""
        if is_interrupt and self.agent_config.cut_off_response:
            # Human interrupted: record a canned cut-off reply instead of
            # querying the model.
            cut_off_response = self.get_cut_off_response()
            self.memory.chat_memory.add_user_message(human_input)
            self.memory.chat_memory.add_ai_message(cut_off_response)
            return cut_off_response, False
        self.logger.debug("LLM responding to human input")
        if self.is_first_response and self.first_response:
            self.logger.debug("First response is cached")
            self.is_first_response = False
            text = self.first_response
        else:
            text = self.conversation.predict(input=human_input)
        self.logger.debug(f"LLM response: {text}")
        return text, False

    def generate_response(
        self, human_input, is_interrupt: bool = False
    ) -> Generator[str, None, None]:
        """Stream the reply sentence-by-sentence via the OpenAI SSE endpoint."""
        self.memory.chat_memory.messages.append(
            ChatMessage(role="user", content=human_input)
        )
        if is_interrupt and self.agent_config.cut_off_response:
            cut_off_response = self.get_cut_off_response()
            self.memory.chat_memory.messages.append(
                ChatMessage(role="assistant", content=cut_off_response)
            )
            yield cut_off_response
            return
        # Full chat transcript: system preamble followed by stored history.
        prompt_messages = [
            ChatMessage(role="system", content=self.agent_config.prompt_preamble)
        ] + self.memory.chat_memory.messages
        # Raw SSE call (bypassing langchain) so tokens arrive incrementally.
        messages = SSEClient(
            "POST",
            "https://api.openai.com/v1/chat/completions",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
            },
            json={
                "model": self.agent_config.model_name,
                "messages": [
                    prompt_message.dict(include={"content": True, "role": True})
                    for prompt_message in prompt_messages
                ],
                "max_tokens": 256,
                "temperature": 1.0,
                "stream": True,
            },
        )
        # Append the bot message up-front and mutate it as chunks arrive, so
        # memory stays accurate even if the stream is cut off mid-reply.
        bot_memory_message = ChatMessage(role="assistant", content="")
        self.memory.chat_memory.messages.append(bot_memory_message)
        for message in stream_llm_response(
            map(lambda event: json.loads(event.data), messages),
            get_text=lambda choice: choice.get("delta", {}).get("content"),
        ):
            bot_memory_message.content = f"{bot_memory_message.content} {message}"
            yield message

    def update_last_bot_message_on_cut_off(self, message: str):
        """Replace the newest assistant message with what was actually spoken."""
        # Scan history backwards for the most recent assistant-authored message
        # (ChatMessage in streaming mode, AIMessage otherwise).
        for memory_message in self.memory.chat_memory.messages[::-1]:
            if (
                isinstance(memory_message, ChatMessage)
                and memory_message.role == "assistant"
            ) or isinstance(memory_message, AIMessage):
                memory_message.content = message
                return
if __name__ == "__main__":
agent = ChatGPTAgent(
ChatGPTAgentConfig(
model_name="gpt-4",
prompt_preamble="The assistant is having a pleasant conversation about life. If the user hasn't completed their thought, the assistant responds with 'PASS'",
)
)
while True:
# response = agent.respond(input("Human: "))[0]
# print(f"AI: {response}")
for response in agent.generate_response(input("Human: ")):
print(f"AI: {response}")

View file

@ -0,0 +1,13 @@
from typing import Generator
from vocode.streaming.agent.base_agent import BaseAgent
class EchoAgent(BaseAgent):
    """Debug agent that simply repeats whatever the human said."""

    def respond(self, human_input, is_interrupt: bool = False) -> tuple[str, bool]:
        # Never requests conversation termination, hence the False flag.
        return (human_input, False)

    def generate_response(self, human_input, is_interrupt: bool = False) -> Generator:
        # One-chunk "stream": the entire echo at once.
        yield from (human_input,)

    def update_last_bot_message_on_cut_off(self, message: str):
        # Stateless agent: nothing to rewrite on cut-off.
        pass

View file

@ -0,0 +1,32 @@
import logging
from typing import List
from langchain import OpenAI
from vocode.streaming.agent.llm_agent import LLMAgent
from ..models.agent import InformationRetrievalAgentConfig, LLMAgentConfig
class InformationRetrievalAgent(LLMAgent):
    """LLM agent specialized for IVR navigation / information-retrieval calls.

    Builds an LLM prompt preamble from the goal/recipient/fields in the
    InformationRetrievalAgentConfig and delegates the conversation loop to
    LLMAgent.
    """

    def __init__(
        self,
        agent_config: InformationRetrievalAgentConfig,
        logger: logging.Logger = None,
    ):
        # Fix: logger now defaults to None (LLMAgent falls back to a module
        # logger), matching LLMAgent's signature; the factory constructs this
        # agent without supplying a logger, which previously raised TypeError.
        prompt_preamble = f"""
        The AI is a friendly phone bot built for information retrieval. It understands IVR navigation and chooses which numbers to press based on the intended goal and the options provided.
        Once it reaches the human, it verifies the identity of the person it is trying to reach and states its purpose. If it needs to be transferred, then the AI asks to speak to the intended recipient of the phone call.
        Here is the context for the call:
        Intended goal: { agent_config.goal_description }
        Intended recipient: { agent_config.recipient_descriptor }
        Information to be collected: { agent_config.fields }
        Information to provide to the person who answers the phone: this is a robot calling on behalf of { agent_config.caller_descriptor }
        The AI begins the call by introducing itself and who it represents.
        """
        # NOTE(review): the retrieval-specific config is replaced by a plain
        # LLMAgentConfig, so its fields survive only inside the prompt text.
        agent_config = LLMAgentConfig(
            prompt_preamble=prompt_preamble,
        )
        super().__init__(agent_config, logger=logger)
        # Pin a specific completion model, overriding LLMAgent's default llm.
        self.llm = OpenAI(model_name="text-davinci-003", temperature=1)

View file

@ -0,0 +1,139 @@
import re
from typing import Optional
from dotenv import load_dotenv
from langchain import OpenAI
from langchain.llms import OpenAIChat
from typing import Generator
import logging
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.agent.utils import stream_llm_response
from vocode.streaming.models.agent import LLMAgentConfig
load_dotenv()
class LLMAgent(BaseAgent):
    """Agent backed by an OpenAI text-completion model using a plain-text
    "Human:/AI:" transcript prompt."""

    SENTENCE_ENDINGS = [".", "!", "?"]
    DEFAULT_PROMPT_TEMPLATE = "{history}\nHuman: {human_input}\nAI:"

    def __init__(
        self,
        agent_config: LLMAgentConfig,
        logger: Optional[logging.Logger] = None,
        sender="AI",
        recipient="Human",
    ):
        super().__init__(agent_config)
        self.agent_config = agent_config
        self.prompt_template = (
            f"{agent_config.prompt_preamble}\n\n{self.DEFAULT_PROMPT_TEMPLATE}"
        )
        self.initial_bot_message = (
            agent_config.initial_message.text if agent_config.initial_message else None
        )
        self.logger = logger or logging.getLogger(__name__)
        self.sender = sender
        self.recipient = recipient
        # Transcript memory: one "Human: .../AI: ..." string per turn.
        self.memory = (
            [f"AI: {agent_config.initial_message.text}"]
            if agent_config.initial_message
            else []
        )
        self.llm = OpenAI(
            model_name=self.agent_config.model_name,
            temperature=self.agent_config.temperature,
            max_tokens=self.agent_config.max_tokens,
        )
        # Stop generation as soon as the model starts writing the human's turn.
        self.stop_tokens = [f"{recipient}:"]
        # Optionally pre-compute the reply to the expected first prompt so the
        # first real turn is instant.
        self.first_response = (
            self.llm(
                self.prompt_template.format(
                    history="", human_input=agent_config.expected_first_prompt
                ),
                stop=self.stop_tokens,
            ).strip()
            if agent_config.expected_first_prompt
            else None
        )
        self.is_first_response = True

    def create_prompt(self, human_input):
        """Build the completion prompt from the last 5 transcript turns."""
        history = "\n".join(self.memory[-5:])
        return self.prompt_template.format(history=history, human_input=human_input)

    def get_memory_entry(self, human_input, response):
        """Format one transcript turn as a single memory string."""
        return f"{self.recipient}: {human_input}\n{self.sender}: {response}"

    def respond(self, human_input, is_interrupt: bool = False) -> tuple[str, bool]:
        """Return (reply, should_stop=False) for one human utterance (blocking)."""
        if is_interrupt and self.agent_config.cut_off_response:
            # Human interrupted: record a canned cut-off reply instead of
            # querying the model.
            cut_off_response = self.get_cut_off_response()
            self.memory.append(self.get_memory_entry(human_input, cut_off_response))
            return cut_off_response, False
        self.logger.debug("LLM responding to human input")
        if self.is_first_response and self.first_response:
            self.logger.debug("First response is cached")
            self.is_first_response = False
            response = self.first_response
        else:
            response = self.llm(self.create_prompt(human_input), stop=self.stop_tokens)
        # Strip any leaked "AI:" speaker prefix from the completion.
        response = response.replace(f"{self.sender}:", "")
        self.memory.append(self.get_memory_entry(human_input, response))
        self.logger.debug(f"LLM response: {response}")
        return response, False

    def generate_response(self, human_input, is_interrupt: bool = False) -> Generator:
        """Stream the reply sentence-by-sentence, updating memory as it grows."""
        self.logger.debug("LLM generating response to human input")
        if is_interrupt and self.agent_config.cut_off_response:
            cut_off_response = self.get_cut_off_response()
            self.memory.append(self.get_memory_entry(human_input, cut_off_response))
            yield cut_off_response
            return
        # Placeholder turn; rewritten below as sentences stream in.
        self.memory.append(self.get_memory_entry(human_input, ""))
        if self.is_first_response and self.first_response:
            self.logger.debug("First response is cached")
            self.is_first_response = False
            sentences = [self.first_response]
        else:
            self.logger.debug("Creating LLM prompt")
            prompt = self.create_prompt(human_input)
            self.logger.debug("Streaming LLM response")
            sentences = stream_llm_response(
                map(
                    lambda resp: resp.to_dict(),
                    self.llm.stream(prompt, stop=self.stop_tokens),
                )
            )
        response_buffer = ""
        for sentence in sentences:
            sentence = sentence.replace(f"{self.sender}:", "")
            # Collapse leading whitespace to a single space.
            sentence = re.sub(r"^\s+(.*)", r" \1", sentence)
            response_buffer += sentence
            self.memory[-1] = self.get_memory_entry(human_input, response_buffer)
            yield sentence

    def update_last_bot_message_on_cut_off(self, message: str):
        """Replace the AI half of the last turn with what was actually spoken."""
        last_message = self.memory[-1]
        # Keep the "Human: ..." first line; rewrite the "AI: ..." remainder.
        new_last_message = (
            last_message.split("\n", 1)[0] + f"\n{self.sender}: {message}"
        )
        self.memory[-1] = new_last_message
if __name__ == "__main__":
chat_responder = LLMAgent(
LLMAgentConfig(
prompt_preamble="""
The AI is having a pleasant conversation about life. If the human hasn't completed their thought, the AI responds with 'PASS'
{history}
Human: {human_input}
AI:""",
)
)
while True:
# response = chat_responder.respond(input("Human: "))[0]
for response in chat_responder.generate_response(input("Human: ")):
print(f"AI: {response}")

View file

@ -0,0 +1,25 @@
from typing import Generator
# Tokens that mark the end of a sentence-sized chunk worth yielding.
SENTENCE_ENDINGS = [".", "!", "?"]


def stream_llm_response(
    gen, get_text=lambda choice: choice.get("text"), sentence_endings=SENTENCE_ENDINGS
) -> Generator:
    """Re-chunk a stream of OpenAI completion responses into sentences.

    Args:
        gen: iterable of response dicts, each with a "choices" list.
        get_text: extracts the text token from a choice dict (override for
            chat-completions, where text lives under choice["delta"]["content"]).
        sentence_endings: tokens ending a sentence trigger a yield.

    Yields:
        Whitespace-stripped sentence chunks, in order.
    """
    buffer = ""
    for response in gen:
        choices = response.get("choices", [])
        if not choices:
            break
        choice = choices[0]
        # Fix: use .get() — chat-completion chunks may omit "finish_reason",
        # which previously raised KeyError.
        if choice.get("finish_reason"):
            break
        token = get_text(choice)
        if not token:
            continue
        buffer += token
        if any(token.endswith(ending) for ending in sentence_endings):
            yield buffer.strip()
            buffer = ""
    if buffer.strip():
        # Fix: strip the trailing remainder too, consistent with the
        # mid-stream yields above (previously yielded with leading space).
        yield buffer.strip()

View file

@ -0,0 +1,3 @@
# Presumably the duration (seconds) of each synthesized audio chunk sent to
# the output device — confirm against the streaming pipeline's usage.
TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS = 1
# Presumably extra scheduling slack (seconds) allowed per chunk — confirm.
PER_CHUNK_ALLOWANCE_SECONDS = 0.05
# Presumably seconds of inactivity before a conversation is considered idle —
# confirm against AgentConfig.allowed_idle_time_seconds handling.
ALLOWED_IDLE_TIME = 15

View file

@ -0,0 +1,58 @@
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.agent.echo_agent import EchoAgent
from vocode.streaming.agent.information_retrieval_agent import InformationRetrievalAgent
from vocode.streaming.agent.llm_agent import LLMAgent
from vocode.streaming.models.agent import AgentConfig, AgentType
from vocode.streaming.models.synthesizer import SynthesizerConfig, SynthesizerType
from vocode.streaming.models.transcriber import TranscriberConfig, TranscriberType
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
from vocode.streaming.transcriber.google_transcriber import GoogleTranscriber
def create_transcriber(transcriber_config: TranscriberConfig) -> BaseTranscriber:
    """Instantiate the transcriber implementation matching the config's type tag."""
    transcriber_classes = {
        TranscriberType.DEEPGRAM: DeepgramTranscriber,
        TranscriberType.GOOGLE: GoogleTranscriber,
        TranscriberType.ASSEMBLY_AI: AssemblyAITranscriber,
    }
    transcriber_class = transcriber_classes.get(transcriber_config.type)
    if transcriber_class is None:
        raise Exception("Invalid transcriber config")
    return transcriber_class(transcriber_config)
def create_agent(agent_config: AgentConfig) -> BaseAgent:
    """Instantiate the agent implementation matching the config's type tag."""
    agent_classes = {
        AgentType.LLM: LLMAgent,
        AgentType.CHAT_GPT: ChatGPTAgent,
        AgentType.ECHO: EchoAgent,
        AgentType.INFORMATION_RETRIEVAL: InformationRetrievalAgent,
    }
    agent_class = agent_classes.get(agent_config.type)
    if agent_class is None:
        raise Exception("Invalid agent config", agent_config.type)
    return agent_class(agent_config=agent_config)
def create_synthesizer(synthesizer_config: SynthesizerConfig) -> BaseSynthesizer:
    """Instantiate the synthesizer implementation matching the config's type tag."""
    if synthesizer_config.type == SynthesizerType.GOOGLE:
        return GoogleSynthesizer(synthesizer_config)
    if synthesizer_config.type == SynthesizerType.AZURE:
        return AzureSynthesizer(synthesizer_config)
    if synthesizer_config.type == SynthesizerType.ELEVEN_LABS:
        # Only forward voice_id when the config actually sets one, so the
        # synthesizer's own default applies otherwise.
        eleven_labs_kwargs = {}
        if synthesizer_config.voice_id:
            eleven_labs_kwargs["voice_id"] = synthesizer_config.voice_id
        return ElevenLabsSynthesizer(synthesizer_config, **eleven_labs_kwargs)
    if synthesizer_config.type == SynthesizerType.RIME:
        return RimeSynthesizer(synthesizer_config)
    raise Exception("Invalid synthesizer config")

View file

@ -0,0 +1,106 @@
import websockets
from websockets.exceptions import ConnectionClosedOK
from websockets.client import WebSocketClientProtocol
import asyncio
from dotenv import load_dotenv
import os
import logging
import threading
import queue
import vocode
from vocode.streaming.input_device.base_input_device import (
BaseInputDevice,
)
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.websocket import (
ReadyMessage,
AudioMessage,
StartMessage,
StopMessage,
)
load_dotenv()
class HostedStreamingConversation:
    """Client for the hosted Vocode conversation websocket.

    Streams local microphone audio up to the hosted service and plays the
    returned audio on the local output device via a background thread.
    """

    def __init__(
        self,
        input_device: BaseInputDevice,
        output_device: BaseOutputDevice,
        transcriber_config: TranscriberConfig,
        agent_config: AgentConfig,
        synthesizer_config: SynthesizerConfig,
        id: str = None,
    ):
        self.id = id
        self.input_device = input_device
        self.output_device = output_device
        self.transcriber_config = transcriber_config
        self.agent_config = agent_config
        self.synthesizer_config = synthesizer_config
        self.logger = logging.getLogger(__name__)
        # Set once the server acknowledges the StartMessage with a ReadyMessage.
        self.receiver_ready = False
        self.active = True
        # NOTE(review): output_loop appears unused — play_audio() builds its
        # own event loop below; confirm before removing.
        self.output_loop = asyncio.new_event_loop()
        # Audio received from the server, pending playback on the output thread.
        self.output_audio_queue = queue.Queue()
        self.vocode_websocket_url = f"wss://{vocode.base_url}/conversation"

    async def wait_for_ready(self):
        """Poll until the receiver has seen the server's ReadyMessage."""
        while not self.receiver_ready:
            await asyncio.sleep(0.1)
        return True

    def deactivate(self):
        """Stop the send loop; the playback thread exits after its next timeout."""
        self.active = False

    def play_audio(self):
        """Playback-thread body: drain queued audio chunks to the output device."""
        async def run():
            while self.active:
                try:
                    # Timeout keeps the loop responsive to deactivate().
                    audio = self.output_audio_queue.get(timeout=5)
                    await self.output_device.send_async(audio)
                except queue.Empty:
                    continue
        # Runs on a plain thread, so it needs its own event loop.
        loop = asyncio.new_event_loop()
        loop.run_until_complete(run())

    async def start(self):
        """Open the websocket and run the send/receive loops until stopped."""
        async with websockets.connect(
            f"{self.vocode_websocket_url}?key={vocode.api_key}"
        ) as ws:
            async def sender(ws: WebSocketClientProtocol):
                # Announce configs, wait for the handshake, then forward
                # microphone audio until deactivated.
                start_message = StartMessage(
                    transcriber_config=self.transcriber_config,
                    agent_config=self.agent_config,
                    synthesizer_config=self.synthesizer_config,
                    conversation_id=self.id,
                )
                await ws.send(start_message.json())
                await self.wait_for_ready()
                self.logger.info("Listening...press Ctrl+C to stop")
                while self.active:
                    data = self.input_device.get_audio()
                    if data:
                        try:
                            await ws.send(AudioMessage.from_bytes(data).json())
                        except ConnectionClosedOK:
                            # Server closed cleanly; stop sending.
                            self.deactivate()
                            return
                    # Yield so receiver() can run on the same loop.
                    await asyncio.sleep(0)
                await ws.send(StopMessage().json())
            async def receiver(ws: WebSocketClientProtocol):
                # First frame is the ready handshake; everything after is audio.
                ReadyMessage.parse_raw(await ws.recv())
                self.receiver_ready = True
                async for msg in ws:
                    audio_message = AudioMessage.parse_raw(msg)
                    self.output_audio_queue.put_nowait(audio_message.get_bytes())
            # Playback runs on a separate thread so blocking queue reads don't
            # stall this event loop.
            output_thread = threading.Thread(target=self.play_audio)
            output_thread.start()
            return await asyncio.gather(sender(ws), receiver(ws))

View file

@ -42,6 +42,7 @@ class AgentConfig(TypedModel, type=AgentType.BASE):
initial_message: Optional[BaseMessage] = None initial_message: Optional[BaseMessage] = None
generate_responses: bool = True generate_responses: bool = True
allowed_idle_time_seconds: Optional[float] = None allowed_idle_time_seconds: Optional[float] = None
allow_agent_to_be_cut_off: bool = True
end_conversation_on_goodbye: bool = False end_conversation_on_goodbye: bool = False
send_filler_audio: Union[bool, FillerAudioConfig] = False send_filler_audio: Union[bool, FillerAudioConfig] = False
@ -59,6 +60,13 @@ class LLMAgentConfig(AgentConfig, type=AgentType.LLM):
cut_off_response: Optional[CutOffResponse] = None cut_off_response: Optional[CutOffResponse] = None
class ChatGPTAlphaAgentConfig(AgentConfig, type=AgentType.CHAT_GPT_ALPHA):
prompt_preamble: str
expected_first_prompt: Optional[str] = None
temperature: float = LLM_AGENT_DEFAULT_TEMPERATURE
max_tokens: int = LLM_AGENT_DEFAULT_MAX_TOKENS
class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT): class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT):
prompt_preamble: str prompt_preamble: str
expected_first_prompt: Optional[str] = None expected_first_prompt: Optional[str] = None

View file

@ -1,5 +1,6 @@
from enum import Enum from enum import Enum
class AudioEncoding(str, Enum): class AudioEncoding(str, Enum):
LINEAR16 = "linear16" LINEAR16 = "linear16"
MULAW = "mulaw" MULAW = "mulaw"

View file

@ -1,17 +1,17 @@
import pydantic import pydantic
class BaseModel(pydantic.BaseModel):
class BaseModel(pydantic.BaseModel):
def __init__(self, **data): def __init__(self, **data):
for key, value in data.items(): for key, value in data.items():
if isinstance(value, dict): if isinstance(value, dict):
if 'type' in value: if "type" in value:
data[key] = TypedModel.parse_obj(value) data[key] = TypedModel.parse_obj(value)
super().__init__(**data) super().__init__(**data)
# Adapted from https://github.com/pydantic/pydantic/discussions/3091 # Adapted from https://github.com/pydantic/pydantic/discussions/3091
class TypedModel(BaseModel): class TypedModel(BaseModel):
_subtypes_ = [] _subtypes_ = []
def __init_subclass__(cls, type=None): def __init_subclass__(cls, type=None):
@ -22,31 +22,30 @@ class TypedModel(BaseModel):
for t, cls in _cls._subtypes_: for t, cls in _cls._subtypes_:
if t == type: if t == type:
return cls return cls
raise ValueError(f'Unknown type {type}') raise ValueError(f"Unknown type {type}")
@classmethod @classmethod
def get_type(_cls, cls_name): def get_type(_cls, cls_name):
for t, cls in _cls._subtypes_: for t, cls in _cls._subtypes_:
if cls.__name__ == cls_name: if cls.__name__ == cls_name:
return t return t
raise ValueError(f'Unknown class {cls_name}') raise ValueError(f"Unknown class {cls_name}")
@classmethod @classmethod
def parse_obj(cls, obj): def parse_obj(cls, obj):
data_type = obj.get('type') data_type = obj.get("type")
if data_type is None: if data_type is None:
raise ValueError(f'type is required for {cls.__name__}') raise ValueError(f"type is required for {cls.__name__}")
sub = cls.get_cls(data_type) sub = cls.get_cls(data_type)
if sub is None: if sub is None:
raise ValueError(f'Unknown type {data_type}') raise ValueError(f"Unknown type {data_type}")
return sub(**obj) return sub(**obj)
def _iter(self, **kwargs): def _iter(self, **kwargs):
yield 'type', self.get_type(self.__class__.__name__) yield "type", self.get_type(self.__class__.__name__)
yield from super()._iter(**kwargs) yield from super()._iter(**kwargs)
@property @property
def type(self): def type(self):
return self.get_type(self.__class__.__name__) return self.get_type(self.__class__.__name__)

View file

@ -2,9 +2,14 @@ from enum import Enum
from typing import Optional, Union from typing import Optional, Union
from pydantic import BaseModel, validator from pydantic import BaseModel, validator
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.telephony.constants import (
DEFAULT_AUDIO_ENCODING,
DEFAULT_SAMPLING_RATE,
)
from .model import TypedModel from .model import TypedModel
from .audio_encoding import AudioEncoding from .audio_encoding import AudioEncoding
from ..output_device.base_output_device import BaseOutputDevice
class SynthesizerType(str, Enum): class SynthesizerType(str, Enum):
@ -38,6 +43,13 @@ class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE):
audio_encoding=output_device.audio_encoding, audio_encoding=output_device.audio_encoding,
) )
@classmethod
def from_telephone_output_device(cls):
return cls(
sampling_rate=DEFAULT_SAMPLING_RATE,
audio_encoding=DEFAULT_AUDIO_ENCODING,
)
AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME = "en-US-AriaNeural" AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME = "en-US-AriaNeural"
AZURE_SYNTHESIZER_DEFAULT_PITCH = 0 AZURE_SYNTHESIZER_DEFAULT_PITCH = 0
@ -45,18 +57,32 @@ AZURE_SYNTHESIZER_DEFAULT_RATE = 15
class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE): class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME voice_name: Optional[str] = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH pitch: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_PITCH
rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE rate: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_RATE
class Config:
validate_assignment = True
@validator("voice_name")
def set_name(cls, voice_name):
return voice_name or AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
@validator("pitch")
def set_pitch(cls, pitch):
return pitch or AZURE_SYNTHESIZER_DEFAULT_PITCH
@validator("rate")
def set_rate(cls, rate):
return rate or AZURE_SYNTHESIZER_DEFAULT_RATE
@classmethod @classmethod
def from_output_device( def from_output_device(
cls, cls,
output_device: BaseOutputDevice, output_device: BaseOutputDevice,
voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME, voice_name: Optional[str] = None,
pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH, pitch: Optional[int] = None,
rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE, rate: Optional[int] = None,
track_bot_sentiment_in_voice: Union[bool, TrackBotSentimentConfig] = False,
): ):
return cls( return cls(
sampling_rate=output_device.sampling_rate, sampling_rate=output_device.sampling_rate,
@ -64,16 +90,33 @@ class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
voice_name=voice_name, voice_name=voice_name,
pitch=pitch, pitch=pitch,
rate=rate, rate=rate,
track_bot_sentiment_in_voice=track_bot_sentiment_in_voice,
) )
pass @classmethod
def from_telephone_output_device(
cls,
voice_name: Optional[str] = None,
pitch: Optional[int] = None,
rate: Optional[int] = None,
):
return cls(
sampling_rate=DEFAULT_SAMPLING_RATE,
audio_encoding=DEFAULT_AUDIO_ENCODING,
voice_name=voice_name,
pitch=pitch,
rate=rate,
)
class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE): class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
pass pass
class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
api_key: str
voice_id: Optional[str] = None
class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME): class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
speaker: str speaker: str
@ -88,3 +131,14 @@ class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
audio_encoding=output_device.audio_encoding, audio_encoding=output_device.audio_encoding,
speaker=speaker, speaker=speaker,
) )
@classmethod
def from_telephone_output_device(
cls,
speaker: str,
):
return cls(
sampling_rate=DEFAULT_SAMPLING_RATE,
audio_encoding=DEFAULT_AUDIO_ENCODING,
speaker=speaker,
)

View file

@ -1,4 +1,5 @@
from typing import Optional from typing import Optional
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.models.model import BaseModel from vocode.streaming.models.model import BaseModel
from vocode.streaming.models.agent import AgentConfig from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig from vocode.streaming.models.synthesizer import SynthesizerConfig
@ -19,6 +20,7 @@ class CreateInboundCall(BaseModel):
agent_config: AgentConfig agent_config: AgentConfig
synthesizer_config: Optional[SynthesizerConfig] = None synthesizer_config: Optional[SynthesizerConfig] = None
twilio_sid: str twilio_sid: str
conversation_id: Optional[str] = None
twilio_config: Optional[TwilioConfig] = None twilio_config: Optional[TwilioConfig] = None
@ -48,3 +50,11 @@ class DialIntoZoomCall(BaseModel):
synthesizer_config: Optional[SynthesizerConfig] = None synthesizer_config: Optional[SynthesizerConfig] = None
conversation_id: Optional[str] = None conversation_id: Optional[str] = None
twilio_config: Optional[TwilioConfig] = None twilio_config: Optional[TwilioConfig] = None
class CallConfig(BaseModel):
transcriber_config: TranscriberConfig
agent_config: AgentConfig
synthesizer_config: SynthesizerConfig
twilio_config: Optional[TwilioConfig]
twilio_sid: str

View file

@ -1,8 +1,11 @@
from enum import Enum from enum import Enum
from typing import Optional from typing import Optional
from vocode.streaming.input_device.base_input_device import ( from vocode.streaming.input_device.base_input_device import BaseInputDevice
BaseInputDevice, from vocode.streaming.telephony.constants import (
DEFAULT_AUDIO_ENCODING,
DEFAULT_CHUNK_SIZE,
DEFAULT_SAMPLING_RATE,
) )
from .audio_encoding import AudioEncoding from .audio_encoding import AudioEncoding
from .model import BaseModel, TypedModel from .model import BaseModel, TypedModel
@ -54,11 +57,25 @@ class TranscriberConfig(TypedModel, type=TranscriberType.BASE):
endpointing_config=endpointing_config, endpointing_config=endpointing_config,
) )
@classmethod
def from_telephone_input_device(
cls,
endpointing_config: Optional[EndpointingConfig] = None,
):
return cls(
sampling_rate=DEFAULT_SAMPLING_RATE,
audio_encoding=DEFAULT_AUDIO_ENCODING,
chunk_size=DEFAULT_CHUNK_SIZE,
endpointing_config=endpointing_config,
)
class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM): class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM):
model: Optional[str] = None model: Optional[str] = None
tier: Optional[str] = None
should_warmup_model: bool = False should_warmup_model: bool = False
version: Optional[str] = None version: Optional[str] = None
downsampling: Optional[int] = None
class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE): class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE):

View file

@ -6,33 +6,40 @@ from .transcriber import TranscriberConfig
from .agent import AgentConfig from .agent import AgentConfig
from .synthesizer import SynthesizerConfig from .synthesizer import SynthesizerConfig
class WebSocketMessageType(str, Enum):
BASE = 'websocket_base'
START = 'websocket_start'
AUDIO = 'websocket_audio'
READY = 'websocket_ready'
STOP = 'websocket_stop'
class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE): pass class WebSocketMessageType(str, Enum):
BASE = "websocket_base"
START = "websocket_start"
AUDIO = "websocket_audio"
READY = "websocket_ready"
STOP = "websocket_stop"
class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE):
pass
class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO): class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO):
data: str data: str
@classmethod @classmethod
def from_bytes(cls, chunk: bytes): def from_bytes(cls, chunk: bytes):
return cls(data=base64.b64encode(chunk).decode('utf-8')) return cls(data=base64.b64encode(chunk).decode("utf-8"))
def get_bytes(self) -> bytes: def get_bytes(self) -> bytes:
return base64.b64decode(self.data) return base64.b64decode(self.data)
class StartMessage(WebSocketMessage, type=WebSocketMessageType.START): class StartMessage(WebSocketMessage, type=WebSocketMessageType.START):
transcriber_config: TranscriberConfig transcriber_config: TranscriberConfig
agent_config: AgentConfig agent_config: AgentConfig
synthesizer_config: SynthesizerConfig synthesizer_config: SynthesizerConfig
conversation_id: Optional[str] = None conversation_id: Optional[str] = None
class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY): class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY):
pass pass
class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP): class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP):
pass pass

View file

@ -6,7 +6,7 @@ class BaseOutputDevice:
self.sampling_rate = sampling_rate self.sampling_rate = sampling_rate
self.audio_encoding = audio_encoding self.audio_encoding = audio_encoding
async def send_async(self, chunk): async def send_async(self, chunk: bytes):
raise NotImplemented raise NotImplemented
async def maybe_send_mark_async(self, message): async def maybe_send_mark_async(self, message):

View file

@ -0,0 +1,30 @@
import json
import base64
from fastapi import WebSocket
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
class TwilioOutputDevice(BaseOutputDevice):
def __init__(self, ws: WebSocket = None, stream_sid: str = None):
self.ws = ws
self.stream_sid = stream_sid
async def send_async(self, chunk: bytes):
twilio_message = {
"event": "media",
"streamSid": self.stream_sid,
"media": {"payload": base64.b64encode(chunk).decode("utf-8")},
}
await self.ws.send_text(json.dumps(twilio_message))
async def maybe_send_mark_async(self, message_sent):
mark_message = {
"event": "mark",
"streamSid": self.stream_sid,
"mark": {
"name": "Sent {}".format(message_sent),
},
}
await self.ws.send_text(json.dumps(mark_message))

View file

@ -1,26 +1,67 @@
import websockets
from websockets.exceptions import ConnectionClosedOK
from websockets.client import WebSocketClientProtocol
import asyncio import asyncio
from dotenv import load_dotenv from asyncio import Future
import os import queue
from typing import Callable, Awaitable, Optional, Any
import logging import logging
import threading import threading
import queue import time
import vocode import secrets
from vocode.streaming.input_device.base_input_device import ( import random
BaseInputDevice,
from dotenv import load_dotenv
from vocode.streaming.agent.bot_sentiment_analyser import (
BotSentiment,
BotSentimentAnalyser,
) )
from vocode.streaming.agent.information_retrieval_agent import InformationRetrievalAgent
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.output_device.base_output_device import BaseOutputDevice from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.models.transcriber import TranscriberConfig from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
from vocode.streaming.models.agent import AgentConfig from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber
from vocode.streaming.models.synthesizer import SynthesizerConfig from vocode.streaming.utils.goodbye_model import GoodbyeModel
from vocode.streaming.models.websocket import ( from vocode.streaming.utils.transcript import Transcript
ReadyMessage,
AudioMessage, from vocode.streaming.models.transcriber import (
StartMessage, TranscriberConfig,
StopMessage, TranscriberType,
) )
from vocode.streaming.models.agent import (
AgentConfig,
AgentType,
FillerAudioConfig,
FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS,
)
from vocode.streaming.models.synthesizer import (
SynthesizerConfig,
SynthesizerType,
TrackBotSentimentConfig,
)
from vocode.streaming.models.websocket import AudioMessage
from vocode.streaming.constants import (
TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS,
PER_CHUNK_ALLOWANCE_SECONDS,
ALLOWED_IDLE_TIME,
)
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
SynthesisResult,
FillerAudio,
)
from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.utils import (
create_conversation_id,
create_loop_in_thread,
get_chunk_size_per_second,
)
from vocode.streaming.transcriber.base_transcriber import (
Transcription,
BaseTranscriber,
)
from vocode.streaming.transcriber.google_transcriber import GoogleTranscriber
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
load_dotenv() load_dotenv()
@ -28,79 +69,468 @@ load_dotenv()
class StreamingConversation: class StreamingConversation:
def __init__( def __init__(
self, self,
input_device: BaseInputDevice,
output_device: BaseOutputDevice, output_device: BaseOutputDevice,
transcriber_config: TranscriberConfig, transcriber: BaseTranscriber,
agent_config: AgentConfig, agent: BaseAgent,
synthesizer_config: SynthesizerConfig, synthesizer: BaseSynthesizer,
id: str = None, conversation_id: str = None,
per_chunk_allowance_seconds: int = PER_CHUNK_ALLOWANCE_SECONDS,
logger: Optional[logging.Logger] = None,
): ):
self.id = id self.id = conversation_id or create_conversation_id()
self.input_device = input_device self.logger = logger or logging.getLogger(__name__)
self.output_device = output_device self.output_device = output_device
self.transcriber_config = transcriber_config self.transcriber = transcriber
self.agent_config = agent_config self.transcriber.set_on_response(self.on_transcription_response)
self.synthesizer_config = synthesizer_config self.transcriber_task = None
self.logger = logging.getLogger(__name__) self.agent = agent
self.receiver_ready = False self.synthesizer = synthesizer
self.active = True self.synthesizer_event_loop = asyncio.new_event_loop()
self.output_loop = asyncio.new_event_loop() self.synthesizer_thread = threading.Thread(
self.output_audio_queue = queue.Queue() name="synthesizer",
self.vocode_websocket_url = f"wss://{vocode.base_url}/conversation" target=create_loop_in_thread,
args=(self.synthesizer_event_loop,),
)
self.per_chunk_allowance_seconds = per_chunk_allowance_seconds
self.transcript = Transcript()
self.bot_sentiment = None
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
if isinstance(
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice,
bool,
):
self.track_bot_sentiment_config = TrackBotSentimentConfig()
else:
self.track_bot_sentiment_config = (
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice
)
self.bot_sentiment_analyser = BotSentimentAnalyser(
emotions=self.track_bot_sentiment_config.emotions
)
self.goodbye_model = GoodbyeModel()
async def wait_for_ready(self): self.is_human_speaking = False
while not self.receiver_ready:
await asyncio.sleep(0.1)
return True
def deactivate(self):
self.active = False self.active = False
self.current_synthesis_task = None
def play_audio(self): self.is_current_synthesis_interruptable = False
async def run(): self.stop_events: queue.Queue[threading.Event] = queue.Queue()
while self.active: self.last_action_timestamp = time.time()
try: self.check_for_idle_task = None
audio = self.output_audio_queue.get(timeout=5) self.track_bot_sentiment_task = None
await self.output_device.send_async(audio) self.should_wait_for_filler_audio_done_event = False
except queue.Empty: self.current_filler_audio_done_event: Optional[threading.Event] = None
continue self.current_filler_seconds_per_chunk: int = 0
self.current_transcription_is_interrupt: bool = False
loop = asyncio.new_event_loop()
loop.run_until_complete(run())
async def start(self): async def start(self):
async with websockets.connect( self.transcriber_task = asyncio.create_task(self.transcriber.run())
f"{self.vocode_websocket_url}?key={vocode.api_key}" is_ready = await self.transcriber.ready()
) as ws: if not is_ready:
raise Exception("Transcriber startup failed")
async def sender(ws: WebSocketClientProtocol): self.synthesizer_thread.start()
start_message = StartMessage( if self.agent.get_agent_config().send_filler_audio:
transcriber_config=self.transcriber_config, filler_audio_config = (
agent_config=self.agent_config, self.agent.get_agent_config().send_filler_audio
synthesizer_config=self.synthesizer_config, if isinstance(
conversation_id=self.id, self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
) )
await ws.send(start_message.json()) else FillerAudioConfig()
await self.wait_for_ready() )
self.logger.info("Listening...press Ctrl+C to stop") self.synthesizer.set_filler_audios(filler_audio_config)
while self.active: self.agent.start()
data = self.input_device.get_audio() if self.agent.get_agent_config().initial_message:
if data: self.transcript.add_bot_message(
try: self.agent.get_agent_config().initial_message.text
await ws.send(AudioMessage.from_bytes(data).json()) )
except ConnectionClosedOK: if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
self.deactivate() self.update_bot_sentiment()
self.send_message_to_stream_nonblocking(
self.agent.get_agent_config().initial_message, False
)
self.active = True
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
self.track_bot_sentiment_task = asyncio.create_task(
self.track_bot_sentiment()
)
self.check_for_idle_task = asyncio.create_task(self.check_for_idle())
async def check_for_idle(self):
while self.is_active():
if time.time() - self.last_action_timestamp > (
self.agent.get_agent_config().allowed_idle_time_seconds
or ALLOWED_IDLE_TIME
):
self.logger.debug("Conversation idle for too long, terminating")
self.mark_terminated()
return return
await asyncio.sleep(15)
async def track_bot_sentiment(self):
prev_transcript = None
while self.is_active():
await asyncio.sleep(1)
if self.transcript.to_string() != prev_transcript:
self.update_bot_sentiment()
prev_transcript = self.transcript.to_string()
def update_bot_sentiment(self):
new_bot_sentiment = self.bot_sentiment_analyser.analyse(
self.transcript.to_string()
)
if new_bot_sentiment.emotion:
self.logger.debug("Bot sentiment: %s", new_bot_sentiment)
self.bot_sentiment = new_bot_sentiment
def receive_audio(self, chunk: bytes):
self.transcriber.send_audio(chunk)
async def send_messages_to_stream_async(
self,
messages,
should_allow_human_to_cut_off_bot: bool,
wait_for_filler_audio: bool = False,
) -> tuple[str, bool]:
messages_queue = queue.Queue()
messages_done = threading.Event()
speech_cut_off = threading.Event()
seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
chunk_size = (
get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
* seconds_per_chunk
)
async def send_to_call():
response_buffer = ""
cut_off = False
self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
while True:
try:
message: BaseMessage = messages_queue.get_nowait()
except queue.Empty:
if messages_done.is_set():
break
else:
await asyncio.sleep(0) await asyncio.sleep(0)
await ws.send(StopMessage().json()) continue
async def receiver(ws: WebSocketClientProtocol): stop_event = self.enqueue_stop_event()
ReadyMessage.parse_raw(await ws.recv()) synthesis_result = self.synthesizer.create_speech(
self.receiver_ready = True message, chunk_size, bot_sentiment=self.bot_sentiment
async for msg in ws: )
audio_message = AudioMessage.parse_raw(msg) message_sent, cut_off = await self.send_speech_to_output(
self.output_audio_queue.put_nowait(audio_message.get_bytes()) message.text,
synthesis_result,
stop_event,
seconds_per_chunk,
)
self.logger.debug("Message sent: {}".format(message_sent))
response_buffer = f"{response_buffer} {message_sent}"
if cut_off:
speech_cut_off.set()
break
await asyncio.sleep(0)
if cut_off:
self.agent.update_last_bot_message_on_cut_off(response_buffer)
self.transcript.add_bot_message(response_buffer)
return response_buffer, cut_off
output_thread = threading.Thread(target=self.play_audio) asyncio.run_coroutine_threadsafe(send_to_call(), self.synthesizer_event_loop)
output_thread.start()
return await asyncio.gather(sender(ws), receiver(ws)) messages_generated = 0
for i, message in enumerate(messages):
messages_generated += 1
if i == 0:
if wait_for_filler_audio:
self.interrupt_all_synthesis()
self.wait_for_filler_audio_to_finish()
if speech_cut_off.is_set():
break
messages_queue.put_nowait(BaseMessage(text=message))
await asyncio.sleep(0)
if messages_generated == 0:
self.logger.debug("Agent generated no messages")
if wait_for_filler_audio:
self.interrupt_all_synthesis()
messages_done.set()
def send_message_to_stream_nonblocking(
self,
message: BaseMessage,
should_allow_human_to_cut_off_bot: bool,
):
asyncio.run_coroutine_threadsafe(
self.send_message_to_stream_async(
message,
self.agent.get_agent_config().allow_agent_to_be_cut_off,
),
self.synthesizer_event_loop,
)
async def send_message_to_stream_async(
self,
message: BaseMessage,
should_allow_human_to_cut_off_bot: bool,
) -> tuple[str, bool]:
self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
stop_event = self.enqueue_stop_event()
self.logger.debug("Synthesizing speech for message")
seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
chunk_size = (
get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
* seconds_per_chunk
)
synthesis_result = self.synthesizer.create_speech(
message, chunk_size, bot_sentiment=self.bot_sentiment
)
message_sent, cut_off = await self.send_speech_to_output(
message.text,
synthesis_result,
stop_event,
seconds_per_chunk,
)
self.logger.debug("Message sent: {}".format(message_sent))
if cut_off:
self.agent.update_last_bot_message_on_cut_off(message_sent)
self.transcript.add_bot_message(message_sent)
return message_sent, cut_off
def warmup_synthesizer(self):
self.synthesizer.ready_synthesizer()
# returns an estimate of what was sent up to, and a flag if the message was cut off
async def send_speech_to_output(
self,
message,
synthesis_result: SynthesisResult,
stop_event: threading.Event,
seconds_per_chunk: int,
is_filler_audio: bool = False,
):
message_sent = message
cut_off = False
chunk_size = seconds_per_chunk * get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
for i, chunk_result in enumerate(synthesis_result.chunk_generator):
start_time = time.time()
speech_length_seconds = seconds_per_chunk * (
len(chunk_result.chunk) / chunk_size
)
if stop_event.is_set():
seconds = i * seconds_per_chunk
self.logger.debug(
"Interrupted, stopping text to speech after {} chunks".format(i)
)
message_sent = f"{synthesis_result.get_message_up_to(seconds)}-"
cut_off = True
break
if i == 0:
if is_filler_audio:
self.should_wait_for_filler_audio_done_event = True
await self.output_device.send_async(chunk_result.chunk)
end_time = time.time()
await asyncio.sleep(
max(
speech_length_seconds
- (end_time - start_time)
- self.per_chunk_allowance_seconds,
0,
)
)
self.logger.debug(
"Sent chunk {} with size {}".format(i, len(chunk_result.chunk))
)
self.last_action_timestamp = time.time()
# clears it off the stop events queue
if not stop_event.is_set():
stop_event.set()
return message_sent, cut_off
async def on_transcription_response(self, transcription: Transcription):
self.last_action_timestamp = time.time()
if transcription.is_final:
self.logger.debug(
"Got transcription: {}, confidence: {}".format(
transcription.message, transcription.confidence
)
)
if not self.is_human_speaking:
# send interrupt
self.current_transcription_is_interrupt = False
if self.is_current_synthesis_interruptable:
self.logger.debug("sending interrupt")
self.current_transcription_is_interrupt = self.interrupt_all_synthesis()
self.logger.debug("Human started speaking")
transcription.is_interrupt = self.current_transcription_is_interrupt
self.is_human_speaking = not transcription.is_final
return await self.handle_transcription(transcription)
def enqueue_stop_event(self):
stop_event = threading.Event()
self.stop_events.put_nowait(stop_event)
return stop_event
def interrupt_all_synthesis(self):
"""Returns true if any synthesis was interrupted"""
num_interrupts = 0
while True:
try:
stop_event = self.stop_events.get_nowait()
if not stop_event.is_set():
self.logger.debug("Interrupting synthesis")
stop_event.set()
num_interrupts += 1
except queue.Empty:
break
return num_interrupts > 0
async def send_filler_audio_to_output(
self,
filler_audio: FillerAudio,
stop_event: threading.Event,
done_event: threading.Event,
):
filler_synthesis_result = filler_audio.create_synthesis_result()
self.is_current_synthesis_interruptable = filler_audio.is_interruptable
if isinstance(
self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
):
silence_threshold = (
self.agent.get_agent_config().send_filler_audio.silence_threshold_seconds
)
else:
silence_threshold = FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS
await asyncio.sleep(silence_threshold)
self.logger.debug("Sending filler audio to output")
await self.send_speech_to_output(
filler_audio.message.text,
filler_synthesis_result,
stop_event,
filler_audio.seconds_per_chunk,
is_filler_audio=True,
)
done_event.set()
def wait_for_filler_audio_to_finish(self):
if not self.should_wait_for_filler_audio_done_event:
self.logger.debug(
"Not waiting for filler audio to finish since we didn't send any chunks"
)
return
self.should_wait_for_filler_audio_done_event = False
if (
self.current_filler_audio_done_event
and not self.current_filler_audio_done_event.is_set()
):
self.logger.debug("Waiting for filler audio to finish")
# this should guarantee that filler audio finishes, since it has to be on its last chunk
if not self.current_filler_audio_done_event.wait(
self.current_filler_seconds_per_chunk
):
self.logger.debug("Filler audio did not finish")
async def handle_transcription(self, transcription: Transcription):
if transcription.is_final:
self.transcript.add_human_message(transcription.message)
goodbye_detected_task = None
if self.agent.get_agent_config().end_conversation_on_goodbye:
goodbye_detected_task = asyncio.create_task(
self.goodbye_model.is_goodbye(transcription.message)
)
if self.agent.get_agent_config().send_filler_audio:
self.logger.debug("Sending filler audio")
if self.synthesizer.filler_audios:
filler_audio = random.choice(self.synthesizer.filler_audios)
self.logger.debug(f"Chose {filler_audio.message.text}")
self.current_filler_audio_done_event = threading.Event()
self.current_filler_seconds_per_chunk = (
filler_audio.seconds_per_chunk
)
stop_event = self.enqueue_stop_event()
asyncio.run_coroutine_threadsafe(
self.send_filler_audio_to_output(
filler_audio,
stop_event,
done_event=self.current_filler_audio_done_event,
),
self.synthesizer_event_loop,
)
else:
self.logger.debug("No filler audio available for synthesizer")
self.logger.debug("Generating response for transcription")
if self.agent.get_agent_config().generate_responses:
responses = self.agent.generate_response(
transcription.message, is_interrupt=transcription.is_interrupt
)
await self.send_messages_to_stream_async(
responses,
self.agent.get_agent_config().allow_agent_to_be_cut_off,
wait_for_filler_audio=self.agent.get_agent_config().send_filler_audio,
)
else:
response, should_stop = self.agent.respond(
transcription.message, is_interrupt=transcription.is_interrupt
)
if self.agent.get_agent_config().send_filler_audio:
self.interrupt_all_synthesis()
self.wait_for_filler_audio_to_finish()
if should_stop:
self.logger.debug("Agent requested to stop")
self.mark_terminated()
return
if response:
self.send_message_to_stream_nonblocking(
BaseMessage(text=response),
self.agent.get_agent_config().allow_agent_to_be_cut_off,
)
else:
self.logger.debug("No response generated")
if goodbye_detected_task:
try:
goodbye_detected = await asyncio.wait_for(
goodbye_detected_task, 0.1
)
if goodbye_detected:
self.logger.debug("Goodbye detected, ending conversation")
self.mark_terminated()
return
except asyncio.TimeoutError:
self.logger.debug("Goodbye detection timed out")
def mark_terminated(self):
self.active = False
# must be called from the main thread
def terminate(self):
self.mark_terminated()
if self.check_for_idle_task:
self.logger.debug("Terminating check_for_idle Task")
self.check_for_idle_task.cancel()
if self.track_bot_sentiment_task:
self.logger.debug("Terminating track_bot_sentiment Task")
self.track_bot_sentiment_task.cancel()
self.logger.debug("Terminating agent")
self.agent.terminate()
self.logger.debug("Terminating speech transcriber")
self.transcriber.terminate()
self.logger.debug("Terminating synthesizer event loop")
self.synthesizer_event_loop.call_soon_threadsafe(
self.synthesizer_event_loop.stop
)
self.logger.debug("Terminating synthesizer thread")
if self.synthesizer_thread.is_alive():
self.synthesizer_thread.join()
self.logger.debug("Terminating transcriber task")
self.transcriber_task.cancel()
self.logger.debug("Successfully terminated")
def is_active(self):
return self.active

View file

@ -0,0 +1,250 @@
import logging
import os
import re
from typing import Any, Optional
from xml.etree import ElementTree
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.message import BaseMessage, SSMLMessage
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
SynthesisResult,
FILLER_PHRASES,
FILLER_AUDIO_PATH,
FillerAudio,
encode_as_wav,
)
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.models.audio_encoding import AudioEncoding
load_dotenv()  # pulls AZURE_SPEECH_KEY / AZURE_SPEECH_REGION from a local .env

# XML namespaces used when building SSML documents for Azure TTS.
# NOTE(review): the W3C SSML namespace is conventionally http:// (not
# https://) — confirm the Azure service accepts this https form.
NAMESPACES = {
    "mstts": "https://www.w3.org/2001/mstts",
    "": "https://www.w3.org/2001/10/synthesis",
}

# Register the namespaces so ElementTree serializes without ns0:/ns1: prefixes.
ElementTree.register_namespace("", NAMESPACES.get(""))
ElementTree.register_namespace("mstts", NAMESPACES.get("mstts"))
class WordBoundaryEventPool:
    """Collects word-boundary events fired during synthesis so that playback
    progress (in seconds) can later be mapped back to a position in the text.
    """

    def __init__(self):
        # Each entry: text, text_offset (chars into the SSML), audio_offset
        # (seconds into the audio), boundary_type.
        self.events = []

    def add(self, event):
        self.events.append(
            {
                "text": event.text,
                "text_offset": event.text_offset,
                # The SDK reports audio offsets in 100-nanosecond ticks;
                # convert to seconds (+5000 rounds to the nearest half-ms).
                "audio_offset": (event.audio_offset + 5000) / (10000 * 1000),
                # BUG FIX: key was misspelled "boudary_type" (never read back
                # anywhere, so the rename is safe).
                "boundary_type": event.boundary_type,
            }
        )

    def get_events_sorted(self):
        """Return events ordered by their position in the audio stream."""
        return sorted(self.events, key=lambda event: event["audio_offset"])
class AzureSynthesizer(BaseSynthesizer):
    """Speech synthesizer backed by Azure Cognitive Services TTS.

    Streams raw (headerless) audio chunks and records word-boundary events so
    playback progress can be mapped back to the source text.
    """

    # Milliseconds trimmed from the start of generated filler audio.
    OFFSET_MS = 100

    def __init__(
        self,
        synthesizer_config: AzureSynthesizerConfig,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__(synthesizer_config)
        self.synthesizer_config = synthesizer_config
        # Instantiates a client
        speech_config = speechsdk.SpeechConfig(
            subscription=os.environ.get("AZURE_SPEECH_KEY"),
            region=os.environ.get("AZURE_SPEECH_REGION"),
        )
        # Pick the raw output format matching the configured encoding/sample rate.
        # NOTE(review): the 44100/48000 branches are plain `if` while
        # 24000/16000/8000 form an if/elif chain — confirm the mixed chaining
        # is intentional.
        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
            if self.synthesizer_config.sampling_rate == 44100:
                speech_config.set_speech_synthesis_output_format(
                    speechsdk.SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm
                )
            if self.synthesizer_config.sampling_rate == 48000:
                speech_config.set_speech_synthesis_output_format(
                    speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm
                )
            if self.synthesizer_config.sampling_rate == 24000:
                speech_config.set_speech_synthesis_output_format(
                    speechsdk.SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm
                )
            elif self.synthesizer_config.sampling_rate == 16000:
                speech_config.set_speech_synthesis_output_format(
                    speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
                )
            elif self.synthesizer_config.sampling_rate == 8000:
                speech_config.set_speech_synthesis_output_format(
                    speechsdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm
                )
        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Raw8Khz8BitMonoMULaw
            )
        # audio_config=None: audio is pulled via streams, not played to a device.
        self.synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=None
        )
        self.voice_name = self.synthesizer_config.voice_name
        self.pitch = self.synthesizer_config.pitch
        self.rate = self.synthesizer_config.rate
        self.logger = logger or logging.getLogger(__name__)

    def get_phrase_filler_audios(self) -> list[FillerAudio]:
        """Synthesize (or load from the on-disk cache) audio for each filler phrase."""
        filler_phrase_audios = []
        for filler_phrase in FILLER_PHRASES:
            # The cache key captures every knob that affects the rendered audio.
            cache_key = "-".join(
                (
                    str(filler_phrase.text),
                    str(self.synthesizer_config.type),
                    str(self.synthesizer_config.audio_encoding),
                    str(self.synthesizer_config.sampling_rate),
                    str(self.voice_name),
                    str(self.pitch),
                    str(self.rate),
                )
            )
            filler_audio_path = os.path.join(FILLER_AUDIO_PATH, f"{cache_key}.bytes")
            if os.path.exists(filler_audio_path):
                audio_data = open(filler_audio_path, "rb").read()
            else:
                self.logger.debug(f"Generating filler audio for {filler_phrase.text}")
                ssml = self.create_ssml(filler_phrase.text)
                result = self.synthesizer.speak_ssml(ssml)
                # Trim OFFSET_MS worth of leading samples (skips initial silence).
                offset = self.synthesizer_config.sampling_rate * self.OFFSET_MS // 1000
                audio_data = result.audio_data[offset:]
                with open(filler_audio_path, "wb") as f:
                    f.write(audio_data)
            filler_phrase_audios.append(
                FillerAudio(
                    filler_phrase,
                    audio_data,
                    self.synthesizer_config,
                )
            )
        return filler_phrase_audios

    def add_marks(self, message: str, index=0) -> str:
        """Insert an SSML <mark/> tag before each punctuation run in `message`."""
        search_result = re.search(r"([\.\,\:\;\-\—]+)", message)
        if search_result is None:
            return message
        start, end = search_result.span()
        with_mark = message[:start] + f'<mark name="{index}" />' + message[start:end]
        rest = message[end:]
        # Drop trailing punctuation from the remainder before recursing.
        rest_stripped = re.sub(r"^(.+)([\.\,\:\;\-\—]+)$", r"\1", rest)
        if len(rest_stripped) == 0:
            return with_mark
        return with_mark + self.add_marks(rest_stripped, index + 1)

    def word_boundary_cb(self, evt, pool):
        # Callback connected to the SDK's synthesis_word_boundary signal.
        pool.add(evt)

    def create_ssml(
        self, message: str, bot_sentiment: Optional[BotSentiment] = None
    ) -> str:
        """Build an SSML document for `message` with voice, prosody, and
        (optionally) an Azure express-as emotion style applied."""
        ssml_root = ElementTree.fromstring(
            '<speak version="1.0" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="en-US"></speak>'
        )
        voice = ElementTree.SubElement(ssml_root, "voice")
        voice.set("name", self.voice_name)
        voice_root = voice
        if bot_sentiment and bot_sentiment.emotion:
            styled = ElementTree.SubElement(
                voice, "{%s}express-as" % NAMESPACES.get("mstts")
            )
            styled.set("style", bot_sentiment.emotion)
            styled.set(
                "styledegree", str(bot_sentiment.degree * 2)
            )  # Azure specific, it's a scale of 0-2
            voice_root = styled
        prosody = ElementTree.SubElement(voice_root, "prosody")
        prosody.set("pitch", f"{self.pitch}%")
        prosody.set("rate", f"{self.rate}%")
        prosody.text = message.strip()
        return ElementTree.tostring(ssml_root, encoding="unicode")

    def synthesize_ssml(self, ssml: str) -> speechsdk.AudioDataStream:
        # Kick off asynchronous synthesis and wrap the result in a pull stream.
        # (Return annotation fixed: this returns only the stream, not a tuple.)
        result = self.synthesizer.start_speaking_ssml_async(ssml).get()
        return speechsdk.AudioDataStream(result)

    def ready_synthesizer(self):
        # Pre-open the service connection to reduce first-chunk latency.
        connection = speechsdk.Connection.from_speech_synthesizer(self.synthesizer)
        connection.open(True)

    # given the number of seconds the message was allowed to go until, where did we get in the message?
    def get_message_up_to(
        self,
        message: str,
        ssml: str,
        seconds: int,
        word_boundary_event_pool: WordBoundaryEventPool,
    ) -> str:
        events = word_boundary_event_pool.get_events_sorted()
        for event in events:
            if event["audio_offset"] > seconds:
                # Cut the SSML at the first word past the cutoff, then strip
                # everything up to the last closing tag.
                ssml_fragment = ssml[: event["text_offset"]]
                return ssml_fragment.split(">")[-1]
        return message

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Stream synthesized audio for `message` in `chunk_size` pieces."""
        # offset = int(self.OFFSET_MS * (self.synthesizer_config.sampling_rate / 1000))
        offset = 0  # leading-trim currently disabled (see commented line above)
        self.logger.debug(f"Synthesizing message: {message}")

        def chunk_generator(
            audio_data_stream: speechsdk.AudioDataStream, chunk_transform=lambda x: x
        ):
            # A read shorter than chunk_size marks the end of the stream.
            audio_buffer = bytes(chunk_size)
            filled_size = audio_data_stream.read_data(audio_buffer)
            if filled_size != chunk_size:
                yield SynthesisResult.ChunkResult(
                    chunk_transform(audio_buffer[offset:]), True
                )
                return
            else:
                yield SynthesisResult.ChunkResult(
                    chunk_transform(audio_buffer[offset:]), False
                )
            while True:
                filled_size = audio_data_stream.read_data(audio_buffer)
                if filled_size != chunk_size:
                    yield SynthesisResult.ChunkResult(
                        chunk_transform(audio_buffer[: filled_size - offset]), True
                    )
                    break
                yield SynthesisResult.ChunkResult(chunk_transform(audio_buffer), False)

        # Capture word boundaries so get_message_up_to can report progress.
        word_boundary_event_pool = WordBoundaryEventPool()
        self.synthesizer.synthesis_word_boundary.connect(
            lambda event: self.word_boundary_cb(event, word_boundary_event_pool)
        )
        ssml = (
            message.ssml
            if isinstance(message, SSMLMessage)
            else self.create_ssml(message.text, bot_sentiment=bot_sentiment)
        )
        audio_data_stream = self.synthesize_ssml(ssml)
        if self.synthesizer_config.should_encode_as_wav:
            output_generator = chunk_generator(
                audio_data_stream,
                lambda chunk: encode_as_wav(chunk, self.synthesizer_config),
            )
        else:
            output_generator = chunk_generator(audio_data_stream)
        return SynthesisResult(
            output_generator,
            lambda seconds: self.get_message_up_to(
                message, ssml, seconds, word_boundary_event_pool
            ),
        )

View file

@ -0,0 +1,169 @@
import os
from typing import Any, Generator, Callable, Optional
import math
import io
import wave
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.agent import FillerAudioConfig
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.utils import convert_wav, get_chunk_size_per_second
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.models.synthesizer import SynthesizerConfig
# Short acknowledgement phrases pre-synthesized as "filler" audio to play
# while the agent is still generating its real response.
FILLER_PHRASES = [
    BaseMessage(text="Um..."),
    BaseMessage(text="Uh..."),
    BaseMessage(text="Uh-huh..."),
    BaseMessage(text="Mm-hmm..."),
    BaseMessage(text="Hmm..."),
    BaseMessage(text="Okay..."),
    BaseMessage(text="Right..."),
    BaseMessage(text="Let me see..."),
]
# Directory where generated filler audio is cached between runs.
FILLER_AUDIO_PATH = os.path.join(os.path.dirname(__file__), "filler_audio")
TYPING_NOISE_PATH = "%s/typing-noise.wav" % FILLER_AUDIO_PATH
def encode_as_wav(chunk: bytes, synthesizer_config: SynthesizerConfig) -> bytes:
    """Wrap a raw 16-bit linear PCM chunk in a mono WAV container.

    Raises AssertionError if the config's encoding is not LINEAR16, since the
    fixed 2-byte sample width below only describes that encoding.
    """
    # Validate before touching the writer.
    assert synthesizer_config.audio_encoding == AudioEncoding.LINEAR16
    output_bytes_io = io.BytesIO()
    # Context manager guarantees the writer is closed (and the header
    # finalized) even if a write fails.
    with wave.open(output_bytes_io, "wb") as in_memory_wav:
        in_memory_wav.setnchannels(1)
        in_memory_wav.setsampwidth(2)  # 16-bit samples
        in_memory_wav.setframerate(synthesizer_config.sampling_rate)
        in_memory_wav.writeframes(chunk)
    output_bytes_io.seek(0)
    return output_bytes_io.read()
class SynthesisResult:
    """Streamed output of synthesizing a single message.

    Pairs a generator of audio chunks with a callback that, given a number of
    seconds of playback, reports how much of the message text was spoken.
    """

    class ChunkResult:
        """One audio chunk plus a flag marking the end of the stream."""

        def __init__(self, chunk: bytes, is_last_chunk: bool):
            self.chunk, self.is_last_chunk = chunk, is_last_chunk

    def __init__(
        self,
        chunk_generator: Generator[ChunkResult, None, None],
        get_message_up_to: Callable[[int], str],
    ):
        # Stored as-is; consumers iterate chunk_generator and call
        # get_message_up_to(seconds) after playback stops.
        self.chunk_generator = chunk_generator
        self.get_message_up_to = get_message_up_to
class FillerAudio:
    """Pre-rendered audio (e.g. "Um...") played while the agent is thinking."""

    def __init__(
        self,
        message: BaseMessage,
        audio_data: bytes,
        synthesizer_config: SynthesizerConfig,
        is_interruptable: bool = False,
        seconds_per_chunk: int = 1,
    ):
        self.message = message
        self.audio_data = audio_data
        self.synthesizer_config = synthesizer_config
        self.is_interruptable = is_interruptable
        self.seconds_per_chunk = seconds_per_chunk

    def create_synthesis_result(self) -> SynthesisResult:
        """Wrap the cached audio in a SynthesisResult, chunked for streaming."""
        chunk_size = (
            get_chunk_size_per_second(
                self.synthesizer_config.audio_encoding,
                self.synthesizer_config.sampling_rate,
            )
            * self.seconds_per_chunk
        )

        def chunk_generator(chunk_transform=lambda x: x):
            audio = self.audio_data
            for i in range(0, len(audio), chunk_size):
                # BUG FIX: mark the final chunk even when len(audio) is an
                # exact multiple of chunk_size — previously only short tails
                # were flagged, so consumers could never see is_last_chunk.
                yield SynthesisResult.ChunkResult(
                    chunk_transform(audio[i : i + chunk_size]),
                    i + chunk_size >= len(audio),
                )

        if self.synthesizer_config.should_encode_as_wav:
            output_generator = chunk_generator(
                lambda chunk: encode_as_wav(chunk, self.synthesizer_config)
            )
        else:
            output_generator = chunk_generator()
        # Filler audio is static, so any cutoff maps to the whole phrase text.
        return SynthesisResult(output_generator, lambda seconds: self.message.text)
class BaseSynthesizer:
    """Common interface and shared helpers for all speech synthesizers."""

    def __init__(self, synthesizer_config: SynthesizerConfig):
        self.synthesizer_config = synthesizer_config
        if synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            assert (
                synthesizer_config.sampling_rate == 8000
            ), "MuLaw encoding only supports 8kHz sampling rate"
        # Populated lazily by set_filler_audios().
        self.filler_audios: list[FillerAudio] = []

    def get_synthesizer_config(self) -> SynthesizerConfig:
        return self.synthesizer_config

    def get_typing_noise_filler_audio(self) -> FillerAudio:
        """Return a keyboard-noise filler converted to the configured format."""
        return FillerAudio(
            message=BaseMessage(text="<typing noise>"),
            audio_data=convert_wav(
                TYPING_NOISE_PATH,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=self.synthesizer_config.audio_encoding,
            ),
            synthesizer_config=self.synthesizer_config,
            is_interruptable=True,
            seconds_per_chunk=2,
        )

    def set_filler_audios(self, filler_audio_config: FillerAudioConfig):
        """Load filler audio per config: spoken phrases or typing noise."""
        if filler_audio_config.use_phrases:
            self.filler_audios = self.get_phrase_filler_audios()
        elif filler_audio_config.use_typing_noise:
            self.filler_audios = [self.get_typing_noise_filler_audio()]

    def get_phrase_filler_audios(self) -> list[FillerAudio]:
        # Subclasses that support spoken fillers override this.
        return []

    def ready_synthesizer(self):
        # Hook for warming up connections; no-op by default.
        pass

    # given the number of seconds the message was allowed to go until, where did we get in the message?
    def get_message_cutoff_from_total_response_length(
        self, message: BaseMessage, seconds: int, size_of_output: int
    ) -> str:
        """Estimate the spoken prefix of `message`, assuming uniform
        seconds-per-character across the synthesized audio."""
        # BUG FIX: empty text or zero-length audio previously raised
        # ZeroDivisionError; there is nothing to cut off in either case.
        if not message.text or not size_of_output:
            return message.text
        estimated_output_seconds = (
            size_of_output / self.synthesizer_config.sampling_rate
        )
        estimated_output_seconds_per_char = estimated_output_seconds / len(message.text)
        return message.text[: int(seconds / estimated_output_seconds_per_char)]

    def get_message_cutoff_from_voice_speed(
        self, message: BaseMessage, seconds: int, words_per_minute: int
    ) -> str:
        """Estimate the spoken prefix of `message` from an assumed speaking rate."""
        words_per_second = words_per_minute / 60
        estimated_words_spoken = math.floor(words_per_second * seconds)
        tokens = word_tokenize(message.text)
        return TreebankWordDetokenizer().detokenize(tokens[:estimated_words_spoken])

    def get_maybe_cached_synthesis_result(
        self, message: BaseMessage, chunk_size: int
    ) -> Optional[SynthesisResult]:
        # No caching by default.
        return None

    # returns a chunk generator and a thunk that can tell you what part of the message was read given the number of seconds spoken
    # chunk generator must return tuple (bytes of size chunk_size, flag if it is the last chunk)
    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        raise NotImplementedError

View file

@ -0,0 +1,50 @@
from typing import Any, Optional
import os
from dotenv import load_dotenv
import requests
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
SynthesisResult,
)
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.message import BaseMessage
load_dotenv()  # pulls ELEVEN_LABS_API_KEY from a local .env
ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY")
ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
# Stock ElevenLabs voice IDs; ADAM is the default voice.
ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C"
class ElevenLabsSynthesizer(BaseSynthesizer):
    """Synthesizes speech via the ElevenLabs streaming text-to-speech API."""

    def __init__(self, config: ElevenLabsSynthesizerConfig):
        super().__init__(config)
        self.api_key = config.api_key
        self.voice_id = config.voice_id or ADAM_VOICE_ID
        # Rough speaking speed, usable for message-cutoff estimates.
        self.words_per_minute = 150

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Request streamed audio for `message` and yield it in `chunk_size` pieces."""
        # BUG FIX: validate the config BEFORE spending a network round-trip
        # (this assert previously ran after the request was already made).
        assert (
            not self.synthesizer_config.should_encode_as_wav
        ), "ElevenLabs does not support WAV encoding"
        url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}/stream"
        headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id}
        body = {
            "text": message.text,
        }
        response = requests.post(url, headers=headers, json=body)
        # BUG FIX: surface API errors instead of streaming an error payload
        # back to the caller as if it were audio.
        assert response.ok, response.text

        def chunk_generator(response):
            for chunk in response.iter_content(chunk_size=chunk_size):
                # NOTE(review): a short chunk flags end-of-stream; if the
                # content length is an exact multiple of chunk_size no chunk
                # is ever flagged — confirm consumers tolerate this.
                yield SynthesisResult.ChunkResult(chunk, len(chunk) != chunk_size)

        return SynthesisResult(chunk_generator(response), lambda seconds: message.text)

View file

@ -0,0 +1,110 @@
import io
import wave
from typing import Any, Optional
from dotenv import load_dotenv
from google.cloud import texttospeech_v1beta1 as tts
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
SynthesisResult,
encode_as_wav,
)
from vocode.streaming.models.synthesizer import GoogleSynthesizerConfig
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.utils import convert_wav
load_dotenv()
class GoogleSynthesizer(BaseSynthesizer):
    """Synthesizes speech via Google Cloud Text-to-Speech, then converts the
    result to the configured output sample rate and encoding."""

    # Seconds trimmed from both ends of the returned audio (drops padding).
    OFFSET_SECONDS = 0.5

    def __init__(self, synthesizer_config: GoogleSynthesizerConfig):
        super().__init__(synthesizer_config)
        # Instantiates a client
        self.client = tts.TextToSpeechClient()
        # Build the voice request, select the language code ("en-US") and the ssml
        # voice gender ("neutral")
        self.voice = tts.VoiceSelectionParams(
            language_code="en-US", name="en-US-Neural2-I"
        )
        # Select the type of audio file you want returned
        self.audio_config = tts.AudioConfig(
            audio_encoding=tts.AudioEncoding.LINEAR16,
            sample_rate_hertz=24000,
            speaking_rate=1.2,
            pitch=0,
            effects_profile_id=["telephony-class-application"],
        )

    def synthesize(self, message: str) -> tts.SynthesizeSpeechResponse:
        """Perform a single blocking TTS request for `message`."""
        synthesis_input = tts.SynthesisInput(text=message)
        # Perform the text-to-speech request on the text input with the selected
        # voice parameters and audio file type
        return self.client.synthesize_speech(
            request=tts.SynthesizeSpeechRequest(
                input=synthesis_input,
                voice=self.voice,
                audio_config=self.audio_config,
                enable_time_pointing=[
                    tts.SynthesizeSpeechRequest.TimepointType.SSML_MARK
                ],
            )
        )

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Synthesize `message` and stream it as `chunk_size` pieces."""
        response = self.synthesize(message.text)
        output_sample_rate = response.audio_config.sample_rate_hertz
        real_offset = int(GoogleSynthesizer.OFFSET_SECONDS * output_sample_rate)
        # Re-wrap the trimmed PCM in a WAV container so convert_wav can parse it.
        output_bytes_io = io.BytesIO()
        with wave.open(output_bytes_io, "wb") as in_memory_wav:
            in_memory_wav.setnchannels(1)
            in_memory_wav.setsampwidth(2)  # 16-bit PCM
            in_memory_wav.setframerate(output_sample_rate)
            in_memory_wav.writeframes(response.audio_content[real_offset:-real_offset])
        output_bytes_io.seek(0)
        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
            output_bytes = convert_wav(
                output_bytes_io,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.LINEAR16,
            )
        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            output_bytes = convert_wav(
                output_bytes_io,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.MULAW,
            )
        else:
            # Previously fell through with output_bytes unbound (NameError).
            raise ValueError(
                f"Unsupported audio encoding: {self.synthesizer_config.audio_encoding}"
            )
        if self.synthesizer_config.should_encode_as_wav:
            # BUG FIX: encode_as_wav requires the synthesizer config as its
            # second argument (was called with only one, raising TypeError).
            output_bytes = encode_as_wav(output_bytes, self.synthesizer_config)

        def chunk_generator(output_bytes):
            for i in range(0, len(output_bytes), chunk_size):
                if i + chunk_size > len(output_bytes):
                    yield SynthesisResult.ChunkResult(output_bytes[i:], True)
                else:
                    yield SynthesisResult.ChunkResult(
                        output_bytes[i : i + chunk_size], False
                    )

        return SynthesisResult(
            chunk_generator(output_bytes),
            lambda seconds: self.get_message_cutoff_from_total_response_length(
                message, seconds, len(output_bytes)
            ),
        )

View file

@ -0,0 +1,78 @@
import audioop
import base64
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.models.message import BaseMessage
from .base_synthesizer import BaseSynthesizer, SynthesisResult, encode_as_wav
from typing import Any, Optional
import os
import io
import wave
from dotenv import load_dotenv
import requests
from ..utils import convert_linear_audio, convert_wav
from ..models.synthesizer import ElevenLabsSynthesizerConfig, RimeSynthesizerConfig
load_dotenv()  # pulls Rime credentials/endpoint from a local .env
RIME_API_KEY = os.getenv("RIME_API_KEY")
RIME_BASE_URL = os.getenv("RIME_BASE_URL")
class RimeSynthesizer(BaseSynthesizer):
    """Synthesizes speech via the Rime HTTP API (which returns base64 WAV)."""

    def __init__(self, config: RimeSynthesizerConfig):
        super().__init__(config)
        self.speaker = config.speaker

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Request audio for `message` from Rime and stream it in `chunk_size` pieces."""
        url = RIME_BASE_URL
        headers = {"Authorization": f"Bearer {RIME_API_KEY}"}
        body = {"inputs": {"text": message.text, "speaker": self.speaker}}
        response = requests.post(url, headers=headers, json=body)
        assert response.ok, response.text
        data = response.json().get("data")
        assert data
        audio_file = io.BytesIO(base64.b64decode(data))
        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
            output_bytes = convert_wav(
                audio_file,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.LINEAR16,
            )
        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            output_bytes = convert_wav(
                audio_file,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.MULAW,
            )
        else:
            # Previously fell through with output_bytes unbound (NameError).
            raise ValueError(
                f"Unsupported audio encoding: {self.synthesizer_config.audio_encoding}"
            )

        def chunk_generator(audio, chunk_transform=lambda x: x):
            # A short final chunk carries the end-of-stream flag.
            for i in range(0, len(audio), chunk_size):
                chunk = audio[i : i + chunk_size]
                yield SynthesisResult.ChunkResult(
                    chunk_transform(chunk), len(chunk) != chunk_size
                )

        if self.synthesizer_config.should_encode_as_wav:
            # BUG FIX: encode_as_wav takes (chunk, synthesizer_config); passing
            # it bare as chunk_transform raised TypeError on the first chunk.
            output_generator = chunk_generator(
                output_bytes,
                chunk_transform=lambda chunk: encode_as_wav(
                    chunk, self.synthesizer_config
                ),
            )
        else:
            output_generator = chunk_generator(output_bytes)
        return SynthesisResult(
            output_generator,
            lambda seconds: self.get_message_cutoff_from_total_response_length(
                message, seconds, len(output_bytes)
            ),
        )
View file

View file

@ -0,0 +1,17 @@
import logging
import os
from typing import Optional
from redis import Redis
from vocode.streaming.models.telephony import CallConfig
class BaseConfigManager:
    """Abstract store mapping a conversation id to its CallConfig.

    Subclasses provide the persistence backend (e.g. Redis).
    """

    def save_config(self, conversation_id: str, config: CallConfig):
        """Persist `config` under `conversation_id`."""
        raise NotImplementedError

    def get_config(self, conversation_id) -> Optional[CallConfig]:
        """Return the stored CallConfig, or None if absent."""
        raise NotImplementedError

    def delete_config(self, conversation_id):
        """Remove any stored config for `conversation_id`."""
        raise NotImplementedError

View file

@ -0,0 +1,34 @@
import logging
import os
from typing import Optional
from redis import Redis
from vocode.streaming.models.telephony import CallConfig
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
class RedisConfigManager(BaseConfigManager):
    """Config manager that persists CallConfigs in Redis, keyed by conversation id."""

    def __init__(self, logger: Optional[logging.Logger] = None):
        # Connection details come from REDISHOST/REDISPORT (default localhost:6379).
        self.redis = Redis(
            host=os.environ.get("REDISHOST", "localhost"),
            port=int(os.environ.get("REDISPORT", 6379)),
            db=0,
            decode_responses=True,
        )
        self.logger = logger or logging.getLogger(__name__)

    def save_config(self, conversation_id: str, config: CallConfig):
        self.logger.debug(f"Saving config for {conversation_id}")
        self.redis.set(conversation_id, config.json())

    def get_config(self, conversation_id) -> Optional[CallConfig]:
        self.logger.debug(f"Getting config for {conversation_id}")
        raw_config = self.redis.get(conversation_id)
        if raw_config:
            # BUG FIX: reuse the value already fetched instead of issuing a
            # second (racy, redundant) Redis round-trip.
            return CallConfig.parse_raw(raw_config)
        return None

    def delete_config(self, conversation_id):
        self.logger.debug(f"Deleting config for {conversation_id}")
        self.redis.delete(conversation_id)

View file

@ -0,0 +1,5 @@
from vocode.streaming.models.audio_encoding import AudioEncoding
# Defaults matching Twilio media streams: 8 kHz mu-law audio.
DEFAULT_SAMPLING_RATE = 8000
DEFAULT_AUDIO_ENCODING = AudioEncoding.MULAW
# 20 frames of 160 bytes; at 8 kHz mu-law (1 byte/sample) this is 0.4s of audio.
DEFAULT_CHUNK_SIZE = 20 * 160

View file

@ -0,0 +1,170 @@
from fastapi import WebSocket
import base64
from enum import Enum
import json
import logging
from typing import Optional
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.factory import (
create_agent,
create_synthesizer,
create_transcriber,
)
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.streaming.models.telephony import CallConfig, TwilioConfig
from vocode.streaming.output_device.twilio_output_device import TwilioOutputDevice
from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
)
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
)
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
from vocode.streaming.telephony.twilio import create_twilio_client
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
class PhoneCallAction(Enum):
    """Control-flow signal returned by the websocket message handlers."""

    # Tells the receive loop to break and tear the call down.
    CLOSE_WEBSOCKET = 1
class Call(StreamingConversation):
    """A StreamingConversation bound to a live Twilio phone call, fed by a
    Twilio media-stream websocket (8 kHz mu-law audio both ways)."""

    def __init__(
        self,
        base_url: str,
        config_manager: BaseConfigManager,
        agent: BaseAgent,
        twilio_config: TwilioConfig,
        transcriber: Optional[BaseTranscriber] = None,
        synthesizer: Optional[BaseSynthesizer] = None,
        twilio_sid=None,
        conversation_id: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.base_url = base_url
        self.config_manager = config_manager
        self.output_device = TwilioOutputDevice()
        self.twilio_config = twilio_config
        self.twilio_client = create_twilio_client(twilio_config)
        # Defaults match Twilio media streams: 8kHz mu-law in/out.
        super().__init__(
            self.output_device,
            transcriber
            or DeepgramTranscriber(
                DeepgramTranscriberConfig(
                    sampling_rate=8000,
                    audio_encoding=AudioEncoding.MULAW,
                    # NOTE(review): CHUNK_SIZE is presumably defined on
                    # StreamingConversation — confirm.
                    chunk_size=self.CHUNK_SIZE,
                    model="voicemail",
                    endpointing_config=PunctuationEndpointingConfig(),
                ),
                logger=logger,
            ),
            agent,
            synthesizer
            or AzureSynthesizer(
                AzureSynthesizerConfig(
                    sampling_rate=8000, audio_encoding=AudioEncoding.MULAW
                )
            ),
            conversation_id=conversation_id,
            per_chunk_allowance_seconds=0.01,
            logger=logger,
        )
        self.twilio_sid = twilio_sid
        # Timestamp (ms) of the most recent media frame; used to detect gaps.
        self.latest_media_timestamp = 0

    @staticmethod
    def from_call_config(
        base_url: str,
        call_config: CallConfig,
        config_manager: BaseConfigManager,
        conversation_id: str,
        logger: logging.Logger,
    ):
        """Rehydrate a Call from a CallConfig persisted by the config manager."""
        return Call(
            base_url=base_url,
            logger=logger,
            config_manager=config_manager,
            agent=create_agent(call_config.agent_config),
            transcriber=create_transcriber(call_config.transcriber_config),
            synthesizer=create_synthesizer(call_config.synthesizer_config),
            twilio_config=call_config.twilio_config,
            twilio_sid=call_config.twilio_sid,
            conversation_id=conversation_id,
        )

    async def attach_ws_and_start(self, ws: WebSocket):
        """Bind the Twilio media-stream websocket and run the conversation
        until the call ends, then tear everything down."""
        self.logger.debug("Trying to attach WS to outbound call")
        self.output_device.ws = ws
        self.logger.debug("Attached WS to outbound call")
        twilio_call = self.twilio_client.calls(self.twilio_sid).fetch()
        # Hang up immediately if answering-machine detection flagged the call.
        if twilio_call.answered_by in ("machine_start", "fax"):
            self.logger.info(f"Call answered by {twilio_call.answered_by}")
            twilio_call.update(status="completed")
        else:
            await self.wait_for_twilio_start(ws)
            await super().start()
            while self.active:
                message = await ws.receive_text()
                response = await self.handle_ws_message(message)
                if response == PhoneCallAction.CLOSE_WEBSOCKET:
                    break
        self.tear_down()

    async def wait_for_twilio_start(self, ws: WebSocket):
        """Consume websocket messages until Twilio's 'start' event arrives,
        recording the stream SID needed to send audio back."""
        while True:
            message = await ws.receive_text()
            if not message:
                continue
            data = json.loads(message)
            if data["event"] == "start":
                self.logger.debug(
                    f"Media WS: Received event '{data['event']}': {message}"
                )
                self.output_device.stream_sid = data["start"]["streamSid"]
                break

    async def handle_ws_message(self, message) -> Optional[PhoneCallAction]:
        """Process one media-stream message.

        Returns CLOSE_WEBSOCKET on a 'stop' event or a None message;
        returns None (keep going) for 'media' events.
        """
        if message is None:
            return PhoneCallAction.CLOSE_WEBSOCKET
        data = json.loads(message)
        if data["event"] == "media":
            media = data["media"]
            chunk = base64.b64decode(media["payload"])
            # Frames carry millisecond timestamps; if more than one 20ms frame
            # was dropped, pad the gap with silence (8 bytes/ms at 8kHz mu-law).
            if self.latest_media_timestamp + 20 < int(media["timestamp"]):
                bytes_to_fill = 8 * (
                    int(media["timestamp"]) - (self.latest_media_timestamp + 20)
                )
                self.logger.debug(f"Filling {bytes_to_fill} bytes of silence")
                # NOTE: 0xff is silence for mulaw audio
                self.receive_audio(b"\xff" * bytes_to_fill)
            self.latest_media_timestamp = int(media["timestamp"])
            self.receive_audio(chunk)
        elif data["event"] == "stop":
            self.logger.debug(f"Media WS: Received event 'stop': {message}")
            self.logger.debug("Stopping...")
            return PhoneCallAction.CLOSE_WEBSOCKET

    def end_twilio_call(self) -> bool:
        """Ask Twilio to complete (hang up) the call; True if it reports completed."""
        response = self.twilio_client.calls(self.twilio_sid).update(status="completed")
        return response.status == "completed"

    def mark_terminated(self):
        # Besides flagging the conversation inactive, hang up the Twilio call
        # and drop the persisted config (self.id presumably comes from
        # StreamingConversation — confirm).
        super().mark_terminated()
        self.end_twilio_call()
        self.config_manager.delete_config(self.id)

    def tear_down(self):
        self.terminate()

View file

@ -0,0 +1,110 @@
import logging
from typing import Optional
from twilio.rest import Client
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
SynthesizerConfig,
)
from vocode.streaming.models.telephony import CallConfig, TwilioConfig
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
TranscriberConfig,
)
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
from vocode.streaming.telephony.constants import (
DEFAULT_AUDIO_ENCODING,
DEFAULT_CHUNK_SIZE,
DEFAULT_SAMPLING_RATE,
)
from vocode.streaming.telephony.twilio import create_twilio_client
from vocode.streaming.utils import create_conversation_id
class OutboundCall:
    """Creates and manages a Twilio outbound phone call driven by a Vocode agent."""

    def __init__(
        self,
        base_url: str,
        to_phone: str,
        from_phone: str,
        config_manager: BaseConfigManager,
        agent_config: AgentConfig,
        twilio_config: TwilioConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.base_url = base_url
        self.to_phone = to_phone
        self.from_phone = from_phone
        self.config_manager = config_manager
        self.agent_config = agent_config
        # Defaults match Twilio media streams (8kHz mu-law).
        self.transcriber_config = transcriber_config or DeepgramTranscriberConfig(
            sampling_rate=DEFAULT_SAMPLING_RATE,
            audio_encoding=DEFAULT_AUDIO_ENCODING,
            chunk_size=DEFAULT_CHUNK_SIZE,
            model="voicemail",
            endpointing_config=PunctuationEndpointingConfig(),
        )
        self.synthesizer_config = synthesizer_config or AzureSynthesizerConfig(
            sampling_rate=DEFAULT_SAMPLING_RATE, audio_encoding=DEFAULT_AUDIO_ENCODING
        )
        self.conversation_id = conversation_id or create_conversation_id()
        # BUG FIX: fall back to a module logger; previously a None logger
        # crashed start() at self.logger.debug(...).
        self.logger = logger or logging.getLogger(__name__)
        self.twilio_config = twilio_config
        self.twilio_client = create_twilio_client(twilio_config)
        # Set by start() once the call has been placed.
        self.twilio_sid = None

    def create_twilio_call(
        self, to_phone: str, from_phone: str, digits: str = ""
    ) -> str:
        """Place the call via Twilio, pointing it at our TwiML endpoint.

        Returns the Twilio call SID; `digits` are DTMF tones sent after connect.
        """
        twilio_call = self.twilio_client.calls.create(
            url=f"https://{self.base_url}/twiml/initiate_call/{self.conversation_id}",
            to=to_phone,
            from_=from_phone,
            send_digits=digits,
        )
        return twilio_call.sid

    def validate_outbound_call(
        self,
        to_phone: str,
        from_phone: str,
        mobile_only: bool = True,
    ):
        """Sanity-check the destination number; with mobile_only, use Twilio
        Lookup line-type intelligence to require a mobile line.

        Raises ValueError when the number is invalid or not a mobile phone.
        """
        if len(to_phone) < 8:
            raise ValueError("Invalid 'to' phone")
        if not mobile_only:
            return
        line_type_intelligence = (
            self.twilio_client.lookups.v2.phone_numbers(to_phone)
            .fetch(fields="line_type_intelligence")
            .line_type_intelligence
        )
        if not line_type_intelligence or (
            line_type_intelligence and line_type_intelligence["type"] != "mobile"
        ):
            raise ValueError("Can only call mobile phones")

    def start(self):
        """Validate, place the Twilio call, and persist the CallConfig so the
        webhook handler can rehydrate the conversation."""
        self.logger.debug("Starting outbound call")
        self.validate_outbound_call(self.to_phone, self.from_phone)
        self.twilio_sid = self.create_twilio_call(self.to_phone, self.from_phone)
        call_config = CallConfig(
            transcriber_config=self.transcriber_config,
            agent_config=self.agent_config,
            synthesizer_config=self.synthesizer_config,
            twilio_config=self.twilio_config,
            twilio_sid=self.twilio_sid,
        )
        self.config_manager.save_config(self.conversation_id, call_config)

    def end(self):
        """Hang up the active call; True when Twilio reports it completed."""
        response = self.twilio_client.calls(self.twilio_sid).update(status="completed")
        return response.status == "completed"

View file

@ -0,0 +1,73 @@
import logging
from typing import Optional
from twilio.rest import Client
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.telephony import CallConfig, TwilioConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
from vocode.streaming.telephony.conversation.outbound_call import OutboundCall
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
from vocode.streaming.utils import create_conversation_id
class ZoomDialIn(OutboundCall):
    """An OutboundCall that dials into a Zoom meeting via its phone bridge,
    entering the meeting id (and optional password) as DTMF digits."""

    def __init__(
        self,
        base_url: str,
        zoom_number: str,
        zoom_meeting_id: str,
        zoom_meeting_password: Optional[str],
        from_phone: str,
        config_manager: BaseConfigManager,
        twilio_config: TwilioConfig,
        agent_config: AgentConfig,
        transcriber_config: TranscriberConfig,
        synthesizer_config: SynthesizerConfig,
        conversation_id: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__(
            base_url=base_url,
            to_phone=zoom_number,
            from_phone=from_phone,
            config_manager=config_manager,
            transcriber_config=transcriber_config,
            agent_config=agent_config,
            synthesizer_config=synthesizer_config,
            twilio_config=twilio_config,
            conversation_id=conversation_id,
            logger=logger,
        )
        self.zoom_number = zoom_number
        self.zoom_meeting_id = zoom_meeting_id
        self.zoom_meeting_password = zoom_meeting_password
        self.from_phone = from_phone

    def start(self):
        """Validate the bridge number, dial in with join digits, and persist
        the call config for the webhook handler."""
        self.validate_outbound_call(
            self.zoom_number,
            self.from_phone,
            mobile_only=False,  # Zoom bridge numbers are not mobile lines
        )
        # In Twilio send_digits, each 'w' waits 0.5s: pause, meeting id, '#',
        # then optionally '*password#'.
        digits = f"ww{self.zoom_meeting_id}#"
        if self.zoom_meeting_password:
            digits += f"wwww*{self.zoom_meeting_password}#"
        self.logger.debug("Sending digits %s to the call", digits)
        # BUG FIX: store the SID on self (was a local variable), so the
        # inherited end() can actually hang up this call.
        self.twilio_sid = self.create_twilio_call(
            self.zoom_number,
            self.from_phone,
            digits=digits,
        )
        call_config = CallConfig(
            transcriber_config=self.transcriber_config,
            agent_config=self.agent_config,
            synthesizer_config=self.synthesizer_config,
            twilio_config=self.twilio_config,
            twilio_sid=self.twilio_sid,
        )
        self.config_manager.save_config(self.conversation_id, call_config)

View file

@ -0,0 +1,62 @@
from fastapi import FastAPI, Response, Form
from typing import Optional
import requests
import uvicorn
import vocode
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.telephony import (
CreateInboundCall,
TwilioConfig,
TwilioConfig,
)
class InboundCallServer:
    """A FastAPI app that forwards Twilio inbound-call webhooks to Vocode.

    Exposes POST /vocode, which relays the incoming CallSid (together with
    the configured agent/transcriber/synthesizer settings) to Vocode's hosted
    create_inbound_call endpoint and echoes the returned TwiML back to Twilio.
    """

    def __init__(
        self,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        response_on_rate_limit: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        self.agent_config = agent_config
        self.transcriber_config = transcriber_config
        self.synthesizer_config = synthesizer_config
        self.app = FastAPI()
        self.app.post("/vocode")(self.handle_call)
        # Spoken to the caller when the hosted backend returns HTTP 429.
        self.response_on_rate_limit = (
            response_on_rate_limit
            or "The line is really busy right now, check back later!"
        )
        self.twilio_config = twilio_config
        self.vocode_inbound_call_url = f"https://{vocode.base_url}/create_inbound_call"

    def handle_call(self, twilio_sid: str = Form(alias="CallSid")):
        """Relay the inbound call to Vocode and return Twilio-consumable TwiML.

        Deliberately a sync (non-async) endpoint: the blocking requests.post
        call would stall the event loop inside an `async def`; FastAPI runs
        plain `def` endpoints in a worker threadpool instead.
        """
        response = requests.post(
            self.vocode_inbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=CreateInboundCall(
                agent_config=self.agent_config,
                twilio_sid=twilio_sid,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        if response.status_code == 429:
            # Rate limited: speak the fallback message instead of connecting.
            return Response(
                f"<Response><Say>{self.response_on_rate_limit}</Say></Response>",
                media_type="application/xml",
            )
        assert response.ok, response.text
        # The hosted backend replies with TwiML; pass it straight through.
        return Response(
            response.text,
            media_type="application/xml",
        )

    def run(self, host="localhost", port=3000):
        """Serve the app with uvicorn (blocking)."""
        uvicorn.run(self.app, host=host, port=port)

View file

@ -0,0 +1,45 @@
from typing import Optional, Union
from vocode.streaming.models.telephony import TwilioConfig
from vocode.streaming.telephony.hosted.inbound_call_server import InboundCallServer
from vocode.streaming.models.agent import (
RESTfulAgentEnd,
RESTfulAgentInput,
RESTfulAgentText,
RESTfulUserImplementedAgentConfig,
)
from vocode.streaming.models.transcriber import (
TranscriberConfig,
)
from vocode.streaming.models.synthesizer import SynthesizerConfig
class InboundCallUserAgentServer(InboundCallServer):
    """Inbound-call server whose agent responses come from user code.

    Subclasses implement respond(); the /respond route exposes it over REST so
    the hosted Vocode backend can drive the conversation turn by turn.
    """

    def __init__(
        self,
        agent_config: RESTfulUserImplementedAgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        response_on_rate_limit: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        # Validate before the base class does any work. An `assert` would be
        # stripped under `python -O`, so raise explicitly instead.
        if not isinstance(agent_config, RESTfulUserImplementedAgentConfig):
            raise TypeError(
                "agent_config must be a RESTfulUserImplementedAgentConfig"
            )
        super().__init__(
            agent_config=agent_config,
            transcriber_config=transcriber_config,
            synthesizer_config=synthesizer_config,
            response_on_rate_limit=response_on_rate_limit,
            twilio_config=twilio_config,
        )
        self.app.post("/respond")(self.respond_rest)

    async def respond(
        self, human_input, conversation_id
    ) -> Union[RESTfulAgentText, RESTfulAgentEnd]:
        """Override with the user's agent logic; return text or an end signal."""
        raise NotImplementedError

    async def respond_rest(
        self, request: RESTfulAgentInput
    ) -> Union[RESTfulAgentText, RESTfulAgentEnd]:
        """REST wrapper that unpacks the request and delegates to respond()."""
        return await self.respond(request.human_input, request.conversation_id)

View file

@ -0,0 +1,68 @@
from typing import Optional
import requests
import vocode
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.telephony import (
CallEntity,
CreateOutboundCall,
EndOutboundCall,
TwilioConfig,
)
class OutboundCall:
    """Client for Vocode's hosted outbound-call API.

    start() asks the hosted backend to place a call from `caller` to
    `recipient`; end() tears the call down again.
    """

    def __init__(
        self,
        recipient: CallEntity,
        caller: CallEntity,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        self.recipient = recipient
        self.caller = caller
        self.agent_config = agent_config
        self.transcriber_config = transcriber_config
        self.synthesizer_config = synthesizer_config
        self.conversation_id = conversation_id
        self.twilio_config = twilio_config
        self.vocode_create_outbound_call_url = (
            f"https://{vocode.base_url}/create_outbound_call"
        )
        self.vocode_end_outbound_call_url = (
            f"https://{vocode.base_url}/end_outbound_call"
        )

    def start(self) -> None:
        """Place the call via the hosted API and record its conversation id.

        (Was annotated `-> str`, but nothing is returned: the id is stored on
        self.conversation_id instead.)
        """
        response = requests.post(
            self.vocode_create_outbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=CreateOutboundCall(
                recipient=self.recipient,
                caller=self.caller,
                agent_config=self.agent_config,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                conversation_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok, response.text
        data = response.json()
        self.conversation_id = data["id"]

    def end(self) -> None:
        """End the call via the hosted API; a 404 (call already gone) is tolerated."""
        response = requests.post(
            self.vocode_end_outbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=EndOutboundCall(
                call_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok or response.status_code == 404, response.text

View file

@ -0,0 +1,60 @@
from typing import Optional
import requests
import vocode
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.telephony.hosted.outbound_call import OutboundCall
from vocode.streaming.models.telephony import (
CallEntity,
DialIntoZoomCall,
TwilioConfig,
)
class ZoomDialIn(OutboundCall):
    """Hosted outbound call that dials into a Zoom meeting via Vocode's API."""

    def __init__(
        self,
        recipient: CallEntity,
        caller: CallEntity,
        zoom_meeting_id: str,
        zoom_meeting_password: str,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        super().__init__(
            recipient=recipient,
            caller=caller,
            agent_config=agent_config,
            transcriber_config=transcriber_config,
            synthesizer_config=synthesizer_config,
            conversation_id=conversation_id,
            twilio_config=twilio_config,
        )
        self.zoom_meeting_id = zoom_meeting_id
        self.zoom_meeting_password = zoom_meeting_password
        self.vocode_zoom_dial_in_url = f"https://{vocode.base_url}/dial_into_zoom_call"

    def start(self) -> None:
        """Ask the hosted API to dial into the meeting and record the call id.

        (Was annotated `-> str`, but nothing is returned: the id is stored on
        self.conversation_id instead.)
        """
        response = requests.post(
            self.vocode_zoom_dial_in_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=DialIntoZoomCall(
                recipient=self.recipient,
                caller=self.caller,
                zoom_meeting_id=self.zoom_meeting_id,
                zoom_meeting_password=self.zoom_meeting_password,
                agent_config=self.agent_config,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                conversation_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok, response.text
        data = response.json()
        self.conversation_id = data["id"]

View file

@ -0,0 +1,143 @@
import logging
from typing import Optional
from fastapi import APIRouter, Form, Response
from pydantic import BaseModel
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
SynthesizerConfig,
)
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
TranscriberConfig,
)
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
from vocode.streaming.telephony.constants import (
DEFAULT_AUDIO_ENCODING,
DEFAULT_CHUNK_SIZE,
DEFAULT_SAMPLING_RATE,
)
from vocode.streaming.telephony.server.router.calls import CallsRouter
from vocode.streaming.telephony.server.router.twiml import TwiMLRouter
from vocode.streaming.models.telephony import (
CallConfig,
CallEntity,
CreateOutboundCall,
CreateInboundCall,
DialIntoZoomCall,
EndOutboundCall,
TwilioConfig,
)
from twilio.rest import Client
from vocode.streaming.telephony.conversation.call import Call
from vocode.streaming.telephony.templates import Templater
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
from vocode.streaming.utils import create_conversation_id
class InboundCallConfig(BaseModel):
    """Declarative route config: serve inbound-call TwiML at `url` with these settings."""

    url: str  # FastAPI route path (e.g. "/inbound_call") that Twilio will POST to
    agent_config: AgentConfig
    twilio_config: TwilioConfig
    transcriber_config: Optional[TranscriberConfig] = None  # None -> default chosen in create_inbound_route
    synthesizer_config: Optional[SynthesizerConfig] = None  # None -> default chosen in create_inbound_route
class TelephonyServer:
    """Self-hosted telephony server.

    Wires the call-websocket router, the TwiML router, and any statically
    configured inbound-call endpoints onto a single APIRouter, and provides
    an endpoint-style helper to end outbound calls.
    """

    def __init__(
        self,
        base_url: str,
        config_manager: BaseConfigManager,
        inbound_call_configs: list[InboundCallConfig] = [],
        logger: Optional[logging.Logger] = None,
    ):
        # NOTE: the mutable default [] is shared across instances; it is only
        # read here, so it is harmless as long as callers never mutate it.
        self.base_url = base_url
        self.logger = logger or logging.getLogger(__name__)
        self.router = APIRouter()
        self.config_manager = config_manager
        self.templater = Templater()
        self.router.include_router(
            CallsRouter(
                base_url=base_url,
                templater=self.templater,
                config_manager=self.config_manager,
                logger=self.logger,
            ).get_router()
        )
        self.router.include_router(
            TwiMLRouter(
                base_url=base_url, templater=self.templater, logger=self.logger
            ).get_router()
        )
        for config in inbound_call_configs:
            self.router.add_api_route(
                config.url,
                self.create_inbound_route(
                    agent_config=config.agent_config,
                    twilio_config=config.twilio_config,
                    transcriber_config=config.transcriber_config,
                    synthesizer_config=config.synthesizer_config,
                ),
                methods=["POST"],
            )
            # Use self.logger: the raw `logger` parameter may be None and
            # previously crashed here with an AttributeError.
            self.logger.info(
                f"Set up inbound call TwiML at https://{base_url}{config.url}"
            )

    def create_inbound_route(
        self,
        agent_config: AgentConfig,
        twilio_config: TwilioConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
    ):
        """Build a POST handler that registers a new call and returns its TwiML."""

        def route(twilio_sid: str = Form(alias="CallSid")) -> Response:
            # Fall back to sane telephony defaults when no configs are given.
            call_config = CallConfig(
                transcriber_config=transcriber_config
                or DeepgramTranscriberConfig(
                    sampling_rate=DEFAULT_SAMPLING_RATE,
                    audio_encoding=DEFAULT_AUDIO_ENCODING,
                    chunk_size=DEFAULT_CHUNK_SIZE,
                    model="voicemail",
                    endpointing_config=PunctuationEndpointingConfig(),
                ),
                agent_config=agent_config,
                synthesizer_config=synthesizer_config
                or AzureSynthesizerConfig(
                    sampling_rate=DEFAULT_SAMPLING_RATE,
                    audio_encoding=DEFAULT_AUDIO_ENCODING,
                ),
                twilio_config=twilio_config,
                twilio_sid=twilio_sid,
            )
            conversation_id = create_conversation_id()
            self.config_manager.save_config(conversation_id, call_config)
            return self.templater.get_connection_twiml(
                base_url=self.base_url, call_id=conversation_id
            )

        return route

    async def end_outbound_call(self, conversation_id: str):
        """End the Twilio call backing `conversation_id`; raises if unknown."""
        # TODO validation via twilio_client
        call_config = self.config_manager.get_config(conversation_id)
        if not call_config:
            raise ValueError("Call not found")
        call = Call.from_call_config(
            self.base_url,
            call_config,
            self.config_manager,
            conversation_id,
            self.logger,
        )
        call.end_twilio_call()
        return {"id": call.id}

    def get_router(self) -> APIRouter:
        """Return the assembled APIRouter for inclusion in a FastAPI app."""
        return self.router

View file

@ -0,0 +1,45 @@
from typing import Optional
import logging
from fastapi import APIRouter, HTTPException, WebSocket
from vocode.streaming.telephony.config_manager.base_config_manager import (
BaseConfigManager,
)
from vocode.streaming.telephony.conversation.call import Call
from vocode.streaming.telephony.templates import Templater
class CallsRouter:
    """Router exposing the websocket endpoint that carries a call's audio stream."""

    def __init__(
        self,
        base_url: str,
        templater: Templater,
        config_manager: BaseConfigManager,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__()
        self.logger = logger or logging.getLogger(__name__)
        self.base_url = base_url
        self.config_manager = config_manager
        self.templater = templater
        self.router = APIRouter()
        self.router.websocket("/connect_call/{id}")(self.connect_call)

    async def connect_call(self, websocket: WebSocket, id: str):
        """Accept Twilio's media websocket, run the call, then delete its config."""
        await websocket.accept()
        self.logger.debug("Phone WS connection opened for chat {}".format(id))
        config = self.config_manager.get_config(id)
        if not config:
            raise HTTPException(status_code=400, detail="No active phone call")
        active_call: Call = Call.from_call_config(
            self.base_url, config, self.config_manager, id, self.logger
        )
        await active_call.attach_ws_and_start(websocket)
        self.config_manager.delete_config(active_call.id)
        self.logger.debug("Phone WS connection closed for chat {}".format(id))

    def get_router(self) -> APIRouter:
        """Return the underlying APIRouter for inclusion in an app."""
        return self.router

View file

@ -0,0 +1,29 @@
import logging
from typing import Optional
from fastapi import APIRouter
from vocode.streaming.telephony.templates import Templater
class TwiMLRouter:
    """Router serving the TwiML that tells Twilio to stream a call to us."""

    def __init__(
        self,
        base_url: str,
        templater: Templater,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__()
        self.logger = logger or logging.getLogger(__name__)
        self.templater = templater
        self.base_url = base_url
        self.router = APIRouter()
        self.router.add_api_route(
            "/twiml/initiate_call/{id}", self.call_twiml, methods=["POST"]
        )

    def call_twiml(self, id: str):
        """Return the <Connect><Stream> TwiML for the call with this id."""
        return self.templater.get_connection_twiml(base_url=self.base_url, call_id=id)

    def get_router(self) -> APIRouter:
        """Return the underlying APIRouter for inclusion in an app."""
        return self.router

View file

@ -0,0 +1,20 @@
import os
from jinja2 import Environment, FileSystemLoader
from fastapi import Response
class Templater:
    """Renders the Jinja templates (TwiML documents) shipped with this module."""

    def __init__(self):
        # Templates live in the "templates/" directory next to this file.
        self.templates = Environment(
            loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates/")
        )

    def render_template(self, template_name: str, **kwargs):
        """Render `template_name` with the given variables and return the text."""
        return self.templates.get_template(template_name).render(**kwargs)

    def get_connection_twiml(self, call_id: str, base_url: str):
        """Build the <Connect><Stream> TwiML response pointing at our websocket."""
        body = self.render_template("connect_call.xml", base_url=base_url, id=call_id)
        return Response(body, media_type="application/xml")

View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Connect>
<Stream url="wss://{{ base_url }}/connect_call/{{ id }}" />
</Connect>
</Response>

View file

@ -0,0 +1,12 @@
import os
from typing import Optional
from dotenv import load_dotenv
from twilio.rest import Client
from vocode.streaming.models.telephony import TwilioConfig
load_dotenv()
def create_twilio_client(twilio_config: TwilioConfig):
    """Build a Twilio REST client from the config's account SID and auth token."""
    account_sid = twilio_config.account_sid
    auth_token = twilio_config.auth_token
    return Client(account_sid, auth_token)

View file

@ -0,0 +1,101 @@
import asyncio
import json
import logging
import os
from dotenv import load_dotenv
import websockets
from urllib.parse import urlencode
from vocode.streaming.models.transcriber import AssemblyAITranscriberConfig
from vocode.streaming.models.websocket import AudioMessage
from vocode.streaming.transcriber.base_transcriber import (
BaseTranscriber,
Transcription,
)
from vocode.streaming.models.audio_encoding import AudioEncoding
load_dotenv()
ASSEMBLY_AI_API_KEY = os.environ.get("ASSEMBLY_AI_API_KEY")
ASSEMBLY_AI_URL = "wss://api.assemblyai.com/v2/realtime/ws"
class AssemblyAITranscriber(BaseTranscriber):
    """Streams audio to AssemblyAI's realtime websocket API and emits Transcriptions."""

    def __init__(
        self,
        transcriber_config: AssemblyAITranscriberConfig,
        logger: logging.Logger = None,
    ):
        super().__init__(transcriber_config)
        self._ended = False
        self.is_ready = False
        self.logger = logger or logging.getLogger(__name__)
        # Feature parity with the other transcribers is incomplete: neither
        # model warmup nor endpointing is supported for AssemblyAI yet.
        if self.transcriber_config.should_warmup_model:
            raise Exception("AssemblyAI model warmup not supported yet")
        elif self.transcriber_config.endpointing_config:
            raise Exception("Assembly AI endpointing config not supported yet")

    async def ready(self):
        """Always ready: no warmup handshake is performed for AssemblyAI."""
        # while not self.warmed_up:
        #     await asyncio.sleep(0.1)
        # return self.is_ready
        return True

    async def run(self):
        """Entry point: run the websocket session until terminated."""
        await self.process()

    def send_audio(self, chunk):
        """Queue raw audio bytes for the sender coroutine.

        NOTE(review): audio_queue is created inside process(), so calling this
        before run()/process() raises AttributeError — confirm caller ordering.
        """
        self.audio_queue.put_nowait(chunk)

    def terminate(self):
        """Queue the session-terminate message and stop both coroutines."""
        terminate_msg = json.dumps({"terminate_session": True})
        self.audio_queue.put_nowait(terminate_msg)
        self._ended = True

    def get_assembly_ai_url(self):
        """Realtime endpoint URL parameterized with this transcriber's sample rate."""
        return ASSEMBLY_AI_URL + f"?sample_rate={self.transcriber_config.sampling_rate}"

    async def process(self):
        """Open the websocket and pump audio out / transcripts in concurrently."""
        self.audio_queue = asyncio.Queue()
        URL = self.get_assembly_ai_url()
        async with websockets.connect(
            URL,
            extra_headers=(("Authorization", ASSEMBLY_AI_API_KEY),),
            ping_interval=5,
            ping_timeout=20,
        ) as ws:
            # Give the connection a beat to settle before streaming audio.
            await asyncio.sleep(0.1)

            async def sender(ws):  # sends audio to websocket
                while not self._ended:
                    try:
                        # 5s without audio ends the sender loop.
                        data = await asyncio.wait_for(self.audio_queue.get(), 5)
                    except asyncio.exceptions.TimeoutError:
                        break
                    await ws.send(
                        json.dumps({"audio_data": AudioMessage.from_bytes(data).data})
                    )
                self.logger.debug("Terminating AssemblyAI transcriber sender")

            async def receiver(ws):
                while not self._ended:
                    try:
                        result_str = await ws.recv()
                    except websockets.exceptions.ConnectionClosedError as e:
                        self.logger.debug(e)
                        break
                    except Exception as e:
                        # Any non-close failure is treated as fatal here.
                        assert False, "Not a websocket 4008 error"
                    data = json.loads(result_str)
                    # AssemblyAI marks completed utterances as "FinalTranscript".
                    is_final = (
                        "message_type" in data
                        and data["message_type"] == "FinalTranscript"
                    )
                    if "text" in data and data["text"]:
                        await self.on_response(
                            Transcription(data["text"], data["confidence"], is_final)
                        )

            await asyncio.gather(sender(ws), receiver(ws))

View file

@ -0,0 +1,59 @@
from dotenv import load_dotenv
from typing import Callable, Optional, Awaitable
from vocode.streaming.utils import convert_wav
from vocode.streaming.models.transcriber import EndpointingConfig, TranscriberConfig
load_dotenv()
class Transcription:
    """A single transcription result produced by a speech-to-text provider."""

    def __init__(
        self,
        message: str,
        confidence: float,
        is_final: bool,
        is_interrupt: bool = False,
    ):
        # is_interrupt marks transcriptions that arrived while the bot was speaking.
        self.is_interrupt = is_interrupt
        self.is_final = is_final
        self.confidence = confidence
        self.message = message

    def __str__(self):
        """Human-readable summary (is_interrupt intentionally omitted)."""
        return "Transcription({}, {}, {})".format(
            self.message, self.confidence, self.is_final
        )
class BaseTranscriber:
    """Abstract base for streaming transcribers.

    Concrete subclasses push Transcription objects to the awaitable callback
    registered via set_on_response(). The base implementations of ready/run/
    send_audio/terminate are no-ops to be overridden as needed.
    """

    def __init__(
        self,
        transcriber_config: TranscriberConfig,
    ):
        self.transcriber_config = transcriber_config
        # Awaitable callback invoked with each Transcription; set later via
        # set_on_response().
        self.on_response: Optional[Callable[[Transcription], Awaitable]] = None

    def get_transcriber_config(self) -> TranscriberConfig:
        """Return the config this transcriber was constructed with."""
        return self.transcriber_config

    def set_on_response(self, on_response: Callable[[Transcription], Awaitable]):
        """Register the awaitable callback that receives Transcription results."""
        self.on_response = on_response

    def get_warmup_bytes(self, file: str = "convo/audio/ajay.wav") -> bytes:
        """Load warmup audio and convert it to this transcriber's rate/encoding.

        The previously hard-coded sample path is kept as the default for
        backward compatibility; pass `file` to warm up with different audio.
        """
        sampling_rate = self.transcriber_config.sampling_rate
        return convert_wav(
            file,
            sampling_rate,
            self.transcriber_config.audio_encoding,
        )

    async def ready(self):
        """Whether the transcriber is ready to receive audio; override as needed."""
        return True

    async def run(self):
        """Main processing loop; no-op in the base class."""
        pass

    def send_audio(self, chunk):
        """Feed a chunk of raw audio; no-op in the base class."""
        pass

    def terminate(self):
        """Shut down the transcriber; no-op in the base class."""
        pass

View file

@ -0,0 +1,230 @@
import asyncio
import json
import logging
import os
from dotenv import load_dotenv
import websockets
from websockets.client import WebSocketClientProtocol
import audioop
from urllib.parse import urlencode
from vocode.streaming.transcriber.base_transcriber import (
BaseTranscriber,
Transcription,
)
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
EndpointingConfig,
EndpointingType,
)
from vocode.streaming.models.audio_encoding import AudioEncoding
load_dotenv()
DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY")
PUNCTUATION_TERMINATORS = [".", "!", "?"]
NUM_RESTARTS = 5
class DeepgramTranscriber(BaseTranscriber):
    """Streams audio to Deepgram's realtime websocket API and emits Transcriptions.

    Endpointing (deciding that the speaker has finished) is driven by
    transcriber_config.endpointing_config: time-based, punctuation-based, or —
    when no config is given — Deepgram's own speech_final flag.
    """

    def __init__(
        self,
        transcriber_config: DeepgramTranscriberConfig,
        logger: logging.Logger = None,
    ):
        super().__init__(transcriber_config)
        self.transcriber_config = transcriber_config
        self._ended = False
        self.warmed_up = False
        self.is_ready = False
        self.logger = logger or logging.getLogger(__name__)

    def create_warmup_chunks(self):
        """Split the warmup audio into chunk_size pieces (the tail is discarded)."""
        warmup_chunks = []
        warmup_bytes = self.get_warmup_bytes()
        chunk_size = self.transcriber_config.chunk_size
        for i in range(len(warmup_bytes) // chunk_size):
            warmup_chunks.append(warmup_bytes[i * chunk_size : (i + 1) * chunk_size])
        return warmup_chunks

    async def ready(self):
        """Block until warmup completes, then report readiness."""
        while not self.warmed_up:
            await asyncio.sleep(0.1)
        return self.is_ready

    async def run(self):
        """Run the websocket session, reconnecting up to NUM_RESTARTS times."""
        # warmup_chunks = await self.create_warmup_chunks()
        restarts = 0
        while not self._ended and restarts < NUM_RESTARTS:
            await self.process(self.transcriber_config.should_warmup_model)
            restarts += 1
            self.logger.debug(
                "Deepgram connection died, restarting, num_restarts: %s", restarts
            )

    def send_audio(self, chunk):
        """Queue an audio chunk, downsampling LINEAR16 first when configured.

        NOTE(review): audio_queue is created in process(), so calling this
        before run()/process() raises AttributeError — confirm caller ordering.
        """
        if (
            self.transcriber_config.downsampling
            and self.transcriber_config.audio_encoding == AudioEncoding.LINEAR16
        ):
            # ratecv from (sampling_rate * downsampling) down to sampling_rate.
            # State is not carried across chunks (None), so chunk boundaries may
            # have slight resampling artifacts.
            chunk, _ = audioop.ratecv(
                chunk,
                2,
                1,
                self.transcriber_config.sampling_rate
                * self.transcriber_config.downsampling,
                self.transcriber_config.sampling_rate,
                None,
            )
        self.audio_queue.put_nowait(chunk)

    def terminate(self):
        """Ask Deepgram to close the stream and stop the send/receive loops."""
        terminate_msg = json.dumps({"type": "CloseStream"})
        self.audio_queue.put_nowait(terminate_msg)
        self._ended = True

    def get_deepgram_url(self):
        """Build the wss:// listen URL with encoding, model, tier and endpointing params.

        NOTE(review): `encoding` is only assigned for LINEAR16/MULAW — any other
        audio encoding raises UnboundLocalError here.
        """
        if self.transcriber_config.audio_encoding == AudioEncoding.LINEAR16:
            encoding = "linear16"
        elif self.transcriber_config.audio_encoding == AudioEncoding.MULAW:
            encoding = "mulaw"
        url_params = {
            "encoding": encoding,
            "sample_rate": self.transcriber_config.sampling_rate,
            "channels": 1,
            "interim_results": "true",
        }
        extra_params = {}
        if self.transcriber_config.model:
            extra_params["model"] = self.transcriber_config.model
        if self.transcriber_config.tier:
            extra_params["tier"] = self.transcriber_config.tier
        if self.transcriber_config.version:
            extra_params["version"] = self.transcriber_config.version
        if (
            self.transcriber_config.endpointing_config
            and self.transcriber_config.endpointing_config.type
            == EndpointingType.PUNCTUATION_BASED
        ):
            # Punctuation-based endpointing needs Deepgram to punctuate output.
            extra_params["punctuate"] = "true"
        url_params.update(extra_params)
        return f"wss://api.deepgram.com/v1/listen?{urlencode(url_params)}"

    def is_speech_final(
        self, current_buffer: str, deepgram_response: dict, time_silent: float
    ):
        """Decide whether the utterance buffered so far should be flushed as final."""
        transcript = deepgram_response["channel"]["alternatives"][0]["transcript"]
        # if it is not time based, then return true if speech is final and there is a transcript
        if not self.transcriber_config.endpointing_config:
            return transcript and deepgram_response["speech_final"]
        elif (
            self.transcriber_config.endpointing_config.type
            == EndpointingType.TIME_BASED
        ):
            # if it is time based, then return true if there is no transcript
            # and there is some speech to send
            # and the time_silent is greater than the cutoff
            return (
                not transcript
                and current_buffer
                and (time_silent + deepgram_response["duration"])
                > self.transcriber_config.endpointing_config.time_cutoff_seconds
            )
        elif (
            self.transcriber_config.endpointing_config.type
            == EndpointingType.PUNCTUATION_BASED
        ):
            # Final either when Deepgram says so AND the transcript ends with
            # terminal punctuation, or on a time-based fallback during silence.
            return (
                transcript
                and deepgram_response["speech_final"]
                and transcript.strip()[-1] in PUNCTUATION_TERMINATORS
            ) or (
                not transcript
                and current_buffer
                and (time_silent + deepgram_response["duration"])
                > self.transcriber_config.endpointing_config.time_cutoff_seconds
            )
        raise Exception("Endpointing config not supported")

    def calculate_time_silent(self, data: dict):
        """Seconds of trailing silence in this response (duration if no words)."""
        end = data["start"] + data["duration"]
        words = data["channel"]["alternatives"][0]["words"]
        if words:
            return end - words[-1]["end"]
        return data["duration"]

    async def process(self, warmup=True):
        """Run one websocket session: optional warmup, then concurrent send/receive."""
        extra_headers = {"Authorization": f"Token {DEEPGRAM_API_KEY}"}
        self.audio_queue = asyncio.Queue()
        async with websockets.connect(
            self.get_deepgram_url(), extra_headers=extra_headers
        ) as ws:

            async def warmup_sender(ws: WebSocketClientProtocol):
                # Streams sample audio through the model, then flags readiness;
                # receiver drops transcripts until warmed_up is set.
                if warmup:
                    warmup_chunks = self.create_warmup_chunks()
                    for chunk in warmup_chunks:
                        await ws.send(chunk)
                    await asyncio.sleep(5)
                self.warmed_up = True
                self.is_ready = True

            async def sender(ws: WebSocketClientProtocol):  # sends audio to websocket
                while not self._ended:
                    try:
                        # 5s without audio ends the sender loop.
                        data = await asyncio.wait_for(self.audio_queue.get(), 5)
                    except asyncio.exceptions.TimeoutError:
                        break
                    await ws.send(data)
                self.logger.debug("Terminating Deepgram transcriber sender")

            async def receiver(ws: WebSocketClientProtocol):
                # buffer accumulates interim text until an endpoint is detected;
                # time_silent tracks trailing silence for time-based endpointing.
                buffer = ""
                time_silent = 0
                while not self._ended:
                    try:
                        msg = await ws.recv()
                    except Exception as e:
                        self.logger.debug(f"Got error {e} in Deepgram receiver")
                        break
                    data = json.loads(msg)
                    if (
                        not "is_final" in data
                    ):  # means we've finished receiving transcriptions
                        break
                    is_final = data["is_final"]
                    speech_final = self.is_speech_final(buffer, data, time_silent)
                    top_choice = data["channel"]["alternatives"][0]
                    confidence = top_choice["confidence"]
                    if (
                        top_choice["transcript"]
                        and confidence > 0.0
                        and self.warmed_up
                        and is_final
                    ):
                        # Finalized fragment: append to the utterance buffer.
                        buffer = f"{buffer} {top_choice['transcript']}"
                    if speech_final:
                        # Utterance complete: flush the buffer as final.
                        await self.on_response(Transcription(buffer, confidence, True))
                        buffer = ""
                        time_silent = 0
                    elif (
                        top_choice["transcript"] and confidence > 0.0 and self.warmed_up
                    ):
                        # Interim result: surface the buffer as non-final.
                        await self.on_response(
                            Transcription(
                                buffer,
                                confidence,
                                False,
                            )
                        )
                        time_silent = self.calculate_time_silent(data)
                    else:
                        # Silence: keep accumulating its duration.
                        time_silent += data["duration"]
                self.logger.debug("Terminating Deepgram transcriber receiver")

            await asyncio.gather(warmup_sender(ws), sender(ws), receiver(ws))

View file

@ -0,0 +1,145 @@
import asyncio
import time
import queue
from google.cloud import speech
import threading
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.transcriber.base_transcriber import (
BaseTranscriber,
Transcription,
)
from vocode.streaming.models.transcriber import GoogleTranscriberConfig
from vocode.streaming.utils import create_loop_in_thread
class GoogleTranscriber(BaseTranscriber):
    """Streams audio to Google Cloud Speech-to-Text from a dedicated thread.

    The Google streaming client is blocking, so processing runs on its own
    thread (with a private event loop driven by create_loop_in_thread); audio
    is handed over through a thread-safe queue.
    """

    def __init__(self, transcriber_config: GoogleTranscriberConfig):
        super().__init__(transcriber_config)
        self._queue = queue.Queue()
        self._ended = False
        self.google_streaming_config = self.create_google_streaming_config()
        self.client = speech.SpeechClient()
        self.warmed_up = False
        self.is_ready = False
        if self.transcriber_config.endpointing_config:
            raise Exception("Google endpointing config not supported yet")
        # process() runs on this dedicated thread/loop; started from run().
        self.event_loop = asyncio.new_event_loop()
        self.thread = threading.Thread(
            name="google_transcriber",
            target=create_loop_in_thread,
            args=(self.event_loop, self.process()),
        )

    def create_google_streaming_config(self):
        """Translate our transcriber config into Google's streaming config.

        NOTE(review): encodings other than LINEAR16/MULAW leave
        google_audio_encoding unbound and raise UnboundLocalError below.
        """
        extra_params = {}
        if self.transcriber_config.model:
            extra_params["model"] = self.transcriber_config.model
            extra_params["use_enhanced"] = True
        if self.transcriber_config.audio_encoding == AudioEncoding.LINEAR16:
            google_audio_encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
        elif self.transcriber_config.audio_encoding == AudioEncoding.MULAW:
            google_audio_encoding = speech.RecognitionConfig.AudioEncoding.MULAW
        return speech.StreamingRecognitionConfig(
            config=speech.RecognitionConfig(
                encoding=google_audio_encoding,
                sample_rate_hertz=self.transcriber_config.sampling_rate,
                language_code="en-US",
                **extra_params
            ),
            interim_results=True,
        )

    async def ready(self):
        """Wait for warmup (if enabled), then report readiness."""
        if not self.transcriber_config.should_warmup_model:
            return True
        while not self.warmed_up:
            await asyncio.sleep(0.1)
        return self.is_ready

    def warmup(self):
        """Prime the model by streaming the warmup audio once (blocking)."""
        warmup_bytes = self.get_warmup_bytes()

        def stream():
            # One-second chunks: sampling_rate samples * 2 bytes per sample.
            chunk_size = self.transcriber_config.sampling_rate * 2
            for i in range(len(warmup_bytes) // chunk_size):
                yield speech.StreamingRecognizeRequest(
                    audio_content=warmup_bytes[i * chunk_size : (i + 1) * chunk_size]
                )
                time.sleep(0.01)

        # Drain the responses; we only care about exercising the model.
        for _ in self.client.streaming_recognize(
            self.google_streaming_config, stream()
        ):
            pass
        self.warmed_up = True
        self.is_ready = True

    async def run(self):
        """Start the background transcription thread."""
        self.thread.start()

    async def process(self):
        """Thread body: optional warmup, then stream queued audio to Google."""
        if self.transcriber_config.should_warmup_model:
            self.warmup()
        stream = self.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in stream
        )
        responses = self.client.streaming_recognize(
            self.google_streaming_config, requests
        )
        await self.process_responses_loop(responses)

    def terminate(self):
        """Signal the generator/response loops to stop."""
        self._ended = True

    def send_audio(self, chunk: bytes):
        """Hand an audio chunk to the background thread (non-blocking put)."""
        self._queue.put(chunk, block=False)

    async def process_responses_loop(self, responses):
        """Forward each Google response until the stream ends or we terminate."""
        for response in responses:
            await self._on_response(response)
            if self._ended:
                break

    async def _on_response(self, response):
        """Convert a Google response's top alternative into a Transcription."""
        if not response.results:
            return
        result = response.results[0]
        if not result.alternatives:
            return
        top_choice = result.alternatives[0]
        message = top_choice.transcript
        confidence = top_choice.confidence
        return await self.on_response(
            Transcription(message, confidence, result.is_final)
        )

    def generator(self):
        """Yield concatenated buffered audio chunks until terminated."""
        while not self._ended:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._queue.get()
            if chunk is None:
                return
            data = [chunk]
            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._queue.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break
            yield b"".join(data)

View file

@ -0,0 +1,63 @@
import asyncio
import audioop
import secrets
from typing import Any
import wave
from ..models.audio_encoding import AudioEncoding
def create_loop_in_thread(loop: asyncio.AbstractEventLoop, long_running_task=None):
    """Install `loop` as this thread's event loop and drive it.

    If `long_running_task` is given, run the loop until that awaitable
    completes; otherwise run the loop forever. Intended as a Thread target.
    """
    asyncio.set_event_loop(loop)
    if long_running_task is None:
        loop.run_forever()
    else:
        loop.run_until_complete(long_running_task)
def convert_linear_audio(
    raw_wav: bytes,
    input_sample_rate=24000,
    output_sample_rate=8000,
    output_encoding=AudioEncoding.LINEAR16,
    output_sample_width=2,
):
    """Resample 16-bit linear PCM and optionally transcode it to mulaw.

    Args:
        raw_wav: raw 16-bit little-endian PCM frames.
        input_sample_rate: sample rate of raw_wav.
        output_sample_rate: desired output sample rate.
        output_encoding: AudioEncoding.LINEAR16 or AudioEncoding.MULAW.
        output_sample_width: sample width in bytes used for mulaw conversion.

    Raises:
        Exception: if output_encoding is unsupported (previously this case
            silently returned None).
    """
    # downsample
    if input_sample_rate != output_sample_rate:
        raw_wav, _ = audioop.ratecv(
            raw_wav, 2, 1, input_sample_rate, output_sample_rate, None
        )
    if output_encoding == AudioEncoding.LINEAR16:
        return raw_wav
    elif output_encoding == AudioEncoding.MULAW:
        return audioop.lin2ulaw(raw_wav, output_sample_width)
    # Fail loudly instead of returning None, matching get_chunk_size_per_second.
    raise Exception("Unsupported audio encoding")
def convert_wav(
    file: Any,
    output_sample_rate=8000,
    output_encoding=AudioEncoding.LINEAR16,
):
    """Read a WAV file (path or file-like) and convert its frames to the
    requested sample rate and encoding via convert_linear_audio."""
    with wave.open(file, "rb") as wav:
        frames = wav.readframes(wav.getnframes())
        source_rate = wav.getframerate()
        source_width = wav.getsampwidth()
    return convert_linear_audio(
        frames,
        input_sample_rate=source_rate,
        output_sample_rate=output_sample_rate,
        output_encoding=output_encoding,
        output_sample_width=source_width,
    )
def get_chunk_size_per_second(audio_encoding: AudioEncoding, sampling_rate: int) -> int:
    """Bytes of audio per second: 2 bytes/sample for LINEAR16, 1 for MULAW."""
    if audio_encoding == AudioEncoding.MULAW:
        return sampling_rate
    if audio_encoding == AudioEncoding.LINEAR16:
        return sampling_rate * 2
    raise Exception("Unsupported audio encoding")
def create_conversation_id() -> str:
    """Generate a random, URL-safe identifier for a conversation."""
    token = secrets.token_urlsafe(16)
    return token

View file

@ -0,0 +1,102 @@
import os
import asyncio
import openai
from dotenv import load_dotenv
import numpy as np
import requests
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
PLATFORM = "pyq" if os.getenv("USE_PYQ_EMBEDDINGS", "false") == "true" else "openai"
SIMILARITY_THRESHOLD = 0.9
SIMILARITY_THRESHOLD_PYQ = 0.7
EMBEDDING_SIZE = 1536
PYQ_EMBEDDING_SIZE = 768
GOODBYE_PHRASES = [
"bye",
"goodbye",
"see you",
"see you later",
"talk to you later",
"talk to you soon",
"have a good day",
"have a good night",
]
PYQ_API_URL = "https://embeddings.pyqai.com"
class GoodbyeModel:
    """Detects whether an utterance is a goodbye via embedding similarity.

    Embeddings for GOODBYE_PHRASES are computed once per provider (OpenAI or
    Pyq) and cached on disk as .npy files under embeddings_cache_path.
    """

    def __init__(
        self,
        embeddings_cache_path=os.path.join(
            os.path.dirname(__file__), "goodbye_embeddings"
        ),
    ):
        # Build each cache with its own provider's embeddings. (Previously
        # both caches were created with the default PLATFORM, so one file
        # could end up holding the wrong provider's embeddings — the
        # dimensions don't even match: 1536 for OpenAI vs 768 for Pyq.)
        self.goodbye_embeddings = self.load_or_create_embeddings(
            f"{embeddings_cache_path}/goodbye_embeddings.npy", platform="openai"
        )
        self.goodbye_embeddings_pyq = self.load_or_create_embeddings(
            f"{embeddings_cache_path}/goodbye_embeddings_pyq.npy", platform="pyq"
        )

    def load_or_create_embeddings(self, path, platform=PLATFORM):
        """Load cached embeddings from `path`, creating and saving them if absent."""
        if os.path.exists(path):
            return np.load(path)
        else:
            embeddings = self.create_embeddings(platform=platform)
            np.save(path, embeddings)
            return embeddings

    def create_embeddings(self, platform=PLATFORM):
        """Embed every GOODBYE_PHRASES entry; returns a (dim, n_phrases) matrix."""
        print("Creating embeddings...")
        size = EMBEDDING_SIZE if platform == "openai" else PYQ_EMBEDDING_SIZE
        embeddings = np.empty((size, len(GOODBYE_PHRASES)))
        for i, goodbye_phrase in enumerate(GOODBYE_PHRASES):
            embeddings[:, i] = self.create_embedding(goodbye_phrase, platform=platform)
        return embeddings

    async def is_goodbye(self, text: str, platform=PLATFORM) -> bool:
        """True if `text` is (or is semantically close to) a goodbye phrase."""
        # Cheap literal check first: avoids an embedding round-trip.
        if "bye" in text.lower():
            return True
        embedding = self.create_embedding(text.strip().lower(), platform=platform)
        goodbye_embeddings = (
            self.goodbye_embeddings
            if platform == "openai"
            else self.goodbye_embeddings_pyq
        )
        threshold = (
            SIMILARITY_THRESHOLD if platform == "openai" else SIMILARITY_THRESHOLD_PYQ
        )
        # NOTE(review): this is a raw dot product; it equals cosine similarity
        # only if the provider returns unit-norm embeddings — confirm.
        similarity_results = embedding @ goodbye_embeddings
        return np.max(similarity_results) > threshold

    def create_embedding(self, text, platform=PLATFORM) -> np.ndarray:
        """Fetch a single embedding vector for `text` from the given provider."""
        if platform == "openai":
            return np.array(
                openai.Embedding.create(input=text, model="text-embedding-ada-002")[
                    "data"
                ][0]["embedding"]
            )
        elif platform == "pyq":
            return np.array(
                requests.post(
                    PYQ_API_URL,
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": os.getenv("PYQ_API_KEY"),
                    },
                    json={"input_sequence": [text], "account_id": "400"},
                ).json()["response"][0]
            )
if __name__ == "__main__":

    async def _repl():
        # Simple interactive loop for manually probing the goodbye detector.
        model = GoodbyeModel()
        while True:
            print(await model.is_goodbye(input("Text: ")))

    asyncio.run(_repl())

View file

@ -0,0 +1,236 @@
"""
A port of sseclient (https://pypi.org/project/sseclient/) that allows you to get server-side events with a POST request
Copyright (c) 2015 Brent Tubbs
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE."""
#
# Distributed under the terms of the MIT license.
#
from __future__ import unicode_literals
import codecs
import re
import time
import warnings
import six
import requests
# Version of the upstream sseclient this file was ported from.
__version__ = "0.0.27"
# Technically, we should support streams that mix line endings. This regex,
# however, assumes that a system will provide consistent line endings.
# A blank line (double line-ending) marks the end of one SSE event.
end_of_field = re.compile(r"\r\n\r\n|\r\r|\n\n")
class SSEClient(object):
    """Iterator over server-sent events from an HTTP(S) stream.

    Unlike upstream sseclient, the HTTP method is configurable, so SSE can
    be consumed from POST endpoints as well as GET.  Iterating yields
    ``Event`` objects; on a read error the client sleeps ``retry`` ms and
    transparently reconnects (resuming via the Last-Event-ID header).
    """

    def __init__(
        self,
        method,
        url,
        last_id=None,
        retry=3000,
        session=None,
        chunk_size=1024,
        **kwargs
    ):
        """Open the stream immediately.

        :param method: HTTP method to use (e.g. "GET" or "POST")
        :param url: SSE endpoint
        :param last_id: initial Last-Event-ID to resume from, if any
        :param retry: reconnect delay in milliseconds (server may override)
        :param session: optional requests.Session for connection reuse
        :param chunk_size: max bytes per raw read
        :param kwargs: forwarded to requests (headers, data, json, auth, ...)
        """
        self.url = url
        self.method = method
        self.last_id = last_id
        self.retry = retry
        self.chunk_size = chunk_size
        # Optional support for passing in a requests.Session()
        self.session = session
        # Any extra kwargs will be fed into the requests.get call later.
        self.requests_kwargs = kwargs
        # The SSE spec requires making requests with Cache-Control: nocache
        if "headers" not in self.requests_kwargs:
            self.requests_kwargs["headers"] = {}
        self.requests_kwargs["headers"]["Cache-Control"] = "no-cache"
        # The 'Accept' header is not required, but explicit > implicit
        self.requests_kwargs["headers"]["Accept"] = "text/event-stream"
        # Keep data here as it streams in
        self.buf = ""
        self._connect()

    def _connect(self):
        """(Re)establish the streaming HTTP request and reset the decoder."""
        # Resume from the last seen event so reconnects don't replay history.
        if self.last_id:
            self.requests_kwargs["headers"]["Last-Event-ID"] = self.last_id
        # Use session if set.  Otherwise fall back to requests module.
        requester = self.session or requests
        self.resp = requester.request(
            self.method, self.url, stream=True, **self.requests_kwargs
        )
        self.resp_iterator = self.iter_content()
        # Incremental decoder: raw chunks may split multi-byte characters,
        # so bytes are decoded statefully across chunk boundaries.
        encoding = self.resp.encoding or self.resp.apparent_encoding
        self.decoder = codecs.getincrementaldecoder(encoding)(errors="replace")
        # TODO: Ensure we're handling redirects.  Might also stick the 'origin'
        # attribute on Events like the Javascript spec requires.
        self.resp.raise_for_status()

    def iter_content(self):
        """Yield raw byte chunks from the response as they arrive."""
        def generate():
            while True:
                if (
                    hasattr(self.resp.raw, "_fp")
                    and hasattr(self.resp.raw._fp, "fp")
                    and hasattr(self.resp.raw._fp.fp, "read1")
                ):
                    # Reach into urllib3's private file object for read1(),
                    # which returns as soon as *any* data is available
                    # instead of waiting for a full chunk_size.
                    chunk = self.resp.raw._fp.fp.read1(self.chunk_size)
                else:
                    # _fp is not available, this means that we cannot use short
                    # reads and this will block until the full chunk size is
                    # actually read
                    chunk = self.resp.raw.read(self.chunk_size)
                if not chunk:
                    break
                yield chunk

        return generate()

    def _event_complete(self):
        """True once self.buf holds at least one full event (blank line seen)."""
        return re.search(end_of_field, self.buf) is not None

    def __iter__(self):
        return self

    def __next__(self):
        """Block until one complete event is buffered, then parse and return it."""
        while not self._event_complete():
            try:
                next_chunk = next(self.resp_iterator)
                if not next_chunk:
                    raise EOFError()
                self.buf += self.decoder.decode(next_chunk)
            except (
                StopIteration,
                requests.RequestException,
                EOFError,
                six.moves.http_client.IncompleteRead,
            ) as e:
                print(e)
                # Back off per the (possibly server-supplied) retry interval,
                # then reconnect and keep waiting for a complete event.
                time.sleep(self.retry / 1000.0)
                self._connect()
                # The SSE spec only supports resuming from a whole message, so
                # if we have half a message we should throw it out.
                head, sep, tail = self.buf.rpartition("\n")
                self.buf = head + sep
                continue
        # Split the complete event (up to the end_of_field) into event_string,
        # and retain anything after the current complete event in self.buf
        # for next time.
        (event_string, self.buf) = re.split(end_of_field, self.buf, maxsplit=1)
        msg = Event.parse(event_string)
        # If the server requests a specific retry delay, we need to honor it.
        if msg.retry:
            self.retry = msg.retry
        # last_id should only be set if included in the message.  It's not
        # forgotten if a message omits it.
        if msg.id:
            self.last_id = msg.id
        return msg

    # Python 2 iterator-protocol compatibility.
    if six.PY2:
        next = __next__
class Event(object):
    """One server-sent event: a data payload plus optional event name,
    id, and retry interval, per the SSE wire format."""

    sse_line_pattern = re.compile("(?P<name>[^:]*):?( ?(?P<value>.*))?")

    def __init__(self, data="", event="message", id=None, retry=None):
        assert isinstance(data, six.string_types), "Data must be text"
        self.data = data
        self.event = event
        self.id = id
        self.retry = retry

    def dump(self):
        """Serialize this event back to SSE wire format (trailing blank line
        included)."""
        out = []
        if self.id:
            out.append("id: %s" % self.id)
        # The default event name is implicit on the wire; skip it.
        if self.event != "message":
            out.append("event: %s" % self.event)
        if self.retry:
            out.append("retry: %s" % self.retry)
        for piece in self.data.split("\n"):
            out.append("data: %s" % piece)
        return "\n".join(out) + "\n\n"

    @classmethod
    def parse(cls, raw):
        """
        Given a possibly-multiline string representing an SSE message, parse it
        and return a Event object.
        """
        msg = cls()
        for line in raw.splitlines():
            match = cls.sse_line_pattern.match(line)
            if match is None:
                # Malformed line.  Discard but warn.
                warnings.warn('Invalid SSE line: "%s"' % line, SyntaxWarning)
                continue
            name = match.group("name")
            if not name:
                # Line began with ":" — an SSE comment; ignore it.
                continue
            value = match.group("value")
            if name == "data":
                # Multiple data lines accumulate, newline-joined.
                msg.data = value if not msg.data else "%s\n%s" % (msg.data, value)
            elif name == "event":
                msg.event = value
            elif name == "id":
                msg.id = value
            elif name == "retry":
                msg.retry = int(value)
        return msg

    def __str__(self):
        return self.data

View file

@ -0,0 +1,40 @@
import time
from pydantic import BaseModel, Field
from enum import Enum
class Sender(str, Enum):
    """Which side of the conversation produced a message."""

    HUMAN = "human"
    BOT = "bot"
class Message(BaseModel):
    """A single utterance in a conversation, attributed to a sender and
    stamped with the time it was recorded."""

    text: str
    sender: Sender
    timestamp: float

    def to_string(self, include_timestamp: bool = False) -> str:
        """Render as "SENDER: text", optionally with "(timestamp)" appended."""
        base = f"{self.sender.name}: {self.text}"
        if include_timestamp:
            return f"{base} ({self.timestamp})"
        return base
class Transcript(BaseModel):
    """An ordered log of conversation Messages plus the conversation's
    start time."""

    messages: list[Message] = []
    start_time: float = Field(default_factory=time.time)

    def to_string(self, include_timestamps: bool = False) -> str:
        """Render every message on its own line, in order."""
        rendered = [
            message.to_string(include_timestamp=include_timestamps)
            for message in self.messages
        ]
        return "\n".join(rendered)

    def add_human_message(self, text: str):
        """Append a human utterance stamped with the current time."""
        now = time.time()
        self.messages.append(Message(text=text, sender=Sender.HUMAN, timestamp=now))

    def add_bot_message(self, text: str):
        """Append a bot utterance stamped with the current time."""
        now = time.time()
        self.messages.append(Message(text=text, sender=Sender.BOT, timestamp=now))