open source
This commit is contained in:
parent
70b6e17c69
commit
a93bfc1ec9
61 changed files with 4013 additions and 126 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -3,3 +3,5 @@ __pycache__/
|
||||||
.env
|
.env
|
||||||
.DS_Store
|
.DS_Store
|
||||||
dist/
|
dist/
|
||||||
|
credentials.json
|
||||||
|
*.npy
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
from vocode.streaming.telephony.inbound_call_server import InboundCallServer
|
from vocode.streaming.telephony.hosted.inbound_call_server import InboundCallServer
|
||||||
from vocode.streaming.models.agent import EchoAgentConfig
|
from vocode.streaming.models.agent import EchoAgentConfig
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
|
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
|
||||||
from vocode.streaming.output_device.telephone_output import TelephoneOutput
|
from vocode.streaming.output_device.telephone_output import TelephoneOutput
|
||||||
from vocode.streaming.telephony.outbound_call import OutboundCall
|
from vocode.streaming.telephony.hosted.outbound_call import OutboundCall
|
||||||
from vocode.streaming.models.telephony import CallEntity
|
from vocode.streaming.models.telephony import CallEntity
|
||||||
from vocode.streaming.models.agent import (
|
from vocode.streaming.models.agent import (
|
||||||
EchoAgentConfig,
|
EchoAgentConfig,
|
||||||
|
|
@ -8,7 +8,7 @@ from vocode.streaming.models.agent import (
|
||||||
WebSocketUserImplementedAgentConfig,
|
WebSocketUserImplementedAgentConfig,
|
||||||
)
|
)
|
||||||
from vocode.streaming.models.message import BaseMessage
|
from vocode.streaming.models.message import BaseMessage
|
||||||
from vocode.streaming.telephony.zoom_dial_in import ZoomDialIn
|
from vocode.streaming.telephony.hosted.zoom_dial_in import ZoomDialIn
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
call = ZoomDialIn(
|
call = ZoomDialIn(
|
||||||
|
|
@ -3,6 +3,7 @@ import logging
|
||||||
import signal
|
import signal
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import os
|
import os
|
||||||
|
from vocode.streaming.hosted_streaming_conversation import HostedStreamingConversation
|
||||||
from vocode.streaming.streaming_conversation import StreamingConversation
|
from vocode.streaming.streaming_conversation import StreamingConversation
|
||||||
from vocode.helpers import create_microphone_input_and_speaker_output
|
from vocode.helpers import create_microphone_input_and_speaker_output
|
||||||
from vocode.streaming.models.transcriber import (
|
from vocode.streaming.models.transcriber import (
|
||||||
|
|
@ -22,7 +23,6 @@ from vocode.streaming.models.agent import (
|
||||||
)
|
)
|
||||||
from vocode.streaming.models.message import BaseMessage
|
from vocode.streaming.models.message import BaseMessage
|
||||||
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
|
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
|
||||||
from vocode.streaming.user_implemented_agent.restful_agent import RESTfulAgent
|
|
||||||
import vocode
|
import vocode
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
@ -37,7 +37,7 @@ if __name__ == "__main__":
|
||||||
streaming=True, use_default_devices=False
|
streaming=True, use_default_devices=False
|
||||||
)
|
)
|
||||||
|
|
||||||
conversation = StreamingConversation(
|
conversation = HostedStreamingConversation(
|
||||||
input_device=microphone_input,
|
input_device=microphone_input,
|
||||||
output_device=speaker_output,
|
output_device=speaker_output,
|
||||||
transcriber_config=DeepgramTranscriberConfig.from_input_device(
|
transcriber_config=DeepgramTranscriberConfig.from_input_device(
|
||||||
79
examples/streaming_conversation.py
Normal file
79
examples/streaming_conversation.py
Normal file
|
|
@ -0,0 +1,79 @@
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import signal
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import os
|
||||||
|
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
|
||||||
|
from vocode.streaming.streaming_conversation import StreamingConversation
|
||||||
|
from vocode.helpers import create_microphone_input_and_speaker_output
|
||||||
|
from vocode.streaming.models.transcriber import (
|
||||||
|
DeepgramTranscriberConfig,
|
||||||
|
PunctuationEndpointingConfig,
|
||||||
|
GoogleTranscriberConfig,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.agent import (
|
||||||
|
ChatGPTAgentConfig,
|
||||||
|
CutOffResponse,
|
||||||
|
FillerAudioConfig,
|
||||||
|
RESTfulUserImplementedAgentConfig,
|
||||||
|
WebSocketUserImplementedAgentConfig,
|
||||||
|
EchoAgentConfig,
|
||||||
|
LLMAgentConfig,
|
||||||
|
ChatGPTAgentConfig,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.message import BaseMessage
|
||||||
|
from vocode.streaming.models.synthesizer import (
|
||||||
|
AzureSynthesizerConfig,
|
||||||
|
GoogleSynthesizerConfig,
|
||||||
|
RimeSynthesizerConfig,
|
||||||
|
)
|
||||||
|
import vocode
|
||||||
|
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
|
||||||
|
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
vocode.api_key = os.getenv("VOCODE_API_KEY")
|
||||||
|
|
||||||
|
logging.basicConfig()
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
microphone_input, speaker_output = create_microphone_input_and_speaker_output(
|
||||||
|
streaming=True, use_default_devices=False
|
||||||
|
)
|
||||||
|
|
||||||
|
conversation = StreamingConversation(
|
||||||
|
output_device=speaker_output,
|
||||||
|
transcriber=DeepgramTranscriber(
|
||||||
|
DeepgramTranscriberConfig.from_input_device(
|
||||||
|
microphone_input, endpointing_config=PunctuationEndpointingConfig()
|
||||||
|
)
|
||||||
|
),
|
||||||
|
agent=ChatGPTAgent(
|
||||||
|
ChatGPTAgentConfig(
|
||||||
|
initial_message=BaseMessage(text="What up"),
|
||||||
|
prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like
|
||||||
|
hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""",
|
||||||
|
generate_responses=True,
|
||||||
|
cut_off_response=CutOffResponse(),
|
||||||
|
)
|
||||||
|
),
|
||||||
|
synthesizer=AzureSynthesizer(
|
||||||
|
AzureSynthesizerConfig.from_output_device(speaker_output),
|
||||||
|
),
|
||||||
|
logger=logger,
|
||||||
|
)
|
||||||
|
await conversation.start()
|
||||||
|
print("Conversation started, press Ctrl+C to end")
|
||||||
|
signal.signal(signal.SIGINT, lambda _0, _1: conversation.terminate())
|
||||||
|
while conversation.is_active():
|
||||||
|
chunk = microphone_input.get_audio()
|
||||||
|
if chunk:
|
||||||
|
conversation.receive_audio(chunk)
|
||||||
|
await asyncio.sleep(0)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
69
examples/telephony_app.py
Normal file
69
examples/telephony_app.py
Normal file
|
|
@ -0,0 +1,69 @@
|
||||||
|
import logging
|
||||||
|
from fastapi import FastAPI
|
||||||
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
|
||||||
|
from vocode.streaming.models.agent import ChatGPTAgentConfig
|
||||||
|
from vocode.streaming.models.message import BaseMessage
|
||||||
|
from vocode.streaming.models.telephony import TwilioConfig
|
||||||
|
from vocode.streaming.telephony.config_manager.redis_config_manager import (
|
||||||
|
RedisConfigManager,
|
||||||
|
)
|
||||||
|
from vocode.streaming.telephony.conversation.outbound_call import OutboundCall
|
||||||
|
|
||||||
|
from vocode.streaming.telephony.server.base import InboundCallConfig, TelephonyServer
|
||||||
|
|
||||||
|
app = FastAPI(docs_url=None)
|
||||||
|
|
||||||
|
logging.basicConfig()
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
config_manager = RedisConfigManager()
|
||||||
|
|
||||||
|
BASE_URL = "59b8e140372d.ngrok.app"
|
||||||
|
|
||||||
|
telephony_server = TelephonyServer(
|
||||||
|
base_url=BASE_URL,
|
||||||
|
config_manager=config_manager,
|
||||||
|
inbound_call_configs=[
|
||||||
|
InboundCallConfig(
|
||||||
|
url="/inbound_call",
|
||||||
|
agent_config=ChatGPTAgentConfig(
|
||||||
|
initial_message=BaseMessage(text="What up"),
|
||||||
|
prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like
|
||||||
|
hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""",
|
||||||
|
generate_responses=True,
|
||||||
|
),
|
||||||
|
twilio_config=TwilioConfig(
|
||||||
|
account_sid=os.getenv("TWILIO_ACCOUNT_SID"),
|
||||||
|
auth_token=os.getenv("TWILIO_AUTH_TOKEN"),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
],
|
||||||
|
logger=logger,
|
||||||
|
)
|
||||||
|
|
||||||
|
app.include_router(telephony_server.get_router())
|
||||||
|
|
||||||
|
# outbound_call = OutboundCall(
|
||||||
|
# base_url=BASE_URL,
|
||||||
|
# to_phone="+14088926228",
|
||||||
|
# from_phone="+14086600744",
|
||||||
|
# config_manager=config_manager,
|
||||||
|
# agent_config=ChatGPTAgentConfig(
|
||||||
|
# initial_message=BaseMessage(text="What up"),
|
||||||
|
# prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like
|
||||||
|
# hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""",
|
||||||
|
# generate_responses=True,
|
||||||
|
# ),
|
||||||
|
# twilio_config=TwilioConfig(
|
||||||
|
# account_sid=os.getenv("TWILIO_ACCOUNT_SID"),
|
||||||
|
# auth_token=os.getenv("TWILIO_AUTH_TOKEN"),
|
||||||
|
# ),
|
||||||
|
# logger=logger,
|
||||||
|
# )
|
||||||
|
# outbound_call.start()
|
||||||
|
|
@ -4,6 +4,8 @@ anyio==3.6.2
|
||||||
async-timeout==4.0.2
|
async-timeout==4.0.2
|
||||||
attrs==22.2.0
|
attrs==22.2.0
|
||||||
azure-cognitiveservices-speech==1.25.0
|
azure-cognitiveservices-speech==1.25.0
|
||||||
|
black==23.1.0
|
||||||
|
cachetools==5.3.0
|
||||||
certifi==2022.12.7
|
certifi==2022.12.7
|
||||||
cffi==1.15.1
|
cffi==1.15.1
|
||||||
charset-normalizer==3.0.1
|
charset-normalizer==3.0.1
|
||||||
|
|
@ -12,32 +14,50 @@ dataclasses-json==0.5.7
|
||||||
decorator==5.1.1
|
decorator==5.1.1
|
||||||
fastapi==0.92.0
|
fastapi==0.92.0
|
||||||
frozenlist==1.3.3
|
frozenlist==1.3.3
|
||||||
|
google-api-core==2.11.0
|
||||||
|
google-auth==2.16.3
|
||||||
|
google-cloud-speech==2.17.3
|
||||||
|
google-cloud-texttospeech==2.14.1
|
||||||
|
googleapis-common-protos==1.59.0
|
||||||
|
grpcio==1.51.3
|
||||||
|
grpcio-status==1.51.3
|
||||||
h11==0.14.0
|
h11==0.14.0
|
||||||
idna==3.4
|
idna==3.4
|
||||||
|
Jinja2==3.1.2
|
||||||
|
joblib==1.2.0
|
||||||
langchain==0.0.117
|
langchain==0.0.117
|
||||||
|
MarkupSafe==2.1.2
|
||||||
marshmallow==3.19.0
|
marshmallow==3.19.0
|
||||||
marshmallow-enum==1.5.1
|
marshmallow-enum==1.5.1
|
||||||
mccabe==0.7.0
|
mccabe==0.7.0
|
||||||
multidict==6.0.4
|
multidict==6.0.4
|
||||||
mypy-extensions==1.0.0
|
mypy-extensions==1.0.0
|
||||||
|
nltk==3.8.1
|
||||||
numpy==1.24.2
|
numpy==1.24.2
|
||||||
openai==0.27.2
|
openai==0.27.2
|
||||||
packaging==23.0
|
packaging==23.0
|
||||||
pathspec==0.11.0
|
pathspec==0.11.0
|
||||||
platformdirs==3.1.0
|
platformdirs==3.1.0
|
||||||
ply==3.11
|
ply==3.11
|
||||||
|
proto-plus==1.22.2
|
||||||
|
protobuf==4.22.1
|
||||||
|
pyasn1==0.4.8
|
||||||
|
pyasn1-modules==0.2.8
|
||||||
PyAudio==0.2.13
|
PyAudio==0.2.13
|
||||||
pycodestyle==2.10.0
|
pycodestyle==2.10.0
|
||||||
pycparser==2.21
|
pycparser==2.21
|
||||||
pydantic>=1.9.0
|
pydantic==1.10.7
|
||||||
pyflakes>=2.5.0
|
|
||||||
pydub==0.25.1
|
pydub==0.25.1
|
||||||
|
pyflakes==3.0.1
|
||||||
PyJWT==2.6.0
|
PyJWT==2.6.0
|
||||||
python-dotenv==0.21.1
|
python-dotenv==0.21.1
|
||||||
python-multipart==0.0.6
|
python-multipart==0.0.6
|
||||||
pytz==2022.7.1
|
pytz==2022.7.1
|
||||||
PyYAML==6.0
|
PyYAML==6.0
|
||||||
|
redis==4.5.3
|
||||||
|
regex==2023.3.23
|
||||||
requests==2.28.2
|
requests==2.28.2
|
||||||
|
rsa==4.9
|
||||||
six==1.16.0
|
six==1.16.0
|
||||||
sniffio==1.3.0
|
sniffio==1.3.0
|
||||||
sounddevice==0.4.6
|
sounddevice==0.4.6
|
||||||
|
|
@ -46,8 +66,9 @@ starlette==0.25.0
|
||||||
tenacity==8.2.2
|
tenacity==8.2.2
|
||||||
tomli==2.0.1
|
tomli==2.0.1
|
||||||
tqdm==4.65.0
|
tqdm==4.65.0
|
||||||
|
twilio==7.17.0
|
||||||
typing-inspect==0.8.0
|
typing-inspect==0.8.0
|
||||||
typing_extensions>=3.10.0.2
|
typing_extensions==4.5.0
|
||||||
urllib3==1.26.14
|
urllib3==1.26.14
|
||||||
uvicorn==0.20.0
|
uvicorn==0.20.0
|
||||||
websockets==10.4
|
websockets==10.4
|
||||||
|
|
|
||||||
44
vocode/streaming/agent/base_agent.py
Normal file
44
vocode/streaming/agent/base_agent.py
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
import random
|
||||||
|
from typing import Generator, Optional
|
||||||
|
from vocode.streaming.models.agent import (
|
||||||
|
AgentConfig,
|
||||||
|
ChatGPTAgentConfig,
|
||||||
|
LLMAgentConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class BaseAgent:
|
||||||
|
def __init__(self, agent_config: AgentConfig):
|
||||||
|
self.agent_config = agent_config
|
||||||
|
|
||||||
|
def get_agent_config(self) -> AgentConfig:
|
||||||
|
return self.agent_config
|
||||||
|
|
||||||
|
def start(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def respond(
|
||||||
|
self, human_input, is_interrupt: bool = False
|
||||||
|
) -> tuple[Optional[str], bool]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def generate_response(
|
||||||
|
self, human_input, is_interrupt: bool = False
|
||||||
|
) -> Generator[str, None, None]:
|
||||||
|
"""Returns a generator that yields a sentence at a time."""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def update_last_bot_message_on_cut_off(self, message: str):
|
||||||
|
"""Updates the last bot message in the conversation history when the human cuts off the bot's response."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get_cut_off_response(self) -> Optional[str]:
|
||||||
|
assert isinstance(self.agent_config, LLMAgentConfig) or isinstance(
|
||||||
|
self.agent_config, ChatGPTAgentConfig
|
||||||
|
)
|
||||||
|
on_cut_off_messages = self.agent_config.cut_off_response.messages
|
||||||
|
if on_cut_off_messages:
|
||||||
|
return random.choice(on_cut_off_messages).text
|
||||||
|
|
||||||
|
def terminate(self):
|
||||||
|
pass
|
||||||
50
vocode/streaming/agent/bot_sentiment_analyser.py
Normal file
50
vocode/streaming/agent/bot_sentiment_analyser.py
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
from typing import Optional
|
||||||
|
from langchain.llms import OpenAI
|
||||||
|
from langchain.prompts import PromptTemplate
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
TEMPLATE = """
|
||||||
|
Read the following conversation classify the final emotion of the Bot as one of [{emotions}].
|
||||||
|
Output the degree of emotion as a value between 0 and 1 in the format EMOTION,DEGREE: ex. {example_emotion},0.5
|
||||||
|
|
||||||
|
<start>
|
||||||
|
{{transcript}}
|
||||||
|
<end>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class BotSentiment(BaseModel):
|
||||||
|
emotion: Optional[str] = None
|
||||||
|
degree: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
|
class BotSentimentAnalyser:
|
||||||
|
def __init__(self, emotions: list[str], model_name: str = "text-davinci-003"):
|
||||||
|
self.model_name = model_name
|
||||||
|
self.llm = OpenAI(
|
||||||
|
model_name=self.model_name,
|
||||||
|
)
|
||||||
|
assert len(emotions) > 0
|
||||||
|
self.emotions = [e.lower() for e in emotions]
|
||||||
|
self.prompt = PromptTemplate(
|
||||||
|
input_variables=["transcript"],
|
||||||
|
template=TEMPLATE.format(
|
||||||
|
emotions=",".join(self.emotions), example_emotion=self.emotions[0]
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
def analyse(self, transcript: str) -> BotSentiment:
|
||||||
|
prompt = self.prompt.format(transcript=transcript)
|
||||||
|
response = self.llm(prompt).strip()
|
||||||
|
tokens = response.split(",")
|
||||||
|
if len(tokens) != 2:
|
||||||
|
return BotSentiment(emotion=None, degree=0.0)
|
||||||
|
emotion, degree = tokens
|
||||||
|
emotion = emotion.strip().lower()
|
||||||
|
if emotion.lower() not in self.emotions:
|
||||||
|
return BotSentiment(emotion=None, degree=0.0)
|
||||||
|
try:
|
||||||
|
degree = float(degree.strip())
|
||||||
|
except ValueError:
|
||||||
|
return BotSentiment(emotion=emotion, degree=0.5)
|
||||||
|
return BotSentiment(emotion=emotion, degree=degree)
|
||||||
158
vocode/streaming/agent/chat_gpt_agent.py
Normal file
158
vocode/streaming/agent/chat_gpt_agent.py
Normal file
|
|
@ -0,0 +1,158 @@
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
from langchain.prompts import (
|
||||||
|
ChatPromptTemplate,
|
||||||
|
MessagesPlaceholder,
|
||||||
|
SystemMessagePromptTemplate,
|
||||||
|
HumanMessagePromptTemplate,
|
||||||
|
)
|
||||||
|
from langchain.chains import ConversationChain
|
||||||
|
from langchain.chat_models import ChatOpenAI
|
||||||
|
from langchain.llms import OpenAIChat
|
||||||
|
from langchain.memory import ConversationBufferMemory
|
||||||
|
from langchain.schema import ChatMessage, AIMessage
|
||||||
|
import openai
|
||||||
|
import json
|
||||||
|
from typing import Generator, Optional
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from typing import Generator
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from vocode.streaming.agent.base_agent import BaseAgent
|
||||||
|
from vocode.streaming.models.agent import ChatGPTAgentConfig
|
||||||
|
from vocode.streaming.utils.sse_client import SSEClient
|
||||||
|
from vocode.streaming.agent.utils import stream_llm_response
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
||||||
|
|
||||||
|
|
||||||
|
class ChatGPTAgent(BaseAgent):
|
||||||
|
def __init__(self, agent_config: ChatGPTAgentConfig, logger: logging.Logger = None):
|
||||||
|
super().__init__(agent_config)
|
||||||
|
self.agent_config = agent_config
|
||||||
|
self.logger = logger or logging.getLogger(__name__)
|
||||||
|
self.logger.setLevel(logging.DEBUG)
|
||||||
|
self.prompt = ChatPromptTemplate.from_messages(
|
||||||
|
[
|
||||||
|
SystemMessagePromptTemplate.from_template(agent_config.prompt_preamble),
|
||||||
|
MessagesPlaceholder(variable_name="history"),
|
||||||
|
HumanMessagePromptTemplate.from_template("{input}"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
self.memory = ConversationBufferMemory(return_messages=True)
|
||||||
|
if agent_config.initial_message:
|
||||||
|
if (
|
||||||
|
agent_config.generate_responses
|
||||||
|
): # we use ChatMessages for memory when we generate responses
|
||||||
|
self.memory.chat_memory.messages.append(
|
||||||
|
ChatMessage(
|
||||||
|
content=agent_config.initial_message.text, role="assistant"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.memory.chat_memory.add_ai_message(
|
||||||
|
agent_config.initial_message.text
|
||||||
|
)
|
||||||
|
self.llm = ChatOpenAI(
|
||||||
|
model_name=self.agent_config.model_name,
|
||||||
|
temperature=self.agent_config.temperature,
|
||||||
|
max_tokens=self.agent_config.max_tokens,
|
||||||
|
)
|
||||||
|
self.conversation = ConversationChain(
|
||||||
|
memory=self.memory, prompt=self.prompt, llm=self.llm
|
||||||
|
)
|
||||||
|
self.first_response = (
|
||||||
|
self.create_first_response(agent_config.expected_first_prompt)
|
||||||
|
if agent_config.expected_first_prompt
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
self.is_first_response = True
|
||||||
|
|
||||||
|
def create_first_response(self, first_prompt):
|
||||||
|
return self.conversation.predict(input=first_prompt)
|
||||||
|
|
||||||
|
def respond(self, human_input, is_interrupt: bool = False) -> tuple[str, bool]:
|
||||||
|
if is_interrupt and self.agent_config.cut_off_response:
|
||||||
|
cut_off_response = self.get_cut_off_response()
|
||||||
|
self.memory.chat_memory.add_user_message(human_input)
|
||||||
|
self.memory.chat_memory.add_ai_message(cut_off_response)
|
||||||
|
return cut_off_response, False
|
||||||
|
self.logger.debug("LLM responding to human input")
|
||||||
|
if self.is_first_response and self.first_response:
|
||||||
|
self.logger.debug("First response is cached")
|
||||||
|
self.is_first_response = False
|
||||||
|
text = self.first_response
|
||||||
|
else:
|
||||||
|
text = self.conversation.predict(input=human_input)
|
||||||
|
self.logger.debug(f"LLM response: {text}")
|
||||||
|
return text, False
|
||||||
|
|
||||||
|
def generate_response(
|
||||||
|
self, human_input, is_interrupt: bool = False
|
||||||
|
) -> Generator[str, None, None]:
|
||||||
|
self.memory.chat_memory.messages.append(
|
||||||
|
ChatMessage(role="user", content=human_input)
|
||||||
|
)
|
||||||
|
if is_interrupt and self.agent_config.cut_off_response:
|
||||||
|
cut_off_response = self.get_cut_off_response()
|
||||||
|
self.memory.chat_memory.messages.append(
|
||||||
|
ChatMessage(role="assistant", content=cut_off_response)
|
||||||
|
)
|
||||||
|
yield cut_off_response
|
||||||
|
return
|
||||||
|
prompt_messages = [
|
||||||
|
ChatMessage(role="system", content=self.agent_config.prompt_preamble)
|
||||||
|
] + self.memory.chat_memory.messages
|
||||||
|
messages = SSEClient(
|
||||||
|
"POST",
|
||||||
|
"https://api.openai.com/v1/chat/completions",
|
||||||
|
headers={
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||||
|
},
|
||||||
|
json={
|
||||||
|
"model": self.agent_config.model_name,
|
||||||
|
"messages": [
|
||||||
|
prompt_message.dict(include={"content": True, "role": True})
|
||||||
|
for prompt_message in prompt_messages
|
||||||
|
],
|
||||||
|
"max_tokens": 256,
|
||||||
|
"temperature": 1.0,
|
||||||
|
"stream": True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
bot_memory_message = ChatMessage(role="assistant", content="")
|
||||||
|
self.memory.chat_memory.messages.append(bot_memory_message)
|
||||||
|
for message in stream_llm_response(
|
||||||
|
map(lambda event: json.loads(event.data), messages),
|
||||||
|
get_text=lambda choice: choice.get("delta", {}).get("content"),
|
||||||
|
):
|
||||||
|
bot_memory_message.content = f"{bot_memory_message.content} {message}"
|
||||||
|
yield message
|
||||||
|
|
||||||
|
def update_last_bot_message_on_cut_off(self, message: str):
|
||||||
|
for memory_message in self.memory.chat_memory.messages[::-1]:
|
||||||
|
if (
|
||||||
|
isinstance(memory_message, ChatMessage)
|
||||||
|
and memory_message.role == "assistant"
|
||||||
|
) or isinstance(memory_message, AIMessage):
|
||||||
|
memory_message.content = message
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
agent = ChatGPTAgent(
|
||||||
|
ChatGPTAgentConfig(
|
||||||
|
model_name="gpt-4",
|
||||||
|
prompt_preamble="The assistant is having a pleasant conversation about life. If the user hasn't completed their thought, the assistant responds with 'PASS'",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
while True:
|
||||||
|
# response = agent.respond(input("Human: "))[0]
|
||||||
|
# print(f"AI: {response}")
|
||||||
|
for response in agent.generate_response(input("Human: ")):
|
||||||
|
print(f"AI: {response}")
|
||||||
13
vocode/streaming/agent/echo_agent.py
Normal file
13
vocode/streaming/agent/echo_agent.py
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
from typing import Generator
|
||||||
|
from vocode.streaming.agent.base_agent import BaseAgent
|
||||||
|
|
||||||
|
|
||||||
|
class EchoAgent(BaseAgent):
|
||||||
|
def respond(self, human_input, is_interrupt: bool = False) -> tuple[str, bool]:
|
||||||
|
return human_input, False
|
||||||
|
|
||||||
|
def generate_response(self, human_input, is_interrupt: bool = False) -> Generator:
|
||||||
|
yield human_input
|
||||||
|
|
||||||
|
def update_last_bot_message_on_cut_off(self, message: str):
|
||||||
|
pass
|
||||||
32
vocode/streaming/agent/information_retrieval_agent.py
Normal file
32
vocode/streaming/agent/information_retrieval_agent.py
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
import logging
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain import OpenAI
|
||||||
|
from vocode.streaming.agent.llm_agent import LLMAgent
|
||||||
|
from ..models.agent import InformationRetrievalAgentConfig, LLMAgentConfig
|
||||||
|
|
||||||
|
|
||||||
|
class InformationRetrievalAgent(LLMAgent):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
agent_config: InformationRetrievalAgentConfig,
|
||||||
|
logger: logging.Logger,
|
||||||
|
):
|
||||||
|
# super().__init__(agent_config, logger)
|
||||||
|
prompt_preamble = f"""
|
||||||
|
The AI is a friendly phone bot built for information retrieval. It understands IVR navigation and chooses which numbers to press based on the intended goal and the options provided.
|
||||||
|
Once it reaches the human, it verifies the identity of the person it is trying to reach and states its purpose. If it needs to be transferred, then the AI asks to speak to the intended recipient of the phone call.
|
||||||
|
|
||||||
|
Here is the context for the call:
|
||||||
|
Intended goal: { agent_config.goal_description }
|
||||||
|
Intended recipient: { agent_config.recipient_descriptor }
|
||||||
|
Information to be collected: { agent_config.fields }
|
||||||
|
Information to provide to the person who answers the phone: this is a robot calling on behalf of { agent_config.caller_descriptor }
|
||||||
|
|
||||||
|
The AI begins the call by introducing itself and who it represents.
|
||||||
|
"""
|
||||||
|
agent_config = LLMAgentConfig(
|
||||||
|
prompt_preamble=prompt_preamble,
|
||||||
|
)
|
||||||
|
super().__init__(agent_config, logger=logger)
|
||||||
|
self.llm = OpenAI(model_name="text-davinci-003", temperature=1)
|
||||||
139
vocode/streaming/agent/llm_agent.py
Normal file
139
vocode/streaming/agent/llm_agent.py
Normal file
|
|
@ -0,0 +1,139 @@
|
||||||
|
import re
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from langchain import OpenAI
|
||||||
|
from langchain.llms import OpenAIChat
|
||||||
|
from typing import Generator
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from vocode.streaming.agent.base_agent import BaseAgent
|
||||||
|
from vocode.streaming.agent.utils import stream_llm_response
|
||||||
|
from vocode.streaming.models.agent import LLMAgentConfig
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
class LLMAgent(BaseAgent):
|
||||||
|
SENTENCE_ENDINGS = [".", "!", "?"]
|
||||||
|
|
||||||
|
DEFAULT_PROMPT_TEMPLATE = "{history}\nHuman: {human_input}\nAI:"
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
agent_config: LLMAgentConfig,
|
||||||
|
logger: logging.Logger = None,
|
||||||
|
sender="AI",
|
||||||
|
recipient="Human",
|
||||||
|
):
|
||||||
|
super().__init__(agent_config)
|
||||||
|
self.agent_config = agent_config
|
||||||
|
self.prompt_template = (
|
||||||
|
f"{agent_config.prompt_preamble}\n\n{self.DEFAULT_PROMPT_TEMPLATE}"
|
||||||
|
)
|
||||||
|
self.initial_bot_message = (
|
||||||
|
agent_config.initial_message.text if agent_config.initial_message else None
|
||||||
|
)
|
||||||
|
self.logger = logger or logging.getLogger(__name__)
|
||||||
|
self.sender = sender
|
||||||
|
self.recipient = recipient
|
||||||
|
self.memory = (
|
||||||
|
[f"AI: {agent_config.initial_message.text}"]
|
||||||
|
if agent_config.initial_message
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
self.llm = OpenAI(
|
||||||
|
model_name=self.agent_config.model_name,
|
||||||
|
temperature=self.agent_config.temperature,
|
||||||
|
max_tokens=self.agent_config.max_tokens,
|
||||||
|
)
|
||||||
|
self.stop_tokens = [f"{recipient}:"]
|
||||||
|
self.first_response = (
|
||||||
|
self.llm(
|
||||||
|
self.prompt_template.format(
|
||||||
|
history="", human_input=agent_config.expected_first_prompt
|
||||||
|
),
|
||||||
|
stop=self.stop_tokens,
|
||||||
|
).strip()
|
||||||
|
if agent_config.expected_first_prompt
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
self.is_first_response = True
|
||||||
|
|
||||||
|
def create_prompt(self, human_input):
|
||||||
|
history = "\n".join(self.memory[-5:])
|
||||||
|
return self.prompt_template.format(history=history, human_input=human_input)
|
||||||
|
|
||||||
|
def get_memory_entry(self, human_input, response):
|
||||||
|
return f"{self.recipient}: {human_input}\n{self.sender}: {response}"
|
||||||
|
|
||||||
|
def respond(self, human_input, is_interrupt: bool = False) -> tuple[str, bool]:
|
||||||
|
if is_interrupt and self.agent_config.cut_off_response:
|
||||||
|
cut_off_response = self.get_cut_off_response()
|
||||||
|
self.memory.append(self.get_memory_entry(human_input, cut_off_response))
|
||||||
|
return cut_off_response, False
|
||||||
|
self.logger.debug("LLM responding to human input")
|
||||||
|
if self.is_first_response and self.first_response:
|
||||||
|
self.logger.debug("First response is cached")
|
||||||
|
self.is_first_response = False
|
||||||
|
response = self.first_response
|
||||||
|
else:
|
||||||
|
response = self.llm(self.create_prompt(human_input), stop=self.stop_tokens)
|
||||||
|
response = response.replace(f"{self.sender}:", "")
|
||||||
|
self.memory.append(self.get_memory_entry(human_input, response))
|
||||||
|
self.logger.debug(f"LLM response: {response}")
|
||||||
|
return response, False
|
||||||
|
|
||||||
|
def generate_response(self, human_input, is_interrupt: bool = False) -> Generator:
|
||||||
|
self.logger.debug("LLM generating response to human input")
|
||||||
|
if is_interrupt and self.agent_config.cut_off_response:
|
||||||
|
cut_off_response = self.get_cut_off_response()
|
||||||
|
self.memory.append(self.get_memory_entry(human_input, cut_off_response))
|
||||||
|
yield cut_off_response
|
||||||
|
return
|
||||||
|
self.memory.append(self.get_memory_entry(human_input, ""))
|
||||||
|
if self.is_first_response and self.first_response:
|
||||||
|
self.logger.debug("First response is cached")
|
||||||
|
self.is_first_response = False
|
||||||
|
sentences = [self.first_response]
|
||||||
|
else:
|
||||||
|
self.logger.debug("Creating LLM prompt")
|
||||||
|
prompt = self.create_prompt(human_input)
|
||||||
|
self.logger.debug("Streaming LLM response")
|
||||||
|
sentences = stream_llm_response(
|
||||||
|
map(
|
||||||
|
lambda resp: resp.to_dict(),
|
||||||
|
self.llm.stream(prompt, stop=self.stop_tokens),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
response_buffer = ""
|
||||||
|
for sentence in sentences:
|
||||||
|
sentence = sentence.replace(f"{self.sender}:", "")
|
||||||
|
sentence = re.sub(r"^\s+(.*)", r" \1", sentence)
|
||||||
|
response_buffer += sentence
|
||||||
|
self.memory[-1] = self.get_memory_entry(human_input, response_buffer)
|
||||||
|
yield sentence
|
||||||
|
|
||||||
|
def update_last_bot_message_on_cut_off(self, message: str):
|
||||||
|
last_message = self.memory[-1]
|
||||||
|
new_last_message = (
|
||||||
|
last_message.split("\n", 1)[0] + f"\n{self.sender}: {message}"
|
||||||
|
)
|
||||||
|
self.memory[-1] = new_last_message
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
chat_responder = LLMAgent(
|
||||||
|
LLMAgentConfig(
|
||||||
|
prompt_preamble="""
|
||||||
|
The AI is having a pleasant conversation about life. If the human hasn't completed their thought, the AI responds with 'PASS'
|
||||||
|
|
||||||
|
{history}
|
||||||
|
Human: {human_input}
|
||||||
|
AI:""",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
while True:
|
||||||
|
# response = chat_responder.respond(input("Human: "))[0]
|
||||||
|
for response in chat_responder.generate_response(input("Human: ")):
|
||||||
|
print(f"AI: {response}")
|
||||||
25
vocode/streaming/agent/utils.py
Normal file
25
vocode/streaming/agent/utils.py
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
from typing import Generator
|
||||||
|
|
||||||
|
SENTENCE_ENDINGS = [".", "!", "?"]
|
||||||
|
|
||||||
|
|
||||||
|
def stream_llm_response(
|
||||||
|
gen, get_text=lambda choice: choice.get("text"), sentence_endings=SENTENCE_ENDINGS
|
||||||
|
) -> Generator:
|
||||||
|
buffer = ""
|
||||||
|
for response in gen:
|
||||||
|
choices = response.get("choices", [])
|
||||||
|
if len(choices) == 0:
|
||||||
|
break
|
||||||
|
choice = choices[0]
|
||||||
|
if choice["finish_reason"]:
|
||||||
|
break
|
||||||
|
token = get_text(choice)
|
||||||
|
if not token:
|
||||||
|
continue
|
||||||
|
buffer += token
|
||||||
|
if any(token.endswith(ending) for ending in sentence_endings):
|
||||||
|
yield buffer.strip()
|
||||||
|
buffer = ""
|
||||||
|
if buffer.strip():
|
||||||
|
yield buffer
|
||||||
3
vocode/streaming/constants.py
Normal file
3
vocode/streaming/constants.py
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS = 1
|
||||||
|
PER_CHUNK_ALLOWANCE_SECONDS = 0.05
|
||||||
|
ALLOWED_IDLE_TIME = 15
|
||||||
58
vocode/streaming/factory.py
Normal file
58
vocode/streaming/factory.py
Normal file
|
|
@ -0,0 +1,58 @@
|
||||||
|
from vocode.streaming.agent.base_agent import BaseAgent
|
||||||
|
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
|
||||||
|
from vocode.streaming.agent.echo_agent import EchoAgent
|
||||||
|
from vocode.streaming.agent.information_retrieval_agent import InformationRetrievalAgent
|
||||||
|
from vocode.streaming.agent.llm_agent import LLMAgent
|
||||||
|
from vocode.streaming.models.agent import AgentConfig, AgentType
|
||||||
|
from vocode.streaming.models.synthesizer import SynthesizerConfig, SynthesizerType
|
||||||
|
from vocode.streaming.models.transcriber import TranscriberConfig, TranscriberType
|
||||||
|
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
|
||||||
|
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
|
||||||
|
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
|
||||||
|
from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
|
||||||
|
from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
|
||||||
|
from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber
|
||||||
|
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
|
||||||
|
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
|
||||||
|
from vocode.streaming.transcriber.google_transcriber import GoogleTranscriber
|
||||||
|
|
||||||
|
|
||||||
|
def create_transcriber(transcriber_config: TranscriberConfig) -> BaseTranscriber:
|
||||||
|
if transcriber_config.type == TranscriberType.DEEPGRAM:
|
||||||
|
return DeepgramTranscriber(transcriber_config)
|
||||||
|
elif transcriber_config.type == TranscriberType.GOOGLE:
|
||||||
|
return GoogleTranscriber(transcriber_config)
|
||||||
|
elif transcriber_config.type == TranscriberType.ASSEMBLY_AI:
|
||||||
|
return AssemblyAITranscriber(transcriber_config)
|
||||||
|
else:
|
||||||
|
raise Exception("Invalid transcriber config")
|
||||||
|
|
||||||
|
|
||||||
|
def create_agent(agent_config: AgentConfig) -> BaseAgent:
|
||||||
|
if agent_config.type == AgentType.LLM:
|
||||||
|
return LLMAgent(agent_config=agent_config)
|
||||||
|
elif agent_config.type == AgentType.CHAT_GPT:
|
||||||
|
return ChatGPTAgent(agent_config=agent_config)
|
||||||
|
elif agent_config.type == AgentType.ECHO:
|
||||||
|
return EchoAgent(agent_config=agent_config)
|
||||||
|
elif agent_config.type == AgentType.INFORMATION_RETRIEVAL:
|
||||||
|
return InformationRetrievalAgent(
|
||||||
|
agent_config=agent_config,
|
||||||
|
)
|
||||||
|
raise Exception("Invalid agent config", agent_config.type)
|
||||||
|
|
||||||
|
|
||||||
|
def create_synthesizer(synthesizer_config: SynthesizerConfig) -> BaseSynthesizer:
|
||||||
|
if synthesizer_config.type == SynthesizerType.GOOGLE:
|
||||||
|
return GoogleSynthesizer(synthesizer_config)
|
||||||
|
elif synthesizer_config.type == SynthesizerType.AZURE:
|
||||||
|
return AzureSynthesizer(synthesizer_config)
|
||||||
|
elif synthesizer_config.type == SynthesizerType.ELEVEN_LABS:
|
||||||
|
kwargs = {}
|
||||||
|
if synthesizer_config.voice_id:
|
||||||
|
kwargs["voice_id"] = synthesizer_config.voice_id
|
||||||
|
return ElevenLabsSynthesizer(synthesizer_config, **kwargs)
|
||||||
|
elif synthesizer_config.type == SynthesizerType.RIME:
|
||||||
|
return RimeSynthesizer(synthesizer_config)
|
||||||
|
else:
|
||||||
|
raise Exception("Invalid synthesizer config")
|
||||||
106
vocode/streaming/hosted_streaming_conversation.py
Normal file
106
vocode/streaming/hosted_streaming_conversation.py
Normal file
|
|
@ -0,0 +1,106 @@
|
||||||
|
import websockets
|
||||||
|
from websockets.exceptions import ConnectionClosedOK
|
||||||
|
from websockets.client import WebSocketClientProtocol
|
||||||
|
import asyncio
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
import queue
|
||||||
|
import vocode
|
||||||
|
from vocode.streaming.input_device.base_input_device import (
|
||||||
|
BaseInputDevice,
|
||||||
|
)
|
||||||
|
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
|
||||||
|
from vocode.streaming.models.transcriber import TranscriberConfig
|
||||||
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
|
from vocode.streaming.models.websocket import (
|
||||||
|
ReadyMessage,
|
||||||
|
AudioMessage,
|
||||||
|
StartMessage,
|
||||||
|
StopMessage,
|
||||||
|
)
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
class HostedStreamingConversation:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
input_device: BaseInputDevice,
|
||||||
|
output_device: BaseOutputDevice,
|
||||||
|
transcriber_config: TranscriberConfig,
|
||||||
|
agent_config: AgentConfig,
|
||||||
|
synthesizer_config: SynthesizerConfig,
|
||||||
|
id: str = None,
|
||||||
|
):
|
||||||
|
self.id = id
|
||||||
|
self.input_device = input_device
|
||||||
|
self.output_device = output_device
|
||||||
|
self.transcriber_config = transcriber_config
|
||||||
|
self.agent_config = agent_config
|
||||||
|
self.synthesizer_config = synthesizer_config
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
|
self.receiver_ready = False
|
||||||
|
self.active = True
|
||||||
|
self.output_loop = asyncio.new_event_loop()
|
||||||
|
self.output_audio_queue = queue.Queue()
|
||||||
|
self.vocode_websocket_url = f"wss://{vocode.base_url}/conversation"
|
||||||
|
|
||||||
|
async def wait_for_ready(self):
|
||||||
|
while not self.receiver_ready:
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
return True
|
||||||
|
|
||||||
|
def deactivate(self):
|
||||||
|
self.active = False
|
||||||
|
|
||||||
|
def play_audio(self):
|
||||||
|
async def run():
|
||||||
|
while self.active:
|
||||||
|
try:
|
||||||
|
audio = self.output_audio_queue.get(timeout=5)
|
||||||
|
await self.output_device.send_async(audio)
|
||||||
|
except queue.Empty:
|
||||||
|
continue
|
||||||
|
|
||||||
|
loop = asyncio.new_event_loop()
|
||||||
|
loop.run_until_complete(run())
|
||||||
|
|
||||||
|
async def start(self):
|
||||||
|
async with websockets.connect(
|
||||||
|
f"{self.vocode_websocket_url}?key={vocode.api_key}"
|
||||||
|
) as ws:
|
||||||
|
|
||||||
|
async def sender(ws: WebSocketClientProtocol):
|
||||||
|
start_message = StartMessage(
|
||||||
|
transcriber_config=self.transcriber_config,
|
||||||
|
agent_config=self.agent_config,
|
||||||
|
synthesizer_config=self.synthesizer_config,
|
||||||
|
conversation_id=self.id,
|
||||||
|
)
|
||||||
|
await ws.send(start_message.json())
|
||||||
|
await self.wait_for_ready()
|
||||||
|
self.logger.info("Listening...press Ctrl+C to stop")
|
||||||
|
while self.active:
|
||||||
|
data = self.input_device.get_audio()
|
||||||
|
if data:
|
||||||
|
try:
|
||||||
|
await ws.send(AudioMessage.from_bytes(data).json())
|
||||||
|
except ConnectionClosedOK:
|
||||||
|
self.deactivate()
|
||||||
|
return
|
||||||
|
await asyncio.sleep(0)
|
||||||
|
await ws.send(StopMessage().json())
|
||||||
|
|
||||||
|
async def receiver(ws: WebSocketClientProtocol):
|
||||||
|
ReadyMessage.parse_raw(await ws.recv())
|
||||||
|
self.receiver_ready = True
|
||||||
|
async for msg in ws:
|
||||||
|
audio_message = AudioMessage.parse_raw(msg)
|
||||||
|
self.output_audio_queue.put_nowait(audio_message.get_bytes())
|
||||||
|
|
||||||
|
output_thread = threading.Thread(target=self.play_audio)
|
||||||
|
output_thread.start()
|
||||||
|
return await asyncio.gather(sender(ws), receiver(ws))
|
||||||
|
|
@ -42,6 +42,7 @@ class AgentConfig(TypedModel, type=AgentType.BASE):
|
||||||
initial_message: Optional[BaseMessage] = None
|
initial_message: Optional[BaseMessage] = None
|
||||||
generate_responses: bool = True
|
generate_responses: bool = True
|
||||||
allowed_idle_time_seconds: Optional[float] = None
|
allowed_idle_time_seconds: Optional[float] = None
|
||||||
|
allow_agent_to_be_cut_off: bool = True
|
||||||
end_conversation_on_goodbye: bool = False
|
end_conversation_on_goodbye: bool = False
|
||||||
send_filler_audio: Union[bool, FillerAudioConfig] = False
|
send_filler_audio: Union[bool, FillerAudioConfig] = False
|
||||||
|
|
||||||
|
|
@ -59,6 +60,13 @@ class LLMAgentConfig(AgentConfig, type=AgentType.LLM):
|
||||||
cut_off_response: Optional[CutOffResponse] = None
|
cut_off_response: Optional[CutOffResponse] = None
|
||||||
|
|
||||||
|
|
||||||
|
class ChatGPTAlphaAgentConfig(AgentConfig, type=AgentType.CHAT_GPT_ALPHA):
|
||||||
|
prompt_preamble: str
|
||||||
|
expected_first_prompt: Optional[str] = None
|
||||||
|
temperature: float = LLM_AGENT_DEFAULT_TEMPERATURE
|
||||||
|
max_tokens: int = LLM_AGENT_DEFAULT_MAX_TOKENS
|
||||||
|
|
||||||
|
|
||||||
class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT):
|
class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT):
|
||||||
prompt_preamble: str
|
prompt_preamble: str
|
||||||
expected_first_prompt: Optional[str] = None
|
expected_first_prompt: Optional[str] = None
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
class AudioEncoding(str, Enum):
|
class AudioEncoding(str, Enum):
|
||||||
LINEAR16 = "linear16"
|
LINEAR16 = "linear16"
|
||||||
MULAW = "mulaw"
|
MULAW = "mulaw"
|
||||||
|
|
@ -1,17 +1,17 @@
|
||||||
import pydantic
|
import pydantic
|
||||||
|
|
||||||
class BaseModel(pydantic.BaseModel):
|
|
||||||
|
|
||||||
|
class BaseModel(pydantic.BaseModel):
|
||||||
def __init__(self, **data):
|
def __init__(self, **data):
|
||||||
for key, value in data.items():
|
for key, value in data.items():
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
if 'type' in value:
|
if "type" in value:
|
||||||
data[key] = TypedModel.parse_obj(value)
|
data[key] = TypedModel.parse_obj(value)
|
||||||
super().__init__(**data)
|
super().__init__(**data)
|
||||||
|
|
||||||
|
|
||||||
# Adapted from https://github.com/pydantic/pydantic/discussions/3091
|
# Adapted from https://github.com/pydantic/pydantic/discussions/3091
|
||||||
class TypedModel(BaseModel):
|
class TypedModel(BaseModel):
|
||||||
|
|
||||||
_subtypes_ = []
|
_subtypes_ = []
|
||||||
|
|
||||||
def __init_subclass__(cls, type=None):
|
def __init_subclass__(cls, type=None):
|
||||||
|
|
@ -22,31 +22,30 @@ class TypedModel(BaseModel):
|
||||||
for t, cls in _cls._subtypes_:
|
for t, cls in _cls._subtypes_:
|
||||||
if t == type:
|
if t == type:
|
||||||
return cls
|
return cls
|
||||||
raise ValueError(f'Unknown type {type}')
|
raise ValueError(f"Unknown type {type}")
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_type(_cls, cls_name):
|
def get_type(_cls, cls_name):
|
||||||
for t, cls in _cls._subtypes_:
|
for t, cls in _cls._subtypes_:
|
||||||
if cls.__name__ == cls_name:
|
if cls.__name__ == cls_name:
|
||||||
return t
|
return t
|
||||||
raise ValueError(f'Unknown class {cls_name}')
|
raise ValueError(f"Unknown class {cls_name}")
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def parse_obj(cls, obj):
|
def parse_obj(cls, obj):
|
||||||
data_type = obj.get('type')
|
data_type = obj.get("type")
|
||||||
if data_type is None:
|
if data_type is None:
|
||||||
raise ValueError(f'type is required for {cls.__name__}')
|
raise ValueError(f"type is required for {cls.__name__}")
|
||||||
|
|
||||||
sub = cls.get_cls(data_type)
|
sub = cls.get_cls(data_type)
|
||||||
if sub is None:
|
if sub is None:
|
||||||
raise ValueError(f'Unknown type {data_type}')
|
raise ValueError(f"Unknown type {data_type}")
|
||||||
return sub(**obj)
|
return sub(**obj)
|
||||||
|
|
||||||
def _iter(self, **kwargs):
|
def _iter(self, **kwargs):
|
||||||
yield 'type', self.get_type(self.__class__.__name__)
|
yield "type", self.get_type(self.__class__.__name__)
|
||||||
yield from super()._iter(**kwargs)
|
yield from super()._iter(**kwargs)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def type(self):
|
def type(self):
|
||||||
return self.get_type(self.__class__.__name__)
|
return self.get_type(self.__class__.__name__)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,9 +2,14 @@ from enum import Enum
|
||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
from pydantic import BaseModel, validator
|
from pydantic import BaseModel, validator
|
||||||
|
|
||||||
|
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
|
||||||
|
from vocode.streaming.telephony.constants import (
|
||||||
|
DEFAULT_AUDIO_ENCODING,
|
||||||
|
DEFAULT_SAMPLING_RATE,
|
||||||
|
)
|
||||||
from .model import TypedModel
|
from .model import TypedModel
|
||||||
from .audio_encoding import AudioEncoding
|
from .audio_encoding import AudioEncoding
|
||||||
from ..output_device.base_output_device import BaseOutputDevice
|
|
||||||
|
|
||||||
|
|
||||||
class SynthesizerType(str, Enum):
|
class SynthesizerType(str, Enum):
|
||||||
|
|
@ -38,6 +43,13 @@ class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE):
|
||||||
audio_encoding=output_device.audio_encoding,
|
audio_encoding=output_device.audio_encoding,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_telephone_output_device(cls):
|
||||||
|
return cls(
|
||||||
|
sampling_rate=DEFAULT_SAMPLING_RATE,
|
||||||
|
audio_encoding=DEFAULT_AUDIO_ENCODING,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME = "en-US-AriaNeural"
|
AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME = "en-US-AriaNeural"
|
||||||
AZURE_SYNTHESIZER_DEFAULT_PITCH = 0
|
AZURE_SYNTHESIZER_DEFAULT_PITCH = 0
|
||||||
|
|
@ -45,18 +57,32 @@ AZURE_SYNTHESIZER_DEFAULT_RATE = 15
|
||||||
|
|
||||||
|
|
||||||
class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
|
class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
|
||||||
voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
|
voice_name: Optional[str] = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
|
||||||
pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH
|
pitch: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_PITCH
|
||||||
rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE
|
rate: Optional[int] = AZURE_SYNTHESIZER_DEFAULT_RATE
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
validate_assignment = True
|
||||||
|
|
||||||
|
@validator("voice_name")
|
||||||
|
def set_name(cls, voice_name):
|
||||||
|
return voice_name or AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
|
||||||
|
|
||||||
|
@validator("pitch")
|
||||||
|
def set_pitch(cls, pitch):
|
||||||
|
return pitch or AZURE_SYNTHESIZER_DEFAULT_PITCH
|
||||||
|
|
||||||
|
@validator("rate")
|
||||||
|
def set_rate(cls, rate):
|
||||||
|
return rate or AZURE_SYNTHESIZER_DEFAULT_RATE
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_output_device(
|
def from_output_device(
|
||||||
cls,
|
cls,
|
||||||
output_device: BaseOutputDevice,
|
output_device: BaseOutputDevice,
|
||||||
voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME,
|
voice_name: Optional[str] = None,
|
||||||
pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH,
|
pitch: Optional[int] = None,
|
||||||
rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE,
|
rate: Optional[int] = None,
|
||||||
track_bot_sentiment_in_voice: Union[bool, TrackBotSentimentConfig] = False,
|
|
||||||
):
|
):
|
||||||
return cls(
|
return cls(
|
||||||
sampling_rate=output_device.sampling_rate,
|
sampling_rate=output_device.sampling_rate,
|
||||||
|
|
@ -64,16 +90,33 @@ class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
|
||||||
voice_name=voice_name,
|
voice_name=voice_name,
|
||||||
pitch=pitch,
|
pitch=pitch,
|
||||||
rate=rate,
|
rate=rate,
|
||||||
track_bot_sentiment_in_voice=track_bot_sentiment_in_voice,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
pass
|
@classmethod
|
||||||
|
def from_telephone_output_device(
|
||||||
|
cls,
|
||||||
|
voice_name: Optional[str] = None,
|
||||||
|
pitch: Optional[int] = None,
|
||||||
|
rate: Optional[int] = None,
|
||||||
|
):
|
||||||
|
return cls(
|
||||||
|
sampling_rate=DEFAULT_SAMPLING_RATE,
|
||||||
|
audio_encoding=DEFAULT_AUDIO_ENCODING,
|
||||||
|
voice_name=voice_name,
|
||||||
|
pitch=pitch,
|
||||||
|
rate=rate,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
|
class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
|
||||||
|
api_key: str
|
||||||
|
voice_id: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
|
class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
|
||||||
speaker: str
|
speaker: str
|
||||||
|
|
||||||
|
|
@ -88,3 +131,14 @@ class RimeSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.RIME):
|
||||||
audio_encoding=output_device.audio_encoding,
|
audio_encoding=output_device.audio_encoding,
|
||||||
speaker=speaker,
|
speaker=speaker,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_telephone_output_device(
|
||||||
|
cls,
|
||||||
|
speaker: str,
|
||||||
|
):
|
||||||
|
return cls(
|
||||||
|
sampling_rate=DEFAULT_SAMPLING_RATE,
|
||||||
|
audio_encoding=DEFAULT_AUDIO_ENCODING,
|
||||||
|
speaker=speaker,
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
from vocode.streaming.models.model import BaseModel
|
from vocode.streaming.models.model import BaseModel
|
||||||
from vocode.streaming.models.agent import AgentConfig
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
|
|
@ -19,6 +20,7 @@ class CreateInboundCall(BaseModel):
|
||||||
agent_config: AgentConfig
|
agent_config: AgentConfig
|
||||||
synthesizer_config: Optional[SynthesizerConfig] = None
|
synthesizer_config: Optional[SynthesizerConfig] = None
|
||||||
twilio_sid: str
|
twilio_sid: str
|
||||||
|
conversation_id: Optional[str] = None
|
||||||
twilio_config: Optional[TwilioConfig] = None
|
twilio_config: Optional[TwilioConfig] = None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -48,3 +50,11 @@ class DialIntoZoomCall(BaseModel):
|
||||||
synthesizer_config: Optional[SynthesizerConfig] = None
|
synthesizer_config: Optional[SynthesizerConfig] = None
|
||||||
conversation_id: Optional[str] = None
|
conversation_id: Optional[str] = None
|
||||||
twilio_config: Optional[TwilioConfig] = None
|
twilio_config: Optional[TwilioConfig] = None
|
||||||
|
|
||||||
|
|
||||||
|
class CallConfig(BaseModel):
|
||||||
|
transcriber_config: TranscriberConfig
|
||||||
|
agent_config: AgentConfig
|
||||||
|
synthesizer_config: SynthesizerConfig
|
||||||
|
twilio_config: Optional[TwilioConfig]
|
||||||
|
twilio_sid: str
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,11 @@
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from vocode.streaming.input_device.base_input_device import (
|
from vocode.streaming.input_device.base_input_device import BaseInputDevice
|
||||||
BaseInputDevice,
|
from vocode.streaming.telephony.constants import (
|
||||||
|
DEFAULT_AUDIO_ENCODING,
|
||||||
|
DEFAULT_CHUNK_SIZE,
|
||||||
|
DEFAULT_SAMPLING_RATE,
|
||||||
)
|
)
|
||||||
from .audio_encoding import AudioEncoding
|
from .audio_encoding import AudioEncoding
|
||||||
from .model import BaseModel, TypedModel
|
from .model import BaseModel, TypedModel
|
||||||
|
|
@ -54,11 +57,25 @@ class TranscriberConfig(TypedModel, type=TranscriberType.BASE):
|
||||||
endpointing_config=endpointing_config,
|
endpointing_config=endpointing_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_telephone_input_device(
|
||||||
|
cls,
|
||||||
|
endpointing_config: Optional[EndpointingConfig] = None,
|
||||||
|
):
|
||||||
|
return cls(
|
||||||
|
sampling_rate=DEFAULT_SAMPLING_RATE,
|
||||||
|
audio_encoding=DEFAULT_AUDIO_ENCODING,
|
||||||
|
chunk_size=DEFAULT_CHUNK_SIZE,
|
||||||
|
endpointing_config=endpointing_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM):
|
class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM):
|
||||||
model: Optional[str] = None
|
model: Optional[str] = None
|
||||||
|
tier: Optional[str] = None
|
||||||
should_warmup_model: bool = False
|
should_warmup_model: bool = False
|
||||||
version: Optional[str] = None
|
version: Optional[str] = None
|
||||||
|
downsampling: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE):
|
class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE):
|
||||||
|
|
|
||||||
|
|
@ -6,33 +6,40 @@ from .transcriber import TranscriberConfig
|
||||||
from .agent import AgentConfig
|
from .agent import AgentConfig
|
||||||
from .synthesizer import SynthesizerConfig
|
from .synthesizer import SynthesizerConfig
|
||||||
|
|
||||||
class WebSocketMessageType(str, Enum):
|
|
||||||
BASE = 'websocket_base'
|
|
||||||
START = 'websocket_start'
|
|
||||||
AUDIO = 'websocket_audio'
|
|
||||||
READY = 'websocket_ready'
|
|
||||||
STOP = 'websocket_stop'
|
|
||||||
|
|
||||||
class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE): pass
|
class WebSocketMessageType(str, Enum):
|
||||||
|
BASE = "websocket_base"
|
||||||
|
START = "websocket_start"
|
||||||
|
AUDIO = "websocket_audio"
|
||||||
|
READY = "websocket_ready"
|
||||||
|
STOP = "websocket_stop"
|
||||||
|
|
||||||
|
|
||||||
|
class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO):
|
class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO):
|
||||||
data: str
|
data: str
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_bytes(cls, chunk: bytes):
|
def from_bytes(cls, chunk: bytes):
|
||||||
return cls(data=base64.b64encode(chunk).decode('utf-8'))
|
return cls(data=base64.b64encode(chunk).decode("utf-8"))
|
||||||
|
|
||||||
def get_bytes(self) -> bytes:
|
def get_bytes(self) -> bytes:
|
||||||
return base64.b64decode(self.data)
|
return base64.b64decode(self.data)
|
||||||
|
|
||||||
|
|
||||||
class StartMessage(WebSocketMessage, type=WebSocketMessageType.START):
|
class StartMessage(WebSocketMessage, type=WebSocketMessageType.START):
|
||||||
transcriber_config: TranscriberConfig
|
transcriber_config: TranscriberConfig
|
||||||
agent_config: AgentConfig
|
agent_config: AgentConfig
|
||||||
synthesizer_config: SynthesizerConfig
|
synthesizer_config: SynthesizerConfig
|
||||||
conversation_id: Optional[str] = None
|
conversation_id: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY):
|
class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP):
|
class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP):
|
||||||
pass
|
pass
|
||||||
|
|
@ -6,7 +6,7 @@ class BaseOutputDevice:
|
||||||
self.sampling_rate = sampling_rate
|
self.sampling_rate = sampling_rate
|
||||||
self.audio_encoding = audio_encoding
|
self.audio_encoding = audio_encoding
|
||||||
|
|
||||||
async def send_async(self, chunk):
|
async def send_async(self, chunk: bytes):
|
||||||
raise NotImplemented
|
raise NotImplemented
|
||||||
|
|
||||||
async def maybe_send_mark_async(self, message):
|
async def maybe_send_mark_async(self, message):
|
||||||
|
|
|
||||||
30
vocode/streaming/output_device/twilio_output_device.py
Normal file
30
vocode/streaming/output_device/twilio_output_device.py
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
import json
|
||||||
|
import base64
|
||||||
|
|
||||||
|
from fastapi import WebSocket
|
||||||
|
|
||||||
|
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
|
||||||
|
|
||||||
|
|
||||||
|
class TwilioOutputDevice(BaseOutputDevice):
|
||||||
|
def __init__(self, ws: WebSocket = None, stream_sid: str = None):
|
||||||
|
self.ws = ws
|
||||||
|
self.stream_sid = stream_sid
|
||||||
|
|
||||||
|
async def send_async(self, chunk: bytes):
|
||||||
|
twilio_message = {
|
||||||
|
"event": "media",
|
||||||
|
"streamSid": self.stream_sid,
|
||||||
|
"media": {"payload": base64.b64encode(chunk).decode("utf-8")},
|
||||||
|
}
|
||||||
|
await self.ws.send_text(json.dumps(twilio_message))
|
||||||
|
|
||||||
|
async def maybe_send_mark_async(self, message_sent):
|
||||||
|
mark_message = {
|
||||||
|
"event": "mark",
|
||||||
|
"streamSid": self.stream_sid,
|
||||||
|
"mark": {
|
||||||
|
"name": "Sent {}".format(message_sent),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
await self.ws.send_text(json.dumps(mark_message))
|
||||||
|
|
@ -1,26 +1,67 @@
|
||||||
import websockets
|
|
||||||
from websockets.exceptions import ConnectionClosedOK
|
|
||||||
from websockets.client import WebSocketClientProtocol
|
|
||||||
import asyncio
|
import asyncio
|
||||||
from dotenv import load_dotenv
|
from asyncio import Future
|
||||||
import os
|
import queue
|
||||||
|
from typing import Callable, Awaitable, Optional, Any
|
||||||
import logging
|
import logging
|
||||||
import threading
|
import threading
|
||||||
import queue
|
import time
|
||||||
import vocode
|
import secrets
|
||||||
from vocode.streaming.input_device.base_input_device import (
|
import random
|
||||||
BaseInputDevice,
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from vocode.streaming.agent.bot_sentiment_analyser import (
|
||||||
|
BotSentiment,
|
||||||
|
BotSentimentAnalyser,
|
||||||
)
|
)
|
||||||
|
from vocode.streaming.agent.information_retrieval_agent import InformationRetrievalAgent
|
||||||
|
from vocode.streaming.models.message import BaseMessage
|
||||||
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
|
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
|
||||||
from vocode.streaming.models.transcriber import TranscriberConfig
|
from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
|
||||||
from vocode.streaming.models.agent import AgentConfig
|
from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber
|
||||||
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
from vocode.streaming.utils.goodbye_model import GoodbyeModel
|
||||||
from vocode.streaming.models.websocket import (
|
from vocode.streaming.utils.transcript import Transcript
|
||||||
ReadyMessage,
|
|
||||||
AudioMessage,
|
from vocode.streaming.models.transcriber import (
|
||||||
StartMessage,
|
TranscriberConfig,
|
||||||
StopMessage,
|
TranscriberType,
|
||||||
)
|
)
|
||||||
|
from vocode.streaming.models.agent import (
|
||||||
|
AgentConfig,
|
||||||
|
AgentType,
|
||||||
|
FillerAudioConfig,
|
||||||
|
FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.synthesizer import (
|
||||||
|
SynthesizerConfig,
|
||||||
|
SynthesizerType,
|
||||||
|
TrackBotSentimentConfig,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.websocket import AudioMessage
|
||||||
|
from vocode.streaming.constants import (
|
||||||
|
TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS,
|
||||||
|
PER_CHUNK_ALLOWANCE_SECONDS,
|
||||||
|
ALLOWED_IDLE_TIME,
|
||||||
|
)
|
||||||
|
from vocode.streaming.agent.base_agent import BaseAgent
|
||||||
|
from vocode.streaming.synthesizer.base_synthesizer import (
|
||||||
|
BaseSynthesizer,
|
||||||
|
SynthesisResult,
|
||||||
|
FillerAudio,
|
||||||
|
)
|
||||||
|
from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
|
||||||
|
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
|
||||||
|
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
|
||||||
|
from vocode.streaming.utils import (
|
||||||
|
create_conversation_id,
|
||||||
|
create_loop_in_thread,
|
||||||
|
get_chunk_size_per_second,
|
||||||
|
)
|
||||||
|
from vocode.streaming.transcriber.base_transcriber import (
|
||||||
|
Transcription,
|
||||||
|
BaseTranscriber,
|
||||||
|
)
|
||||||
|
from vocode.streaming.transcriber.google_transcriber import GoogleTranscriber
|
||||||
|
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
|
@ -28,79 +69,468 @@ load_dotenv()
|
||||||
class StreamingConversation:
|
class StreamingConversation:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
input_device: BaseInputDevice,
|
|
||||||
output_device: BaseOutputDevice,
|
output_device: BaseOutputDevice,
|
||||||
transcriber_config: TranscriberConfig,
|
transcriber: BaseTranscriber,
|
||||||
agent_config: AgentConfig,
|
agent: BaseAgent,
|
||||||
synthesizer_config: SynthesizerConfig,
|
synthesizer: BaseSynthesizer,
|
||||||
id: str = None,
|
conversation_id: str = None,
|
||||||
|
per_chunk_allowance_seconds: int = PER_CHUNK_ALLOWANCE_SECONDS,
|
||||||
|
logger: Optional[logging.Logger] = None,
|
||||||
):
|
):
|
||||||
self.id = id
|
self.id = conversation_id or create_conversation_id()
|
||||||
self.input_device = input_device
|
self.logger = logger or logging.getLogger(__name__)
|
||||||
self.output_device = output_device
|
self.output_device = output_device
|
||||||
self.transcriber_config = transcriber_config
|
self.transcriber = transcriber
|
||||||
self.agent_config = agent_config
|
self.transcriber.set_on_response(self.on_transcription_response)
|
||||||
self.synthesizer_config = synthesizer_config
|
self.transcriber_task = None
|
||||||
self.logger = logging.getLogger(__name__)
|
self.agent = agent
|
||||||
self.receiver_ready = False
|
self.synthesizer = synthesizer
|
||||||
self.active = True
|
self.synthesizer_event_loop = asyncio.new_event_loop()
|
||||||
self.output_loop = asyncio.new_event_loop()
|
self.synthesizer_thread = threading.Thread(
|
||||||
self.output_audio_queue = queue.Queue()
|
name="synthesizer",
|
||||||
self.vocode_websocket_url = f"wss://{vocode.base_url}/conversation"
|
target=create_loop_in_thread,
|
||||||
|
args=(self.synthesizer_event_loop,),
|
||||||
|
)
|
||||||
|
self.per_chunk_allowance_seconds = per_chunk_allowance_seconds
|
||||||
|
self.transcript = Transcript()
|
||||||
|
self.bot_sentiment = None
|
||||||
|
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
|
||||||
|
if isinstance(
|
||||||
|
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice,
|
||||||
|
bool,
|
||||||
|
):
|
||||||
|
self.track_bot_sentiment_config = TrackBotSentimentConfig()
|
||||||
|
else:
|
||||||
|
self.track_bot_sentiment_config = (
|
||||||
|
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice
|
||||||
|
)
|
||||||
|
self.bot_sentiment_analyser = BotSentimentAnalyser(
|
||||||
|
emotions=self.track_bot_sentiment_config.emotions
|
||||||
|
)
|
||||||
|
self.goodbye_model = GoodbyeModel()
|
||||||
|
|
||||||
async def wait_for_ready(self):
|
self.is_human_speaking = False
|
||||||
while not self.receiver_ready:
|
|
||||||
await asyncio.sleep(0.1)
|
|
||||||
return True
|
|
||||||
|
|
||||||
def deactivate(self):
|
|
||||||
self.active = False
|
self.active = False
|
||||||
|
self.current_synthesis_task = None
|
||||||
def play_audio(self):
|
self.is_current_synthesis_interruptable = False
|
||||||
async def run():
|
self.stop_events: queue.Queue[threading.Event] = queue.Queue()
|
||||||
while self.active:
|
self.last_action_timestamp = time.time()
|
||||||
try:
|
self.check_for_idle_task = None
|
||||||
audio = self.output_audio_queue.get(timeout=5)
|
self.track_bot_sentiment_task = None
|
||||||
await self.output_device.send_async(audio)
|
self.should_wait_for_filler_audio_done_event = False
|
||||||
except queue.Empty:
|
self.current_filler_audio_done_event: Optional[threading.Event] = None
|
||||||
continue
|
self.current_filler_seconds_per_chunk: int = 0
|
||||||
|
self.current_transcription_is_interrupt: bool = False
|
||||||
loop = asyncio.new_event_loop()
|
|
||||||
loop.run_until_complete(run())
|
|
||||||
|
|
||||||
async def start(self):
|
async def start(self):
|
||||||
async with websockets.connect(
|
self.transcriber_task = asyncio.create_task(self.transcriber.run())
|
||||||
f"{self.vocode_websocket_url}?key={vocode.api_key}"
|
is_ready = await self.transcriber.ready()
|
||||||
) as ws:
|
if not is_ready:
|
||||||
|
raise Exception("Transcriber startup failed")
|
||||||
async def sender(ws: WebSocketClientProtocol):
|
self.synthesizer_thread.start()
|
||||||
start_message = StartMessage(
|
if self.agent.get_agent_config().send_filler_audio:
|
||||||
transcriber_config=self.transcriber_config,
|
filler_audio_config = (
|
||||||
agent_config=self.agent_config,
|
self.agent.get_agent_config().send_filler_audio
|
||||||
synthesizer_config=self.synthesizer_config,
|
if isinstance(
|
||||||
conversation_id=self.id,
|
self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
|
||||||
)
|
)
|
||||||
await ws.send(start_message.json())
|
else FillerAudioConfig()
|
||||||
await self.wait_for_ready()
|
)
|
||||||
self.logger.info("Listening...press Ctrl+C to stop")
|
self.synthesizer.set_filler_audios(filler_audio_config)
|
||||||
while self.active:
|
self.agent.start()
|
||||||
data = self.input_device.get_audio()
|
if self.agent.get_agent_config().initial_message:
|
||||||
if data:
|
self.transcript.add_bot_message(
|
||||||
try:
|
self.agent.get_agent_config().initial_message.text
|
||||||
await ws.send(AudioMessage.from_bytes(data).json())
|
)
|
||||||
except ConnectionClosedOK:
|
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
|
||||||
self.deactivate()
|
self.update_bot_sentiment()
|
||||||
|
self.send_message_to_stream_nonblocking(
|
||||||
|
self.agent.get_agent_config().initial_message, False
|
||||||
|
)
|
||||||
|
self.active = True
|
||||||
|
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
|
||||||
|
self.track_bot_sentiment_task = asyncio.create_task(
|
||||||
|
self.track_bot_sentiment()
|
||||||
|
)
|
||||||
|
self.check_for_idle_task = asyncio.create_task(self.check_for_idle())
|
||||||
|
|
||||||
|
async def check_for_idle(self):
|
||||||
|
while self.is_active():
|
||||||
|
if time.time() - self.last_action_timestamp > (
|
||||||
|
self.agent.get_agent_config().allowed_idle_time_seconds
|
||||||
|
or ALLOWED_IDLE_TIME
|
||||||
|
):
|
||||||
|
self.logger.debug("Conversation idle for too long, terminating")
|
||||||
|
self.mark_terminated()
|
||||||
return
|
return
|
||||||
|
await asyncio.sleep(15)
|
||||||
|
|
||||||
|
async def track_bot_sentiment(self):
|
||||||
|
prev_transcript = None
|
||||||
|
while self.is_active():
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
if self.transcript.to_string() != prev_transcript:
|
||||||
|
self.update_bot_sentiment()
|
||||||
|
prev_transcript = self.transcript.to_string()
|
||||||
|
|
||||||
|
def update_bot_sentiment(self):
|
||||||
|
new_bot_sentiment = self.bot_sentiment_analyser.analyse(
|
||||||
|
self.transcript.to_string()
|
||||||
|
)
|
||||||
|
if new_bot_sentiment.emotion:
|
||||||
|
self.logger.debug("Bot sentiment: %s", new_bot_sentiment)
|
||||||
|
self.bot_sentiment = new_bot_sentiment
|
||||||
|
|
||||||
|
def receive_audio(self, chunk: bytes):
|
||||||
|
self.transcriber.send_audio(chunk)
|
||||||
|
|
||||||
|
async def send_messages_to_stream_async(
|
||||||
|
self,
|
||||||
|
messages,
|
||||||
|
should_allow_human_to_cut_off_bot: bool,
|
||||||
|
wait_for_filler_audio: bool = False,
|
||||||
|
) -> tuple[str, bool]:
|
||||||
|
messages_queue = queue.Queue()
|
||||||
|
messages_done = threading.Event()
|
||||||
|
speech_cut_off = threading.Event()
|
||||||
|
seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
|
||||||
|
chunk_size = (
|
||||||
|
get_chunk_size_per_second(
|
||||||
|
self.synthesizer.get_synthesizer_config().audio_encoding,
|
||||||
|
self.synthesizer.get_synthesizer_config().sampling_rate,
|
||||||
|
)
|
||||||
|
* seconds_per_chunk
|
||||||
|
)
|
||||||
|
|
||||||
|
async def send_to_call():
|
||||||
|
response_buffer = ""
|
||||||
|
cut_off = False
|
||||||
|
self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
message: BaseMessage = messages_queue.get_nowait()
|
||||||
|
except queue.Empty:
|
||||||
|
if messages_done.is_set():
|
||||||
|
break
|
||||||
|
else:
|
||||||
await asyncio.sleep(0)
|
await asyncio.sleep(0)
|
||||||
await ws.send(StopMessage().json())
|
continue
|
||||||
|
|
||||||
async def receiver(ws: WebSocketClientProtocol):
|
stop_event = self.enqueue_stop_event()
|
||||||
ReadyMessage.parse_raw(await ws.recv())
|
synthesis_result = self.synthesizer.create_speech(
|
||||||
self.receiver_ready = True
|
message, chunk_size, bot_sentiment=self.bot_sentiment
|
||||||
async for msg in ws:
|
)
|
||||||
audio_message = AudioMessage.parse_raw(msg)
|
message_sent, cut_off = await self.send_speech_to_output(
|
||||||
self.output_audio_queue.put_nowait(audio_message.get_bytes())
|
message.text,
|
||||||
|
synthesis_result,
|
||||||
|
stop_event,
|
||||||
|
seconds_per_chunk,
|
||||||
|
)
|
||||||
|
self.logger.debug("Message sent: {}".format(message_sent))
|
||||||
|
response_buffer = f"{response_buffer} {message_sent}"
|
||||||
|
if cut_off:
|
||||||
|
speech_cut_off.set()
|
||||||
|
break
|
||||||
|
await asyncio.sleep(0)
|
||||||
|
if cut_off:
|
||||||
|
self.agent.update_last_bot_message_on_cut_off(response_buffer)
|
||||||
|
self.transcript.add_bot_message(response_buffer)
|
||||||
|
return response_buffer, cut_off
|
||||||
|
|
||||||
output_thread = threading.Thread(target=self.play_audio)
|
asyncio.run_coroutine_threadsafe(send_to_call(), self.synthesizer_event_loop)
|
||||||
output_thread.start()
|
|
||||||
return await asyncio.gather(sender(ws), receiver(ws))
|
messages_generated = 0
|
||||||
|
for i, message in enumerate(messages):
|
||||||
|
messages_generated += 1
|
||||||
|
if i == 0:
|
||||||
|
if wait_for_filler_audio:
|
||||||
|
self.interrupt_all_synthesis()
|
||||||
|
self.wait_for_filler_audio_to_finish()
|
||||||
|
if speech_cut_off.is_set():
|
||||||
|
break
|
||||||
|
messages_queue.put_nowait(BaseMessage(text=message))
|
||||||
|
await asyncio.sleep(0)
|
||||||
|
if messages_generated == 0:
|
||||||
|
self.logger.debug("Agent generated no messages")
|
||||||
|
if wait_for_filler_audio:
|
||||||
|
self.interrupt_all_synthesis()
|
||||||
|
messages_done.set()
|
||||||
|
|
||||||
|
def send_message_to_stream_nonblocking(
|
||||||
|
self,
|
||||||
|
message: BaseMessage,
|
||||||
|
should_allow_human_to_cut_off_bot: bool,
|
||||||
|
):
|
||||||
|
asyncio.run_coroutine_threadsafe(
|
||||||
|
self.send_message_to_stream_async(
|
||||||
|
message,
|
||||||
|
self.agent.get_agent_config().allow_agent_to_be_cut_off,
|
||||||
|
),
|
||||||
|
self.synthesizer_event_loop,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def send_message_to_stream_async(
|
||||||
|
self,
|
||||||
|
message: BaseMessage,
|
||||||
|
should_allow_human_to_cut_off_bot: bool,
|
||||||
|
) -> tuple[str, bool]:
|
||||||
|
self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
|
||||||
|
stop_event = self.enqueue_stop_event()
|
||||||
|
self.logger.debug("Synthesizing speech for message")
|
||||||
|
seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
|
||||||
|
chunk_size = (
|
||||||
|
get_chunk_size_per_second(
|
||||||
|
self.synthesizer.get_synthesizer_config().audio_encoding,
|
||||||
|
self.synthesizer.get_synthesizer_config().sampling_rate,
|
||||||
|
)
|
||||||
|
* seconds_per_chunk
|
||||||
|
)
|
||||||
|
synthesis_result = self.synthesizer.create_speech(
|
||||||
|
message, chunk_size, bot_sentiment=self.bot_sentiment
|
||||||
|
)
|
||||||
|
message_sent, cut_off = await self.send_speech_to_output(
|
||||||
|
message.text,
|
||||||
|
synthesis_result,
|
||||||
|
stop_event,
|
||||||
|
seconds_per_chunk,
|
||||||
|
)
|
||||||
|
self.logger.debug("Message sent: {}".format(message_sent))
|
||||||
|
if cut_off:
|
||||||
|
self.agent.update_last_bot_message_on_cut_off(message_sent)
|
||||||
|
self.transcript.add_bot_message(message_sent)
|
||||||
|
return message_sent, cut_off
|
||||||
|
|
||||||
|
def warmup_synthesizer(self):
|
||||||
|
self.synthesizer.ready_synthesizer()
|
||||||
|
|
||||||
|
# returns an estimate of what was sent up to, and a flag if the message was cut off
|
||||||
|
async def send_speech_to_output(
|
||||||
|
self,
|
||||||
|
message,
|
||||||
|
synthesis_result: SynthesisResult,
|
||||||
|
stop_event: threading.Event,
|
||||||
|
seconds_per_chunk: int,
|
||||||
|
is_filler_audio: bool = False,
|
||||||
|
):
|
||||||
|
message_sent = message
|
||||||
|
cut_off = False
|
||||||
|
chunk_size = seconds_per_chunk * get_chunk_size_per_second(
|
||||||
|
self.synthesizer.get_synthesizer_config().audio_encoding,
|
||||||
|
self.synthesizer.get_synthesizer_config().sampling_rate,
|
||||||
|
)
|
||||||
|
for i, chunk_result in enumerate(synthesis_result.chunk_generator):
|
||||||
|
start_time = time.time()
|
||||||
|
speech_length_seconds = seconds_per_chunk * (
|
||||||
|
len(chunk_result.chunk) / chunk_size
|
||||||
|
)
|
||||||
|
if stop_event.is_set():
|
||||||
|
seconds = i * seconds_per_chunk
|
||||||
|
self.logger.debug(
|
||||||
|
"Interrupted, stopping text to speech after {} chunks".format(i)
|
||||||
|
)
|
||||||
|
message_sent = f"{synthesis_result.get_message_up_to(seconds)}-"
|
||||||
|
cut_off = True
|
||||||
|
break
|
||||||
|
if i == 0:
|
||||||
|
if is_filler_audio:
|
||||||
|
self.should_wait_for_filler_audio_done_event = True
|
||||||
|
await self.output_device.send_async(chunk_result.chunk)
|
||||||
|
end_time = time.time()
|
||||||
|
await asyncio.sleep(
|
||||||
|
max(
|
||||||
|
speech_length_seconds
|
||||||
|
- (end_time - start_time)
|
||||||
|
- self.per_chunk_allowance_seconds,
|
||||||
|
0,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.logger.debug(
|
||||||
|
"Sent chunk {} with size {}".format(i, len(chunk_result.chunk))
|
||||||
|
)
|
||||||
|
self.last_action_timestamp = time.time()
|
||||||
|
# clears it off the stop events queue
|
||||||
|
if not stop_event.is_set():
|
||||||
|
stop_event.set()
|
||||||
|
return message_sent, cut_off
|
||||||
|
|
||||||
|
async def on_transcription_response(self, transcription: Transcription):
|
||||||
|
self.last_action_timestamp = time.time()
|
||||||
|
if transcription.is_final:
|
||||||
|
self.logger.debug(
|
||||||
|
"Got transcription: {}, confidence: {}".format(
|
||||||
|
transcription.message, transcription.confidence
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if not self.is_human_speaking:
|
||||||
|
# send interrupt
|
||||||
|
self.current_transcription_is_interrupt = False
|
||||||
|
if self.is_current_synthesis_interruptable:
|
||||||
|
self.logger.debug("sending interrupt")
|
||||||
|
self.current_transcription_is_interrupt = self.interrupt_all_synthesis()
|
||||||
|
self.logger.debug("Human started speaking")
|
||||||
|
|
||||||
|
transcription.is_interrupt = self.current_transcription_is_interrupt
|
||||||
|
self.is_human_speaking = not transcription.is_final
|
||||||
|
return await self.handle_transcription(transcription)
|
||||||
|
|
||||||
|
def enqueue_stop_event(self):
|
||||||
|
stop_event = threading.Event()
|
||||||
|
self.stop_events.put_nowait(stop_event)
|
||||||
|
return stop_event
|
||||||
|
|
||||||
|
def interrupt_all_synthesis(self):
|
||||||
|
"""Returns true if any synthesis was interrupted"""
|
||||||
|
num_interrupts = 0
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
stop_event = self.stop_events.get_nowait()
|
||||||
|
if not stop_event.is_set():
|
||||||
|
self.logger.debug("Interrupting synthesis")
|
||||||
|
stop_event.set()
|
||||||
|
num_interrupts += 1
|
||||||
|
except queue.Empty:
|
||||||
|
break
|
||||||
|
return num_interrupts > 0
|
||||||
|
|
||||||
|
async def send_filler_audio_to_output(
|
||||||
|
self,
|
||||||
|
filler_audio: FillerAudio,
|
||||||
|
stop_event: threading.Event,
|
||||||
|
done_event: threading.Event,
|
||||||
|
):
|
||||||
|
filler_synthesis_result = filler_audio.create_synthesis_result()
|
||||||
|
self.is_current_synthesis_interruptable = filler_audio.is_interruptable
|
||||||
|
if isinstance(
|
||||||
|
self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
|
||||||
|
):
|
||||||
|
silence_threshold = (
|
||||||
|
self.agent.get_agent_config().send_filler_audio.silence_threshold_seconds
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
silence_threshold = FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS
|
||||||
|
await asyncio.sleep(silence_threshold)
|
||||||
|
self.logger.debug("Sending filler audio to output")
|
||||||
|
await self.send_speech_to_output(
|
||||||
|
filler_audio.message.text,
|
||||||
|
filler_synthesis_result,
|
||||||
|
stop_event,
|
||||||
|
filler_audio.seconds_per_chunk,
|
||||||
|
is_filler_audio=True,
|
||||||
|
)
|
||||||
|
done_event.set()
|
||||||
|
|
||||||
|
def wait_for_filler_audio_to_finish(self):
|
||||||
|
if not self.should_wait_for_filler_audio_done_event:
|
||||||
|
self.logger.debug(
|
||||||
|
"Not waiting for filler audio to finish since we didn't send any chunks"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
self.should_wait_for_filler_audio_done_event = False
|
||||||
|
if (
|
||||||
|
self.current_filler_audio_done_event
|
||||||
|
and not self.current_filler_audio_done_event.is_set()
|
||||||
|
):
|
||||||
|
self.logger.debug("Waiting for filler audio to finish")
|
||||||
|
# this should guarantee that filler audio finishes, since it has to be on its last chunk
|
||||||
|
if not self.current_filler_audio_done_event.wait(
|
||||||
|
self.current_filler_seconds_per_chunk
|
||||||
|
):
|
||||||
|
self.logger.debug("Filler audio did not finish")
|
||||||
|
|
||||||
|
async def handle_transcription(self, transcription: Transcription):
|
||||||
|
if transcription.is_final:
|
||||||
|
self.transcript.add_human_message(transcription.message)
|
||||||
|
goodbye_detected_task = None
|
||||||
|
if self.agent.get_agent_config().end_conversation_on_goodbye:
|
||||||
|
goodbye_detected_task = asyncio.create_task(
|
||||||
|
self.goodbye_model.is_goodbye(transcription.message)
|
||||||
|
)
|
||||||
|
if self.agent.get_agent_config().send_filler_audio:
|
||||||
|
self.logger.debug("Sending filler audio")
|
||||||
|
if self.synthesizer.filler_audios:
|
||||||
|
filler_audio = random.choice(self.synthesizer.filler_audios)
|
||||||
|
self.logger.debug(f"Chose {filler_audio.message.text}")
|
||||||
|
self.current_filler_audio_done_event = threading.Event()
|
||||||
|
self.current_filler_seconds_per_chunk = (
|
||||||
|
filler_audio.seconds_per_chunk
|
||||||
|
)
|
||||||
|
stop_event = self.enqueue_stop_event()
|
||||||
|
asyncio.run_coroutine_threadsafe(
|
||||||
|
self.send_filler_audio_to_output(
|
||||||
|
filler_audio,
|
||||||
|
stop_event,
|
||||||
|
done_event=self.current_filler_audio_done_event,
|
||||||
|
),
|
||||||
|
self.synthesizer_event_loop,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.logger.debug("No filler audio available for synthesizer")
|
||||||
|
self.logger.debug("Generating response for transcription")
|
||||||
|
if self.agent.get_agent_config().generate_responses:
|
||||||
|
responses = self.agent.generate_response(
|
||||||
|
transcription.message, is_interrupt=transcription.is_interrupt
|
||||||
|
)
|
||||||
|
await self.send_messages_to_stream_async(
|
||||||
|
responses,
|
||||||
|
self.agent.get_agent_config().allow_agent_to_be_cut_off,
|
||||||
|
wait_for_filler_audio=self.agent.get_agent_config().send_filler_audio,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
response, should_stop = self.agent.respond(
|
||||||
|
transcription.message, is_interrupt=transcription.is_interrupt
|
||||||
|
)
|
||||||
|
if self.agent.get_agent_config().send_filler_audio:
|
||||||
|
self.interrupt_all_synthesis()
|
||||||
|
self.wait_for_filler_audio_to_finish()
|
||||||
|
if should_stop:
|
||||||
|
self.logger.debug("Agent requested to stop")
|
||||||
|
self.mark_terminated()
|
||||||
|
return
|
||||||
|
if response:
|
||||||
|
self.send_message_to_stream_nonblocking(
|
||||||
|
BaseMessage(text=response),
|
||||||
|
self.agent.get_agent_config().allow_agent_to_be_cut_off,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.logger.debug("No response generated")
|
||||||
|
if goodbye_detected_task:
|
||||||
|
try:
|
||||||
|
goodbye_detected = await asyncio.wait_for(
|
||||||
|
goodbye_detected_task, 0.1
|
||||||
|
)
|
||||||
|
if goodbye_detected:
|
||||||
|
self.logger.debug("Goodbye detected, ending conversation")
|
||||||
|
self.mark_terminated()
|
||||||
|
return
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
self.logger.debug("Goodbye detection timed out")
|
||||||
|
|
||||||
|
def mark_terminated(self):
|
||||||
|
self.active = False
|
||||||
|
|
||||||
|
# must be called from the main thread
|
||||||
|
def terminate(self):
|
||||||
|
self.mark_terminated()
|
||||||
|
if self.check_for_idle_task:
|
||||||
|
self.logger.debug("Terminating check_for_idle Task")
|
||||||
|
self.check_for_idle_task.cancel()
|
||||||
|
if self.track_bot_sentiment_task:
|
||||||
|
self.logger.debug("Terminating track_bot_sentiment Task")
|
||||||
|
self.track_bot_sentiment_task.cancel()
|
||||||
|
self.logger.debug("Terminating agent")
|
||||||
|
self.agent.terminate()
|
||||||
|
self.logger.debug("Terminating speech transcriber")
|
||||||
|
self.transcriber.terminate()
|
||||||
|
self.logger.debug("Terminating synthesizer event loop")
|
||||||
|
self.synthesizer_event_loop.call_soon_threadsafe(
|
||||||
|
self.synthesizer_event_loop.stop
|
||||||
|
)
|
||||||
|
self.logger.debug("Terminating synthesizer thread")
|
||||||
|
if self.synthesizer_thread.is_alive():
|
||||||
|
self.synthesizer_thread.join()
|
||||||
|
self.logger.debug("Terminating transcriber task")
|
||||||
|
self.transcriber_task.cancel()
|
||||||
|
self.logger.debug("Successfully terminated")
|
||||||
|
|
||||||
|
def is_active(self):
|
||||||
|
return self.active
|
||||||
|
|
|
||||||
250
vocode/streaming/synthesizer/azure_synthesizer.py
Normal file
250
vocode/streaming/synthesizer/azure_synthesizer.py
Normal file
|
|
@ -0,0 +1,250 @@
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from typing import Any, Optional
|
||||||
|
from xml.etree import ElementTree
|
||||||
|
import azure.cognitiveservices.speech as speechsdk
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
|
||||||
|
from vocode.streaming.models.message import BaseMessage, SSMLMessage
|
||||||
|
|
||||||
|
from vocode.streaming.synthesizer.base_synthesizer import (
|
||||||
|
BaseSynthesizer,
|
||||||
|
SynthesisResult,
|
||||||
|
FILLER_PHRASES,
|
||||||
|
FILLER_AUDIO_PATH,
|
||||||
|
FillerAudio,
|
||||||
|
encode_as_wav,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
NAMESPACES = {
|
||||||
|
"mstts": "https://www.w3.org/2001/mstts",
|
||||||
|
"": "https://www.w3.org/2001/10/synthesis",
|
||||||
|
}
|
||||||
|
|
||||||
|
ElementTree.register_namespace("", NAMESPACES.get(""))
|
||||||
|
ElementTree.register_namespace("mstts", NAMESPACES.get("mstts"))
|
||||||
|
|
||||||
|
|
||||||
|
class WordBoundaryEventPool:
|
||||||
|
def __init__(self):
|
||||||
|
self.events = []
|
||||||
|
|
||||||
|
def add(self, event):
|
||||||
|
self.events.append(
|
||||||
|
{
|
||||||
|
"text": event.text,
|
||||||
|
"text_offset": event.text_offset,
|
||||||
|
"audio_offset": (event.audio_offset + 5000) / (10000 * 1000),
|
||||||
|
"boudary_type": event.boundary_type,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_events_sorted(self):
|
||||||
|
return sorted(self.events, key=lambda event: event["audio_offset"])
|
||||||
|
|
||||||
|
|
||||||
|
class AzureSynthesizer(BaseSynthesizer):
|
||||||
|
OFFSET_MS = 100
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, synthesizer_config: AzureSynthesizerConfig, logger: logging.Logger = None
|
||||||
|
):
|
||||||
|
super().__init__(synthesizer_config)
|
||||||
|
self.synthesizer_config = synthesizer_config
|
||||||
|
# Instantiates a client
|
||||||
|
speech_config = speechsdk.SpeechConfig(
|
||||||
|
subscription=os.environ.get("AZURE_SPEECH_KEY"),
|
||||||
|
region=os.environ.get("AZURE_SPEECH_REGION"),
|
||||||
|
)
|
||||||
|
if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
|
||||||
|
if self.synthesizer_config.sampling_rate == 44100:
|
||||||
|
speech_config.set_speech_synthesis_output_format(
|
||||||
|
speechsdk.SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm
|
||||||
|
)
|
||||||
|
if self.synthesizer_config.sampling_rate == 48000:
|
||||||
|
speech_config.set_speech_synthesis_output_format(
|
||||||
|
speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm
|
||||||
|
)
|
||||||
|
if self.synthesizer_config.sampling_rate == 24000:
|
||||||
|
speech_config.set_speech_synthesis_output_format(
|
||||||
|
speechsdk.SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm
|
||||||
|
)
|
||||||
|
elif self.synthesizer_config.sampling_rate == 16000:
|
||||||
|
speech_config.set_speech_synthesis_output_format(
|
||||||
|
speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
|
||||||
|
)
|
||||||
|
elif self.synthesizer_config.sampling_rate == 8000:
|
||||||
|
speech_config.set_speech_synthesis_output_format(
|
||||||
|
speechsdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm
|
||||||
|
)
|
||||||
|
elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
|
||||||
|
speech_config.set_speech_synthesis_output_format(
|
||||||
|
speechsdk.SpeechSynthesisOutputFormat.Raw8Khz8BitMonoMULaw
|
||||||
|
)
|
||||||
|
self.synthesizer = speechsdk.SpeechSynthesizer(
|
||||||
|
speech_config=speech_config, audio_config=None
|
||||||
|
)
|
||||||
|
|
||||||
|
self.voice_name = self.synthesizer_config.voice_name
|
||||||
|
self.pitch = self.synthesizer_config.pitch
|
||||||
|
self.rate = self.synthesizer_config.rate
|
||||||
|
self.logger = logger or logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def get_phrase_filler_audios(self) -> list[FillerAudio]:
|
||||||
|
filler_phrase_audios = []
|
||||||
|
for filler_phrase in FILLER_PHRASES:
|
||||||
|
cache_key = "-".join(
|
||||||
|
(
|
||||||
|
str(filler_phrase.text),
|
||||||
|
str(self.synthesizer_config.type),
|
||||||
|
str(self.synthesizer_config.audio_encoding),
|
||||||
|
str(self.synthesizer_config.sampling_rate),
|
||||||
|
str(self.voice_name),
|
||||||
|
str(self.pitch),
|
||||||
|
str(self.rate),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
filler_audio_path = os.path.join(FILLER_AUDIO_PATH, f"{cache_key}.bytes")
|
||||||
|
if os.path.exists(filler_audio_path):
|
||||||
|
audio_data = open(filler_audio_path, "rb").read()
|
||||||
|
else:
|
||||||
|
self.logger.debug(f"Generating filler audio for {filler_phrase.text}")
|
||||||
|
ssml = self.create_ssml(filler_phrase.text)
|
||||||
|
result = self.synthesizer.speak_ssml(ssml)
|
||||||
|
offset = self.synthesizer_config.sampling_rate * self.OFFSET_MS // 1000
|
||||||
|
audio_data = result.audio_data[offset:]
|
||||||
|
with open(filler_audio_path, "wb") as f:
|
||||||
|
f.write(audio_data)
|
||||||
|
filler_phrase_audios.append(
|
||||||
|
FillerAudio(
|
||||||
|
filler_phrase,
|
||||||
|
audio_data,
|
||||||
|
self.synthesizer_config,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return filler_phrase_audios
|
||||||
|
|
||||||
|
def add_marks(self, message: str, index=0) -> str:
|
||||||
|
search_result = re.search(r"([\.\,\:\;\-\—]+)", message)
|
||||||
|
if search_result is None:
|
||||||
|
return message
|
||||||
|
start, end = search_result.span()
|
||||||
|
with_mark = message[:start] + f'<mark name="{index}" />' + message[start:end]
|
||||||
|
rest = message[end:]
|
||||||
|
rest_stripped = re.sub(r"^(.+)([\.\,\:\;\-\—]+)$", r"\1", rest)
|
||||||
|
if len(rest_stripped) == 0:
|
||||||
|
return with_mark
|
||||||
|
return with_mark + self.add_marks(rest_stripped, index + 1)
|
||||||
|
|
||||||
|
def word_boundary_cb(self, evt, pool):
|
||||||
|
pool.add(evt)
|
||||||
|
|
||||||
|
def create_ssml(
|
||||||
|
self, message: str, bot_sentiment: Optional[BotSentiment] = None
|
||||||
|
) -> str:
|
||||||
|
ssml_root = ElementTree.fromstring(
|
||||||
|
'<speak version="1.0" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="en-US"></speak>'
|
||||||
|
)
|
||||||
|
voice = ElementTree.SubElement(ssml_root, "voice")
|
||||||
|
voice.set("name", self.voice_name)
|
||||||
|
voice_root = voice
|
||||||
|
if bot_sentiment and bot_sentiment.emotion:
|
||||||
|
styled = ElementTree.SubElement(
|
||||||
|
voice, "{%s}express-as" % NAMESPACES.get("mstts")
|
||||||
|
)
|
||||||
|
styled.set("style", bot_sentiment.emotion)
|
||||||
|
styled.set(
|
||||||
|
"styledegree", str(bot_sentiment.degree * 2)
|
||||||
|
) # Azure specific, it's a scale of 0-2
|
||||||
|
voice_root = styled
|
||||||
|
prosody = ElementTree.SubElement(voice_root, "prosody")
|
||||||
|
prosody.set("pitch", f"{self.pitch}%")
|
||||||
|
prosody.set("rate", f"{self.rate}%")
|
||||||
|
prosody.text = message.strip()
|
||||||
|
return ElementTree.tostring(ssml_root, encoding="unicode")
|
||||||
|
|
||||||
|
def synthesize_ssml(self, ssml: str) -> tuple[speechsdk.AudioDataStream, str]:
|
||||||
|
result = self.synthesizer.start_speaking_ssml_async(ssml).get()
|
||||||
|
return speechsdk.AudioDataStream(result)
|
||||||
|
|
||||||
|
def ready_synthesizer(self):
|
||||||
|
connection = speechsdk.Connection.from_speech_synthesizer(self.synthesizer)
|
||||||
|
connection.open(True)
|
||||||
|
|
||||||
|
# given the number of seconds the message was allowed to go until, where did we get in the message?
|
||||||
|
def get_message_up_to(
|
||||||
|
self,
|
||||||
|
message: str,
|
||||||
|
ssml: str,
|
||||||
|
seconds: int,
|
||||||
|
word_boundary_event_pool: WordBoundaryEventPool,
|
||||||
|
) -> str:
|
||||||
|
events = word_boundary_event_pool.get_events_sorted()
|
||||||
|
for event in events:
|
||||||
|
if event["audio_offset"] > seconds:
|
||||||
|
ssml_fragment = ssml[: event["text_offset"]]
|
||||||
|
return ssml_fragment.split(">")[-1]
|
||||||
|
return message
|
||||||
|
|
||||||
|
def create_speech(
|
||||||
|
self,
|
||||||
|
message: BaseMessage,
|
||||||
|
chunk_size: int,
|
||||||
|
bot_sentiment: Optional[BotSentiment] = None,
|
||||||
|
) -> SynthesisResult:
|
||||||
|
# offset = int(self.OFFSET_MS * (self.synthesizer_config.sampling_rate / 1000))
|
||||||
|
offset = 0
|
||||||
|
self.logger.debug(f"Synthesizing message: {message}")
|
||||||
|
|
||||||
|
def chunk_generator(
|
||||||
|
audio_data_stream: speechsdk.AudioDataStream, chunk_transform=lambda x: x
|
||||||
|
):
|
||||||
|
audio_buffer = bytes(chunk_size)
|
||||||
|
filled_size = audio_data_stream.read_data(audio_buffer)
|
||||||
|
if filled_size != chunk_size:
|
||||||
|
yield SynthesisResult.ChunkResult(
|
||||||
|
chunk_transform(audio_buffer[offset:]), True
|
||||||
|
)
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
yield SynthesisResult.ChunkResult(
|
||||||
|
chunk_transform(audio_buffer[offset:]), False
|
||||||
|
)
|
||||||
|
while True:
|
||||||
|
filled_size = audio_data_stream.read_data(audio_buffer)
|
||||||
|
if filled_size != chunk_size:
|
||||||
|
yield SynthesisResult.ChunkResult(
|
||||||
|
chunk_transform(audio_buffer[: filled_size - offset]), True
|
||||||
|
)
|
||||||
|
break
|
||||||
|
yield SynthesisResult.ChunkResult(chunk_transform(audio_buffer), False)
|
||||||
|
|
||||||
|
word_boundary_event_pool = WordBoundaryEventPool()
|
||||||
|
self.synthesizer.synthesis_word_boundary.connect(
|
||||||
|
lambda event: self.word_boundary_cb(event, word_boundary_event_pool)
|
||||||
|
)
|
||||||
|
ssml = (
|
||||||
|
message.ssml
|
||||||
|
if isinstance(message, SSMLMessage)
|
||||||
|
else self.create_ssml(message.text, bot_sentiment=bot_sentiment)
|
||||||
|
)
|
||||||
|
audio_data_stream = self.synthesize_ssml(ssml)
|
||||||
|
if self.synthesizer_config.should_encode_as_wav:
|
||||||
|
output_generator = chunk_generator(
|
||||||
|
audio_data_stream,
|
||||||
|
lambda chunk: encode_as_wav(chunk, self.synthesizer_config),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
output_generator = chunk_generator(audio_data_stream)
|
||||||
|
return SynthesisResult(
|
||||||
|
output_generator,
|
||||||
|
lambda seconds: self.get_message_up_to(
|
||||||
|
message, ssml, seconds, word_boundary_event_pool
|
||||||
|
),
|
||||||
|
)
|
||||||
169
vocode/streaming/synthesizer/base_synthesizer.py
Normal file
169
vocode/streaming/synthesizer/base_synthesizer.py
Normal file
|
|
@ -0,0 +1,169 @@
|
||||||
|
import os
|
||||||
|
from typing import Any, Generator, Callable, Optional
|
||||||
|
import math
|
||||||
|
import io
|
||||||
|
import wave
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
from nltk.tokenize.treebank import TreebankWordDetokenizer
|
||||||
|
|
||||||
|
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
|
||||||
|
from vocode.streaming.models.agent import FillerAudioConfig
|
||||||
|
from vocode.streaming.models.message import BaseMessage
|
||||||
|
from vocode.streaming.utils import convert_wav, get_chunk_size_per_second
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
|
|
||||||
|
FILLER_PHRASES = [
|
||||||
|
BaseMessage(text="Um..."),
|
||||||
|
BaseMessage(text="Uh..."),
|
||||||
|
BaseMessage(text="Uh-huh..."),
|
||||||
|
BaseMessage(text="Mm-hmm..."),
|
||||||
|
BaseMessage(text="Hmm..."),
|
||||||
|
BaseMessage(text="Okay..."),
|
||||||
|
BaseMessage(text="Right..."),
|
||||||
|
BaseMessage(text="Let me see..."),
|
||||||
|
]
|
||||||
|
FILLER_AUDIO_PATH = os.path.join(os.path.dirname(__file__), "filler_audio")
|
||||||
|
TYPING_NOISE_PATH = "%s/typing-noise.wav" % FILLER_AUDIO_PATH
|
||||||
|
|
||||||
|
|
||||||
|
def encode_as_wav(chunk: bytes, synthesizer_config: SynthesizerConfig) -> bytes:
    """Wrap a chunk of raw 16-bit PCM audio in a mono WAV (RIFF) container.

    Args:
        chunk: raw PCM sample bytes (2 bytes per sample).
        synthesizer_config: supplies the sampling rate; its audio encoding
            must be LINEAR16 since the sample width is hard-coded to 2.

    Returns:
        The chunk as a complete, self-describing WAV file.
    """
    # Validate the precondition before doing any work.
    assert synthesizer_config.audio_encoding == AudioEncoding.LINEAR16
    output_bytes_io = io.BytesIO()
    # Context manager ensures the wave writer is closed, which finalizes the
    # RIFF header sizes (the original left the writer open and relied on
    # writeframes having patched the header).
    with wave.open(output_bytes_io, "wb") as in_memory_wav:
        in_memory_wav.setnchannels(1)  # mono
        in_memory_wav.setsampwidth(2)  # 16-bit samples
        in_memory_wav.setframerate(synthesizer_config.sampling_rate)
        in_memory_wav.writeframes(chunk)
    return output_bytes_io.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
class SynthesisResult:
    """Pairs a stream of synthesized audio chunks with a callback mapping
    seconds of playback to the portion of the message spoken so far."""

    class ChunkResult:
        """A single piece of synthesized audio plus an end-of-stream marker."""

        def __init__(self, chunk: bytes, is_last_chunk: bool):
            # Raw audio bytes for this piece of the utterance.
            self.chunk = chunk
            # True when no further chunks will follow.
            self.is_last_chunk = is_last_chunk

    def __init__(
        self,
        chunk_generator: Generator[ChunkResult, None, None],
        get_message_up_to: Callable[[int], str],
    ):
        # Lazily yields ChunkResult objects holding the audio to play.
        self.chunk_generator = chunk_generator
        # seconds spoken -> prefix of the message that was actually read.
        self.get_message_up_to = get_message_up_to
|
||||||
|
|
||||||
|
|
||||||
|
class FillerAudio:
    """Pre-rendered audio (e.g. a filler phrase or typing noise) that can be
    replayed as a SynthesisResult without calling the TTS backend."""

    def __init__(
        self,
        message: BaseMessage,
        audio_data: bytes,
        synthesizer_config: SynthesizerConfig,
        is_interruptable: bool = False,
        seconds_per_chunk: int = 1,
    ):
        self.message = message
        self.audio_data = audio_data
        self.synthesizer_config = synthesizer_config
        self.is_interruptable = is_interruptable
        self.seconds_per_chunk = seconds_per_chunk

    def create_synthesis_result(self) -> SynthesisResult:
        """Wrap the cached audio bytes in a SynthesisResult, chunked for playback."""
        bytes_per_second = get_chunk_size_per_second(
            self.synthesizer_config.audio_encoding,
            self.synthesizer_config.sampling_rate,
        )
        chunk_size = bytes_per_second * self.seconds_per_chunk

        def generate_chunks(transform=lambda x: x):
            data = self.audio_data
            total = len(data)
            for start in range(0, total, chunk_size):
                end = start + chunk_size
                # The last-chunk flag is set only when this slice runs past
                # the end of the data (a short final chunk).
                yield SynthesisResult.ChunkResult(
                    transform(data[start:end]), end > total
                )

        if self.synthesizer_config.should_encode_as_wav:
            generator = generate_chunks(
                lambda chunk: encode_as_wav(chunk, self.synthesizer_config)
            )
        else:
            generator = generate_chunks()
        # Filler audio is atomic from the caller's perspective: the "message
        # up to N seconds" is always the whole message text.
        return SynthesisResult(generator, lambda seconds: self.message.text)
|
||||||
|
|
||||||
|
|
||||||
|
class BaseSynthesizer:
    """Abstract base for all text-to-speech backends.

    Holds the shared synthesizer config, filler-audio plumbing, and helpers
    for estimating how much of a message was spoken when playback is cut off.
    Subclasses implement create_speech().
    """

    def __init__(self, synthesizer_config: SynthesizerConfig):
        self.synthesizer_config = synthesizer_config
        if synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            # Telephony mu-law audio is fixed at 8kHz in this codebase.
            assert (
                synthesizer_config.sampling_rate == 8000
            ), "MuLaw encoding only supports 8kHz sampling rate"
        # Populated by set_filler_audios(); empty until then.
        self.filler_audios: list[FillerAudio] = []

    def get_synthesizer_config(self) -> SynthesizerConfig:
        """Return the config this synthesizer was constructed with."""
        return self.synthesizer_config

    def get_typing_noise_filler_audio(self) -> FillerAudio:
        """Load the bundled typing-noise WAV, converted to this synthesizer's
        sample rate and encoding."""
        return FillerAudio(
            message=BaseMessage(text="<typing noise>"),
            audio_data=convert_wav(
                TYPING_NOISE_PATH,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=self.synthesizer_config.audio_encoding,
            ),
            synthesizer_config=self.synthesizer_config,
            is_interruptable=True,
            seconds_per_chunk=2,
        )

    def set_filler_audios(self, filler_audio_config: FillerAudioConfig):
        """Select filler audios per the agent's config; phrases take priority
        over typing noise when both flags are set."""
        if filler_audio_config.use_phrases:
            self.filler_audios = self.get_phrase_filler_audios()
        elif filler_audio_config.use_typing_noise:
            self.filler_audios = [self.get_typing_noise_filler_audio()]

    def get_phrase_filler_audios(self) -> list[FillerAudio]:
        # Default: no pre-rendered phrases. Subclasses that can synthesize
        # FILLER_PHRASES ahead of time override this.
        return []

    def ready_synthesizer(self):
        # Hook for subclasses needing setup (e.g. opening connections).
        pass

    # given the number of seconds the message was allowed to go until, where did we get in the message?
    def get_message_cutoff_from_total_response_length(
        self, message: BaseMessage, seconds: int, size_of_output: int
    ) -> str:
        """Estimate the spoken prefix of ``message`` by assuming characters map
        uniformly onto the synthesized output.

        NOTE(review): dividing byte count by sampling_rate assumes one byte
        per sample — confirm for 16-bit encodings.
        """
        estimated_output_seconds = (
            size_of_output / self.synthesizer_config.sampling_rate
        )
        estimated_output_seconds_per_char = estimated_output_seconds / len(message.text)
        return message.text[: int(seconds / estimated_output_seconds_per_char)]

    def get_message_cutoff_from_voice_speed(
        self, message: BaseMessage, seconds: int, words_per_minute: int
    ) -> str:
        """Estimate the spoken prefix of ``message`` from an assumed constant
        speaking rate, rounded down to whole words."""
        words_per_second = words_per_minute / 60
        estimated_words_spoken = math.floor(words_per_second * seconds)
        tokens = word_tokenize(message.text)
        return TreebankWordDetokenizer().detokenize(tokens[:estimated_words_spoken])

    def get_maybe_cached_synthesis_result(
        self, message: BaseMessage, chunk_size: int
    ) -> Optional[SynthesisResult]:
        # No caching by default; subclasses may return a precomputed result.
        return

    # returns a chunk generator and a thunk that can tell you what part of the message was read given the number of seconds spoken
    # chunk generator must return tuple (bytes of size chunk_size, flag if it is the last chunk)
    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Synthesize ``message`` into audio chunks of ``chunk_size`` bytes."""
        raise NotImplementedError
|
||||||
50
vocode/streaming/synthesizer/eleven_labs_synthesizer.py
Normal file
50
vocode/streaming/synthesizer/eleven_labs_synthesizer.py
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
from typing import Any, Optional
|
||||||
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from vocode.streaming.synthesizer.base_synthesizer import (
|
||||||
|
BaseSynthesizer,
|
||||||
|
SynthesisResult,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
|
||||||
|
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
|
||||||
|
from vocode.streaming.models.message import BaseMessage
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY")
|
||||||
|
ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
|
||||||
|
ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
|
||||||
|
OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C"
|
||||||
|
|
||||||
|
|
||||||
|
class ElevenLabsSynthesizer(BaseSynthesizer):
    """Synthesizer backed by the ElevenLabs streaming text-to-speech API."""

    def __init__(self, config: ElevenLabsSynthesizerConfig):
        super().__init__(config)
        self.api_key = config.api_key
        self.voice_id = config.voice_id or ADAM_VOICE_ID
        # Rough speaking rate; available to callers estimating cutoff position.
        self.words_per_minute = 150

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Stream synthesized audio for ``message`` in ``chunk_size`` pieces.

        Raises:
            AssertionError: if WAV encoding is requested (unsupported here)
                or the API returns a non-2xx response.
        """
        # Validate the config before spending a network round trip (this
        # assert previously ran only after the request had been made).
        assert (
            not self.synthesizer_config.should_encode_as_wav
        ), "ElevenLabs does not support WAV encoding"

        url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}/stream"
        headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id}
        body = {
            "text": message.text,
        }
        response = requests.post(url, headers=headers, json=body)
        # Fail loudly on API errors instead of streaming an error body as audio.
        assert response.ok, response.text

        def chunk_generator(response):
            # A short final chunk marks end-of-stream; if the total length is
            # an exact multiple of chunk_size, no chunk is flagged as last.
            for chunk in response.iter_content(chunk_size=chunk_size):
                yield SynthesisResult.ChunkResult(chunk, len(chunk) != chunk_size)

        return SynthesisResult(chunk_generator(response), lambda seconds: message.text)
|
||||||
BIN
vocode/streaming/synthesizer/filler_audio/typing-noise.wav
Normal file
BIN
vocode/streaming/synthesizer/filler_audio/typing-noise.wav
Normal file
Binary file not shown.
110
vocode/streaming/synthesizer/google_synthesizer.py
Normal file
110
vocode/streaming/synthesizer/google_synthesizer.py
Normal file
|
|
@ -0,0 +1,110 @@
|
||||||
|
import io
|
||||||
|
import wave
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from google.cloud import texttospeech_v1beta1 as tts
|
||||||
|
|
||||||
|
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
|
||||||
|
from vocode.streaming.models.message import BaseMessage
|
||||||
|
from vocode.streaming.synthesizer.base_synthesizer import (
|
||||||
|
BaseSynthesizer,
|
||||||
|
SynthesisResult,
|
||||||
|
encode_as_wav,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.synthesizer import GoogleSynthesizerConfig
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
from vocode.streaming.utils import convert_wav
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
class GoogleSynthesizer(BaseSynthesizer):
    """Synthesizer backed by Google Cloud Text-to-Speech (non-streaming)."""

    # Seconds' worth of audio trimmed from both ends of Google's output.
    OFFSET_SECONDS = 0.5

    def __init__(self, synthesizer_config: GoogleSynthesizerConfig):
        super().__init__(synthesizer_config)
        # Instantiates a client
        self.client = tts.TextToSpeechClient()

        # Build the voice request, select the language code ("en-US") and the
        # ssml voice gender ("neutral")
        self.voice = tts.VoiceSelectionParams(
            language_code="en-US", name="en-US-Neural2-I"
        )

        # Select the type of audio file you want returned
        self.audio_config = tts.AudioConfig(
            audio_encoding=tts.AudioEncoding.LINEAR16,
            sample_rate_hertz=24000,
            speaking_rate=1.2,
            pitch=0,
            effects_profile_id=["telephony-class-application"],
        )

    def synthesize(self, message: str) -> tts.SynthesizeSpeechResponse:
        """Perform a single synchronous TTS request for ``message``."""
        synthesis_input = tts.SynthesisInput(text=message)

        # Perform the text-to-speech request on the text input with the
        # selected voice parameters and audio file type
        return self.client.synthesize_speech(
            request=tts.SynthesizeSpeechRequest(
                input=synthesis_input,
                voice=self.voice,
                audio_config=self.audio_config,
                enable_time_pointing=[
                    tts.SynthesizeSpeechRequest.TimepointType.SSML_MARK
                ],
            )
        )

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Synthesize ``message``, trim edge padding, convert to the configured
        encoding, and return the audio chunked for playback.

        Raises:
            ValueError: if the configured audio encoding is unsupported.
        """
        response = self.synthesize(message.text)
        output_sample_rate = response.audio_config.sample_rate_hertz

        # NOTE(review): real_offset is applied to raw bytes below, but samples
        # are 2 bytes wide, so this trims OFFSET_SECONDS/2 of audio per end —
        # confirm whether that halving is intentional.
        real_offset = int(GoogleSynthesizer.OFFSET_SECONDS * output_sample_rate)

        # Re-wrap the trimmed PCM in a WAV container so convert_wav can read it.
        output_bytes_io = io.BytesIO()
        with wave.open(output_bytes_io, "wb") as in_memory_wav:
            in_memory_wav.setnchannels(1)
            in_memory_wav.setsampwidth(2)
            in_memory_wav.setframerate(output_sample_rate)
            in_memory_wav.writeframes(response.audio_content[real_offset:-real_offset])
        output_bytes_io.seek(0)

        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
            output_bytes = convert_wav(
                output_bytes_io,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.LINEAR16,
            )
        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            output_bytes = convert_wav(
                output_bytes_io,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.MULAW,
            )
        else:
            # Fail loudly instead of hitting an UnboundLocalError below.
            raise ValueError(
                f"Unsupported audio encoding: {self.synthesizer_config.audio_encoding}"
            )

        if self.synthesizer_config.should_encode_as_wav:
            # BUGFIX: encode_as_wav requires the synthesizer config as its
            # second argument; the previous one-argument call raised TypeError.
            output_bytes = encode_as_wav(output_bytes, self.synthesizer_config)

        def chunk_generator(output_bytes):
            # Fixed-size slices; a short final slice is flagged as last.
            for i in range(0, len(output_bytes), chunk_size):
                if i + chunk_size > len(output_bytes):
                    yield SynthesisResult.ChunkResult(output_bytes[i:], True)
                else:
                    yield SynthesisResult.ChunkResult(
                        output_bytes[i : i + chunk_size], False
                    )

        return SynthesisResult(
            chunk_generator(output_bytes),
            lambda seconds: self.get_message_cutoff_from_total_response_length(
                message, seconds, len(output_bytes)
            ),
        )
|
||||||
78
vocode/streaming/synthesizer/rime_synthesizer.py
Normal file
78
vocode/streaming/synthesizer/rime_synthesizer.py
Normal file
|
|
@ -0,0 +1,78 @@
|
||||||
|
import audioop
|
||||||
|
import base64
|
||||||
|
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
|
||||||
|
from vocode.streaming.models.message import BaseMessage
|
||||||
|
|
||||||
|
from .base_synthesizer import BaseSynthesizer, SynthesisResult, encode_as_wav
|
||||||
|
from typing import Any, Optional
|
||||||
|
import os
|
||||||
|
import io
|
||||||
|
import wave
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from ..utils import convert_linear_audio, convert_wav
|
||||||
|
from ..models.synthesizer import ElevenLabsSynthesizerConfig, RimeSynthesizerConfig
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
RIME_API_KEY = os.getenv("RIME_API_KEY")
|
||||||
|
RIME_BASE_URL = os.getenv("RIME_BASE_URL")
|
||||||
|
|
||||||
|
|
||||||
|
class RimeSynthesizer(BaseSynthesizer):
    """Synthesizer backed by the Rime text-to-speech HTTP API."""

    def __init__(self, config: RimeSynthesizerConfig):
        super().__init__(config)
        self.speaker = config.speaker

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Synthesize ``message`` via Rime and return it chunked for playback.

        Raises:
            AssertionError: on a non-2xx API response or a missing payload.
            ValueError: if the configured audio encoding is unsupported.
        """
        url = RIME_BASE_URL
        headers = {"Authorization": f"Bearer {RIME_API_KEY}"}
        body = {"inputs": {"text": message.text, "speaker": self.speaker}}
        response = requests.post(url, headers=headers, json=body)

        def chunk_generator(audio, chunk_transform=lambda x: x):
            # Fixed-size slices; a short final slice is flagged as last.
            for i in range(0, len(audio), chunk_size):
                chunk = audio[i : i + chunk_size]
                yield SynthesisResult.ChunkResult(
                    chunk_transform(chunk), len(chunk) != chunk_size
                )

        assert response.ok, response.text
        data = response.json().get("data")
        assert data

        # The payload is base64; convert_wav below consumes it as a WAV file.
        audio_file = io.BytesIO(base64.b64decode(data))

        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
            output_bytes = convert_wav(
                audio_file,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.LINEAR16,
            )
        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            output_bytes = convert_wav(
                audio_file,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.MULAW,
            )
        else:
            # Fail loudly instead of hitting an UnboundLocalError below.
            raise ValueError(
                f"Unsupported audio encoding: {self.synthesizer_config.audio_encoding}"
            )

        if self.synthesizer_config.should_encode_as_wav:
            # BUGFIX: encode_as_wav takes (chunk, synthesizer_config); passing
            # it directly as the transform meant it was called with a single
            # argument and raised TypeError on the first chunk.
            output_generator = chunk_generator(
                output_bytes,
                chunk_transform=lambda chunk: encode_as_wav(
                    chunk, self.synthesizer_config
                ),
            )
        else:
            output_generator = chunk_generator(output_bytes)
        return SynthesisResult(
            output_generator,
            lambda seconds: self.get_message_cutoff_from_total_response_length(
                message, seconds, len(output_bytes)
            ),
        )
|
||||||
0
vocode/streaming/telephony/__init__.py
Normal file
0
vocode/streaming/telephony/__init__.py
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Optional
|
||||||
|
from redis import Redis
|
||||||
|
|
||||||
|
from vocode.streaming.models.telephony import CallConfig
|
||||||
|
|
||||||
|
|
||||||
|
class BaseConfigManager:
    """Interface for persisting CallConfig objects keyed by conversation id."""

    def save_config(self, conversation_id: str, config: CallConfig):
        """Persist ``config`` under ``conversation_id``."""
        raise NotImplementedError

    def get_config(self, conversation_id) -> Optional[CallConfig]:
        """Return the stored config, or None if absent."""
        raise NotImplementedError

    def delete_config(self, conversation_id):
        """Remove any stored config for ``conversation_id``."""
        raise NotImplementedError
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Optional
|
||||||
|
from redis import Redis
|
||||||
|
|
||||||
|
from vocode.streaming.models.telephony import CallConfig
|
||||||
|
from vocode.streaming.telephony.config_manager.base_config_manager import (
|
||||||
|
BaseConfigManager,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class RedisConfigManager(BaseConfigManager):
    """Config manager backed by Redis (REDISHOST/REDISPORT env vars, db 0)."""

    def __init__(self, logger: Optional[logging.Logger] = None):
        self.redis = Redis(
            host=os.environ.get("REDISHOST", "localhost"),
            port=int(os.environ.get("REDISPORT", 6379)),
            db=0,
            # Values are stored as JSON text, so decode responses to str.
            decode_responses=True,
        )
        self.logger = logger or logging.getLogger(__name__)

    def save_config(self, conversation_id: str, config: CallConfig):
        """Serialize the call config to JSON and store it under its id."""
        self.logger.debug(f"Saving config for {conversation_id}")
        self.redis.set(conversation_id, config.json())

    def get_config(self, conversation_id) -> Optional[CallConfig]:
        """Fetch and deserialize the call config; None if the key is missing."""
        self.logger.debug(f"Getting config for {conversation_id}")
        raw_config = self.redis.get(conversation_id)
        if raw_config:
            # BUGFIX: reuse the value already fetched instead of issuing a
            # second GET, which was redundant and racy (the key could be
            # deleted between the two reads).
            return CallConfig.parse_raw(raw_config)
        return None

    def delete_config(self, conversation_id):
        """Delete any stored config for this conversation."""
        self.logger.debug(f"Deleting config for {conversation_id}")
        self.redis.delete(conversation_id)
|
||||||
5
vocode/streaming/telephony/constants.py
Normal file
5
vocode/streaming/telephony/constants.py
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
|
||||||
|
DEFAULT_SAMPLING_RATE = 8000
|
||||||
|
DEFAULT_AUDIO_ENCODING = AudioEncoding.MULAW
|
||||||
|
DEFAULT_CHUNK_SIZE = 20 * 160
|
||||||
170
vocode/streaming/telephony/conversation/call.py
Normal file
170
vocode/streaming/telephony/conversation/call.py
Normal file
|
|
@ -0,0 +1,170 @@
|
||||||
|
from fastapi import WebSocket
|
||||||
|
import base64
|
||||||
|
from enum import Enum
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
from vocode.streaming.agent.base_agent import BaseAgent
|
||||||
|
from vocode.streaming.factory import (
|
||||||
|
create_agent,
|
||||||
|
create_synthesizer,
|
||||||
|
create_transcriber,
|
||||||
|
)
|
||||||
|
|
||||||
|
from vocode.streaming.streaming_conversation import StreamingConversation
|
||||||
|
from vocode.streaming.models.telephony import CallConfig, TwilioConfig
|
||||||
|
from vocode.streaming.output_device.twilio_output_device import TwilioOutputDevice
|
||||||
|
from vocode.streaming.models.synthesizer import (
|
||||||
|
AzureSynthesizerConfig,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.transcriber import (
|
||||||
|
DeepgramTranscriberConfig,
|
||||||
|
PunctuationEndpointingConfig,
|
||||||
|
)
|
||||||
|
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
|
||||||
|
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
|
||||||
|
from vocode.streaming.telephony.config_manager.base_config_manager import (
|
||||||
|
BaseConfigManager,
|
||||||
|
)
|
||||||
|
from vocode.streaming.telephony.twilio import create_twilio_client
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
from vocode.streaming.streaming_conversation import StreamingConversation
|
||||||
|
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
|
||||||
|
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
|
||||||
|
|
||||||
|
|
||||||
|
class PhoneCallAction(Enum):
    """Actions the websocket message handler can request from the call loop."""

    # Signal that the media websocket should be closed and the call torn down.
    CLOSE_WEBSOCKET = 1
||||||
|
|
||||||
|
class Call(StreamingConversation):
    """A StreamingConversation carried over a Twilio media-stream websocket.

    Reconstructable from a persisted CallConfig (see from_call_config), so the
    process answering the websocket need not be the one that placed the call.
    """

    def __init__(
        self,
        base_url: str,
        config_manager: BaseConfigManager,
        agent: BaseAgent,
        twilio_config: TwilioConfig,
        transcriber: Optional[BaseTranscriber] = None,
        synthesizer: Optional[BaseSynthesizer] = None,
        twilio_sid=None,
        conversation_id: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.base_url = base_url
        self.config_manager = config_manager
        # Synthesized audio is written back to Twilio over the websocket.
        self.output_device = TwilioOutputDevice()
        self.twilio_config = twilio_config
        self.twilio_client = create_twilio_client(twilio_config)
        # Defaults match Twilio telephony audio: 8kHz mu-law.
        super().__init__(
            self.output_device,
            transcriber
            or DeepgramTranscriber(
                DeepgramTranscriberConfig(
                    sampling_rate=8000,
                    audio_encoding=AudioEncoding.MULAW,
                    chunk_size=self.CHUNK_SIZE,  # presumably defined on StreamingConversation — confirm
                    model="voicemail",
                    endpointing_config=PunctuationEndpointingConfig(),
                ),
                logger=logger,
            ),
            agent,
            synthesizer
            or AzureSynthesizer(
                AzureSynthesizerConfig(
                    sampling_rate=8000, audio_encoding=AudioEncoding.MULAW
                )
            ),
            conversation_id=conversation_id,
            per_chunk_allowance_seconds=0.01,
            logger=logger,
        )
        self.twilio_sid = twilio_sid
        # Timestamp (ms) of the last media frame received; used to detect and
        # fill gaps in the inbound audio.
        self.latest_media_timestamp = 0

    @staticmethod
    def from_call_config(
        base_url: str,
        call_config: CallConfig,
        config_manager: BaseConfigManager,
        conversation_id: str,
        logger: logging.Logger,
    ):
        """Rebuild a Call from a persisted CallConfig (see OutboundCall.start)."""
        return Call(
            base_url=base_url,
            logger=logger,
            config_manager=config_manager,
            agent=create_agent(call_config.agent_config),
            transcriber=create_transcriber(call_config.transcriber_config),
            synthesizer=create_synthesizer(call_config.synthesizer_config),
            twilio_config=call_config.twilio_config,
            twilio_sid=call_config.twilio_sid,
            conversation_id=conversation_id,
        )

    async def attach_ws_and_start(self, ws: WebSocket):
        """Bind the Twilio media websocket to this call and run the receive loop
        until the call ends or the websocket closes."""
        self.logger.debug("Trying to attach WS to outbound call")
        self.output_device.ws = ws
        self.logger.debug("Attached WS to outbound call")

        twilio_call = self.twilio_client.calls(self.twilio_sid).fetch()

        # NOTE(review): answered_by is only populated when the call was created
        # with machine detection enabled — confirm against the call creator.
        if twilio_call.answered_by in ("machine_start", "fax"):
            # Hang up immediately on answering machines and fax lines.
            self.logger.info(f"Call answered by {twilio_call.answered_by}")
            twilio_call.update(status="completed")
        else:
            await self.wait_for_twilio_start(ws)
            await super().start()
            while self.active:
                message = await ws.receive_text()
                response = await self.handle_ws_message(message)
                if response == PhoneCallAction.CLOSE_WEBSOCKET:
                    break
        self.tear_down()

    async def wait_for_twilio_start(self, ws: WebSocket):
        """Consume websocket messages until Twilio's 'start' event arrives,
        capturing the stream SID needed to send audio back."""
        while True:
            message = await ws.receive_text()
            if not message:
                continue
            data = json.loads(message)
            if data["event"] == "start":
                self.logger.debug(
                    f"Media WS: Received event '{data['event']}': {message}"
                )
                self.output_device.stream_sid = data["start"]["streamSid"]
                break

    async def handle_ws_message(self, message) -> PhoneCallAction:
        """Process one Twilio media-stream message; returns CLOSE_WEBSOCKET when
        the stream should end, otherwise None."""
        if message is None:
            return PhoneCallAction.CLOSE_WEBSOCKET

        data = json.loads(message)
        if data["event"] == "media":
            media = data["media"]
            chunk = base64.b64decode(media["payload"])
            # If frames were dropped (timestamps are ms and frames are 20ms
            # apart), pad the gap so the transcriber's timeline stays aligned.
            if self.latest_media_timestamp + 20 < int(media["timestamp"]):
                # 8 bytes per ms: 8kHz mu-law is 8 one-byte samples per ms.
                bytes_to_fill = 8 * (
                    int(media["timestamp"]) - (self.latest_media_timestamp + 20)
                )
                self.logger.debug(f"Filling {bytes_to_fill} bytes of silence")
                # NOTE: 0xff is silence for mulaw audio
                self.receive_audio(b"\xff" * bytes_to_fill)
            self.latest_media_timestamp = int(media["timestamp"])
            self.receive_audio(chunk)
        elif data["event"] == "stop":
            self.logger.debug(f"Media WS: Received event 'stop': {message}")
            self.logger.debug("Stopping...")
            return PhoneCallAction.CLOSE_WEBSOCKET

    def end_twilio_call(self) -> bool:
        """End the call on Twilio's side; True if it reports 'completed'."""
        response = self.twilio_client.calls(self.twilio_sid).update(status="completed")
        return response.status == "completed"

    def mark_terminated(self):
        """On termination, also hang up the Twilio call and drop the persisted config."""
        super().mark_terminated()
        self.end_twilio_call()
        self.config_manager.delete_config(self.id)

    def tear_down(self):
        # Delegates cleanup to the base conversation's terminate().
        self.terminate()
|
||||||
110
vocode/streaming/telephony/conversation/outbound_call.py
Normal file
110
vocode/streaming/telephony/conversation/outbound_call.py
Normal file
|
|
@ -0,0 +1,110 @@
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
from twilio.rest import Client
|
||||||
|
|
||||||
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
|
from vocode.streaming.models.synthesizer import (
|
||||||
|
AzureSynthesizerConfig,
|
||||||
|
SynthesizerConfig,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.telephony import CallConfig, TwilioConfig
|
||||||
|
from vocode.streaming.models.transcriber import (
|
||||||
|
DeepgramTranscriberConfig,
|
||||||
|
PunctuationEndpointingConfig,
|
||||||
|
TranscriberConfig,
|
||||||
|
)
|
||||||
|
from vocode.streaming.telephony.config_manager.base_config_manager import (
|
||||||
|
BaseConfigManager,
|
||||||
|
)
|
||||||
|
from vocode.streaming.telephony.constants import (
|
||||||
|
DEFAULT_AUDIO_ENCODING,
|
||||||
|
DEFAULT_CHUNK_SIZE,
|
||||||
|
DEFAULT_SAMPLING_RATE,
|
||||||
|
)
|
||||||
|
from vocode.streaming.telephony.twilio import create_twilio_client
|
||||||
|
from vocode.streaming.utils import create_conversation_id
|
||||||
|
|
||||||
|
|
||||||
|
class OutboundCall:
    """Places an outbound Twilio call and persists its CallConfig so the
    inbound websocket handler can reconstruct the conversation."""

    def __init__(
        self,
        base_url: str,
        to_phone: str,
        from_phone: str,
        config_manager: BaseConfigManager,
        agent_config: AgentConfig,
        twilio_config: TwilioConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.base_url = base_url
        self.to_phone = to_phone
        self.from_phone = from_phone
        self.config_manager = config_manager
        self.agent_config = agent_config
        # Defaults match Twilio telephony audio (see telephony constants).
        self.transcriber_config = transcriber_config or DeepgramTranscriberConfig(
            sampling_rate=DEFAULT_SAMPLING_RATE,
            audio_encoding=DEFAULT_AUDIO_ENCODING,
            chunk_size=DEFAULT_CHUNK_SIZE,
            model="voicemail",
            endpointing_config=PunctuationEndpointingConfig(),
        )
        self.synthesizer_config = synthesizer_config or AzureSynthesizerConfig(
            sampling_rate=DEFAULT_SAMPLING_RATE, audio_encoding=DEFAULT_AUDIO_ENCODING
        )
        self.conversation_id = conversation_id or create_conversation_id()
        # BUGFIX: fall back to a module logger — start() calls self.logger.debug
        # unconditionally, which raised AttributeError when no logger was given.
        self.logger = logger or logging.getLogger(__name__)
        self.twilio_config = twilio_config
        self.twilio_client = create_twilio_client(twilio_config)
        # Set by start() once the Twilio call is created.
        self.twilio_sid = None

    def create_twilio_call(
        self, to_phone: str, from_phone: str, digits: str = ""
    ) -> str:
        """Create the Twilio call; TwiML is served by our /twiml endpoint.

        Returns:
            The Twilio call SID.
        """
        twilio_call = self.twilio_client.calls.create(
            url=f"https://{self.base_url}/twiml/initiate_call/{self.conversation_id}",
            to=to_phone,
            from_=from_phone,
            send_digits=digits,
        )
        return twilio_call.sid

    def validate_outbound_call(
        self,
        to_phone: str,
        from_phone: str,
        mobile_only: bool = True,
    ):
        """Sanity-check the destination number before dialing.

        Raises:
            ValueError: if the number is too short, or (when ``mobile_only``)
                Twilio Lookup does not classify it as a mobile line.
        """
        if len(to_phone) < 8:
            raise ValueError("Invalid 'to' phone")

        if not mobile_only:
            return
        line_type_intelligence = (
            self.twilio_client.lookups.v2.phone_numbers(to_phone)
            .fetch(fields="line_type_intelligence")
            .line_type_intelligence
        )
        # Simplified from `not x or (x and x["type"] != "mobile")` — equivalent.
        if not line_type_intelligence or line_type_intelligence["type"] != "mobile":
            raise ValueError("Can only call mobile phones")

    def start(self):
        """Validate the number, place the call, and persist its CallConfig."""
        self.logger.debug("Starting outbound call")
        self.validate_outbound_call(self.to_phone, self.from_phone)
        self.twilio_sid = self.create_twilio_call(self.to_phone, self.from_phone)
        call_config = CallConfig(
            transcriber_config=self.transcriber_config,
            agent_config=self.agent_config,
            synthesizer_config=self.synthesizer_config,
            twilio_config=self.twilio_config,
            twilio_sid=self.twilio_sid,
        )
        self.config_manager.save_config(self.conversation_id, call_config)

    def end(self):
        """End the call via Twilio; True if it reports 'completed'."""
        response = self.twilio_client.calls(self.twilio_sid).update(status="completed")
        return response.status == "completed"
|
||||||
73
vocode/streaming/telephony/conversation/zoom_dial_in.py
Normal file
73
vocode/streaming/telephony/conversation/zoom_dial_in.py
Normal file
|
|
@ -0,0 +1,73 @@
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
from twilio.rest import Client
|
||||||
|
from vocode.streaming.agent.base_agent import BaseAgent
|
||||||
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
|
from vocode.streaming.models.telephony import CallConfig, TwilioConfig
|
||||||
|
from vocode.streaming.models.transcriber import TranscriberConfig
|
||||||
|
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
|
||||||
|
from vocode.streaming.telephony.config_manager.base_config_manager import (
|
||||||
|
BaseConfigManager,
|
||||||
|
)
|
||||||
|
from vocode.streaming.telephony.conversation.outbound_call import OutboundCall
|
||||||
|
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
|
||||||
|
from vocode.streaming.utils import create_conversation_id
|
||||||
|
|
||||||
|
|
||||||
|
class ZoomDialIn(OutboundCall):
    """Outbound call that dials a Zoom bridge number and keys in the
    meeting id (and password, if present) as DTMF digits."""

    def __init__(
        self,
        base_url: str,
        zoom_number: str,
        zoom_meeting_id: str,
        zoom_meeting_password: Optional[str],
        from_phone: str,
        config_manager: BaseConfigManager,
        twilio_config: TwilioConfig,
        agent_config: AgentConfig,
        transcriber_config: TranscriberConfig,
        synthesizer_config: SynthesizerConfig,
        conversation_id: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__(
            base_url=base_url,
            to_phone=zoom_number,
            from_phone=from_phone,
            config_manager=config_manager,
            transcriber_config=transcriber_config,
            agent_config=agent_config,
            synthesizer_config=synthesizer_config,
            twilio_config=twilio_config,
            conversation_id=conversation_id,
            logger=logger,
        )
        self.zoom_number = zoom_number
        self.zoom_meeting_id = zoom_meeting_id
        self.zoom_meeting_password = zoom_meeting_password
        self.from_phone = from_phone

    def start(self):
        """Place the call and send the Zoom meeting digits once connected."""
        # Zoom bridges answer on any line type, so skip the mobile-only check.
        self.validate_outbound_call(
            self.zoom_number,
            self.from_phone,
            mobile_only=False,
        )
        # 'w' is a half-second pause in Twilio's send_digits syntax: wait,
        # key the meeting id, then (if set) wait for the password prompt.
        digits = f"ww{self.zoom_meeting_id}#"
        if self.zoom_meeting_password:
            digits += f"wwww*{self.zoom_meeting_password}#"
        self.logger.debug("Sending digits %s to the call", digits)
        sid = self.create_twilio_call(
            self.zoom_number,
            self.from_phone,
            digits=digits,
        )
        self.config_manager.save_config(
            self.conversation_id,
            CallConfig(
                transcriber_config=self.transcriber_config,
                agent_config=self.agent_config,
                synthesizer_config=self.synthesizer_config,
                twilio_config=self.twilio_config,
                twilio_sid=sid,
            ),
        )
|
||||||
62
vocode/streaming/telephony/hosted/inbound_call_server.py
Normal file
62
vocode/streaming/telephony/hosted/inbound_call_server.py
Normal file
|
|
@ -0,0 +1,62 @@
|
||||||
|
from fastapi import FastAPI, Response, Form
|
||||||
|
from typing import Optional
|
||||||
|
import requests
|
||||||
|
import uvicorn
|
||||||
|
|
||||||
|
import vocode
|
||||||
|
from vocode.streaming.models.transcriber import TranscriberConfig
|
||||||
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
|
from vocode.streaming.models.telephony import (
|
||||||
|
CreateInboundCall,
|
||||||
|
TwilioConfig,
|
||||||
|
TwilioConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class InboundCallServer:
    """Standalone FastAPI app that forwards inbound Twilio webhooks to the
    hosted Vocode /create_inbound_call endpoint and relays its TwiML back."""

    def __init__(
        self,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        response_on_rate_limit: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        self.agent_config = agent_config
        self.transcriber_config = transcriber_config
        self.synthesizer_config = synthesizer_config
        self.app = FastAPI()
        self.app.post("/vocode")(self.handle_call)
        default_rate_limit_msg = (
            "The line is really busy right now, check back later!"
        )
        self.response_on_rate_limit = response_on_rate_limit or default_rate_limit_msg
        self.twilio_config = twilio_config
        self.vocode_inbound_call_url = f"https://{vocode.base_url}/create_inbound_call"

    async def handle_call(self, twilio_sid: str = Form(alias="CallSid")):
        """Proxy the inbound call to the hosted API and return its TwiML."""
        payload = CreateInboundCall(
            agent_config=self.agent_config,
            twilio_sid=twilio_sid,
            transcriber_config=self.transcriber_config,
            synthesizer_config=self.synthesizer_config,
            twilio_config=self.twilio_config,
        ).dict()
        response = requests.post(
            self.vocode_inbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=payload,
        )
        if response.status_code == 429:
            # The hosted API is rate limiting: play the fallback message.
            return Response(
                f"<Response><Say>{self.response_on_rate_limit}</Say></Response>",
                media_type="application/xml",
            )
        assert response.ok, response.text
        return Response(
            response.text,
            media_type="application/xml",
        )

    def run(self, host="localhost", port=3000):
        """Serve the app with uvicorn (blocking)."""
        uvicorn.run(self.app, host=host, port=port)
|
||||||
|
|
@ -0,0 +1,45 @@
|
||||||
|
from typing import Optional, Union
|
||||||
|
from vocode.streaming.models.telephony import TwilioConfig
|
||||||
|
from vocode.streaming.telephony.hosted.inbound_call_server import InboundCallServer
|
||||||
|
from vocode.streaming.models.agent import (
|
||||||
|
RESTfulAgentEnd,
|
||||||
|
RESTfulAgentInput,
|
||||||
|
RESTfulAgentText,
|
||||||
|
RESTfulUserImplementedAgentConfig,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.transcriber import (
|
||||||
|
TranscriberConfig,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
|
|
||||||
|
|
||||||
|
class InboundCallUserAgentServer(InboundCallServer):
    """InboundCallServer that also serves /respond for a RESTful
    user-implemented agent.

    Subclasses override respond(); respond_rest() adapts it to the HTTP route.
    """

    def __init__(
        self,
        agent_config: RESTfulUserImplementedAgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        response_on_rate_limit: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        super().__init__(
            agent_config=agent_config,
            transcriber_config=transcriber_config,
            synthesizer_config=synthesizer_config,
            response_on_rate_limit=response_on_rate_limit,
            twilio_config=twilio_config,
        )
        # Runtime guard in addition to the annotation: the /respond route
        # only makes sense for the RESTful user-implemented agent.
        assert isinstance(
            agent_config, RESTfulUserImplementedAgentConfig
        ), "agent_config must be a RESTfulUserImplementedAgentConfig"
        self.app.post("/respond")(self.respond_rest)

    async def respond(
        self, human_input, conversation_id
    ) -> Union[RESTfulAgentText, RESTfulAgentEnd]:
        """Override with the agent's reply logic for one user utterance."""
        raise NotImplementedError

    async def respond_rest(
        self, request: RESTfulAgentInput
    ) -> Union[RESTfulAgentText, RESTfulAgentEnd]:
        """HTTP adapter: unpack the request body and delegate to respond()."""
        return await self.respond(request.human_input, request.conversation_id)
|
||||||
68
vocode/streaming/telephony/hosted/outbound_call.py
Normal file
68
vocode/streaming/telephony/hosted/outbound_call.py
Normal file
|
|
@ -0,0 +1,68 @@
|
||||||
|
from typing import Optional
|
||||||
|
import requests
|
||||||
|
|
||||||
|
import vocode
|
||||||
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
|
from vocode.streaming.models.transcriber import TranscriberConfig
|
||||||
|
from vocode.streaming.models.telephony import (
|
||||||
|
CallEntity,
|
||||||
|
CreateOutboundCall,
|
||||||
|
EndOutboundCall,
|
||||||
|
TwilioConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class OutboundCall:
    """Client for the hosted Vocode outbound-call API.

    Creates and ends calls by POSTing to the hosted endpoints and tracks
    the conversation id assigned by the service.
    """

    def __init__(
        self,
        recipient: CallEntity,
        caller: CallEntity,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        self.recipient = recipient
        self.caller = caller
        self.agent_config = agent_config
        self.transcriber_config = transcriber_config
        self.synthesizer_config = synthesizer_config
        self.conversation_id = conversation_id
        self.twilio_config = twilio_config
        self.vocode_create_outbound_call_url = (
            f"https://{vocode.base_url}/create_outbound_call"
        )
        self.vocode_end_outbound_call_url = (
            f"https://{vocode.base_url}/end_outbound_call"
        )

    def start(self) -> str:
        """Create the outbound call; return the conversation id.

        Bug fix: the method was annotated -> str but returned None; it now
        returns the id assigned by the hosted API (also kept on self).
        """
        response = requests.post(
            self.vocode_create_outbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=CreateOutboundCall(
                recipient=self.recipient,
                caller=self.caller,
                agent_config=self.agent_config,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                conversation_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok, response.text
        data = response.json()
        self.conversation_id = data["id"]
        return self.conversation_id

    def end(self) -> None:
        """End the call; a 404 (call already gone) is tolerated.

        Annotation fixed: the method never returned a value.
        """
        response = requests.post(
            self.vocode_end_outbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=EndOutboundCall(
                call_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok or response.status_code == 404, response.text
|
||||||
60
vocode/streaming/telephony/hosted/zoom_dial_in.py
Normal file
60
vocode/streaming/telephony/hosted/zoom_dial_in.py
Normal file
|
|
@ -0,0 +1,60 @@
|
||||||
|
from typing import Optional
|
||||||
|
import requests
|
||||||
|
|
||||||
|
import vocode
|
||||||
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
|
from vocode.streaming.models.transcriber import TranscriberConfig
|
||||||
|
from vocode.streaming.telephony.hosted.outbound_call import OutboundCall
|
||||||
|
from vocode.streaming.models.telephony import (
|
||||||
|
CallEntity,
|
||||||
|
DialIntoZoomCall,
|
||||||
|
TwilioConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class ZoomDialIn(OutboundCall):
    """Hosted-API variant: asks the Vocode service to dial into a Zoom call."""

    def __init__(
        self,
        recipient: CallEntity,
        caller: CallEntity,
        zoom_meeting_id: str,
        zoom_meeting_password: str,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        super().__init__(
            recipient=recipient,
            caller=caller,
            agent_config=agent_config,
            transcriber_config=transcriber_config,
            synthesizer_config=synthesizer_config,
            conversation_id=conversation_id,
            twilio_config=twilio_config,
        )
        self.zoom_meeting_id = zoom_meeting_id
        self.zoom_meeting_password = zoom_meeting_password
        self.vocode_zoom_dial_in_url = f"https://{vocode.base_url}/dial_into_zoom_call"

    def start(self) -> str:
        """Request the hosted API to dial into the Zoom meeting.

        Bug fix: was annotated -> str but returned None; now returns the
        conversation id assigned by the service (also kept on self).
        """
        response = requests.post(
            self.vocode_zoom_dial_in_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=DialIntoZoomCall(
                recipient=self.recipient,
                caller=self.caller,
                zoom_meeting_id=self.zoom_meeting_id,
                zoom_meeting_password=self.zoom_meeting_password,
                agent_config=self.agent_config,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                conversation_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok, response.text
        data = response.json()
        self.conversation_id = data["id"]
        return self.conversation_id
|
||||||
143
vocode/streaming/telephony/server/base.py
Normal file
143
vocode/streaming/telephony/server/base.py
Normal file
|
|
@ -0,0 +1,143 @@
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
from fastapi import APIRouter, Form, Response
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from vocode.streaming.agent.base_agent import BaseAgent
|
||||||
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
|
from vocode.streaming.models.synthesizer import (
|
||||||
|
AzureSynthesizerConfig,
|
||||||
|
SynthesizerConfig,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.transcriber import (
|
||||||
|
DeepgramTranscriberConfig,
|
||||||
|
PunctuationEndpointingConfig,
|
||||||
|
TranscriberConfig,
|
||||||
|
)
|
||||||
|
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
|
||||||
|
from vocode.streaming.telephony.config_manager.base_config_manager import (
|
||||||
|
BaseConfigManager,
|
||||||
|
)
|
||||||
|
from vocode.streaming.telephony.constants import (
|
||||||
|
DEFAULT_AUDIO_ENCODING,
|
||||||
|
DEFAULT_CHUNK_SIZE,
|
||||||
|
DEFAULT_SAMPLING_RATE,
|
||||||
|
)
|
||||||
|
|
||||||
|
from vocode.streaming.telephony.server.router.calls import CallsRouter
|
||||||
|
from vocode.streaming.telephony.server.router.twiml import TwiMLRouter
|
||||||
|
from vocode.streaming.models.telephony import (
|
||||||
|
CallConfig,
|
||||||
|
CallEntity,
|
||||||
|
CreateOutboundCall,
|
||||||
|
CreateInboundCall,
|
||||||
|
DialIntoZoomCall,
|
||||||
|
EndOutboundCall,
|
||||||
|
TwilioConfig,
|
||||||
|
)
|
||||||
|
from twilio.rest import Client
|
||||||
|
|
||||||
|
from vocode.streaming.telephony.conversation.call import Call
|
||||||
|
from vocode.streaming.telephony.templates import Templater
|
||||||
|
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
|
||||||
|
from vocode.streaming.utils import create_conversation_id
|
||||||
|
|
||||||
|
|
||||||
|
class InboundCallConfig(BaseModel):
    """Binding of a TwiML webhook URL to the configs used to answer it."""

    # Route path (e.g. "/inbound") registered on the telephony server.
    url: str
    agent_config: AgentConfig
    twilio_config: TwilioConfig
    # Optional; the server substitutes telephony defaults when left None.
    transcriber_config: Optional[TranscriberConfig] = None
    synthesizer_config: Optional[SynthesizerConfig] = None
|
||||||
|
|
||||||
|
|
||||||
|
class TelephonyServer:
    """Bundles the telephony API routes: the call websocket router, the
    TwiML router, and one POST webhook per configured inbound call."""

    def __init__(
        self,
        base_url: str,
        config_manager: BaseConfigManager,
        inbound_call_configs: Optional[list[InboundCallConfig]] = None,
        logger: Optional[logging.Logger] = None,
    ):
        # Bug fix: `inbound_call_configs` previously defaulted to a shared
        # mutable list ([]); None is now the sentinel default.
        self.base_url = base_url
        self.logger = logger or logging.getLogger(__name__)
        self.router = APIRouter()
        self.config_manager = config_manager
        self.templater = Templater()
        self.router.include_router(
            CallsRouter(
                base_url=base_url,
                templater=self.templater,
                config_manager=self.config_manager,
                logger=self.logger,
            ).get_router()
        )
        self.router.include_router(
            TwiMLRouter(
                base_url=base_url, templater=self.templater, logger=self.logger
            ).get_router()
        )
        for config in inbound_call_configs or []:
            self.router.add_api_route(
                config.url,
                self.create_inbound_route(
                    agent_config=config.agent_config,
                    twilio_config=config.twilio_config,
                    transcriber_config=config.transcriber_config,
                    synthesizer_config=config.synthesizer_config,
                ),
                methods=["POST"],
            )
            # Bug fix: was `logger.info(...)` on the raw parameter, which
            # crashes when no logger is passed; use self.logger instead.
            self.logger.info(
                f"Set up inbound call TwiML at https://{base_url}{config.url}"
            )

    def create_inbound_route(
        self,
        agent_config: AgentConfig,
        twilio_config: TwilioConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
    ):
        """Build a POST handler that answers an inbound Twilio webhook by
        saving a CallConfig and returning connection TwiML."""

        def route(twilio_sid: str = Form(alias="CallSid")) -> Response:
            call_config = CallConfig(
                # Fall back to telephony defaults when the inbound config
                # leaves the transcriber/synthesizer unset.
                transcriber_config=transcriber_config
                or DeepgramTranscriberConfig(
                    sampling_rate=DEFAULT_SAMPLING_RATE,
                    audio_encoding=DEFAULT_AUDIO_ENCODING,
                    chunk_size=DEFAULT_CHUNK_SIZE,
                    model="voicemail",
                    endpointing_config=PunctuationEndpointingConfig(),
                ),
                agent_config=agent_config,
                synthesizer_config=synthesizer_config
                or AzureSynthesizerConfig(
                    sampling_rate=DEFAULT_SAMPLING_RATE,
                    audio_encoding=DEFAULT_AUDIO_ENCODING,
                ),
                twilio_config=twilio_config,
                twilio_sid=twilio_sid,
            )

            conversation_id = create_conversation_id()
            self.config_manager.save_config(conversation_id, call_config)
            return self.templater.get_connection_twiml(
                base_url=self.base_url, call_id=conversation_id
            )

        return route

    async def end_outbound_call(self, conversation_id: str):
        """End a tracked call by conversation id.

        Raises:
            ValueError: if no config exists for the id.
        """
        # TODO validation via twilio_client
        call_config = self.config_manager.get_config(conversation_id)
        if not call_config:
            raise ValueError("Call not found")
        call = Call.from_call_config(
            self.base_url,
            call_config,
            self.config_manager,
            conversation_id,
            self.logger,
        )
        call.end_twilio_call()
        return {"id": call.id}

    def get_router(self) -> APIRouter:
        return self.router
|
||||||
45
vocode/streaming/telephony/server/router/calls.py
Normal file
45
vocode/streaming/telephony/server/router/calls.py
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
from typing import Optional
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException, WebSocket
|
||||||
|
from vocode.streaming.telephony.config_manager.base_config_manager import (
|
||||||
|
BaseConfigManager,
|
||||||
|
)
|
||||||
|
|
||||||
|
from vocode.streaming.telephony.conversation.call import Call
|
||||||
|
from vocode.streaming.telephony.templates import Templater
|
||||||
|
|
||||||
|
|
||||||
|
class CallsRouter:
    """APIRouter exposing the websocket endpoint that Twilio's media
    stream connects to for an in-progress call."""

    def __init__(
        self,
        base_url: str,
        templater: Templater,
        config_manager: BaseConfigManager,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__()
        self.base_url = base_url
        self.templater = templater
        self.config_manager = config_manager
        self.logger = logger or logging.getLogger(__name__)
        self.router = APIRouter()
        self.router.websocket("/connect_call/{id}")(self.connect_call)

    async def connect_call(self, websocket: WebSocket, id: str):
        """Accept Twilio's media websocket, attach it to the stored call,
        and delete the call config once the stream ends."""
        await websocket.accept()
        self.logger.debug("Phone WS connection opened for chat {}".format(id))
        call_config = self.config_manager.get_config(id)
        if not call_config:
            # No saved config means there is no call to attach to.
            raise HTTPException(status_code=400, detail="No active phone call")

        call: Call = Call.from_call_config(
            self.base_url, call_config, self.config_manager, id, self.logger
        )

        # Blocks for the lifetime of the call's audio stream.
        await call.attach_ws_and_start(websocket)
        self.config_manager.delete_config(call.id)
        self.logger.debug("Phone WS connection closed for chat {}".format(id))

    def get_router(self) -> APIRouter:
        return self.router
|
||||||
29
vocode/streaming/telephony/server/router/twiml.py
Normal file
29
vocode/streaming/telephony/server/router/twiml.py
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter
|
||||||
|
|
||||||
|
from vocode.streaming.telephony.templates import Templater
|
||||||
|
|
||||||
|
|
||||||
|
class TwiMLRouter:
    """APIRouter serving the TwiML that tells Twilio to open the media
    stream for a newly initiated call."""

    def __init__(
        self,
        base_url: str,
        templater: Templater,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__()
        self.base_url = base_url
        self.templater = templater
        self.logger = logger or logging.getLogger(__name__)
        self.router = APIRouter()
        self.router.add_api_route(
            "/twiml/initiate_call/{id}", self.call_twiml, methods=["POST"]
        )

    def call_twiml(self, id: str):
        """Return connection TwiML for the call with the given id."""
        return self.templater.get_connection_twiml(base_url=self.base_url, call_id=id)

    def get_router(self) -> APIRouter:
        return self.router
|
||||||
20
vocode/streaming/telephony/templates.py
Normal file
20
vocode/streaming/telephony/templates.py
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
import os
|
||||||
|
from jinja2 import Environment, FileSystemLoader
|
||||||
|
from fastapi import Response
|
||||||
|
|
||||||
|
|
||||||
|
class Templater:
    """Renders Jinja templates from this package's templates/ directory."""

    def __init__(self):
        template_dir = "%s/templates/" % os.path.dirname(__file__)
        self.templates = Environment(loader=FileSystemLoader(template_dir))

    def render_template(self, template_name: str, **kwargs):
        """Render the named template with the given context variables."""
        return self.templates.get_template(template_name).render(**kwargs)

    def get_connection_twiml(self, call_id: str, base_url: str):
        """TwiML response directing Twilio to stream the call to our
        websocket endpoint for `call_id`."""
        xml = self.render_template("connect_call.xml", base_url=base_url, id=call_id)
        return Response(
            xml,
            media_type="application/xml",
        )
|
||||||
6
vocode/streaming/telephony/templates/connect_call.xml
Normal file
6
vocode/streaming/telephony/templates/connect_call.xml
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<Response>
|
||||||
|
<Connect>
|
||||||
|
<Stream url="wss://{{ base_url }}/connect_call/{{ id }}" />
|
||||||
|
</Connect>
|
||||||
|
</Response>
|
||||||
12
vocode/streaming/telephony/twilio.py
Normal file
12
vocode/streaming/telephony/twilio.py
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
import os
|
||||||
|
from typing import Optional
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from twilio.rest import Client
|
||||||
|
|
||||||
|
from vocode.streaming.models.telephony import TwilioConfig
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
def create_twilio_client(twilio_config: TwilioConfig):
    """Build a Twilio REST client from the config's credentials."""
    credentials = (twilio_config.account_sid, twilio_config.auth_token)
    return Client(*credentials)
|
||||||
101
vocode/streaming/transcriber/assembly_ai_transcriber.py
Normal file
101
vocode/streaming/transcriber/assembly_ai_transcriber.py
Normal file
|
|
@ -0,0 +1,101 @@
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import websockets
|
||||||
|
from urllib.parse import urlencode
|
||||||
|
|
||||||
|
from vocode.streaming.models.transcriber import AssemblyAITranscriberConfig
|
||||||
|
from vocode.streaming.models.websocket import AudioMessage
|
||||||
|
from vocode.streaming.transcriber.base_transcriber import (
|
||||||
|
BaseTranscriber,
|
||||||
|
Transcription,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
ASSEMBLY_AI_API_KEY = os.environ.get("ASSEMBLY_AI_API_KEY")
|
||||||
|
ASSEMBLY_AI_URL = "wss://api.assemblyai.com/v2/realtime/ws"
|
||||||
|
|
||||||
|
|
||||||
|
class AssemblyAITranscriber(BaseTranscriber):
    """Streams audio to AssemblyAI's realtime websocket API and forwards
    results to the on_response callback as Transcription objects."""

    def __init__(
        self,
        transcriber_config: AssemblyAITranscriberConfig,
        logger: logging.Logger = None,
    ):
        super().__init__(transcriber_config)
        self._ended = False
        self.is_ready = False
        self.logger = logger or logging.getLogger(__name__)
        if self.transcriber_config.should_warmup_model:
            raise Exception("AssemblyAI model warmup not supported yet")
        elif self.transcriber_config.endpointing_config:
            raise Exception("Assembly AI endpointing config not supported yet")

    async def ready(self):
        # while not self.warmed_up:
        #     await asyncio.sleep(0.1)
        # return self.is_ready
        return True

    async def run(self):
        await self.process()

    def send_audio(self, chunk):
        self.audio_queue.put_nowait(chunk)

    def terminate(self):
        # AssemblyAI expects a JSON terminate message on the same stream.
        terminate_msg = json.dumps({"terminate_session": True})
        self.audio_queue.put_nowait(terminate_msg)
        self._ended = True

    def get_assembly_ai_url(self):
        return ASSEMBLY_AI_URL + f"?sample_rate={self.transcriber_config.sampling_rate}"

    async def process(self):
        """Open the realtime websocket and run sender/receiver until ended."""
        self.audio_queue = asyncio.Queue()
        URL = self.get_assembly_ai_url()

        async with websockets.connect(
            URL,
            extra_headers=(("Authorization", ASSEMBLY_AI_API_KEY),),
            ping_interval=5,
            ping_timeout=20,
        ) as ws:
            await asyncio.sleep(0.1)

            async def sender(ws):  # sends audio to websocket
                while not self._ended:
                    try:
                        data = await asyncio.wait_for(self.audio_queue.get(), 5)
                    except asyncio.exceptions.TimeoutError:
                        break
                    await ws.send(
                        json.dumps({"audio_data": AudioMessage.from_bytes(data).data})
                    )
                self.logger.debug("Terminating AssemblyAI transcriber sender")

            async def receiver(ws):
                while not self._ended:
                    try:
                        result_str = await ws.recv()
                    except websockets.exceptions.ConnectionClosedError as e:
                        self.logger.debug(e)
                        break
                    except Exception:
                        # Bug fix: was `assert False, "Not a websocket 4008
                        # error"`, which is stripped under -O and hides the
                        # real failure; log and re-raise instead.
                        self.logger.exception(
                            "Unexpected error receiving from AssemblyAI"
                        )
                        raise

                    data = json.loads(result_str)
                    is_final = (
                        "message_type" in data
                        and data["message_type"] == "FinalTranscript"
                    )
                    if "text" in data and data["text"]:
                        await self.on_response(
                            Transcription(data["text"], data["confidence"], is_final)
                        )

            await asyncio.gather(sender(ws), receiver(ws))
|
||||||
59
vocode/streaming/transcriber/base_transcriber.py
Normal file
59
vocode/streaming/transcriber/base_transcriber.py
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from typing import Callable, Optional, Awaitable
|
||||||
|
|
||||||
|
from vocode.streaming.utils import convert_wav
|
||||||
|
from vocode.streaming.models.transcriber import EndpointingConfig, TranscriberConfig
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
class Transcription:
    """A single transcription result from a speech-to-text provider."""

    def __init__(
        self,
        message: str,
        confidence: float,
        is_final: bool,
        is_interrupt: bool = False,
    ):
        # is_interrupt flags speech that arrived while the agent was talking.
        self.message, self.confidence = message, confidence
        self.is_final, self.is_interrupt = is_final, is_interrupt

    def __str__(self):
        return f"Transcription({self.message}, {self.confidence}, {self.is_final})"
|
||||||
|
|
||||||
|
|
||||||
|
class BaseTranscriber:
    """Abstract base for streaming transcribers.

    Subclasses deliver Transcription objects through the on_response
    callback, which the conversation layer installs via set_on_response.
    """

    def __init__(
        self,
        transcriber_config: TranscriberConfig,
    ):
        self.transcriber_config = transcriber_config
        # Awaitable callback invoked once per transcription result.
        self.on_response: Optional[Callable[[Transcription], Awaitable]] = None

    def get_transcriber_config(self) -> TranscriberConfig:
        return self.transcriber_config

    def set_on_response(self, on_response: Callable[[Transcription], Awaitable]):
        self.on_response = on_response

    def get_warmup_bytes(self, wav_path: str = "convo/audio/ajay.wav"):
        """Audio bytes used to warm up a model, converted to this
        transcriber's sample rate and encoding.

        Generalized: the warmup clip path is now a parameter; the previous
        hard-coded path is the default, so existing callers are unaffected.
        """
        sampling_rate = self.transcriber_config.sampling_rate
        return convert_wav(
            wav_path,
            sampling_rate,
            self.transcriber_config.audio_encoding,
        )

    async def ready(self):
        """Whether the transcriber is ready to receive audio."""
        return True

    async def run(self):
        """Main processing loop; no-op in the base class."""
        pass

    def send_audio(self, chunk):
        """Enqueue an audio chunk for transcription; no-op in the base class."""
        pass

    def terminate(self):
        """Shut down the transcriber; no-op in the base class."""
        pass
|
||||||
230
vocode/streaming/transcriber/deepgram_transcriber.py
Normal file
230
vocode/streaming/transcriber/deepgram_transcriber.py
Normal file
|
|
@ -0,0 +1,230 @@
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import websockets
|
||||||
|
from websockets.client import WebSocketClientProtocol
|
||||||
|
import audioop
|
||||||
|
from urllib.parse import urlencode
|
||||||
|
|
||||||
|
from vocode.streaming.transcriber.base_transcriber import (
|
||||||
|
BaseTranscriber,
|
||||||
|
Transcription,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.transcriber import (
|
||||||
|
DeepgramTranscriberConfig,
|
||||||
|
EndpointingConfig,
|
||||||
|
EndpointingType,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY")
|
||||||
|
PUNCTUATION_TERMINATORS = [".", "!", "?"]
|
||||||
|
NUM_RESTARTS = 5
|
||||||
|
|
||||||
|
|
||||||
|
class DeepgramTranscriber(BaseTranscriber):
    """Streams audio to Deepgram's websocket API and emits Transcription events.

    Audio chunks are queued by send_audio(); process() runs one websocket
    session with three concurrent tasks: an optional warmup sender, the audio
    sender, and the response receiver.
    """

    def __init__(
        self,
        transcriber_config: DeepgramTranscriberConfig,
        logger: logging.Logger = None,
    ):
        super().__init__(transcriber_config)
        self.transcriber_config = transcriber_config
        self._ended = False
        # Set once the (optional) warmup audio has been fully sent.
        self.warmed_up = False
        self.is_ready = False
        self.logger = logger or logging.getLogger(__name__)

    def create_warmup_chunks(self):
        """Split the warmup audio into transcriber-sized chunks."""
        warmup_bytes = self.get_warmup_bytes()
        chunk_size = self.transcriber_config.chunk_size
        return [
            warmup_bytes[i * chunk_size : (i + 1) * chunk_size]
            for i in range(len(warmup_bytes) // chunk_size)
        ]

    async def ready(self):
        """Block until warmup completes; return whether the transcriber is usable."""
        while not self.warmed_up:
            await asyncio.sleep(0.1)
        return self.is_ready

    async def run(self):
        """Run websocket sessions, reconnecting up to NUM_RESTARTS times."""
        restarts = 0
        while not self._ended and restarts < NUM_RESTARTS:
            await self.process(self.transcriber_config.should_warmup_model)
            restarts += 1
            # Fix: only log a restart when the connection actually died —
            # previously this was logged on a clean terminate() as well.
            if not self._ended:
                self.logger.debug(
                    "Deepgram connection died, restarting, num_restarts: %s", restarts
                )

    def send_audio(self, chunk):
        """Queue an audio chunk for sending, downsampling linear PCM if configured."""
        if (
            self.transcriber_config.downsampling
            and self.transcriber_config.audio_encoding == AudioEncoding.LINEAR16
        ):
            chunk, _ = audioop.ratecv(
                chunk,
                2,  # 16-bit samples
                1,  # mono
                self.transcriber_config.sampling_rate
                * self.transcriber_config.downsampling,
                self.transcriber_config.sampling_rate,
                None,
            )
        self.audio_queue.put_nowait(chunk)

    def terminate(self):
        """Ask Deepgram to close the stream and stop the sender/receiver loops."""
        terminate_msg = json.dumps({"type": "CloseStream"})
        self.audio_queue.put_nowait(terminate_msg)
        self._ended = True

    def get_deepgram_url(self):
        """Build the Deepgram streaming websocket URL from the configuration.

        Raises:
            Exception: if the configured audio encoding is unsupported.
        """
        if self.transcriber_config.audio_encoding == AudioEncoding.LINEAR16:
            encoding = "linear16"
        elif self.transcriber_config.audio_encoding == AudioEncoding.MULAW:
            encoding = "mulaw"
        else:
            # Fix: previously an unsupported encoding fell through and raised
            # an opaque NameError on `encoding` below.
            raise Exception(
                f"Audio encoding not supported: {self.transcriber_config.audio_encoding}"
            )
        url_params = {
            "encoding": encoding,
            "sample_rate": self.transcriber_config.sampling_rate,
            "channels": 1,
            "interim_results": "true",
        }
        extra_params = {}
        if self.transcriber_config.model:
            extra_params["model"] = self.transcriber_config.model
        if self.transcriber_config.tier:
            extra_params["tier"] = self.transcriber_config.tier
        if self.transcriber_config.version:
            extra_params["version"] = self.transcriber_config.version
        if (
            self.transcriber_config.endpointing_config
            and self.transcriber_config.endpointing_config.type
            == EndpointingType.PUNCTUATION_BASED
        ):
            # Punctuation-based endpointing needs Deepgram to punctuate output.
            extra_params["punctuate"] = "true"
        url_params.update(extra_params)
        return f"wss://api.deepgram.com/v1/listen?{urlencode(url_params)}"

    def is_speech_final(
        self, current_buffer: str, deepgram_response: dict, time_silent: float
    ):
        """Decide whether the accumulated buffer constitutes a finished utterance.

        Raises:
            Exception: if the endpointing config type is unsupported.
        """
        transcript = deepgram_response["channel"]["alternatives"][0]["transcript"]

        # if it is not time based, then return true if speech is final and there is a transcript
        if not self.transcriber_config.endpointing_config:
            return transcript and deepgram_response["speech_final"]
        elif (
            self.transcriber_config.endpointing_config.type
            == EndpointingType.TIME_BASED
        ):
            # if it is time based, then return true if there is no transcript
            # and there is some speech to send
            # and the time_silent is greater than the cutoff
            return (
                not transcript
                and current_buffer
                and (time_silent + deepgram_response["duration"])
                > self.transcriber_config.endpointing_config.time_cutoff_seconds
            )
        elif (
            self.transcriber_config.endpointing_config.type
            == EndpointingType.PUNCTUATION_BASED
        ):
            # Endpoint on a terminating punctuation mark, or fall back to the
            # time-based rule when silence accumulates without new words.
            return (
                transcript
                and deepgram_response["speech_final"]
                and transcript.strip()[-1] in PUNCTUATION_TERMINATORS
            ) or (
                not transcript
                and current_buffer
                and (time_silent + deepgram_response["duration"])
                > self.transcriber_config.endpointing_config.time_cutoff_seconds
            )
        raise Exception("Endpointing config not supported")

    def calculate_time_silent(self, data: dict):
        """Seconds of trailing silence in a Deepgram response window."""
        end = data["start"] + data["duration"]
        words = data["channel"]["alternatives"][0]["words"]
        if words:
            # Silence is the gap between the last word and the window's end.
            return end - words[-1]["end"]
        return data["duration"]

    async def process(self, warmup=True):
        """Run one websocket session: warmup, audio sender and response receiver."""
        extra_headers = {"Authorization": f"Token {DEEPGRAM_API_KEY}"}
        self.audio_queue = asyncio.Queue()

        async with websockets.connect(
            self.get_deepgram_url(), extra_headers=extra_headers
        ) as ws:

            async def warmup_sender(ws: WebSocketClientProtocol):
                # Optionally prime the model with canned audio before marking ready.
                if warmup:
                    warmup_chunks = self.create_warmup_chunks()
                    for chunk in warmup_chunks:
                        await ws.send(chunk)
                    # Give Deepgram time to flush responses for the warmup audio.
                    await asyncio.sleep(5)
                self.warmed_up = True
                self.is_ready = True

            async def sender(ws: WebSocketClientProtocol):  # sends audio to websocket
                while not self._ended:
                    try:
                        data = await asyncio.wait_for(self.audio_queue.get(), 5)
                    except asyncio.exceptions.TimeoutError:
                        break
                    await ws.send(data)
                self.logger.debug("Terminating Deepgram transcriber sender")

            async def receiver(ws: WebSocketClientProtocol):
                buffer = ""
                time_silent = 0
                while not self._ended:
                    try:
                        msg = await ws.recv()
                    except Exception as e:
                        self.logger.debug(f"Got error {e} in Deepgram receiver")
                        break
                    data = json.loads(msg)
                    if (
                        "is_final" not in data
                    ):  # means we've finished receiving transcriptions
                        break
                    is_final = data["is_final"]
                    speech_final = self.is_speech_final(buffer, data, time_silent)
                    top_choice = data["channel"]["alternatives"][0]
                    confidence = top_choice["confidence"]

                    # Only accumulate final, confident words once warmup is done.
                    if (
                        top_choice["transcript"]
                        and confidence > 0.0
                        and self.warmed_up
                        and is_final
                    ):
                        buffer = f"{buffer} {top_choice['transcript']}"

                    if speech_final:
                        await self.on_response(Transcription(buffer, confidence, True))
                        buffer = ""
                        time_silent = 0
                    elif (
                        top_choice["transcript"] and confidence > 0.0 and self.warmed_up
                    ):
                        # Interim result: surface the partial buffer.
                        await self.on_response(
                            Transcription(
                                buffer,
                                confidence,
                                False,
                            )
                        )
                        time_silent = self.calculate_time_silent(data)
                    else:
                        time_silent += data["duration"]

                self.logger.debug("Terminating Deepgram transcriber receiver")

            await asyncio.gather(warmup_sender(ws), sender(ws), receiver(ws))
|
||||||
145
vocode/streaming/transcriber/google_transcriber.py
Normal file
145
vocode/streaming/transcriber/google_transcriber.py
Normal file
|
|
@ -0,0 +1,145 @@
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import queue
|
||||||
|
from google.cloud import speech
|
||||||
|
import threading
|
||||||
|
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
from vocode.streaming.transcriber.base_transcriber import (
|
||||||
|
BaseTranscriber,
|
||||||
|
Transcription,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.transcriber import GoogleTranscriberConfig
|
||||||
|
from vocode.streaming.utils import create_loop_in_thread
|
||||||
|
|
||||||
|
|
||||||
|
class GoogleTranscriber(BaseTranscriber):
    """Streams audio to Google Cloud Speech-to-Text from a dedicated thread.

    send_audio() pushes raw chunks onto a thread-safe queue; a worker thread
    runs its own event loop that feeds the queue to Google's blocking
    streaming_recognize call and forwards results to on_response().
    """

    def __init__(self, transcriber_config: GoogleTranscriberConfig):
        super().__init__(transcriber_config)
        # Assigned explicitly for consistency with DeepgramTranscriber
        # (previously this relied on the base class storing it).
        self.transcriber_config = transcriber_config
        # Thread-safe buffer of raw audio chunks produced by send_audio().
        self._queue = queue.Queue()
        self._ended = False
        self.google_streaming_config = self.create_google_streaming_config()
        self.client = speech.SpeechClient()
        self.warmed_up = False
        self.is_ready = False
        if self.transcriber_config.endpointing_config:
            raise Exception("Google endpointing config not supported yet")
        # The blocking streaming session runs on its own event loop inside a
        # worker thread so it doesn't block the caller's loop.
        self.event_loop = asyncio.new_event_loop()
        self.thread = threading.Thread(
            name="google_transcriber",
            target=create_loop_in_thread,
            args=(self.event_loop, self.process()),
        )

    def create_google_streaming_config(self):
        """Translate the transcriber config into Google's streaming config.

        Raises:
            Exception: if the configured audio encoding is unsupported.
        """
        extra_params = {}
        if self.transcriber_config.model:
            extra_params["model"] = self.transcriber_config.model
            extra_params["use_enhanced"] = True

        if self.transcriber_config.audio_encoding == AudioEncoding.LINEAR16:
            google_audio_encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
        elif self.transcriber_config.audio_encoding == AudioEncoding.MULAW:
            google_audio_encoding = speech.RecognitionConfig.AudioEncoding.MULAW
        else:
            # Fix: previously an unsupported encoding fell through and raised
            # an opaque NameError on `google_audio_encoding` below.
            raise Exception(
                f"Audio encoding not supported: {self.transcriber_config.audio_encoding}"
            )

        return speech.StreamingRecognitionConfig(
            config=speech.RecognitionConfig(
                encoding=google_audio_encoding,
                sample_rate_hertz=self.transcriber_config.sampling_rate,
                language_code="en-US",
                **extra_params
            ),
            interim_results=True,
        )

    async def ready(self):
        """Block until warmup completes; return whether the transcriber is usable."""
        if not self.transcriber_config.should_warmup_model:
            return True
        while not self.warmed_up:
            await asyncio.sleep(0.1)
        return self.is_ready

    def warmup(self):
        """Prime the Google recognizer with canned audio before going live."""
        warmup_bytes = self.get_warmup_bytes()

        def stream():
            # One second of 16-bit audio per request, paced slightly apart.
            chunk_size = self.transcriber_config.sampling_rate * 2
            for i in range(len(warmup_bytes) // chunk_size):
                yield speech.StreamingRecognizeRequest(
                    audio_content=warmup_bytes[i * chunk_size : (i + 1) * chunk_size]
                )
                time.sleep(0.01)

        # Drain (and discard) the warmup responses.
        for _ in self.client.streaming_recognize(
            self.google_streaming_config, stream()
        ):
            pass
        self.warmed_up = True
        self.is_ready = True

    async def run(self):
        """Start the worker thread that owns the streaming session."""
        self.thread.start()

    async def process(self):
        """Stream queued audio to Google and dispatch its responses."""
        if self.transcriber_config.should_warmup_model:
            self.warmup()
        stream = self.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in stream
        )
        responses = self.client.streaming_recognize(
            self.google_streaming_config, requests
        )
        await self.process_responses_loop(responses)

    def terminate(self):
        """Signal the streaming generator and response loop to stop."""
        self._ended = True

    def send_audio(self, chunk: bytes):
        """Queue an audio chunk for the streaming session (non-blocking)."""
        self._queue.put(chunk, block=False)

    async def process_responses_loop(self, responses):
        """Forward each Google response until the transcriber is terminated."""
        for response in responses:
            await self._on_response(response)

            if self._ended:
                break

    async def _on_response(self, response):
        """Convert one Google response into a Transcription, if it has content."""
        if not response.results:
            return

        result = response.results[0]
        if not result.alternatives:
            return

        top_choice = result.alternatives[0]
        message = top_choice.transcript
        confidence = top_choice.confidence

        return await self.on_response(
            Transcription(message, confidence, result.is_final)
        )

    def generator(self):
        """Yield concatenated audio from the queue until the stream ends."""
        while not self._ended:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._queue.get()
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._queue.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b"".join(data)
|
||||||
63
vocode/streaming/utils/__init__.py
Normal file
63
vocode/streaming/utils/__init__.py
Normal file
|
|
@ -0,0 +1,63 @@
|
||||||
|
import asyncio
|
||||||
|
import audioop
|
||||||
|
import secrets
|
||||||
|
from typing import Any
|
||||||
|
import wave
|
||||||
|
|
||||||
|
from ..models.audio_encoding import AudioEncoding
|
||||||
|
|
||||||
|
|
||||||
|
def create_loop_in_thread(loop: asyncio.AbstractEventLoop, long_running_task=None):
    """Install `loop` as the current thread's event loop and run it.

    When a task is supplied, the loop runs only until that task completes;
    otherwise the loop runs forever (until stopped externally).
    """
    asyncio.set_event_loop(loop)
    if not long_running_task:
        loop.run_forever()
    else:
        loop.run_until_complete(long_running_task)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_linear_audio(
    raw_wav: bytes,
    input_sample_rate=24000,
    output_sample_rate=8000,
    output_encoding=AudioEncoding.LINEAR16,
    output_sample_width=2,
):
    """Resample 16-bit linear PCM and optionally transcode it to mulaw.

    Args:
        raw_wav: raw PCM frames (assumed 16-bit mono for resampling).
        input_sample_rate: sample rate of `raw_wav` in Hz.
        output_sample_rate: desired output sample rate in Hz.
        output_encoding: target encoding (LINEAR16 or MULAW).
        output_sample_width: bytes per sample used for the mulaw conversion.

    Raises:
        Exception: if `output_encoding` is not supported.
    """
    # downsample
    if input_sample_rate != output_sample_rate:
        raw_wav, _ = audioop.ratecv(
            raw_wav, 2, 1, input_sample_rate, output_sample_rate, None
        )

    if output_encoding == AudioEncoding.LINEAR16:
        return raw_wav
    elif output_encoding == AudioEncoding.MULAW:
        return audioop.lin2ulaw(raw_wav, output_sample_width)
    # Fix: previously an unsupported encoding silently returned None.
    raise Exception("Unsupported audio encoding")
|
||||||
|
|
||||||
|
|
||||||
|
def convert_wav(
    file: Any,
    output_sample_rate=8000,
    output_encoding=AudioEncoding.LINEAR16,
):
    """Read a WAV file (path or file object) and convert its frames.

    The frames are resampled to `output_sample_rate` and transcoded to
    `output_encoding` via convert_linear_audio.
    """
    with wave.open(file, "rb") as wav:
        frames = wav.readframes(wav.getnframes())
        source_rate = wav.getframerate()
        source_width = wav.getsampwidth()
    return convert_linear_audio(
        frames,
        input_sample_rate=source_rate,
        output_sample_rate=output_sample_rate,
        output_encoding=output_encoding,
        output_sample_width=source_width,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_chunk_size_per_second(audio_encoding: AudioEncoding, sampling_rate: int) -> int:
    """Bytes of audio per second for the given encoding and sample rate."""
    if audio_encoding == AudioEncoding.MULAW:
        # 8-bit samples: one byte per sample.
        return sampling_rate
    if audio_encoding == AudioEncoding.LINEAR16:
        # 16-bit samples: two bytes per sample.
        return sampling_rate * 2
    raise Exception("Unsupported audio encoding")
|
||||||
|
|
||||||
|
|
||||||
|
def create_conversation_id() -> str:
    """Generate a random, URL-safe identifier for a conversation."""
    return secrets.token_urlsafe(16)
|
||||||
0
vocode/streaming/utils/goodbye_embeddings/.gitkeep
Normal file
0
vocode/streaming/utils/goodbye_embeddings/.gitkeep
Normal file
102
vocode/streaming/utils/goodbye_model.py
Normal file
102
vocode/streaming/utils/goodbye_model.py
Normal file
|
|
@ -0,0 +1,102 @@
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import openai
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import numpy as np
|
||||||
|
import requests
|
||||||
|
|
||||||
|
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")


# Embedding backend: pyq when USE_PYQ_EMBEDDINGS=true, otherwise OpenAI.
PLATFORM = "pyq" if os.getenv("USE_PYQ_EMBEDDINGS", "false") == "true" else "openai"
# Dot-product similarity cutoffs per platform for classifying a goodbye.
SIMILARITY_THRESHOLD = 0.9
SIMILARITY_THRESHOLD_PYQ = 0.7
# Embedding vector sizes per platform.
EMBEDDING_SIZE = 1536
PYQ_EMBEDDING_SIZE = 768
# Reference phrases whose embeddings define the "goodbye" region.
GOODBYE_PHRASES = [
    "bye",
    "goodbye",
    "see you",
    "see you later",
    "talk to you later",
    "talk to you soon",
    "have a good day",
    "have a good night",
]
PYQ_API_URL = "https://embeddings.pyqai.com"
|
||||||
|
|
||||||
|
|
||||||
|
class GoodbyeModel:
    """Classifies whether an utterance is a goodbye.

    A cheap literal "bye" check runs first; otherwise the utterance is
    embedded and compared (dot product) against precomputed embeddings of
    GOODBYE_PHRASES, cached on disk per platform.
    """

    def __init__(
        self,
        embeddings_cache_path=os.path.join(
            os.path.dirname(__file__), "goodbye_embeddings"
        ),
    ):
        # Each platform gets its own cache: the vectors have different sizes
        # (EMBEDDING_SIZE vs PYQ_EMBEDDING_SIZE).
        self.goodbye_embeddings = self.load_or_create_embeddings(
            f"{embeddings_cache_path}/goodbye_embeddings.npy", platform="openai"
        )
        self.goodbye_embeddings_pyq = self.load_or_create_embeddings(
            f"{embeddings_cache_path}/goodbye_embeddings_pyq.npy", platform="pyq"
        )

    def load_or_create_embeddings(self, path, platform=PLATFORM):
        """Load embeddings cached at `path`, computing and caching if missing.

        Fix: the platform is now forwarded to create_embeddings — previously a
        missing pyq cache file was silently filled with embeddings from the
        default platform, producing a wrong-sized matrix.
        """
        if os.path.exists(path):
            return np.load(path)
        embeddings = self.create_embeddings(platform=platform)
        np.save(path, embeddings)
        return embeddings

    def create_embeddings(self, platform=PLATFORM):
        """Embed every goodbye phrase; returns a (size, num_phrases) matrix."""
        print("Creating embeddings...")
        size = EMBEDDING_SIZE if platform == "openai" else PYQ_EMBEDDING_SIZE
        embeddings = np.empty((size, len(GOODBYE_PHRASES)))
        for i, goodbye_phrase in enumerate(GOODBYE_PHRASES):
            # Each phrase embedding becomes one column.
            embeddings[:, i] = self.create_embedding(goodbye_phrase, platform=platform)
        return embeddings

    async def is_goodbye(self, text: str, platform=PLATFORM) -> bool:
        """Return True when `text` looks like a goodbye."""
        # Fast path: literal keyword match, no API call needed.
        if "bye" in text.lower():
            return True
        embedding = self.create_embedding(text.strip().lower(), platform=platform)
        goodbye_embeddings = (
            self.goodbye_embeddings
            if platform == "openai"
            else self.goodbye_embeddings_pyq
        )
        threshold = (
            SIMILARITY_THRESHOLD if platform == "openai" else SIMILARITY_THRESHOLD_PYQ
        )
        # Dot-product similarity against every goodbye phrase at once.
        similarity_results = embedding @ goodbye_embeddings
        return np.max(similarity_results) > threshold

    def create_embedding(self, text, platform=PLATFORM) -> np.ndarray:
        """Embed `text` with the configured platform's embedding API.

        Raises:
            Exception: if `platform` is neither "openai" nor "pyq" (previously
            this silently returned None).
        """
        if platform == "openai":
            return np.array(
                openai.Embedding.create(input=text, model="text-embedding-ada-002")[
                    "data"
                ][0]["embedding"]
            )
        elif platform == "pyq":
            return np.array(
                requests.post(
                    PYQ_API_URL,
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": os.getenv("PYQ_API_KEY"),
                    },
                    json={"input_sequence": [text], "account_id": "400"},
                ).json()["response"][0]
            )
        raise Exception(f"Unsupported embedding platform: {platform}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: type text and see whether it's classified as a goodbye.

    async def main():
        model = GoodbyeModel()
        while True:
            print(await model.is_goodbye(input("Text: ")))

    asyncio.run(main())
|
||||||
236
vocode/streaming/utils/sse_client.py
Normal file
236
vocode/streaming/utils/sse_client.py
Normal file
|
|
@ -0,0 +1,236 @@
|
||||||
|
"""
|
||||||
|
A port of sseclient (https://pypi.org/project/sseclient/) that allows you to get server-side events with a POST request
|
||||||
|
|
||||||
|
Copyright (c) 2015 Brent Tubbs
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE."""
|
||||||
|
#
|
||||||
|
# Distributed under the terms of the MIT license.
|
||||||
|
#
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import codecs
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
import six
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Version of the upstream sseclient release this port is based on.
__version__ = "0.0.27"

# Technically, we should support streams that mix line endings. This regex,
# however, assumes that a system will provide consistent line endings.
end_of_field = re.compile(r"\r\n\r\n|\r\r|\n\n")
|
||||||
|
|
||||||
|
|
||||||
|
class SSEClient(object):
    """Iterator over server-sent events from a streaming HTTP response.

    Unlike the upstream sseclient package, the HTTP method is configurable so
    SSE can be consumed from POST endpoints as well. Kept byte-identical to
    the vendored code apart from comments, for diffability with upstream.
    """

    def __init__(
        self,
        method,
        url,
        last_id=None,
        retry=3000,
        session=None,
        chunk_size=1024,
        **kwargs
    ):
        # HTTP method (e.g. "GET"/"POST") and endpoint of the event stream.
        self.url = url
        self.method = method
        # ID of the last event seen; sent as Last-Event-ID on (re)connect.
        self.last_id = last_id
        # Reconnect delay in milliseconds; the server may override it.
        self.retry = retry
        self.chunk_size = chunk_size

        # Optional support for passing in a requests.Session()
        self.session = session

        # Any extra kwargs will be fed into the requests.get call later.
        self.requests_kwargs = kwargs

        # The SSE spec requires making requests with Cache-Control: nocache
        if "headers" not in self.requests_kwargs:
            self.requests_kwargs["headers"] = {}
        self.requests_kwargs["headers"]["Cache-Control"] = "no-cache"

        # The 'Accept' header is not required, but explicit > implicit
        self.requests_kwargs["headers"]["Accept"] = "text/event-stream"

        # Keep data here as it streams in
        self.buf = ""

        self._connect()

    def _connect(self):
        """Open (or re-open) the streaming HTTP request."""
        if self.last_id:
            self.requests_kwargs["headers"]["Last-Event-ID"] = self.last_id

        # Use session if set. Otherwise fall back to requests module.
        requester = self.session or requests
        self.resp = requester.request(
            self.method, self.url, stream=True, **self.requests_kwargs
        )
        self.resp_iterator = self.iter_content()
        # Decode incrementally so multi-byte characters split across chunk
        # boundaries are handled correctly.
        encoding = self.resp.encoding or self.resp.apparent_encoding
        self.decoder = codecs.getincrementaldecoder(encoding)(errors="replace")

        # TODO: Ensure we're handling redirects. Might also stick the 'origin'
        # attribute on Events like the Javascript spec requires.
        self.resp.raise_for_status()

    def iter_content(self):
        """Return a generator yielding raw chunks as soon as they arrive."""

        def generate():
            while True:
                if (
                    hasattr(self.resp.raw, "_fp")
                    and hasattr(self.resp.raw._fp, "fp")
                    and hasattr(self.resp.raw._fp.fp, "read1")
                ):
                    # Reach into urllib3's private file object so we can do a
                    # short read and get whatever bytes are available now.
                    chunk = self.resp.raw._fp.fp.read1(self.chunk_size)
                else:
                    # _fp is not available, this means that we cannot use short
                    # reads and this will block until the full chunk size is
                    # actually read
                    chunk = self.resp.raw.read(self.chunk_size)
                if not chunk:
                    break
                yield chunk

        return generate()

    def _event_complete(self):
        # True once the buffer contains at least one full event terminator.
        return re.search(end_of_field, self.buf) is not None

    def __iter__(self):
        return self

    def __next__(self):
        """Block until the next complete event arrives, reconnecting on errors."""
        while not self._event_complete():
            try:
                next_chunk = next(self.resp_iterator)
                if not next_chunk:
                    raise EOFError()
                self.buf += self.decoder.decode(next_chunk)

            except (
                StopIteration,
                requests.RequestException,
                EOFError,
                six.moves.http_client.IncompleteRead,
            ) as e:
                print(e)
                # Honor the retry delay (milliseconds) before reconnecting.
                time.sleep(self.retry / 1000.0)
                self._connect()

                # The SSE spec only supports resuming from a whole message, so
                # if we have half a message we should throw it out.
                head, sep, tail = self.buf.rpartition("\n")
                self.buf = head + sep
                continue

        # Split the complete event (up to the end_of_field) into event_string,
        # and retain anything after the current complete event in self.buf
        # for next time.
        (event_string, self.buf) = re.split(end_of_field, self.buf, maxsplit=1)
        msg = Event.parse(event_string)

        # If the server requests a specific retry delay, we need to honor it.
        if msg.retry:
            self.retry = msg.retry

        # last_id should only be set if included in the message. It's not
        # forgotten if a message omits it.
        if msg.id:
            self.last_id = msg.id

        return msg

    if six.PY2:
        next = __next__
|
||||||
|
|
||||||
|
|
||||||
|
class Event(object):
    """A single server-sent event: data payload plus optional metadata."""

    # Matches "name: value" SSE lines; a missing colon yields an empty value.
    sse_line_pattern = re.compile("(?P<name>[^:]*):?( ?(?P<value>.*))?")

    def __init__(self, data="", event="message", id=None, retry=None):
        assert isinstance(data, six.string_types), "Data must be text"
        self.data = data
        self.event = event
        self.id = id
        self.retry = retry

    def dump(self):
        """Serialize this event back into SSE wire format."""
        lines = []
        if self.id:
            lines.append("id: %s" % self.id)

        # Only include an event line if it's not the default already.
        if self.event != "message":
            lines.append("event: %s" % self.event)

        if self.retry:
            lines.append("retry: %s" % self.retry)

        # Multi-line data becomes one "data:" line per line of payload.
        lines.extend("data: %s" % d for d in self.data.split("\n"))
        return "\n".join(lines) + "\n\n"

    @classmethod
    def parse(cls, raw):
        """
        Given a possibly-multiline string representing an SSE message, parse it
        and return a Event object.
        """
        msg = cls()
        for line in raw.splitlines():
            m = cls.sse_line_pattern.match(line)
            if m is None:
                # Malformed line. Discard but warn.
                warnings.warn('Invalid SSE line: "%s"' % line, SyntaxWarning)
                continue

            name = m.group("name")
            if name == "":
                # line began with a ":", so is a comment. Ignore
                continue
            value = m.group("value")

            if name == "data":
                # If we already have some data, then join to it with a newline.
                # Else this is it.
                if msg.data:
                    msg.data = "%s\n%s" % (msg.data, value)
                else:
                    msg.data = value
            elif name == "event":
                msg.event = value
            elif name == "id":
                msg.id = value
            elif name == "retry":
                msg.retry = int(value)

        return msg

    def __str__(self):
        return self.data
|
||||||
40
vocode/streaming/utils/transcript.py
Normal file
40
vocode/streaming/utils/transcript.py
Normal file
|
|
@ -0,0 +1,40 @@
|
||||||
|
import time
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class Sender(str, Enum):
    # Who produced a transcript message.
    HUMAN = "human"
    BOT = "bot"
|
||||||
|
|
||||||
|
|
||||||
|
class Message(BaseModel):
    """One utterance in a conversation transcript."""

    # The spoken/typed text.
    text: str
    # Which party produced it.
    sender: Sender
    # Unix time at which the message was recorded.
    timestamp: float

    def to_string(self, include_timestamp: bool = False) -> str:
        """Render as "SENDER: text", optionally suffixed with the timestamp."""
        rendered = f"{self.sender.name}: {self.text}"
        if include_timestamp:
            rendered += f" ({self.timestamp})"
        return rendered
|
||||||
|
|
||||||
|
|
||||||
|
class Transcript(BaseModel):
    """Ordered record of all messages exchanged in a conversation."""

    messages: list[Message] = []
    # Unix time at which the conversation started.
    start_time: float = Field(default_factory=time.time)

    def to_string(self, include_timestamps: bool = False) -> str:
        """Render the whole transcript, one message per line."""
        rendered_lines = [
            message.to_string(include_timestamp=include_timestamps)
            for message in self.messages
        ]
        return "\n".join(rendered_lines)

    def add_human_message(self, text: str):
        """Append an utterance from the human, stamped with the current time."""
        message = Message(text=text, sender=Sender.HUMAN, timestamp=time.time())
        self.messages.append(message)

    def add_bot_message(self, text: str):
        """Append an utterance from the bot, stamped with the current time."""
        message = Message(text=text, sender=Sender.BOT, timestamp=time.time())
        self.messages.append(message)
||||||
Loading…
Add table
Add a link
Reference in a new issue