open source

This commit is contained in:
Ajay Raj 2023-03-28 00:15:34 -07:00
commit a93bfc1ec9
61 changed files with 4013 additions and 126 deletions

View file

@ -1,26 +1,67 @@
import websockets
from websockets.exceptions import ConnectionClosedOK
from websockets.client import WebSocketClientProtocol
import asyncio
from dotenv import load_dotenv
import os
from asyncio import Future
import queue
from typing import Callable, Awaitable, Optional, Any
import logging
import threading
import queue
import vocode
from vocode.streaming.input_device.base_input_device import (
BaseInputDevice,
import time
import secrets
import random
from dotenv import load_dotenv
from vocode.streaming.agent.bot_sentiment_analyser import (
BotSentiment,
BotSentimentAnalyser,
)
from vocode.streaming.agent.information_retrieval_agent import InformationRetrievalAgent
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.websocket import (
ReadyMessage,
AudioMessage,
StartMessage,
StopMessage,
from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber
from vocode.streaming.utils.goodbye_model import GoodbyeModel
from vocode.streaming.utils.transcript import Transcript
from vocode.streaming.models.transcriber import (
TranscriberConfig,
TranscriberType,
)
from vocode.streaming.models.agent import (
AgentConfig,
AgentType,
FillerAudioConfig,
FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS,
)
from vocode.streaming.models.synthesizer import (
SynthesizerConfig,
SynthesizerType,
TrackBotSentimentConfig,
)
from vocode.streaming.models.websocket import AudioMessage
from vocode.streaming.constants import (
TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS,
PER_CHUNK_ALLOWANCE_SECONDS,
ALLOWED_IDLE_TIME,
)
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
SynthesisResult,
FillerAudio,
)
from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.utils import (
create_conversation_id,
create_loop_in_thread,
get_chunk_size_per_second,
)
from vocode.streaming.transcriber.base_transcriber import (
Transcription,
BaseTranscriber,
)
from vocode.streaming.transcriber.google_transcriber import GoogleTranscriber
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
load_dotenv()
@ -28,79 +69,468 @@ load_dotenv()
class StreamingConversation:
# NOTE(review): this constructor region is a corrupted diff interleave. Lines
# from the OLD websocket-client implementation (input_device, *_config
# parameters, receiver_ready, output_audio_queue, vocode_websocket_url, and the
# wait_for_ready/deactivate/play_audio methods) are mixed with the NEW
# transcriber/agent/synthesizer constructor. Kept byte-identical; it must be
# reconstructed from the original commit before it can run (e.g. the signature
# places non-default params after defaults, and `self.id` is assigned twice).
def __init__(
self,
# OLD-version parameters (superseded by transcriber/agent/synthesizer below):
input_device: BaseInputDevice,
output_device: BaseOutputDevice,
transcriber_config: TranscriberConfig,
agent_config: AgentConfig,
synthesizer_config: SynthesizerConfig,
id: str = None,
# NEW-version parameters: pre-built pipeline components.
transcriber: BaseTranscriber,
agent: BaseAgent,
synthesizer: BaseSynthesizer,
conversation_id: str = None,
per_chunk_allowance_seconds: int = PER_CHUNK_ALLOWANCE_SECONDS,
logger: Optional[logging.Logger] = None,
):
self.id = id
self.input_device = input_device
# NEW: generate an id when the caller does not supply one.
self.id = conversation_id or create_conversation_id()
self.logger = logger or logging.getLogger(__name__)
self.output_device = output_device
self.transcriber_config = transcriber_config
self.agent_config = agent_config
self.synthesizer_config = synthesizer_config
self.logger = logging.getLogger(__name__)
self.receiver_ready = False
self.active = True
self.output_loop = asyncio.new_event_loop()
self.output_audio_queue = queue.Queue()
self.vocode_websocket_url = f"wss://{vocode.base_url}/conversation"
# NEW: wire the transcriber callback so transcriptions flow into this class.
self.transcriber = transcriber
self.transcriber.set_on_response(self.on_transcription_response)
self.transcriber_task = None
self.agent = agent
self.synthesizer = synthesizer
# Synthesis runs on its own event loop in a dedicated thread so playback
# does not block the main conversation loop.
self.synthesizer_event_loop = asyncio.new_event_loop()
self.synthesizer_thread = threading.Thread(
name="synthesizer",
target=create_loop_in_thread,
args=(self.synthesizer_event_loop,),
)
self.per_chunk_allowance_seconds = per_chunk_allowance_seconds
self.transcript = Transcript()
self.bot_sentiment = None
# track_bot_sentiment_in_voice may be a bare bool (use defaults) or a full
# TrackBotSentimentConfig object.
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
if isinstance(
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice,
bool,
):
self.track_bot_sentiment_config = TrackBotSentimentConfig()
else:
self.track_bot_sentiment_config = (
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice
)
self.bot_sentiment_analyser = BotSentimentAnalyser(
emotions=self.track_bot_sentiment_config.emotions
)
self.goodbye_model = GoodbyeModel()
# NOTE(review): the three defs below are OLD-version methods spliced into the
# middle of __init__ by the diff corruption.
async def wait_for_ready(self):
while not self.receiver_ready:
await asyncio.sleep(0.1)
return True
def deactivate(self):
self.is_human_speaking = False
self.active = False
def play_audio(self):
async def run():
while self.active:
try:
audio = self.output_audio_queue.get(timeout=5)
await self.output_device.send_async(audio)
except queue.Empty:
continue
loop = asyncio.new_event_loop()
loop.run_until_complete(run())
# NEW-version __init__ state continues here (interrupt/filler bookkeeping).
self.current_synthesis_task = None
self.is_current_synthesis_interruptable = False
# Pending stop events; interrupt_all_synthesis drains and fires these.
self.stop_events: queue.Queue[threading.Event] = queue.Queue()
self.last_action_timestamp = time.time()
self.check_for_idle_task = None
self.track_bot_sentiment_task = None
self.should_wait_for_filler_audio_done_event = False
self.current_filler_audio_done_event: Optional[threading.Event] = None
self.current_filler_seconds_per_chunk: int = 0
self.current_transcription_is_interrupt: bool = False
# NOTE(review): corrupted diff interleave — the OLD websocket sender/StartMessage
# code is mixed into the NEW start() sequence (transcriber task, synthesizer
# thread, filler audio setup, initial message, background tasks). Parentheses do
# not balance as written; kept byte-identical pending reconstruction.
async def start(self):
async with websockets.connect(
f"{self.vocode_websocket_url}?key={vocode.api_key}"
) as ws:
# OLD: sender coroutine that streamed StartMessage + mic audio to the server.
async def sender(ws: WebSocketClientProtocol):
start_message = StartMessage(
transcriber_config=self.transcriber_config,
agent_config=self.agent_config,
synthesizer_config=self.synthesizer_config,
conversation_id=self.id,
# NEW: launch the transcriber and fail fast if it cannot start.
self.transcriber_task = asyncio.create_task(self.transcriber.run())
is_ready = await self.transcriber.ready()
if not is_ready:
raise Exception("Transcriber startup failed")
self.synthesizer_thread.start()
# NEW: send_filler_audio may be a bare truthy flag or a FillerAudioConfig.
if self.agent.get_agent_config().send_filler_audio:
filler_audio_config = (
self.agent.get_agent_config().send_filler_audio
if isinstance(
self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
)
await ws.send(start_message.json())
await self.wait_for_ready()
self.logger.info("Listening...press Ctrl+C to stop")
while self.active:
data = self.input_device.get_audio()
if data:
try:
await ws.send(AudioMessage.from_bytes(data).json())
except ConnectionClosedOK:
self.deactivate()
return
else FillerAudioConfig()
)
self.synthesizer.set_filler_audios(filler_audio_config)
self.agent.start()
# NEW: record and speak the configured initial message, non-interruptible.
if self.agent.get_agent_config().initial_message:
self.transcript.add_bot_message(
self.agent.get_agent_config().initial_message.text
)
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
self.update_bot_sentiment()
self.send_message_to_stream_nonblocking(
self.agent.get_agent_config().initial_message, False
)
self.active = True
# NEW: background tasks for sentiment tracking and idle timeout.
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
self.track_bot_sentiment_task = asyncio.create_task(
self.track_bot_sentiment()
)
self.check_for_idle_task = asyncio.create_task(self.check_for_idle())
async def check_for_idle(self):
    """Background watchdog: terminate the conversation once it has been idle too long.

    Compares the time since the last recorded action against the agent's
    configured idle limit (falling back to ALLOWED_IDLE_TIME), re-checking
    every 15 seconds while the conversation is active.
    """
    while self.is_active():
        idle_limit = (
            self.agent.get_agent_config().allowed_idle_time_seconds
            or ALLOWED_IDLE_TIME
        )
        elapsed = time.time() - self.last_action_timestamp
        if elapsed > idle_limit:
            self.logger.debug("Conversation idle for too long, terminating")
            self.mark_terminated()
            return
        await asyncio.sleep(15)
async def track_bot_sentiment(self):
    """Poll the transcript once per second and refresh bot sentiment when it changes.

    Fix: the transcript was previously serialized twice per changed iteration
    (once for the comparison, again for the snapshot), wasting work and racing
    with concurrent transcript appends — an append landing between the two
    reads could be recorded as already-seen and its sentiment update skipped.
    The transcript is now read exactly once per iteration.
    """
    prev_transcript = None
    while self.is_active():
        await asyncio.sleep(1)
        current_transcript = self.transcript.to_string()
        if current_transcript != prev_transcript:
            self.update_bot_sentiment()
            prev_transcript = current_transcript
def update_bot_sentiment(self):
    """Analyse the current transcript and store the result.

    The stored sentiment is only replaced when the analyser detects an
    emotion; a neutral result leaves the previous sentiment in place.
    """
    sentiment = self.bot_sentiment_analyser.analyse(self.transcript.to_string())
    if sentiment.emotion:
        self.logger.debug("Bot sentiment: %s", sentiment)
        self.bot_sentiment = sentiment
# Entry point for raw caller audio: forward each chunk to the transcriber.
def receive_audio(self, chunk: bytes):
self.transcriber.send_audio(chunk)
# NOTE(review): corrupted diff interleave — the OLD websocket `receiver` closure,
# a stray `ws.send(StopMessage...)`, and the OLD `output_thread`/gather lines are
# embedded inside the NEW producer/consumer implementation. Kept byte-identical.
# Intended NEW design: the caller feeds agent-generated messages into
# messages_queue while send_to_call (running on the synthesizer loop) drains the
# queue, synthesizes each message, and streams it to the output device.
async def send_messages_to_stream_async(
self,
messages,
should_allow_human_to_cut_off_bot: bool,
wait_for_filler_audio: bool = False,
) -> tuple[str, bool]:
messages_queue = queue.Queue()
messages_done = threading.Event()
speech_cut_off = threading.Event()
seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
chunk_size = (
get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
* seconds_per_chunk
)
# Consumer: runs on the synthesizer event loop, draining messages_queue.
async def send_to_call():
response_buffer = ""
cut_off = False
self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
while True:
try:
message: BaseMessage = messages_queue.get_nowait()
except queue.Empty:
if messages_done.is_set():
break
else:
await asyncio.sleep(0)
# NOTE(review): OLD-version line below does not belong here (no `ws` in scope).
await ws.send(StopMessage().json())
continue
# NOTE(review): OLD-version receiver closure spliced in by the diff corruption.
async def receiver(ws: WebSocketClientProtocol):
ReadyMessage.parse_raw(await ws.recv())
self.receiver_ready = True
async for msg in ws:
audio_message = AudioMessage.parse_raw(msg)
self.output_audio_queue.put_nowait(audio_message.get_bytes())
# Register a stop event so an interrupt can cancel this message mid-stream.
stop_event = self.enqueue_stop_event()
synthesis_result = self.synthesizer.create_speech(
message, chunk_size, bot_sentiment=self.bot_sentiment
)
message_sent, cut_off = await self.send_speech_to_output(
message.text,
synthesis_result,
stop_event,
seconds_per_chunk,
)
self.logger.debug("Message sent: {}".format(message_sent))
response_buffer = f"{response_buffer} {message_sent}"
if cut_off:
speech_cut_off.set()
break
await asyncio.sleep(0)
if cut_off:
self.agent.update_last_bot_message_on_cut_off(response_buffer)
self.transcript.add_bot_message(response_buffer)
return response_buffer, cut_off
# NOTE(review): OLD-version playback thread / gather lines spliced in here.
output_thread = threading.Thread(target=self.play_audio)
output_thread.start()
return await asyncio.gather(sender(ws), receiver(ws))
asyncio.run_coroutine_threadsafe(send_to_call(), self.synthesizer_event_loop)
# Producer: enumerate agent messages; before the first one, cancel any filler
# audio and wait for it to drain so speech does not overlap.
messages_generated = 0
for i, message in enumerate(messages):
messages_generated += 1
if i == 0:
if wait_for_filler_audio:
self.interrupt_all_synthesis()
self.wait_for_filler_audio_to_finish()
if speech_cut_off.is_set():
break
messages_queue.put_nowait(BaseMessage(text=message))
await asyncio.sleep(0)
if messages_generated == 0:
self.logger.debug("Agent generated no messages")
if wait_for_filler_audio:
self.interrupt_all_synthesis()
messages_done.set()
def send_message_to_stream_nonblocking(
    self,
    message: BaseMessage,
    should_allow_human_to_cut_off_bot: bool,
):
    """Schedule synthesis/playback of *message* on the synthesizer loop without blocking.

    Fix: the ``should_allow_human_to_cut_off_bot`` argument was accepted but
    ignored — the agent config's ``allow_agent_to_be_cut_off`` was always used
    instead, so a caller passing ``False`` (e.g. for the initial greeting)
    could not actually make the message non-interruptible. The argument is now
    forwarded to ``send_message_to_stream_async``.
    """
    asyncio.run_coroutine_threadsafe(
        self.send_message_to_stream_async(
            message,
            should_allow_human_to_cut_off_bot,
        ),
        self.synthesizer_event_loop,
    )
async def send_message_to_stream_async(
    self,
    message: BaseMessage,
    should_allow_human_to_cut_off_bot: bool,
) -> tuple[str, bool]:
    """Synthesize *message* and stream it to the output device.

    Returns (text actually spoken, whether playback was cut off). The spoken
    text is appended to the transcript; on a cut-off the agent is told what
    the human actually heard.
    """
    self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
    # Register the stop event before synthesis begins so an interrupt arriving
    # mid-synthesis can still cancel this message.
    stop_event = self.enqueue_stop_event()
    self.logger.debug("Synthesizing speech for message")
    seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
    synth_config = self.synthesizer.get_synthesizer_config()
    chunk_size = seconds_per_chunk * get_chunk_size_per_second(
        synth_config.audio_encoding,
        synth_config.sampling_rate,
    )
    synthesis_result = self.synthesizer.create_speech(
        message, chunk_size, bot_sentiment=self.bot_sentiment
    )
    message_sent, cut_off = await self.send_speech_to_output(
        message.text,
        synthesis_result,
        stop_event,
        seconds_per_chunk,
    )
    self.logger.debug("Message sent: {}".format(message_sent))
    if cut_off:
        # Keep the agent's conversation state consistent with what was heard.
        self.agent.update_last_bot_message_on_cut_off(message_sent)
    self.transcript.add_bot_message(message_sent)
    return message_sent, cut_off
# Pre-initialize the synthesizer so the first real message avoids cold-start latency.
def warmup_synthesizer(self):
self.synthesizer.ready_synthesizer()
# returns an estimate of what was sent up to, and a flag if the message was cut off
# Real-time pacing loop: each synthesized chunk is sent, then we sleep roughly
# the chunk's playback duration (minus send time and a per-chunk allowance) so
# chunks arrive at playback speed and a stop_event can take effect between them.
async def send_speech_to_output(
self,
message,
synthesis_result: SynthesisResult,
stop_event: threading.Event,
seconds_per_chunk: int,
is_filler_audio: bool = False,
):
message_sent = message
cut_off = False
chunk_size = seconds_per_chunk * get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
for i, chunk_result in enumerate(synthesis_result.chunk_generator):
start_time = time.time()
# Actual playback duration of this chunk (last chunk may be shorter).
speech_length_seconds = seconds_per_chunk * (
len(chunk_result.chunk) / chunk_size
)
if stop_event.is_set():
# Interrupted: estimate how much text was spoken from chunks sent so far.
seconds = i * seconds_per_chunk
self.logger.debug(
"Interrupted, stopping text to speech after {} chunks".format(i)
)
message_sent = f"{synthesis_result.get_message_up_to(seconds)}-"
cut_off = True
break
if i == 0:
if is_filler_audio:
# First filler chunk went out, so callers must now wait for done_event.
self.should_wait_for_filler_audio_done_event = True
await self.output_device.send_async(chunk_result.chunk)
end_time = time.time()
# Sleep the remaining playback time so we don't flood the output device.
await asyncio.sleep(
max(
speech_length_seconds
- (end_time - start_time)
- self.per_chunk_allowance_seconds,
0,
)
)
self.logger.debug(
"Sent chunk {} with size {}".format(i, len(chunk_result.chunk))
)
self.last_action_timestamp = time.time()
# clears it off the stop events queue
# (interrupt_all_synthesis skips already-set events, so setting it here marks
# this message's event as consumed)
if not stop_event.is_set():
stop_event.set()
return message_sent, cut_off
# Transcriber callback: tracks whether the human is speaking, fires interrupts
# when they start talking over interruptible bot speech, then delegates to
# handle_transcription.
# NOTE(review): self.is_human_speaking is read here but its initialization is
# not visible in this (corrupted) view — only deactivate() and the assignment
# below set it; confirm it is initialized before the first transcription.
async def on_transcription_response(self, transcription: Transcription):
self.last_action_timestamp = time.time()
if transcription.is_final:
self.logger.debug(
"Got transcription: {}, confidence: {}".format(
transcription.message, transcription.confidence
)
)
# Human just started speaking (was not speaking before this event).
if not self.is_human_speaking:
# send interrupt
self.current_transcription_is_interrupt = False
if self.is_current_synthesis_interruptable:
self.logger.debug("sending interrupt")
# True if any in-flight synthesis was actually cancelled.
self.current_transcription_is_interrupt = self.interrupt_all_synthesis()
self.logger.debug("Human started speaking")
transcription.is_interrupt = self.current_transcription_is_interrupt
# A final transcription means the human has stopped speaking.
self.is_human_speaking = not transcription.is_final
return await self.handle_transcription(transcription)
def enqueue_stop_event(self):
    """Create a fresh stop event, register it for later interruption, and return it."""
    event = threading.Event()
    self.stop_events.put_nowait(event)
    return event
def interrupt_all_synthesis(self):
    """Drain the stop-event queue, firing every event not already set.

    Returns True if at least one in-flight synthesis was interrupted
    (an already-set event counts as consumed, not interrupted).
    """
    interrupted_any = False
    while True:
        try:
            pending = self.stop_events.get_nowait()
        except queue.Empty:
            return interrupted_any
        if not pending.is_set():
            self.logger.debug("Interrupting synthesis")
            pending.set()
            interrupted_any = True
async def send_filler_audio_to_output(
    self,
    filler_audio: FillerAudio,
    stop_event: threading.Event,
    done_event: threading.Event,
):
    """Play a filler phrase after a short silence, setting *done_event* when finished.

    The silence threshold gives the human a chance to keep talking before the
    filler starts; *stop_event* lets a real response cancel the filler.
    """
    synthesis_result = filler_audio.create_synthesis_result()
    self.is_current_synthesis_interruptable = filler_audio.is_interruptable
    # send_filler_audio may be a bare truthy flag rather than a full config.
    if isinstance(
        self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
    ):
        silence_threshold = (
            self.agent.get_agent_config().send_filler_audio.silence_threshold_seconds
        )
    else:
        silence_threshold = FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS
    await asyncio.sleep(silence_threshold)
    self.logger.debug("Sending filler audio to output")
    await self.send_speech_to_output(
        filler_audio.message.text,
        synthesis_result,
        stop_event,
        filler_audio.seconds_per_chunk,
        is_filler_audio=True,
    )
    done_event.set()
def wait_for_filler_audio_to_finish(self):
    """Block until any in-flight filler audio finishes its current chunk.

    No-ops (with a debug log) when no filler chunk was actually sent; otherwise
    waits on the filler's done event, bounded by one chunk's duration.
    """
    if not self.should_wait_for_filler_audio_done_event:
        self.logger.debug(
            "Not waiting for filler audio to finish since we didn't send any chunks"
        )
        return
    self.should_wait_for_filler_audio_done_event = False
    done_event = self.current_filler_audio_done_event
    if done_event is None or done_event.is_set():
        return
    self.logger.debug("Waiting for filler audio to finish")
    # this should guarantee that filler audio finishes, since it has to be on its last chunk
    finished = done_event.wait(self.current_filler_seconds_per_chunk)
    if not finished:
        self.logger.debug("Filler audio did not finish")
# Orchestrates the response to a transcription: records it, kicks off goodbye
# detection and filler audio in the background, then produces the agent's reply
# (streamed generation or single-shot respond()).
async def handle_transcription(self, transcription: Transcription):
if transcription.is_final:
self.transcript.add_human_message(transcription.message)
goodbye_detected_task = None
if self.agent.get_agent_config().end_conversation_on_goodbye:
# Run goodbye detection concurrently; checked with a short timeout below.
goodbye_detected_task = asyncio.create_task(
self.goodbye_model.is_goodbye(transcription.message)
)
if self.agent.get_agent_config().send_filler_audio:
self.logger.debug("Sending filler audio")
if self.synthesizer.filler_audios:
filler_audio = random.choice(self.synthesizer.filler_audios)
self.logger.debug(f"Chose {filler_audio.message.text}")
self.current_filler_audio_done_event = threading.Event()
self.current_filler_seconds_per_chunk = (
filler_audio.seconds_per_chunk
)
stop_event = self.enqueue_stop_event()
# Filler plays on the synthesizer loop while the agent thinks.
asyncio.run_coroutine_threadsafe(
self.send_filler_audio_to_output(
filler_audio,
stop_event,
done_event=self.current_filler_audio_done_event,
),
self.synthesizer_event_loop,
)
else:
self.logger.debug("No filler audio available for synthesizer")
self.logger.debug("Generating response for transcription")
if self.agent.get_agent_config().generate_responses:
# Streaming agent: multiple partial messages; the stream helper handles
# cancelling filler audio before the first message plays.
responses = self.agent.generate_response(
transcription.message, is_interrupt=transcription.is_interrupt
)
await self.send_messages_to_stream_async(
responses,
self.agent.get_agent_config().allow_agent_to_be_cut_off,
wait_for_filler_audio=self.agent.get_agent_config().send_filler_audio,
)
else:
# Single-shot agent: one response plus a stop flag.
response, should_stop = self.agent.respond(
transcription.message, is_interrupt=transcription.is_interrupt
)
if self.agent.get_agent_config().send_filler_audio:
# Cancel any filler before speaking (or terminating).
self.interrupt_all_synthesis()
self.wait_for_filler_audio_to_finish()
if should_stop:
self.logger.debug("Agent requested to stop")
self.mark_terminated()
return
if response:
self.send_message_to_stream_nonblocking(
BaseMessage(text=response),
self.agent.get_agent_config().allow_agent_to_be_cut_off,
)
else:
self.logger.debug("No response generated")
if goodbye_detected_task:
try:
# Give goodbye detection a brief window; don't stall the conversation.
goodbye_detected = await asyncio.wait_for(
goodbye_detected_task, 0.1
)
if goodbye_detected:
self.logger.debug("Goodbye detected, ending conversation")
self.mark_terminated()
return
except asyncio.TimeoutError:
self.logger.debug("Goodbye detection timed out")
# Flip the active flag; background loops (check_for_idle, track_bot_sentiment,
# and the streaming loops) observe it via is_active()/self.active and exit.
def mark_terminated(self):
self.active = False
# must be called from the main thread
# Full teardown: mark inactive, cancel background tasks, stop agent and
# transcriber, then shut down the synthesizer loop/thread. Order matters —
# producers are stopped before the loop/thread they feed.
def terminate(self):
self.mark_terminated()
if self.check_for_idle_task:
self.logger.debug("Terminating check_for_idle Task")
self.check_for_idle_task.cancel()
if self.track_bot_sentiment_task:
self.logger.debug("Terminating track_bot_sentiment Task")
self.track_bot_sentiment_task.cancel()
self.logger.debug("Terminating agent")
self.agent.terminate()
self.logger.debug("Terminating speech transcriber")
self.transcriber.terminate()
self.logger.debug("Terminating synthesizer event loop")
# stop() must be scheduled onto the loop's own thread, hence threadsafe call.
self.synthesizer_event_loop.call_soon_threadsafe(
self.synthesizer_event_loop.stop
)
self.logger.debug("Terminating synthesizer thread")
if self.synthesizer_thread.is_alive():
self.synthesizer_thread.join()
self.logger.debug("Terminating transcriber task")
self.transcriber_task.cancel()
self.logger.debug("Successfully terminated")
# True while the conversation is running; cleared by mark_terminated().
def is_active(self):
return self.active