# vocode-python/vocode/streaming/streaming_conversation.py

import asyncio
from asyncio import Future
import queue
from typing import Any, Awaitable, Callable, Optional, Union
import logging
import threading
import time
import secrets
import random
from vocode.streaming.agent.bot_sentiment_analyser import (
BotSentiment,
BotSentimentAnalyser,
)
from vocode.streaming.agent.information_retrieval_agent import InformationRetrievalAgent
from vocode.streaming.factory import (
create_agent,
create_synthesizer,
create_transcriber,
)
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber
from vocode.streaming.utils.goodbye_model import GoodbyeModel
from vocode.streaming.utils.transcript import Transcript
from vocode.streaming.models.transcriber import (
TranscriberConfig,
TranscriberType,
)
from vocode.streaming.models.agent import (
AgentConfig,
AgentType,
FillerAudioConfig,
FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS,
)
from vocode.streaming.models.synthesizer import (
SynthesizerConfig,
SynthesizerType,
TrackBotSentimentConfig,
)
from vocode.streaming.models.websocket import AudioMessage
from vocode.streaming.constants import (
TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS,
PER_CHUNK_ALLOWANCE_SECONDS,
ALLOWED_IDLE_TIME,
)
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
SynthesisResult,
FillerAudio,
)
from vocode.streaming.utils import (
create_conversation_id,
create_loop_in_thread,
get_chunk_size_per_second,
)
from vocode.streaming.transcriber.base_transcriber import (
Transcription,
BaseTranscriber,
)


class StreamingConversation:
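    """Orchestrates a real-time voice conversation.

    Audio flows transcriber -> agent -> synthesizer -> output device: user
    audio is transcribed, the agent produces a text response, and the
    response is synthesized and streamed back out, with support for
    interruptions, filler audio, and bot sentiment tracking.
    """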
def __init__(
self,
output_device: BaseOutputDevice,
transcriber: BaseTranscriber,
agent: BaseAgent,
synthesizer: BaseSynthesizer,
        conversation_id: Optional[str] = None,
        per_chunk_allowance_seconds: float = PER_CHUNK_ALLOWANCE_SECONDS,
logger: Optional[logging.Logger] = None,
):
self.id = conversation_id or create_conversation_id()
self.logger = logger or logging.getLogger(__name__)
self.output_device = output_device
self.transcriber = transcriber
self.transcriber.set_on_response(self.on_transcription_response)
self.transcriber_task = None
self.agent = agent
self.synthesizer = synthesizer
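        # Synthesis runs on a dedicated event loop in its own thread so that
        # speech generation and playback never block transcription or agent
        # processing.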
self.synthesizer_event_loop = asyncio.new_event_loop()
self.synthesizer_thread = threading.Thread(
name="synthesizer",
target=create_loop_in_thread,
args=(self.synthesizer_event_loop,),
)
self.per_chunk_allowance_seconds = per_chunk_allowance_seconds
self.transcript = Transcript()
self.bot_sentiment = None
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
if isinstance(
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice,
bool,
):
self.track_bot_sentiment_config = TrackBotSentimentConfig()
else:
self.track_bot_sentiment_config = (
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice
)
self.bot_sentiment_analyser = BotSentimentAnalyser(
emotions=self.track_bot_sentiment_config.emotions
)
self.goodbye_model = GoodbyeModel()
self.is_human_speaking = False
self.active = False
self.current_synthesis_task = None
self.is_current_synthesis_interruptable = False
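        # Every in-flight synthesis registers a stop event on this queue;
        # setting an event interrupts the corresponding playback (see
        # enqueue_stop_event and interrupt_all_synthesis).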
self.stop_events: queue.Queue[threading.Event] = queue.Queue()
self.last_action_timestamp = time.time()
self.check_for_idle_task = None
self.track_bot_sentiment_task = None
self.should_wait_for_filler_audio_done_event = False
self.current_filler_audio_done_event: Optional[threading.Event] = None
self.current_filler_seconds_per_chunk: int = 0
self.current_transcription_is_interrupt: bool = False

    async def start(self):
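        """Starts the transcriber, synthesizer thread, and agent, sends any
        configured initial message, and launches the background tasks."""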
self.transcriber_task = asyncio.create_task(self.transcriber.run())
is_ready = await self.transcriber.ready()
if not is_ready:
raise Exception("Transcriber startup failed")
self.synthesizer_thread.start()
if self.agent.get_agent_config().send_filler_audio:
filler_audio_config = (
self.agent.get_agent_config().send_filler_audio
if isinstance(
self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
)
else FillerAudioConfig()
)
self.synthesizer.set_filler_audios(filler_audio_config)
self.agent.start()
if self.agent.get_agent_config().initial_message:
self.transcript.add_bot_message(
self.agent.get_agent_config().initial_message.text
)
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
self.update_bot_sentiment()
self.send_message_to_stream_nonblocking(
self.agent.get_agent_config().initial_message, False
)
self.active = True
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
self.track_bot_sentiment_task = asyncio.create_task(
self.track_bot_sentiment()
)
self.check_for_idle_task = asyncio.create_task(self.check_for_idle())

    async def check_for_idle(self):
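        """Ends the conversation if no activity occurs within the allowed
        idle time."""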
while self.is_active():
if time.time() - self.last_action_timestamp > (
self.agent.get_agent_config().allowed_idle_time_seconds
or ALLOWED_IDLE_TIME
):
self.logger.debug("Conversation idle for too long, terminating")
self.mark_terminated()
return
await asyncio.sleep(15)

    async def track_bot_sentiment(self):
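        """Re-analyses bot sentiment whenever the transcript changes."""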
prev_transcript = None
while self.is_active():
await asyncio.sleep(1)
if self.transcript.to_string() != prev_transcript:
self.update_bot_sentiment()
prev_transcript = self.transcript.to_string()

    def update_bot_sentiment(self):
new_bot_sentiment = self.bot_sentiment_analyser.analyse(
self.transcript.to_string()
)
if new_bot_sentiment.emotion:
self.logger.debug("Bot sentiment: %s", new_bot_sentiment)
self.bot_sentiment = new_bot_sentiment

    def receive_audio(self, chunk: bytes):
self.transcriber.send_audio(chunk)

    async def send_messages_to_stream_async(
self,
messages,
should_allow_human_to_cut_off_bot: bool,
wait_for_filler_audio: bool = False,
    ) -> None:
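        """Streams agent-generated messages to the output device.

        Messages are pushed onto a queue as the agent yields them and are
        consumed by send_to_call on the synthesizer event loop, so synthesis
        of one message can overlap generation of the next.
        """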
messages_queue = queue.Queue()
messages_done = threading.Event()
speech_cut_off = threading.Event()
seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
chunk_size = (
get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
* seconds_per_chunk
)

        async def send_to_call():
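            # Consumer side: pull messages off the queue in order, synthesize
            # each one, and stop early if playback is interrupted.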
response_buffer = ""
cut_off = False
self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
while True:
try:
message: BaseMessage = messages_queue.get_nowait()
except queue.Empty:
if messages_done.is_set():
break
else:
await asyncio.sleep(0)
continue
stop_event = self.enqueue_stop_event()
synthesis_result = self.synthesizer.create_speech(
message, chunk_size, bot_sentiment=self.bot_sentiment
)
message_sent, cut_off = await self.send_speech_to_output(
message.text,
synthesis_result,
stop_event,
seconds_per_chunk,
)
self.logger.debug("Message sent: {}".format(message_sent))
response_buffer = f"{response_buffer} {message_sent}"
if cut_off:
speech_cut_off.set()
break
await asyncio.sleep(0)
if cut_off:
self.agent.update_last_bot_message_on_cut_off(response_buffer)
self.transcript.add_bot_message(response_buffer)
return response_buffer, cut_off
asyncio.run_coroutine_threadsafe(send_to_call(), self.synthesizer_event_loop)
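        # Producer side: enqueue agent output as it is generated; before the
        # first message, flush any filler audio that is still playing.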
messages_generated = 0
for i, message in enumerate(messages):
messages_generated += 1
if i == 0:
if wait_for_filler_audio:
self.interrupt_all_synthesis()
self.wait_for_filler_audio_to_finish()
if speech_cut_off.is_set():
break
messages_queue.put_nowait(BaseMessage(text=message))
await asyncio.sleep(0)
if messages_generated == 0:
self.logger.debug("Agent generated no messages")
if wait_for_filler_audio:
self.interrupt_all_synthesis()
messages_done.set()

    def send_message_to_stream_nonblocking(
self,
message: BaseMessage,
should_allow_human_to_cut_off_bot: bool,
):
asyncio.run_coroutine_threadsafe(
self.send_message_to_stream_async(
message,
                should_allow_human_to_cut_off_bot,
),
self.synthesizer_event_loop,
)

    async def send_message_to_stream_async(
self,
message: BaseMessage,
should_allow_human_to_cut_off_bot: bool,
) -> tuple[str, bool]:
self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
stop_event = self.enqueue_stop_event()
self.logger.debug("Synthesizing speech for message")
seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
chunk_size = (
get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
* seconds_per_chunk
)
synthesis_result = self.synthesizer.create_speech(
message, chunk_size, bot_sentiment=self.bot_sentiment
)
message_sent, cut_off = await self.send_speech_to_output(
message.text,
synthesis_result,
stop_event,
seconds_per_chunk,
)
self.logger.debug("Message sent: {}".format(message_sent))
if cut_off:
self.agent.update_last_bot_message_on_cut_off(message_sent)
self.transcript.add_bot_message(message_sent)
return message_sent, cut_off

    def warmup_synthesizer(self):
self.synthesizer.ready_synthesizer()

    # returns an estimate of the message text that was actually sent, plus a
    # flag indicating whether playback was cut off
async def send_speech_to_output(
self,
message,
synthesis_result: SynthesisResult,
stop_event: threading.Event,
seconds_per_chunk: int,
is_filler_audio: bool = False,
):
message_sent = message
cut_off = False
chunk_size = seconds_per_chunk * get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
for i, chunk_result in enumerate(synthesis_result.chunk_generator):
start_time = time.time()
speech_length_seconds = seconds_per_chunk * (
len(chunk_result.chunk) / chunk_size
)
if stop_event.is_set():
seconds = i * seconds_per_chunk
self.logger.debug(
"Interrupted, stopping text to speech after {} chunks".format(i)
)
message_sent = f"{synthesis_result.get_message_up_to(seconds)}-"
cut_off = True
break
            if i == 0 and is_filler_audio:
                self.should_wait_for_filler_audio_done_event = True
await self.output_device.send_async(chunk_result.chunk)
end_time = time.time()
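            # Pace output to real time: sleep for the chunk's audio duration,
            # minus the time spent sending it and a small per-chunk allowance
            # so the next chunk is ready slightly early.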
await asyncio.sleep(
max(
speech_length_seconds
- (end_time - start_time)
- self.per_chunk_allowance_seconds,
0,
)
)
self.logger.debug(
"Sent chunk {} with size {}".format(i, len(chunk_result.chunk))
)
self.last_action_timestamp = time.time()
        # mark this synthesis as finished so interrupt_all_synthesis will not
        # count its leftover stop event as an interruption
if not stop_event.is_set():
stop_event.set()
return message_sent, cut_off

    async def on_transcription_response(self, transcription: Transcription):
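        """Transcriber callback: interrupts in-flight bot speech when the
        human starts talking and routes final transcriptions to
        handle_transcription."""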
self.last_action_timestamp = time.time()
if transcription.is_final:
self.logger.debug(
"Got transcription: {}, confidence: {}".format(
transcription.message, transcription.confidence
)
)
if not self.is_human_speaking:
            # the human just started speaking, so interrupt any bot speech in flight
            self.current_transcription_is_interrupt = False
            if self.is_current_synthesis_interruptable:
                self.logger.debug("Sending interrupt")
                self.current_transcription_is_interrupt = (
                    self.interrupt_all_synthesis()
                )
self.logger.debug("Human started speaking")
transcription.is_interrupt = self.current_transcription_is_interrupt
self.is_human_speaking = not transcription.is_final
return await self.handle_transcription(transcription)

    def enqueue_stop_event(self):
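        """Creates and registers a stop event for a new synthesis so that
        interrupt_all_synthesis can cancel its playback."""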
stop_event = threading.Event()
self.stop_events.put_nowait(stop_event)
return stop_event

    def interrupt_all_synthesis(self):
"""Returns true if any synthesis was interrupted"""
num_interrupts = 0
while True:
try:
stop_event = self.stop_events.get_nowait()
if not stop_event.is_set():
self.logger.debug("Interrupting synthesis")
stop_event.set()
num_interrupts += 1
except queue.Empty:
break
return num_interrupts > 0

    async def send_filler_audio_to_output(
self,
filler_audio: FillerAudio,
stop_event: threading.Event,
done_event: threading.Event,
):
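        """Waits out the configured silence threshold, then plays a filler
        clip; done_event signals completion to
        wait_for_filler_audio_to_finish."""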
filler_synthesis_result = filler_audio.create_synthesis_result()
self.is_current_synthesis_interruptable = filler_audio.is_interruptable
if isinstance(
self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
):
silence_threshold = (
self.agent.get_agent_config().send_filler_audio.silence_threshold_seconds
)
else:
silence_threshold = FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS
await asyncio.sleep(silence_threshold)
self.logger.debug("Sending filler audio to output")
await self.send_speech_to_output(
filler_audio.message.text,
filler_synthesis_result,
stop_event,
filler_audio.seconds_per_chunk,
is_filler_audio=True,
)
done_event.set()

    def wait_for_filler_audio_to_finish(self):
if not self.should_wait_for_filler_audio_done_event:
self.logger.debug(
"Not waiting for filler audio to finish since we didn't send any chunks"
)
return
self.should_wait_for_filler_audio_done_event = False
if (
self.current_filler_audio_done_event
and not self.current_filler_audio_done_event.is_set()
):
self.logger.debug("Waiting for filler audio to finish")
            # waiting one chunk's duration should be enough, since the filler
            # audio must already be on its final chunk
if not self.current_filler_audio_done_event.wait(
self.current_filler_seconds_per_chunk
):
self.logger.debug("Filler audio did not finish")

    async def handle_transcription(self, transcription: Transcription):
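        """Records the human message, kicks off goodbye detection and filler
        audio, then generates and streams the agent's response."""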
if transcription.is_final:
self.transcript.add_human_message(transcription.message)
goodbye_detected_task = None
if self.agent.get_agent_config().end_conversation_on_goodbye:
goodbye_detected_task = asyncio.create_task(
self.goodbye_model.is_goodbye(transcription.message)
)
if self.agent.get_agent_config().send_filler_audio:
self.logger.debug("Sending filler audio")
if self.synthesizer.filler_audios:
filler_audio = random.choice(self.synthesizer.filler_audios)
self.logger.debug(f"Chose {filler_audio.message.text}")
self.current_filler_audio_done_event = threading.Event()
self.current_filler_seconds_per_chunk = (
filler_audio.seconds_per_chunk
)
stop_event = self.enqueue_stop_event()
asyncio.run_coroutine_threadsafe(
self.send_filler_audio_to_output(
filler_audio,
stop_event,
done_event=self.current_filler_audio_done_event,
),
self.synthesizer_event_loop,
)
else:
self.logger.debug("No filler audio available for synthesizer")
self.logger.debug("Generating response for transcription")
if self.agent.get_agent_config().generate_responses:
responses = self.agent.generate_response(
transcription.message, is_interrupt=transcription.is_interrupt
)
await self.send_messages_to_stream_async(
responses,
self.agent.get_agent_config().allow_agent_to_be_cut_off,
wait_for_filler_audio=self.agent.get_agent_config().send_filler_audio,
)
else:
response, should_stop = self.agent.respond(
transcription.message, is_interrupt=transcription.is_interrupt
)
if self.agent.get_agent_config().send_filler_audio:
self.interrupt_all_synthesis()
self.wait_for_filler_audio_to_finish()
if should_stop:
self.logger.debug("Agent requested to stop")
self.mark_terminated()
return
if response:
self.send_message_to_stream_nonblocking(
BaseMessage(text=response),
self.agent.get_agent_config().allow_agent_to_be_cut_off,
)
else:
self.logger.debug("No response generated")
if goodbye_detected_task:
try:
goodbye_detected = await asyncio.wait_for(
goodbye_detected_task, 0.1
)
if goodbye_detected:
self.logger.debug("Goodbye detected, ending conversation")
self.mark_terminated()
return
except asyncio.TimeoutError:
self.logger.debug("Goodbye detection timed out")

    def mark_terminated(self):
self.active = False

    # must be called from the main thread
def terminate(self):
self.mark_terminated()
if self.check_for_idle_task:
self.logger.debug("Terminating check_for_idle Task")
self.check_for_idle_task.cancel()
if self.track_bot_sentiment_task:
self.logger.debug("Terminating track_bot_sentiment Task")
self.track_bot_sentiment_task.cancel()
self.logger.debug("Terminating agent")
self.agent.terminate()
self.logger.debug("Terminating speech transcriber")
self.transcriber.terminate()
self.logger.debug("Terminating synthesizer event loop")
self.synthesizer_event_loop.call_soon_threadsafe(
self.synthesizer_event_loop.stop
)
self.logger.debug("Terminating synthesizer thread")
if self.synthesizer_thread.is_alive():
self.synthesizer_thread.join()
self.logger.debug("Terminating transcriber task")
self.transcriber_task.cancel()
self.logger.debug("Successfully terminated")

    def is_active(self):
return self.active
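

# A minimal usage sketch (hypothetical wiring; `my_transcriber_config`,
# `my_agent_config`, `my_synthesizer_config`, `my_output_device`, and
# `my_audio_source` are placeholders, not part of this module):
#
#     async def main():
#         conversation = StreamingConversation(
#             output_device=my_output_device,
#             transcriber=create_transcriber(my_transcriber_config),
#             agent=create_agent(my_agent_config),
#             synthesizer=create_synthesizer(my_synthesizer_config),
#         )
#         await conversation.start()
#         while conversation.is_active():
#             conversation.receive_audio(my_audio_source.read())
#         conversation.terminate()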