open source

This commit is contained in:
Ajay Raj 2023-03-28 00:15:34 -07:00
commit a93bfc1ec9
61 changed files with 4013 additions and 126 deletions

View file

@ -1,26 +1,67 @@
import websockets
from websockets.exceptions import ConnectionClosedOK
from websockets.client import WebSocketClientProtocol
import asyncio
from dotenv import load_dotenv
import os
from asyncio import Future
import queue
from typing import Callable, Awaitable, Optional, Any
import logging
import threading
import queue
import vocode
from vocode.streaming.input_device.base_input_device import (
BaseInputDevice,
import time
import secrets
import random
from dotenv import load_dotenv
from vocode.streaming.agent.bot_sentiment_analyser import (
BotSentiment,
BotSentimentAnalyser,
)
from vocode.streaming.agent.information_retrieval_agent import InformationRetrievalAgent
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.websocket import (
ReadyMessage,
AudioMessage,
StartMessage,
StopMessage,
from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber
from vocode.streaming.utils.goodbye_model import GoodbyeModel
from vocode.streaming.utils.transcript import Transcript
from vocode.streaming.models.transcriber import (
TranscriberConfig,
TranscriberType,
)
from vocode.streaming.models.agent import (
AgentConfig,
AgentType,
FillerAudioConfig,
FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS,
)
from vocode.streaming.models.synthesizer import (
SynthesizerConfig,
SynthesizerType,
TrackBotSentimentConfig,
)
from vocode.streaming.models.websocket import AudioMessage
from vocode.streaming.constants import (
TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS,
PER_CHUNK_ALLOWANCE_SECONDS,
ALLOWED_IDLE_TIME,
)
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
SynthesisResult,
FillerAudio,
)
from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.utils import (
create_conversation_id,
create_loop_in_thread,
get_chunk_size_per_second,
)
from vocode.streaming.transcriber.base_transcriber import (
Transcription,
BaseTranscriber,
)
from vocode.streaming.transcriber.google_transcriber import GoogleTranscriber
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
load_dotenv()
@ -28,79 +69,468 @@ load_dotenv()
class StreamingConversation:
# NOTE(review): this constructor region is a corrupted diff interleave. Lines
# from the OLD websocket-client implementation (input_device, *_config
# parameters, receiver_ready, output_audio_queue, vocode_websocket_url, and the
# wait_for_ready/deactivate/play_audio methods) are mixed with the NEW
# transcriber/agent/synthesizer constructor. Kept byte-identical; it must be
# reconstructed from the original commit before it can run (e.g. the signature
# places non-default params after defaults, and `self.id` is assigned twice).
def __init__(
self,
# OLD-version parameters (superseded by transcriber/agent/synthesizer below):
input_device: BaseInputDevice,
output_device: BaseOutputDevice,
transcriber_config: TranscriberConfig,
agent_config: AgentConfig,
synthesizer_config: SynthesizerConfig,
id: str = None,
# NEW-version parameters: pre-built pipeline components.
transcriber: BaseTranscriber,
agent: BaseAgent,
synthesizer: BaseSynthesizer,
conversation_id: str = None,
per_chunk_allowance_seconds: int = PER_CHUNK_ALLOWANCE_SECONDS,
logger: Optional[logging.Logger] = None,
):
self.id = id
self.input_device = input_device
# NEW: generate an id when the caller does not supply one.
self.id = conversation_id or create_conversation_id()
self.logger = logger or logging.getLogger(__name__)
self.output_device = output_device
self.transcriber_config = transcriber_config
self.agent_config = agent_config
self.synthesizer_config = synthesizer_config
self.logger = logging.getLogger(__name__)
self.receiver_ready = False
self.active = True
self.output_loop = asyncio.new_event_loop()
self.output_audio_queue = queue.Queue()
self.vocode_websocket_url = f"wss://{vocode.base_url}/conversation"
# NEW: wire the transcriber callback so transcriptions flow into this class.
self.transcriber = transcriber
self.transcriber.set_on_response(self.on_transcription_response)
self.transcriber_task = None
self.agent = agent
self.synthesizer = synthesizer
# Synthesis runs on its own event loop in a dedicated thread so playback
# does not block the main conversation loop.
self.synthesizer_event_loop = asyncio.new_event_loop()
self.synthesizer_thread = threading.Thread(
name="synthesizer",
target=create_loop_in_thread,
args=(self.synthesizer_event_loop,),
)
self.per_chunk_allowance_seconds = per_chunk_allowance_seconds
self.transcript = Transcript()
self.bot_sentiment = None
# track_bot_sentiment_in_voice may be a bare bool (use defaults) or a full
# TrackBotSentimentConfig object.
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
if isinstance(
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice,
bool,
):
self.track_bot_sentiment_config = TrackBotSentimentConfig()
else:
self.track_bot_sentiment_config = (
self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice
)
self.bot_sentiment_analyser = BotSentimentAnalyser(
emotions=self.track_bot_sentiment_config.emotions
)
self.goodbye_model = GoodbyeModel()
# NOTE(review): the three defs below are OLD-version methods spliced into the
# middle of __init__ by the diff corruption.
async def wait_for_ready(self):
while not self.receiver_ready:
await asyncio.sleep(0.1)
return True
def deactivate(self):
self.is_human_speaking = False
self.active = False
def play_audio(self):
async def run():
while self.active:
try:
audio = self.output_audio_queue.get(timeout=5)
await self.output_device.send_async(audio)
except queue.Empty:
continue
loop = asyncio.new_event_loop()
loop.run_until_complete(run())
# NEW-version __init__ state continues here (interrupt/filler bookkeeping).
self.current_synthesis_task = None
self.is_current_synthesis_interruptable = False
# Pending stop events; interrupt_all_synthesis drains and fires these.
self.stop_events: queue.Queue[threading.Event] = queue.Queue()
self.last_action_timestamp = time.time()
self.check_for_idle_task = None
self.track_bot_sentiment_task = None
self.should_wait_for_filler_audio_done_event = False
self.current_filler_audio_done_event: Optional[threading.Event] = None
self.current_filler_seconds_per_chunk: int = 0
self.current_transcription_is_interrupt: bool = False
# NOTE(review): corrupted diff interleave — the OLD websocket sender/StartMessage
# code is mixed into the NEW start() sequence (transcriber task, synthesizer
# thread, filler audio setup, initial message, background tasks). Parentheses do
# not balance as written; kept byte-identical pending reconstruction.
async def start(self):
async with websockets.connect(
f"{self.vocode_websocket_url}?key={vocode.api_key}"
) as ws:
# OLD: sender coroutine that streamed StartMessage + mic audio to the server.
async def sender(ws: WebSocketClientProtocol):
start_message = StartMessage(
transcriber_config=self.transcriber_config,
agent_config=self.agent_config,
synthesizer_config=self.synthesizer_config,
conversation_id=self.id,
# NEW: launch the transcriber and fail fast if it cannot start.
self.transcriber_task = asyncio.create_task(self.transcriber.run())
is_ready = await self.transcriber.ready()
if not is_ready:
raise Exception("Transcriber startup failed")
self.synthesizer_thread.start()
# NEW: send_filler_audio may be a bare truthy flag or a FillerAudioConfig.
if self.agent.get_agent_config().send_filler_audio:
filler_audio_config = (
self.agent.get_agent_config().send_filler_audio
if isinstance(
self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
)
await ws.send(start_message.json())
await self.wait_for_ready()
self.logger.info("Listening...press Ctrl+C to stop")
while self.active:
data = self.input_device.get_audio()
if data:
try:
await ws.send(AudioMessage.from_bytes(data).json())
except ConnectionClosedOK:
self.deactivate()
return
else FillerAudioConfig()
)
self.synthesizer.set_filler_audios(filler_audio_config)
self.agent.start()
# NEW: record and speak the configured initial message, non-interruptible.
if self.agent.get_agent_config().initial_message:
self.transcript.add_bot_message(
self.agent.get_agent_config().initial_message.text
)
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
self.update_bot_sentiment()
self.send_message_to_stream_nonblocking(
self.agent.get_agent_config().initial_message, False
)
self.active = True
# NEW: background tasks for sentiment tracking and idle timeout.
if self.synthesizer.get_synthesizer_config().track_bot_sentiment_in_voice:
self.track_bot_sentiment_task = asyncio.create_task(
self.track_bot_sentiment()
)
self.check_for_idle_task = asyncio.create_task(self.check_for_idle())
async def check_for_idle(self):
    """Background watchdog: terminate the conversation once it has been idle too long.

    Compares the time since the last recorded action against the agent's
    configured idle limit (falling back to ALLOWED_IDLE_TIME), re-checking
    every 15 seconds while the conversation is active.
    """
    while self.is_active():
        idle_limit = (
            self.agent.get_agent_config().allowed_idle_time_seconds
            or ALLOWED_IDLE_TIME
        )
        elapsed = time.time() - self.last_action_timestamp
        if elapsed > idle_limit:
            self.logger.debug("Conversation idle for too long, terminating")
            self.mark_terminated()
            return
        await asyncio.sleep(15)
async def track_bot_sentiment(self):
    """Poll the transcript once per second and refresh bot sentiment when it changes.

    Fix: the transcript was previously serialized twice per changed iteration
    (once for the comparison, again for the snapshot), wasting work and racing
    with concurrent transcript appends — an append landing between the two
    reads could be recorded as already-seen and its sentiment update skipped.
    The transcript is now read exactly once per iteration.
    """
    prev_transcript = None
    while self.is_active():
        await asyncio.sleep(1)
        current_transcript = self.transcript.to_string()
        if current_transcript != prev_transcript:
            self.update_bot_sentiment()
            prev_transcript = current_transcript
def update_bot_sentiment(self):
    """Analyse the current transcript and store the result.

    The stored sentiment is only replaced when the analyser detects an
    emotion; a neutral result leaves the previous sentiment in place.
    """
    sentiment = self.bot_sentiment_analyser.analyse(self.transcript.to_string())
    if sentiment.emotion:
        self.logger.debug("Bot sentiment: %s", sentiment)
        self.bot_sentiment = sentiment
# Entry point for raw caller audio: forward each chunk to the transcriber.
def receive_audio(self, chunk: bytes):
self.transcriber.send_audio(chunk)
# NOTE(review): corrupted diff interleave — the OLD websocket `receiver` closure,
# a stray `ws.send(StopMessage...)`, and the OLD `output_thread`/gather lines are
# embedded inside the NEW producer/consumer implementation. Kept byte-identical.
# Intended NEW design: the caller feeds agent-generated messages into
# messages_queue while send_to_call (running on the synthesizer loop) drains the
# queue, synthesizes each message, and streams it to the output device.
async def send_messages_to_stream_async(
self,
messages,
should_allow_human_to_cut_off_bot: bool,
wait_for_filler_audio: bool = False,
) -> tuple[str, bool]:
messages_queue = queue.Queue()
messages_done = threading.Event()
speech_cut_off = threading.Event()
seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
chunk_size = (
get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
* seconds_per_chunk
)
# Consumer: runs on the synthesizer event loop, draining messages_queue.
async def send_to_call():
response_buffer = ""
cut_off = False
self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
while True:
try:
message: BaseMessage = messages_queue.get_nowait()
except queue.Empty:
if messages_done.is_set():
break
else:
await asyncio.sleep(0)
# NOTE(review): OLD-version line below does not belong here (no `ws` in scope).
await ws.send(StopMessage().json())
continue
# NOTE(review): OLD-version receiver closure spliced in by the diff corruption.
async def receiver(ws: WebSocketClientProtocol):
ReadyMessage.parse_raw(await ws.recv())
self.receiver_ready = True
async for msg in ws:
audio_message = AudioMessage.parse_raw(msg)
self.output_audio_queue.put_nowait(audio_message.get_bytes())
# Register a stop event so an interrupt can cancel this message mid-stream.
stop_event = self.enqueue_stop_event()
synthesis_result = self.synthesizer.create_speech(
message, chunk_size, bot_sentiment=self.bot_sentiment
)
message_sent, cut_off = await self.send_speech_to_output(
message.text,
synthesis_result,
stop_event,
seconds_per_chunk,
)
self.logger.debug("Message sent: {}".format(message_sent))
response_buffer = f"{response_buffer} {message_sent}"
if cut_off:
speech_cut_off.set()
break
await asyncio.sleep(0)
if cut_off:
self.agent.update_last_bot_message_on_cut_off(response_buffer)
self.transcript.add_bot_message(response_buffer)
return response_buffer, cut_off
# NOTE(review): OLD-version playback thread / gather lines spliced in here.
output_thread = threading.Thread(target=self.play_audio)
output_thread.start()
return await asyncio.gather(sender(ws), receiver(ws))
asyncio.run_coroutine_threadsafe(send_to_call(), self.synthesizer_event_loop)
# Producer: enumerate agent messages; before the first one, cancel any filler
# audio and wait for it to drain so speech does not overlap.
messages_generated = 0
for i, message in enumerate(messages):
messages_generated += 1
if i == 0:
if wait_for_filler_audio:
self.interrupt_all_synthesis()
self.wait_for_filler_audio_to_finish()
if speech_cut_off.is_set():
break
messages_queue.put_nowait(BaseMessage(text=message))
await asyncio.sleep(0)
if messages_generated == 0:
self.logger.debug("Agent generated no messages")
if wait_for_filler_audio:
self.interrupt_all_synthesis()
messages_done.set()
def send_message_to_stream_nonblocking(
    self,
    message: BaseMessage,
    should_allow_human_to_cut_off_bot: bool,
):
    """Schedule synthesis/playback of *message* on the synthesizer loop without blocking.

    Fix: the ``should_allow_human_to_cut_off_bot`` argument was accepted but
    ignored — the agent config's ``allow_agent_to_be_cut_off`` was always used
    instead, so a caller passing ``False`` (e.g. for the initial greeting)
    could not actually make the message non-interruptible. The argument is now
    forwarded to ``send_message_to_stream_async``.
    """
    asyncio.run_coroutine_threadsafe(
        self.send_message_to_stream_async(
            message,
            should_allow_human_to_cut_off_bot,
        ),
        self.synthesizer_event_loop,
    )
async def send_message_to_stream_async(
    self,
    message: BaseMessage,
    should_allow_human_to_cut_off_bot: bool,
) -> tuple[str, bool]:
    """Synthesize *message* and stream it to the output device.

    Returns (text actually spoken, whether playback was cut off). The spoken
    text is appended to the transcript; on a cut-off the agent is told what
    the human actually heard.
    """
    self.is_current_synthesis_interruptable = should_allow_human_to_cut_off_bot
    # Register the stop event before synthesis begins so an interrupt arriving
    # mid-synthesis can still cancel this message.
    stop_event = self.enqueue_stop_event()
    self.logger.debug("Synthesizing speech for message")
    seconds_per_chunk = TEXT_TO_SPEECH_CHUNK_SIZE_SECONDS
    synth_config = self.synthesizer.get_synthesizer_config()
    chunk_size = seconds_per_chunk * get_chunk_size_per_second(
        synth_config.audio_encoding,
        synth_config.sampling_rate,
    )
    synthesis_result = self.synthesizer.create_speech(
        message, chunk_size, bot_sentiment=self.bot_sentiment
    )
    message_sent, cut_off = await self.send_speech_to_output(
        message.text,
        synthesis_result,
        stop_event,
        seconds_per_chunk,
    )
    self.logger.debug("Message sent: {}".format(message_sent))
    if cut_off:
        # Keep the agent's conversation state consistent with what was heard.
        self.agent.update_last_bot_message_on_cut_off(message_sent)
    self.transcript.add_bot_message(message_sent)
    return message_sent, cut_off
# Pre-initialize the synthesizer so the first real message avoids cold-start latency.
def warmup_synthesizer(self):
self.synthesizer.ready_synthesizer()
# returns an estimate of what was sent up to, and a flag if the message was cut off
# Real-time pacing loop: each synthesized chunk is sent, then we sleep roughly
# the chunk's playback duration (minus send time and a per-chunk allowance) so
# chunks arrive at playback speed and a stop_event can take effect between them.
async def send_speech_to_output(
self,
message,
synthesis_result: SynthesisResult,
stop_event: threading.Event,
seconds_per_chunk: int,
is_filler_audio: bool = False,
):
message_sent = message
cut_off = False
chunk_size = seconds_per_chunk * get_chunk_size_per_second(
self.synthesizer.get_synthesizer_config().audio_encoding,
self.synthesizer.get_synthesizer_config().sampling_rate,
)
for i, chunk_result in enumerate(synthesis_result.chunk_generator):
start_time = time.time()
# Actual playback duration of this chunk (last chunk may be shorter).
speech_length_seconds = seconds_per_chunk * (
len(chunk_result.chunk) / chunk_size
)
if stop_event.is_set():
# Interrupted: estimate how much text was spoken from chunks sent so far.
seconds = i * seconds_per_chunk
self.logger.debug(
"Interrupted, stopping text to speech after {} chunks".format(i)
)
message_sent = f"{synthesis_result.get_message_up_to(seconds)}-"
cut_off = True
break
if i == 0:
if is_filler_audio:
# First filler chunk went out, so callers must now wait for done_event.
self.should_wait_for_filler_audio_done_event = True
await self.output_device.send_async(chunk_result.chunk)
end_time = time.time()
# Sleep the remaining playback time so we don't flood the output device.
await asyncio.sleep(
max(
speech_length_seconds
- (end_time - start_time)
- self.per_chunk_allowance_seconds,
0,
)
)
self.logger.debug(
"Sent chunk {} with size {}".format(i, len(chunk_result.chunk))
)
self.last_action_timestamp = time.time()
# clears it off the stop events queue
# (interrupt_all_synthesis skips already-set events, so setting it here marks
# this message's event as consumed)
if not stop_event.is_set():
stop_event.set()
return message_sent, cut_off
# Transcriber callback: tracks whether the human is speaking, fires interrupts
# when they start talking over interruptible bot speech, then delegates to
# handle_transcription.
# NOTE(review): self.is_human_speaking is read here but its initialization is
# not visible in this (corrupted) view — only deactivate() and the assignment
# below set it; confirm it is initialized before the first transcription.
async def on_transcription_response(self, transcription: Transcription):
self.last_action_timestamp = time.time()
if transcription.is_final:
self.logger.debug(
"Got transcription: {}, confidence: {}".format(
transcription.message, transcription.confidence
)
)
# Human just started speaking (was not speaking before this event).
if not self.is_human_speaking:
# send interrupt
self.current_transcription_is_interrupt = False
if self.is_current_synthesis_interruptable:
self.logger.debug("sending interrupt")
# True if any in-flight synthesis was actually cancelled.
self.current_transcription_is_interrupt = self.interrupt_all_synthesis()
self.logger.debug("Human started speaking")
transcription.is_interrupt = self.current_transcription_is_interrupt
# A final transcription means the human has stopped speaking.
self.is_human_speaking = not transcription.is_final
return await self.handle_transcription(transcription)
def enqueue_stop_event(self):
    """Create a fresh stop event, register it for later interruption, and return it."""
    event = threading.Event()
    self.stop_events.put_nowait(event)
    return event
def interrupt_all_synthesis(self):
    """Drain the stop-event queue, firing every event not already set.

    Returns True if at least one in-flight synthesis was interrupted
    (an already-set event counts as consumed, not interrupted).
    """
    interrupted_any = False
    while True:
        try:
            pending = self.stop_events.get_nowait()
        except queue.Empty:
            return interrupted_any
        if not pending.is_set():
            self.logger.debug("Interrupting synthesis")
            pending.set()
            interrupted_any = True
async def send_filler_audio_to_output(
    self,
    filler_audio: FillerAudio,
    stop_event: threading.Event,
    done_event: threading.Event,
):
    """Play a filler phrase after a short silence, setting *done_event* when finished.

    The silence threshold gives the human a chance to keep talking before the
    filler starts; *stop_event* lets a real response cancel the filler.
    """
    synthesis_result = filler_audio.create_synthesis_result()
    self.is_current_synthesis_interruptable = filler_audio.is_interruptable
    # send_filler_audio may be a bare truthy flag rather than a full config.
    if isinstance(
        self.agent.get_agent_config().send_filler_audio, FillerAudioConfig
    ):
        silence_threshold = (
            self.agent.get_agent_config().send_filler_audio.silence_threshold_seconds
        )
    else:
        silence_threshold = FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS
    await asyncio.sleep(silence_threshold)
    self.logger.debug("Sending filler audio to output")
    await self.send_speech_to_output(
        filler_audio.message.text,
        synthesis_result,
        stop_event,
        filler_audio.seconds_per_chunk,
        is_filler_audio=True,
    )
    done_event.set()
def wait_for_filler_audio_to_finish(self):
    """Block until any in-flight filler audio finishes its current chunk.

    No-ops (with a debug log) when no filler chunk was actually sent; otherwise
    waits on the filler's done event, bounded by one chunk's duration.
    """
    if not self.should_wait_for_filler_audio_done_event:
        self.logger.debug(
            "Not waiting for filler audio to finish since we didn't send any chunks"
        )
        return
    self.should_wait_for_filler_audio_done_event = False
    done_event = self.current_filler_audio_done_event
    if done_event is None or done_event.is_set():
        return
    self.logger.debug("Waiting for filler audio to finish")
    # this should guarantee that filler audio finishes, since it has to be on its last chunk
    finished = done_event.wait(self.current_filler_seconds_per_chunk)
    if not finished:
        self.logger.debug("Filler audio did not finish")
# Orchestrates the response to a transcription: records it, kicks off goodbye
# detection and filler audio in the background, then produces the agent's reply
# (streamed generation or single-shot respond()).
async def handle_transcription(self, transcription: Transcription):
if transcription.is_final:
self.transcript.add_human_message(transcription.message)
goodbye_detected_task = None
if self.agent.get_agent_config().end_conversation_on_goodbye:
# Run goodbye detection concurrently; checked with a short timeout below.
goodbye_detected_task = asyncio.create_task(
self.goodbye_model.is_goodbye(transcription.message)
)
if self.agent.get_agent_config().send_filler_audio:
self.logger.debug("Sending filler audio")
if self.synthesizer.filler_audios:
filler_audio = random.choice(self.synthesizer.filler_audios)
self.logger.debug(f"Chose {filler_audio.message.text}")
self.current_filler_audio_done_event = threading.Event()
self.current_filler_seconds_per_chunk = (
filler_audio.seconds_per_chunk
)
stop_event = self.enqueue_stop_event()
# Filler plays on the synthesizer loop while the agent thinks.
asyncio.run_coroutine_threadsafe(
self.send_filler_audio_to_output(
filler_audio,
stop_event,
done_event=self.current_filler_audio_done_event,
),
self.synthesizer_event_loop,
)
else:
self.logger.debug("No filler audio available for synthesizer")
self.logger.debug("Generating response for transcription")
if self.agent.get_agent_config().generate_responses:
# Streaming agent: multiple partial messages; the stream helper handles
# cancelling filler audio before the first message plays.
responses = self.agent.generate_response(
transcription.message, is_interrupt=transcription.is_interrupt
)
await self.send_messages_to_stream_async(
responses,
self.agent.get_agent_config().allow_agent_to_be_cut_off,
wait_for_filler_audio=self.agent.get_agent_config().send_filler_audio,
)
else:
# Single-shot agent: one response plus a stop flag.
response, should_stop = self.agent.respond(
transcription.message, is_interrupt=transcription.is_interrupt
)
if self.agent.get_agent_config().send_filler_audio:
# Cancel any filler before speaking (or terminating).
self.interrupt_all_synthesis()
self.wait_for_filler_audio_to_finish()
if should_stop:
self.logger.debug("Agent requested to stop")
self.mark_terminated()
return
if response:
self.send_message_to_stream_nonblocking(
BaseMessage(text=response),
self.agent.get_agent_config().allow_agent_to_be_cut_off,
)
else:
self.logger.debug("No response generated")
if goodbye_detected_task:
try:
# Give goodbye detection a brief window; don't stall the conversation.
goodbye_detected = await asyncio.wait_for(
goodbye_detected_task, 0.1
)
if goodbye_detected:
self.logger.debug("Goodbye detected, ending conversation")
self.mark_terminated()
return
except asyncio.TimeoutError:
self.logger.debug("Goodbye detection timed out")
# Flip the active flag; background loops (check_for_idle, track_bot_sentiment,
# and the streaming loops) observe it via is_active()/self.active and exit.
def mark_terminated(self):
self.active = False
# must be called from the main thread
# Full teardown: mark inactive, cancel background tasks, stop agent and
# transcriber, then shut down the synthesizer loop/thread. Order matters —
# producers are stopped before the loop/thread they feed.
def terminate(self):
self.mark_terminated()
if self.check_for_idle_task:
self.logger.debug("Terminating check_for_idle Task")
self.check_for_idle_task.cancel()
if self.track_bot_sentiment_task:
self.logger.debug("Terminating track_bot_sentiment Task")
self.track_bot_sentiment_task.cancel()
self.logger.debug("Terminating agent")
self.agent.terminate()
self.logger.debug("Terminating speech transcriber")
self.transcriber.terminate()
self.logger.debug("Terminating synthesizer event loop")
# stop() must be scheduled onto the loop's own thread, hence threadsafe call.
self.synthesizer_event_loop.call_soon_threadsafe(
self.synthesizer_event_loop.stop
)
self.logger.debug("Terminating synthesizer thread")
if self.synthesizer_thread.is_alive():
self.synthesizer_thread.join()
self.logger.debug("Terminating transcriber task")
self.transcriber_task.cancel()
self.logger.debug("Successfully terminated")
# True while the conversation is running; cleared by mark_terminated().
def is_active(self):
return self.active