First pass at turn-based conversation

This commit is contained in:
Ajay Raj 2023-03-20 15:37:23 -07:00
commit 518a0f2b53
40 changed files with 503 additions and 99 deletions

View file

@ -0,0 +1,9 @@
from typing import Optional
class BaseAgent:
    """Abstract interface for one side of a turn-based conversation.

    Subclasses implement `respond` to map a single human utterance to a
    single agent reply.
    """

    def __init__(self, initial_message: Optional[str] = None):
        # Optional greeting the agent speaks before the first human turn.
        self.initial_message = initial_message

    def respond(self, human_input: str):
        """Return the agent's reply to `human_input`."""
        raise NotImplementedError

View file

@ -0,0 +1,45 @@
from typing import Optional
from langchain.prompts import (
ChatPromptTemplate,
MessagesPlaceholder,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.chains import ConversationChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from vocode.turn_based.agent.base_agent import BaseAgent
class ChatGPTAgent(BaseAgent):
    """Turn-based agent backed by an OpenAI chat model via LangChain.

    The full conversation history is kept in buffer memory, so each call
    to `respond` is conditioned on everything said so far.
    """

    def __init__(
        self,
        system_prompt: str,
        initial_message: Optional[str] = None,
        model_name: str = "gpt-3.5-turbo",
        temperature: float = 0.7,
        max_tokens: int = 100,
    ):
        super().__init__(initial_message=initial_message)
        # Prompt layout: system instructions, then the prior turns, then
        # the latest human input.
        prompt_messages = [
            SystemMessagePromptTemplate.from_template(system_prompt),
            MessagesPlaceholder(variable_name="history"),
            HumanMessagePromptTemplate.from_template("{input}"),
        ]
        self.prompt = ChatPromptTemplate.from_messages(prompt_messages)
        self.memory = ConversationBufferMemory(return_messages=True)
        # Seed history with the greeting so the model knows it already spoke.
        if initial_message:
            self.memory.chat_memory.add_ai_message(initial_message)
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        self.conversation = ConversationChain(
            memory=self.memory, prompt=self.prompt, llm=self.llm
        )

    def respond(self, human_input: str):
        """Return the model's reply to `human_input`, updating the history."""
        return self.conversation.predict(input=human_input)

View file

@ -0,0 +1,6 @@
from vocode.turn_based.agent.base_agent import BaseAgent
class EchoAgent(BaseAgent):
    """Trivial agent that repeats the human's input back verbatim.

    Useful for exercising the audio pipeline end to end without an LLM.
    """

    def respond(self, human_input: str):
        # Reply with the transcript unchanged.
        return human_input

View file

@ -0,0 +1,9 @@
from pydub import AudioSegment
class BaseInputDevice:
    """Abstract audio source for a turn-based conversation.

    Usage: call `start_listening` to begin capture, then `end_listening`
    to collect everything recorded as a pydub AudioSegment.
    """

    def start_listening(self):
        """Begin capturing audio."""
        raise NotImplementedError

    def end_listening(self) -> AudioSegment:
        """Stop capturing and return the recorded audio."""
        raise NotImplementedError

View file

@ -0,0 +1,59 @@
from typing import Optional
import sounddevice as sd
import numpy as np
from pydub import AudioSegment
import io
import wave
from vocode.turn_based.input_device.base_input_device import BaseInputDevice
class MicrophoneInput(BaseInputDevice):
    """Records microphone audio via sounddevice into an in-memory WAV
    buffer, returned as a pydub AudioSegment.

    Args:
        device_info: sounddevice device-info dict; must contain "index"
            and may contain "default_samplerate".
        sampling_rate: capture rate in Hz; defaults to the device's
            reported default rate (or 44100 Hz if none is reported).
        chunk_size: frames delivered per stream callback.
    """

    DEFAULT_SAMPLING_RATE = 44100
    DEFAULT_CHUNK_SIZE = 2048

    def __init__(
        self,
        device_info: dict,
        sampling_rate: Optional[int] = None,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
    ):
        self.device_info = device_info
        # Cast to int: sounddevice reports "default_samplerate" as a float,
        # and sibling SpeakerOutput already casts — keep them consistent.
        self.sampling_rate = sampling_rate or int(
            self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        self.chunk_size = chunk_size
        self.buffer: Optional[io.BytesIO] = None
        self.wave_writer: Optional[wave.Wave_write] = None

    def create_stream(self):
        """Build (but do not start) a mono 16-bit PCM input stream."""
        return sd.InputStream(
            dtype=np.int16,
            channels=1,
            samplerate=self.sampling_rate,
            blocksize=self.chunk_size,
            device=int(self.device_info["index"]),
            callback=self._stream_callback,
        )

    def _stream_callback(self, in_data: np.ndarray, *_args):
        # Runs on sounddevice's audio thread. Guard against a late callback
        # arriving after end_listening has torn the writer down.
        if self.wave_writer is not None:
            self.wave_writer.writeframes(in_data.tobytes())

    def create_buffer(self):
        """Return a fresh (BytesIO, wave writer) pair for mono 16-bit PCM."""
        in_memory_wav = io.BytesIO()
        wave_writer = wave.open(in_memory_wav, "wb")
        wave_writer.setnchannels(1)
        wave_writer.setsampwidth(2)  # 16-bit samples
        wave_writer.setframerate(self.sampling_rate)
        return in_memory_wav, wave_writer

    def start_listening(self):
        """Begin recording into a new in-memory WAV buffer."""
        self.buffer, self.wave_writer = self.create_buffer()
        self.stream = self.create_stream()
        self.stream.start()

    def end_listening(self) -> AudioSegment:
        """Stop recording and return the captured audio."""
        self.stream.stop()
        # close() releases the PortAudio stream; previously it was only
        # stopped, leaking a stream per utterance.
        self.stream.close()
        if self.wave_writer is not None:
            # Finalize the WAV header; wave does not close the underlying
            # BytesIO because we opened it ourselves.
            self.wave_writer.close()
            self.wave_writer = None
        self.buffer.seek(0)
        return AudioSegment.from_wav(self.buffer)

View file

@ -0,0 +1,9 @@
from pydub import AudioSegment
class BaseOutputDevice:
    """Abstract sink that plays agent audio to the user."""

    def send_audio(self, audio: AudioSegment) -> None:
        """Play the given audio segment."""
        raise NotImplementedError

    def terminate(self):
        """Release underlying resources; a no-op by default."""
        pass

View file

@ -0,0 +1,32 @@
from typing import Optional

import numpy as np
import sounddevice as sd
from pydub import AudioSegment

from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
class SpeakerOutput(BaseOutputDevice):
    """Plays mono 16-bit PCM audio through a sounddevice output stream.

    Args:
        device_info: sounddevice device-info dict; must contain "index"
            and may contain "default_samplerate".
        sampling_rate: playback rate in Hz; defaults to the device's
            reported default rate (or 44100 Hz if none is reported).
    """

    DEFAULT_SAMPLING_RATE = 44100

    def __init__(
        self,
        device_info: dict,
        # Was annotated `int` with a None default; Optional[int] is correct.
        sampling_rate: Optional[int] = None,
    ):
        self.device_info = device_info
        # sounddevice reports "default_samplerate" as a float; cast to int.
        self.sampling_rate = sampling_rate or int(
            self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        self.stream = sd.OutputStream(
            channels=1,
            samplerate=self.sampling_rate,
            dtype=np.int16,
            device=int(self.device_info["index"]),
        )
        self.stream.start()

    def send_audio(self, audio_segment: AudioSegment):
        """Write the segment's raw PCM to the device (blocks until queued).

        NOTE(review): assumes the segment is mono 16-bit PCM at
        self.sampling_rate; mismatched segments play at the wrong
        speed/pitch — confirm against the synthesizer configuration.
        """
        self.stream.write(np.frombuffer(audio_segment.raw_data, dtype=np.int16))

    def terminate(self):
        """Stop and release the output stream."""
        self.stream.close()

View file

@ -0,0 +1,53 @@
import os
from dotenv import load_dotenv
import azure.cognitiveservices.speech as speechsdk
from pydub import AudioSegment
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
# Load Azure credentials (AZURE_SPEECH_KEY / AZURE_SPEECH_REGION) from a
# local .env file into the process environment at import time.
load_dotenv()
class AzureSynthesizer(BaseSynthesizer):
    """Synthesizes speech with Azure Cognitive Services text-to-speech.

    Credentials are read from the AZURE_SPEECH_KEY and AZURE_SPEECH_REGION
    environment variables.
    """

    def __init__(self, sampling_rate: int):
        """
        Args:
            sampling_rate: output PCM rate in Hz; one of
                44100, 48000, 24000, 16000, or 8000.

        Raises:
            ValueError: if `sampling_rate` has no matching Azure raw-PCM
                output format. (Previously the broken if/elif chain let an
                unsupported rate fall through silently, leaving the SDK's
                default format while `synthesize` tagged the audio with
                the requested frame rate — producing corrupted playback.)
        """
        self.sampling_rate = sampling_rate
        speech_config = speechsdk.SpeechConfig(
            subscription=os.environ.get("AZURE_SPEECH_KEY"),
            region=os.environ.get("AZURE_SPEECH_REGION"),
        )
        # Raw (headerless) 16-bit mono PCM output formats by sample rate.
        formats = {
            44100: speechsdk.SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm,
            48000: speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm,
            24000: speechsdk.SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm,
            16000: speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm,
            8000: speechsdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm,
        }
        try:
            output_format = formats[self.sampling_rate]
        except KeyError:
            raise ValueError(
                f"Unsupported sampling rate: {self.sampling_rate}"
            ) from None
        speech_config.set_speech_synthesis_output_format(output_format)
        self.synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=None
        )

    def synthesize(self, text) -> AudioSegment:
        """Synthesize `text`; return it as mono 16-bit PCM audio.

        Raises:
            Exception: if the SDK reports anything other than completed
                synthesis (e.g. cancellation or bad credentials).
        """
        result = self.synthesizer.speak_text(text)
        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            raise Exception("Could not synthesize audio")
        return AudioSegment(
            result.audio_data,
            sample_width=2,  # 16-bit samples
            frame_rate=self.sampling_rate,
            channels=1,
        )

View file

@ -0,0 +1,6 @@
from pydub import AudioSegment
class BaseSynthesizer:
    """Abstract text-to-speech interface."""

    def synthesize(self, text) -> AudioSegment:
        """Convert `text` to an audio segment."""
        raise NotImplementedError

View file

@ -0,0 +1,6 @@
from pydub import AudioSegment
class BaseTranscriber:
    """Abstract speech-to-text interface."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Convert an audio segment to its text transcript."""
        raise NotImplementedError

View file

@ -0,0 +1,21 @@
from pydub import AudioSegment
import io
import os
from dotenv import load_dotenv
import openai
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
# Pull OPENAI_API_KEY from a local .env file into the environment, then
# configure the module-level openai client with it at import time.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
class WhisperTranscriber(BaseTranscriber):
    """Transcribes audio with OpenAI's hosted Whisper API."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Export the segment as an in-memory WAV and send it to Whisper."""
        wav_buffer = io.BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        wav_buffer.seek(0)
        # The OpenAI client infers the upload format from the file name,
        # so a BytesIO needs one attached by hand.
        wav_buffer.name = "whisper.wav"
        transcript = openai.Audio.transcribe("whisper-1", wav_buffer)
        return transcript.text

View file

@ -0,0 +1,38 @@
from vocode.turn_based.agent.base_agent import BaseAgent
from vocode.turn_based.input_device.base_input_device import (
BaseInputDevice,
)
from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
class TurnBasedConversation:
    """Wires an input device, transcriber, agent, synthesizer, and output
    device into a push-to-talk style conversation.

    Callers drive each turn by invoking `start_speech` and then
    `end_speech_and_respond`.
    """

    def __init__(
        self,
        input_device: BaseInputDevice,
        transcriber: BaseTranscriber,
        agent: BaseAgent,
        synthesizer: BaseSynthesizer,
        output_device: BaseOutputDevice,
    ):
        self.input_device = input_device
        self.transcriber = transcriber
        self.agent = agent
        self.synthesizer = synthesizer
        self.output_device = output_device
        # Speak the agent's greeting (if one is configured) immediately.
        self.maybe_play_initial_message()

    def maybe_play_initial_message(self):
        """Synthesize and play the agent's initial message, if any."""
        greeting = self.agent.initial_message
        if greeting:
            self.output_device.send_audio(self.synthesizer.synthesize(greeting))

    def start_speech(self):
        """Begin capturing the human's turn."""
        self.input_device.start_listening()

    def end_speech_and_respond(self):
        """Finish capture, transcribe it, get the agent's reply, play it."""
        recording = self.input_device.end_listening()
        human_input = self.transcriber.transcribe(recording)
        agent_response = self.agent.respond(human_input)
        self.output_device.send_audio(self.synthesizer.synthesize(agent_response))