first pass at turn based conversation
This commit is contained in:
parent
d1118d375e
commit
518a0f2b53
40 changed files with 503 additions and 99 deletions
9
vocode/turn_based/agent/base_agent.py
Normal file
9
vocode/turn_based/agent/base_agent.py
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
from typing import Optional
|
||||
|
||||
|
||||
class BaseAgent:
    """Abstract base for turn-based conversational agents."""

    def __init__(self, initial_message: Optional[str] = None):
        # Optional opening line the agent speaks before any human input.
        self.initial_message = initial_message

    def respond(self, human_input: str):
        """Return the agent's reply to *human_input*. Subclasses must override."""
        raise NotImplementedError
|
||||
45
vocode/turn_based/agent/chat_gpt_agent.py
Normal file
45
vocode/turn_based/agent/chat_gpt_agent.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
from typing import Optional
|
||||
from langchain.prompts import (
|
||||
ChatPromptTemplate,
|
||||
MessagesPlaceholder,
|
||||
SystemMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.chains import ConversationChain
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.memory import ConversationBufferMemory
|
||||
|
||||
from vocode.turn_based.agent.base_agent import BaseAgent
|
||||
|
||||
|
||||
class ChatGPTAgent(BaseAgent):
    """Agent that answers via an OpenAI chat model through a LangChain ConversationChain."""

    def __init__(
        self,
        system_prompt: str,
        initial_message: Optional[str] = None,
        model_name: str = "gpt-3.5-turbo",
        temperature: float = 0.7,
        max_tokens: int = 100,
    ):
        super().__init__(initial_message=initial_message)
        # Prompt layout: fixed system message, then the running history,
        # then the newest human turn.
        message_templates = [
            SystemMessagePromptTemplate.from_template(system_prompt),
            MessagesPlaceholder(variable_name="history"),
            HumanMessagePromptTemplate.from_template("{input}"),
        ]
        self.prompt = ChatPromptTemplate.from_messages(message_templates)
        self.memory = ConversationBufferMemory(return_messages=True)
        if initial_message:
            # Seed the history so the model sees its own opening line.
            self.memory.chat_memory.add_ai_message(initial_message)
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        self.conversation = ConversationChain(
            llm=self.llm, prompt=self.prompt, memory=self.memory
        )

    def respond(self, human_input: str):
        """Run one turn through the chain and return the model's reply text."""
        return self.conversation.predict(input=human_input)
|
||||
6
vocode/turn_based/agent/echo_agent.py
Normal file
6
vocode/turn_based/agent/echo_agent.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
from vocode.turn_based.agent.base_agent import BaseAgent
|
||||
|
||||
|
||||
class EchoAgent(BaseAgent):
    """Trivial agent that parrots the human input back unchanged; useful for
    testing the audio pipeline without an LLM."""

    def respond(self, human_input: str):
        return human_input
|
||||
9
vocode/turn_based/input_device/base_input_device.py
Normal file
9
vocode/turn_based/input_device/base_input_device.py
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
from pydub import AudioSegment
|
||||
|
||||
|
||||
class BaseInputDevice:
    """Interface for audio-capture devices used by turn-based conversations."""

    def start_listening(self):
        """Begin capturing audio. Subclasses must override."""
        raise NotImplementedError

    def end_listening(self) -> AudioSegment:
        """Stop capturing and return the audio recorded since start_listening."""
        raise NotImplementedError
|
||||
59
vocode/turn_based/input_device/microphone_input.py
Normal file
59
vocode/turn_based/input_device/microphone_input.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
from typing import Optional
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
from pydub import AudioSegment
|
||||
import io
|
||||
import wave
|
||||
|
||||
from vocode.turn_based.input_device.base_input_device import BaseInputDevice
|
||||
|
||||
|
||||
class MicrophoneInput(BaseInputDevice):
    """Records microphone audio via sounddevice into an in-memory WAV buffer."""

    DEFAULT_SAMPLING_RATE = 44100
    DEFAULT_CHUNK_SIZE = 2048

    def __init__(
        self,
        device_info: dict,
        sampling_rate: Optional[int] = None,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
    ):
        """
        Args:
            device_info: sounddevice device-info dict; must contain "index",
                may contain "default_samplerate".
            sampling_rate: capture rate in Hz; defaults to the device's
                default sample rate, or DEFAULT_SAMPLING_RATE if absent.
            chunk_size: frames delivered per stream callback.
        """
        self.device_info = device_info
        # sounddevice reports "default_samplerate" as a float; coerce to int
        # so wave/InputStream get a whole-number rate (matches SpeakerOutput,
        # which already wraps its rate in int()).
        self.sampling_rate = int(
            sampling_rate
            or self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        self.chunk_size = chunk_size
        self.buffer: Optional[io.BytesIO] = None
        self.wave_writer: Optional[wave.Wave_write] = None
        # Created in start_listening; initialized here so the attribute always
        # exists (original left it undefined until start_listening ran).
        self.stream = None

    def create_stream(self):
        """Build (but do not start) an int16 mono input stream for this device."""
        return sd.InputStream(
            dtype=np.int16,
            channels=1,
            samplerate=self.sampling_rate,
            blocksize=self.chunk_size,
            device=int(self.device_info["index"]),
            callback=self._stream_callback,
        )

    def _stream_callback(self, in_data: np.ndarray, *_args):
        # Runs on sounddevice's audio thread; assumes start_listening has set
        # up self.wave_writer before the stream is started.
        self.wave_writer.writeframes(in_data.tobytes())

    def create_buffer(self):
        """Return (BytesIO, wave writer) for a 16-bit mono WAV at the capture rate."""
        in_memory_wav = io.BytesIO()
        wave_writer = wave.open(in_memory_wav, "wb")
        wave_writer.setnchannels(1)
        wave_writer.setsampwidth(2)  # 2 bytes per sample == int16
        wave_writer.setframerate(self.sampling_rate)
        return in_memory_wav, wave_writer

    def start_listening(self):
        """Begin capturing; any previously buffered audio is discarded."""
        self.buffer, self.wave_writer = self.create_buffer()
        self.stream = self.create_stream()
        self.stream.start()

    def end_listening(self) -> AudioSegment:
        """Stop capturing and return the recorded audio as an AudioSegment."""
        self.stream.stop()
        self.buffer.seek(0)
        return AudioSegment.from_wav(self.buffer)
|
||||
9
vocode/turn_based/output_device/base_output_device.py
Normal file
9
vocode/turn_based/output_device/base_output_device.py
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
from pydub import AudioSegment
|
||||
|
||||
|
||||
class BaseOutputDevice:
    """Interface for audio-playback devices."""

    def send_audio(self, audio: AudioSegment) -> None:
        """Play *audio*. Subclasses must override."""
        raise NotImplementedError

    def terminate(self):
        """Release any held resources; the default is a no-op."""
        pass
|
||||
32
vocode/turn_based/output_device/speaker_output.py
Normal file
32
vocode/turn_based/output_device/speaker_output.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
import sounddevice as sd
|
||||
import numpy as np
|
||||
from pydub import AudioSegment
|
||||
|
||||
from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
|
||||
|
||||
|
||||
class SpeakerOutput(BaseOutputDevice):
    """Plays 16-bit mono audio through a sounddevice output stream."""

    DEFAULT_SAMPLING_RATE = 44100

    def __init__(
        self,
        device_info: dict,
        # Optional[int]; annotated without a type because this module does not
        # import typing — the original ": int = None" annotation was wrong.
        sampling_rate=None,
    ):
        """
        Args:
            device_info: sounddevice device-info dict; must contain "index",
                may contain "default_samplerate".
            sampling_rate: playback rate in Hz; defaults to the device's
                default sample rate, or DEFAULT_SAMPLING_RATE if absent.
        """
        self.device_info = device_info
        self.sampling_rate = sampling_rate or int(
            self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        # Stream is opened and started eagerly so send_audio can write at once.
        self.stream = sd.OutputStream(
            channels=1,
            samplerate=self.sampling_rate,
            dtype=np.int16,
            device=int(self.device_info["index"]),
        )
        self.stream.start()

    def send_audio(self, audio_segment: AudioSegment):
        """Block while writing the segment's raw int16 samples to the stream."""
        self.stream.write(np.frombuffer(audio_segment.raw_data, dtype=np.int16))

    def terminate(self):
        """Close the output stream."""
        self.stream.close()
|
||||
53
vocode/turn_based/synthesizer/azure_synthesizer.py
Normal file
53
vocode/turn_based/synthesizer/azure_synthesizer.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
import os
|
||||
from dotenv import load_dotenv
|
||||
import azure.cognitiveservices.speech as speechsdk
|
||||
from pydub import AudioSegment
|
||||
|
||||
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
class AzureSynthesizer(BaseSynthesizer):
    """Synthesizes speech with Azure Cognitive Services, returning raw PCM audio."""

    def __init__(self, sampling_rate: int):
        """
        Args:
            sampling_rate: desired output PCM rate in Hz. Supported values:
                8000, 16000, 24000, 44100, 48000. Any other rate leaves the
                SDK's default output format in place (original behavior).

        Reads AZURE_SPEECH_KEY and AZURE_SPEECH_REGION from the environment.
        """
        self.sampling_rate = sampling_rate
        speech_config = speechsdk.SpeechConfig(
            subscription=os.environ.get("AZURE_SPEECH_KEY"),
            region=os.environ.get("AZURE_SPEECH_REGION"),
        )
        # Replaces the original's mixed if/if/if/elif/elif chain: the rates
        # are mutually exclusive, so a single lookup is equivalent and avoids
        # the inconsistent branch structure.
        output_format_by_rate = {
            44100: speechsdk.SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm,
            48000: speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm,
            24000: speechsdk.SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm,
            16000: speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm,
            8000: speechsdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm,
        }
        output_format = output_format_by_rate.get(self.sampling_rate)
        if output_format is not None:
            speech_config.set_speech_synthesis_output_format(output_format)

        # audio_config=None keeps synthesized audio in memory rather than
        # playing it through a device.
        self.synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=None
        )

    def synthesize(self, text) -> AudioSegment:
        """Synthesize *text* to an AudioSegment.

        Raises:
            Exception: if the SDK reports anything other than a completed
                synthesis (e.g. cancellation or auth failure).
        """
        result = self.synthesizer.speak_text(text)
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            return AudioSegment(
                result.audio_data,
                sample_width=2,  # 16-bit samples
                frame_rate=self.sampling_rate,
                channels=1,
            )
        else:
            raise Exception("Could not synthesize audio")
|
||||
6
vocode/turn_based/synthesizer/base_synthesizer.py
Normal file
6
vocode/turn_based/synthesizer/base_synthesizer.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
from pydub import AudioSegment
|
||||
|
||||
|
||||
class BaseSynthesizer:
    """Interface for text-to-speech synthesizers."""

    def synthesize(self, text) -> AudioSegment:
        """Convert *text* to audio. Subclasses must override."""
        raise NotImplementedError
|
||||
6
vocode/turn_based/transcriber/base_transcriber.py
Normal file
6
vocode/turn_based/transcriber/base_transcriber.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
from pydub import AudioSegment
|
||||
|
||||
|
||||
class BaseTranscriber:
    """Interface for speech-to-text transcribers."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Convert *audio_segment* to text. Subclasses must override."""
        raise NotImplementedError
|
||||
21
vocode/turn_based/transcriber/whisper_transcriber.py
Normal file
21
vocode/turn_based/transcriber/whisper_transcriber.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
from pydub import AudioSegment
|
||||
import io
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
import openai
|
||||
|
||||
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
|
||||
|
||||
load_dotenv()
|
||||
|
||||
openai.api_key = os.getenv("OPENAI_API_KEY")
|
||||
|
||||
|
||||
class WhisperTranscriber(BaseTranscriber):
    """Transcribes audio with OpenAI's hosted Whisper API."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Export the segment as WAV in memory and send it to Whisper."""
        wav_buffer = io.BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        wav_buffer.seek(0)
        # The API infers the format from the file name, so the in-memory
        # buffer needs a .wav name attached.
        wav_buffer.name = "whisper.wav"
        return openai.Audio.transcribe("whisper-1", wav_buffer).text
|
||||
38
vocode/turn_based/turn_based_conversation.py
Normal file
38
vocode/turn_based/turn_based_conversation.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
from vocode.turn_based.agent.base_agent import BaseAgent
|
||||
from vocode.turn_based.input_device.base_input_device import (
|
||||
BaseInputDevice,
|
||||
)
|
||||
from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
|
||||
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
|
||||
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
|
||||
|
||||
|
||||
class TurnBasedConversation:
    """Wires an input device, transcriber, agent, synthesizer, and output
    device into a push-to-talk style conversation loop."""

    def __init__(
        self,
        input_device: BaseInputDevice,
        transcriber: BaseTranscriber,
        agent: BaseAgent,
        synthesizer: BaseSynthesizer,
        output_device: BaseOutputDevice,
    ):
        self.input_device = input_device
        self.transcriber = transcriber
        self.agent = agent
        self.synthesizer = synthesizer
        self.output_device = output_device
        # Speak the agent's opening line (if any) as soon as we're wired up.
        self.maybe_play_initial_message()

    def maybe_play_initial_message(self):
        """Synthesize and play the agent's configured opening line, if set."""
        initial_message = self.agent.initial_message
        if initial_message:
            self.output_device.send_audio(self.synthesizer.synthesize(initial_message))

    def start_speech(self):
        """Begin capturing the human's turn."""
        self.input_device.start_listening()

    def end_speech_and_respond(self):
        """Finish capturing, transcribe it, get the agent's reply, and speak it."""
        human_input = self.transcriber.transcribe(self.input_device.end_listening())
        agent_response = self.agent.respond(human_input)
        self.output_device.send_audio(self.synthesizer.synthesize(agent_response))
|
||||
Loading…
Add table
Add a link
Reference in a new issue