First pass at turn-based conversation

This commit is contained in:
Ajay Raj 2023-03-20 15:37:23 -07:00
commit 518a0f2b53
40 changed files with 503 additions and 99 deletions

View file

@ -1,28 +1,69 @@
from typing import Union
import sounddevice as sd
from .input_device.microphone_input import MicrophoneInput
from .output_device.speaker_output import SpeakerOutput
from vocode.streaming.input_device.microphone_input import (
MicrophoneInput as StreamingMicrophoneInput,
)
from vocode.streaming.output_device.speaker_output import (
SpeakerOutput as StreamingSpeakerOutput,
)
from vocode.turn_based.input_device.microphone_input import (
MicrophoneInput as TurnBasedMicrophoneInput,
)
from vocode.turn_based.output_device.speaker_output import (
SpeakerOutput as TurnBasedSpeakerOutput,
)
import logging
logger = logging.getLogger(__name__)
def _get_device_prompt(device_infos: list[dict]) -> str:
return """Please select a device:
{}
Choice: """.format(
"\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos)))
"\n".join(
f"{index}: {device['name']}" for index, device in enumerate(device_infos)
)
)
def create_microphone_input_and_speaker_output(use_default_devices=False, mic_sampling_rate=None, speaker_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]:
def create_microphone_input_and_speaker_output(
streaming: bool = True,
use_default_devices=False,
mic_sampling_rate=None,
speaker_sampling_rate=None,
) -> Union[
tuple[StreamingMicrophoneInput, StreamingSpeakerOutput],
tuple[TurnBasedMicrophoneInput, TurnBasedSpeakerOutput],
]:
device_infos = sd.query_devices()
input_device_infos = list(filter(lambda device_info: device_info['max_input_channels'] > 0, device_infos))
output_device_infos = list(filter(lambda device_info: device_info['max_output_channels'] > 0, device_infos))
input_device_infos = list(
filter(lambda device_info: device_info["max_input_channels"] > 0, device_infos)
)
output_device_infos = list(
filter(lambda device_info: device_info["max_output_channels"] > 0, device_infos)
)
if use_default_devices:
input_device_info = sd.query_devices(kind='input')
output_device_info = sd.query_devices(kind='output')
input_device_info = sd.query_devices(kind="input")
output_device_info = sd.query_devices(kind="output")
else:
input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))]
output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))]
logger.info("Using microphone input device: %s", input_device_info['name'])
microphone_input = MicrophoneInput(input_device_info, sampling_rate=mic_sampling_rate)
logger.info("Using speaker output device: %s", output_device_info['name'])
speaker_output = SpeakerOutput(output_device_info, sampling_rate=speaker_sampling_rate)
return microphone_input, speaker_output
input_device_info = input_device_infos[
int(input(_get_device_prompt(input_device_infos)))
]
output_device_info = output_device_infos[
int(input(_get_device_prompt(output_device_infos)))
]
logger.info("Using microphone input device: %s", input_device_info["name"])
microphone_class = (
StreamingMicrophoneInput if streaming else TurnBasedMicrophoneInput
)
speaker_class = StreamingSpeakerOutput if streaming else TurnBasedSpeakerOutput
microphone_input = microphone_class(
input_device_info, sampling_rate=mic_sampling_rate
)
logger.info("Using speaker output device: %s", output_device_info["name"])
speaker_output = speaker_class(
output_device_info, sampling_rate=speaker_sampling_rate
)
return microphone_input, speaker_output

View file

@ -1,14 +1,16 @@
from ..models.audio_encoding import AudioEncoding
from vocode.streaming.models.audio_encoding import AudioEncoding
import queue
from typing import Optional
class BaseInputDevice():
def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int):
class BaseInputDevice:
def __init__(
self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int
):
self.sampling_rate = sampling_rate
self.audio_encoding = audio_encoding
self.chunk_size = chunk_size
self.queue = queue.Queue()
def get_audio(self) -> Optional[bytes]:
raise NotImplementedError
raise NotImplementedError

View file

@ -4,25 +4,33 @@ from typing import Optional
import queue
import wave
from .base_input_device import BaseInputDevice
from ..models.audio_encoding import AudioEncoding
from vocode.streaming.input_device.base_input_device import BaseInputDevice
from vocode.streaming.models.audio_encoding import AudioEncoding
class MicrophoneInput(BaseInputDevice):
DEFAULT_SAMPLING_RATE = 44100
DEFAULT_CHUNK_SIZE = 2048
def __init__(self, device_info: dict, sampling_rate: int = None, chunk_size: int = DEFAULT_CHUNK_SIZE, microphone_gain: int = 1):
def __init__(
self,
device_info: dict,
sampling_rate: int = None,
chunk_size: int = DEFAULT_CHUNK_SIZE,
microphone_gain: int = 1,
):
self.device_info = device_info
sampling_rate = sampling_rate or (self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE))
sampling_rate = sampling_rate or (
self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
)
super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size)
self.stream = sd.InputStream(
dtype=np.int16,
channels=1,
samplerate=self.sampling_rate,
samplerate=self.sampling_rate,
blocksize=self.chunk_size,
device=int(self.device_info['index']),
callback=self._stream_callback
device=int(self.device_info["index"]),
callback=self._stream_callback,
)
self.stream.start()
self.queue = queue.Queue()
@ -40,4 +48,4 @@ class MicrophoneInput(BaseInputDevice):
try:
return self.queue.get_nowait()
except queue.Empty:
return None
return None

View file

@ -1,5 +1,7 @@
from vocode.input_device.base_input_device import BaseInputDevice
from vocode.models.audio_encoding import AudioEncoding
from vocode.streaming.input_device.base_input_device import (
BaseInputDevice,
)
from vocode.streaming.models.audio_encoding import AudioEncoding
class TelephoneInput(BaseInputDevice):

View file

@ -3,7 +3,7 @@ from enum import Enum
from pydantic import validator
from vocode.models.message import BaseMessage
from vocode.streaming.models.message import BaseMessage
from .model import TypedModel, BaseModel
FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS = 0.5

View file

@ -1,8 +1,8 @@
from typing import Optional
from vocode.models.model import BaseModel
from vocode.models.agent import AgentConfig
from vocode.models.synthesizer import SynthesizerConfig
from vocode.models.transcriber import TranscriberConfig
from vocode.streaming.models.model import BaseModel
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
class TwilioConfig(BaseModel):

View file

@ -1,8 +1,11 @@
from enum import Enum
from typing import Optional
from vocode.streaming.input_device.base_input_device import (
BaseInputDevice,
)
from .audio_encoding import AudioEncoding
from .model import BaseModel, TypedModel
from ..input_device.base_input_device import BaseInputDevice
class TranscriberType(str, Enum):

View file

@ -1,7 +1,7 @@
from ..models.audio_encoding import AudioEncoding
from vocode.streaming.models.audio_encoding import AudioEncoding
class BaseOutputDevice:
def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding):
self.sampling_rate = sampling_rate
self.audio_encoding = audio_encoding
@ -11,5 +11,3 @@ class BaseOutputDevice:
async def maybe_send_mark_async(self, message):
pass

View file

@ -2,21 +2,28 @@ import sounddevice as sd
import numpy as np
from .base_output_device import BaseOutputDevice
from ..models.audio_encoding import AudioEncoding
from vocode.streaming.models.audio_encoding import AudioEncoding
class SpeakerOutput(BaseOutputDevice):
DEFAULT_SAMPLING_RATE = 44100
def __init__(self, device_info: dict, sampling_rate: int = None, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16):
def __init__(
self,
device_info: dict,
sampling_rate: int = None,
audio_encoding: AudioEncoding = AudioEncoding.LINEAR16,
):
self.device_info = device_info
sampling_rate = sampling_rate or int(self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE))
sampling_rate = sampling_rate or int(
self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
)
super().__init__(sampling_rate, audio_encoding)
self.stream = sd.OutputStream(
channels=1,
samplerate=self.sampling_rate,
dtype=np.int16,
device=int(self.device_info['index'])
device=int(self.device_info["index"]),
)
self.stream.start()
@ -24,4 +31,4 @@ class SpeakerOutput(BaseOutputDevice):
self.stream.write(np.frombuffer(chunk, dtype=np.int16))
def terminate(self):
self.stream.close()
self.stream.close()

View file

@ -1,5 +1,5 @@
from .base_output_device import BaseOutputDevice
from ..models.audio_encoding import AudioEncoding
from vocode.streaming.models.audio_encoding import AudioEncoding
class TelephoneOutput(BaseOutputDevice):

View file

@ -8,16 +8,24 @@ import logging
import threading
import queue
import vocode
from vocode.input_device.base_input_device import BaseInputDevice
from vocode.output_device.base_output_device import BaseOutputDevice
from vocode.models.transcriber import TranscriberConfig
from vocode.models.agent import AgentConfig
from vocode.models.synthesizer import SynthesizerConfig
from vocode.models.websocket import ReadyMessage, AudioMessage, StartMessage, StopMessage
from vocode.streaming.input_device.base_input_device import (
BaseInputDevice,
)
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.websocket import (
ReadyMessage,
AudioMessage,
StartMessage,
StopMessage,
)
load_dotenv()
class Conversation:
class StreamingConversation:
def __init__(
self,
input_device: BaseInputDevice,
@ -61,14 +69,16 @@ class Conversation:
loop.run_until_complete(run())
async def start(self):
async with websockets.connect(f"{self.vocode_websocket_url}?key={vocode.api_key}") as ws:
async with websockets.connect(
f"{self.vocode_websocket_url}?key={vocode.api_key}"
) as ws:
async def sender(ws: WebSocketClientProtocol):
start_message = StartMessage(
transcriber_config=self.transcriber_config,
agent_config=self.agent_config,
synthesizer_config=self.synthesizer_config,
conversation_id=self.id
conversation_id=self.id,
)
await ws.send(start_message.json())
await self.wait_for_ready()

View file

@ -4,10 +4,15 @@ import requests
import uvicorn
import vocode
from vocode.models.transcriber import TranscriberConfig
from vocode.models.synthesizer import SynthesizerConfig
from vocode.models.agent import AgentConfig
from vocode.models.telephony import CreateInboundCall, TwilioConfig, TwilioConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.telephony import (
CreateInboundCall,
TwilioConfig,
TwilioConfig,
)
class InboundCallServer:
def __init__(

View file

@ -2,9 +2,9 @@ from typing import Optional
import requests
import vocode
from vocode.models.agent import AgentConfig
from vocode.models.synthesizer import SynthesizerConfig
from vocode.models.transcriber import TranscriberConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from ..models.telephony import (
CallEntity,
CreateOutboundCall,
@ -31,8 +31,12 @@ class OutboundCall:
self.synthesizer_config = synthesizer_config
self.conversation_id = conversation_id
self.twilio_config = twilio_config
self.vocode_create_outbound_call_url = f"https://{vocode.base_url}/create_outbound_call"
self.vocode_end_outbound_call_url = f"https://{vocode.base_url}/end_outbound_call"
self.vocode_create_outbound_call_url = (
f"https://{vocode.base_url}/create_outbound_call"
)
self.vocode_end_outbound_call_url = (
f"https://{vocode.base_url}/end_outbound_call"
)
def start(self) -> str:
response = requests.post(

View file

@ -2,11 +2,11 @@ from typing import Optional
import requests
import vocode
from vocode.models.agent import AgentConfig
from vocode.models.synthesizer import SynthesizerConfig
from vocode.models.transcriber import TranscriberConfig
from vocode.telephony.outbound_call import OutboundCall
from vocode.models.telephony import (
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.telephony.outbound_call import OutboundCall
from vocode.streaming.models.telephony import (
CallEntity,
DialIntoZoomCall,
TwilioConfig,

View file

@ -0,0 +1,9 @@
from typing import Optional
class BaseAgent:
    """Abstract turn-based agent: produces one text reply per human turn."""

    def __init__(self, initial_message: Optional[str] = None):
        # Optional greeting the conversation may speak before the first turn.
        self.initial_message = initial_message

    def respond(self, human_input: str):
        """Return the agent's reply to *human_input*. Subclasses must override."""
        raise NotImplementedError

View file

@ -0,0 +1,45 @@
from typing import Optional
from langchain.prompts import (
ChatPromptTemplate,
MessagesPlaceholder,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.chains import ConversationChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from vocode.turn_based.agent.base_agent import BaseAgent
class ChatGPTAgent(BaseAgent):
    """Turn-based agent backed by an OpenAI chat model via LangChain.

    The whole dialogue lives in a ConversationBufferMemory, so each call to
    respond() sees the system prompt plus the full history.
    """

    def __init__(
        self,
        system_prompt: str,
        initial_message: Optional[str] = None,
        model_name: str = "gpt-3.5-turbo",
        temperature: float = 0.7,
        max_tokens: int = 100,
    ):
        super().__init__(initial_message=initial_message)
        # Prompt layout: system prompt, then the rolling history, then the new turn.
        message_templates = [
            SystemMessagePromptTemplate.from_template(system_prompt),
            MessagesPlaceholder(variable_name="history"),
            HumanMessagePromptTemplate.from_template("{input}"),
        ]
        self.prompt = ChatPromptTemplate.from_messages(message_templates)
        self.memory = ConversationBufferMemory(return_messages=True)
        if initial_message:
            # Seed history so the model knows it already greeted the user.
            self.memory.chat_memory.add_ai_message(initial_message)
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        self.conversation = ConversationChain(
            llm=self.llm, prompt=self.prompt, memory=self.memory
        )

    def respond(self, human_input: str):
        """Run one turn: record *human_input* in history and return the model's reply."""
        return self.conversation.predict(input=human_input)

View file

@ -0,0 +1,6 @@
from vocode.turn_based.agent.base_agent import BaseAgent
class EchoAgent(BaseAgent):
    """Trivial agent that parrots the user's words back; useful for testing audio I/O."""

    def respond(self, human_input: str):
        # No model involved — the reply is the input itself.
        return human_input

View file

@ -0,0 +1,9 @@
from pydub import AudioSegment
class BaseInputDevice:
    """Abstract capture device for turn-based mode: record between start/end calls."""

    def start_listening(self):
        """Begin capturing audio. Subclasses must override."""
        raise NotImplementedError

    def end_listening(self) -> AudioSegment:
        """Stop capturing and return everything recorded since start_listening()."""
        raise NotImplementedError

View file

@ -0,0 +1,59 @@
from typing import Optional
import sounddevice as sd
import numpy as np
from pydub import AudioSegment
import io
import wave
from vocode.turn_based.input_device.base_input_device import BaseInputDevice
class MicrophoneInput(BaseInputDevice):
    """Turn-based microphone input: records between start_listening()/end_listening().

    Audio is captured with sounddevice into an in-memory WAV (16-bit mono) and
    returned as a pydub AudioSegment.
    """

    DEFAULT_SAMPLING_RATE = 44100
    DEFAULT_CHUNK_SIZE = 2048

    def __init__(
        self,
        device_info: dict,
        sampling_rate: Optional[int] = None,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
    ):
        """
        Args:
            device_info: a sounddevice device-info dict (must contain "index";
                "default_samplerate" is used when sampling_rate is not given).
            sampling_rate: capture rate in Hz; defaults to the device's rate.
            chunk_size: blocksize passed to the input stream.
        """
        self.device_info = device_info
        self.sampling_rate = sampling_rate or (
            self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        self.chunk_size = chunk_size
        self.buffer: Optional[io.BytesIO] = None
        self.wave_writer: Optional[wave.Wave_write] = None

    def create_stream(self):
        # A fresh stream per recording: PortAudio streams cannot be restarted
        # once closed.
        return sd.InputStream(
            dtype=np.int16,
            channels=1,
            samplerate=self.sampling_rate,
            blocksize=self.chunk_size,
            device=int(self.device_info["index"]),
            callback=self._stream_callback,
        )

    def _stream_callback(self, in_data: np.ndarray, *_args):
        # Runs on sounddevice's callback thread while the stream is active;
        # wave_writer is set by start_listening() before the stream starts.
        self.wave_writer.writeframes(in_data.tobytes())

    def create_buffer(self):
        """Return (BytesIO, wave writer) configured for 16-bit mono at self.sampling_rate."""
        in_memory_wav = io.BytesIO()
        wave_writer = wave.open(in_memory_wav, "wb")
        wave_writer.setnchannels(1)
        wave_writer.setsampwidth(2)  # 2 bytes == int16 samples
        wave_writer.setframerate(self.sampling_rate)
        return in_memory_wav, wave_writer

    def start_listening(self):
        """Begin a new recording, replacing any previous buffer and stream."""
        self.buffer, self.wave_writer = self.create_buffer()
        self.stream = self.create_stream()
        self.stream.start()

    def end_listening(self) -> AudioSegment:
        """Stop recording and return the captured audio as an AudioSegment."""
        self.stream.stop()
        # Fix: release the PortAudio stream and finalize the wave writer instead
        # of leaking them — the original only stopped the stream, and a new
        # stream/buffer pair is created on every start_listening().
        self.stream.close()
        self.wave_writer.close()  # wave.close() does not close a BytesIO it was handed
        self.buffer.seek(0)
        return AudioSegment.from_wav(self.buffer)

View file

@ -0,0 +1,9 @@
from pydub import AudioSegment
class BaseOutputDevice:
    """Abstract playback device for turn-based mode."""

    def send_audio(self, audio: AudioSegment) -> None:
        """Play *audio*. Subclasses must override."""
        raise NotImplementedError

    def terminate(self):
        """Release any held resources; the base implementation holds none."""
        pass

View file

@ -0,0 +1,32 @@
import sounddevice as sd
import numpy as np
from pydub import AudioSegment
from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
class SpeakerOutput(BaseOutputDevice):
    """Plays AudioSegments through a mono 16-bit sounddevice output stream."""

    DEFAULT_SAMPLING_RATE = 44100

    def __init__(
        self,
        device_info: dict,
        sampling_rate: int = None,
    ):
        self.device_info = device_info
        # Fall back to the device's preferred rate, then to 44.1 kHz.
        chosen_rate = sampling_rate or int(
            self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        self.sampling_rate = chosen_rate
        self.stream = sd.OutputStream(
            channels=1,
            samplerate=self.sampling_rate,
            dtype=np.int16,
            device=int(self.device_info["index"]),
        )
        self.stream.start()

    def send_audio(self, audio_segment: AudioSegment):
        # Blocking write of the segment's raw PCM. Assumes the segment's rate
        # and sample width match the stream's — TODO confirm callers ensure this.
        samples = np.frombuffer(audio_segment.raw_data, dtype=np.int16)
        self.stream.write(samples)

    def terminate(self):
        """Close the underlying output stream."""
        self.stream.close()

View file

@ -0,0 +1,53 @@
import os
from dotenv import load_dotenv
import azure.cognitiveservices.speech as speechsdk
from pydub import AudioSegment
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
load_dotenv()
class AzureSynthesizer(BaseSynthesizer):
    """Synthesizes speech with Azure Cognitive Services as raw 16-bit mono PCM.

    Credentials are read from the AZURE_SPEECH_KEY / AZURE_SPEECH_REGION
    environment variables.
    """

    # Supported output rates -> Azure raw PCM format (16-bit mono in all cases).
    _RATE_TO_FORMAT = {
        44100: speechsdk.SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm,
        48000: speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm,
        24000: speechsdk.SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm,
        16000: speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm,
        8000: speechsdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm,
    }

    def __init__(self, sampling_rate: int):
        """
        Args:
            sampling_rate: desired PCM rate in Hz; one of 8000/16000/24000/44100/48000.
                Other values leave the SDK's default output format in place
                (preserving the original if-chain's fallthrough), in which case
                the returned AudioSegment's frame_rate would be mislabeled.
        """
        self.sampling_rate = sampling_rate
        speech_config = speechsdk.SpeechConfig(
            subscription=os.environ.get("AZURE_SPEECH_KEY"),
            region=os.environ.get("AZURE_SPEECH_REGION"),
        )
        # Fix: replaces an inconsistent if/if/if-elif-elif chain with one lookup.
        output_format = self._RATE_TO_FORMAT.get(self.sampling_rate)
        if output_format is not None:
            speech_config.set_speech_synthesis_output_format(output_format)
        # audio_config=None keeps the synthesized audio in memory instead of
        # playing it through a device.
        self.synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=None
        )

    def synthesize(self, text) -> AudioSegment:
        """Synthesize *text* and wrap the raw PCM bytes in an AudioSegment.

        Raises:
            Exception: if synthesis did not complete (cancellation, auth error, ...).
        """
        result = self.synthesizer.speak_text(text)
        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            raise Exception("Could not synthesize audio")
        return AudioSegment(
            result.audio_data,
            sample_width=2,  # 16-bit samples
            frame_rate=self.sampling_rate,
            channels=1,
        )

View file

@ -0,0 +1,6 @@
from pydub import AudioSegment
class BaseSynthesizer:
    """Abstract text-to-speech interface for turn-based mode."""

    def synthesize(self, text) -> AudioSegment:
        """Turn *text* into audio. Subclasses must override."""
        raise NotImplementedError

View file

@ -0,0 +1,6 @@
from pydub import AudioSegment
class BaseTranscriber:
    """Abstract speech-to-text interface for turn-based mode."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Turn *audio_segment* into text. Subclasses must override."""
        raise NotImplementedError

View file

@ -0,0 +1,21 @@
from pydub import AudioSegment
import io
import os
from dotenv import load_dotenv
import openai
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
class WhisperTranscriber(BaseTranscriber):
    """Transcribes audio by uploading it to OpenAI's Whisper API."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Export *audio_segment* as an in-memory WAV and return Whisper's transcript."""
        wav_buffer = io.BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        wav_buffer.seek(0)
        # The OpenAI client infers the upload format from the file name, so the
        # in-memory buffer needs a .wav name.
        wav_buffer.name = "whisper.wav"
        transcript = openai.Audio.transcribe("whisper-1", wav_buffer)
        return transcript.text

View file

@ -0,0 +1,38 @@
from vocode.turn_based.agent.base_agent import BaseAgent
from vocode.turn_based.input_device.base_input_device import (
BaseInputDevice,
)
from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
class TurnBasedConversation:
    """Orchestrates one record -> transcribe -> respond -> speak cycle per turn."""

    def __init__(
        self,
        input_device: BaseInputDevice,
        transcriber: BaseTranscriber,
        agent: BaseAgent,
        synthesizer: BaseSynthesizer,
        output_device: BaseOutputDevice,
    ):
        self.input_device = input_device
        self.transcriber = transcriber
        self.agent = agent
        self.synthesizer = synthesizer
        self.output_device = output_device
        self.maybe_play_initial_message()

    def maybe_play_initial_message(self):
        """Speak the agent's greeting, if it has one."""
        greeting = self.agent.initial_message
        if greeting:
            self.output_device.send_audio(self.synthesizer.synthesize(greeting))

    def start_speech(self):
        """Begin recording the human's turn."""
        self.input_device.start_listening()

    def end_speech_and_respond(self):
        """Finish recording, run one agent turn, and speak the reply."""
        recorded_audio = self.input_device.end_listening()
        human_input = self.transcriber.transcribe(recorded_audio)
        agent_response = self.agent.respond(human_input)
        synthesized_reply = self.synthesizer.synthesize(agent_response)
        self.output_device.send_audio(synthesized_reply)