first pass at turn based conversation

This commit is contained in:
Ajay Raj 2023-03-20 15:37:23 -07:00
commit 518a0f2b53
40 changed files with 503 additions and 99 deletions

View file

@ -10,9 +10,9 @@ import signal
from vocode.conversation import Conversation from vocode.conversation import Conversation
from vocode.helpers import create_microphone_input_and_speaker_output from vocode.helpers import create_microphone_input_and_speaker_output
from vocode.models.transcriber import DeepgramTranscriberConfig from vocode.streaming.models.transcriber import DeepgramTranscriberConfig
from vocode.models.agent import LLMAgentConfig from vocode.streaming.models.agent import LLMAgentConfig
from vocode.models.synthesizer import AzureSynthesizerConfig from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
if __name__ == "__main__": if __name__ == "__main__":
microphone_input, speaker_output = create_microphone_input_and_speaker_output(use_first_available_device=True) microphone_input, speaker_output = create_microphone_input_and_speaker_output(use_first_available_device=True)

View file

@ -1,8 +1,6 @@
from vocode.telephony.inbound_call_server import InboundCallServer from vocode.streaming.telephony.inbound_call_server import InboundCallServer
from vocode.models.agent import EchoAgentConfig from vocode.streaming.models.agent import EchoAgentConfig
if __name__ == '__main__': if __name__ == "__main__":
server = InboundCallServer( server = InboundCallServer(agent_config=EchoAgentConfig(initial_message="hello!"))
agent_config=EchoAgentConfig(initial_message="hello!") server.run(port=3001)
)
server.run(port=3001)

View file

@ -1,14 +1,14 @@
from vocode.models.synthesizer import AzureSynthesizerConfig from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.output_device.telephone_output import TelephoneOutput from vocode.streaming.output_device.telephone_output import TelephoneOutput
from vocode.telephony.outbound_call import OutboundCall from vocode.streaming.telephony.outbound_call import OutboundCall
from vocode.models.telephony import CallEntity from vocode.streaming.models.telephony import CallEntity
from vocode.models.agent import ( from vocode.streaming.models.agent import (
EchoAgentConfig, EchoAgentConfig,
ChatGPTAgentConfig, ChatGPTAgentConfig,
WebSocketUserImplementedAgentConfig, WebSocketUserImplementedAgentConfig,
) )
from vocode.models.message import BaseMessage from vocode.streaming.models.message import BaseMessage
from vocode.telephony.zoom_dial_in import ZoomDialIn from vocode.streaming.telephony.zoom_dial_in import ZoomDialIn
if __name__ == "__main__": if __name__ == "__main__":
call = ZoomDialIn( call = ZoomDialIn(
@ -24,7 +24,7 @@ if __name__ == "__main__":
generate_responses=True, generate_responses=True,
end_conversation_on_goodbye=True, end_conversation_on_goodbye=True,
send_filler_audio=True, send_filler_audio=True,
allowed_idle_time_seconds=30 allowed_idle_time_seconds=30,
), ),
synthesizer_config=AzureSynthesizerConfig.from_output_device( synthesizer_config=AzureSynthesizerConfig.from_output_device(
output_device=TelephoneOutput(), voice_name="en-US-JennyNeural" output_device=TelephoneOutput(), voice_name="en-US-JennyNeural"

View file

@ -3,14 +3,14 @@ import logging
import signal import signal
from dotenv import load_dotenv from dotenv import load_dotenv
import os import os
from vocode.conversation import Conversation from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.helpers import create_microphone_input_and_speaker_output from vocode.helpers import create_microphone_input_and_speaker_output
from vocode.models.transcriber import ( from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig, DeepgramTranscriberConfig,
PunctuationEndpointingConfig, PunctuationEndpointingConfig,
GoogleTranscriberConfig, GoogleTranscriberConfig,
) )
from vocode.models.agent import ( from vocode.streaming.models.agent import (
ChatGPTAgentConfig, ChatGPTAgentConfig,
CutOffResponse, CutOffResponse,
FillerAudioConfig, FillerAudioConfig,
@ -20,9 +20,9 @@ from vocode.models.agent import (
LLMAgentConfig, LLMAgentConfig,
ChatGPTAgentConfig, ChatGPTAgentConfig,
) )
from vocode.models.message import BaseMessage from vocode.streaming.models.message import BaseMessage
from vocode.models.synthesizer import AzureSynthesizerConfig from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.user_implemented_agent.restful_agent import RESTfulAgent from vocode.streaming.user_implemented_agent.restful_agent import RESTfulAgent
import vocode import vocode
load_dotenv() load_dotenv()
@ -34,10 +34,10 @@ logging.root.setLevel(logging.INFO)
if __name__ == "__main__": if __name__ == "__main__":
microphone_input, speaker_output = create_microphone_input_and_speaker_output( microphone_input, speaker_output = create_microphone_input_and_speaker_output(
use_default_devices=False streaming=True, use_default_devices=False
) )
conversation = Conversation( conversation = StreamingConversation(
input_device=microphone_input, input_device=microphone_input,
output_device=speaker_output, output_device=speaker_output,
transcriber_config=DeepgramTranscriberConfig.from_input_device( transcriber_config=DeepgramTranscriberConfig.from_input_device(

View file

@ -0,0 +1,33 @@
# Demo script: a turn-based voice conversation wiring together Whisper (STT),
# ChatGPT (agent), and Azure (TTS) over locally selected audio devices.
import logging
from dotenv import load_dotenv
import os
from vocode.helpers import create_microphone_input_and_speaker_output
import vocode
from vocode.turn_based.agent.chat_gpt_agent import ChatGPTAgent
from vocode.turn_based.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.turn_based.transcriber.whisper_transcriber import WhisperTranscriber
from vocode.turn_based.turn_based_conversation import TurnBasedConversation
load_dotenv()
vocode.api_key = os.getenv("VOCODE_API_KEY")
if __name__ == "__main__":
    # streaming=False selects the turn-based device classes, which buffer a
    # whole utterance instead of streaming chunks; the user picks devices
    # interactively because use_default_devices is False.
    microphone_input, speaker_output = create_microphone_input_and_speaker_output(
        streaming=False, use_default_devices=False
    )
    conversation = TurnBasedConversation(
        input_device=microphone_input,
        output_device=speaker_output,
        transcriber=WhisperTranscriber(),
        agent=ChatGPTAgent(
            system_prompt="The AI is having a pleasant conversation about life",
            initial_message="Hello!",
        ),
        # Match the synthesizer's PCM rate to the chosen speaker device so
        # playback is not pitch-shifted.
        synthesizer=AzureSynthesizer(sampling_rate=speaker_output.sampling_rate),
    )
    # One iteration per user turn: record until Enter is pressed, then
    # transcribe, generate a reply, and speak it.
    while True:
        conversation.start_speech()
        input("Press enter to end speech")
        conversation.end_speech_and_respond()

View file

@ -1,6 +1,6 @@
from typing import AsyncGenerator from typing import AsyncGenerator
from vocode.user_implemented_agent.restful_agent import RESTfulAgent from vocode.streaming.user_implemented_agent.restful_agent import RESTfulAgent
from vocode.models.agent import ( from vocode.streaming.models.agent import (
RESTfulAgentOutput, RESTfulAgentOutput,
RESTfulAgentText, RESTfulAgentText,
RESTfulAgentEnd, RESTfulAgentEnd,
@ -9,7 +9,7 @@ from vocode.models.agent import (
WebSocketAgentTextMessage, WebSocketAgentTextMessage,
WebSocketAgentStopMessage, WebSocketAgentStopMessage,
) )
from vocode.user_implemented_agent.websocket_agent import WebSocketAgent from vocode.streaming.user_implemented_agent.websocket_agent import WebSocketAgent
class TestRESTfulAgent(RESTfulAgent): class TestRESTfulAgent(RESTfulAgent):

View file

@ -1,28 +1,69 @@
from typing import Union
import sounddevice as sd import sounddevice as sd
from .input_device.microphone_input import MicrophoneInput from vocode.streaming.input_device.microphone_input import (
from .output_device.speaker_output import SpeakerOutput MicrophoneInput as StreamingMicrophoneInput,
)
from vocode.streaming.output_device.speaker_output import (
SpeakerOutput as StreamingSpeakerOutput,
)
from vocode.turn_based.input_device.microphone_input import (
MicrophoneInput as TurnBasedMicrophoneInput,
)
from vocode.turn_based.output_device.speaker_output import (
SpeakerOutput as TurnBasedSpeakerOutput,
)
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _get_device_prompt(device_infos: list[dict]) -> str: def _get_device_prompt(device_infos: list[dict]) -> str:
return """Please select a device: return """Please select a device:
{} {}
Choice: """.format( Choice: """.format(
"\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos))) "\n".join(
f"{index}: {device['name']}" for index, device in enumerate(device_infos)
)
)
def create_microphone_input_and_speaker_output(use_default_devices=False, mic_sampling_rate=None, speaker_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]:
def create_microphone_input_and_speaker_output(
streaming: bool = True,
use_default_devices=False,
mic_sampling_rate=None,
speaker_sampling_rate=None,
) -> Union[
tuple[StreamingMicrophoneInput, StreamingSpeakerOutput],
tuple[TurnBasedMicrophoneInput, TurnBasedSpeakerOutput],
]:
device_infos = sd.query_devices() device_infos = sd.query_devices()
input_device_infos = list(filter(lambda device_info: device_info['max_input_channels'] > 0, device_infos)) input_device_infos = list(
output_device_infos = list(filter(lambda device_info: device_info['max_output_channels'] > 0, device_infos)) filter(lambda device_info: device_info["max_input_channels"] > 0, device_infos)
)
output_device_infos = list(
filter(lambda device_info: device_info["max_output_channels"] > 0, device_infos)
)
if use_default_devices: if use_default_devices:
input_device_info = sd.query_devices(kind='input') input_device_info = sd.query_devices(kind="input")
output_device_info = sd.query_devices(kind='output') output_device_info = sd.query_devices(kind="output")
else: else:
input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))] input_device_info = input_device_infos[
output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))] int(input(_get_device_prompt(input_device_infos)))
logger.info("Using microphone input device: %s", input_device_info['name']) ]
microphone_input = MicrophoneInput(input_device_info, sampling_rate=mic_sampling_rate) output_device_info = output_device_infos[
logger.info("Using speaker output device: %s", output_device_info['name']) int(input(_get_device_prompt(output_device_infos)))
speaker_output = SpeakerOutput(output_device_info, sampling_rate=speaker_sampling_rate) ]
return microphone_input, speaker_output logger.info("Using microphone input device: %s", input_device_info["name"])
microphone_class = (
StreamingMicrophoneInput if streaming else TurnBasedMicrophoneInput
)
speaker_class = StreamingSpeakerOutput if streaming else TurnBasedSpeakerOutput
microphone_input = microphone_class(
input_device_info, sampling_rate=mic_sampling_rate
)
logger.info("Using speaker output device: %s", output_device_info["name"])
speaker_output = speaker_class(
output_device_info, sampling_rate=speaker_sampling_rate
)
return microphone_input, speaker_output

View file

@ -1,14 +1,16 @@
from ..models.audio_encoding import AudioEncoding from vocode.streaming.models.audio_encoding import AudioEncoding
import queue import queue
from typing import Optional from typing import Optional
class BaseInputDevice():
def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int): class BaseInputDevice:
def __init__(
self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int
):
self.sampling_rate = sampling_rate self.sampling_rate = sampling_rate
self.audio_encoding = audio_encoding self.audio_encoding = audio_encoding
self.chunk_size = chunk_size self.chunk_size = chunk_size
self.queue = queue.Queue() self.queue = queue.Queue()
def get_audio(self) -> Optional[bytes]: def get_audio(self) -> Optional[bytes]:
raise NotImplementedError raise NotImplementedError

View file

@ -4,25 +4,33 @@ from typing import Optional
import queue import queue
import wave import wave
from .base_input_device import BaseInputDevice from vocode.streaming.input_device.base_input_device import BaseInputDevice
from ..models.audio_encoding import AudioEncoding from vocode.streaming.models.audio_encoding import AudioEncoding
class MicrophoneInput(BaseInputDevice): class MicrophoneInput(BaseInputDevice):
DEFAULT_SAMPLING_RATE = 44100 DEFAULT_SAMPLING_RATE = 44100
DEFAULT_CHUNK_SIZE = 2048 DEFAULT_CHUNK_SIZE = 2048
def __init__(self, device_info: dict, sampling_rate: int = None, chunk_size: int = DEFAULT_CHUNK_SIZE, microphone_gain: int = 1): def __init__(
self,
device_info: dict,
sampling_rate: int = None,
chunk_size: int = DEFAULT_CHUNK_SIZE,
microphone_gain: int = 1,
):
self.device_info = device_info self.device_info = device_info
sampling_rate = sampling_rate or (self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE)) sampling_rate = sampling_rate or (
self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
)
super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size) super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size)
self.stream = sd.InputStream( self.stream = sd.InputStream(
dtype=np.int16, dtype=np.int16,
channels=1, channels=1,
samplerate=self.sampling_rate, samplerate=self.sampling_rate,
blocksize=self.chunk_size, blocksize=self.chunk_size,
device=int(self.device_info['index']), device=int(self.device_info["index"]),
callback=self._stream_callback callback=self._stream_callback,
) )
self.stream.start() self.stream.start()
self.queue = queue.Queue() self.queue = queue.Queue()
@ -40,4 +48,4 @@ class MicrophoneInput(BaseInputDevice):
try: try:
return self.queue.get_nowait() return self.queue.get_nowait()
except queue.Empty: except queue.Empty:
return None return None

View file

@ -1,5 +1,7 @@
from vocode.input_device.base_input_device import BaseInputDevice from vocode.streaming.input_device.base_input_device import (
from vocode.models.audio_encoding import AudioEncoding BaseInputDevice,
)
from vocode.streaming.models.audio_encoding import AudioEncoding
class TelephoneInput(BaseInputDevice): class TelephoneInput(BaseInputDevice):

View file

@ -3,7 +3,7 @@ from enum import Enum
from pydantic import validator from pydantic import validator
from vocode.models.message import BaseMessage from vocode.streaming.models.message import BaseMessage
from .model import TypedModel, BaseModel from .model import TypedModel, BaseModel
FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS = 0.5 FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS = 0.5

View file

@ -1,8 +1,8 @@
from typing import Optional from typing import Optional
from vocode.models.model import BaseModel from vocode.streaming.models.model import BaseModel
from vocode.models.agent import AgentConfig from vocode.streaming.models.agent import AgentConfig
from vocode.models.synthesizer import SynthesizerConfig from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.models.transcriber import TranscriberConfig from vocode.streaming.models.transcriber import TranscriberConfig
class TwilioConfig(BaseModel): class TwilioConfig(BaseModel):

View file

@ -1,8 +1,11 @@
from enum import Enum from enum import Enum
from typing import Optional from typing import Optional
from vocode.streaming.input_device.base_input_device import (
BaseInputDevice,
)
from .audio_encoding import AudioEncoding from .audio_encoding import AudioEncoding
from .model import BaseModel, TypedModel from .model import BaseModel, TypedModel
from ..input_device.base_input_device import BaseInputDevice
class TranscriberType(str, Enum): class TranscriberType(str, Enum):

View file

@ -1,7 +1,7 @@
from ..models.audio_encoding import AudioEncoding from vocode.streaming.models.audio_encoding import AudioEncoding
class BaseOutputDevice: class BaseOutputDevice:
def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding): def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding):
self.sampling_rate = sampling_rate self.sampling_rate = sampling_rate
self.audio_encoding = audio_encoding self.audio_encoding = audio_encoding
@ -11,5 +11,3 @@ class BaseOutputDevice:
async def maybe_send_mark_async(self, message): async def maybe_send_mark_async(self, message):
pass pass

View file

@ -2,21 +2,28 @@ import sounddevice as sd
import numpy as np import numpy as np
from .base_output_device import BaseOutputDevice from .base_output_device import BaseOutputDevice
from ..models.audio_encoding import AudioEncoding from vocode.streaming.models.audio_encoding import AudioEncoding
class SpeakerOutput(BaseOutputDevice): class SpeakerOutput(BaseOutputDevice):
DEFAULT_SAMPLING_RATE = 44100 DEFAULT_SAMPLING_RATE = 44100
def __init__(self, device_info: dict, sampling_rate: int = None, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16): def __init__(
self,
device_info: dict,
sampling_rate: int = None,
audio_encoding: AudioEncoding = AudioEncoding.LINEAR16,
):
self.device_info = device_info self.device_info = device_info
sampling_rate = sampling_rate or int(self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE)) sampling_rate = sampling_rate or int(
self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
)
super().__init__(sampling_rate, audio_encoding) super().__init__(sampling_rate, audio_encoding)
self.stream = sd.OutputStream( self.stream = sd.OutputStream(
channels=1, channels=1,
samplerate=self.sampling_rate, samplerate=self.sampling_rate,
dtype=np.int16, dtype=np.int16,
device=int(self.device_info['index']) device=int(self.device_info["index"]),
) )
self.stream.start() self.stream.start()
@ -24,4 +31,4 @@ class SpeakerOutput(BaseOutputDevice):
self.stream.write(np.frombuffer(chunk, dtype=np.int16)) self.stream.write(np.frombuffer(chunk, dtype=np.int16))
def terminate(self): def terminate(self):
self.stream.close() self.stream.close()

View file

@ -1,5 +1,5 @@
from .base_output_device import BaseOutputDevice from .base_output_device import BaseOutputDevice
from ..models.audio_encoding import AudioEncoding from vocode.streaming.models.audio_encoding import AudioEncoding
class TelephoneOutput(BaseOutputDevice): class TelephoneOutput(BaseOutputDevice):

View file

@ -8,16 +8,24 @@ import logging
import threading import threading
import queue import queue
import vocode import vocode
from vocode.input_device.base_input_device import BaseInputDevice from vocode.streaming.input_device.base_input_device import (
from vocode.output_device.base_output_device import BaseOutputDevice BaseInputDevice,
from vocode.models.transcriber import TranscriberConfig )
from vocode.models.agent import AgentConfig from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.models.synthesizer import SynthesizerConfig from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.models.websocket import ReadyMessage, AudioMessage, StartMessage, StopMessage from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.websocket import (
ReadyMessage,
AudioMessage,
StartMessage,
StopMessage,
)
load_dotenv() load_dotenv()
class Conversation:
class StreamingConversation:
def __init__( def __init__(
self, self,
input_device: BaseInputDevice, input_device: BaseInputDevice,
@ -61,14 +69,16 @@ class Conversation:
loop.run_until_complete(run()) loop.run_until_complete(run())
async def start(self): async def start(self):
async with websockets.connect(f"{self.vocode_websocket_url}?key={vocode.api_key}") as ws: async with websockets.connect(
f"{self.vocode_websocket_url}?key={vocode.api_key}"
) as ws:
async def sender(ws: WebSocketClientProtocol): async def sender(ws: WebSocketClientProtocol):
start_message = StartMessage( start_message = StartMessage(
transcriber_config=self.transcriber_config, transcriber_config=self.transcriber_config,
agent_config=self.agent_config, agent_config=self.agent_config,
synthesizer_config=self.synthesizer_config, synthesizer_config=self.synthesizer_config,
conversation_id=self.id conversation_id=self.id,
) )
await ws.send(start_message.json()) await ws.send(start_message.json())
await self.wait_for_ready() await self.wait_for_ready()

View file

@ -4,10 +4,15 @@ import requests
import uvicorn import uvicorn
import vocode import vocode
from vocode.models.transcriber import TranscriberConfig from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.models.synthesizer import SynthesizerConfig from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.models.agent import AgentConfig from vocode.streaming.models.agent import AgentConfig
from vocode.models.telephony import CreateInboundCall, TwilioConfig, TwilioConfig from vocode.streaming.models.telephony import (
CreateInboundCall,
TwilioConfig,
TwilioConfig,
)
class InboundCallServer: class InboundCallServer:
def __init__( def __init__(

View file

@ -2,9 +2,9 @@ from typing import Optional
import requests import requests
import vocode import vocode
from vocode.models.agent import AgentConfig from vocode.streaming.models.agent import AgentConfig
from vocode.models.synthesizer import SynthesizerConfig from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.models.transcriber import TranscriberConfig from vocode.streaming.models.transcriber import TranscriberConfig
from ..models.telephony import ( from ..models.telephony import (
CallEntity, CallEntity,
CreateOutboundCall, CreateOutboundCall,
@ -31,8 +31,12 @@ class OutboundCall:
self.synthesizer_config = synthesizer_config self.synthesizer_config = synthesizer_config
self.conversation_id = conversation_id self.conversation_id = conversation_id
self.twilio_config = twilio_config self.twilio_config = twilio_config
self.vocode_create_outbound_call_url = f"https://{vocode.base_url}/create_outbound_call" self.vocode_create_outbound_call_url = (
self.vocode_end_outbound_call_url = f"https://{vocode.base_url}/end_outbound_call" f"https://{vocode.base_url}/create_outbound_call"
)
self.vocode_end_outbound_call_url = (
f"https://{vocode.base_url}/end_outbound_call"
)
def start(self) -> str: def start(self) -> str:
response = requests.post( response = requests.post(

View file

@ -2,11 +2,11 @@ from typing import Optional
import requests import requests
import vocode import vocode
from vocode.models.agent import AgentConfig from vocode.streaming.models.agent import AgentConfig
from vocode.models.synthesizer import SynthesizerConfig from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.models.transcriber import TranscriberConfig from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.telephony.outbound_call import OutboundCall from vocode.streaming.telephony.outbound_call import OutboundCall
from vocode.models.telephony import ( from vocode.streaming.models.telephony import (
CallEntity, CallEntity,
DialIntoZoomCall, DialIntoZoomCall,
TwilioConfig, TwilioConfig,

View file

@ -0,0 +1,9 @@
from typing import Optional
class BaseAgent:
    """Abstract turn-based agent.

    Holds an optional greeting that the conversation plays before the first
    user turn, and defines the respond() contract subclasses must implement.
    """

    def __init__(self, initial_message: Optional[str] = None):
        # Message spoken before the user's first turn; None means no greeting.
        self.initial_message = initial_message

    def respond(self, human_input: str):
        """Return the agent's reply to *human_input*; subclasses must override."""
        raise NotImplementedError

View file

@ -0,0 +1,45 @@
from typing import Optional
from langchain.prompts import (
ChatPromptTemplate,
MessagesPlaceholder,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.chains import ConversationChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from vocode.turn_based.agent.base_agent import BaseAgent
class ChatGPTAgent(BaseAgent):
    """Turn-based agent backed by an OpenAI chat model via LangChain's ConversationChain."""

    def __init__(
        self,
        system_prompt: str,
        initial_message: Optional[str] = None,
        model_name: str = "gpt-3.5-turbo",
        temperature: float = 0.7,
        max_tokens: int = 100,
    ):
        """Build the prompt, memory, and LLM chain.

        Args:
            system_prompt: system-role instructions prepended to every turn.
            initial_message: optional greeting; also seeded into memory as an
                AI message so the model knows it already greeted the user.
            model_name: OpenAI chat model identifier.
            temperature: sampling temperature passed to the model.
            max_tokens: response length cap passed to the model.
        """
        super().__init__(initial_message=initial_message)
        # Prompt layout: system instructions, then running history, then the
        # new user turn ("{input}" is filled by ConversationChain.predict).
        self.prompt = ChatPromptTemplate.from_messages(
            [
                SystemMessagePromptTemplate.from_template(system_prompt),
                MessagesPlaceholder(variable_name="history"),
                HumanMessagePromptTemplate.from_template("{input}"),
            ]
        )
        # return_messages=True keeps history as message objects, which is the
        # form MessagesPlaceholder expects.
        self.memory = ConversationBufferMemory(return_messages=True)
        if initial_message:
            self.memory.chat_memory.add_ai_message(initial_message)
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        self.conversation = ConversationChain(
            memory=self.memory, prompt=self.prompt, llm=self.llm
        )

    def respond(self, human_input: str):
        """Return the model's reply and record the exchange in memory."""
        return self.conversation.predict(input=human_input)

View file

@ -0,0 +1,6 @@
from vocode.turn_based.agent.base_agent import BaseAgent
class EchoAgent(BaseAgent):
    """Trivial agent that parrots the user's words back; handy for smoke tests."""

    def respond(self, human_input: str):
        # Identity response: the transcript comes back unchanged.
        echoed = human_input
        return echoed

View file

@ -0,0 +1,9 @@
from pydub import AudioSegment
class BaseInputDevice:
    """Interface for turn-based audio sources that record one utterance at a time."""

    def start_listening(self):
        """Begin capturing audio; subclasses must override."""
        raise NotImplementedError

    def end_listening(self) -> AudioSegment:
        """Stop capturing and return everything recorded; subclasses must override."""
        raise NotImplementedError

View file

@ -0,0 +1,59 @@
from typing import Optional
import sounddevice as sd
import numpy as np
from pydub import AudioSegment
import io
import wave
from vocode.turn_based.input_device.base_input_device import BaseInputDevice
class MicrophoneInput(BaseInputDevice):
    """Turn-based microphone input.

    Records mono 16-bit PCM between start_listening() and end_listening()
    into an in-memory WAV buffer via a sounddevice InputStream, and returns
    the capture as a pydub AudioSegment.
    """

    DEFAULT_SAMPLING_RATE = 44100
    DEFAULT_CHUNK_SIZE = 2048

    def __init__(
        self,
        device_info: dict,
        sampling_rate: Optional[int] = None,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
    ):
        """
        Args:
            device_info: sounddevice device-info dict (must contain "index";
                may contain "default_samplerate").
            sampling_rate: capture rate in Hz; defaults to the device's
                advertised rate, else DEFAULT_SAMPLING_RATE.
            chunk_size: stream blocksize in frames.
        """
        self.device_info = device_info
        self.sampling_rate = sampling_rate or (
            self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        self.chunk_size = chunk_size
        self.buffer: Optional[io.BytesIO] = None
        self.wave_writer: Optional[wave.Wave_write] = None

    def create_stream(self):
        """Return a fresh mono int16 input stream wired to _stream_callback."""
        return sd.InputStream(
            dtype=np.int16,
            channels=1,
            samplerate=self.sampling_rate,
            blocksize=self.chunk_size,
            device=int(self.device_info["index"]),
            callback=self._stream_callback,
        )

    def _stream_callback(self, in_data: np.ndarray, *_args):
        # Runs on the audio thread: append the raw PCM frames to the WAV writer.
        audio_bytes = in_data.tobytes()
        self.wave_writer.writeframes(audio_bytes)

    def create_buffer(self):
        """Return (BytesIO, Wave_write) for a new in-memory mono 16-bit WAV."""
        in_memory_wav = io.BytesIO()
        wave_writer = wave.open(in_memory_wav, "wb")
        wave_writer.setnchannels(1)
        wave_writer.setsampwidth(2)  # 2 bytes == 16-bit samples
        wave_writer.setframerate(self.sampling_rate)
        return in_memory_wav, wave_writer

    def start_listening(self):
        """Begin a new recording, replacing any previous buffer."""
        self.buffer, self.wave_writer = self.create_buffer()
        self.stream = self.create_stream()
        self.stream.start()

    def end_listening(self) -> AudioSegment:
        """Stop recording and return the captured audio as an AudioSegment."""
        self.stream.stop()
        # BUGFIX: release the PortAudio stream. A new stream is created on
        # every start_listening(), so a merely-stopped stream leaks a device
        # handle per turn.
        self.stream.close()
        # BUGFIX: close the wave writer so it patches the RIFF/data chunk
        # sizes in the header. Without this the WAV header reports zero
        # frames and wave-based readers decode an empty segment. Closing the
        # writer does not close the underlying BytesIO we passed in.
        self.wave_writer.close()
        self.buffer.seek(0)
        return AudioSegment.from_wav(self.buffer)

View file

@ -0,0 +1,9 @@
from pydub import AudioSegment
class BaseOutputDevice:
    """Interface for turn-based audio sinks."""

    def send_audio(self, audio: AudioSegment) -> None:
        """Play or forward *audio*; subclasses must override."""
        raise NotImplementedError

    def terminate(self):
        """Release any underlying resources; the default is a no-op."""
        pass

View file

@ -0,0 +1,32 @@
import sounddevice as sd
import numpy as np
from pydub import AudioSegment
from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
class SpeakerOutput(BaseOutputDevice):
    """Plays AudioSegments on a local device via a mono int16 sounddevice OutputStream."""

    DEFAULT_SAMPLING_RATE = 44100

    def __init__(
        self,
        device_info: dict,
        sampling_rate: int = None,
    ):
        self.device_info = device_info
        # Fall back to the device's advertised rate when none is requested.
        device_default = self.device_info.get(
            "default_samplerate", self.DEFAULT_SAMPLING_RATE
        )
        self.sampling_rate = sampling_rate or int(device_default)
        self.stream = sd.OutputStream(
            channels=1,
            samplerate=self.sampling_rate,
            dtype=np.int16,
            device=int(self.device_info["index"]),
        )
        self.stream.start()

    def send_audio(self, audio_segment: AudioSegment):
        """Write the segment's raw 16-bit PCM frames to the stream (blocking)."""
        samples = np.frombuffer(audio_segment.raw_data, dtype=np.int16)
        self.stream.write(samples)

    def terminate(self):
        """Close the underlying output stream."""
        self.stream.close()

View file

@ -0,0 +1,53 @@
import os
from dotenv import load_dotenv
import azure.cognitiveservices.speech as speechsdk
from pydub import AudioSegment
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
load_dotenv()
class AzureSynthesizer(BaseSynthesizer):
    """Synthesizes speech with Azure Cognitive Services into raw 16-bit mono PCM.

    Credentials are read from the AZURE_SPEECH_KEY / AZURE_SPEECH_REGION
    environment variables.
    """

    def __init__(self, sampling_rate: int):
        """
        Args:
            sampling_rate: desired PCM rate in Hz. One of 8000/16000/24000/
                44100/48000 selects the matching raw output format; any other
                value leaves the SDK's default output format in place.
        """
        self.sampling_rate = sampling_rate
        speech_config = speechsdk.SpeechConfig(
            subscription=os.environ.get("AZURE_SPEECH_KEY"),
            region=os.environ.get("AZURE_SPEECH_REGION"),
        )
        # Uniform rate -> format table. (The original mixed `if` and `elif`
        # arms; the rates are mutually exclusive so this is behaviorally
        # identical, just consistent.)
        output_formats = {
            44100: speechsdk.SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm,
            48000: speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm,
            24000: speechsdk.SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm,
            16000: speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm,
            8000: speechsdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm,
        }
        if self.sampling_rate in output_formats:
            speech_config.set_speech_synthesis_output_format(
                output_formats[self.sampling_rate]
            )
        # NOTE(review): an unsupported rate silently keeps the SDK default
        # format, which will not match self.sampling_rate in synthesize() —
        # confirm whether this should raise instead.
        # audio_config=None keeps the audio in memory instead of playing it.
        self.synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=None
        )

    def synthesize(self, text) -> AudioSegment:
        """Synthesize *text* and return it as a 16-bit mono AudioSegment.

        Raises:
            Exception: if Azure reports anything other than
                SynthesizingAudioCompleted.
        """
        result = self.synthesizer.speak_text(text)
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            return AudioSegment(
                result.audio_data,
                sample_width=2,  # 16-bit samples
                frame_rate=self.sampling_rate,
                channels=1,
            )
        else:
            raise Exception("Could not synthesize audio")

View file

@ -0,0 +1,6 @@
from pydub import AudioSegment
class BaseSynthesizer:
    """Interface for turn-based text-to-speech backends."""

    def synthesize(self, text) -> AudioSegment:
        """Convert *text* to audio; subclasses must override."""
        raise NotImplementedError

View file

@ -0,0 +1,6 @@
from pydub import AudioSegment
class BaseTranscriber:
    """Interface for turn-based speech-to-text backends."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Return the text spoken in *audio_segment*; subclasses must override."""
        raise NotImplementedError

View file

@ -0,0 +1,21 @@
from pydub import AudioSegment
import io
import os
from dotenv import load_dotenv
import openai
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
class WhisperTranscriber(BaseTranscriber):
    """Transcribes AudioSegments with OpenAI's hosted Whisper model."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Export the segment as an in-memory WAV and return Whisper's transcript."""
        wav_buffer = io.BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        wav_buffer.seek(0)
        # The OpenAI client infers the upload format from the file-like
        # object's name attribute, so give the buffer one.
        wav_buffer.name = "whisper.wav"
        transcript = openai.Audio.transcribe("whisper-1", wav_buffer)
        return transcript.text

View file

@ -0,0 +1,38 @@
from vocode.turn_based.agent.base_agent import BaseAgent
from vocode.turn_based.input_device.base_input_device import (
BaseInputDevice,
)
from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
class TurnBasedConversation:
    """Coordinates one listen -> transcribe -> respond -> speak cycle per user turn."""

    def __init__(
        self,
        input_device: BaseInputDevice,
        transcriber: BaseTranscriber,
        agent: BaseAgent,
        synthesizer: BaseSynthesizer,
        output_device: BaseOutputDevice,
    ):
        self.input_device = input_device
        self.transcriber = transcriber
        self.agent = agent
        self.synthesizer = synthesizer
        self.output_device = output_device
        # Greet the user immediately if the agent defines an opening line.
        self.maybe_play_initial_message()

    def maybe_play_initial_message(self):
        """Speak the agent's greeting, if it has one."""
        greeting = self.agent.initial_message
        if greeting:
            self.output_device.send_audio(self.synthesizer.synthesize(greeting))

    def start_speech(self):
        """Open the microphone for the user's turn."""
        self.input_device.start_listening()

    def end_speech_and_respond(self):
        """Close the mic, run the STT -> agent -> TTS pipeline, and play the reply."""
        human_input = self.transcriber.transcribe(self.input_device.end_listening())
        agent_response = self.agent.respond(human_input)
        self.output_device.send_audio(self.synthesizer.synthesize(agent_response))