First pass at turn-based conversation

This commit is contained in:
Ajay Raj 2023-03-20 15:37:23 -07:00
commit 518a0f2b53
40 changed files with 503 additions and 99 deletions

View file

@ -10,9 +10,9 @@ import signal
from vocode.conversation import Conversation
from vocode.helpers import create_microphone_input_and_speaker_output
from vocode.models.transcriber import DeepgramTranscriberConfig
from vocode.models.agent import LLMAgentConfig
from vocode.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.models.transcriber import DeepgramTranscriberConfig
from vocode.streaming.models.agent import LLMAgentConfig
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
if __name__ == "__main__":
microphone_input, speaker_output = create_microphone_input_and_speaker_output(use_first_available_device=True)

View file

@ -1,8 +1,6 @@
from vocode.telephony.inbound_call_server import InboundCallServer
from vocode.models.agent import EchoAgentConfig
from vocode.streaming.telephony.inbound_call_server import InboundCallServer
from vocode.streaming.models.agent import EchoAgentConfig
if __name__ == '__main__':
server = InboundCallServer(
agent_config=EchoAgentConfig(initial_message="hello!")
)
if __name__ == "__main__":
server = InboundCallServer(agent_config=EchoAgentConfig(initial_message="hello!"))
server.run(port=3001)

View file

@ -1,14 +1,14 @@
from vocode.models.synthesizer import AzureSynthesizerConfig
from vocode.output_device.telephone_output import TelephoneOutput
from vocode.telephony.outbound_call import OutboundCall
from vocode.models.telephony import CallEntity
from vocode.models.agent import (
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.output_device.telephone_output import TelephoneOutput
from vocode.streaming.telephony.outbound_call import OutboundCall
from vocode.streaming.models.telephony import CallEntity
from vocode.streaming.models.agent import (
EchoAgentConfig,
ChatGPTAgentConfig,
WebSocketUserImplementedAgentConfig,
)
from vocode.models.message import BaseMessage
from vocode.telephony.zoom_dial_in import ZoomDialIn
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.telephony.zoom_dial_in import ZoomDialIn
if __name__ == "__main__":
call = ZoomDialIn(
@ -24,7 +24,7 @@ if __name__ == "__main__":
generate_responses=True,
end_conversation_on_goodbye=True,
send_filler_audio=True,
allowed_idle_time_seconds=30
allowed_idle_time_seconds=30,
),
synthesizer_config=AzureSynthesizerConfig.from_output_device(
output_device=TelephoneOutput(), voice_name="en-US-JennyNeural"

View file

@ -3,14 +3,14 @@ import logging
import signal
from dotenv import load_dotenv
import os
from vocode.conversation import Conversation
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.helpers import create_microphone_input_and_speaker_output
from vocode.models.transcriber import (
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
GoogleTranscriberConfig,
)
from vocode.models.agent import (
from vocode.streaming.models.agent import (
ChatGPTAgentConfig,
CutOffResponse,
FillerAudioConfig,
@ -20,9 +20,9 @@ from vocode.models.agent import (
LLMAgentConfig,
ChatGPTAgentConfig,
)
from vocode.models.message import BaseMessage
from vocode.models.synthesizer import AzureSynthesizerConfig
from vocode.user_implemented_agent.restful_agent import RESTfulAgent
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.user_implemented_agent.restful_agent import RESTfulAgent
import vocode
load_dotenv()
@ -34,10 +34,10 @@ logging.root.setLevel(logging.INFO)
if __name__ == "__main__":
microphone_input, speaker_output = create_microphone_input_and_speaker_output(
use_default_devices=False
streaming=True, use_default_devices=False
)
conversation = Conversation(
conversation = StreamingConversation(
input_device=microphone_input,
output_device=speaker_output,
transcriber_config=DeepgramTranscriberConfig.from_input_device(

View file

@ -0,0 +1,33 @@
import logging
from dotenv import load_dotenv
import os
from vocode.helpers import create_microphone_input_and_speaker_output
import vocode
from vocode.turn_based.agent.chat_gpt_agent import ChatGPTAgent
from vocode.turn_based.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.turn_based.transcriber.whisper_transcriber import WhisperTranscriber
from vocode.turn_based.turn_based_conversation import TurnBasedConversation
load_dotenv()

vocode.api_key = os.getenv("VOCODE_API_KEY")

if __name__ == "__main__":
    # streaming=False selects the turn-based microphone/speaker implementations;
    # the user is prompted to pick concrete devices.
    mic, speaker = create_microphone_input_and_speaker_output(
        streaming=False, use_default_devices=False
    )
    conversation = TurnBasedConversation(
        input_device=mic,
        output_device=speaker,
        transcriber=WhisperTranscriber(),
        agent=ChatGPTAgent(
            system_prompt="The AI is having a pleasant conversation about life",
            initial_message="Hello!",
        ),
        synthesizer=AzureSynthesizer(sampling_rate=speaker.sampling_rate),
    )
    # Alternate turns forever: record until the user presses enter, then reply.
    while True:
        conversation.start_speech()
        input("Press enter to end speech")
        conversation.end_speech_and_respond()

View file

@ -1,6 +1,6 @@
from typing import AsyncGenerator
from vocode.user_implemented_agent.restful_agent import RESTfulAgent
from vocode.models.agent import (
from vocode.streaming.user_implemented_agent.restful_agent import RESTfulAgent
from vocode.streaming.models.agent import (
RESTfulAgentOutput,
RESTfulAgentText,
RESTfulAgentEnd,
@ -9,7 +9,7 @@ from vocode.models.agent import (
WebSocketAgentTextMessage,
WebSocketAgentStopMessage,
)
from vocode.user_implemented_agent.websocket_agent import WebSocketAgent
from vocode.streaming.user_implemented_agent.websocket_agent import WebSocketAgent
class TestRESTfulAgent(RESTfulAgent):

View file

@ -1,28 +1,69 @@
from typing import Union
import sounddevice as sd
from .input_device.microphone_input import MicrophoneInput
from .output_device.speaker_output import SpeakerOutput
from vocode.streaming.input_device.microphone_input import (
MicrophoneInput as StreamingMicrophoneInput,
)
from vocode.streaming.output_device.speaker_output import (
SpeakerOutput as StreamingSpeakerOutput,
)
from vocode.turn_based.input_device.microphone_input import (
MicrophoneInput as TurnBasedMicrophoneInput,
)
from vocode.turn_based.output_device.speaker_output import (
SpeakerOutput as TurnBasedSpeakerOutput,
)
import logging
logger = logging.getLogger(__name__)
def _get_device_prompt(device_infos: list[dict]) -> str:
return """Please select a device:
{}
Choice: """.format(
"\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos)))
"\n".join(
f"{index}: {device['name']}" for index, device in enumerate(device_infos)
)
)
def create_microphone_input_and_speaker_output(use_default_devices=False, mic_sampling_rate=None, speaker_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]:
def create_microphone_input_and_speaker_output(
streaming: bool = True,
use_default_devices=False,
mic_sampling_rate=None,
speaker_sampling_rate=None,
) -> Union[
tuple[StreamingMicrophoneInput, StreamingSpeakerOutput],
tuple[TurnBasedMicrophoneInput, TurnBasedSpeakerOutput],
]:
device_infos = sd.query_devices()
input_device_infos = list(filter(lambda device_info: device_info['max_input_channels'] > 0, device_infos))
output_device_infos = list(filter(lambda device_info: device_info['max_output_channels'] > 0, device_infos))
input_device_infos = list(
filter(lambda device_info: device_info["max_input_channels"] > 0, device_infos)
)
output_device_infos = list(
filter(lambda device_info: device_info["max_output_channels"] > 0, device_infos)
)
if use_default_devices:
input_device_info = sd.query_devices(kind='input')
output_device_info = sd.query_devices(kind='output')
input_device_info = sd.query_devices(kind="input")
output_device_info = sd.query_devices(kind="output")
else:
input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))]
output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))]
logger.info("Using microphone input device: %s", input_device_info['name'])
microphone_input = MicrophoneInput(input_device_info, sampling_rate=mic_sampling_rate)
logger.info("Using speaker output device: %s", output_device_info['name'])
speaker_output = SpeakerOutput(output_device_info, sampling_rate=speaker_sampling_rate)
input_device_info = input_device_infos[
int(input(_get_device_prompt(input_device_infos)))
]
output_device_info = output_device_infos[
int(input(_get_device_prompt(output_device_infos)))
]
logger.info("Using microphone input device: %s", input_device_info["name"])
microphone_class = (
StreamingMicrophoneInput if streaming else TurnBasedMicrophoneInput
)
speaker_class = StreamingSpeakerOutput if streaming else TurnBasedSpeakerOutput
microphone_input = microphone_class(
input_device_info, sampling_rate=mic_sampling_rate
)
logger.info("Using speaker output device: %s", output_device_info["name"])
speaker_output = speaker_class(
output_device_info, sampling_rate=speaker_sampling_rate
)
return microphone_input, speaker_output

View file

@ -1,10 +1,12 @@
from ..models.audio_encoding import AudioEncoding
from vocode.streaming.models.audio_encoding import AudioEncoding
import queue
from typing import Optional
class BaseInputDevice():
def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int):
class BaseInputDevice:
def __init__(
self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int
):
self.sampling_rate = sampling_rate
self.audio_encoding = audio_encoding
self.chunk_size = chunk_size

View file

@ -4,25 +4,33 @@ from typing import Optional
import queue
import wave
from .base_input_device import BaseInputDevice
from ..models.audio_encoding import AudioEncoding
from vocode.streaming.input_device.base_input_device import BaseInputDevice
from vocode.streaming.models.audio_encoding import AudioEncoding
class MicrophoneInput(BaseInputDevice):
DEFAULT_SAMPLING_RATE = 44100
DEFAULT_CHUNK_SIZE = 2048
def __init__(self, device_info: dict, sampling_rate: int = None, chunk_size: int = DEFAULT_CHUNK_SIZE, microphone_gain: int = 1):
def __init__(
self,
device_info: dict,
sampling_rate: int = None,
chunk_size: int = DEFAULT_CHUNK_SIZE,
microphone_gain: int = 1,
):
self.device_info = device_info
sampling_rate = sampling_rate or (self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE))
sampling_rate = sampling_rate or (
self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
)
super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size)
self.stream = sd.InputStream(
dtype=np.int16,
channels=1,
samplerate=self.sampling_rate,
blocksize=self.chunk_size,
device=int(self.device_info['index']),
callback=self._stream_callback
device=int(self.device_info["index"]),
callback=self._stream_callback,
)
self.stream.start()
self.queue = queue.Queue()

View file

@ -1,5 +1,7 @@
from vocode.input_device.base_input_device import BaseInputDevice
from vocode.models.audio_encoding import AudioEncoding
from vocode.streaming.input_device.base_input_device import (
BaseInputDevice,
)
from vocode.streaming.models.audio_encoding import AudioEncoding
class TelephoneInput(BaseInputDevice):

View file

@ -3,7 +3,7 @@ from enum import Enum
from pydantic import validator
from vocode.models.message import BaseMessage
from vocode.streaming.models.message import BaseMessage
from .model import TypedModel, BaseModel
FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS = 0.5

View file

@ -1,8 +1,8 @@
from typing import Optional
from vocode.models.model import BaseModel
from vocode.models.agent import AgentConfig
from vocode.models.synthesizer import SynthesizerConfig
from vocode.models.transcriber import TranscriberConfig
from vocode.streaming.models.model import BaseModel
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
class TwilioConfig(BaseModel):

View file

@ -1,8 +1,11 @@
from enum import Enum
from typing import Optional
from vocode.streaming.input_device.base_input_device import (
BaseInputDevice,
)
from .audio_encoding import AudioEncoding
from .model import BaseModel, TypedModel
from ..input_device.base_input_device import BaseInputDevice
class TranscriberType(str, Enum):

View file

@ -1,7 +1,7 @@
from ..models.audio_encoding import AudioEncoding
from vocode.streaming.models.audio_encoding import AudioEncoding
class BaseOutputDevice:
def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding):
self.sampling_rate = sampling_rate
self.audio_encoding = audio_encoding
@ -11,5 +11,3 @@ class BaseOutputDevice:
async def maybe_send_mark_async(self, message):
pass

View file

@ -2,21 +2,28 @@ import sounddevice as sd
import numpy as np
from .base_output_device import BaseOutputDevice
from ..models.audio_encoding import AudioEncoding
from vocode.streaming.models.audio_encoding import AudioEncoding
class SpeakerOutput(BaseOutputDevice):
DEFAULT_SAMPLING_RATE = 44100
def __init__(self, device_info: dict, sampling_rate: int = None, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16):
def __init__(
self,
device_info: dict,
sampling_rate: int = None,
audio_encoding: AudioEncoding = AudioEncoding.LINEAR16,
):
self.device_info = device_info
sampling_rate = sampling_rate or int(self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE))
sampling_rate = sampling_rate or int(
self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
)
super().__init__(sampling_rate, audio_encoding)
self.stream = sd.OutputStream(
channels=1,
samplerate=self.sampling_rate,
dtype=np.int16,
device=int(self.device_info['index'])
device=int(self.device_info["index"]),
)
self.stream.start()

View file

@ -1,5 +1,5 @@
from .base_output_device import BaseOutputDevice
from ..models.audio_encoding import AudioEncoding
from vocode.streaming.models.audio_encoding import AudioEncoding
class TelephoneOutput(BaseOutputDevice):

View file

@ -8,16 +8,24 @@ import logging
import threading
import queue
import vocode
from vocode.input_device.base_input_device import BaseInputDevice
from vocode.output_device.base_output_device import BaseOutputDevice
from vocode.models.transcriber import TranscriberConfig
from vocode.models.agent import AgentConfig
from vocode.models.synthesizer import SynthesizerConfig
from vocode.models.websocket import ReadyMessage, AudioMessage, StartMessage, StopMessage
from vocode.streaming.input_device.base_input_device import (
BaseInputDevice,
)
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.websocket import (
ReadyMessage,
AudioMessage,
StartMessage,
StopMessage,
)
load_dotenv()
class Conversation:
class StreamingConversation:
def __init__(
self,
input_device: BaseInputDevice,
@ -61,14 +69,16 @@ class Conversation:
loop.run_until_complete(run())
async def start(self):
async with websockets.connect(f"{self.vocode_websocket_url}?key={vocode.api_key}") as ws:
async with websockets.connect(
f"{self.vocode_websocket_url}?key={vocode.api_key}"
) as ws:
async def sender(ws: WebSocketClientProtocol):
start_message = StartMessage(
transcriber_config=self.transcriber_config,
agent_config=self.agent_config,
synthesizer_config=self.synthesizer_config,
conversation_id=self.id
conversation_id=self.id,
)
await ws.send(start_message.json())
await self.wait_for_ready()

View file

@ -4,10 +4,15 @@ import requests
import uvicorn
import vocode
from vocode.models.transcriber import TranscriberConfig
from vocode.models.synthesizer import SynthesizerConfig
from vocode.models.agent import AgentConfig
from vocode.models.telephony import CreateInboundCall, TwilioConfig, TwilioConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.telephony import (
CreateInboundCall,
TwilioConfig,
TwilioConfig,
)
class InboundCallServer:
def __init__(

View file

@ -2,9 +2,9 @@ from typing import Optional
import requests
import vocode
from vocode.models.agent import AgentConfig
from vocode.models.synthesizer import SynthesizerConfig
from vocode.models.transcriber import TranscriberConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from ..models.telephony import (
CallEntity,
CreateOutboundCall,
@ -31,8 +31,12 @@ class OutboundCall:
self.synthesizer_config = synthesizer_config
self.conversation_id = conversation_id
self.twilio_config = twilio_config
self.vocode_create_outbound_call_url = f"https://{vocode.base_url}/create_outbound_call"
self.vocode_end_outbound_call_url = f"https://{vocode.base_url}/end_outbound_call"
self.vocode_create_outbound_call_url = (
f"https://{vocode.base_url}/create_outbound_call"
)
self.vocode_end_outbound_call_url = (
f"https://{vocode.base_url}/end_outbound_call"
)
def start(self) -> str:
response = requests.post(

View file

@ -2,11 +2,11 @@ from typing import Optional
import requests
import vocode
from vocode.models.agent import AgentConfig
from vocode.models.synthesizer import SynthesizerConfig
from vocode.models.transcriber import TranscriberConfig
from vocode.telephony.outbound_call import OutboundCall
from vocode.models.telephony import (
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.telephony.outbound_call import OutboundCall
from vocode.streaming.models.telephony import (
CallEntity,
DialIntoZoomCall,
TwilioConfig,

View file

@ -0,0 +1,9 @@
from typing import Optional
class BaseAgent:
    """Abstract turn-based agent: produces one reply per human utterance."""

    def __init__(self, initial_message: Optional[str] = None):
        # Optional greeting the conversation plays before the first turn.
        self.initial_message = initial_message

    def respond(self, human_input: str):
        """Return the agent's reply to ``human_input``; subclasses must override."""
        raise NotImplementedError

View file

@ -0,0 +1,45 @@
from typing import Optional
from langchain.prompts import (
ChatPromptTemplate,
MessagesPlaceholder,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.chains import ConversationChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from vocode.turn_based.agent.base_agent import BaseAgent
class ChatGPTAgent(BaseAgent):
    """Turn-based agent backed by an OpenAI chat model via LangChain.

    The full dialogue is kept in a ConversationBufferMemory so each reply is
    conditioned on the entire history.
    """

    def __init__(
        self,
        system_prompt: str,
        initial_message: Optional[str] = None,
        model_name: str = "gpt-3.5-turbo",
        temperature: float = 0.7,
        max_tokens: int = 100,
    ):
        super().__init__(initial_message=initial_message)
        # Prompt layout: system instructions, then running history, then the
        # latest human input.
        messages = [
            SystemMessagePromptTemplate.from_template(system_prompt),
            MessagesPlaceholder(variable_name="history"),
            HumanMessagePromptTemplate.from_template("{input}"),
        ]
        self.prompt = ChatPromptTemplate.from_messages(messages)
        self.memory = ConversationBufferMemory(return_messages=True)
        # Seed the history so the model knows it already greeted the user.
        if initial_message:
            self.memory.chat_memory.add_ai_message(initial_message)
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        self.conversation = ConversationChain(
            memory=self.memory, prompt=self.prompt, llm=self.llm
        )

    def respond(self, human_input: str):
        """Generate the next AI reply given the latest human utterance."""
        return self.conversation.predict(input=human_input)

View file

@ -0,0 +1,6 @@
from vocode.turn_based.agent.base_agent import BaseAgent
class EchoAgent(BaseAgent):
    """Trivial agent that parrots the user's input back verbatim."""

    def respond(self, human_input: str):
        # The reply is exactly what was said.
        return human_input

View file

@ -0,0 +1,9 @@
from pydub import AudioSegment
class BaseInputDevice:
    """Abstract audio source: capture audio between start/end calls."""

    def start_listening(self):
        """Begin capturing audio; subclasses must override."""
        raise NotImplementedError

    def end_listening(self) -> AudioSegment:
        """Stop capturing and return the recorded audio; subclasses must override."""
        raise NotImplementedError

View file

@ -0,0 +1,59 @@
from typing import Optional
import sounddevice as sd
import numpy as np
from pydub import AudioSegment
import io
import wave
from vocode.turn_based.input_device.base_input_device import BaseInputDevice
class MicrophoneInput(BaseInputDevice):
    """Records microphone audio between start_listening/end_listening calls.

    Audio is accumulated as an in-memory WAV (16-bit mono PCM) and returned
    as a pydub AudioSegment.
    """

    DEFAULT_SAMPLING_RATE = 44100
    DEFAULT_CHUNK_SIZE = 2048

    def __init__(
        self,
        device_info: dict,
        sampling_rate: Optional[int] = None,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
    ):
        # device_info: a sounddevice device-info dict (as returned by
        # sd.query_devices()).
        self.device_info = device_info
        # Fall back to the device's default rate, then to 44.1 kHz.
        self.sampling_rate = sampling_rate or (
            self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        self.chunk_size = chunk_size
        self.buffer: Optional[io.BytesIO] = None
        self.wave_writer: Optional[wave.Wave_write] = None
        self.stream: Optional[sd.InputStream] = None

    def create_stream(self):
        """Create (but do not start) an input stream feeding _stream_callback."""
        return sd.InputStream(
            dtype=np.int16,
            channels=1,
            samplerate=self.sampling_rate,
            blocksize=self.chunk_size,
            device=int(self.device_info["index"]),
            callback=self._stream_callback,
        )

    def _stream_callback(self, in_data: np.ndarray[np.int16], *_args):
        # Invoked by sounddevice for each captured chunk; append raw PCM
        # frames to the in-memory WAV.
        audio_bytes = in_data.tobytes()
        self.wave_writer.writeframes(audio_bytes)

    def create_buffer(self):
        """Return a fresh (BytesIO, wave writer) pair for 16-bit mono PCM."""
        in_memory_wav = io.BytesIO()
        wave_writer = wave.open(in_memory_wav, "wb")
        wave_writer.setnchannels(1)
        wave_writer.setsampwidth(2)  # 2 bytes == 16-bit samples
        wave_writer.setframerate(self.sampling_rate)
        return in_memory_wav, wave_writer

    def start_listening(self):
        """Begin recording into a fresh buffer with a fresh stream."""
        self.buffer, self.wave_writer = self.create_buffer()
        self.stream = self.create_stream()
        self.stream.start()

    def end_listening(self) -> AudioSegment:
        """Stop recording and return the captured audio as an AudioSegment.

        Closes the stream (not merely stops it) so repeated start/end cycles
        do not leak PortAudio streams, and closes the wave writer so the WAV
        header is finalized before parsing.
        """
        self.stream.stop()
        # A new stream is created per turn in start_listening; release this one.
        self.stream.close()
        # Finalize the WAV header. wave does not close a file object it did
        # not open itself, so self.buffer remains readable.
        self.wave_writer.close()
        self.buffer.seek(0)
        return AudioSegment.from_wav(self.buffer)

View file

@ -0,0 +1,9 @@
from pydub import AudioSegment
class BaseOutputDevice:
    """Abstract audio sink for turn-based conversations."""

    def send_audio(self, audio: AudioSegment) -> None:
        """Deliver the given audio to the device; subclasses must override."""
        raise NotImplementedError

    def terminate(self):
        """Release any resources held by the device; no-op by default."""
        pass

View file

@ -0,0 +1,32 @@
import sounddevice as sd
import numpy as np
from pydub import AudioSegment
from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
class SpeakerOutput(BaseOutputDevice):
    """Plays AudioSegments through a local speaker via sounddevice."""

    DEFAULT_SAMPLING_RATE = 44100

    def __init__(
        self,
        device_info: dict,
        sampling_rate: int = None,
    ):
        # device_info: a sounddevice device-info dict (see sd.query_devices()).
        self.device_info = device_info
        # Fall back to the device's default rate, then to 44.1 kHz.
        self.sampling_rate = sampling_rate or int(
            self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        self.stream = sd.OutputStream(
            channels=1,
            samplerate=self.sampling_rate,
            dtype=np.int16,
            device=int(self.device_info["index"]),
        )
        self.stream.start()

    def send_audio(self, audio_segment: AudioSegment):
        # Reinterpret the segment's raw bytes as 16-bit PCM samples and
        # write them to the open output stream.
        samples = np.frombuffer(audio_segment.raw_data, dtype=np.int16)
        self.stream.write(samples)

    def terminate(self):
        self.stream.close()

View file

@ -0,0 +1,53 @@
import os
from dotenv import load_dotenv
import azure.cognitiveservices.speech as speechsdk
from pydub import AudioSegment
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
load_dotenv()
class AzureSynthesizer(BaseSynthesizer):
    """Synthesizes speech with Azure Cognitive Services as raw 16-bit PCM.

    Credentials are read from the AZURE_SPEECH_KEY / AZURE_SPEECH_REGION
    environment variables.
    """

    def __init__(self, sampling_rate: int):
        self.sampling_rate = sampling_rate
        speech_config = speechsdk.SpeechConfig(
            subscription=os.environ.get("AZURE_SPEECH_KEY"),
            region=os.environ.get("AZURE_SPEECH_REGION"),
        )
        # Supported rates mapped to the matching raw 16-bit mono PCM output
        # formats. (Replaces an inconsistent if/if/elif chain.) Rates outside
        # this table silently keep the SDK's default output format — same as
        # the previous behavior; consider raising instead.
        rate_to_format = {
            44100: speechsdk.SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm,
            48000: speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm,
            24000: speechsdk.SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm,
            16000: speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm,
            8000: speechsdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm,
        }
        output_format = rate_to_format.get(self.sampling_rate)
        if output_format is not None:
            speech_config.set_speech_synthesis_output_format(output_format)
        # audio_config=None: audio is returned on the result object
        # (result.audio_data) rather than played by the SDK.
        self.synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=None
        )

    def synthesize(self, text) -> AudioSegment:
        """Synthesize *text* and return it as a mono 16-bit AudioSegment.

        Raises:
            Exception: if synthesis did not complete successfully.
        """
        result = self.synthesizer.speak_text(text)
        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            raise Exception("Could not synthesize audio")
        return AudioSegment(
            result.audio_data,
            sample_width=2,  # 16-bit samples
            frame_rate=self.sampling_rate,
            channels=1,
        )

View file

@ -0,0 +1,6 @@
from pydub import AudioSegment
class BaseSynthesizer:
    """Abstract text-to-speech interface for turn-based conversations."""

    def synthesize(self, text) -> AudioSegment:
        """Convert *text* to audio; subclasses must override."""
        raise NotImplementedError

View file

@ -0,0 +1,6 @@
from pydub import AudioSegment
class BaseTranscriber:
    """Abstract speech-to-text interface for turn-based conversations."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Return the text spoken in *audio_segment*; subclasses must override."""
        raise NotImplementedError

View file

@ -0,0 +1,21 @@
from pydub import AudioSegment
import io
import os
from dotenv import load_dotenv
import openai
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
class WhisperTranscriber(BaseTranscriber):
    """Transcribes audio with OpenAI's hosted Whisper model."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Export *audio_segment* as an in-memory WAV and send it to Whisper."""
        wav_buffer = io.BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        wav_buffer.seek(0)
        # NOTE(review): the name attribute is set on the in-memory file —
        # presumably so the OpenAI client treats it as a .wav upload; verify.
        wav_buffer.name = "whisper.wav"
        transcript = openai.Audio.transcribe("whisper-1", wav_buffer)
        return transcript.text

View file

@ -0,0 +1,38 @@
from vocode.turn_based.agent.base_agent import BaseAgent
from vocode.turn_based.input_device.base_input_device import (
BaseInputDevice,
)
from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
class TurnBasedConversation:
    """Orchestrates one record -> transcribe -> respond -> speak cycle per turn."""

    def __init__(
        self,
        input_device: BaseInputDevice,
        transcriber: BaseTranscriber,
        agent: BaseAgent,
        synthesizer: BaseSynthesizer,
        output_device: BaseOutputDevice,
    ):
        self.input_device = input_device
        self.transcriber = transcriber
        self.agent = agent
        self.synthesizer = synthesizer
        self.output_device = output_device
        # Speak the agent's greeting (if any) before the first user turn.
        self.maybe_play_initial_message()

    def maybe_play_initial_message(self):
        """Synthesize and play the agent's initial message, if one is set."""
        initial_message = self.agent.initial_message
        if initial_message:
            self.output_device.send_audio(
                self.synthesizer.synthesize(initial_message)
            )

    def start_speech(self):
        """Begin capturing the human's turn."""
        self.input_device.start_listening()

    def end_speech_and_respond(self):
        """Finish capture, then transcribe, respond, synthesize, and play."""
        recorded_audio = self.input_device.end_listening()
        human_input = self.transcriber.transcribe(recorded_audio)
        agent_response = self.agent.respond(human_input)
        self.output_device.send_audio(self.synthesizer.synthesize(agent_response))