diff --git a/README.md b/README.md index 08de78e..d34e3f0 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ import signal from vocode.conversation import Conversation from vocode.helpers import create_microphone_input_and_speaker_output -from vocode.models.transcriber import DeepgramTranscriberConfig -from vocode.models.agent import LLMAgentConfig -from vocode.models.synthesizer import AzureSynthesizerConfig +from vocode.streaming.models.transcriber import DeepgramTranscriberConfig +from vocode.streaming.models.agent import LLMAgentConfig +from vocode.streaming.models.synthesizer import AzureSynthesizerConfig if __name__ == "__main__": microphone_input, speaker_output = create_microphone_input_and_speaker_output(use_first_available_device=True) diff --git a/simple_inbound_call_server.py b/simple_inbound_call_server.py index 20fdfaf..970b3db 100644 --- a/simple_inbound_call_server.py +++ b/simple_inbound_call_server.py @@ -1,8 +1,6 @@ -from vocode.telephony.inbound_call_server import InboundCallServer -from vocode.models.agent import EchoAgentConfig +from vocode.streaming.telephony.inbound_call_server import InboundCallServer +from vocode.streaming.models.agent import EchoAgentConfig -if __name__ == '__main__': - server = InboundCallServer( - agent_config=EchoAgentConfig(initial_message="hello!") - ) - server.run(port=3001) \ No newline at end of file +if __name__ == "__main__": + server = InboundCallServer(agent_config=EchoAgentConfig(initial_message="hello!")) + server.run(port=3001) diff --git a/simple_outbound_call.py b/simple_outbound_call.py index c8fa80a..8596dd8 100644 --- a/simple_outbound_call.py +++ b/simple_outbound_call.py @@ -1,14 +1,14 @@ -from vocode.models.synthesizer import AzureSynthesizerConfig -from vocode.output_device.telephone_output import TelephoneOutput -from vocode.telephony.outbound_call import OutboundCall -from vocode.models.telephony import CallEntity -from vocode.models.agent import ( +from vocode.streaming.models.synthesizer 
import AzureSynthesizerConfig +from vocode.streaming.output_device.telephone_output import TelephoneOutput +from vocode.streaming.telephony.outbound_call import OutboundCall +from vocode.streaming.models.telephony import CallEntity +from vocode.streaming.models.agent import ( EchoAgentConfig, ChatGPTAgentConfig, WebSocketUserImplementedAgentConfig, ) -from vocode.models.message import BaseMessage -from vocode.telephony.zoom_dial_in import ZoomDialIn +from vocode.streaming.models.message import BaseMessage +from vocode.streaming.telephony.zoom_dial_in import ZoomDialIn if __name__ == "__main__": call = ZoomDialIn( @@ -24,7 +24,7 @@ if __name__ == "__main__": generate_responses=True, end_conversation_on_goodbye=True, send_filler_audio=True, - allowed_idle_time_seconds=30 + allowed_idle_time_seconds=30, ), synthesizer_config=AzureSynthesizerConfig.from_output_device( output_device=TelephoneOutput(), voice_name="en-US-JennyNeural" diff --git a/simple_conversation.py b/simple_streaming_conversation.py similarity index 74% rename from simple_conversation.py rename to simple_streaming_conversation.py index 63feef6..07ae480 100644 --- a/simple_conversation.py +++ b/simple_streaming_conversation.py @@ -3,14 +3,14 @@ import logging import signal from dotenv import load_dotenv import os -from vocode.conversation import Conversation +from vocode.streaming.streaming_conversation import StreamingConversation from vocode.helpers import create_microphone_input_and_speaker_output -from vocode.models.transcriber import ( +from vocode.streaming.models.transcriber import ( DeepgramTranscriberConfig, PunctuationEndpointingConfig, GoogleTranscriberConfig, ) -from vocode.models.agent import ( +from vocode.streaming.models.agent import ( ChatGPTAgentConfig, CutOffResponse, FillerAudioConfig, @@ -20,9 +20,9 @@ from vocode.models.agent import ( LLMAgentConfig, ChatGPTAgentConfig, ) -from vocode.models.message import BaseMessage -from vocode.models.synthesizer import AzureSynthesizerConfig 
-from vocode.user_implemented_agent.restful_agent import RESTfulAgent +from vocode.streaming.models.message import BaseMessage +from vocode.streaming.models.synthesizer import AzureSynthesizerConfig +from vocode.streaming.user_implemented_agent.restful_agent import RESTfulAgent import vocode load_dotenv() @@ -34,10 +34,10 @@ logging.root.setLevel(logging.INFO) if __name__ == "__main__": microphone_input, speaker_output = create_microphone_input_and_speaker_output( - use_default_devices=False + streaming=True, use_default_devices=False ) - conversation = Conversation( + conversation = StreamingConversation( input_device=microphone_input, output_device=speaker_output, transcriber_config=DeepgramTranscriberConfig.from_input_device( diff --git a/simple_turn_based_conversation.py b/simple_turn_based_conversation.py new file mode 100644 index 0000000..5f6e1a1 --- /dev/null +++ b/simple_turn_based_conversation.py @@ -0,0 +1,33 @@ +import logging +from dotenv import load_dotenv +import os +from vocode.helpers import create_microphone_input_and_speaker_output +import vocode +from vocode.turn_based.agent.chat_gpt_agent import ChatGPTAgent +from vocode.turn_based.synthesizer.azure_synthesizer import AzureSynthesizer +from vocode.turn_based.transcriber.whisper_transcriber import WhisperTranscriber +from vocode.turn_based.turn_based_conversation import TurnBasedConversation + +load_dotenv() +vocode.api_key = os.getenv("VOCODE_API_KEY") + + +if __name__ == "__main__": + microphone_input, speaker_output = create_microphone_input_and_speaker_output( + streaming=False, use_default_devices=False + ) + + conversation = TurnBasedConversation( + input_device=microphone_input, + output_device=speaker_output, + transcriber=WhisperTranscriber(), + agent=ChatGPTAgent( + system_prompt="The AI is having a pleasant conversation about life", + initial_message="Hello!", + ), + synthesizer=AzureSynthesizer(sampling_rate=speaker_output.sampling_rate), + ) + while True: + 
conversation.start_speech() + input("Press enter to end speech") + conversation.end_speech_and_respond() diff --git a/simple_user_implemented_agent.py b/simple_user_implemented_agent.py index a5299c1..6a4420d 100644 --- a/simple_user_implemented_agent.py +++ b/simple_user_implemented_agent.py @@ -1,6 +1,6 @@ from typing import AsyncGenerator -from vocode.user_implemented_agent.restful_agent import RESTfulAgent -from vocode.models.agent import ( +from vocode.streaming.user_implemented_agent.restful_agent import RESTfulAgent +from vocode.streaming.models.agent import ( RESTfulAgentOutput, RESTfulAgentText, RESTfulAgentEnd, @@ -9,7 +9,7 @@ from vocode.models.agent import ( WebSocketAgentTextMessage, WebSocketAgentStopMessage, ) -from vocode.user_implemented_agent.websocket_agent import WebSocketAgent +from vocode.streaming.user_implemented_agent.websocket_agent import WebSocketAgent class TestRESTfulAgent(RESTfulAgent): diff --git a/vocode/helpers.py b/vocode/helpers.py index 905f093..84d807d 100644 --- a/vocode/helpers.py +++ b/vocode/helpers.py @@ -1,28 +1,69 @@ +from typing import Union import sounddevice as sd -from .input_device.microphone_input import MicrophoneInput -from .output_device.speaker_output import SpeakerOutput +from vocode.streaming.input_device.microphone_input import ( + MicrophoneInput as StreamingMicrophoneInput, +) +from vocode.streaming.output_device.speaker_output import ( + SpeakerOutput as StreamingSpeakerOutput, +) +from vocode.turn_based.input_device.microphone_input import ( + MicrophoneInput as TurnBasedMicrophoneInput, +) +from vocode.turn_based.output_device.speaker_output import ( + SpeakerOutput as TurnBasedSpeakerOutput, +) import logging logger = logging.getLogger(__name__) + def _get_device_prompt(device_infos: list[dict]) -> str: return """Please select a device: {} Choice: """.format( - "\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos))) + "\n".join( + f"{index}: {device['name']}" for index, 
device in enumerate(device_infos) + ) + ) -def create_microphone_input_and_speaker_output(use_default_devices=False, mic_sampling_rate=None, speaker_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]: + +def create_microphone_input_and_speaker_output( + streaming: bool = True, + use_default_devices=False, + mic_sampling_rate=None, + speaker_sampling_rate=None, +) -> Union[ + tuple[StreamingMicrophoneInput, StreamingSpeakerOutput], + tuple[TurnBasedMicrophoneInput, TurnBasedSpeakerOutput], +]: device_infos = sd.query_devices() - input_device_infos = list(filter(lambda device_info: device_info['max_input_channels'] > 0, device_infos)) - output_device_infos = list(filter(lambda device_info: device_info['max_output_channels'] > 0, device_infos)) + input_device_infos = list( + filter(lambda device_info: device_info["max_input_channels"] > 0, device_infos) + ) + output_device_infos = list( + filter(lambda device_info: device_info["max_output_channels"] > 0, device_infos) + ) if use_default_devices: - input_device_info = sd.query_devices(kind='input') - output_device_info = sd.query_devices(kind='output') + input_device_info = sd.query_devices(kind="input") + output_device_info = sd.query_devices(kind="output") else: - input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))] - output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))] - logger.info("Using microphone input device: %s", input_device_info['name']) - microphone_input = MicrophoneInput(input_device_info, sampling_rate=mic_sampling_rate) - logger.info("Using speaker output device: %s", output_device_info['name']) - speaker_output = SpeakerOutput(output_device_info, sampling_rate=speaker_sampling_rate) - return microphone_input, speaker_output \ No newline at end of file + input_device_info = input_device_infos[ + int(input(_get_device_prompt(input_device_infos))) + ] + output_device_info = output_device_infos[ + 
int(input(_get_device_prompt(output_device_infos))) + ] + logger.info("Using microphone input device: %s", input_device_info["name"]) + microphone_class = ( + StreamingMicrophoneInput if streaming else TurnBasedMicrophoneInput + ) + speaker_class = StreamingSpeakerOutput if streaming else TurnBasedSpeakerOutput + + microphone_input = microphone_class( + input_device_info, sampling_rate=mic_sampling_rate + ) + logger.info("Using speaker output device: %s", output_device_info["name"]) + speaker_output = speaker_class( + output_device_info, sampling_rate=speaker_sampling_rate + ) + return microphone_input, speaker_output diff --git a/vocode/input_device/base_input_device.py b/vocode/streaming/input_device/base_input_device.py similarity index 52% rename from vocode/input_device/base_input_device.py rename to vocode/streaming/input_device/base_input_device.py index 5c0385c..6c7c981 100644 --- a/vocode/input_device/base_input_device.py +++ b/vocode/streaming/input_device/base_input_device.py @@ -1,14 +1,16 @@ -from ..models.audio_encoding import AudioEncoding +from vocode.streaming.models.audio_encoding import AudioEncoding import queue from typing import Optional -class BaseInputDevice(): - def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int): +class BaseInputDevice: + def __init__( + self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int + ): self.sampling_rate = sampling_rate self.audio_encoding = audio_encoding self.chunk_size = chunk_size self.queue = queue.Queue() def get_audio(self) -> Optional[bytes]: - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/vocode/input_device/microphone_input.py b/vocode/streaming/input_device/microphone_input.py similarity index 62% rename from vocode/input_device/microphone_input.py rename to vocode/streaming/input_device/microphone_input.py index d1a60aa..f61b261 100644 --- a/vocode/input_device/microphone_input.py +++ 
b/vocode/streaming/input_device/microphone_input.py @@ -4,25 +4,33 @@ from typing import Optional import queue import wave -from .base_input_device import BaseInputDevice -from ..models.audio_encoding import AudioEncoding +from vocode.streaming.input_device.base_input_device import BaseInputDevice +from vocode.streaming.models.audio_encoding import AudioEncoding + class MicrophoneInput(BaseInputDevice): - DEFAULT_SAMPLING_RATE = 44100 DEFAULT_CHUNK_SIZE = 2048 - def __init__(self, device_info: dict, sampling_rate: int = None, chunk_size: int = DEFAULT_CHUNK_SIZE, microphone_gain: int = 1): + def __init__( + self, + device_info: dict, + sampling_rate: int = None, + chunk_size: int = DEFAULT_CHUNK_SIZE, + microphone_gain: int = 1, + ): self.device_info = device_info - sampling_rate = sampling_rate or (self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE)) + sampling_rate = sampling_rate or ( + self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE) + ) super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size) self.stream = sd.InputStream( dtype=np.int16, channels=1, - samplerate=self.sampling_rate, + samplerate=self.sampling_rate, blocksize=self.chunk_size, - device=int(self.device_info['index']), - callback=self._stream_callback + device=int(self.device_info["index"]), + callback=self._stream_callback, ) self.stream.start() self.queue = queue.Queue() @@ -40,4 +48,4 @@ class MicrophoneInput(BaseInputDevice): try: return self.queue.get_nowait() except queue.Empty: - return None \ No newline at end of file + return None diff --git a/vocode/input_device/telephone_input.py b/vocode/streaming/input_device/telephone_input.py similarity index 55% rename from vocode/input_device/telephone_input.py rename to vocode/streaming/input_device/telephone_input.py index f43b094..dd962aa 100644 --- a/vocode/input_device/telephone_input.py +++ b/vocode/streaming/input_device/telephone_input.py @@ -1,5 +1,7 @@ -from 
vocode.input_device.base_input_device import BaseInputDevice -from vocode.models.audio_encoding import AudioEncoding +from vocode.streaming.input_device.base_input_device import ( + BaseInputDevice, +) +from vocode.streaming.models.audio_encoding import AudioEncoding class TelephoneInput(BaseInputDevice): diff --git a/vocode/models/agent.py b/vocode/streaming/models/agent.py similarity index 98% rename from vocode/models/agent.py rename to vocode/streaming/models/agent.py index 0229125..1739267 100644 --- a/vocode/models/agent.py +++ b/vocode/streaming/models/agent.py @@ -3,7 +3,7 @@ from enum import Enum from pydantic import validator -from vocode.models.message import BaseMessage +from vocode.streaming.models.message import BaseMessage from .model import TypedModel, BaseModel FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS = 0.5 diff --git a/vocode/models/audio_encoding.py b/vocode/streaming/models/audio_encoding.py similarity index 100% rename from vocode/models/audio_encoding.py rename to vocode/streaming/models/audio_encoding.py diff --git a/vocode/models/message.py b/vocode/streaming/models/message.py similarity index 100% rename from vocode/models/message.py rename to vocode/streaming/models/message.py diff --git a/vocode/models/model.py b/vocode/streaming/models/model.py similarity index 100% rename from vocode/models/model.py rename to vocode/streaming/models/model.py diff --git a/vocode/models/synthesizer.py b/vocode/streaming/models/synthesizer.py similarity index 100% rename from vocode/models/synthesizer.py rename to vocode/streaming/models/synthesizer.py diff --git a/vocode/models/telephony.py b/vocode/streaming/models/telephony.py similarity index 84% rename from vocode/models/telephony.py rename to vocode/streaming/models/telephony.py index c161996..76110c9 100644 --- a/vocode/models/telephony.py +++ b/vocode/streaming/models/telephony.py @@ -1,8 +1,8 @@ from typing import Optional -from vocode.models.model import BaseModel -from vocode.models.agent 
import AgentConfig -from vocode.models.synthesizer import SynthesizerConfig -from vocode.models.transcriber import TranscriberConfig +from vocode.streaming.models.model import BaseModel +from vocode.streaming.models.agent import AgentConfig +from vocode.streaming.models.synthesizer import SynthesizerConfig +from vocode.streaming.models.transcriber import TranscriberConfig class TwilioConfig(BaseModel): diff --git a/vocode/models/transcriber.py b/vocode/streaming/models/transcriber.py similarity index 95% rename from vocode/models/transcriber.py rename to vocode/streaming/models/transcriber.py index 71954ec..60f5dd7 100644 --- a/vocode/models/transcriber.py +++ b/vocode/streaming/models/transcriber.py @@ -1,8 +1,11 @@ from enum import Enum from typing import Optional + +from vocode.streaming.input_device.base_input_device import ( + BaseInputDevice, +) from .audio_encoding import AudioEncoding from .model import BaseModel, TypedModel -from ..input_device.base_input_device import BaseInputDevice class TranscriberType(str, Enum): diff --git a/vocode/models/websocket.py b/vocode/streaming/models/websocket.py similarity index 100% rename from vocode/models/websocket.py rename to vocode/streaming/models/websocket.py diff --git a/vocode/output_device/base_output_device.py b/vocode/streaming/output_device/base_output_device.py similarity index 83% rename from vocode/output_device/base_output_device.py rename to vocode/streaming/output_device/base_output_device.py index 71896d8..19e8bce 100644 --- a/vocode/output_device/base_output_device.py +++ b/vocode/streaming/output_device/base_output_device.py @@ -1,7 +1,7 @@ -from ..models.audio_encoding import AudioEncoding +from vocode.streaming.models.audio_encoding import AudioEncoding + class BaseOutputDevice: - def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding): self.sampling_rate = sampling_rate self.audio_encoding = audio_encoding @@ -11,5 +11,3 @@ class BaseOutputDevice: async def 
maybe_send_mark_async(self, message): pass - - diff --git a/vocode/output_device/speaker_output.py b/vocode/streaming/output_device/speaker_output.py similarity index 55% rename from vocode/output_device/speaker_output.py rename to vocode/streaming/output_device/speaker_output.py index 2413903..779f60c 100644 --- a/vocode/output_device/speaker_output.py +++ b/vocode/streaming/output_device/speaker_output.py @@ -2,21 +2,28 @@ import sounddevice as sd import numpy as np from .base_output_device import BaseOutputDevice -from ..models.audio_encoding import AudioEncoding +from vocode.streaming.models.audio_encoding import AudioEncoding + class SpeakerOutput(BaseOutputDevice): - DEFAULT_SAMPLING_RATE = 44100 - def __init__(self, device_info: dict, sampling_rate: int = None, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16): + def __init__( + self, + device_info: dict, + sampling_rate: int = None, + audio_encoding: AudioEncoding = AudioEncoding.LINEAR16, + ): self.device_info = device_info - sampling_rate = sampling_rate or int(self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE)) + sampling_rate = sampling_rate or int( + self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE) + ) super().__init__(sampling_rate, audio_encoding) self.stream = sd.OutputStream( channels=1, samplerate=self.sampling_rate, dtype=np.int16, - device=int(self.device_info['index']) + device=int(self.device_info["index"]), ) self.stream.start() @@ -24,4 +31,4 @@ class SpeakerOutput(BaseOutputDevice): self.stream.write(np.frombuffer(chunk, dtype=np.int16)) def terminate(self): - self.stream.close() \ No newline at end of file + self.stream.close() diff --git a/vocode/output_device/telephone_output.py b/vocode/streaming/output_device/telephone_output.py similarity index 75% rename from vocode/output_device/telephone_output.py rename to vocode/streaming/output_device/telephone_output.py index 1fc80a3..58a7a0f 100644 --- a/vocode/output_device/telephone_output.py 
+++ b/vocode/streaming/output_device/telephone_output.py @@ -1,5 +1,5 @@ from .base_output_device import BaseOutputDevice -from ..models.audio_encoding import AudioEncoding +from vocode.streaming.models.audio_encoding import AudioEncoding class TelephoneOutput(BaseOutputDevice): diff --git a/vocode/conversation.py b/vocode/streaming/streaming_conversation.py similarity index 82% rename from vocode/conversation.py rename to vocode/streaming/streaming_conversation.py index 6c4fb50..04700ad 100644 --- a/vocode/conversation.py +++ b/vocode/streaming/streaming_conversation.py @@ -8,16 +8,24 @@ import logging import threading import queue import vocode -from vocode.input_device.base_input_device import BaseInputDevice -from vocode.output_device.base_output_device import BaseOutputDevice -from vocode.models.transcriber import TranscriberConfig -from vocode.models.agent import AgentConfig -from vocode.models.synthesizer import SynthesizerConfig -from vocode.models.websocket import ReadyMessage, AudioMessage, StartMessage, StopMessage +from vocode.streaming.input_device.base_input_device import ( + BaseInputDevice, +) +from vocode.streaming.output_device.base_output_device import BaseOutputDevice +from vocode.streaming.models.transcriber import TranscriberConfig +from vocode.streaming.models.agent import AgentConfig +from vocode.streaming.models.synthesizer import SynthesizerConfig +from vocode.streaming.models.websocket import ( + ReadyMessage, + AudioMessage, + StartMessage, + StopMessage, +) load_dotenv() -class Conversation: + +class StreamingConversation: def __init__( self, input_device: BaseInputDevice, @@ -61,14 +69,16 @@ class Conversation: loop.run_until_complete(run()) async def start(self): - async with websockets.connect(f"{self.vocode_websocket_url}?key={vocode.api_key}") as ws: + async with websockets.connect( + f"{self.vocode_websocket_url}?key={vocode.api_key}" + ) as ws: async def sender(ws: WebSocketClientProtocol): start_message = StartMessage( 
transcriber_config=self.transcriber_config, agent_config=self.agent_config, synthesizer_config=self.synthesizer_config, - conversation_id=self.id + conversation_id=self.id, ) await ws.send(start_message.json()) await self.wait_for_ready() diff --git a/vocode/telephony/inbound_call_server.py b/vocode/streaming/telephony/inbound_call_server.py similarity index 86% rename from vocode/telephony/inbound_call_server.py rename to vocode/streaming/telephony/inbound_call_server.py index 4b637e0..5cb88c5 100644 --- a/vocode/telephony/inbound_call_server.py +++ b/vocode/streaming/telephony/inbound_call_server.py @@ -4,10 +4,14 @@ import requests import uvicorn import vocode -from vocode.models.transcriber import TranscriberConfig -from vocode.models.synthesizer import SynthesizerConfig -from vocode.models.agent import AgentConfig -from vocode.models.telephony import CreateInboundCall, TwilioConfig, TwilioConfig +from vocode.streaming.models.transcriber import TranscriberConfig +from vocode.streaming.models.synthesizer import SynthesizerConfig +from vocode.streaming.models.agent import AgentConfig +from vocode.streaming.models.telephony import ( + CreateInboundCall, + TwilioConfig, +) + class InboundCallServer: def __init__( diff --git a/vocode/telephony/outbound_call.py b/vocode/streaming/telephony/outbound_call.py similarity index 82% rename from vocode/telephony/outbound_call.py rename to vocode/streaming/telephony/outbound_call.py index e3d765d..f38338c 100644 --- a/vocode/telephony/outbound_call.py +++ b/vocode/streaming/telephony/outbound_call.py @@ -2,9 +2,9 @@ from typing import Optional import requests import vocode -from vocode.models.agent import AgentConfig -from vocode.models.synthesizer import SynthesizerConfig -from vocode.models.transcriber import
TranscriberConfig from ..models.telephony import ( CallEntity, CreateOutboundCall, @@ -31,8 +31,12 @@ class OutboundCall: self.synthesizer_config = synthesizer_config self.conversation_id = conversation_id self.twilio_config = twilio_config - self.vocode_create_outbound_call_url = f"https://{vocode.base_url}/create_outbound_call" - self.vocode_end_outbound_call_url = f"https://{vocode.base_url}/end_outbound_call" + self.vocode_create_outbound_call_url = ( + f"https://{vocode.base_url}/create_outbound_call" + ) + self.vocode_end_outbound_call_url = ( + f"https://{vocode.base_url}/end_outbound_call" + ) def start(self) -> str: response = requests.post( diff --git a/vocode/telephony/zoom_dial_in.py b/vocode/streaming/telephony/zoom_dial_in.py similarity index 86% rename from vocode/telephony/zoom_dial_in.py rename to vocode/streaming/telephony/zoom_dial_in.py index 6062b3d..794cd00 100644 --- a/vocode/telephony/zoom_dial_in.py +++ b/vocode/streaming/telephony/zoom_dial_in.py @@ -2,11 +2,11 @@ from typing import Optional import requests import vocode -from vocode.models.agent import AgentConfig -from vocode.models.synthesizer import SynthesizerConfig -from vocode.models.transcriber import TranscriberConfig -from vocode.telephony.outbound_call import OutboundCall -from vocode.models.telephony import ( +from vocode.streaming.models.agent import AgentConfig +from vocode.streaming.models.synthesizer import SynthesizerConfig +from vocode.streaming.models.transcriber import TranscriberConfig +from vocode.streaming.telephony.outbound_call import OutboundCall +from vocode.streaming.models.telephony import ( CallEntity, DialIntoZoomCall, TwilioConfig, diff --git a/vocode/user_implemented_agent/base_agent.py b/vocode/streaming/user_implemented_agent/base_agent.py similarity index 100% rename from vocode/user_implemented_agent/base_agent.py rename to vocode/streaming/user_implemented_agent/base_agent.py diff --git a/vocode/user_implemented_agent/restful_agent.py 
b/vocode/streaming/user_implemented_agent/restful_agent.py similarity index 100% rename from vocode/user_implemented_agent/restful_agent.py rename to vocode/streaming/user_implemented_agent/restful_agent.py diff --git a/vocode/user_implemented_agent/websocket_agent.py b/vocode/streaming/user_implemented_agent/websocket_agent.py similarity index 100% rename from vocode/user_implemented_agent/websocket_agent.py rename to vocode/streaming/user_implemented_agent/websocket_agent.py diff --git a/vocode/turn_based/agent/base_agent.py b/vocode/turn_based/agent/base_agent.py new file mode 100644 index 0000000..b31368d --- /dev/null +++ b/vocode/turn_based/agent/base_agent.py @@ -0,0 +1,9 @@ +from typing import Optional + + +class BaseAgent: + def __init__(self, initial_message: Optional[str] = None): + self.initial_message = initial_message + + def respond(self, human_input: str): + raise NotImplementedError diff --git a/vocode/turn_based/agent/chat_gpt_agent.py b/vocode/turn_based/agent/chat_gpt_agent.py new file mode 100644 index 0000000..cc51b12 --- /dev/null +++ b/vocode/turn_based/agent/chat_gpt_agent.py @@ -0,0 +1,45 @@ +from typing import Optional +from langchain.prompts import ( + ChatPromptTemplate, + MessagesPlaceholder, + SystemMessagePromptTemplate, + HumanMessagePromptTemplate, +) +from langchain.chains import ConversationChain +from langchain.chat_models import ChatOpenAI +from langchain.memory import ConversationBufferMemory + +from vocode.turn_based.agent.base_agent import BaseAgent + + +class ChatGPTAgent(BaseAgent): + def __init__( + self, + system_prompt: str, + initial_message: Optional[str] = None, + model_name: str = "gpt-3.5-turbo", + temperature: float = 0.7, + max_tokens: int = 100, + ): + super().__init__(initial_message=initial_message) + self.prompt = ChatPromptTemplate.from_messages( + [ + SystemMessagePromptTemplate.from_template(system_prompt), + MessagesPlaceholder(variable_name="history"), + 
HumanMessagePromptTemplate.from_template("{input}"), + ] + ) + self.memory = ConversationBufferMemory(return_messages=True) + if initial_message: + self.memory.chat_memory.add_ai_message(initial_message) + self.llm = ChatOpenAI( + model_name=model_name, + temperature=temperature, + max_tokens=max_tokens, + ) + self.conversation = ConversationChain( + memory=self.memory, prompt=self.prompt, llm=self.llm + ) + + def respond(self, human_input: str): + return self.conversation.predict(input=human_input) diff --git a/vocode/turn_based/agent/echo_agent.py b/vocode/turn_based/agent/echo_agent.py new file mode 100644 index 0000000..38b6dcb --- /dev/null +++ b/vocode/turn_based/agent/echo_agent.py @@ -0,0 +1,6 @@ +from vocode.turn_based.agent.base_agent import BaseAgent + + +class EchoAgent(BaseAgent): + def respond(self, human_input: str): + return human_input diff --git a/vocode/turn_based/input_device/base_input_device.py b/vocode/turn_based/input_device/base_input_device.py new file mode 100644 index 0000000..138ca13 --- /dev/null +++ b/vocode/turn_based/input_device/base_input_device.py @@ -0,0 +1,9 @@ +from pydub import AudioSegment + + +class BaseInputDevice: + def start_listening(self): + raise NotImplementedError + + def end_listening(self) -> AudioSegment: + raise NotImplementedError diff --git a/vocode/turn_based/input_device/microphone_input.py b/vocode/turn_based/input_device/microphone_input.py new file mode 100644 index 0000000..40293bb --- /dev/null +++ b/vocode/turn_based/input_device/microphone_input.py @@ -0,0 +1,59 @@ +from typing import Optional +import sounddevice as sd +import numpy as np +from pydub import AudioSegment +import io +import wave + +from vocode.turn_based.input_device.base_input_device import BaseInputDevice + + +class MicrophoneInput(BaseInputDevice): + DEFAULT_SAMPLING_RATE = 44100 + DEFAULT_CHUNK_SIZE = 2048 + + def __init__( + self, + device_info: dict, + sampling_rate: int = None, + chunk_size: int = DEFAULT_CHUNK_SIZE, + ): + 
self.device_info = device_info + self.sampling_rate = sampling_rate or ( + self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE) + ) + self.chunk_size = chunk_size + self.buffer: Optional[io.BytesIO] = None + self.wave_writer: Optional[wave.Wave_write] = None + + def create_stream(self): + return sd.InputStream( + dtype=np.int16, + channels=1, + samplerate=self.sampling_rate, + blocksize=self.chunk_size, + device=int(self.device_info["index"]), + callback=self._stream_callback, + ) + + def _stream_callback(self, in_data: np.ndarray[np.int16], *_args): + audio_bytes = in_data.tobytes() + self.wave_writer.writeframes(audio_bytes) + + def create_buffer(self): + in_memory_wav = io.BytesIO() + wave_writer = wave.open(in_memory_wav, "wb") + wave_writer.setnchannels(1) + wave_writer.setsampwidth(2) + wave_writer.setframerate(self.sampling_rate) + return in_memory_wav, wave_writer + + def start_listening(self): + self.buffer, self.wave_writer = self.create_buffer() + self.stream = self.create_stream() + self.stream.start() + + def end_listening(self) -> AudioSegment: + self.stream.stop() + self.buffer.seek(0) + return AudioSegment.from_wav(self.buffer) diff --git a/vocode/turn_based/output_device/base_output_device.py b/vocode/turn_based/output_device/base_output_device.py new file mode 100644 index 0000000..d54c0c7 --- /dev/null +++ b/vocode/turn_based/output_device/base_output_device.py @@ -0,0 +1,9 @@ +from pydub import AudioSegment + + +class BaseOutputDevice: + def send_audio(self, audio: AudioSegment) -> None: + raise NotImplementedError + + def terminate(self): + pass diff --git a/vocode/turn_based/output_device/speaker_output.py b/vocode/turn_based/output_device/speaker_output.py new file mode 100644 index 0000000..198d19f --- /dev/null +++ b/vocode/turn_based/output_device/speaker_output.py @@ -0,0 +1,32 @@ +import sounddevice as sd +import numpy as np +from pydub import AudioSegment + +from vocode.turn_based.output_device.base_output_device 
import BaseOutputDevice + + +class SpeakerOutput(BaseOutputDevice): + DEFAULT_SAMPLING_RATE = 44100 + + def __init__( + self, + device_info: dict, + sampling_rate: int = None, + ): + self.device_info = device_info + self.sampling_rate = sampling_rate or int( + self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE) + ) + self.stream = sd.OutputStream( + channels=1, + samplerate=self.sampling_rate, + dtype=np.int16, + device=int(self.device_info["index"]), + ) + self.stream.start() + + def send_audio(self, audio_segment: AudioSegment): + self.stream.write(np.frombuffer(audio_segment.raw_data, dtype=np.int16)) + + def terminate(self): + self.stream.close() diff --git a/vocode/turn_based/synthesizer/azure_synthesizer.py b/vocode/turn_based/synthesizer/azure_synthesizer.py new file mode 100644 index 0000000..d55c43c --- /dev/null +++ b/vocode/turn_based/synthesizer/azure_synthesizer.py @@ -0,0 +1,53 @@ +import os +from dotenv import load_dotenv +import azure.cognitiveservices.speech as speechsdk +from pydub import AudioSegment + +from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer + +load_dotenv() + + +class AzureSynthesizer(BaseSynthesizer): + def __init__(self, sampling_rate: int): + self.sampling_rate = sampling_rate + speech_config = speechsdk.SpeechConfig( + subscription=os.environ.get("AZURE_SPEECH_KEY"), + region=os.environ.get("AZURE_SPEECH_REGION"), + ) + if self.sampling_rate == 44100: + speech_config.set_speech_synthesis_output_format( + speechsdk.SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm + ) + if self.sampling_rate == 48000: + speech_config.set_speech_synthesis_output_format( + speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm + ) + if self.sampling_rate == 24000: + speech_config.set_speech_synthesis_output_format( + speechsdk.SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm + ) + elif self.sampling_rate == 16000: + speech_config.set_speech_synthesis_output_format( + 
speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm + ) + elif self.sampling_rate == 8000: + speech_config.set_speech_synthesis_output_format( + speechsdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm + ) + + self.synthesizer = speechsdk.SpeechSynthesizer( + speech_config=speech_config, audio_config=None + ) + + def synthesize(self, text) -> AudioSegment: + result = self.synthesizer.speak_text(text) + if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: + return AudioSegment( + result.audio_data, + sample_width=2, + frame_rate=self.sampling_rate, + channels=1, + ) + else: + raise Exception("Could not synthesize audio") diff --git a/vocode/turn_based/synthesizer/base_synthesizer.py b/vocode/turn_based/synthesizer/base_synthesizer.py new file mode 100644 index 0000000..41f290d --- /dev/null +++ b/vocode/turn_based/synthesizer/base_synthesizer.py @@ -0,0 +1,6 @@ +from pydub import AudioSegment + + +class BaseSynthesizer: + def synthesize(self, text) -> AudioSegment: + raise NotImplementedError diff --git a/vocode/turn_based/transcriber/base_transcriber.py b/vocode/turn_based/transcriber/base_transcriber.py new file mode 100644 index 0000000..ba77043 --- /dev/null +++ b/vocode/turn_based/transcriber/base_transcriber.py @@ -0,0 +1,6 @@ +from pydub import AudioSegment + + +class BaseTranscriber: + def transcribe(self, audio_segment: AudioSegment) -> str: + raise NotImplementedError diff --git a/vocode/turn_based/transcriber/whisper_transcriber.py b/vocode/turn_based/transcriber/whisper_transcriber.py new file mode 100644 index 0000000..99a091b --- /dev/null +++ b/vocode/turn_based/transcriber/whisper_transcriber.py @@ -0,0 +1,21 @@ +from pydub import AudioSegment +import io +import os +from dotenv import load_dotenv +import openai + +from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber + +load_dotenv() + +openai.api_key = os.getenv("OPENAI_API_KEY") + + +class WhisperTranscriber(BaseTranscriber): + def transcribe(self, 
audio_segment: AudioSegment) -> str: + in_memory_wav = io.BytesIO() + audio_segment.export(in_memory_wav, format="wav") + in_memory_wav.seek(0) + in_memory_wav.name = "whisper.wav" + transcript = openai.Audio.transcribe("whisper-1", in_memory_wav) + return transcript.text diff --git a/vocode/turn_based/turn_based_conversation.py b/vocode/turn_based/turn_based_conversation.py new file mode 100644 index 0000000..13b9b8b --- /dev/null +++ b/vocode/turn_based/turn_based_conversation.py @@ -0,0 +1,38 @@ +from vocode.turn_based.agent.base_agent import BaseAgent +from vocode.turn_based.input_device.base_input_device import ( + BaseInputDevice, +) +from vocode.turn_based.output_device.base_output_device import BaseOutputDevice +from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer +from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber + + +class TurnBasedConversation: + def __init__( + self, + input_device: BaseInputDevice, + transcriber: BaseTranscriber, + agent: BaseAgent, + synthesizer: BaseSynthesizer, + output_device: BaseOutputDevice, + ): + self.input_device = input_device + self.transcriber = transcriber + self.agent = agent + self.synthesizer = synthesizer + self.output_device = output_device + self.maybe_play_initial_message() + + def maybe_play_initial_message(self): + if self.agent.initial_message: + self.output_device.send_audio( + self.synthesizer.synthesize(self.agent.initial_message) + ) + + def start_speech(self): + self.input_device.start_listening() + + def end_speech_and_respond(self): + human_input = self.transcriber.transcribe(self.input_device.end_listening()) + agent_response = self.agent.respond(human_input) + self.output_device.send_audio(self.synthesizer.synthesize(agent_response))