first pass at turn based conversation
This commit is contained in:
parent
d1118d375e
commit
518a0f2b53
40 changed files with 503 additions and 99 deletions
|
|
@ -10,9 +10,9 @@ import signal
|
||||||
|
|
||||||
from vocode.conversation import Conversation
|
from vocode.conversation import Conversation
|
||||||
from vocode.helpers import create_microphone_input_and_speaker_output
|
from vocode.helpers import create_microphone_input_and_speaker_output
|
||||||
from vocode.models.transcriber import DeepgramTranscriberConfig
|
from vocode.streaming.models.transcriber import DeepgramTranscriberConfig
|
||||||
from vocode.models.agent import LLMAgentConfig
|
from vocode.streaming.models.agent import LLMAgentConfig
|
||||||
from vocode.models.synthesizer import AzureSynthesizerConfig
|
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
microphone_input, speaker_output = create_microphone_input_and_speaker_output(use_first_available_device=True)
|
microphone_input, speaker_output = create_microphone_input_and_speaker_output(use_first_available_device=True)
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,6 @@
|
||||||
from vocode.telephony.inbound_call_server import InboundCallServer
|
from vocode.streaming.telephony.inbound_call_server import InboundCallServer
|
||||||
from vocode.models.agent import EchoAgentConfig
|
from vocode.streaming.models.agent import EchoAgentConfig
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
server = InboundCallServer(
|
server = InboundCallServer(agent_config=EchoAgentConfig(initial_message="hello!"))
|
||||||
agent_config=EchoAgentConfig(initial_message="hello!")
|
server.run(port=3001)
|
||||||
)
|
|
||||||
server.run(port=3001)
|
|
||||||
|
|
|
||||||
|
|
@ -1,14 +1,14 @@
|
||||||
from vocode.models.synthesizer import AzureSynthesizerConfig
|
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
|
||||||
from vocode.output_device.telephone_output import TelephoneOutput
|
from vocode.streaming.output_device.telephone_output import TelephoneOutput
|
||||||
from vocode.telephony.outbound_call import OutboundCall
|
from vocode.streaming.telephony.outbound_call import OutboundCall
|
||||||
from vocode.models.telephony import CallEntity
|
from vocode.streaming.models.telephony import CallEntity
|
||||||
from vocode.models.agent import (
|
from vocode.streaming.models.agent import (
|
||||||
EchoAgentConfig,
|
EchoAgentConfig,
|
||||||
ChatGPTAgentConfig,
|
ChatGPTAgentConfig,
|
||||||
WebSocketUserImplementedAgentConfig,
|
WebSocketUserImplementedAgentConfig,
|
||||||
)
|
)
|
||||||
from vocode.models.message import BaseMessage
|
from vocode.streaming.models.message import BaseMessage
|
||||||
from vocode.telephony.zoom_dial_in import ZoomDialIn
|
from vocode.streaming.telephony.zoom_dial_in import ZoomDialIn
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
call = ZoomDialIn(
|
call = ZoomDialIn(
|
||||||
|
|
@ -24,7 +24,7 @@ if __name__ == "__main__":
|
||||||
generate_responses=True,
|
generate_responses=True,
|
||||||
end_conversation_on_goodbye=True,
|
end_conversation_on_goodbye=True,
|
||||||
send_filler_audio=True,
|
send_filler_audio=True,
|
||||||
allowed_idle_time_seconds=30
|
allowed_idle_time_seconds=30,
|
||||||
),
|
),
|
||||||
synthesizer_config=AzureSynthesizerConfig.from_output_device(
|
synthesizer_config=AzureSynthesizerConfig.from_output_device(
|
||||||
output_device=TelephoneOutput(), voice_name="en-US-JennyNeural"
|
output_device=TelephoneOutput(), voice_name="en-US-JennyNeural"
|
||||||
|
|
|
||||||
|
|
@ -3,14 +3,14 @@ import logging
|
||||||
import signal
|
import signal
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import os
|
import os
|
||||||
from vocode.conversation import Conversation
|
from vocode.streaming.streaming_conversation import StreamingConversation
|
||||||
from vocode.helpers import create_microphone_input_and_speaker_output
|
from vocode.helpers import create_microphone_input_and_speaker_output
|
||||||
from vocode.models.transcriber import (
|
from vocode.streaming.models.transcriber import (
|
||||||
DeepgramTranscriberConfig,
|
DeepgramTranscriberConfig,
|
||||||
PunctuationEndpointingConfig,
|
PunctuationEndpointingConfig,
|
||||||
GoogleTranscriberConfig,
|
GoogleTranscriberConfig,
|
||||||
)
|
)
|
||||||
from vocode.models.agent import (
|
from vocode.streaming.models.agent import (
|
||||||
ChatGPTAgentConfig,
|
ChatGPTAgentConfig,
|
||||||
CutOffResponse,
|
CutOffResponse,
|
||||||
FillerAudioConfig,
|
FillerAudioConfig,
|
||||||
|
|
@ -20,9 +20,9 @@ from vocode.models.agent import (
|
||||||
LLMAgentConfig,
|
LLMAgentConfig,
|
||||||
ChatGPTAgentConfig,
|
ChatGPTAgentConfig,
|
||||||
)
|
)
|
||||||
from vocode.models.message import BaseMessage
|
from vocode.streaming.models.message import BaseMessage
|
||||||
from vocode.models.synthesizer import AzureSynthesizerConfig
|
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
|
||||||
from vocode.user_implemented_agent.restful_agent import RESTfulAgent
|
from vocode.streaming.user_implemented_agent.restful_agent import RESTfulAgent
|
||||||
import vocode
|
import vocode
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
@ -34,10 +34,10 @@ logging.root.setLevel(logging.INFO)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
microphone_input, speaker_output = create_microphone_input_and_speaker_output(
|
microphone_input, speaker_output = create_microphone_input_and_speaker_output(
|
||||||
use_default_devices=False
|
streaming=True, use_default_devices=False
|
||||||
)
|
)
|
||||||
|
|
||||||
conversation = Conversation(
|
conversation = StreamingConversation(
|
||||||
input_device=microphone_input,
|
input_device=microphone_input,
|
||||||
output_device=speaker_output,
|
output_device=speaker_output,
|
||||||
transcriber_config=DeepgramTranscriberConfig.from_input_device(
|
transcriber_config=DeepgramTranscriberConfig.from_input_device(
|
||||||
33
simple_turn_based_conversation.py
Normal file
33
simple_turn_based_conversation.py
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
import logging
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import os
|
||||||
|
from vocode.helpers import create_microphone_input_and_speaker_output
|
||||||
|
import vocode
|
||||||
|
from vocode.turn_based.agent.chat_gpt_agent import ChatGPTAgent
|
||||||
|
from vocode.turn_based.synthesizer.azure_synthesizer import AzureSynthesizer
|
||||||
|
from vocode.turn_based.transcriber.whisper_transcriber import WhisperTranscriber
|
||||||
|
from vocode.turn_based.turn_based_conversation import TurnBasedConversation
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
vocode.api_key = os.getenv("VOCODE_API_KEY")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
microphone_input, speaker_output = create_microphone_input_and_speaker_output(
|
||||||
|
streaming=False, use_default_devices=False
|
||||||
|
)
|
||||||
|
|
||||||
|
conversation = TurnBasedConversation(
|
||||||
|
input_device=microphone_input,
|
||||||
|
output_device=speaker_output,
|
||||||
|
transcriber=WhisperTranscriber(),
|
||||||
|
agent=ChatGPTAgent(
|
||||||
|
system_prompt="The AI is having a pleasant conversation about life",
|
||||||
|
initial_message="Hello!",
|
||||||
|
),
|
||||||
|
synthesizer=AzureSynthesizer(sampling_rate=speaker_output.sampling_rate),
|
||||||
|
)
|
||||||
|
while True:
|
||||||
|
conversation.start_speech()
|
||||||
|
input("Press enter to end speech")
|
||||||
|
conversation.end_speech_and_respond()
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from typing import AsyncGenerator
|
from typing import AsyncGenerator
|
||||||
from vocode.user_implemented_agent.restful_agent import RESTfulAgent
|
from vocode.streaming.user_implemented_agent.restful_agent import RESTfulAgent
|
||||||
from vocode.models.agent import (
|
from vocode.streaming.models.agent import (
|
||||||
RESTfulAgentOutput,
|
RESTfulAgentOutput,
|
||||||
RESTfulAgentText,
|
RESTfulAgentText,
|
||||||
RESTfulAgentEnd,
|
RESTfulAgentEnd,
|
||||||
|
|
@ -9,7 +9,7 @@ from vocode.models.agent import (
|
||||||
WebSocketAgentTextMessage,
|
WebSocketAgentTextMessage,
|
||||||
WebSocketAgentStopMessage,
|
WebSocketAgentStopMessage,
|
||||||
)
|
)
|
||||||
from vocode.user_implemented_agent.websocket_agent import WebSocketAgent
|
from vocode.streaming.user_implemented_agent.websocket_agent import WebSocketAgent
|
||||||
|
|
||||||
|
|
||||||
class TestRESTfulAgent(RESTfulAgent):
|
class TestRESTfulAgent(RESTfulAgent):
|
||||||
|
|
|
||||||
|
|
@ -1,28 +1,69 @@
|
||||||
|
from typing import Union
|
||||||
import sounddevice as sd
|
import sounddevice as sd
|
||||||
from .input_device.microphone_input import MicrophoneInput
|
from vocode.streaming.input_device.microphone_input import (
|
||||||
from .output_device.speaker_output import SpeakerOutput
|
MicrophoneInput as StreamingMicrophoneInput,
|
||||||
|
)
|
||||||
|
from vocode.streaming.output_device.speaker_output import (
|
||||||
|
SpeakerOutput as StreamingSpeakerOutput,
|
||||||
|
)
|
||||||
|
from vocode.turn_based.input_device.microphone_input import (
|
||||||
|
MicrophoneInput as TurnBasedMicrophoneInput,
|
||||||
|
)
|
||||||
|
from vocode.turn_based.output_device.speaker_output import (
|
||||||
|
SpeakerOutput as TurnBasedSpeakerOutput,
|
||||||
|
)
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def _get_device_prompt(device_infos: list[dict]) -> str:
|
def _get_device_prompt(device_infos: list[dict]) -> str:
|
||||||
return """Please select a device:
|
return """Please select a device:
|
||||||
{}
|
{}
|
||||||
Choice: """.format(
|
Choice: """.format(
|
||||||
"\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos)))
|
"\n".join(
|
||||||
|
f"{index}: {device['name']}" for index, device in enumerate(device_infos)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
def create_microphone_input_and_speaker_output(use_default_devices=False, mic_sampling_rate=None, speaker_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]:
|
|
||||||
|
def create_microphone_input_and_speaker_output(
|
||||||
|
streaming: bool = True,
|
||||||
|
use_default_devices=False,
|
||||||
|
mic_sampling_rate=None,
|
||||||
|
speaker_sampling_rate=None,
|
||||||
|
) -> Union[
|
||||||
|
tuple[StreamingMicrophoneInput, StreamingSpeakerOutput],
|
||||||
|
tuple[TurnBasedMicrophoneInput, TurnBasedSpeakerOutput],
|
||||||
|
]:
|
||||||
device_infos = sd.query_devices()
|
device_infos = sd.query_devices()
|
||||||
input_device_infos = list(filter(lambda device_info: device_info['max_input_channels'] > 0, device_infos))
|
input_device_infos = list(
|
||||||
output_device_infos = list(filter(lambda device_info: device_info['max_output_channels'] > 0, device_infos))
|
filter(lambda device_info: device_info["max_input_channels"] > 0, device_infos)
|
||||||
|
)
|
||||||
|
output_device_infos = list(
|
||||||
|
filter(lambda device_info: device_info["max_output_channels"] > 0, device_infos)
|
||||||
|
)
|
||||||
if use_default_devices:
|
if use_default_devices:
|
||||||
input_device_info = sd.query_devices(kind='input')
|
input_device_info = sd.query_devices(kind="input")
|
||||||
output_device_info = sd.query_devices(kind='output')
|
output_device_info = sd.query_devices(kind="output")
|
||||||
else:
|
else:
|
||||||
input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))]
|
input_device_info = input_device_infos[
|
||||||
output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))]
|
int(input(_get_device_prompt(input_device_infos)))
|
||||||
logger.info("Using microphone input device: %s", input_device_info['name'])
|
]
|
||||||
microphone_input = MicrophoneInput(input_device_info, sampling_rate=mic_sampling_rate)
|
output_device_info = output_device_infos[
|
||||||
logger.info("Using speaker output device: %s", output_device_info['name'])
|
int(input(_get_device_prompt(output_device_infos)))
|
||||||
speaker_output = SpeakerOutput(output_device_info, sampling_rate=speaker_sampling_rate)
|
]
|
||||||
return microphone_input, speaker_output
|
logger.info("Using microphone input device: %s", input_device_info["name"])
|
||||||
|
microphone_class = (
|
||||||
|
StreamingMicrophoneInput if streaming else TurnBasedMicrophoneInput
|
||||||
|
)
|
||||||
|
speaker_class = StreamingSpeakerOutput if streaming else TurnBasedSpeakerOutput
|
||||||
|
|
||||||
|
microphone_input = microphone_class(
|
||||||
|
input_device_info, sampling_rate=mic_sampling_rate
|
||||||
|
)
|
||||||
|
logger.info("Using speaker output device: %s", output_device_info["name"])
|
||||||
|
speaker_output = speaker_class(
|
||||||
|
output_device_info, sampling_rate=speaker_sampling_rate
|
||||||
|
)
|
||||||
|
return microphone_input, speaker_output
|
||||||
|
|
|
||||||
|
|
@ -1,14 +1,16 @@
|
||||||
from ..models.audio_encoding import AudioEncoding
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
import queue
|
import queue
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
class BaseInputDevice():
|
|
||||||
|
|
||||||
def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int):
|
class BaseInputDevice:
|
||||||
|
def __init__(
|
||||||
|
self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int
|
||||||
|
):
|
||||||
self.sampling_rate = sampling_rate
|
self.sampling_rate = sampling_rate
|
||||||
self.audio_encoding = audio_encoding
|
self.audio_encoding = audio_encoding
|
||||||
self.chunk_size = chunk_size
|
self.chunk_size = chunk_size
|
||||||
self.queue = queue.Queue()
|
self.queue = queue.Queue()
|
||||||
|
|
||||||
def get_audio(self) -> Optional[bytes]:
|
def get_audio(self) -> Optional[bytes]:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
@ -4,25 +4,33 @@ from typing import Optional
|
||||||
import queue
|
import queue
|
||||||
import wave
|
import wave
|
||||||
|
|
||||||
from .base_input_device import BaseInputDevice
|
from vocode.streaming.input_device.base_input_device import BaseInputDevice
|
||||||
from ..models.audio_encoding import AudioEncoding
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
|
||||||
|
|
||||||
class MicrophoneInput(BaseInputDevice):
|
class MicrophoneInput(BaseInputDevice):
|
||||||
|
|
||||||
DEFAULT_SAMPLING_RATE = 44100
|
DEFAULT_SAMPLING_RATE = 44100
|
||||||
DEFAULT_CHUNK_SIZE = 2048
|
DEFAULT_CHUNK_SIZE = 2048
|
||||||
|
|
||||||
def __init__(self, device_info: dict, sampling_rate: int = None, chunk_size: int = DEFAULT_CHUNK_SIZE, microphone_gain: int = 1):
|
def __init__(
|
||||||
|
self,
|
||||||
|
device_info: dict,
|
||||||
|
sampling_rate: int = None,
|
||||||
|
chunk_size: int = DEFAULT_CHUNK_SIZE,
|
||||||
|
microphone_gain: int = 1,
|
||||||
|
):
|
||||||
self.device_info = device_info
|
self.device_info = device_info
|
||||||
sampling_rate = sampling_rate or (self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE))
|
sampling_rate = sampling_rate or (
|
||||||
|
self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
|
||||||
|
)
|
||||||
super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size)
|
super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size)
|
||||||
self.stream = sd.InputStream(
|
self.stream = sd.InputStream(
|
||||||
dtype=np.int16,
|
dtype=np.int16,
|
||||||
channels=1,
|
channels=1,
|
||||||
samplerate=self.sampling_rate,
|
samplerate=self.sampling_rate,
|
||||||
blocksize=self.chunk_size,
|
blocksize=self.chunk_size,
|
||||||
device=int(self.device_info['index']),
|
device=int(self.device_info["index"]),
|
||||||
callback=self._stream_callback
|
callback=self._stream_callback,
|
||||||
)
|
)
|
||||||
self.stream.start()
|
self.stream.start()
|
||||||
self.queue = queue.Queue()
|
self.queue = queue.Queue()
|
||||||
|
|
@ -40,4 +48,4 @@ class MicrophoneInput(BaseInputDevice):
|
||||||
try:
|
try:
|
||||||
return self.queue.get_nowait()
|
return self.queue.get_nowait()
|
||||||
except queue.Empty:
|
except queue.Empty:
|
||||||
return None
|
return None
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
from vocode.input_device.base_input_device import BaseInputDevice
|
from vocode.streaming.input_device.base_input_device import (
|
||||||
from vocode.models.audio_encoding import AudioEncoding
|
BaseInputDevice,
|
||||||
|
)
|
||||||
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
|
||||||
|
|
||||||
class TelephoneInput(BaseInputDevice):
|
class TelephoneInput(BaseInputDevice):
|
||||||
|
|
@ -3,7 +3,7 @@ from enum import Enum
|
||||||
|
|
||||||
from pydantic import validator
|
from pydantic import validator
|
||||||
|
|
||||||
from vocode.models.message import BaseMessage
|
from vocode.streaming.models.message import BaseMessage
|
||||||
from .model import TypedModel, BaseModel
|
from .model import TypedModel, BaseModel
|
||||||
|
|
||||||
FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS = 0.5
|
FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS = 0.5
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from vocode.models.model import BaseModel
|
from vocode.streaming.models.model import BaseModel
|
||||||
from vocode.models.agent import AgentConfig
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
from vocode.models.synthesizer import SynthesizerConfig
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
from vocode.models.transcriber import TranscriberConfig
|
from vocode.streaming.models.transcriber import TranscriberConfig
|
||||||
|
|
||||||
|
|
||||||
class TwilioConfig(BaseModel):
|
class TwilioConfig(BaseModel):
|
||||||
|
|
@ -1,8 +1,11 @@
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
from vocode.streaming.input_device.base_input_device import (
|
||||||
|
BaseInputDevice,
|
||||||
|
)
|
||||||
from .audio_encoding import AudioEncoding
|
from .audio_encoding import AudioEncoding
|
||||||
from .model import BaseModel, TypedModel
|
from .model import BaseModel, TypedModel
|
||||||
from ..input_device.base_input_device import BaseInputDevice
|
|
||||||
|
|
||||||
|
|
||||||
class TranscriberType(str, Enum):
|
class TranscriberType(str, Enum):
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
from ..models.audio_encoding import AudioEncoding
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
|
||||||
|
|
||||||
class BaseOutputDevice:
|
class BaseOutputDevice:
|
||||||
|
|
||||||
def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding):
|
def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding):
|
||||||
self.sampling_rate = sampling_rate
|
self.sampling_rate = sampling_rate
|
||||||
self.audio_encoding = audio_encoding
|
self.audio_encoding = audio_encoding
|
||||||
|
|
@ -11,5 +11,3 @@ class BaseOutputDevice:
|
||||||
|
|
||||||
async def maybe_send_mark_async(self, message):
|
async def maybe_send_mark_async(self, message):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -2,21 +2,28 @@ import sounddevice as sd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from .base_output_device import BaseOutputDevice
|
from .base_output_device import BaseOutputDevice
|
||||||
from ..models.audio_encoding import AudioEncoding
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
|
||||||
|
|
||||||
class SpeakerOutput(BaseOutputDevice):
|
class SpeakerOutput(BaseOutputDevice):
|
||||||
|
|
||||||
DEFAULT_SAMPLING_RATE = 44100
|
DEFAULT_SAMPLING_RATE = 44100
|
||||||
|
|
||||||
def __init__(self, device_info: dict, sampling_rate: int = None, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16):
|
def __init__(
|
||||||
|
self,
|
||||||
|
device_info: dict,
|
||||||
|
sampling_rate: int = None,
|
||||||
|
audio_encoding: AudioEncoding = AudioEncoding.LINEAR16,
|
||||||
|
):
|
||||||
self.device_info = device_info
|
self.device_info = device_info
|
||||||
sampling_rate = sampling_rate or int(self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE))
|
sampling_rate = sampling_rate or int(
|
||||||
|
self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
|
||||||
|
)
|
||||||
super().__init__(sampling_rate, audio_encoding)
|
super().__init__(sampling_rate, audio_encoding)
|
||||||
self.stream = sd.OutputStream(
|
self.stream = sd.OutputStream(
|
||||||
channels=1,
|
channels=1,
|
||||||
samplerate=self.sampling_rate,
|
samplerate=self.sampling_rate,
|
||||||
dtype=np.int16,
|
dtype=np.int16,
|
||||||
device=int(self.device_info['index'])
|
device=int(self.device_info["index"]),
|
||||||
)
|
)
|
||||||
self.stream.start()
|
self.stream.start()
|
||||||
|
|
||||||
|
|
@ -24,4 +31,4 @@ class SpeakerOutput(BaseOutputDevice):
|
||||||
self.stream.write(np.frombuffer(chunk, dtype=np.int16))
|
self.stream.write(np.frombuffer(chunk, dtype=np.int16))
|
||||||
|
|
||||||
def terminate(self):
|
def terminate(self):
|
||||||
self.stream.close()
|
self.stream.close()
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
from .base_output_device import BaseOutputDevice
|
from .base_output_device import BaseOutputDevice
|
||||||
from ..models.audio_encoding import AudioEncoding
|
from vocode.streaming.models.audio_encoding import AudioEncoding
|
||||||
|
|
||||||
|
|
||||||
class TelephoneOutput(BaseOutputDevice):
|
class TelephoneOutput(BaseOutputDevice):
|
||||||
|
|
@ -8,16 +8,24 @@ import logging
|
||||||
import threading
|
import threading
|
||||||
import queue
|
import queue
|
||||||
import vocode
|
import vocode
|
||||||
from vocode.input_device.base_input_device import BaseInputDevice
|
from vocode.streaming.input_device.base_input_device import (
|
||||||
from vocode.output_device.base_output_device import BaseOutputDevice
|
BaseInputDevice,
|
||||||
from vocode.models.transcriber import TranscriberConfig
|
)
|
||||||
from vocode.models.agent import AgentConfig
|
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
|
||||||
from vocode.models.synthesizer import SynthesizerConfig
|
from vocode.streaming.models.transcriber import TranscriberConfig
|
||||||
from vocode.models.websocket import ReadyMessage, AudioMessage, StartMessage, StopMessage
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
|
from vocode.streaming.models.websocket import (
|
||||||
|
ReadyMessage,
|
||||||
|
AudioMessage,
|
||||||
|
StartMessage,
|
||||||
|
StopMessage,
|
||||||
|
)
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
class Conversation:
|
|
||||||
|
class StreamingConversation:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
input_device: BaseInputDevice,
|
input_device: BaseInputDevice,
|
||||||
|
|
@ -61,14 +69,16 @@ class Conversation:
|
||||||
loop.run_until_complete(run())
|
loop.run_until_complete(run())
|
||||||
|
|
||||||
async def start(self):
|
async def start(self):
|
||||||
async with websockets.connect(f"{self.vocode_websocket_url}?key={vocode.api_key}") as ws:
|
async with websockets.connect(
|
||||||
|
f"{self.vocode_websocket_url}?key={vocode.api_key}"
|
||||||
|
) as ws:
|
||||||
|
|
||||||
async def sender(ws: WebSocketClientProtocol):
|
async def sender(ws: WebSocketClientProtocol):
|
||||||
start_message = StartMessage(
|
start_message = StartMessage(
|
||||||
transcriber_config=self.transcriber_config,
|
transcriber_config=self.transcriber_config,
|
||||||
agent_config=self.agent_config,
|
agent_config=self.agent_config,
|
||||||
synthesizer_config=self.synthesizer_config,
|
synthesizer_config=self.synthesizer_config,
|
||||||
conversation_id=self.id
|
conversation_id=self.id,
|
||||||
)
|
)
|
||||||
await ws.send(start_message.json())
|
await ws.send(start_message.json())
|
||||||
await self.wait_for_ready()
|
await self.wait_for_ready()
|
||||||
|
|
@ -4,10 +4,15 @@ import requests
|
||||||
import uvicorn
|
import uvicorn
|
||||||
|
|
||||||
import vocode
|
import vocode
|
||||||
from vocode.models.transcriber import TranscriberConfig
|
from vocode.streaming.models.transcriber import TranscriberConfig
|
||||||
from vocode.models.synthesizer import SynthesizerConfig
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
from vocode.models.agent import AgentConfig
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
from vocode.models.telephony import CreateInboundCall, TwilioConfig, TwilioConfig
|
from vocode.streaming.models.telephony import (
|
||||||
|
CreateInboundCall,
|
||||||
|
TwilioConfig,
|
||||||
|
TwilioConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class InboundCallServer:
|
class InboundCallServer:
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|
@ -2,9 +2,9 @@ from typing import Optional
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
import vocode
|
import vocode
|
||||||
from vocode.models.agent import AgentConfig
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
from vocode.models.synthesizer import SynthesizerConfig
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
from vocode.models.transcriber import TranscriberConfig
|
from vocode.streaming.models.transcriber import TranscriberConfig
|
||||||
from ..models.telephony import (
|
from ..models.telephony import (
|
||||||
CallEntity,
|
CallEntity,
|
||||||
CreateOutboundCall,
|
CreateOutboundCall,
|
||||||
|
|
@ -31,8 +31,12 @@ class OutboundCall:
|
||||||
self.synthesizer_config = synthesizer_config
|
self.synthesizer_config = synthesizer_config
|
||||||
self.conversation_id = conversation_id
|
self.conversation_id = conversation_id
|
||||||
self.twilio_config = twilio_config
|
self.twilio_config = twilio_config
|
||||||
self.vocode_create_outbound_call_url = f"https://{vocode.base_url}/create_outbound_call"
|
self.vocode_create_outbound_call_url = (
|
||||||
self.vocode_end_outbound_call_url = f"https://{vocode.base_url}/end_outbound_call"
|
f"https://{vocode.base_url}/create_outbound_call"
|
||||||
|
)
|
||||||
|
self.vocode_end_outbound_call_url = (
|
||||||
|
f"https://{vocode.base_url}/end_outbound_call"
|
||||||
|
)
|
||||||
|
|
||||||
def start(self) -> str:
|
def start(self) -> str:
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
|
|
@ -2,11 +2,11 @@ from typing import Optional
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
import vocode
|
import vocode
|
||||||
from vocode.models.agent import AgentConfig
|
from vocode.streaming.models.agent import AgentConfig
|
||||||
from vocode.models.synthesizer import SynthesizerConfig
|
from vocode.streaming.models.synthesizer import SynthesizerConfig
|
||||||
from vocode.models.transcriber import TranscriberConfig
|
from vocode.streaming.models.transcriber import TranscriberConfig
|
||||||
from vocode.telephony.outbound_call import OutboundCall
|
from vocode.streaming.telephony.outbound_call import OutboundCall
|
||||||
from vocode.models.telephony import (
|
from vocode.streaming.models.telephony import (
|
||||||
CallEntity,
|
CallEntity,
|
||||||
DialIntoZoomCall,
|
DialIntoZoomCall,
|
||||||
TwilioConfig,
|
TwilioConfig,
|
||||||
9
vocode/turn_based/agent/base_agent.py
Normal file
9
vocode/turn_based/agent/base_agent.py
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
class BaseAgent:
|
||||||
|
def __init__(self, initial_message: Optional[str] = None):
|
||||||
|
self.initial_message = initial_message
|
||||||
|
|
||||||
|
def respond(self, human_input: str):
|
||||||
|
raise NotImplementedError
|
||||||
45
vocode/turn_based/agent/chat_gpt_agent.py
Normal file
45
vocode/turn_based/agent/chat_gpt_agent.py
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
from typing import Optional
|
||||||
|
from langchain.prompts import (
|
||||||
|
ChatPromptTemplate,
|
||||||
|
MessagesPlaceholder,
|
||||||
|
SystemMessagePromptTemplate,
|
||||||
|
HumanMessagePromptTemplate,
|
||||||
|
)
|
||||||
|
from langchain.chains import ConversationChain
|
||||||
|
from langchain.chat_models import ChatOpenAI
|
||||||
|
from langchain.memory import ConversationBufferMemory
|
||||||
|
|
||||||
|
from vocode.turn_based.agent.base_agent import BaseAgent
|
||||||
|
|
||||||
|
|
||||||
|
class ChatGPTAgent(BaseAgent):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
system_prompt: str,
|
||||||
|
initial_message: Optional[str] = None,
|
||||||
|
model_name: str = "gpt-3.5-turbo",
|
||||||
|
temperature: float = 0.7,
|
||||||
|
max_tokens: int = 100,
|
||||||
|
):
|
||||||
|
super().__init__(initial_message=initial_message)
|
||||||
|
self.prompt = ChatPromptTemplate.from_messages(
|
||||||
|
[
|
||||||
|
SystemMessagePromptTemplate.from_template(system_prompt),
|
||||||
|
MessagesPlaceholder(variable_name="history"),
|
||||||
|
HumanMessagePromptTemplate.from_template("{input}"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
self.memory = ConversationBufferMemory(return_messages=True)
|
||||||
|
if initial_message:
|
||||||
|
self.memory.chat_memory.add_ai_message(initial_message)
|
||||||
|
self.llm = ChatOpenAI(
|
||||||
|
model_name=model_name,
|
||||||
|
temperature=temperature,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
)
|
||||||
|
self.conversation = ConversationChain(
|
||||||
|
memory=self.memory, prompt=self.prompt, llm=self.llm
|
||||||
|
)
|
||||||
|
|
||||||
|
def respond(self, human_input: str):
|
||||||
|
return self.conversation.predict(input=human_input)
|
||||||
6
vocode/turn_based/agent/echo_agent.py
Normal file
6
vocode/turn_based/agent/echo_agent.py
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
from vocode.turn_based.agent.base_agent import BaseAgent
|
||||||
|
|
||||||
|
|
||||||
|
class EchoAgent(BaseAgent):
|
||||||
|
def respond(self, human_input: str):
|
||||||
|
return human_input
|
||||||
9
vocode/turn_based/input_device/base_input_device.py
Normal file
9
vocode/turn_based/input_device/base_input_device.py
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
|
||||||
|
class BaseInputDevice:
|
||||||
|
def start_listening(self):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def end_listening(self) -> AudioSegment:
|
||||||
|
raise NotImplementedError
|
||||||
59
vocode/turn_based/input_device/microphone_input.py
Normal file
59
vocode/turn_based/input_device/microphone_input.py
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
from typing import Optional
|
||||||
|
import sounddevice as sd
|
||||||
|
import numpy as np
|
||||||
|
from pydub import AudioSegment
|
||||||
|
import io
|
||||||
|
import wave
|
||||||
|
|
||||||
|
from vocode.turn_based.input_device.base_input_device import BaseInputDevice
|
||||||
|
|
||||||
|
|
||||||
|
class MicrophoneInput(BaseInputDevice):
    """Captures microphone audio into an in-memory WAV between
    start_listening() and end_listening().
    """

    # Fallback rate when the device reports no default_samplerate.
    DEFAULT_SAMPLING_RATE = 44100
    # Frames delivered per stream callback.
    DEFAULT_CHUNK_SIZE = 2048

    def __init__(
        self,
        device_info: dict,
        sampling_rate: int = None,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
    ):
        """
        :param device_info: sounddevice device-info dict (must contain "index").
        :param sampling_rate: capture rate in Hz; defaults to the device's
            reported default_samplerate.
        :param chunk_size: frames per stream callback.
        """
        self.device_info = device_info
        # int(): sounddevice reports default_samplerate as a float; wave and
        # PortAudio expect an integral rate. This also makes the behavior
        # consistent with SpeakerOutput, which already casts.
        self.sampling_rate = int(
            sampling_rate
            or self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        self.chunk_size = chunk_size
        self.buffer: Optional[io.BytesIO] = None
        self.wave_writer: Optional[wave.Wave_write] = None

    def create_stream(self):
        """Build a mono 16-bit input stream that feeds _stream_callback."""
        return sd.InputStream(
            dtype=np.int16,
            channels=1,
            samplerate=self.sampling_rate,
            blocksize=self.chunk_size,
            device=int(self.device_info["index"]),
            callback=self._stream_callback,
        )

    def _stream_callback(self, in_data: np.ndarray[np.int16], *_args):
        """Append each captured chunk to the in-memory WAV."""
        audio_bytes = in_data.tobytes()
        self.wave_writer.writeframes(audio_bytes)

    def create_buffer(self):
        """Return a fresh (BytesIO, Wave_write) pair for one recording turn."""
        in_memory_wav = io.BytesIO()
        wave_writer = wave.open(in_memory_wav, "wb")
        wave_writer.setnchannels(1)
        wave_writer.setsampwidth(2)  # 16-bit samples
        wave_writer.setframerate(self.sampling_rate)
        return in_memory_wav, wave_writer

    def start_listening(self):
        """Start recording a new turn into a fresh buffer."""
        self.buffer, self.wave_writer = self.create_buffer()
        self.stream = self.create_stream()
        self.stream.start()

    def end_listening(self) -> AudioSegment:
        """Stop recording and return the captured turn as an AudioSegment."""
        self.stream.stop()
        # Release the PortAudio stream; start_listening() always builds a new
        # one, so the old stream would otherwise leak.
        self.stream.close()
        # wave only patches the RIFF header (frame/chunk counts) in close();
        # reading the buffer without closing yields a header that undercounts
        # the recorded frames, truncating the returned audio. Closing the
        # Wave_write does not close the BytesIO we passed to wave.open().
        self.wave_writer.close()
        self.buffer.seek(0)
        return AudioSegment.from_wav(self.buffer)
|
||||||
9
vocode/turn_based/output_device/base_output_device.py
Normal file
9
vocode/turn_based/output_device/base_output_device.py
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
|
||||||
|
class BaseOutputDevice:
    """Interface for turn-based audio playback devices."""

    def send_audio(self, audio: AudioSegment) -> None:
        """Play the given audio segment; must be overridden."""
        raise NotImplementedError

    def terminate(self):
        """Release any playback resources; a no-op by default."""
        pass
|
||||||
32
vocode/turn_based/output_device/speaker_output.py
Normal file
32
vocode/turn_based/output_device/speaker_output.py
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
import sounddevice as sd
|
||||||
|
import numpy as np
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
|
||||||
|
|
||||||
|
|
||||||
|
class SpeakerOutput(BaseOutputDevice):
    """Plays 16-bit mono PCM audio on a sounddevice output device."""

    # Fallback rate when the device reports no default_samplerate.
    DEFAULT_SAMPLING_RATE = 44100

    def __init__(
        self,
        device_info: dict,
        sampling_rate: int = None,
    ):
        """
        :param device_info: sounddevice device-info dict (must contain "index").
        :param sampling_rate: playback rate in Hz; defaults to the device's
            reported default_samplerate.
        """
        self.device_info = device_info
        default_rate = self.device_info.get(
            "default_samplerate", self.DEFAULT_SAMPLING_RATE
        )
        self.sampling_rate = sampling_rate or int(default_rate)
        self.stream = sd.OutputStream(
            dtype=np.int16,
            channels=1,
            samplerate=self.sampling_rate,
            device=int(self.device_info["index"]),
        )
        self.stream.start()

    def send_audio(self, audio_segment: AudioSegment):
        """Write the segment's raw 16-bit PCM samples to the open stream."""
        samples = np.frombuffer(audio_segment.raw_data, dtype=np.int16)
        self.stream.write(samples)

    def terminate(self):
        """Close the underlying output stream."""
        self.stream.close()
|
||||||
53
vocode/turn_based/synthesizer/azure_synthesizer.py
Normal file
53
vocode/turn_based/synthesizer/azure_synthesizer.py
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import azure.cognitiveservices.speech as speechsdk
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
|
||||||
|
|
||||||
|
# Load Azure credentials (AZURE_SPEECH_KEY / AZURE_SPEECH_REGION) from a .env file.
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
class AzureSynthesizer(BaseSynthesizer):
    """Synthesizes speech via Azure Cognitive Services as raw 16-bit mono PCM."""

    def __init__(self, sampling_rate: int):
        """
        :param sampling_rate: desired output rate in Hz. For known rates the
            matching raw PCM output format is selected; unrecognized rates
            leave the SDK's default output format in place.
        """
        self.sampling_rate = sampling_rate
        speech_config = speechsdk.SpeechConfig(
            subscription=os.environ.get("AZURE_SPEECH_KEY"),
            region=os.environ.get("AZURE_SPEECH_REGION"),
        )
        # Select the raw PCM format matching the requested rate, if supported.
        if self.sampling_rate == 44100:
            speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm
            )
        elif self.sampling_rate == 48000:
            speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm
            )
        elif self.sampling_rate == 24000:
            speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm
            )
        elif self.sampling_rate == 16000:
            speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
            )
        elif self.sampling_rate == 8000:
            speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm
            )

        # audio_config=None keeps the synthesized bytes in the result instead
        # of playing them on a device.
        self.synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=None
        )

    def synthesize(self, text) -> AudioSegment:
        """Synthesize *text* and return it as a mono 16-bit AudioSegment.

        :raises Exception: when the SDK does not report a completed synthesis.
        """
        result = self.synthesizer.speak_text(text)
        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            raise Exception("Could not synthesize audio")
        return AudioSegment(
            result.audio_data,
            sample_width=2,
            frame_rate=self.sampling_rate,
            channels=1,
        )
|
||||||
6
vocode/turn_based/synthesizer/base_synthesizer.py
Normal file
6
vocode/turn_based/synthesizer/base_synthesizer.py
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
|
||||||
|
class BaseSynthesizer:
    """Interface for text-to-speech backends in turn-based conversations."""

    def synthesize(self, text) -> AudioSegment:
        """Convert *text* into audio; must be overridden."""
        raise NotImplementedError
|
||||||
6
vocode/turn_based/transcriber/base_transcriber.py
Normal file
6
vocode/turn_based/transcriber/base_transcriber.py
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
|
||||||
|
class BaseTranscriber:
    """Interface for speech-to-text backends in turn-based conversations."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Turn recorded audio into text; must be overridden."""
        raise NotImplementedError
|
||||||
21
vocode/turn_based/transcriber/whisper_transcriber.py
Normal file
21
vocode/turn_based/transcriber/whisper_transcriber.py
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
from pydub import AudioSegment
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import openai
|
||||||
|
|
||||||
|
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
|
||||||
|
|
||||||
|
# Pull environment overrides (including OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Configure the OpenAI client; stays None when OPENAI_API_KEY is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")
|
||||||
|
|
||||||
|
|
||||||
|
class WhisperTranscriber(BaseTranscriber):
    """Transcribes audio with OpenAI's hosted Whisper model."""

    def transcribe(self, audio_segment: AudioSegment) -> str:
        """Export the segment to an in-memory WAV and send it to Whisper."""
        wav_buffer = io.BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        wav_buffer.seek(0)
        # The OpenAI client infers the upload format from the file name.
        wav_buffer.name = "whisper.wav"
        transcript = openai.Audio.transcribe("whisper-1", wav_buffer)
        return transcript.text
|
||||||
38
vocode/turn_based/turn_based_conversation.py
Normal file
38
vocode/turn_based/turn_based_conversation.py
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
from vocode.turn_based.agent.base_agent import BaseAgent
|
||||||
|
from vocode.turn_based.input_device.base_input_device import (
|
||||||
|
BaseInputDevice,
|
||||||
|
)
|
||||||
|
from vocode.turn_based.output_device.base_output_device import BaseOutputDevice
|
||||||
|
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
|
||||||
|
from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber
|
||||||
|
|
||||||
|
|
||||||
|
class TurnBasedConversation:
    """Wires together one turn-based pipeline:
    record -> transcribe -> respond -> synthesize -> play.
    """

    def __init__(
        self,
        input_device: BaseInputDevice,
        transcriber: BaseTranscriber,
        agent: BaseAgent,
        synthesizer: BaseSynthesizer,
        output_device: BaseOutputDevice,
    ):
        """Store the pipeline components and play the agent's opener, if any."""
        self.input_device = input_device
        self.transcriber = transcriber
        self.agent = agent
        self.synthesizer = synthesizer
        self.output_device = output_device
        self.maybe_play_initial_message()

    def maybe_play_initial_message(self):
        """Speak the agent's configured initial message when one is set."""
        initial_message = self.agent.initial_message
        if initial_message:
            self.output_device.send_audio(self.synthesizer.synthesize(initial_message))

    def start_speech(self):
        """Begin recording the human's turn."""
        self.input_device.start_listening()

    def end_speech_and_respond(self):
        """Finish recording, run the pipeline, and play the agent's reply."""
        recorded_audio = self.input_device.end_listening()
        human_input = self.transcriber.transcribe(recorded_audio)
        agent_response = self.agent.respond(human_input)
        self.output_device.send_audio(self.synthesizer.synthesize(agent_response))
|
||||||
Loading…
Add table
Add a link
Reference in a new issue