first pass at turn based conversation
This commit is contained in:
parent
d1118d375e
commit
518a0f2b53
40 changed files with 503 additions and 99 deletions
16
vocode/streaming/input_device/base_input_device.py
Normal file
16
vocode/streaming/input_device/base_input_device.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
import queue
from typing import Optional

from vocode.streaming.models.audio_encoding import AudioEncoding


class BaseInputDevice:
    """Abstract source of audio chunks (microphone, telephone, etc.).

    Subclasses fill ``self.queue`` with raw audio and implement
    ``get_audio`` to hand chunks to the consumer.
    """

    def __init__(
        self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int
    ):
        # Basic stream parameters shared by every input device.
        self.sampling_rate = sampling_rate
        self.audio_encoding = audio_encoding
        self.chunk_size = chunk_size
        # Buffer between the producer (device callback) and the consumer.
        self.queue = queue.Queue()

    def get_audio(self) -> Optional[bytes]:
        """Return the next audio chunk, or None if nothing is available."""
        raise NotImplementedError
|
||||
51
vocode/streaming/input_device/microphone_input.py
Normal file
51
vocode/streaming/input_device/microphone_input.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import queue
from typing import Optional

import numpy as np
import sounddevice as sd

from vocode.streaming.input_device.base_input_device import BaseInputDevice
from vocode.streaming.models.audio_encoding import AudioEncoding


class MicrophoneInput(BaseInputDevice):
    """Captures LINEAR16 audio from a sounddevice input device.

    The sounddevice callback pushes raw chunks onto ``self.queue``;
    ``get_audio`` drains them non-blockingly.
    """

    DEFAULT_SAMPLING_RATE = 44100
    DEFAULT_CHUNK_SIZE = 2048

    def __init__(
        self,
        device_info: dict,
        sampling_rate: Optional[int] = None,  # falls back to the device default
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        microphone_gain: int = 1,
    ):
        self.device_info = device_info
        sampling_rate = sampling_rate or (
            self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
        )
        # BaseInputDevice.__init__ creates self.queue; do NOT recreate it later,
        # and set all state the callback reads BEFORE starting the stream —
        # otherwise chunks delivered between start() and the assignments are
        # lost or hit a half-initialized object.
        super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size)
        self.microphone_gain = microphone_gain
        self.stream = sd.InputStream(
            dtype=np.int16,
            channels=1,
            samplerate=self.sampling_rate,
            blocksize=self.chunk_size,
            device=int(self.device_info["index"]),
            callback=self._stream_callback,
        )
        self.stream.start()

    def _stream_callback(self, in_data: np.ndarray, *_args):
        """sounddevice callback: apply gain and enqueue the chunk as bytes."""
        # BUG FIX: the original used `2 ^ gain`, which is XOR in Python
        # (2 ^ 2 == 0, silencing the input); exponentiation is `2 ** gain`.
        if self.microphone_gain > 1:
            in_data = in_data * (2 ** self.microphone_gain)
        else:
            in_data = in_data // (2 ** self.microphone_gain)
        audio_bytes = in_data.tobytes()
        self.queue.put_nowait(audio_bytes)

    def get_audio(self) -> Optional[bytes]:
        """Return the next queued chunk, or None when the queue is empty."""
        try:
            return self.queue.get_nowait()
        except queue.Empty:
            return None
|
||||
11
vocode/streaming/input_device/telephone_input.py
Normal file
11
vocode/streaming/input_device/telephone_input.py
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
from vocode.streaming.input_device.base_input_device import BaseInputDevice
from vocode.streaming.models.audio_encoding import AudioEncoding


class TelephoneInput(BaseInputDevice):
    """Input device preconfigured for telephony audio.

    Uses the standard telephony format: 8 kHz mu-law, delivered in
    160-sample (20 ms) chunks.
    """

    def __init__(self):
        super().__init__(
            sampling_rate=8000,
            audio_encoding=AudioEncoding.MULAW,
            chunk_size=160,
        )
|
||||
181
vocode/streaming/models/agent.py
Normal file
181
vocode/streaming/models/agent.py
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
from typing import Optional, Union
from enum import Enum

from pydantic import validator

from vocode.streaming.models.message import BaseMessage
from .model import TypedModel, BaseModel

FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS = 0.5
LLM_AGENT_DEFAULT_TEMPERATURE = 1.0
LLM_AGENT_DEFAULT_MAX_TOKENS = 256
LLM_AGENT_DEFAULT_MODEL_NAME = "text-curie-001"
CHAT_GPT_AGENT_DEFAULT_MODEL_NAME = "gpt-3.5-turbo"


class AgentType(str, Enum):
    """Discriminator values for the AgentConfig TypedModel hierarchy."""

    BASE = "agent_base"
    LLM = "agent_llm"
    CHAT_GPT_ALPHA = "agent_chat_gpt_alpha"
    CHAT_GPT = "agent_chat_gpt"
    ECHO = "agent_echo"
    INFORMATION_RETRIEVAL = "agent_information_retrieval"
    RESTFUL_USER_IMPLEMENTED = "agent_restful_user_implemented"
    WEBSOCKET_USER_IMPLEMENTED = "agent_websocket_user_implemented"


class FillerAudioConfig(BaseModel):
    """Settings for filler audio played while the agent is thinking."""

    silence_threshold_seconds: float = FILLER_AUDIO_DEFAULT_SILENCE_THRESHOLD_SECONDS
    use_phrases: bool = True
    use_typing_noise: bool = False

    @validator("use_typing_noise")
    def typing_noise_excludes_phrases(cls, v, values):
        """Enforce exactly one filler style: typing noise XOR phrases."""
        # NOTE(review): mutating `values` in a pydantic v1 validator may not
        # write back to the already-validated `use_phrases` field — confirm
        # against pydantic v1 semantics.
        if v and values.get("use_phrases"):
            values["use_phrases"] = False
        if not v and not values.get("use_phrases"):
            raise ValueError("must use either typing noise or phrases for filler audio")
        return v


class AgentConfig(TypedModel, type=AgentType.BASE):
    """Base configuration shared by every agent type."""

    initial_message: Optional[BaseMessage] = None
    generate_responses: bool = True
    allowed_idle_time_seconds: Optional[float] = None
    end_conversation_on_goodbye: bool = False
    send_filler_audio: Union[bool, FillerAudioConfig] = False


class CutOffResponse(BaseModel):
    """What the bot says when the human interrupts it mid-utterance."""

    messages: list[BaseMessage] = [BaseMessage(text="Sorry?")]


class LLMAgentConfig(AgentConfig, type=AgentType.LLM):
    """Configuration for a completion-style LLM agent."""

    prompt_preamble: str
    expected_first_prompt: Optional[str] = None
    model_name: str = LLM_AGENT_DEFAULT_MODEL_NAME
    temperature: float = LLM_AGENT_DEFAULT_TEMPERATURE
    max_tokens: int = LLM_AGENT_DEFAULT_MAX_TOKENS
    cut_off_response: Optional[CutOffResponse] = None


class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT):
    """Configuration for a chat-completions (ChatGPT) agent."""

    prompt_preamble: str
    expected_first_prompt: Optional[str] = None
    generate_responses: bool = False
    model_name: str = CHAT_GPT_AGENT_DEFAULT_MODEL_NAME
    temperature: float = LLM_AGENT_DEFAULT_TEMPERATURE
    max_tokens: int = LLM_AGENT_DEFAULT_MAX_TOKENS
    cut_off_response: Optional[CutOffResponse] = None


class InformationRetrievalAgentConfig(
    AgentConfig, type=AgentType.INFORMATION_RETRIEVAL
):
    """Agent that calls someone to collect a fixed set of fields."""

    recipient_descriptor: str
    caller_descriptor: str
    goal_description: str
    fields: list[str]
    # TODO: add fields for IVR, voicemail


class EchoAgentConfig(AgentConfig, type=AgentType.ECHO):
    """Trivial agent that repeats the user's input back; useful for testing."""

    pass


class RESTfulUserImplementedAgentConfig(
    AgentConfig, type=AgentType.RESTFUL_USER_IMPLEMENTED
):
    """Agent whose responses come from a user-hosted REST endpoint."""

    class EndpointConfig(BaseModel):
        url: str
        method: str = "POST"

    respond: EndpointConfig
    generate_responses: bool = False
    # generate_response: Optional[EndpointConfig]
    # update_last_bot_message_on_cut_off: Optional[EndpointConfig]


class RESTfulAgentInput(BaseModel):
    """Payload POSTed to a RESTful user-implemented agent."""

    conversation_id: str
    human_input: str


class RESTfulAgentOutputType(str, Enum):
    """Discriminator values for RESTfulAgentOutput subclasses."""

    BASE = "restful_agent_base"
    TEXT = "restful_agent_text"
    END = "restful_agent_end"


class RESTfulAgentOutput(TypedModel, type=RESTfulAgentOutputType.BASE):
    """Base response returned by a RESTful user-implemented agent."""

    pass


class RESTfulAgentText(RESTfulAgentOutput, type=RESTfulAgentOutputType.TEXT):
    """A text reply from the RESTful agent."""

    response: str


class RESTfulAgentEnd(RESTfulAgentOutput, type=RESTfulAgentOutputType.END):
    """Signals the RESTful agent wants to end the conversation."""

    pass


class WebSocketUserImplementedAgentConfig(
    AgentConfig, type=AgentType.WEBSOCKET_USER_IMPLEMENTED
):
    """Agent whose responses come from a user-hosted WebSocket route."""

    class RouteConfig(BaseModel):
        url: str

    respond: RouteConfig
    generate_responses: bool = False
    # generate_response: Optional[RouteConfig]
    # send_message_on_cut_off: bool = False


class WebSocketAgentMessageType(str, Enum):
    """Discriminator values for WebSocketAgentMessage subclasses."""

    BASE = "websocket_agent_base"
    START = "websocket_agent_start"
    TEXT = "websocket_agent_text"
    TEXT_END = "websocket_agent_text_end"
    READY = "websocket_agent_ready"
    STOP = "websocket_agent_stop"


class WebSocketAgentMessage(TypedModel, type=WebSocketAgentMessageType.BASE):
    """Base frame exchanged with a websocket user-implemented agent."""

    conversation_id: Optional[str] = None


class WebSocketAgentTextMessage(
    WebSocketAgentMessage, type=WebSocketAgentMessageType.TEXT
):
    """A text frame; the text travels in a nested payload object."""

    class Payload(BaseModel):
        text: str

    data: Payload

    @classmethod
    def from_text(cls, text: str, conversation_id: Optional[str] = None):
        """Convenience constructor wrapping plain text into the payload."""
        return cls(data=cls.Payload(text=text), conversation_id=conversation_id)


class WebSocketAgentStartMessage(
    WebSocketAgentMessage, type=WebSocketAgentMessageType.START
):
    pass


class WebSocketAgentReadyMessage(
    WebSocketAgentMessage, type=WebSocketAgentMessageType.READY
):
    pass


class WebSocketAgentStopMessage(
    WebSocketAgentMessage, type=WebSocketAgentMessageType.STOP
):
    pass


class WebSocketAgentTextEndMessage(
    WebSocketAgentMessage, type=WebSocketAgentMessageType.TEXT_END
):
    pass
|
||||
5
vocode/streaming/models/audio_encoding.py
Normal file
5
vocode/streaming/models/audio_encoding.py
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
from enum import Enum


class AudioEncoding(str, Enum):
    """Wire formats for raw audio used throughout the streaming pipeline."""

    # 16-bit signed little-endian PCM.
    LINEAR16 = "linear16"
    # 8-bit mu-law companded audio (telephony standard).
    MULAW = "mulaw"
|
||||
16
vocode/streaming/models/message.py
Normal file
16
vocode/streaming/models/message.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
from enum import Enum

from .model import TypedModel

# BUG FIX: `from enum import Enum` was imported twice; the duplicate is removed.


class MessageType(str, Enum):
    """Discriminator values for the BaseMessage TypedModel hierarchy."""

    BASE = "message_base"
    SSML = "message_ssml"


class BaseMessage(TypedModel, type=MessageType.BASE):
    """A plain-text message spoken or received by an agent."""

    text: str


class SSMLMessage(BaseMessage, type=MessageType.SSML):
    """A message carrying SSML markup alongside its plain text."""

    ssml: str
|
||||
52
vocode/streaming/models/model.py
Normal file
52
vocode/streaming/models/model.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import pydantic

# BaseModel: pydantic model that transparently revives nested TypedModel
# payloads — any dict field value carrying a 'type' key is re-parsed into
# the matching registered TypedModel subclass before normal validation.
class BaseModel(pydantic.BaseModel):

    def __init__(self, **data):
        for key, value in data.items():
            if isinstance(value, dict):
                # 'type' marks a serialized TypedModel; dispatch to its subclass.
                if 'type' in value:
                    data[key] = TypedModel.parse_obj(value)
        super().__init__(**data)

# Adapted from https://github.com/pydantic/pydantic/discussions/3091
# TypedModel: polymorphic base. Each subclass registers itself (with its
# `type` discriminator) via __init_subclass__, so serialized dicts can be
# parsed back into the right subclass, and serialization emits 'type' first.
class TypedModel(BaseModel):

    # Registry of [discriminator, subclass] pairs, shared by the whole hierarchy.
    _subtypes_ = []

    def __init_subclass__(cls, type=None):
        # `type` here is the class-keyword discriminator, not the builtin.
        cls._subtypes_.append([type, cls])

    @classmethod
    def get_cls(_cls, type):
        # Look up the subclass registered for this discriminator value.
        for t, cls in _cls._subtypes_:
            if t == type:
                return cls
        raise ValueError(f'Unknown type {type}')

    @classmethod
    def get_type(_cls, cls_name):
        # Reverse lookup: class name -> discriminator value.
        for t, cls in _cls._subtypes_:
            if cls.__name__ == cls_name:
                return t
        raise ValueError(f'Unknown class {cls_name}')

    @classmethod
    def parse_obj(cls, obj):
        # Polymorphic parse: route the dict to the subclass named by obj['type'].
        data_type = obj.get('type')
        if data_type is None:
            raise ValueError(f'type is required for {cls.__name__}')

        sub = cls.get_cls(data_type)
        # NOTE(review): get_cls raises rather than returning None, so this
        # branch looks unreachable — kept as defensive code.
        if sub is None:
            raise ValueError(f'Unknown type {data_type}')
        return sub(**obj)

    def _iter(self, **kwargs):
        # Prepend the discriminator so .dict()/.json() round-trip via parse_obj.
        yield 'type', self.get_type(self.__class__.__name__)
        yield from super()._iter(**kwargs)

    @property
    def type(self):
        # The discriminator value registered for this concrete subclass.
        return self.get_type(self.__class__.__name__)
|
||||
|
||||
73
vocode/streaming/models/synthesizer.py
Normal file
73
vocode/streaming/models/synthesizer.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
from enum import Enum
from typing import Optional, Union

from pydantic import BaseModel, validator
from .model import TypedModel
from .audio_encoding import AudioEncoding
from ..output_device.base_output_device import BaseOutputDevice


class SynthesizerType(str, Enum):
    """Discriminator values for the SynthesizerConfig TypedModel hierarchy."""

    BASE = "synthesizer_base"
    AZURE = "synthesizer_azure"
    GOOGLE = "synthesizer_google"
    # TODO(review): ELEVEN_LABS has no corresponding config class below —
    # confirm whether one is still pending.
    ELEVEN_LABS = "synthesizer_eleven_labs"


class TrackBotSentimentConfig(BaseModel):
    """Which voice emotions may be used when tracking bot sentiment."""

    emotions: list[str] = ["angry", "friendly", "sad", "whispering"]

    @validator("emotions")
    def emotions_must_not_be_empty(cls, v):
        if len(v) == 0:
            raise ValueError("must have at least one emotion")
        return v


class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE):
    """Base synthesizer settings; output format must match the output device."""

    sampling_rate: int
    audio_encoding: AudioEncoding
    should_encode_as_wav: bool = False
    track_bot_sentiment_in_voice: Union[bool, TrackBotSentimentConfig] = False

    @classmethod
    def from_output_device(cls, output_device: BaseOutputDevice):
        """Build a config whose audio format matches the given output device."""
        return cls(
            sampling_rate=output_device.sampling_rate,
            audio_encoding=output_device.audio_encoding,
        )


AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME = "en-US-AriaNeural"
AZURE_SYNTHESIZER_DEFAULT_PITCH = 0
AZURE_SYNTHESIZER_DEFAULT_RATE = 15


class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
    """Azure TTS settings (voice, pitch, speaking rate)."""

    voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
    pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH
    rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE

    @classmethod
    def from_output_device(
        cls,
        output_device: BaseOutputDevice,
        voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME,
        pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH,
        rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE,
        track_bot_sentiment_in_voice: Union[bool, TrackBotSentimentConfig] = False,
    ):
        """Build an Azure config matching the output device's audio format."""
        return cls(
            sampling_rate=output_device.sampling_rate,
            audio_encoding=output_device.audio_encoding,
            voice_name=voice_name,
            pitch=pitch,
            rate=rate,
            track_bot_sentiment_in_voice=track_bot_sentiment_in_voice,
        )

    # CLEANUP: removed a stray dead `pass` statement that followed this method.


class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
    """Google TTS settings; currently inherits all fields from the base."""

    pass
|
||||
50
vocode/streaming/models/telephony.py
Normal file
50
vocode/streaming/models/telephony.py
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
from typing import Optional

from vocode.streaming.models.model import BaseModel
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig


class TwilioConfig(BaseModel):
    """Credentials for a user-supplied Twilio account."""

    account_sid: str
    auth_token: str


class CallEntity(BaseModel):
    """One party on a call, identified by phone number."""

    phone_number: str


class CreateInboundCall(BaseModel):
    """Request body for answering an inbound Twilio call."""

    transcriber_config: Optional[TranscriberConfig] = None
    agent_config: AgentConfig
    synthesizer_config: Optional[SynthesizerConfig] = None
    twilio_sid: str
    twilio_config: Optional[TwilioConfig] = None


class EndOutboundCall(BaseModel):
    """Request body for hanging up an in-progress outbound call."""

    call_id: str
    twilio_config: Optional[TwilioConfig] = None


class CreateOutboundCall(BaseModel):
    """Request body for placing an outbound call."""

    recipient: CallEntity
    caller: CallEntity
    transcriber_config: Optional[TranscriberConfig] = None
    agent_config: AgentConfig
    synthesizer_config: Optional[SynthesizerConfig] = None
    conversation_id: Optional[str] = None
    twilio_config: Optional[TwilioConfig] = None
    # TODO add IVR/etc.


class DialIntoZoomCall(BaseModel):
    """Request body for dialing the agent into a Zoom meeting."""

    recipient: CallEntity
    caller: CallEntity
    zoom_meeting_id: str
    # NOTE(review): no default, so pydantic treats this as required-but-nullable.
    zoom_meeting_password: Optional[str]
    transcriber_config: Optional[TranscriberConfig] = None
    agent_config: AgentConfig
    synthesizer_config: Optional[SynthesizerConfig] = None
    conversation_id: Optional[str] = None
    twilio_config: Optional[TwilioConfig] = None
|
||||
70
vocode/streaming/models/transcriber.py
Normal file
70
vocode/streaming/models/transcriber.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
from enum import Enum
from typing import Optional

from vocode.streaming.input_device.base_input_device import (
    BaseInputDevice,
)
from .audio_encoding import AudioEncoding
from .model import BaseModel, TypedModel


class TranscriberType(str, Enum):
    """Discriminator values for the TranscriberConfig TypedModel hierarchy."""

    BASE = "transcriber_base"
    DEEPGRAM = "transcriber_deepgram"
    GOOGLE = "transcriber_google"
    ASSEMBLY_AI = "transcriber_assembly_ai"


class EndpointingType(str, Enum):
    """Discriminator values for endpointing (end-of-utterance) strategies."""

    BASE = "endpointing_base"
    TIME_BASED = "endpointing_time_based"
    PUNCTUATION_BASED = "endpointing_punctuation_based"


class EndpointingConfig(TypedModel, type=EndpointingType.BASE):
    """Base endpointing strategy; concrete strategies add their parameters."""

    pass


class TimeEndpointingConfig(EndpointingConfig, type=EndpointingType.TIME_BASED):
    """End the utterance after a fixed stretch of silence."""

    time_cutoff_seconds: float = 0.4


class PunctuationEndpointingConfig(
    EndpointingConfig, type=EndpointingType.PUNCTUATION_BASED
):
    """End the utterance on punctuation, with a silence-based fallback."""

    time_cutoff_seconds: float = 0.4


class TranscriberConfig(TypedModel, type=TranscriberType.BASE):
    """Base transcriber settings; input format must match the input device."""

    sampling_rate: int
    audio_encoding: AudioEncoding
    chunk_size: int
    endpointing_config: Optional[EndpointingConfig] = None

    @classmethod
    def from_input_device(
        cls,
        input_device: BaseInputDevice,
        endpointing_config: Optional[EndpointingConfig] = None,
    ):
        """Build a config whose audio format matches the given input device."""
        return cls(
            sampling_rate=input_device.sampling_rate,
            audio_encoding=input_device.audio_encoding,
            chunk_size=input_device.chunk_size,
            endpointing_config=endpointing_config,
        )


class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM):
    """Deepgram-specific transcriber settings."""

    model: Optional[str] = None
    should_warmup_model: bool = False
    version: Optional[str] = None


class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE):
    """Google-specific transcriber settings."""

    model: Optional[str] = None
    should_warmup_model: bool = False


class AssemblyAITranscriberConfig(TranscriberConfig, type=TranscriberType.ASSEMBLY_AI):
    """AssemblyAI-specific transcriber settings."""

    should_warmup_model: bool = False
|
||||
38
vocode/streaming/models/websocket.py
Normal file
38
vocode/streaming/models/websocket.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
import base64
from enum import Enum
from typing import Optional

from .model import TypedModel
from .transcriber import TranscriberConfig
from .agent import AgentConfig
from .synthesizer import SynthesizerConfig


class WebSocketMessageType(str, Enum):
    """Discriminator values for frames on the conversation websocket."""

    BASE = "websocket_base"
    START = "websocket_start"
    AUDIO = "websocket_audio"
    READY = "websocket_ready"
    STOP = "websocket_stop"


class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE):
    """Base frame exchanged over the conversation websocket."""

    pass


class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO):
    """An audio chunk, carried as base64 text so it survives JSON framing."""

    data: str

    @classmethod
    def from_bytes(cls, chunk: bytes):
        """Wrap raw audio bytes in a base64-encoded message."""
        return cls(data=base64.b64encode(chunk).decode("utf-8"))

    def get_bytes(self) -> bytes:
        """Decode the payload back into raw audio bytes."""
        return base64.b64decode(self.data)


class StartMessage(WebSocketMessage, type=WebSocketMessageType.START):
    """Opens a conversation: carries the full pipeline configuration."""

    transcriber_config: TranscriberConfig
    agent_config: AgentConfig
    synthesizer_config: SynthesizerConfig
    conversation_id: Optional[str] = None


class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY):
    """Server acknowledgement that it is ready to receive audio."""

    pass


class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP):
    """Client signal that no more audio will be sent."""

    pass
|
||||
13
vocode/streaming/output_device/base_output_device.py
Normal file
13
vocode/streaming/output_device/base_output_device.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
from vocode.streaming.models.audio_encoding import AudioEncoding


class BaseOutputDevice:
    """Abstract sink for synthesized audio (speaker, telephone, etc.)."""

    def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding):
        self.sampling_rate = sampling_rate
        self.audio_encoding = audio_encoding

    async def send_async(self, chunk):
        """Play or transmit one chunk of raw audio bytes."""
        # BUG FIX: the original raised `NotImplemented` (the binary-operator
        # sentinel, not an exception), which raises TypeError at runtime;
        # NotImplementedError is the correct abstract-method signal.
        raise NotImplementedError

    async def maybe_send_mark_async(self, message):
        """Optional hook for devices (e.g. Twilio) that support mark events."""
        pass
|
||||
34
vocode/streaming/output_device/speaker_output.py
Normal file
34
vocode/streaming/output_device/speaker_output.py
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
import sounddevice as sd
import numpy as np

from .base_output_device import BaseOutputDevice
from vocode.streaming.models.audio_encoding import AudioEncoding


class SpeakerOutput(BaseOutputDevice):
    """Plays LINEAR16 audio through a sounddevice output device."""

    DEFAULT_SAMPLING_RATE = 44100

    def __init__(
        self,
        device_info: dict,
        sampling_rate: int = None,  # falls back to the device default
        audio_encoding: AudioEncoding = AudioEncoding.LINEAR16,
    ):
        self.device_info = device_info
        if not sampling_rate:
            sampling_rate = int(
                self.device_info.get("default_samplerate", self.DEFAULT_SAMPLING_RATE)
            )
        super().__init__(sampling_rate, audio_encoding)
        self.stream = sd.OutputStream(
            channels=1,
            samplerate=self.sampling_rate,
            dtype=np.int16,
            device=int(self.device_info["index"]),
        )
        self.stream.start()

    async def send_async(self, chunk):
        """Write one chunk of 16-bit PCM bytes to the speaker stream."""
        self.stream.write(np.frombuffer(chunk, dtype=np.int16))

    def terminate(self):
        """Close the underlying sounddevice stream."""
        self.stream.close()
|
||||
7
vocode/streaming/output_device/telephone_output.py
Normal file
7
vocode/streaming/output_device/telephone_output.py
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
from .base_output_device import BaseOutputDevice
from vocode.streaming.models.audio_encoding import AudioEncoding


class TelephoneOutput(BaseOutputDevice):
    """Output device preconfigured for telephony audio: 8 kHz mu-law."""

    def __init__(self):
        super().__init__(
            sampling_rate=8000,
            audio_encoding=AudioEncoding.MULAW,
        )
|
||||
106
vocode/streaming/streaming_conversation.py
Normal file
106
vocode/streaming/streaming_conversation.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
import websockets
from websockets.exceptions import ConnectionClosedOK
from websockets.client import WebSocketClientProtocol
import asyncio
from dotenv import load_dotenv
import os
import logging
import threading
import queue
import vocode
from vocode.streaming.input_device.base_input_device import (
    BaseInputDevice,
)
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.websocket import (
    ReadyMessage,
    AudioMessage,
    StartMessage,
    StopMessage,
)

load_dotenv()


class StreamingConversation:
    """Client for a hosted streaming conversation.

    Streams microphone (or other input-device) audio to the vocode
    websocket backend and plays returned audio on the output device.
    Audio playback runs on a separate thread with its own event loop so
    blocking queue reads never stall the websocket send/receive tasks.
    """

    def __init__(
        self,
        input_device: BaseInputDevice,
        output_device: BaseOutputDevice,
        transcriber_config: TranscriberConfig,
        agent_config: AgentConfig,
        synthesizer_config: SynthesizerConfig,
        id: str = None,  # NOTE(review): shadows builtin `id`; optional conversation id
    ):
        self.id = id
        self.input_device = input_device
        self.output_device = output_device
        self.transcriber_config = transcriber_config
        self.agent_config = agent_config
        self.synthesizer_config = synthesizer_config
        self.logger = logging.getLogger(__name__)
        # Set once the server's ReadyMessage arrives; gates the sender loop.
        self.receiver_ready = False
        # Cleared by deactivate() to stop the sender and playback loops.
        self.active = True
        # NOTE(review): output_loop appears unused — play_audio() creates its
        # own fresh event loop; confirm before removing.
        self.output_loop = asyncio.new_event_loop()
        # Thread-safe hand-off from the websocket receiver to the playback thread.
        self.output_audio_queue = queue.Queue()
        self.vocode_websocket_url = f"wss://{vocode.base_url}/conversation"

    async def wait_for_ready(self):
        # Poll until the receiver task has seen the server's ReadyMessage.
        while not self.receiver_ready:
            await asyncio.sleep(0.1)
        return True

    def deactivate(self):
        # Stop the sender and playback loops on their next iteration.
        self.active = False

    def play_audio(self):
        """Thread target: drain queued audio chunks into the output device."""

        async def run():
            while self.active:
                try:
                    # Blocking get with timeout so deactivate() is noticed
                    # even when no audio is arriving.
                    audio = self.output_audio_queue.get(timeout=5)
                    await self.output_device.send_async(audio)
                except queue.Empty:
                    continue

        # Dedicated loop: send_async is a coroutine but this runs off the
        # main thread, so it cannot share the websocket's loop.
        loop = asyncio.new_event_loop()
        loop.run_until_complete(run())

    async def start(self):
        """Open the websocket and run the sender/receiver until deactivated."""
        async with websockets.connect(
            f"{self.vocode_websocket_url}?key={vocode.api_key}"
        ) as ws:

            async def sender(ws: WebSocketClientProtocol):
                # Handshake: send config, then wait for the server's ready ack.
                start_message = StartMessage(
                    transcriber_config=self.transcriber_config,
                    agent_config=self.agent_config,
                    synthesizer_config=self.synthesizer_config,
                    conversation_id=self.id,
                )
                await ws.send(start_message.json())
                await self.wait_for_ready()
                self.logger.info("Listening...press Ctrl+C to stop")
                while self.active:
                    data = self.input_device.get_audio()
                    if data:
                        try:
                            await ws.send(AudioMessage.from_bytes(data).json())
                        except ConnectionClosedOK:
                            # Server closed cleanly; wind everything down.
                            self.deactivate()
                            return
                    # Yield to the receiver task between chunks.
                    await asyncio.sleep(0)
                await ws.send(StopMessage().json())

            async def receiver(ws: WebSocketClientProtocol):
                # First frame must be the ReadyMessage; unblocks the sender.
                ReadyMessage.parse_raw(await ws.recv())
                self.receiver_ready = True
                # Every subsequent frame is synthesized audio to play.
                async for msg in ws:
                    audio_message = AudioMessage.parse_raw(msg)
                    self.output_audio_queue.put_nowait(audio_message.get_bytes())

            output_thread = threading.Thread(target=self.play_audio)
            output_thread.start()
            return await asyncio.gather(sender(ws), receiver(ws))
|
||||
62
vocode/streaming/telephony/inbound_call_server.py
Normal file
62
vocode/streaming/telephony/inbound_call_server.py
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
from fastapi import FastAPI, Response, Form
from typing import Optional
import requests
import uvicorn

import vocode
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.telephony import (
    CreateInboundCall,
    TwilioConfig,
)
# BUG FIX: TwilioConfig was listed twice in the import above.


class InboundCallServer:
    """FastAPI server that answers Twilio webhooks for inbound calls.

    Forwards each inbound call to the vocode backend and relays the TwiML
    response back to Twilio.
    """

    def __init__(
        self,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        response_on_rate_limit: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        self.agent_config = agent_config
        self.transcriber_config = transcriber_config
        self.synthesizer_config = synthesizer_config
        self.app = FastAPI()
        # Twilio is configured to POST its call webhook to /vocode.
        self.app.post("/vocode")(self.handle_call)
        self.response_on_rate_limit = (
            response_on_rate_limit
            or "The line is really busy right now, check back later!"
        )
        self.twilio_config = twilio_config
        self.vocode_inbound_call_url = f"https://{vocode.base_url}/create_inbound_call"

    async def handle_call(self, twilio_sid: str = Form(alias="CallSid")):
        """Handle Twilio's inbound-call webhook; returns TwiML.

        Raises AssertionError if the backend returns an unexpected error.
        """
        response = requests.post(
            self.vocode_inbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=CreateInboundCall(
                agent_config=self.agent_config,
                twilio_sid=twilio_sid,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        # Rate-limited: speak an apology instead of starting the conversation.
        if response.status_code == 429:
            return Response(
                f"<Response><Say>{self.response_on_rate_limit}</Say></Response>",
                media_type="application/xml",
            )
        assert response.ok, response.text
        # Backend returns the TwiML that connects the call to the conversation.
        return Response(
            response.text,
            media_type="application/xml",
        )

    def run(self, host="localhost", port=3000):
        """Run the webhook server (blocking)."""
        uvicorn.run(self.app, host=host, port=port)
|
||||
68
vocode/streaming/telephony/outbound_call.py
Normal file
68
vocode/streaming/telephony/outbound_call.py
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
from typing import Optional
import requests

import vocode
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from ..models.telephony import (
    CallEntity,
    CreateOutboundCall,
    EndOutboundCall,
    TwilioConfig,
)


class OutboundCall:
    """Places and ends an outbound phone call via the vocode backend."""

    def __init__(
        self,
        recipient: CallEntity,
        caller: CallEntity,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        self.recipient = recipient
        self.caller = caller
        self.agent_config = agent_config
        self.transcriber_config = transcriber_config
        self.synthesizer_config = synthesizer_config
        self.conversation_id = conversation_id
        self.twilio_config = twilio_config
        self.vocode_create_outbound_call_url = (
            f"https://{vocode.base_url}/create_outbound_call"
        )
        self.vocode_end_outbound_call_url = (
            f"https://{vocode.base_url}/end_outbound_call"
        )

    def start(self) -> str:
        """Place the call and return the backend-assigned conversation id.

        Raises AssertionError if the backend rejects the request.
        """
        response = requests.post(
            self.vocode_create_outbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=CreateOutboundCall(
                recipient=self.recipient,
                caller=self.caller,
                agent_config=self.agent_config,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                conversation_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok, response.text
        data = response.json()
        self.conversation_id = data["id"]
        # BUG FIX: the method was annotated `-> str` but returned None.
        return self.conversation_id

    def end(self) -> None:
        """End the call; a 404 (already ended/unknown call) is tolerated.

        Raises AssertionError on any other backend error.
        """
        # BUG FIX: annotation corrected from `-> str`; this method returns nothing.
        response = requests.post(
            self.vocode_end_outbound_call_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=EndOutboundCall(
                call_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok or response.status_code == 404, response.text
|
||||
60
vocode/streaming/telephony/zoom_dial_in.py
Normal file
60
vocode/streaming/telephony/zoom_dial_in.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
from typing import Optional
import requests

import vocode
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import SynthesizerConfig
from vocode.streaming.models.transcriber import TranscriberConfig
from vocode.streaming.telephony.outbound_call import OutboundCall
from vocode.streaming.models.telephony import (
    CallEntity,
    DialIntoZoomCall,
    TwilioConfig,
)


class ZoomDialIn(OutboundCall):
    """Outbound call that dials the agent into a Zoom meeting."""

    def __init__(
        self,
        recipient: CallEntity,
        caller: CallEntity,
        zoom_meeting_id: str,
        zoom_meeting_password: str,
        agent_config: AgentConfig,
        transcriber_config: Optional[TranscriberConfig] = None,
        synthesizer_config: Optional[SynthesizerConfig] = None,
        conversation_id: Optional[str] = None,
        twilio_config: Optional[TwilioConfig] = None,
    ):
        super().__init__(
            recipient=recipient,
            caller=caller,
            agent_config=agent_config,
            transcriber_config=transcriber_config,
            synthesizer_config=synthesizer_config,
            conversation_id=conversation_id,
            twilio_config=twilio_config,
        )
        self.zoom_meeting_id = zoom_meeting_id
        self.zoom_meeting_password = zoom_meeting_password
        self.vocode_zoom_dial_in_url = f"https://{vocode.base_url}/dial_into_zoom_call"

    def start(self) -> str:
        """Dial into the Zoom meeting and return the conversation id.

        Raises AssertionError if the backend rejects the request.
        """
        response = requests.post(
            self.vocode_zoom_dial_in_url,
            headers={"Authorization": f"Bearer {vocode.api_key}"},
            json=DialIntoZoomCall(
                recipient=self.recipient,
                caller=self.caller,
                zoom_meeting_id=self.zoom_meeting_id,
                zoom_meeting_password=self.zoom_meeting_password,
                agent_config=self.agent_config,
                transcriber_config=self.transcriber_config,
                synthesizer_config=self.synthesizer_config,
                conversation_id=self.conversation_id,
                twilio_config=self.twilio_config,
            ).dict(),
        )
        assert response.ok, response.text
        data = response.json()
        self.conversation_id = data["id"]
        # BUG FIX: the method was annotated `-> str` but returned None.
        return self.conversation_id
|
||||
# ---------------------------------------------------------------------------
# new file: vocode/streaming/user_implemented_agent/base_agent.py (10 lines)
# ---------------------------------------------------------------------------
|
|||
from fastapi import FastAPI
|
||||
import uvicorn
|
||||
|
||||
|
||||
class BaseAgent:
    """Minimal HTTP host for a user-implemented agent.

    Owns a FastAPI application that subclasses attach their routes to,
    and exposes a blocking ``run`` that serves it with uvicorn.
    """

    def __init__(self):
        # Subclasses register their endpoints (REST or WebSocket) on this app.
        self.app = FastAPI()

    def run(self, host="localhost", port=3000):
        """Serve the agent's app on the given host/port (blocks until shutdown)."""
        application = self.app
        uvicorn.run(application, host=host, port=port)
|
||||
# ---------------------------------------------------------------------------
# new file: vocode/streaming/user_implemented_agent/restful_agent.py (19 lines)
# ---------------------------------------------------------------------------
|
|||
from .base_agent import BaseAgent
|
||||
from ..models.agent import RESTfulAgentInput, RESTfulAgentOutput, RESTfulAgentText, RESTfulAgentEnd
|
||||
from pydantic import BaseModel
|
||||
from typing import Union
|
||||
from fastapi import APIRouter
|
||||
|
||||
class RESTfulAgent(BaseAgent):
    """User-implemented agent served over a single REST endpoint.

    Registers ``POST /respond`` on the base app; subclasses implement
    :meth:`respond` to produce the agent's reply.
    """

    def __init__(self):
        super().__init__()
        # Wire the POST /respond route to the handler defined below.
        self.app.post("/respond")(self.respond_rest)

    async def respond(self, human_input, conversation_id) -> RESTfulAgentOutput:
        """Produce the agent's reply for one turn; implemented by subclasses."""
        raise NotImplementedError

    async def respond_rest(
        self, request: RESTfulAgentInput
    ) -> Union[RESTfulAgentText, RESTfulAgentEnd]:
        """FastAPI handler: unpack the request body and delegate to respond()."""
        return await self.respond(request.human_input, request.conversation_id)
|
||||
|
||||
# ---------------------------------------------------------------------------
# new file: vocode/streaming/user_implemented_agent/websocket_agent.py (56 lines)
# ---------------------------------------------------------------------------
|
|||
from .base_agent import BaseAgent
|
||||
import uuid
|
||||
import typing
|
||||
from typing import AsyncGenerator, Union, Optional
|
||||
from fastapi import WebSocket
|
||||
from ..models.agent import (
|
||||
WebSocketAgentStartMessage,
|
||||
WebSocketAgentReadyMessage,
|
||||
WebSocketAgentTextEndMessage,
|
||||
WebSocketAgentTextMessage,
|
||||
WebSocketAgentStopMessage,
|
||||
WebSocketAgentMessage,
|
||||
WebSocketAgentMessageType,
|
||||
)
|
||||
|
||||
|
||||
class WebSocketAgent(BaseAgent):
    """User-implemented agent served over a WebSocket endpoint.

    Registers ``/respond`` as a WebSocket route on the base app. Depending
    on ``generate_responses``, each incoming text message is answered
    either with a single reply (:meth:`respond`) or with a stream of
    replies (:meth:`generate_response`).
    """

    def __init__(self, generate_responses: bool = False):
        super().__init__()
        self.generate_responses = generate_responses
        self.app.websocket("/respond")(self.respond_websocket)

    async def respond(
        self, human_input: str, conversation_id: Optional[str] = None
    ) -> Union[WebSocketAgentTextMessage, WebSocketAgentStopMessage]:
        """Produce a single reply for one turn; implemented by subclasses."""
        raise NotImplementedError

    async def generate_response(
        self, human_input: str, conversation_id: Optional[str] = None
    ) -> AsyncGenerator[
        Union[WebSocketAgentTextMessage, WebSocketAgentTextEndMessage], None
    ]:
        """Yield a stream of replies for one turn; implemented by subclasses."""
        raise NotImplementedError

    async def respond_websocket(self, websocket: WebSocket):
        """WebSocket handler implementing the agent conversation protocol.

        Protocol: accept the connection, consume the client's start
        message, acknowledge with a ready message, then loop — each text
        message is answered via respond()/generate_response() until a
        STOP message arrives, after which the socket is closed.
        """
        await websocket.accept()
        # Consume (and validate) the client's start message before acking.
        WebSocketAgentStartMessage.parse_obj(await websocket.receive_json())
        await websocket.send_text(WebSocketAgentReadyMessage().json())
        while True:
            incoming: WebSocketAgentMessage = WebSocketAgentMessage.parse_obj(
                await websocket.receive_json()
            )
            if incoming.type == WebSocketAgentMessageType.STOP:
                break
            text_message = typing.cast(WebSocketAgentTextMessage, incoming)
            if self.generate_responses:
                # Streaming mode: forward each generated chunk as it arrives.
                async for reply in self.generate_response(
                    text_message.data.text, text_message.conversation_id
                ):
                    await websocket.send_text(reply.json())
            else:
                # Single-reply mode: one response per incoming message.
                reply = await self.respond(
                    text_message.data.text, text_message.conversation_id
                )
                await websocket.send_text(reply.json())
        await websocket.close()
|
||||
# (end of commit diff)