python SDK
This commit is contained in:
commit
6dc9fceeb5
18 changed files with 482 additions and 0 deletions
75
vocode/conversation.py
Normal file
75
vocode/conversation.py
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
import websockets
|
||||
import asyncio
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
import logging
|
||||
|
||||
# Load environment variables from a local .env file before BASE_URL is read below.
load_dotenv()

from .input_device.base_input_device import BaseInputDevice
from .output_device.base_output_device import BaseOutputDevice
from .models.transcriber import TranscriberConfig
from .models.agent import AgentConfig
from .models.synthesizer import SynthesizerConfig
from .models.websocket import ReadyMessage, AudioMessage, StartMessage, StopMessage

# Host (no scheme) of the Vocode backend.
BASE_URL = os.environ.get('BASE_URL')
# NOTE(review): if BASE_URL is unset this becomes "wss://None/conversation" at
# import time — confirm the env var is always provided before import.
VOCODE_WEBSOCKET_URL = f'wss://{BASE_URL}/conversation'
|
||||
|
||||
class Conversation:
    """Bidirectional streaming conversation with the Vocode websocket server.

    Streams audio chunks from ``input_device`` to the server and plays the
    server's synthesized audio on ``output_device`` until ``deactivate()``
    is called.
    """

    def __init__(
        self,
        token: str,
        input_device: BaseInputDevice,
        output_device: BaseOutputDevice,
        transcriber_config: TranscriberConfig,
        agent_config: AgentConfig,
        synthesizer_config: SynthesizerConfig
    ):
        self.token = token  # API key, sent as the ?key= query parameter
        self.input_device = input_device
        self.output_device = output_device
        self.transcriber_config = transcriber_config
        self.agent_config = agent_config
        self.synthesizer_config = synthesizer_config
        self.logger = logging.getLogger(__name__)
        # Set by receiver() once the server acknowledges the StartMessage.
        self.receiver_ready = False
        # Cleared by deactivate() to end the send loop in start().
        self.active = True

    async def wait_for_ready(self):
        # Busy-polls until receiver() has observed the server's ReadyMessage.
        while not self.receiver_ready:
            await asyncio.sleep(0.1)
        return True

    def deactivate(self):
        # Signals the sender loop to finish and emit a StopMessage.
        self.active = False

    async def start(self):
        """Open the websocket and run the send/receive loops until deactivated."""
        async with websockets.connect(f"{VOCODE_WEBSOCKET_URL}?key={self.token}") as ws:
            async def sender(ws):
                # Handshake: declare the pipeline config, then wait for the
                # ReadyMessage (observed by receiver()) before streaming audio.
                start_message = StartMessage(
                    transcriber_config=self.transcriber_config,
                    agent_config=self.agent_config,
                    synthesizer_config=self.synthesizer_config
                )
                await ws.send(start_message.json())
                await self.wait_for_ready()
                self.logger.info("Listening...press Ctrl+C to stop")
                while self.active:
                    data = self.input_device.get_audio()
                    if data:
                        await ws.send(AudioMessage.from_bytes(data).json())
                    # Yield to the event loop so receiver() can make progress.
                    await asyncio.sleep(0)
                await ws.send(StopMessage().json())

            async def receiver(ws):
                # The first server frame must be a ReadyMessage; every
                # subsequent frame is parsed as an AudioMessage.
                ReadyMessage.parse_raw(await ws.recv())
                self.receiver_ready = True
                async for msg in ws:
                    audio_message = AudioMessage.parse_raw(msg)
                    await self.output_device.send_async(audio_message.get_bytes())

            return await asyncio.gather(sender(ws), receiver(ws))
|
||||
|
||||
30
vocode/helpers.py
Normal file
30
vocode/helpers.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
import pyaudio
from .input_device.microphone_input import MicrophoneInput
from .output_device.speaker_output import SpeakerOutput
import logging

# Module-level logger for the device-selection messages below.
logger = logging.getLogger(__name__)
|
||||
|
||||
def _get_device_prompt(device_infos: list[dict]) -> str:
|
||||
return """Please select a device:
|
||||
{}
|
||||
Choice: """.format(
|
||||
"\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos)))
|
||||
|
||||
def create_microphone_input_and_speaker_output(use_first_available_device=False) -> tuple[MicrophoneInput, SpeakerOutput]:
    """Build a (MicrophoneInput, SpeakerOutput) pair from the host's audio devices.

    When use_first_available_device is False the user is prompted on stdin to
    choose the input and output devices; otherwise the first of each is used.
    """
    pa = pyaudio.PyAudio()
    all_devices = [pa.get_device_info_by_index(i) for i in range(pa.get_device_count())]
    inputs = [d for d in all_devices if d['maxInputChannels'] > 0]
    outputs = [d for d in all_devices if d['maxOutputChannels'] > 0]
    if use_first_available_device:
        chosen_input = inputs[0]
        chosen_output = outputs[0]
    else:
        chosen_input = inputs[int(input(_get_device_prompt(inputs)))]
        chosen_output = outputs[int(input(_get_device_prompt(outputs)))]
    logger.info("Using microphone input device: %s", chosen_input['name'])
    microphone_input = MicrophoneInput(pa, chosen_input)
    logger.info("Using speaker output device: %s", chosen_output['name'])
    speaker_output = SpeakerOutput(pa, chosen_output)
    return microphone_input, speaker_output
|
||||
14
vocode/input_device/base_input_device.py
Normal file
14
vocode/input_device/base_input_device.py
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
from ..models.audio_encoding import AudioEncoding
|
||||
import queue
|
||||
from typing import Optional
|
||||
|
||||
class BaseInputDevice():
|
||||
|
||||
def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int):
|
||||
self.sampling_rate = sampling_rate
|
||||
self.audio_encoding = audio_encoding
|
||||
self.chunk_size = chunk_size
|
||||
self.queue = queue.Queue()
|
||||
|
||||
def get_audio(self) -> Optional[bytes]:
|
||||
raise NotImplementedError
|
||||
37
vocode/input_device/microphone_input.py
Normal file
37
vocode/input_device/microphone_input.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
import pyaudio
|
||||
from typing import Optional
|
||||
import queue
|
||||
|
||||
from .base_input_device import BaseInputDevice
|
||||
from ..models.audio_encoding import AudioEncoding
|
||||
|
||||
class MicrophoneInput(BaseInputDevice):
    """Captures microphone audio via a PyAudio callback stream.

    Chunks arrive on PyAudio's callback thread and are buffered in
    ``self.queue`` (created by ``BaseInputDevice.__init__``); ``get_audio()``
    drains them without blocking.
    """

    DEFAULT_SAMPLING_RATE = 44100
    DEFAULT_CHUNK_SIZE = 2048

    def __init__(self, pa: pyaudio.PyAudio, device_info: dict, chunk_size: int = DEFAULT_CHUNK_SIZE):
        self.device_info = device_info
        # Prefer the device's native rate; fall back to a sane default.
        sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE))
        # Creates self.queue before the stream below can start firing callbacks.
        super().__init__(sampling_rate, AudioEncoding.LINEAR16, chunk_size)
        self.pa = pa
        self.stream = pa.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.sampling_rate,
            input=True,
            frames_per_buffer=self.chunk_size,
            input_device_index=int(self.device_info['index']),
            stream_callback=self._stream_callback
        )
        # BUGFIX: do not re-create self.queue here. The callback stream starts
        # delivering audio as soon as pa.open() returns, so replacing the queue
        # afterwards could silently drop already-captured chunks; the base
        # class constructor above already created it.

    def _stream_callback(self, in_data, *_args):
        # Runs on PyAudio's internal thread: hand the chunk to the reader side.
        self.queue.put_nowait(in_data)
        return (None, pyaudio.paContinue)

    def get_audio(self) -> Optional[bytes]:
        """Non-blocking read of the next captured chunk; None when empty."""
        try:
            return self.queue.get_nowait()
        except queue.Empty:
            return None
|
||||
37
vocode/models/agent.py
Normal file
37
vocode/models/agent.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
from typing import Optional
|
||||
from enum import Enum
|
||||
from .model import TypedModel
|
||||
|
||||
|
||||
class AgentType(str, Enum):
    """Discriminator tags for the AgentConfig hierarchy (see TypedModel)."""
    BASE = "base"
    LLM = "llm"
    CHAT_GPT = "chat_gpt"
    ECHO = "echo"
    INFORMATION_RETRIEVAL = "information_retrieval"
|
||||
|
||||
|
||||
class AgentConfig(TypedModel, type=AgentType.BASE):
    """Base configuration shared by all agents; serialized with type="base"."""
    # Message the agent speaks first on the call, if any.
    initial_message: Optional[str] = None
|
||||
|
||||
|
||||
class LLMAgentConfig(AgentConfig, type=AgentType.LLM):
    """Configuration for the completion-LLM-backed agent."""
    # System-style preamble prepended to the conversation prompt.
    prompt_preamble: str
    expected_first_prompt: Optional[str] = None
|
||||
|
||||
class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT):
    """Configuration for the ChatGPT-backed agent (same fields as LLMAgentConfig)."""
    # System-style preamble prepended to the conversation prompt.
    prompt_preamble: str
    expected_first_prompt: Optional[str] = None
|
||||
|
||||
class InformationRetrievalAgentConfig(
    AgentConfig, type=AgentType.INFORMATION_RETRIEVAL
):
    """Config for an agent that places a call to collect specific data points."""
    recipient_descriptor: str
    caller_descriptor: str
    goal_description: str
    # Names of the data points the agent should collect on the call.
    fields: list[str]
    # TODO: add fields for IVR, voicemail
|
||||
|
||||
|
||||
class EchoAgentConfig(AgentConfig, type=AgentType.ECHO):
    """Agent that echoes the caller back; no extra configuration needed."""
    pass
|
||||
5
vocode/models/audio_encoding.py
Normal file
5
vocode/models/audio_encoding.py
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
from enum import Enum
|
||||
|
||||
class AudioEncoding(str, Enum):
    """Wire formats for raw audio chunks."""
    LINEAR16 = "linear16"  # 16-bit linear PCM (matches pyaudio.paInt16 usage)
    MULAW = "mulaw"  # mu-law companded audio
|
||||
51
vocode/models/model.py
Normal file
51
vocode/models/model.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import pydantic
|
||||
|
||||
class BaseModel(pydantic.BaseModel):
|
||||
|
||||
def __init__(self, **data):
|
||||
for key, value in data.items():
|
||||
if isinstance(value, dict):
|
||||
data[key] = self.parse_obj(value)
|
||||
super().__init__(**data)
|
||||
|
||||
# Adapted from https://github.com/pydantic/pydantic/discussions/3091
class TypedModel(BaseModel):
    """Polymorphic model: each subclass registers a `type` tag so instances
    can round-trip through serialized dicts back to the concrete class."""

    # Registry of [tag, subclass] pairs. Deliberately a single class-level
    # list shared by the whole hierarchy — every subclass appends to it.
    _subtypes_ = []

    def __init_subclass__(cls, type=None):
        # `type` is the keyword given in the class statement, e.g.
        # `class Foo(TypedModel, type=SomeEnum.FOO)` (shadows builtins.type).
        cls._subtypes_.append([type, cls])

    @classmethod
    def get_cls(_cls, type):
        """Return the subclass registered under tag `type`; raises if unknown."""
        for t, cls in _cls._subtypes_:
            if t == type:
                return cls
        raise ValueError(f'Unknown type {type}')

    @classmethod
    def get_type(_cls, cls_name):
        """Return the tag registered for the subclass named `cls_name`."""
        for t, cls in _cls._subtypes_:
            if cls.__name__ == cls_name:
                return t
        raise ValueError(f'Unknown class {cls_name}')

    @classmethod
    def parse_obj(cls, obj):
        """Deserialize a dict into the concrete subclass named by obj['type']."""
        data_type = obj.get('type')
        if data_type is None:
            raise ValueError(f'type is required for {cls.__name__}')

        sub = cls.get_cls(data_type)
        if sub is None:
            # NOTE(review): unreachable — get_cls raises rather than returning None.
            raise ValueError(f'Unknown type {data_type}')
        return sub(**obj)

    def _iter(self, **kwargs):
        # Pydantic v1 internal hook used by .dict()/.json(): prepend the
        # `type` tag so serialized output can be parsed back via parse_obj.
        yield 'type', self.get_type(self.__class__.__name__)
        yield from super()._iter(**kwargs)

    @property
    def type(self):
        # Tag registered for this instance's concrete class.
        return self.get_type(self.__class__.__name__)
|
||||
|
||||
27
vocode/models/synthesizer.py
Normal file
27
vocode/models/synthesizer.py
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
from enum import Enum
|
||||
from .model import TypedModel
|
||||
from .audio_encoding import AudioEncoding
|
||||
from ..output_device.base_output_device import BaseOutputDevice
|
||||
|
||||
class SynthesizerType(str, Enum):
    """Discriminator tags for the SynthesizerConfig hierarchy."""
    BASE = "base"
    AZURE = "azure"
    GOOGLE = "google"
    ELEVEN_LABS = "eleven_labs"
|
||||
|
||||
class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE):
    """Base synthesizer settings; output format must match the playback device."""
    sampling_rate: int
    audio_encoding: AudioEncoding

    @classmethod
    def from_output_device(cls, output_device: BaseOutputDevice):
        """Build a config mirroring the output device's rate and encoding."""
        return cls(sampling_rate=output_device.sampling_rate, audio_encoding=output_device.audio_encoding)
|
||||
|
||||
class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
    """Azure TTS; no settings beyond the base config."""
    pass
|
||||
|
||||
class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
    """Google TTS; no settings beyond the base config."""
    pass
|
||||
|
||||
class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
    """Eleven Labs TTS; no settings beyond the base config."""
    pass
|
||||
14
vocode/models/telephony.py
Normal file
14
vocode/models/telephony.py
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
from pydantic import BaseModel
|
||||
from vocode.models.agent import AgentConfig, InformationRetrievalAgentConfig
|
||||
|
||||
|
||||
class CallEntity(BaseModel):
    """One party on a phone call."""
    # Phone number to dial / dial from — TODO confirm expected format (E.164?).
    phone_number: str
    # Human-readable description of the party (feeds the agent's descriptors).
    descriptor: str
|
||||
|
||||
|
||||
class CreateCallRequest(BaseModel):
    """Payload for the backend's /create_outbound_call endpoint (see telephony.create_call)."""
    recipient: CallEntity
    caller: CallEntity
    agent_config: InformationRetrievalAgentConfig  # TODO switch to AgentConfig
    # TODO add IVR/etc.
|
||||
31
vocode/models/transcriber.py
Normal file
31
vocode/models/transcriber.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
from enum import Enum
|
||||
from typing import Optional
|
||||
from .audio_encoding import AudioEncoding
|
||||
from .model import TypedModel
|
||||
from ..input_device.base_input_device import BaseInputDevice
|
||||
|
||||
class TranscriberType(str, Enum):
    """Discriminator tags for the TranscriberConfig hierarchy."""
    BASE = "base"
    DEEPGRAM = "deepgram"
    GOOGLE = "google"
|
||||
|
||||
class TranscriberConfig(TypedModel, type=TranscriberType.BASE):
    """Base transcriber settings; input format must match the capture device."""
    sampling_rate: int
    audio_encoding: AudioEncoding
    chunk_size: int

    @classmethod
    def from_input_device(cls, input_device: BaseInputDevice):
        """Build a config mirroring the input device's rate, encoding and chunk size."""
        return cls(
            sampling_rate=input_device.sampling_rate,
            audio_encoding=input_device.audio_encoding,
            chunk_size=input_device.chunk_size)
|
||||
|
||||
class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM):
    # Deepgram model name; server-side default when None.
    model: Optional[str] = None
    # Presumably primes the model before streaming — confirm server semantics.
    should_warmup_model: bool = False
    # Deepgram model version; server-side default when None.
    version: Optional[str] = None
|
||||
|
||||
class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE):
    # Google STT model name; server-side default when None.
    model: Optional[str] = None
    # Presumably primes the model before streaming — confirm server semantics.
    should_warmup_model: bool = False
|
||||
36
vocode/models/websocket.py
Normal file
36
vocode/models/websocket.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
import base64
|
||||
from enum import Enum
|
||||
from .model import TypedModel
|
||||
from .transcriber import TranscriberConfig
|
||||
from .agent import AgentConfig
|
||||
from .synthesizer import SynthesizerConfig
|
||||
|
||||
class WebSocketMessageType(str, Enum):
    """Discriminator tags for conversation websocket frames (see TypedModel)."""
    BASE = 'base'
    START = 'start'
    AUDIO = 'audio'
    READY = 'ready'
    STOP = 'stop'
|
||||
|
||||
# Base class for all conversation websocket frames; `type` discriminates.
class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE): pass
|
||||
|
||||
class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO):
    """One chunk of audio, carried as base64 text in the `data` field."""
    data: str

    @classmethod
    def from_bytes(cls, chunk: bytes):
        """Wrap raw audio bytes in a message, base64-encoding the payload."""
        encoded = base64.b64encode(chunk).decode('utf-8')
        return cls(data=encoded)

    def get_bytes(self) -> bytes:
        """Decode the base64 payload back into raw audio bytes."""
        return base64.b64decode(self.data)
|
||||
|
||||
class StartMessage(WebSocketMessage, type=WebSocketMessageType.START):
    """First client frame: declares the pipeline configuration for the session."""
    transcriber_config: TranscriberConfig
    agent_config: AgentConfig
    synthesizer_config: SynthesizerConfig
|
||||
|
||||
class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY):
    """Server frame acknowledging the StartMessage; audio streaming may begin."""
    pass
|
||||
|
||||
class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP):
    """Final client frame ending the conversation."""
    pass
|
||||
15
vocode/output_device/base_output_device.py
Normal file
15
vocode/output_device/base_output_device.py
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
from ..models.audio_encoding import AudioEncoding
|
||||
|
||||
class BaseOutputDevice:
|
||||
|
||||
def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding):
|
||||
self.sampling_rate = sampling_rate
|
||||
self.audio_encoding = audio_encoding
|
||||
|
||||
async def send_async(self, chunk):
|
||||
raise NotImplemented
|
||||
|
||||
async def maybe_send_mark_async(self, message):
|
||||
pass
|
||||
|
||||
|
||||
28
vocode/output_device/speaker_output.py
Normal file
28
vocode/output_device/speaker_output.py
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
import pyaudio
|
||||
|
||||
from .base_output_device import BaseOutputDevice
|
||||
from ..models.audio_encoding import AudioEncoding
|
||||
|
||||
class SpeakerOutput(BaseOutputDevice):
    """Plays audio chunks on a local output device via a blocking PyAudio stream."""

    DEFAULT_SAMPLING_RATE = 44100

    def __init__(self, pa: pyaudio.PyAudio, device_info: dict, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16):
        self.device_info = device_info
        # Prefer the device's native rate; fall back to a sane default.
        sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE))
        super().__init__(sampling_rate, audio_encoding)
        self.pa = pa
        self.stream = self.pa.open(
            output=True,
            channels=1,
            rate=self.sampling_rate,
            format=pyaudio.paInt16,
            output_device_index=int(self.device_info['index'])
        )

    async def send_async(self, chunk):
        # NOTE: stream.write blocks the event loop for the chunk's duration.
        self.stream.write(chunk)

    def terminate(self):
        """Stop playback and release the stream and the PyAudio session."""
        self.stream.stop_stream()
        self.stream.close()
        # BUGFIX: was `self.pa.close()` — PyAudio.close() requires a stream
        # argument, so the old call raised TypeError. terminate() is the
        # correct way to release the session. NOTE(review): `pa` may be shared
        # with a MicrophoneInput (see helpers.py) — terminating ends that too.
        self.pa.terminate()
|
||||
41
vocode/telephony.py
Normal file
41
vocode/telephony.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
import requests
|
||||
from vocode.models.agent import InformationRetrievalAgentConfig, LLMAgentConfig
|
||||
from vocode.models.telephony import CallEntity, CreateCallRequest
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables from a local .env file so BASE_URL can be read below.
load_dotenv()

# Host (no scheme) of the Vocode backend.
BASE_URL = os.environ.get("BASE_URL")
|
||||
|
||||
|
||||
def create_call(request: CreateCallRequest):
    """POST the call request to the backend and return the HTTP status code."""
    endpoint = f"http://{BASE_URL}/create_outbound_call"
    payload = request.dict()
    response = requests.post(
        endpoint,
        headers={"Content-Type": "application/json"},
        json=payload,
    )
    return response.status_code
|
||||
|
||||
|
||||
def create_information_retrieval_call(
    recipient: CallEntity,
    caller: CallEntity,
    goal_description: str,
    fields: list[str] = None,
):
    """Place an outbound information-retrieval call.

    Args:
        recipient: Party being called; its descriptor seeds the agent prompt.
        caller: Calling party; its descriptor seeds the agent prompt.
        goal_description: What the agent should accomplish on the call.
        fields: Data points the agent should collect; may be omitted.

    Returns:
        The HTTP status code returned by the backend (see create_call).
    """
    agent_config = InformationRetrievalAgentConfig(
        recipient_descriptor=recipient.descriptor,
        caller_descriptor=caller.descriptor,
        goal_description=goal_description,
        # BUGFIX: the default of None failed validation for the required
        # list-typed `fields` on InformationRetrievalAgentConfig; substitute
        # an empty list when the caller omits it.
        fields=fields if fields is not None else [],
    )

    return create_call(
        CreateCallRequest(
            recipient=recipient,
            caller=caller,
            agent_config=agent_config,
        )
    )
|
||||
Loading…
Add table
Add a link
Reference in a new issue