Python SDK

This commit is contained in:
Ajay Raj 2023-02-24 10:47:17 -08:00
commit 6dc9fceeb5
18 changed files with 482 additions and 0 deletions

4
.gitignore vendored Normal file
View file

@ -0,0 +1,4 @@
venv/
__pycache__/
.env
.DS_Store

1
README.md Normal file
View file

@ -0,0 +1 @@
# vocode-sdk

5
requirements.txt Normal file
View file

@ -0,0 +1,5 @@
PyAudio==0.2.13
pydantic==1.10.5
python-dotenv==0.21.1
typing_extensions==4.5.0
websockets==10.4

31
simple_conversation.py Normal file
View file

@ -0,0 +1,31 @@
import asyncio
import logging
import os
import signal
from vocode.conversation import Conversation
from vocode.helpers import create_microphone_input_and_speaker_output
from vocode.models.transcriber import DeepgramTranscriberConfig
from vocode.models.agent import ChatGPTAgentConfig
from vocode.models.synthesizer import AzureSynthesizerConfig
# Configure root logging so SDK progress messages are visible on stdout.
logging.basicConfig()
logging.root.setLevel(logging.INFO)

if __name__ == "__main__":
    # Pick the first available microphone/speaker pair without prompting.
    mic, speaker = create_microphone_input_and_speaker_output(
        use_first_available_device=True
    )
    conversation = Conversation(
        token=os.environ.get("VOCODE_API_KEY"),
        input_device=mic,
        output_device=speaker,
        # Derive audio-format settings directly from the chosen devices.
        transcriber_config=DeepgramTranscriberConfig.from_input_device(mic),
        agent_config=ChatGPTAgentConfig(
            initial_message="Hello!",
            prompt_preamble="The AI is having a pleasant conversation about life."
        ),
        synthesizer_config=AzureSynthesizerConfig.from_output_device(speaker),
    )
    # Ctrl+C flips the conversation's active flag so the send loop exits cleanly.
    signal.signal(signal.SIGINT, lambda _sig, _frame: conversation.deactivate())
    asyncio.run(conversation.start())

75
vocode/conversation.py Normal file
View file

@ -0,0 +1,75 @@
import websockets
import asyncio
from dotenv import load_dotenv
import os
import logging
load_dotenv()
from .input_device.base_input_device import BaseInputDevice
from .output_device.base_output_device import BaseOutputDevice
from .models.transcriber import TranscriberConfig
from .models.agent import AgentConfig
from .models.synthesizer import SynthesizerConfig
from .models.websocket import ReadyMessage, AudioMessage, StartMessage, StopMessage
BASE_URL = os.environ.get('BASE_URL')
VOCODE_WEBSOCKET_URL = f'wss://{BASE_URL}/conversation'
class Conversation:
    """Full-duplex audio conversation against the Vocode websocket backend.

    Streams microphone chunks to the server and plays synthesized replies on
    the output device until ``deactivate()`` is called.
    """

    def __init__(
        self,
        token: str,
        input_device: BaseInputDevice,
        output_device: BaseOutputDevice,
        transcriber_config: TranscriberConfig,
        agent_config: AgentConfig,
        synthesizer_config: SynthesizerConfig
    ):
        self.token = token
        self.input_device = input_device
        self.output_device = output_device
        self.transcriber_config = transcriber_config
        self.agent_config = agent_config
        self.synthesizer_config = synthesizer_config
        self.logger = logging.getLogger(__name__)
        # Kept as a plain bool for backward compatibility with external
        # readers; the event below is what sender() actually awaits.
        self.receiver_ready = False
        # Fixed: replaces the previous 100ms sleep-polling loop in
        # wait_for_ready() with proper event-based signaling.
        self._ready_event = asyncio.Event()
        self.active = True

    async def wait_for_ready(self):
        """Block until the server's ReadyMessage has been received."""
        await self._ready_event.wait()
        return True

    def deactivate(self):
        """Ask the send loop to stop; safe to call from a signal handler."""
        self.active = False

    async def start(self):
        """Open the websocket and run the send/receive loops until stopped."""
        async with websockets.connect(f"{VOCODE_WEBSOCKET_URL}?key={self.token}") as ws:

            async def sender(ws):
                # Announce the desired transcriber/agent/synthesizer setup,
                # then stream microphone audio until deactivated.
                start_message = StartMessage(
                    transcriber_config=self.transcriber_config,
                    agent_config=self.agent_config,
                    synthesizer_config=self.synthesizer_config
                )
                await ws.send(start_message.json())
                await self.wait_for_ready()
                self.logger.info("Listening...press Ctrl+C to stop")
                while self.active:
                    data = self.input_device.get_audio()
                    if data:
                        await ws.send(AudioMessage.from_bytes(data).json())
                    # Yield control so receiver() is scheduled even while
                    # audio is flowing.
                    await asyncio.sleep(0)
                await ws.send(StopMessage().json())

            async def receiver(ws):
                # First frame from the server must be the ready handshake.
                ReadyMessage.parse_raw(await ws.recv())
                self.receiver_ready = True
                self._ready_event.set()
                # Every subsequent frame is treated as synthesized audio.
                async for msg in ws:
                    audio_message = AudioMessage.parse_raw(msg)
                    await self.output_device.send_async(audio_message.get_bytes())

            return await asyncio.gather(sender(ws), receiver(ws))

30
vocode/helpers.py Normal file
View file

@ -0,0 +1,30 @@
import pyaudio
from .input_device.microphone_input import MicrophoneInput
from .output_device.speaker_output import SpeakerOutput
import logging
logger = logging.getLogger(__name__)
def _get_device_prompt(device_infos: list[dict]) -> str:
return """Please select a device:
{}
Choice: """.format(
"\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos)))
def create_microphone_input_and_speaker_output(use_first_available_device=False) -> tuple[MicrophoneInput, SpeakerOutput]:
    """Create a (MicrophoneInput, SpeakerOutput) pair sharing one PyAudio session.

    When use_first_available_device is False, the user is prompted on stdin to
    pick each device by index.

    Raises:
        ValueError: if the host has no input or no output audio devices
            (previously this crashed with a bare IndexError).
    """
    pa = pyaudio.PyAudio()
    num_devices = pa.get_device_count()
    devices = list(map(pa.get_device_info_by_index, range(num_devices)))
    input_device_infos = [d for d in devices if d['maxInputChannels'] > 0]
    output_device_infos = [d for d in devices if d['maxOutputChannels'] > 0]
    if not input_device_infos:
        raise ValueError("No input audio devices found")
    if not output_device_infos:
        raise ValueError("No output audio devices found")
    if use_first_available_device:
        input_device_info = input_device_infos[0]
        output_device_info = output_device_infos[0]
    else:
        input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))]
        output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))]
    logger.info("Using microphone input device: %s", input_device_info['name'])
    microphone_input = MicrophoneInput(pa, input_device_info)
    logger.info("Using speaker output device: %s", output_device_info['name'])
    speaker_output = SpeakerOutput(pa, output_device_info)
    return microphone_input, speaker_output

View file

@ -0,0 +1,14 @@
from ..models.audio_encoding import AudioEncoding
import queue
from typing import Optional
class BaseInputDevice:
    """Common state for audio input sources.

    Concrete devices push raw chunks into ``self.queue``; callers drain them
    via ``get_audio``.
    """

    def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int):
        self.sampling_rate = sampling_rate
        self.audio_encoding = audio_encoding
        self.chunk_size = chunk_size
        # Buffer for captured audio chunks.
        self.queue = queue.Queue()

    def get_audio(self) -> Optional[bytes]:
        """Return the next buffered chunk, or None when nothing is waiting."""
        raise NotImplementedError

View file

@ -0,0 +1,37 @@
import pyaudio
from typing import Optional
import queue
from .base_input_device import BaseInputDevice
from ..models.audio_encoding import AudioEncoding
class MicrophoneInput(BaseInputDevice):
    """Captures microphone audio through a callback-driven PyAudio stream."""

    DEFAULT_SAMPLING_RATE = 44100
    DEFAULT_CHUNK_SIZE = 2048

    def __init__(self, pa: pyaudio.PyAudio, device_info: dict, chunk_size: int = DEFAULT_CHUNK_SIZE):
        self.device_info = device_info
        # Prefer the device's native sample rate; fall back to 44.1 kHz.
        sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE))
        # Base __init__ creates self.queue, which _stream_callback feeds.
        super().__init__(sampling_rate, AudioEncoding.LINEAR16, chunk_size)
        self.pa = pa
        self.stream = pa.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.sampling_rate,
            input=True,
            frames_per_buffer=self.chunk_size,
            input_device_index=int(self.device_info['index']),
            stream_callback=self._stream_callback
        )
        # Fixed: removed the redundant `self.queue = queue.Queue()` that used
        # to follow the stream open -- the callback may already be delivering
        # chunks into the queue created by the base class, and replacing the
        # queue here would silently drop them.

    def _stream_callback(self, in_data, *_args):
        # Runs on PyAudio's capture thread; hand the chunk to the queue.
        self.queue.put_nowait(in_data)
        return (None, pyaudio.paContinue)

    def get_audio(self) -> Optional[bytes]:
        """Return the next captured chunk, or None if the buffer is empty."""
        try:
            return self.queue.get_nowait()
        except queue.Empty:
            return None

37
vocode/models/agent.py Normal file
View file

@ -0,0 +1,37 @@
from typing import Optional
from enum import Enum
from .model import TypedModel
class AgentType(str, Enum):
    # Discriminator tags registered with TypedModel; the string values travel
    # in the serialized "type" field of each AgentConfig payload.
    BASE = "base"
    LLM = "llm"
    CHAT_GPT = "chat_gpt"
    ECHO = "echo"
    INFORMATION_RETRIEVAL = "information_retrieval"
class AgentConfig(TypedModel, type=AgentType.BASE):
    # Message the agent speaks when the conversation starts; None means the
    # agent waits for the user to speak first.
    initial_message: Optional[str] = None
class LLMAgentConfig(AgentConfig, type=AgentType.LLM):
    # System-style preamble that frames the LLM's behavior for the call.
    prompt_preamble: str
    expected_first_prompt: Optional[str] = None
class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT):
    # Same shape as LLMAgentConfig, but routed to the ChatGPT-backed agent.
    prompt_preamble: str
    expected_first_prompt: Optional[str] = None
class InformationRetrievalAgentConfig(
    AgentConfig, type=AgentType.INFORMATION_RETRIEVAL
):
    """Agent configured to collect a set of named data points on a call."""
    # Free-text descriptions of the two parties on the call.
    recipient_descriptor: str
    caller_descriptor: str
    # What the agent is trying to accomplish.
    goal_description: str
    # Names of the data points the agent should extract.
    fields: list[str]
    # TODO: add fields for IVR, voicemail
class EchoAgentConfig(AgentConfig, type=AgentType.ECHO):
    # No extra settings; exists to carry the "echo" type tag.
    pass

View file

@ -0,0 +1,5 @@
from enum import Enum
class AudioEncoding(str, Enum):
    # Supported raw-audio wire formats.
    LINEAR16 = "linear16"  # 16-bit signed PCM
    MULAW = "mulaw"  # 8-bit mu-law companded audio

51
vocode/models/model.py Normal file
View file

@ -0,0 +1,51 @@
import pydantic
class BaseModel(pydantic.BaseModel):
    # Routes dict-valued constructor arguments through parse_obj before normal
    # pydantic validation, so nested TypedModel payloads are resolved to their
    # concrete subclass via the embedded "type" tag.
    def __init__(self, **data):
        for key, value in data.items():
            if isinstance(value, dict):
                # NOTE(review): this assumes every dict-valued field is a
                # TypedModel payload carrying a "type" key; a plain dict field
                # would raise in TypedModel.parse_obj -- confirm.
                data[key] = self.parse_obj(value)
        super().__init__(**data)
# Adapted from https://github.com/pydantic/pydantic/discussions/3091
class TypedModel(BaseModel):
    """Pydantic model with a registry-based ``type`` discriminator.

    Each subclass registers itself (with the tag given as the ``type`` class
    keyword) in a shared registry; serialization injects the tag so that
    ``parse_obj`` can reconstruct the concrete subclass from a plain dict.
    """
    # Shared registry of [type_tag, subclass] pairs for the whole hierarchy.
    _subtypes_ = []
    def __init_subclass__(cls, type=None):
        # Runs for every subclass definition; ``type=None`` entries still
        # register (e.g. intermediate base classes).
        cls._subtypes_.append([type, cls])
    @classmethod
    def get_cls(_cls, type):
        # Resolve a type tag to its registered subclass.
        for t, cls in _cls._subtypes_:
            if t == type:
                return cls
        raise ValueError(f'Unknown type {type}')
    @classmethod
    def get_type(_cls, cls_name):
        # Reverse lookup: class name -> registered type tag.
        for t, cls in _cls._subtypes_:
            if cls.__name__ == cls_name:
                return t
        raise ValueError(f'Unknown class {cls_name}')
    @classmethod
    def parse_obj(cls, obj):
        # Dispatch deserialization on the payload's "type" tag.
        data_type = obj.get('type')
        if data_type is None:
            raise ValueError(f'type is required for {cls.__name__}')
        sub = cls.get_cls(data_type)
        # NOTE(review): get_cls raises rather than returning None, so this
        # branch appears unreachable; kept as belt-and-braces.
        if sub is None:
            raise ValueError(f'Unknown type {data_type}')
        return sub(**obj)
    def _iter(self, **kwargs):
        # Hook into pydantic v1 serialization to emit the discriminator first.
        yield 'type', self.get_type(self.__class__.__name__)
        yield from super()._iter(**kwargs)
    @property
    def type(self):
        # The instance's registered type tag.
        return self.get_type(self.__class__.__name__)

View file

@ -0,0 +1,27 @@
from enum import Enum
from .model import TypedModel
from .audio_encoding import AudioEncoding
from ..output_device.base_output_device import BaseOutputDevice
class SynthesizerType(str, Enum):
    # Discriminator tags for SynthesizerConfig subclasses.
    BASE = "base"
    AZURE = "azure"
    GOOGLE = "google"
    ELEVEN_LABS = "eleven_labs"
class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE):
    # Audio format the synthesizer must produce so playback matches the
    # output device.
    sampling_rate: int
    audio_encoding: AudioEncoding
    @classmethod
    def from_output_device(cls, output_device: BaseOutputDevice):
        # Alternate constructor: mirror the output device's audio format.
        return cls(sampling_rate=output_device.sampling_rate, audio_encoding=output_device.audio_encoding)
class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
    # No Azure-specific settings yet; exists to carry the "azure" type tag.
    pass
class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
    # No Google-specific settings yet; exists to carry the "google" type tag.
    pass
class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
    # No ElevenLabs-specific settings yet; carries the "eleven_labs" type tag.
    pass

View file

@ -0,0 +1,14 @@
from pydantic import BaseModel
from vocode.models.agent import AgentConfig, InformationRetrievalAgentConfig
class CallEntity(BaseModel):
    # One party on a phone call: their number plus a free-text description
    # the agent can use when referring to them.
    phone_number: str
    descriptor: str
class CreateCallRequest(BaseModel):
    # Request body sent by telephony.create_call to the outbound-call endpoint.
    recipient: CallEntity
    caller: CallEntity
    agent_config: InformationRetrievalAgentConfig # TODO switch to AgentConfig
    # TODO add IVR/etc.

View file

@ -0,0 +1,31 @@
from enum import Enum
from typing import Optional
from .audio_encoding import AudioEncoding
from .model import TypedModel
from ..input_device.base_input_device import BaseInputDevice
class TranscriberType(str, Enum):
    # Discriminator tags for TranscriberConfig subclasses.
    BASE = "base"
    DEEPGRAM = "deepgram"
    GOOGLE = "google"
class TranscriberConfig(TypedModel, type=TranscriberType.BASE):
    # Audio format the transcriber should expect, matched to the input device.
    sampling_rate: int
    audio_encoding: AudioEncoding
    chunk_size: int
    @classmethod
    def from_input_device(cls, input_device: BaseInputDevice):
        # Alternate constructor: mirror the input device's audio format.
        return cls(
            sampling_rate=input_device.sampling_rate,
            audio_encoding=input_device.audio_encoding,
            chunk_size=input_device.chunk_size)
class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM):
    # Optional Deepgram model/version selection; server defaults apply if None.
    model: Optional[str] = None
    should_warmup_model: bool = False
    version: Optional[str] = None
class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE):
    # Optional Google model selection; server default applies if None.
    model: Optional[str] = None
    should_warmup_model: bool = False

View file

@ -0,0 +1,36 @@
import base64
from enum import Enum
from .model import TypedModel
from .transcriber import TranscriberConfig
from .agent import AgentConfig
from .synthesizer import SynthesizerConfig
class WebSocketMessageType(str, Enum):
    # Discriminator tags for frames exchanged over the conversation websocket.
    BASE = 'base'
    START = 'start'
    AUDIO = 'audio'
    READY = 'ready'
    STOP = 'stop'
class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE):
    # Base frame type; concrete messages carry the discriminating "type" tag.
    pass
class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO):
    # Audio chunk as base64 text, so raw bytes survive JSON transport.
    data: str
    @classmethod
    def from_bytes(cls, chunk: bytes):
        # Wrap a raw chunk in an ASCII-safe base64 payload.
        return cls(data=base64.b64encode(chunk).decode('utf-8'))
    def get_bytes(self) -> bytes:
        # Inverse of from_bytes: recover the raw audio bytes.
        return base64.b64decode(self.data)
class StartMessage(WebSocketMessage, type=WebSocketMessageType.START):
    # First client frame: declares the desired pipeline configuration.
    transcriber_config: TranscriberConfig
    agent_config: AgentConfig
    synthesizer_config: SynthesizerConfig
class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY):
    # Server acknowledgement that audio streaming may begin.
    pass
class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP):
    # Final client frame: ends the conversation.
    pass

View file

@ -0,0 +1,15 @@
from ..models.audio_encoding import AudioEncoding
class BaseOutputDevice:
    """Abstract sink for playing synthesized audio.

    Subclasses override ``send_async`` to consume raw audio chunks.
    """

    def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding):
        self.sampling_rate = sampling_rate
        self.audio_encoding = audio_encoding

    async def send_async(self, chunk):
        # Fixed: `raise NotImplemented` raised the NotImplemented *sentinel*,
        # which is a TypeError at runtime; NotImplementedError is the
        # exception intended for unimplemented abstract methods.
        raise NotImplementedError

    async def maybe_send_mark_async(self, message):
        # Optional hook for playback "mark" events; default is a no-op.
        pass

View file

@ -0,0 +1,28 @@
import pyaudio
from .base_output_device import BaseOutputDevice
from ..models.audio_encoding import AudioEncoding
class SpeakerOutput(BaseOutputDevice):
    """Plays audio chunks on a PyAudio output device via a blocking stream."""

    DEFAULT_SAMPLING_RATE = 44100

    def __init__(self, pa: pyaudio.PyAudio, device_info: dict, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16):
        self.device_info = device_info
        # Prefer the device's native sample rate; fall back to 44.1 kHz.
        sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE))
        super().__init__(sampling_rate, audio_encoding)
        self.pa = pa
        # NOTE(review): the stream is always opened as 16-bit PCM even when
        # audio_encoding is MULAW -- confirm callers only pass LINEAR16.
        self.stream = self.pa.open(
            output=True,
            channels=1,
            rate=self.sampling_rate,
            format=pyaudio.paInt16,
            output_device_index=int(self.device_info['index'])
        )

    async def send_async(self, chunk):
        # Blocking write on the event loop; acceptable for small chunks.
        self.stream.write(chunk)

    def terminate(self):
        self.stream.close()
        # Fixed: PyAudio.close() requires a stream argument, so the previous
        # `self.pa.close()` raised a TypeError; terminate() is the call that
        # releases the PortAudio session. Beware that the PyAudio instance
        # may be shared with the microphone input (see helpers.py).
        self.pa.terminate()

41
vocode/telephony.py Normal file
View file

@ -0,0 +1,41 @@
import requests
from vocode.models.agent import InformationRetrievalAgentConfig, LLMAgentConfig
from vocode.models.telephony import CallEntity, CreateCallRequest
import os
from dotenv import load_dotenv
load_dotenv()
BASE_URL = os.environ.get("BASE_URL")
def create_call(request: CreateCallRequest):
    """POST the call request to the telephony server; return the HTTP status code."""
    response = requests.post(
        f"http://{BASE_URL}/create_outbound_call",
        headers={"Content-Type": "application/json"},
        json=request.dict(),
    )
    return response.status_code
def create_information_retrieval_call(
    recipient: CallEntity,
    caller: CallEntity,
    goal_description: str,
    fields: list[str] = None,
):
    """Create an outbound call whose agent collects the named fields.

    Args:
        recipient: the party being called.
        caller: the party placing the call.
        goal_description: what the agent should accomplish on the call.
        fields: names of data points to extract; defaults to none.

    Returns:
        The HTTP status code from create_call.
    """
    # Fixed: the None default used to be forwarded into
    # InformationRetrievalAgentConfig.fields, which is a required list field
    # and would reject None at validation time.
    if fields is None:
        fields = []
    agent_config = InformationRetrievalAgentConfig(
        recipient_descriptor=recipient.descriptor,
        caller_descriptor=caller.descriptor,
        goal_description=goal_description,
        fields=fields,
    )
    return create_call(
        CreateCallRequest(
            recipient=recipient,
            caller=caller,
            agent_config=agent_config,
        )
    )