From 6dc9fceeb51b651c30cf71e95c2d37a4b67a2b7b Mon Sep 17 00:00:00 2001 From: Ajay Raj Date: Fri, 24 Feb 2023 10:47:17 -0800 Subject: [PATCH] python SDK --- .gitignore | 4 ++ README.md | 1 + requirements.txt | 5 ++ simple_conversation.py | 31 +++++++++ vocode/conversation.py | 75 ++++++++++++++++++++++ vocode/helpers.py | 30 +++++++++ vocode/input_device/base_input_device.py | 14 ++++ vocode/input_device/microphone_input.py | 37 +++++++++++ vocode/models/agent.py | 37 +++++++++++ vocode/models/audio_encoding.py | 5 ++ vocode/models/model.py | 51 +++++++++++++++ vocode/models/synthesizer.py | 27 ++++++++ vocode/models/telephony.py | 14 ++++ vocode/models/transcriber.py | 31 +++++++++ vocode/models/websocket.py | 36 +++++++++++ vocode/output_device/base_output_device.py | 15 +++++ vocode/output_device/speaker_output.py | 28 ++++++++ vocode/telephony.py | 41 ++++++++++++ 18 files changed, 482 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 simple_conversation.py create mode 100644 vocode/conversation.py create mode 100644 vocode/helpers.py create mode 100644 vocode/input_device/base_input_device.py create mode 100644 vocode/input_device/microphone_input.py create mode 100644 vocode/models/agent.py create mode 100644 vocode/models/audio_encoding.py create mode 100644 vocode/models/model.py create mode 100644 vocode/models/synthesizer.py create mode 100644 vocode/models/telephony.py create mode 100644 vocode/models/transcriber.py create mode 100644 vocode/models/websocket.py create mode 100644 vocode/output_device/base_output_device.py create mode 100644 vocode/output_device/speaker_output.py create mode 100644 vocode/telephony.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..455216c --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +venv/ +__pycache__/ +.env +.DS_Store diff --git a/README.md b/README.md new file mode 100644 index 0000000..10b9e9f --- /dev/null +++ 
b/README.md @@ -0,0 +1 @@ +# vocode-sdk diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..be86a68 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +PyAudio==0.2.13 +pydantic==1.10.5 +python-dotenv==0.21.1 +typing_extensions==4.5.0 +websockets==10.4 diff --git a/simple_conversation.py b/simple_conversation.py new file mode 100644 index 0000000..4c10bc6 --- /dev/null +++ b/simple_conversation.py @@ -0,0 +1,31 @@ +import asyncio +import logging +import os +import signal + +from vocode.conversation import Conversation +from vocode.helpers import create_microphone_input_and_speaker_output +from vocode.models.transcriber import DeepgramTranscriberConfig +from vocode.models.agent import ChatGPTAgentConfig +from vocode.models.synthesizer import AzureSynthesizerConfig + +logging.basicConfig() +logging.root.setLevel(logging.INFO) + +if __name__ == "__main__": + microphone_input, speaker_output = create_microphone_input_and_speaker_output(use_first_available_device=True) + + conversation = Conversation( + token=os.environ.get("VOCODE_API_KEY"), + input_device=microphone_input, + output_device=speaker_output, + transcriber_config=DeepgramTranscriberConfig.from_input_device(microphone_input), + agent_config=ChatGPTAgentConfig( + initial_message="Hello!", + prompt_preamble="The AI is having a pleasant conversation about life." 
+ ), + synthesizer_config=AzureSynthesizerConfig.from_output_device(speaker_output) + ) + signal.signal(signal.SIGINT, lambda _0, _1: conversation.deactivate()) + asyncio.run(conversation.start()) + diff --git a/vocode/conversation.py b/vocode/conversation.py new file mode 100644 index 0000000..1eeb75d --- /dev/null +++ b/vocode/conversation.py @@ -0,0 +1,75 @@ +import websockets +import asyncio +from dotenv import load_dotenv +import os +import logging + +load_dotenv() + +from .input_device.base_input_device import BaseInputDevice +from .output_device.base_output_device import BaseOutputDevice +from .models.transcriber import TranscriberConfig +from .models.agent import AgentConfig +from .models.synthesizer import SynthesizerConfig +from .models.websocket import ReadyMessage, AudioMessage, StartMessage, StopMessage + +BASE_URL = os.environ.get('BASE_URL') +VOCODE_WEBSOCKET_URL = f'wss://{BASE_URL}/conversation' + +class Conversation: + + def __init__( + self, + token: str, + input_device: BaseInputDevice, + output_device: BaseOutputDevice, + transcriber_config: TranscriberConfig, + agent_config: AgentConfig, + synthesizer_config: SynthesizerConfig + ): + self.token = token + self.input_device = input_device + self.output_device = output_device + self.transcriber_config = transcriber_config + self.agent_config = agent_config + self.synthesizer_config = synthesizer_config + self.logger = logging.getLogger(__name__) + self.receiver_ready = False + self.active = True + + async def wait_for_ready(self): + while not self.receiver_ready: + await asyncio.sleep(0.1) + return True + + def deactivate(self): + self.active = False + + async def start(self): + async with websockets.connect(f"{VOCODE_WEBSOCKET_URL}?key={self.token}") as ws: + async def sender(ws): + start_message = StartMessage( + transcriber_config=self.transcriber_config, + agent_config=self.agent_config, + synthesizer_config=self.synthesizer_config + ) + await ws.send(start_message.json()) + await 
self.wait_for_ready() + self.logger.info("Listening...press Ctrl+C to stop") + while self.active: + data = self.input_device.get_audio() + if data: + await ws.send(AudioMessage.from_bytes(data).json()) + await asyncio.sleep(0) + await ws.send(StopMessage().json()) + + async def receiver(ws): + ReadyMessage.parse_raw(await ws.recv()) + self.receiver_ready = True + async for msg in ws: + audio_message = AudioMessage.parse_raw(msg) + await self.output_device.send_async(audio_message.get_bytes()) + + + return await asyncio.gather(sender(ws), receiver(ws)) + diff --git a/vocode/helpers.py b/vocode/helpers.py new file mode 100644 index 0000000..2f3a11e --- /dev/null +++ b/vocode/helpers.py @@ -0,0 +1,30 @@ +import pyaudio +from .input_device.microphone_input import MicrophoneInput +from .output_device.speaker_output import SpeakerOutput +import logging + +logger = logging.getLogger(__name__) + +def _get_device_prompt(device_infos: list[dict]) -> str: + return """Please select a device: +{} +Choice: """.format( + "\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos))) + +def create_microphone_input_and_speaker_output(use_first_available_device=False) -> tuple[MicrophoneInput, SpeakerOutput]: + pa = pyaudio.PyAudio() + num_devices = pa.get_device_count() + devices = list(map(pa.get_device_info_by_index, range(num_devices))) + input_device_infos = list(filter(lambda device: device['maxInputChannels'] > 0, devices)) + output_device_infos = list(filter(lambda device: device['maxOutputChannels'] > 0, devices)) + if use_first_available_device: + input_device_info = input_device_infos[0] + output_device_info = output_device_infos[0] + else: + input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))] + output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))] + logger.info("Using microphone input device: %s", input_device_info['name']) + microphone_input = MicrophoneInput(pa, 
input_device_info) + logger.info("Using speaker output device: %s", output_device_info['name']) + speaker_output = SpeakerOutput(pa, output_device_info) + return microphone_input, speaker_output \ No newline at end of file diff --git a/vocode/input_device/base_input_device.py b/vocode/input_device/base_input_device.py new file mode 100644 index 0000000..5c0385c --- /dev/null +++ b/vocode/input_device/base_input_device.py @@ -0,0 +1,14 @@ +from ..models.audio_encoding import AudioEncoding +import queue +from typing import Optional + +class BaseInputDevice(): + + def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int): + self.sampling_rate = sampling_rate + self.audio_encoding = audio_encoding + self.chunk_size = chunk_size + self.queue = queue.Queue() + + def get_audio(self) -> Optional[bytes]: + raise NotImplementedError \ No newline at end of file diff --git a/vocode/input_device/microphone_input.py b/vocode/input_device/microphone_input.py new file mode 100644 index 0000000..03055df --- /dev/null +++ b/vocode/input_device/microphone_input.py @@ -0,0 +1,37 @@ +import pyaudio +from typing import Optional +import queue + +from .base_input_device import BaseInputDevice +from ..models.audio_encoding import AudioEncoding + +class MicrophoneInput(BaseInputDevice): + + DEFAULT_SAMPLING_RATE = 44100 + DEFAULT_CHUNK_SIZE = 2048 + + def __init__(self, pa: pyaudio.PyAudio, device_info: dict, chunk_size: int = DEFAULT_CHUNK_SIZE): + self.device_info = device_info + sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) + super().__init__(sampling_rate, AudioEncoding.LINEAR16, chunk_size) + self.pa = pa + self.stream = pa.open( + format=pyaudio.paInt16, + channels=1, + rate=self.sampling_rate, + input=True, + frames_per_buffer=self.chunk_size, + input_device_index=int(self.device_info['index']), + stream_callback=self._stream_callback + ) + self.queue = queue.Queue() + + def _stream_callback(self, in_data, 
*_args): + self.queue.put_nowait(in_data) + return (None, pyaudio.paContinue) + + def get_audio(self) -> Optional[bytes]: + try: + return self.queue.get_nowait() + except queue.Empty: + return None \ No newline at end of file diff --git a/vocode/models/agent.py b/vocode/models/agent.py new file mode 100644 index 0000000..4bf9f6b --- /dev/null +++ b/vocode/models/agent.py @@ -0,0 +1,37 @@ +from typing import Optional +from enum import Enum +from .model import TypedModel + + +class AgentType(str, Enum): + BASE = "base" + LLM = "llm" + CHAT_GPT = "chat_gpt" + ECHO = "echo" + INFORMATION_RETRIEVAL = "information_retrieval" + + +class AgentConfig(TypedModel, type=AgentType.BASE): + initial_message: Optional[str] = None + + +class LLMAgentConfig(AgentConfig, type=AgentType.LLM): + prompt_preamble: str + expected_first_prompt: Optional[str] = None + +class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT): + prompt_preamble: str + expected_first_prompt: Optional[str] = None + +class InformationRetrievalAgentConfig( + AgentConfig, type=AgentType.INFORMATION_RETRIEVAL +): + recipient_descriptor: str + caller_descriptor: str + goal_description: str + fields: list[str] + # TODO: add fields for IVR, voicemail + + +class EchoAgentConfig(AgentConfig, type=AgentType.ECHO): + pass diff --git a/vocode/models/audio_encoding.py b/vocode/models/audio_encoding.py new file mode 100644 index 0000000..bc451bc --- /dev/null +++ b/vocode/models/audio_encoding.py @@ -0,0 +1,5 @@ +from enum import Enum + +class AudioEncoding(str, Enum): + LINEAR16 = "linear16" + MULAW = "mulaw" \ No newline at end of file diff --git a/vocode/models/model.py b/vocode/models/model.py new file mode 100644 index 0000000..5b7fc92 --- /dev/null +++ b/vocode/models/model.py @@ -0,0 +1,51 @@ +import pydantic + +class BaseModel(pydantic.BaseModel): + + def __init__(self, **data): + for key, value in data.items(): + if isinstance(value, dict): + data[key] = self.parse_obj(value) + super().__init__(**data) + +# 
Adapted from https://github.com/pydantic/pydantic/discussions/3091 +class TypedModel(BaseModel): + + _subtypes_ = [] + + def __init_subclass__(cls, type=None): + cls._subtypes_.append([type, cls]) + + @classmethod + def get_cls(_cls, type): + for t, cls in _cls._subtypes_: + if t == type: + return cls + raise ValueError(f'Unknown type {type}') + + @classmethod + def get_type(_cls, cls_name): + for t, cls in _cls._subtypes_: + if cls.__name__ == cls_name: + return t + raise ValueError(f'Unknown class {cls_name}') + + @classmethod + def parse_obj(cls, obj): + data_type = obj.get('type') + if data_type is None: + raise ValueError(f'type is required for {cls.__name__}') + + sub = cls.get_cls(data_type) + if sub is None: + raise ValueError(f'Unknown type {data_type}') + return sub(**obj) + + def _iter(self, **kwargs): + yield 'type', self.get_type(self.__class__.__name__) + yield from super()._iter(**kwargs) + + @property + def type(self): + return self.get_type(self.__class__.__name__) + diff --git a/vocode/models/synthesizer.py b/vocode/models/synthesizer.py new file mode 100644 index 0000000..6c662bd --- /dev/null +++ b/vocode/models/synthesizer.py @@ -0,0 +1,27 @@ +from enum import Enum +from .model import TypedModel +from .audio_encoding import AudioEncoding +from ..output_device.base_output_device import BaseOutputDevice + +class SynthesizerType(str, Enum): + BASE = "base" + AZURE = "azure" + GOOGLE = "google" + ELEVEN_LABS = "eleven_labs" + +class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE): + sampling_rate: int + audio_encoding: AudioEncoding + + @classmethod + def from_output_device(cls, output_device: BaseOutputDevice): + return cls(sampling_rate=output_device.sampling_rate, audio_encoding=output_device.audio_encoding) + +class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE): + pass + +class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE): + pass + +class ElevenLabsSynthesizerConfig(SynthesizerConfig, 
type=SynthesizerType.ELEVEN_LABS): + pass diff --git a/vocode/models/telephony.py b/vocode/models/telephony.py new file mode 100644 index 0000000..645e165 --- /dev/null +++ b/vocode/models/telephony.py @@ -0,0 +1,14 @@ +from pydantic import BaseModel +from vocode.models.agent import AgentConfig, InformationRetrievalAgentConfig + + +class CallEntity(BaseModel): + phone_number: str + descriptor: str + + +class CreateCallRequest(BaseModel): + recipient: CallEntity + caller: CallEntity + agent_config: InformationRetrievalAgentConfig # TODO switch to AgentConfig + # TODO add IVR/etc. diff --git a/vocode/models/transcriber.py b/vocode/models/transcriber.py new file mode 100644 index 0000000..2ce209c --- /dev/null +++ b/vocode/models/transcriber.py @@ -0,0 +1,31 @@ +from enum import Enum +from typing import Optional +from .audio_encoding import AudioEncoding +from .model import TypedModel +from ..input_device.base_input_device import BaseInputDevice + +class TranscriberType(str, Enum): + BASE = "base" + DEEPGRAM = "deepgram" + GOOGLE = "google" + +class TranscriberConfig(TypedModel, type=TranscriberType.BASE): + sampling_rate: int + audio_encoding: AudioEncoding + chunk_size: int + + @classmethod + def from_input_device(cls, input_device: BaseInputDevice): + return cls( + sampling_rate=input_device.sampling_rate, + audio_encoding=input_device.audio_encoding, + chunk_size=input_device.chunk_size) + +class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM): + model: Optional[str] = None + should_warmup_model: bool = False + version: Optional[str] = None + +class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE): + model: Optional[str] = None + should_warmup_model: bool = False \ No newline at end of file diff --git a/vocode/models/websocket.py b/vocode/models/websocket.py new file mode 100644 index 0000000..c446426 --- /dev/null +++ b/vocode/models/websocket.py @@ -0,0 +1,36 @@ +import base64 +from enum import Enum +from .model 
import TypedModel +from .transcriber import TranscriberConfig +from .agent import AgentConfig +from .synthesizer import SynthesizerConfig + +class WebSocketMessageType(str, Enum): + BASE = 'base' + START = 'start' + AUDIO = 'audio' + READY = 'ready' + STOP = 'stop' + +class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE): pass + +class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO): + data: str + + @classmethod + def from_bytes(cls, chunk: bytes): + return cls(data=base64.b64encode(chunk).decode('utf-8')) + + def get_bytes(self) -> bytes: + return base64.b64decode(self.data) + +class StartMessage(WebSocketMessage, type=WebSocketMessageType.START): + transcriber_config: TranscriberConfig + agent_config: AgentConfig + synthesizer_config: SynthesizerConfig + +class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY): + pass + +class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP): + pass \ No newline at end of file diff --git a/vocode/output_device/base_output_device.py b/vocode/output_device/base_output_device.py new file mode 100644 index 0000000..71896d8 --- /dev/null +++ b/vocode/output_device/base_output_device.py @@ -0,0 +1,15 @@ +from ..models.audio_encoding import AudioEncoding + +class BaseOutputDevice: + + def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding): + self.sampling_rate = sampling_rate + self.audio_encoding = audio_encoding + + async def send_async(self, chunk): + raise NotImplementedError + + async def maybe_send_mark_async(self, message): + pass + + diff --git a/vocode/output_device/speaker_output.py b/vocode/output_device/speaker_output.py new file mode 100644 index 0000000..75ffbf1 --- /dev/null +++ b/vocode/output_device/speaker_output.py @@ -0,0 +1,28 @@ +import pyaudio + +from .base_output_device import BaseOutputDevice +from ..models.audio_encoding import AudioEncoding + +class SpeakerOutput(BaseOutputDevice): + + DEFAULT_SAMPLING_RATE = 44100 + + def __init__(self, pa:
pyaudio.PyAudio, device_info: dict, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16): + self.device_info = device_info + sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) + super().__init__(sampling_rate, audio_encoding) + self.pa = pa + self.stream = self.pa.open( + output=True, + channels=1, + rate=self.sampling_rate, + format=pyaudio.paInt16, + output_device_index=int(self.device_info['index']) + ) + + async def send_async(self, chunk): + self.stream.write(chunk) + + def terminate(self): + self.stream.close() + self.pa.terminate() \ No newline at end of file diff --git a/vocode/telephony.py b/vocode/telephony.py new file mode 100644 index 0000000..fde084b --- /dev/null +++ b/vocode/telephony.py @@ -0,0 +1,41 @@ +import requests +from vocode.models.agent import InformationRetrievalAgentConfig, LLMAgentConfig +from vocode.models.telephony import CallEntity, CreateCallRequest +import os +from dotenv import load_dotenv + +load_dotenv() + +BASE_URL = os.environ.get("BASE_URL") + + +def create_call(request: CreateCallRequest): + request_data = request.dict() + + url = f"http://{BASE_URL}/create_outbound_call" + headers = {"Content-Type": "application/json"} + + response = requests.post(url, headers=headers, json=request_data) + return response.status_code + + +def create_information_retrieval_call( + recipient: CallEntity, + caller: CallEntity, + goal_description: str, + fields: list[str] = None, +): + agent_config = InformationRetrievalAgentConfig( + recipient_descriptor=recipient.descriptor, + caller_descriptor=caller.descriptor, + goal_description=goal_description, + fields=fields, + ) + + return create_call( + CreateCallRequest( + recipient=recipient, + caller=caller, + agent_config=agent_config, + ) + )