Python SDK

This commit is contained in:
Ajay Raj 2023-02-24 10:47:17 -08:00
commit 6dc9fceeb5
18 changed files with 482 additions and 0 deletions

4
.gitignore vendored Normal file
View file

@ -0,0 +1,4 @@
venv/
__pycache__/
.env
.DS_Store

1
README.md Normal file
View file

@ -0,0 +1 @@
# vocode-sdk

5
requirements.txt Normal file
View file

@ -0,0 +1,5 @@
PyAudio==0.2.13
pydantic==1.10.5
python-dotenv==0.21.1
typing_extensions==4.5.0
websockets==10.4

31
simple_conversation.py Normal file
View file

@ -0,0 +1,31 @@
import asyncio
import logging
import os
import signal
from vocode.conversation import Conversation
from vocode.helpers import create_microphone_input_and_speaker_output
from vocode.models.transcriber import DeepgramTranscriberConfig
from vocode.models.agent import ChatGPTAgentConfig
from vocode.models.synthesizer import AzureSynthesizerConfig
# Configure root logging so SDK progress messages are visible on stdout.
logging.basicConfig()
logging.root.setLevel(logging.INFO)

if __name__ == "__main__":
    # Pick the first available microphone/speaker pair without prompting.
    mic, speaker = create_microphone_input_and_speaker_output(
        use_first_available_device=True
    )
    conversation = Conversation(
        token=os.environ.get("VOCODE_API_KEY"),
        input_device=mic,
        output_device=speaker,
        # Derive audio-format settings directly from the chosen devices.
        transcriber_config=DeepgramTranscriberConfig.from_input_device(mic),
        agent_config=ChatGPTAgentConfig(
            initial_message="Hello!",
            prompt_preamble="The AI is having a pleasant conversation about life."
        ),
        synthesizer_config=AzureSynthesizerConfig.from_output_device(speaker),
    )
    # Ctrl+C flips the conversation's active flag so the send loop exits cleanly.
    signal.signal(signal.SIGINT, lambda _sig, _frame: conversation.deactivate())
    asyncio.run(conversation.start())

75
vocode/conversation.py Normal file
View file

@ -0,0 +1,75 @@
import websockets
import asyncio
from dotenv import load_dotenv
import os
import logging
load_dotenv()
from .input_device.base_input_device import BaseInputDevice
from .output_device.base_output_device import BaseOutputDevice
from .models.transcriber import TranscriberConfig
from .models.agent import AgentConfig
from .models.synthesizer import SynthesizerConfig
from .models.websocket import ReadyMessage, AudioMessage, StartMessage, StopMessage
BASE_URL = os.environ.get('BASE_URL')
VOCODE_WEBSOCKET_URL = f'wss://{BASE_URL}/conversation'
class Conversation:
    """Full-duplex audio conversation against the Vocode websocket backend.

    Streams microphone chunks to the server and plays synthesized replies on
    the output device until ``deactivate()`` is called.
    """

    def __init__(
        self,
        token: str,
        input_device: BaseInputDevice,
        output_device: BaseOutputDevice,
        transcriber_config: TranscriberConfig,
        agent_config: AgentConfig,
        synthesizer_config: SynthesizerConfig
    ):
        self.token = token
        self.input_device = input_device
        self.output_device = output_device
        self.transcriber_config = transcriber_config
        self.agent_config = agent_config
        self.synthesizer_config = synthesizer_config
        self.logger = logging.getLogger(__name__)
        # Kept as a plain bool for backward compatibility with external
        # readers; the event below is what sender() actually awaits.
        self.receiver_ready = False
        # Fixed: replaces the previous 100ms sleep-polling loop in
        # wait_for_ready() with proper event-based signaling.
        self._ready_event = asyncio.Event()
        self.active = True

    async def wait_for_ready(self):
        """Block until the server's ReadyMessage has been received."""
        await self._ready_event.wait()
        return True

    def deactivate(self):
        """Ask the send loop to stop; safe to call from a signal handler."""
        self.active = False

    async def start(self):
        """Open the websocket and run the send/receive loops until stopped."""
        async with websockets.connect(f"{VOCODE_WEBSOCKET_URL}?key={self.token}") as ws:

            async def sender(ws):
                # Announce the desired transcriber/agent/synthesizer setup,
                # then stream microphone audio until deactivated.
                start_message = StartMessage(
                    transcriber_config=self.transcriber_config,
                    agent_config=self.agent_config,
                    synthesizer_config=self.synthesizer_config
                )
                await ws.send(start_message.json())
                await self.wait_for_ready()
                self.logger.info("Listening...press Ctrl+C to stop")
                while self.active:
                    data = self.input_device.get_audio()
                    if data:
                        await ws.send(AudioMessage.from_bytes(data).json())
                    # Yield control so receiver() is scheduled even while
                    # audio is flowing.
                    await asyncio.sleep(0)
                await ws.send(StopMessage().json())

            async def receiver(ws):
                # First frame from the server must be the ready handshake.
                ReadyMessage.parse_raw(await ws.recv())
                self.receiver_ready = True
                self._ready_event.set()
                # Every subsequent frame is treated as synthesized audio.
                async for msg in ws:
                    audio_message = AudioMessage.parse_raw(msg)
                    await self.output_device.send_async(audio_message.get_bytes())

            return await asyncio.gather(sender(ws), receiver(ws))

30
vocode/helpers.py Normal file
View file

@ -0,0 +1,30 @@
import pyaudio
from .input_device.microphone_input import MicrophoneInput
from .output_device.speaker_output import SpeakerOutput
import logging
logger = logging.getLogger(__name__)
def _get_device_prompt(device_infos: list[dict]) -> str:
return """Please select a device:
{}
Choice: """.format(
"\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos)))
def create_microphone_input_and_speaker_output(use_first_available_device=False) -> tuple[MicrophoneInput, SpeakerOutput]:
    """Create a (MicrophoneInput, SpeakerOutput) pair sharing one PyAudio session.

    When use_first_available_device is False, the user is prompted on stdin to
    pick each device by index.

    Raises:
        ValueError: if the host has no input or no output audio devices
            (previously this crashed with a bare IndexError).
    """
    pa = pyaudio.PyAudio()
    num_devices = pa.get_device_count()
    devices = list(map(pa.get_device_info_by_index, range(num_devices)))
    input_device_infos = [d for d in devices if d['maxInputChannels'] > 0]
    output_device_infos = [d for d in devices if d['maxOutputChannels'] > 0]
    if not input_device_infos:
        raise ValueError("No input audio devices found")
    if not output_device_infos:
        raise ValueError("No output audio devices found")
    if use_first_available_device:
        input_device_info = input_device_infos[0]
        output_device_info = output_device_infos[0]
    else:
        input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))]
        output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))]
    logger.info("Using microphone input device: %s", input_device_info['name'])
    microphone_input = MicrophoneInput(pa, input_device_info)
    logger.info("Using speaker output device: %s", output_device_info['name'])
    speaker_output = SpeakerOutput(pa, output_device_info)
    return microphone_input, speaker_output

View file

@ -0,0 +1,14 @@
from ..models.audio_encoding import AudioEncoding
import queue
from typing import Optional
class BaseInputDevice:
    """Common state for audio input sources.

    Concrete devices push raw chunks into ``self.queue``; callers drain them
    via ``get_audio``.
    """

    def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding, chunk_size: int):
        self.sampling_rate = sampling_rate
        self.audio_encoding = audio_encoding
        self.chunk_size = chunk_size
        # Buffer for captured audio chunks.
        self.queue = queue.Queue()

    def get_audio(self) -> Optional[bytes]:
        """Return the next buffered chunk, or None when nothing is waiting."""
        raise NotImplementedError

View file

@ -0,0 +1,37 @@
import pyaudio
from typing import Optional
import queue
from .base_input_device import BaseInputDevice
from ..models.audio_encoding import AudioEncoding
class MicrophoneInput(BaseInputDevice):
    """Captures microphone audio through a callback-driven PyAudio stream."""

    DEFAULT_SAMPLING_RATE = 44100
    DEFAULT_CHUNK_SIZE = 2048

    def __init__(self, pa: pyaudio.PyAudio, device_info: dict, chunk_size: int = DEFAULT_CHUNK_SIZE):
        self.device_info = device_info
        # Prefer the device's native sample rate; fall back to 44.1 kHz.
        sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE))
        # Base __init__ creates self.queue, which _stream_callback feeds.
        super().__init__(sampling_rate, AudioEncoding.LINEAR16, chunk_size)
        self.pa = pa
        self.stream = pa.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.sampling_rate,
            input=True,
            frames_per_buffer=self.chunk_size,
            input_device_index=int(self.device_info['index']),
            stream_callback=self._stream_callback
        )
        # Fixed: removed the redundant `self.queue = queue.Queue()` that used
        # to follow the stream open -- the callback may already be delivering
        # chunks into the queue created by the base class, and replacing the
        # queue here would silently drop them.

    def _stream_callback(self, in_data, *_args):
        # Runs on PyAudio's capture thread; hand the chunk to the queue.
        self.queue.put_nowait(in_data)
        return (None, pyaudio.paContinue)

    def get_audio(self) -> Optional[bytes]:
        """Return the next captured chunk, or None if the buffer is empty."""
        try:
            return self.queue.get_nowait()
        except queue.Empty:
            return None

37
vocode/models/agent.py Normal file
View file

@ -0,0 +1,37 @@
from typing import Optional
from enum import Enum
from .model import TypedModel
class AgentType(str, Enum):
    # Discriminator tags registered with TypedModel; the string values travel
    # in the serialized "type" field of each AgentConfig payload.
    BASE = "base"
    LLM = "llm"
    CHAT_GPT = "chat_gpt"
    ECHO = "echo"
    INFORMATION_RETRIEVAL = "information_retrieval"
class AgentConfig(TypedModel, type=AgentType.BASE):
    # Message the agent speaks when the conversation starts; None means the
    # agent waits for the user to speak first.
    initial_message: Optional[str] = None
class LLMAgentConfig(AgentConfig, type=AgentType.LLM):
    # System-style preamble that frames the LLM's behavior for the call.
    prompt_preamble: str
    expected_first_prompt: Optional[str] = None
class ChatGPTAgentConfig(AgentConfig, type=AgentType.CHAT_GPT):
    # Same shape as LLMAgentConfig, but routed to the ChatGPT-backed agent.
    prompt_preamble: str
    expected_first_prompt: Optional[str] = None
class InformationRetrievalAgentConfig(
    AgentConfig, type=AgentType.INFORMATION_RETRIEVAL
):
    """Agent configured to collect a set of named data points on a call."""
    # Free-text descriptions of the two parties on the call.
    recipient_descriptor: str
    caller_descriptor: str
    # What the agent is trying to accomplish.
    goal_description: str
    # Names of the data points the agent should extract.
    fields: list[str]
    # TODO: add fields for IVR, voicemail
class EchoAgentConfig(AgentConfig, type=AgentType.ECHO):
    # No extra settings; exists to carry the "echo" type tag.
    pass

View file

@ -0,0 +1,5 @@
from enum import Enum
class AudioEncoding(str, Enum):
    # Supported raw-audio wire formats.
    LINEAR16 = "linear16"  # 16-bit signed PCM
    MULAW = "mulaw"  # 8-bit mu-law companded audio

51
vocode/models/model.py Normal file
View file

@ -0,0 +1,51 @@
import pydantic
class BaseModel(pydantic.BaseModel):
    # Routes dict-valued constructor arguments through parse_obj before normal
    # pydantic validation, so nested TypedModel payloads are resolved to their
    # concrete subclass via the embedded "type" tag.
    def __init__(self, **data):
        for key, value in data.items():
            if isinstance(value, dict):
                # NOTE(review): this assumes every dict-valued field is a
                # TypedModel payload carrying a "type" key; a plain dict field
                # would raise in TypedModel.parse_obj -- confirm.
                data[key] = self.parse_obj(value)
        super().__init__(**data)
# Adapted from https://github.com/pydantic/pydantic/discussions/3091
class TypedModel(BaseModel):
    """Pydantic model with a registry-based ``type`` discriminator.

    Each subclass registers itself (with the tag given as the ``type`` class
    keyword) in a shared registry; serialization injects the tag so that
    ``parse_obj`` can reconstruct the concrete subclass from a plain dict.
    """
    # Shared registry of [type_tag, subclass] pairs for the whole hierarchy.
    _subtypes_ = []
    def __init_subclass__(cls, type=None):
        # Runs for every subclass definition; ``type=None`` entries still
        # register (e.g. intermediate base classes).
        cls._subtypes_.append([type, cls])
    @classmethod
    def get_cls(_cls, type):
        # Resolve a type tag to its registered subclass.
        for t, cls in _cls._subtypes_:
            if t == type:
                return cls
        raise ValueError(f'Unknown type {type}')
    @classmethod
    def get_type(_cls, cls_name):
        # Reverse lookup: class name -> registered type tag.
        for t, cls in _cls._subtypes_:
            if cls.__name__ == cls_name:
                return t
        raise ValueError(f'Unknown class {cls_name}')
    @classmethod
    def parse_obj(cls, obj):
        # Dispatch deserialization on the payload's "type" tag.
        data_type = obj.get('type')
        if data_type is None:
            raise ValueError(f'type is required for {cls.__name__}')
        sub = cls.get_cls(data_type)
        # NOTE(review): get_cls raises rather than returning None, so this
        # branch appears unreachable; kept as belt-and-braces.
        if sub is None:
            raise ValueError(f'Unknown type {data_type}')
        return sub(**obj)
    def _iter(self, **kwargs):
        # Hook into pydantic v1 serialization to emit the discriminator first.
        yield 'type', self.get_type(self.__class__.__name__)
        yield from super()._iter(**kwargs)
    @property
    def type(self):
        # The instance's registered type tag.
        return self.get_type(self.__class__.__name__)

View file

@ -0,0 +1,27 @@
from enum import Enum
from .model import TypedModel
from .audio_encoding import AudioEncoding
from ..output_device.base_output_device import BaseOutputDevice
class SynthesizerType(str, Enum):
    # Discriminator tags for SynthesizerConfig subclasses.
    BASE = "base"
    AZURE = "azure"
    GOOGLE = "google"
    ELEVEN_LABS = "eleven_labs"
class SynthesizerConfig(TypedModel, type=SynthesizerType.BASE):
    # Audio format the synthesizer must produce so playback matches the
    # output device.
    sampling_rate: int
    audio_encoding: AudioEncoding
    @classmethod
    def from_output_device(cls, output_device: BaseOutputDevice):
        # Alternate constructor: mirror the output device's audio format.
        return cls(sampling_rate=output_device.sampling_rate, audio_encoding=output_device.audio_encoding)
class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE):
    # No Azure-specific settings yet; exists to carry the "azure" type tag.
    pass
class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE):
    # No Google-specific settings yet; exists to carry the "google" type tag.
    pass
class ElevenLabsSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS):
    # No ElevenLabs-specific settings yet; carries the "eleven_labs" type tag.
    pass

View file

@ -0,0 +1,14 @@
from pydantic import BaseModel
from vocode.models.agent import AgentConfig, InformationRetrievalAgentConfig
class CallEntity(BaseModel):
    # One party on a phone call: their number plus a free-text description
    # the agent can use when referring to them.
    phone_number: str
    descriptor: str
class CreateCallRequest(BaseModel):
    # Request body sent by telephony.create_call to the outbound-call endpoint.
    recipient: CallEntity
    caller: CallEntity
    agent_config: InformationRetrievalAgentConfig # TODO switch to AgentConfig
    # TODO add IVR/etc.

View file

@ -0,0 +1,31 @@
from enum import Enum
from typing import Optional
from .audio_encoding import AudioEncoding
from .model import TypedModel
from ..input_device.base_input_device import BaseInputDevice
class TranscriberType(str, Enum):
    # Discriminator tags for TranscriberConfig subclasses.
    BASE = "base"
    DEEPGRAM = "deepgram"
    GOOGLE = "google"
class TranscriberConfig(TypedModel, type=TranscriberType.BASE):
    # Audio format the transcriber should expect, matched to the input device.
    sampling_rate: int
    audio_encoding: AudioEncoding
    chunk_size: int
    @classmethod
    def from_input_device(cls, input_device: BaseInputDevice):
        # Alternate constructor: mirror the input device's audio format.
        return cls(
            sampling_rate=input_device.sampling_rate,
            audio_encoding=input_device.audio_encoding,
            chunk_size=input_device.chunk_size)
class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM):
    # Optional Deepgram model/version selection; server defaults apply if None.
    model: Optional[str] = None
    should_warmup_model: bool = False
    version: Optional[str] = None
class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE):
    # Optional Google model selection; server default applies if None.
    model: Optional[str] = None
    should_warmup_model: bool = False

View file

@ -0,0 +1,36 @@
import base64
from enum import Enum
from .model import TypedModel
from .transcriber import TranscriberConfig
from .agent import AgentConfig
from .synthesizer import SynthesizerConfig
class WebSocketMessageType(str, Enum):
    # Discriminator tags for frames exchanged over the conversation websocket.
    BASE = 'base'
    START = 'start'
    AUDIO = 'audio'
    READY = 'ready'
    STOP = 'stop'
class WebSocketMessage(TypedModel, type=WebSocketMessageType.BASE):
    # Base frame type; concrete messages carry the discriminating "type" tag.
    pass
class AudioMessage(WebSocketMessage, type=WebSocketMessageType.AUDIO):
    # Audio chunk as base64 text, so raw bytes survive JSON transport.
    data: str
    @classmethod
    def from_bytes(cls, chunk: bytes):
        # Wrap a raw chunk in an ASCII-safe base64 payload.
        return cls(data=base64.b64encode(chunk).decode('utf-8'))
    def get_bytes(self) -> bytes:
        # Inverse of from_bytes: recover the raw audio bytes.
        return base64.b64decode(self.data)
class StartMessage(WebSocketMessage, type=WebSocketMessageType.START):
    # First client frame: declares the desired pipeline configuration.
    transcriber_config: TranscriberConfig
    agent_config: AgentConfig
    synthesizer_config: SynthesizerConfig
class ReadyMessage(WebSocketMessage, type=WebSocketMessageType.READY):
    # Server acknowledgement that audio streaming may begin.
    pass
class StopMessage(WebSocketMessage, type=WebSocketMessageType.STOP):
    # Final client frame: ends the conversation.
    pass

View file

@ -0,0 +1,15 @@
from ..models.audio_encoding import AudioEncoding
class BaseOutputDevice:
    """Abstract sink for playing synthesized audio.

    Subclasses override ``send_async`` to consume raw audio chunks.
    """

    def __init__(self, sampling_rate: int, audio_encoding: AudioEncoding):
        self.sampling_rate = sampling_rate
        self.audio_encoding = audio_encoding

    async def send_async(self, chunk):
        # Fixed: `raise NotImplemented` raised the NotImplemented *sentinel*,
        # which is a TypeError at runtime; NotImplementedError is the
        # exception intended for unimplemented abstract methods.
        raise NotImplementedError

    async def maybe_send_mark_async(self, message):
        # Optional hook for playback "mark" events; default is a no-op.
        pass

View file

@ -0,0 +1,28 @@
import pyaudio
from .base_output_device import BaseOutputDevice
from ..models.audio_encoding import AudioEncoding
class SpeakerOutput(BaseOutputDevice):
    """Plays audio chunks on a PyAudio output device via a blocking stream."""

    DEFAULT_SAMPLING_RATE = 44100

    def __init__(self, pa: pyaudio.PyAudio, device_info: dict, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16):
        self.device_info = device_info
        # Prefer the device's native sample rate; fall back to 44.1 kHz.
        sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE))
        super().__init__(sampling_rate, audio_encoding)
        self.pa = pa
        # NOTE(review): the stream is always opened as 16-bit PCM even when
        # audio_encoding is MULAW -- confirm callers only pass LINEAR16.
        self.stream = self.pa.open(
            output=True,
            channels=1,
            rate=self.sampling_rate,
            format=pyaudio.paInt16,
            output_device_index=int(self.device_info['index'])
        )

    async def send_async(self, chunk):
        # Blocking write on the event loop; acceptable for small chunks.
        self.stream.write(chunk)

    def terminate(self):
        self.stream.close()
        # Fixed: PyAudio.close() requires a stream argument, so the previous
        # `self.pa.close()` raised a TypeError; terminate() is the call that
        # releases the PortAudio session. Beware that the PyAudio instance
        # may be shared with the microphone input (see helpers.py).
        self.pa.terminate()

41
vocode/telephony.py Normal file
View file

@ -0,0 +1,41 @@
import requests
from vocode.models.agent import InformationRetrievalAgentConfig, LLMAgentConfig
from vocode.models.telephony import CallEntity, CreateCallRequest
import os
from dotenv import load_dotenv
load_dotenv()
BASE_URL = os.environ.get("BASE_URL")
def create_call(request: CreateCallRequest):
    """POST the call request to the telephony server; return the HTTP status code."""
    response = requests.post(
        f"http://{BASE_URL}/create_outbound_call",
        headers={"Content-Type": "application/json"},
        json=request.dict(),
    )
    return response.status_code
def create_information_retrieval_call(
    recipient: CallEntity,
    caller: CallEntity,
    goal_description: str,
    fields: list[str] = None,
):
    """Create an outbound call whose agent collects the named fields.

    Args:
        recipient: the party being called.
        caller: the party placing the call.
        goal_description: what the agent should accomplish on the call.
        fields: names of data points to extract; defaults to none.

    Returns:
        The HTTP status code from create_call.
    """
    # Fixed: the None default used to be forwarded into
    # InformationRetrievalAgentConfig.fields, which is a required list field
    # and would reject None at validation time.
    if fields is None:
        fields = []
    agent_config = InformationRetrievalAgentConfig(
        recipient_descriptor=recipient.descriptor,
        caller_descriptor=caller.descriptor,
        goal_description=goal_description,
        fields=fields,
    )
    return create_call(
        CreateCallRequest(
            recipient=recipient,
            caller=caller,
            agent_config=agent_config,
        )
    )