From f39af91505c45af5fb9eede8e97ab254bc64c4cd Mon Sep 17 00:00:00 2001 From: Kian Date: Mon, 27 Feb 2023 10:19:44 -0800 Subject: [PATCH] add assembly ai integration --- vocode/helpers.py | 4 ++-- vocode/input_device/microphone_input.py | 6 +++--- vocode/models/transcriber.py | 8 +++++++- vocode/output_device/speaker_output.py | 4 ++-- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/vocode/helpers.py b/vocode/helpers.py index 2f3a11e..e27e441 100644 --- a/vocode/helpers.py +++ b/vocode/helpers.py @@ -11,7 +11,7 @@ def _get_device_prompt(device_infos: list[dict]) -> str: Choice: """.format( "\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos))) -def create_microphone_input_and_speaker_output(use_first_available_device=False) -> tuple[MicrophoneInput, SpeakerOutput]: +def create_microphone_input_and_speaker_output(use_first_available_device=False, mic_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]: pa = pyaudio.PyAudio() num_devices = pa.get_device_count() devices = list(map(pa.get_device_info_by_index, range(num_devices))) @@ -24,7 +24,7 @@ def create_microphone_input_and_speaker_output(use_first_available_device=False) input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))] output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))] logger.info("Using microphone input device: %s", input_device_info['name']) - microphone_input = MicrophoneInput(pa, input_device_info) + microphone_input = MicrophoneInput(pa, input_device_info, sampling_rate=mic_sampling_rate) logger.info("Using speaker output device: %s", output_device_info['name']) speaker_output = SpeakerOutput(pa, output_device_info) return microphone_input, speaker_output \ No newline at end of file diff --git a/vocode/input_device/microphone_input.py b/vocode/input_device/microphone_input.py index 03055df..fe042fe 100644 --- a/vocode/input_device/microphone_input.py +++ b/vocode/input_device/microphone_input.py @@ -10,10 +10,10 @@ class MicrophoneInput(BaseInputDevice): DEFAULT_SAMPLING_RATE = 44100 DEFAULT_CHUNK_SIZE = 2048 - def __init__(self, pa: pyaudio.PyAudio, device_info: dict, chunk_size: int = DEFAULT_CHUNK_SIZE): + def __init__(self, pa: pyaudio.PyAudio, device_info: dict, sampling_rate: int = None, chunk_size: int = DEFAULT_CHUNK_SIZE): self.device_info = device_info - sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) - super().__init__(sampling_rate, AudioEncoding.LINEAR16, chunk_size) + sampling_rate = sampling_rate or (self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) + super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size) self.pa = pa self.stream = pa.open( format=pyaudio.paInt16, diff --git a/vocode/models/transcriber.py b/vocode/models/transcriber.py index 2ce209c..cbffe8a 100644 --- a/vocode/models/transcriber.py +++ b/vocode/models/transcriber.py @@ -8,6 +8,7 @@ class TranscriberType(str, Enum): BASE = "base" DEEPGRAM = "deepgram" GOOGLE = "google" + ASSEMBLY_AI = "assembly_ai" class TranscriberConfig(TypedModel, type=TranscriberType.BASE): sampling_rate: int @@ -28,4 +29,9 @@ class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE): model: Optional[str] = None - should_warmup_model: bool = False \ No newline at end of file + should_warmup_model: bool = False + +class AssemblyAITranscriberConfig(TranscriberConfig, type=TranscriberType.ASSEMBLY_AI): + model: Optional[str] = None + should_warmup_model: bool = False + version: Optional[str] = None \ No newline at end of file diff --git a/vocode/output_device/speaker_output.py b/vocode/output_device/speaker_output.py index 75ffbf1..7a2374d 100644 --- a/vocode/output_device/speaker_output.py +++ b/vocode/output_device/speaker_output.py @@ -7,9 +7,9 @@ class SpeakerOutput(BaseOutputDevice): DEFAULT_SAMPLING_RATE = 44100 - def __init__(self, pa: pyaudio.PyAudio, device_info: dict, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16): + def __init__(self, pa: pyaudio.PyAudio, device_info: dict, sampling_rate: int = None, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16): self.device_info = device_info - sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) + sampling_rate = sampling_rate or int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) super().__init__(sampling_rate, audio_encoding) self.pa = pa self.stream = self.pa.open(