From f39af91505c45af5fb9eede8e97ab254bc64c4cd Mon Sep 17 00:00:00 2001 From: Kian Date: Mon, 27 Feb 2023 10:19:44 -0800 Subject: [PATCH 1/2] add assembly ai integration --- vocode/helpers.py | 4 ++-- vocode/input_device/microphone_input.py | 6 +++--- vocode/models/transcriber.py | 8 +++++++- vocode/output_device/speaker_output.py | 4 ++-- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/vocode/helpers.py b/vocode/helpers.py index 2f3a11e..e27e441 100644 --- a/vocode/helpers.py +++ b/vocode/helpers.py @@ -11,7 +11,7 @@ def _get_device_prompt(device_infos: list[dict]) -> str: Choice: """.format( "\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos))) -def create_microphone_input_and_speaker_output(use_first_available_device=False) -> tuple[MicrophoneInput, SpeakerOutput]: +def create_microphone_input_and_speaker_output(use_first_available_device=False, mic_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]: pa = pyaudio.PyAudio() num_devices = pa.get_device_count() devices = list(map(pa.get_device_info_by_index, range(num_devices))) @@ -24,7 +24,7 @@ def create_microphone_input_and_speaker_output(use_first_available_device=False) input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))] output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))] logger.info("Using microphone input device: %s", input_device_info['name']) - microphone_input = MicrophoneInput(pa, input_device_info) + microphone_input = MicrophoneInput(pa, input_device_info, sampling_rate=mic_sampling_rate) logger.info("Using speaker output device: %s", output_device_info['name']) speaker_output = SpeakerOutput(pa, output_device_info) return microphone_input, speaker_output \ No newline at end of file diff --git a/vocode/input_device/microphone_input.py b/vocode/input_device/microphone_input.py index 03055df..fe042fe 100644 --- a/vocode/input_device/microphone_input.py +++ b/vocode/input_device/microphone_input.py @@ -10,10 +10,10 @@ class MicrophoneInput(BaseInputDevice): DEFAULT_SAMPLING_RATE = 44100 DEFAULT_CHUNK_SIZE = 2048 - def __init__(self, pa: pyaudio.PyAudio, device_info: dict, chunk_size: int = DEFAULT_CHUNK_SIZE): + def __init__(self, pa: pyaudio.PyAudio, device_info: dict, sampling_rate: int = None, chunk_size: int = DEFAULT_CHUNK_SIZE): self.device_info = device_info - sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) - super().__init__(sampling_rate, AudioEncoding.LINEAR16, chunk_size) + sampling_rate = sampling_rate or (self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) + super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size) self.pa = pa self.stream = pa.open( format=pyaudio.paInt16, diff --git a/vocode/models/transcriber.py b/vocode/models/transcriber.py index 2ce209c..cbffe8a 100644 --- a/vocode/models/transcriber.py +++ b/vocode/models/transcriber.py @@ -8,6 +8,7 @@ class TranscriberType(str, Enum): BASE = "base" DEEPGRAM = "deepgram" GOOGLE = "google" + ASSEMBLY_AI = "assembly_ai" class TranscriberConfig(TypedModel, type=TranscriberType.BASE): sampling_rate: int @@ -28,4 +29,9 @@ class DeepgramTranscriberConfig(TranscriberConfig, type=TranscriberType.DEEPGRAM class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE): model: Optional[str] = None - should_warmup_model: bool = False \ No newline at end of file + should_warmup_model: bool = False + +class AssemblyAITranscriberConfig(TranscriberConfig, type=TranscriberType.ASSEMBLY_AI): + model: Optional[str] = None + should_warmup_model: bool = False + version: Optional[str] = None \ No newline at end of file diff --git a/vocode/output_device/speaker_output.py b/vocode/output_device/speaker_output.py index 75ffbf1..7a2374d 100644 --- a/vocode/output_device/speaker_output.py +++ b/vocode/output_device/speaker_output.py @@ -7,9 +7,9 @@ class SpeakerOutput(BaseOutputDevice): DEFAULT_SAMPLING_RATE = 44100 - def __init__(self, pa: pyaudio.PyAudio, device_info: dict, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16): + def __init__(self, pa: pyaudio.PyAudio, device_info: dict, sampling_rate: int = None, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16): self.device_info = device_info - sampling_rate = int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) + sampling_rate = sampling_rate or int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) super().__init__(sampling_rate, audio_encoding) self.pa = pa self.stream = self.pa.open( From 7e23d319f1b45b6848657f92d9341b3f82b70504 Mon Sep 17 00:00:00 2001 From: Kian Date: Mon, 27 Feb 2023 11:21:50 -0800 Subject: [PATCH 2/2] ajay comments --- vocode/helpers.py | 4 ++-- vocode/models/transcriber.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/vocode/helpers.py b/vocode/helpers.py index e27e441..69c9c35 100644 --- a/vocode/helpers.py +++ b/vocode/helpers.py @@ -11,7 +11,7 @@ def _get_device_prompt(device_infos: list[dict]) -> str: Choice: """.format( "\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos))) -def create_microphone_input_and_speaker_output(use_first_available_device=False, mic_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]: +def create_microphone_input_and_speaker_output(use_first_available_device=False, mic_sampling_rate=None, speaker_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]: pa = pyaudio.PyAudio() num_devices = pa.get_device_count() devices = list(map(pa.get_device_info_by_index, range(num_devices))) @@ -26,5 +26,5 @@ def create_microphone_input_and_speaker_output(use_first_available_device=False, logger.info("Using microphone input device: %s", input_device_info['name']) microphone_input = MicrophoneInput(pa, input_device_info, sampling_rate=mic_sampling_rate) logger.info("Using speaker output device: %s", output_device_info['name']) - speaker_output = SpeakerOutput(pa, output_device_info) + speaker_output = SpeakerOutput(pa, output_device_info, sampling_rate=speaker_sampling_rate) return microphone_input, speaker_output \ No newline at end of file diff --git a/vocode/models/transcriber.py b/vocode/models/transcriber.py index cbffe8a..190a1f3 100644 --- a/vocode/models/transcriber.py +++ b/vocode/models/transcriber.py @@ -32,6 +32,4 @@ class GoogleTranscriberConfig(TranscriberConfig, type=TranscriberType.GOOGLE): should_warmup_model: bool = False class AssemblyAITranscriberConfig(TranscriberConfig, type=TranscriberType.ASSEMBLY_AI): - model: Optional[str] = None - should_warmup_model: bool = False - version: Optional[str] = None \ No newline at end of file + should_warmup_model: bool = False \ No newline at end of file