From 565bd5b3a1b718dada7902d270e6981a5ef9416e Mon Sep 17 00:00:00 2001 From: Ajay Raj Date: Fri, 3 Mar 2023 00:01:41 -0800 Subject: [PATCH] move to sounddevice --- simple_conversation.py | 10 +++----- vocode/helpers.py | 22 ++++++++-------- vocode/input_device/microphone_input.py | 34 +++++++++++++++---------- vocode/output_device/speaker_output.py | 23 ++++++++--------- 4 files changed, 44 insertions(+), 45 deletions(-) diff --git a/simple_conversation.py b/simple_conversation.py index bb9611d..a696a8c 100644 --- a/simple_conversation.py +++ b/simple_conversation.py @@ -5,7 +5,7 @@ import signal from vocode.conversation import Conversation from vocode.helpers import create_microphone_input_and_speaker_output from vocode.models.transcriber import DeepgramTranscriberConfig -from vocode.models.agent import ChatGPTAgentConfig, RESTfulUserImplementedAgentConfig, WebSocketUserImplementedAgentConfig +from vocode.models.agent import ChatGPTAgentConfig, RESTfulUserImplementedAgentConfig, WebSocketUserImplementedAgentConfig, EchoAgentConfig from vocode.models.synthesizer import AzureSynthesizerConfig from vocode.user_implemented_agent.restful_agent import RESTfulAgent @@ -14,19 +14,15 @@ logging.root.setLevel(logging.INFO) if __name__ == "__main__": - microphone_input, speaker_output = create_microphone_input_and_speaker_output(use_first_available_device=False) + microphone_input, speaker_output = create_microphone_input_and_speaker_output(use_default_devices=True) conversation = Conversation( input_device=microphone_input, output_device=speaker_output, transcriber_config=DeepgramTranscriberConfig.from_input_device(microphone_input), - agent_config=RESTfulUserImplementedAgentConfig( + agent_config=EchoAgentConfig( initial_message="Hello!", generate_responses=False, - respond=RESTfulUserImplementedAgentConfig.EndpointConfig( - url="http://a6eb64f4a9b7.ngrok.io/respond", - method="POST" - ) ), synthesizer_config=AzureSynthesizerConfig.from_output_device(speaker_output) ) diff 
--git a/vocode/helpers.py b/vocode/helpers.py index 69c9c35..905f093 100644 --- a/vocode/helpers.py +++ b/vocode/helpers.py @@ -1,4 +1,4 @@ -import pyaudio +import sounddevice as sd from .input_device.microphone_input import MicrophoneInput from .output_device.speaker_output import SpeakerOutput import logging @@ -11,20 +11,18 @@ def _get_device_prompt(device_infos: list[dict]) -> str: Choice: """.format( "\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos))) -def create_microphone_input_and_speaker_output(use_first_available_device=False, mic_sampling_rate=None, speaker_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]: - pa = pyaudio.PyAudio() - num_devices = pa.get_device_count() - devices = list(map(pa.get_device_info_by_index, range(num_devices))) - input_device_infos = list(filter(lambda device: device['maxInputChannels'] > 0, devices)) - output_device_infos = list(filter(lambda device: device['maxOutputChannels'] > 0, devices)) - if use_first_available_device: - input_device_info = input_device_infos[0] - output_device_info = output_device_infos[0] +def create_microphone_input_and_speaker_output(use_default_devices=False, mic_sampling_rate=None, speaker_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]: + device_infos = sd.query_devices() + input_device_infos = list(filter(lambda device_info: device_info['max_input_channels'] > 0, device_infos)) + output_device_infos = list(filter(lambda device_info: device_info['max_output_channels'] > 0, device_infos)) + if use_default_devices: + input_device_info = sd.query_devices(kind='input') + output_device_info = sd.query_devices(kind='output') else: input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))] output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))] logger.info("Using microphone input device: %s", input_device_info['name']) - microphone_input = MicrophoneInput(pa, input_device_info, 
sampling_rate=mic_sampling_rate) + microphone_input = MicrophoneInput(input_device_info, sampling_rate=mic_sampling_rate) logger.info("Using speaker output device: %s", output_device_info['name']) - speaker_output = SpeakerOutput(pa, output_device_info, sampling_rate=speaker_sampling_rate) + speaker_output = SpeakerOutput(output_device_info, sampling_rate=speaker_sampling_rate) return microphone_input, speaker_output \ No newline at end of file diff --git a/vocode/input_device/microphone_input.py b/vocode/input_device/microphone_input.py index fe042fe..d1a60aa 100644 --- a/vocode/input_device/microphone_input.py +++ b/vocode/input_device/microphone_input.py @@ -1,6 +1,8 @@ -import pyaudio +import sounddevice as sd +import numpy as np from typing import Optional import queue +import wave from .base_input_device import BaseInputDevice from ..models.audio_encoding import AudioEncoding @@ -10,25 +12,29 @@ class MicrophoneInput(BaseInputDevice): DEFAULT_SAMPLING_RATE = 44100 DEFAULT_CHUNK_SIZE = 2048 - def __init__(self, pa: pyaudio.PyAudio, device_info: dict, sampling_rate: int = None, chunk_size: int = DEFAULT_CHUNK_SIZE): + def __init__(self, device_info: dict, sampling_rate: int = None, chunk_size: int = DEFAULT_CHUNK_SIZE, microphone_gain: int = 1): self.device_info = device_info - sampling_rate = sampling_rate or (self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) + sampling_rate = sampling_rate or (self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE)) super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size) - self.pa = pa - self.stream = pa.open( - format=pyaudio.paInt16, + self.stream = sd.InputStream( + dtype=np.int16, channels=1, - rate=self.sampling_rate, - input=True, - frames_per_buffer=self.chunk_size, - input_device_index=int(self.device_info['index']), - stream_callback=self._stream_callback + samplerate=self.sampling_rate, + blocksize=self.chunk_size, + device=int(self.device_info['index']), + 
callback=self._stream_callback ) + self.stream.start() self.queue = queue.Queue() + self.microphone_gain = microphone_gain - def _stream_callback(self, in_data, *_args): - self.queue.put_nowait(in_data) - return (None, pyaudio.paContinue) + def _stream_callback(self, in_data: np.ndarray[np.int16], *_args): + if self.microphone_gain > 1: + in_data = in_data * (2 ** self.microphone_gain) + else: + in_data = in_data // (2 ** self.microphone_gain) + audio_bytes = in_data.tobytes() + self.queue.put_nowait(audio_bytes) def get_audio(self) -> Optional[bytes]: try: diff --git a/vocode/output_device/speaker_output.py b/vocode/output_device/speaker_output.py index 7a2374d..2413903 100644 --- a/vocode/output_device/speaker_output.py +++ b/vocode/output_device/speaker_output.py @@ -1,4 +1,5 @@ -import pyaudio +import sounddevice as sd +import numpy as np from .base_output_device import BaseOutputDevice from ..models.audio_encoding import AudioEncoding @@ -7,22 +8,20 @@ class SpeakerOutput(BaseOutputDevice): DEFAULT_SAMPLING_RATE = 44100 - def __init__(self, pa: pyaudio.PyAudio, device_info: dict, sampling_rate: int = None, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16): + def __init__(self, device_info: dict, sampling_rate: int = None, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16): self.device_info = device_info - sampling_rate = sampling_rate or int(self.device_info.get('defaultSampleRate', self.DEFAULT_SAMPLING_RATE)) + sampling_rate = sampling_rate or int(self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE)) super().__init__(sampling_rate, audio_encoding) - self.pa = pa - self.stream = self.pa.open( - output=True, + self.stream = sd.OutputStream( channels=1, - rate=self.sampling_rate, - format=pyaudio.paInt16, - output_device_index=int(self.device_info['index']) + samplerate=self.sampling_rate, + dtype=np.int16, + device=int(self.device_info['index']) ) + self.stream.start() async def send_async(self, chunk): - self.stream.write(chunk) + 
self.stream.write(np.frombuffer(chunk, dtype=np.int16)) def terminate(self): - self.stream.close() - self.pa.close() \ No newline at end of file + self.stream.close() \ No newline at end of file