move to sounddevice

This commit is contained in:
Ajay Raj 2023-03-03 00:01:41 -08:00
commit 565bd5b3a1
4 changed files with 44 additions and 45 deletions

View file

@@ -5,7 +5,7 @@ import signal
from vocode.conversation import Conversation
from vocode.helpers import create_microphone_input_and_speaker_output
from vocode.models.transcriber import DeepgramTranscriberConfig
from vocode.models.agent import ChatGPTAgentConfig, RESTfulUserImplementedAgentConfig, WebSocketUserImplementedAgentConfig
from vocode.models.agent import ChatGPTAgentConfig, RESTfulUserImplementedAgentConfig, WebSocketUserImplementedAgentConfig, EchoAgentConfig
from vocode.models.synthesizer import AzureSynthesizerConfig
from vocode.user_implemented_agent.restful_agent import RESTfulAgent
@@ -14,19 +14,15 @@ logging.root.setLevel(logging.INFO)
if __name__ == "__main__":
    # Post-migration version of this hunk: the pyaudio-era call with
    # use_first_available_device and the RESTful agent config were the removed
    # lines of the diff and are dropped here.
    microphone_input, speaker_output = create_microphone_input_and_speaker_output(
        use_default_devices=True
    )
    conversation = Conversation(
        input_device=microphone_input,
        output_device=speaker_output,
        transcriber_config=DeepgramTranscriberConfig.from_input_device(microphone_input),
        # EchoAgent needs no external endpoint, unlike the RESTful agent it replaces.
        agent_config=EchoAgentConfig(
            initial_message="Hello!",
            # NOTE(review): this kwarg sat between added and removed diff lines —
            # confirm EchoAgentConfig actually accepts it.
            generate_responses=False,
        ),
        synthesizer_config=AzureSynthesizerConfig.from_output_device(speaker_output),
    )
    # NOTE(review): the conversation is constructed but never started in this
    # hunk — confirm a start/run call follows below the visible diff.

View file

@@ -1,4 +1,4 @@
import pyaudio
import sounddevice as sd
from .input_device.microphone_input import MicrophoneInput
from .output_device.speaker_output import SpeakerOutput
import logging
@@ -11,20 +11,18 @@ def _get_device_prompt(device_infos: list[dict]) -> str:
Choice: """.format(
"\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos)))
def create_microphone_input_and_speaker_output(use_default_devices=False, mic_sampling_rate=None, speaker_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]:
    """Build a (MicrophoneInput, SpeakerOutput) pair backed by sounddevice.

    The interleaved pyaudio lines (PyAudio(), get_device_count, maxInputChannels
    filters, and the `pa`-taking constructors) were the removed side of the diff
    and are dropped here.

    Args:
        use_default_devices: if True, use the system default input/output
            devices; otherwise prompt the user to pick from detected devices.
        mic_sampling_rate: optional override; device default used when None.
        speaker_sampling_rate: optional override; device default used when None.

    Returns:
        The connected (MicrophoneInput, SpeakerOutput) pair.
    """
    device_infos = sd.query_devices()
    input_device_infos = [d for d in device_infos if d['max_input_channels'] > 0]
    output_device_infos = [d for d in device_infos if d['max_output_channels'] > 0]
    if use_default_devices:
        # kind='input'/'output' returns the single default device's info dict.
        input_device_info = sd.query_devices(kind='input')
        output_device_info = sd.query_devices(kind='output')
    else:
        input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))]
        output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))]
    logger.info("Using microphone input device: %s", input_device_info['name'])
    microphone_input = MicrophoneInput(input_device_info, sampling_rate=mic_sampling_rate)
    logger.info("Using speaker output device: %s", output_device_info['name'])
    speaker_output = SpeakerOutput(output_device_info, sampling_rate=speaker_sampling_rate)
    return microphone_input, speaker_output

View file

@@ -1,6 +1,8 @@
import pyaudio
import sounddevice as sd
import numpy as np
from typing import Optional
import queue
import wave
from .base_input_device import BaseInputDevice
from ..models.audio_encoding import AudioEncoding
@@ -10,25 +12,29 @@ class MicrophoneInput(BaseInputDevice):
DEFAULT_SAMPLING_RATE = 44100
DEFAULT_CHUNK_SIZE = 2048
def __init__(self, device_info: dict, sampling_rate: int = None, chunk_size: int = DEFAULT_CHUNK_SIZE, microphone_gain: int = 1):
    """Open and start a sounddevice input stream on the given device.

    Args:
        device_info: sounddevice device-info mapping; must contain 'index',
            and may contain 'default_samplerate'.
        sampling_rate: override; falls back to the device default, then
            DEFAULT_SAMPLING_RATE.
        chunk_size: blocksize (frames per callback) for the stream.
        microphone_gain: power-of-two gain exponent applied in the callback.
    """
    self.device_info = device_info
    sampling_rate = sampling_rate or (self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE))
    super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size)
    # Fix: set up callback state BEFORE starting the stream. The original
    # called stream.start() first, but _stream_callback runs on the audio
    # thread and reads self.queue / self.microphone_gain, so starting first
    # races an AttributeError on the very first callback.
    self.queue = queue.Queue()
    self.microphone_gain = microphone_gain
    self.stream = sd.InputStream(
        dtype=np.int16,
        channels=1,
        samplerate=self.sampling_rate,
        blocksize=self.chunk_size,
        device=int(self.device_info['index']),
        callback=self._stream_callback,
    )
    self.stream.start()
def _stream_callback(self, in_data: np.ndarray, *_args):
    """sounddevice InputStream callback: apply gain, enqueue raw PCM bytes.

    Fix: the original scaled by ``2 ^ self.microphone_gain`` — in Python
    ``^`` is bitwise XOR, not exponentiation, so the default gain of 1
    silently divided every sample by 3 (2 ^ 1 == 3). The gain is treated
    as a power-of-two exponent: values > 1 amplify by 2**gain, values < 1
    attenuate by 2**(-gain), and the default of 1 passes audio unchanged.
    """
    gain = self.microphone_gain
    if gain > 1:
        in_data = in_data * (2 ** gain)
    elif gain < 1:
        # e.g. gain == -2 attenuates by a factor of 4; gain == 0 is identity.
        in_data = in_data // (2 ** -gain)
    self.queue.put_nowait(in_data.tobytes())
def get_audio(self) -> Optional[bytes]:
try:

View file

@@ -1,4 +1,5 @@
import pyaudio
import sounddevice as sd
import numpy as np
from .base_output_device import BaseOutputDevice
from ..models.audio_encoding import AudioEncoding
@@ -7,22 +8,20 @@ class SpeakerOutput(BaseOutputDevice):
DEFAULT_SAMPLING_RATE = 44100
def __init__(self, device_info: dict, sampling_rate: int = None, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16):
    """Open and start a sounddevice output stream on the given device.

    The interleaved pyaudio lines (the ``pa`` parameter, ``pa.open``,
    ``format=pyaudio.paInt16``, ``output_device_index``) were the removed
    side of the diff and are dropped here.

    Args:
        device_info: sounddevice device-info mapping; must contain 'index',
            and may contain 'default_samplerate'.
        sampling_rate: override; falls back to the device default, then
            DEFAULT_SAMPLING_RATE.
        audio_encoding: encoding reported to the base class.
    """
    self.device_info = device_info
    sampling_rate = sampling_rate or int(self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE))
    super().__init__(sampling_rate, audio_encoding)
    self.stream = sd.OutputStream(
        channels=1,
        samplerate=self.sampling_rate,
        dtype=np.int16,
        device=int(self.device_info['index']),
    )
    self.stream.start()
async def send_async(self, chunk):
    """Play a chunk of little-endian 16-bit PCM bytes on the output stream.

    NOTE(review): sd.OutputStream.write blocks until the frames are
    accepted, which stalls the event loop inside this coroutine — consider
    offloading via run_in_executor. Left as-is to match the commit.
    """
    # sounddevice expects an array of samples, not raw bytes.
    self.stream.write(np.frombuffer(chunk, dtype=np.int16))
def terminate(self):
    """Release the audio device by closing the sounddevice output stream.

    The stale ``self.pa.close()`` line was the removed (pyaudio) side of
    the diff; no PyAudio handle exists anymore.
    """
    self.stream.close()