move to sounddevice
This commit is contained in:
parent
505bba2e07
commit
565bd5b3a1
4 changed files with 44 additions and 45 deletions
|
|
@ -5,7 +5,7 @@ import signal
|
|||
from vocode.conversation import Conversation
|
||||
from vocode.helpers import create_microphone_input_and_speaker_output
|
||||
from vocode.models.transcriber import DeepgramTranscriberConfig
|
||||
from vocode.models.agent import ChatGPTAgentConfig, RESTfulUserImplementedAgentConfig, WebSocketUserImplementedAgentConfig
|
||||
from vocode.models.agent import ChatGPTAgentConfig, RESTfulUserImplementedAgentConfig, WebSocketUserImplementedAgentConfig, EchoAgentConfig
|
||||
from vocode.models.synthesizer import AzureSynthesizerConfig
|
||||
from vocode.user_implemented_agent.restful_agent import RESTfulAgent
|
||||
|
||||
|
|
@ -14,19 +14,15 @@ logging.root.setLevel(logging.INFO)
|
|||
|
||||
|
||||
if __name__ == "__main__":
    # Use the system default input/output devices (no interactive device prompt).
    microphone_input, speaker_output = create_microphone_input_and_speaker_output(use_default_devices=True)

    # Wire a simple echo agent between Deepgram transcription and Azure synthesis.
    conversation = Conversation(
        input_device=microphone_input,
        output_device=speaker_output,
        transcriber_config=DeepgramTranscriberConfig.from_input_device(microphone_input),
        agent_config=EchoAgentConfig(
            initial_message="Hello!",
            generate_responses=False,
        ),
        synthesizer_config=AzureSynthesizerConfig.from_output_device(speaker_output)
    )
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
import pyaudio
|
||||
import sounddevice as sd
|
||||
from .input_device.microphone_input import MicrophoneInput
|
||||
from .output_device.speaker_output import SpeakerOutput
|
||||
import logging
|
||||
|
|
@ -11,20 +11,18 @@ def _get_device_prompt(device_infos: list[dict]) -> str:
|
|||
Choice: """.format(
|
||||
"\n".join(f"{index}: {device['name']}" for index, device in enumerate(device_infos)))
|
||||
|
||||
def create_microphone_input_and_speaker_output(use_default_devices=False, mic_sampling_rate=None, speaker_sampling_rate=None) -> tuple[MicrophoneInput, SpeakerOutput]:
    """Create a MicrophoneInput / SpeakerOutput pair backed by sounddevice.

    Args:
        use_default_devices: when True, use the system default input and
            output devices; otherwise prompt the user on stdin to pick from
            the available devices.
        mic_sampling_rate: optional override for the microphone sample rate
            (each device's own default is used when None).
        speaker_sampling_rate: optional override for the speaker sample rate.

    Returns:
        (microphone_input, speaker_output) wrapping the chosen devices.
    """
    device_infos = sd.query_devices()
    # A device can expose input channels, output channels, or both.
    input_device_infos = [d for d in device_infos if d['max_input_channels'] > 0]
    output_device_infos = [d for d in device_infos if d['max_output_channels'] > 0]
    if use_default_devices:
        input_device_info = sd.query_devices(kind='input')
        output_device_info = sd.query_devices(kind='output')
    else:
        # Interactive selection: the prompt lists devices by index.
        input_device_info = input_device_infos[int(input(_get_device_prompt(input_device_infos)))]
        output_device_info = output_device_infos[int(input(_get_device_prompt(output_device_infos)))]
    logger.info("Using microphone input device: %s", input_device_info['name'])
    microphone_input = MicrophoneInput(input_device_info, sampling_rate=mic_sampling_rate)
    logger.info("Using speaker output device: %s", output_device_info['name'])
    speaker_output = SpeakerOutput(output_device_info, sampling_rate=speaker_sampling_rate)
    return microphone_input, speaker_output
|
||||
|
|
@ -1,6 +1,8 @@
|
|||
import pyaudio
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
from typing import Optional
|
||||
import queue
|
||||
import wave
|
||||
|
||||
from .base_input_device import BaseInputDevice
|
||||
from ..models.audio_encoding import AudioEncoding
|
||||
|
|
@ -10,25 +12,29 @@ class MicrophoneInput(BaseInputDevice):
|
|||
DEFAULT_SAMPLING_RATE = 44100
|
||||
DEFAULT_CHUNK_SIZE = 2048
|
||||
|
||||
def __init__(self, device_info: dict, sampling_rate: int = None, chunk_size: int = DEFAULT_CHUNK_SIZE, microphone_gain: int = 1):
    """Open a sounddevice input stream on the given device and start capturing.

    Args:
        device_info: a sounddevice device-info dict; must carry 'index' and
            may carry 'default_samplerate'.
        sampling_rate: optional override for the device's default sample rate.
        chunk_size: frames delivered per callback invocation.
        microphone_gain: integer power-of-two gain applied in _stream_callback.
    """
    self.device_info = device_info
    # Prefer the device's reported default rate, then the class-level default.
    sampling_rate = sampling_rate or self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE)
    super().__init__(int(sampling_rate), AudioEncoding.LINEAR16, chunk_size)
    self.stream = sd.InputStream(
        dtype=np.int16,
        channels=1,
        samplerate=self.sampling_rate,
        blocksize=self.chunk_size,
        device=int(self.device_info['index']),
        callback=self._stream_callback,
    )
    self.stream.start()
    # Thread-safe handoff from the audio callback thread to get_audio().
    self.queue = queue.Queue()
    self.microphone_gain = microphone_gain
|
||||
|
||||
def _stream_callback(self, in_data, *_args):
|
||||
self.queue.put_nowait(in_data)
|
||||
return (None, pyaudio.paContinue)
|
||||
def _stream_callback(self, in_data: np.ndarray[np.int16], *_args):
|
||||
if self.microphone_gain > 1:
|
||||
in_data = in_data * (2 ^ self.microphone_gain)
|
||||
else:
|
||||
in_data = in_data // (2 ^ self.microphone_gain)
|
||||
audio_bytes = in_data.tobytes()
|
||||
self.queue.put_nowait(audio_bytes)
|
||||
|
||||
def get_audio(self) -> Optional[bytes]:
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import pyaudio
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
|
||||
from .base_output_device import BaseOutputDevice
|
||||
from ..models.audio_encoding import AudioEncoding
|
||||
|
|
@ -7,22 +8,20 @@ class SpeakerOutput(BaseOutputDevice):
|
|||
|
||||
DEFAULT_SAMPLING_RATE = 44100
|
||||
|
||||
def __init__(self, device_info: dict, sampling_rate: int = None, audio_encoding: AudioEncoding = AudioEncoding.LINEAR16):
    """Open a sounddevice output stream on the given device and start it.

    Args:
        device_info: a sounddevice device-info dict; must carry 'index' and
            may carry 'default_samplerate'.
        sampling_rate: optional override for the device's default sample rate.
        audio_encoding: PCM encoding of chunks passed to send_async.
    """
    self.device_info = device_info
    # Prefer the device's reported default rate, then the class-level default.
    sampling_rate = sampling_rate or int(self.device_info.get('default_samplerate', self.DEFAULT_SAMPLING_RATE))
    super().__init__(sampling_rate, audio_encoding)
    self.stream = sd.OutputStream(
        channels=1,
        samplerate=self.sampling_rate,
        dtype=np.int16,
        device=int(self.device_info['index']),
    )
    self.stream.start()
|
||||
|
||||
async def send_async(self, chunk):
    """Write one chunk of 16-bit little-endian PCM bytes to the output stream.

    NOTE(review): sounddevice's OutputStream.write is a blocking call, so this
    blocks the event loop for the duration of the chunk — consider offloading
    via run_in_executor. TODO confirm this is acceptable here.
    """
    self.stream.write(np.frombuffer(chunk, dtype=np.int16))
|
||||
|
||||
def terminate(self):
    """Close the underlying sounddevice stream (no PyAudio handle to release anymore)."""
    self.stream.close()
|
||||
Loading…
Add table
Add a link
Reference in a new issue