first pass at turn based conversation
This commit is contained in:
parent
d1118d375e
commit
518a0f2b53
40 changed files with 503 additions and 99 deletions
53
vocode/turn_based/synthesizer/azure_synthesizer.py
Normal file
53
vocode/turn_based/synthesizer/azure_synthesizer.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
import os
|
||||
from dotenv import load_dotenv
|
||||
import azure.cognitiveservices.speech as speechsdk
|
||||
from pydub import AudioSegment
|
||||
|
||||
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
class AzureSynthesizer(BaseSynthesizer):
|
||||
def __init__(self, sampling_rate: int):
|
||||
self.sampling_rate = sampling_rate
|
||||
speech_config = speechsdk.SpeechConfig(
|
||||
subscription=os.environ.get("AZURE_SPEECH_KEY"),
|
||||
region=os.environ.get("AZURE_SPEECH_REGION"),
|
||||
)
|
||||
if self.sampling_rate == 44100:
|
||||
speech_config.set_speech_synthesis_output_format(
|
||||
speechsdk.SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm
|
||||
)
|
||||
if self.sampling_rate == 48000:
|
||||
speech_config.set_speech_synthesis_output_format(
|
||||
speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm
|
||||
)
|
||||
if self.sampling_rate == 24000:
|
||||
speech_config.set_speech_synthesis_output_format(
|
||||
speechsdk.SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm
|
||||
)
|
||||
elif self.sampling_rate == 16000:
|
||||
speech_config.set_speech_synthesis_output_format(
|
||||
speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
|
||||
)
|
||||
elif self.sampling_rate == 8000:
|
||||
speech_config.set_speech_synthesis_output_format(
|
||||
speechsdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm
|
||||
)
|
||||
|
||||
self.synthesizer = speechsdk.SpeechSynthesizer(
|
||||
speech_config=speech_config, audio_config=None
|
||||
)
|
||||
|
||||
def synthesize(self, text) -> AudioSegment:
|
||||
result = self.synthesizer.speak_text(text)
|
||||
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
|
||||
return AudioSegment(
|
||||
result.audio_data,
|
||||
sample_width=2,
|
||||
frame_rate=self.sampling_rate,
|
||||
channels=1,
|
||||
)
|
||||
else:
|
||||
raise Exception("Could not synthesize audio")
|
||||
Loading…
Add table
Add a link
Reference in a new issue