56 lines
2.1 KiB
Python
56 lines
2.1 KiB
Python
from typing import Optional
|
|
import azure.cognitiveservices.speech as speechsdk
|
|
from pydub import AudioSegment
|
|
from vocode import getenv
|
|
|
|
from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
|
|
|
|
|
|
class AzureSynthesizer(BaseSynthesizer):
    """Turn-based text-to-speech synthesizer backed by the Azure Speech SDK.

    The synthesizer is configured to emit raw (header-less) 16-bit mono PCM at
    the requested sampling rate, which ``synthesize`` wraps in a pydub
    ``AudioSegment``.
    """

    # Supported sampling rates (Hz) mapped to the name of the matching member
    # of ``speechsdk.SpeechSynthesisOutputFormat``. Names are resolved lazily
    # in ``__init__`` so that only the member actually requested has to exist
    # in the installed SDK version.
    _OUTPUT_FORMAT_NAMES = {
        8000: "Raw8Khz16BitMonoPcm",
        16000: "Raw16Khz16BitMonoPcm",
        24000: "Raw24Khz16BitMonoPcm",
        44100: "Raw44100Hz16BitMonoPcm",
        48000: "Raw48Khz16BitMonoPcm",
    }

    def __init__(
        self,
        sampling_rate: int,
        api_key: Optional[str] = None,
        region: Optional[str] = None,
    ):
        """Configure an Azure speech synthesizer for ``sampling_rate``.

        Args:
            sampling_rate: Output sampling rate in Hz. Must be one of
                8000, 16000, 24000, 44100 or 48000.
            api_key: Azure Speech subscription key, handed to ``getenv``
                together with ``"AZURE_SPEECH_KEY"``.
                NOTE(review): presumably used as the fallback when the
                environment value is unset -- confirm against ``vocode.getenv``.
            region: Azure Speech region, handed to ``getenv`` together with
                ``"AZURE_SPEECH_REGION"`` (same caveat as ``api_key``).

        Raises:
            ValueError: If ``sampling_rate`` has no matching raw PCM format.
        """
        format_name = self._OUTPUT_FORMAT_NAMES.get(sampling_rate)
        if format_name is None:
            # Without an explicit output format the SDK keeps its default,
            # and ``synthesize`` would then label that audio with the wrong
            # frame rate. Fail loudly instead of returning mislabelled audio.
            supported = ", ".join(str(rate) for rate in sorted(self._OUTPUT_FORMAT_NAMES))
            raise ValueError(
                f"Unsupported sampling rate {sampling_rate!r}; "
                f"supported rates are: {supported}"
            )

        self.sampling_rate = sampling_rate
        speech_config = speechsdk.SpeechConfig(
            subscription=getenv("AZURE_SPEECH_KEY", api_key),
            region=getenv("AZURE_SPEECH_REGION", region),
        )
        speech_config.set_speech_synthesis_output_format(
            getattr(speechsdk.SpeechSynthesisOutputFormat, format_name)
        )

        # audio_config=None: the audio is consumed from ``result.audio_data``
        # in ``synthesize`` rather than being sent to an audio output device.
        self.synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=None
        )

    def synthesize(self, text: str) -> AudioSegment:
        """Synthesize ``text`` and return it as a mono 16-bit ``AudioSegment``.

        Args:
            text: The text to speak.

        Returns:
            An ``AudioSegment`` built from the synthesized bytes, with
            ``sample_width=2``, ``channels=1`` and ``frame_rate`` equal to the
            sampling rate given at construction.

        Raises:
            Exception: If the SDK result reason is anything other than
                ``SynthesizingAudioCompleted``. When the synthesis was
                cancelled, the cancellation reason and error details are
                appended to the message.
        """
        result = self.synthesizer.speak_text(text)
        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            message = "Could not synthesize audio"
            if result.reason == speechsdk.ResultReason.Canceled:
                details = result.cancellation_details
                if details is not None:
                    message = f"{message}: {details.reason} ({details.error_details})"
            raise Exception(message)

        # The configured output format is raw 16-bit mono PCM, so the sample
        # width and channel count are fixed; only the frame rate varies.
        return AudioSegment(
            result.audio_data,
            sample_width=2,
            frame_rate=self.sampling_rate,
            channels=1,
        )
|