# vocode-python/vocode/turn_based/synthesizer/azure_synthesizer.py

import os
from typing import Optional
import azure.cognitiveservices.speech as speechsdk
from pydub import AudioSegment

from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer


class AzureSynthesizer(BaseSynthesizer):
    """Turn-based text-to-speech synthesizer backed by the Azure Speech SDK.

    The synthesizer requests raw (headerless) 16-bit mono PCM from Azure at
    the configured sampling rate and wraps the returned bytes in a pydub
    ``AudioSegment``.
    """

    # Sampling rate (Hz) -> name of the matching raw 16-bit mono PCM member of
    # ``speechsdk.SpeechSynthesisOutputFormat``.  Names (rather than the enum
    # members themselves) are stored so that only the member actually
    # requested is looked up, and only when an instance is constructed.
    _OUTPUT_FORMAT_NAMES = {
        8000: "Raw8Khz16BitMonoPcm",
        16000: "Raw16Khz16BitMonoPcm",
        24000: "Raw24Khz16BitMonoPcm",
        44100: "Raw44100Hz16BitMonoPcm",
        48000: "Raw48Khz16BitMonoPcm",
    }

    def __init__(
        self,
        sampling_rate: int,
        api_key: Optional[str] = None,
        region: Optional[str] = None,
    ):
        """Configure the Azure speech synthesizer.

        Args:
            sampling_rate: Output sampling rate in Hz. Must be one of
                8000, 16000, 24000, 44100 or 48000.
            api_key: Azure Speech subscription key, used only when the
                ``AZURE_SPEECH_KEY`` environment variable is not set.
            region: Azure Speech region, used only when the
                ``AZURE_SPEECH_REGION`` environment variable is not set.

        Raises:
            ValueError: If ``sampling_rate`` has no matching raw PCM output
                format.
        """
        # Fail fast on unsupported rates: without an explicit output format
        # the SDK would fall back to its own default, while `synthesize`
        # would still label the audio with `sampling_rate`.
        format_name = self._OUTPUT_FORMAT_NAMES.get(sampling_rate)
        if format_name is None:
            supported = ", ".join(
                str(rate) for rate in sorted(self._OUTPUT_FORMAT_NAMES)
            )
            raise ValueError(
                f"Unsupported sampling rate {sampling_rate!r}; "
                f"supported rates are: {supported}"
            )

        self.sampling_rate = sampling_rate
        # Environment variables, when set, take precedence over the
        # explicitly passed `api_key` / `region` arguments.
        speech_config = speechsdk.SpeechConfig(
            subscription=os.getenv("AZURE_SPEECH_KEY", api_key),
            region=os.getenv("AZURE_SPEECH_REGION", region),
        )
        speech_config.set_speech_synthesis_output_format(
            getattr(speechsdk.SpeechSynthesisOutputFormat, format_name)
        )

        # audio_config=None: the audio is not routed to an output device or
        # file by this class; it is read from the result in `synthesize`.
        self.synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=None
        )

    def synthesize(self, text: str) -> AudioSegment:
        """Synthesize ``text`` and return the audio as an ``AudioSegment``.

        Args:
            text: The text to speak.

        Returns:
            A mono, 16-bit ``AudioSegment`` at ``self.sampling_rate`` Hz.

        Raises:
            RuntimeError: If Azure does not report
                ``SynthesizingAudioCompleted``. When the request was
                canceled, the SDK's cancellation reason and error details
                are included in the message.
        """
        result = self.synthesizer.speak_text(text)
        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            details = ""
            if result.reason == speechsdk.ResultReason.Canceled:
                cancellation = result.cancellation_details
                if cancellation is not None:
                    details = f": {cancellation.reason}"
                    if cancellation.error_details:
                        details += f" ({cancellation.error_details})"
            raise RuntimeError(f"Could not synthesize audio{details}")

        # The payload is raw PCM: 2 bytes per sample (16-bit), one channel.
        return AudioSegment(
            result.audio_data,
            sample_width=2,
            frame_rate=self.sampling_rate,
            channels=1,
        )