vocode-python/vocode/streaming/transcriber/google_transcriber.py

import asyncio
import time
import queue
from google.cloud import speech
import threading

from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.transcriber.base_transcriber import (
    BaseTranscriber,
    Transcription,
)
from vocode.streaming.models.transcriber import GoogleTranscriberConfig
from vocode.streaming.utils import create_loop_in_thread


class GoogleTranscriber(BaseTranscriber):
    def __init__(self, transcriber_config: GoogleTranscriberConfig):
        super().__init__(transcriber_config)
        self._queue = queue.Queue()
        self._ended = False
        self.google_streaming_config = self.create_google_streaming_config()
        self.client = speech.SpeechClient()
        self.warmed_up = False
        self.is_ready = False
        if self.transcriber_config.endpointing_config:
            raise Exception("Google endpointing config not supported yet")
        self.event_loop = asyncio.new_event_loop()
        self.thread = threading.Thread(
            name="google_transcriber",
            target=create_loop_in_thread,
            args=(self.event_loop, self.process()),
        )

    def create_google_streaming_config(self):
        extra_params = {}
        if self.transcriber_config.model:
            extra_params["model"] = self.transcriber_config.model
            extra_params["use_enhanced"] = True

        if self.transcriber_config.audio_encoding == AudioEncoding.LINEAR16:
            google_audio_encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
        elif self.transcriber_config.audio_encoding == AudioEncoding.MULAW:
            google_audio_encoding = speech.RecognitionConfig.AudioEncoding.MULAW

        return speech.StreamingRecognitionConfig(
            config=speech.RecognitionConfig(
                encoding=google_audio_encoding,
                sample_rate_hertz=self.transcriber_config.sampling_rate,
                language_code="en-US",
                **extra_params
            ),
            interim_results=True,
        )

    async def ready(self):
        if not self.transcriber_config.should_warmup_model:
            return True
        while not self.warmed_up:
            await asyncio.sleep(0.1)
        return self.is_ready

    def warmup(self):
        warmup_bytes = self.get_warmup_bytes()

        def stream():
            chunk_size = self.transcriber_config.sampling_rate * 2
            for i in range(len(warmup_bytes) // chunk_size):
                yield speech.StreamingRecognizeRequest(
                    audio_content=warmup_bytes[i * chunk_size : (i + 1) * chunk_size]
                )
                time.sleep(0.01)

        for _ in self.client.streaming_recognize(
            self.google_streaming_config, stream()
        ):
            pass
        self.warmed_up = True
        self.is_ready = True

    async def run(self):
        self.thread.start()

    async def process(self):
        if self.transcriber_config.should_warmup_model:
            self.warmup()
        stream = self.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in stream
        )
        responses = self.client.streaming_recognize(
            self.google_streaming_config, requests
        )
        await self.process_responses_loop(responses)

    def terminate(self):
        self._ended = True

    def send_audio(self, chunk: bytes):
        self._queue.put(chunk, block=False)

    async def process_responses_loop(self, responses):
        for response in responses:
            await self._on_response(response)

            if self._ended:
                break

    async def _on_response(self, response):
        if not response.results:
            return

        result = response.results[0]
        if not result.alternatives:
            return

        top_choice = result.alternatives[0]
        message = top_choice.transcript
        confidence = top_choice.confidence

        return await self.on_response(
            Transcription(message, confidence, result.is_final)
        )

    def generator(self):
        while not self._ended:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._queue.get()
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._queue.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b"".join(data)