# mimic3/opentts_abc/__init__.py
# 2022-05-02 15:24:43 -04:00 · 318 lines · 8.9 KiB · Python
# Copyright 2022 Mycroft AI Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
"""Base classes for Open Text to Speech systems"""
import io
import typing
import wave
from abc import ABCMeta, abstractmethod
from contextlib import AbstractContextManager
from dataclasses import dataclass
@dataclass
class Settings:
    """Snapshot of a TTS system's current configuration."""

    # Key of the currently selected voice, if any
    voice: typing.Optional[str] = None
    # Current language code (e.g., en_US)
    language: typing.Optional[str] = None
    # Current speaking volume
    volume: typing.Optional[float] = None
    # Current speaking rate
    rate: typing.Optional[float] = None
    # Current speaking pitch
    pitch: typing.Optional[float] = None
    # Implementation-specific extra settings
    other_settings: typing.Optional[typing.Mapping[str, typing.Any]] = None
@dataclass
class BaseToken(metaclass=ABCMeta):
    """Abstract parent of all spoken-token types."""

    # Raw text of the token
    text: str
@dataclass
class Word(BaseToken):
    """A single word to be spoken."""

    # Grammatical role of the word (typically part of speech)
    role: typing.Optional[str] = None
@dataclass
class Phonemes(BaseToken):
    """A word that has already been phonemized."""

    # Alphabet the phonemes are written in (e.g., ipa)
    alphabet: typing.Optional[str] = None
@dataclass
class SayAs(BaseToken):
    """A word or phrase that must be spoken in a particular way."""

    # How the token is to be interpreted (e.g., characters or digits);
    # exact meaning is implementation-dependent
    interpret_as: str
    # Optional format qualifier whose meaning depends on interpret_as
    format: typing.Optional[str] = None
@dataclass
class _BaseResultDefaults:
    """Defaulted fields shared by results of TTS end_utterance."""

    # Arbitrary user data to carry along with the result
    tag: typing.Optional[typing.Any] = None
@dataclass
class BaseResult(metaclass=ABCMeta):
    """Abstract parent of all results produced by TTS end_utterance."""
@dataclass
class _AudioResultBase:
    """Required fields of a synthesized-audio result."""

    # Sample rate in Hertz (e.g., 22050)
    sample_rate_hz: int
    # Width of one sample in bytes (e.g., 2 for 16-bit)
    sample_width_bytes: int
    # Channel count (e.g., 1 for mono)
    num_channels: int
    # Raw PCM audio with no container header
    audio_bytes: bytes
@dataclass
class AudioResult(BaseResult, _BaseResultDefaults, _AudioResultBase):
    """Synthesized audio result."""

    def to_wav_bytes(self) -> bytes:
        """Return the raw audio wrapped in a WAV container."""
        buffer = io.BytesIO()
        writer: wave.Wave_write = wave.open(buffer, "wb")
        with writer:
            # Header parameters must be set before frames are written
            writer.setframerate(self.sample_rate_hz)
            writer.setsampwidth(self.sample_width_bytes)
            writer.setnchannels(self.num_channels)
            writer.writeframes(self.audio_bytes)
        return buffer.getvalue()
@dataclass
class _MarkResultBase:
    """Required fields of a result signaling an SSML <mark>."""

    # Name attribute of the <mark> that was reached
    name: str
@dataclass
class MarkResult(BaseResult, _BaseResultDefaults, _MarkResultBase):
    """Result emitted when a <mark> has been reached in SSML."""
@dataclass
class Voice:
    """Details of a voice in a text to speech system."""

    # Unique key that can be used to reference the voice
    key: str
    # Human-readable name of the voice
    name: str
    # Language of the voice (e.g., en_US)
    language: str
    # Human-readable description of the voice
    description: str
    # File path or URI where the voice exists
    location: str
    # List of speakers within the voice model if multi-speaker
    speakers: typing.Optional[typing.Sequence[str]] = None
    # Additional properties associated with the voice
    properties: typing.Optional[typing.Mapping[str, typing.Any]] = None
    # Alternative keys for this voice
    aliases: typing.Optional[typing.Set[str]] = None
    # Version of this voice.
    # FIX: was annotated Optional[Set[str]] — an apparent copy-paste from
    # `aliases`; a version is a single string. Default (None) is unchanged,
    # so existing callers are unaffected.
    version: typing.Optional[str] = None

    @property
    def is_multispeaker(self) -> bool:
        """True if the voice model contains more than one speaker."""
        return (self.speakers is not None) and (len(self.speakers) > 1)
class TextToSpeechSystem(AbstractContextManager, metaclass=ABCMeta):
    """Abstract base class for open text to speech systems.

    Expected usage:

        begin_utterance()
        speak_text(...)
        add_break(...)
        set_mark(...)
        speak_tokens(...)
        speak_text(...)
        results = end_utterance()

    In between begin_utterance() and end_utterance(), the voice/language may
    also be changed.
    """

    # NOTE(review): the setters below are declared without @abstractmethod,
    # so ABCMeta only enforces the getters. Because each @x.setter rebuilds
    # the property around the abstract getter, a concrete subclass must
    # override the whole property (getter and setter) for both directions
    # to work — confirm implementations do so.
    @property
    @abstractmethod
    def voice(self) -> str:
        """Get the current voice key"""

    @voice.setter
    def voice(self, new_voice: str):
        """Set the current voice key"""

    @property
    @abstractmethod
    def language(self) -> str:
        """Get the current voice language"""

    @language.setter
    def language(self, new_language: str):
        """Set the current voice language"""

    @property
    @abstractmethod
    def volume(self) -> float:
        """Get the current volume in [0, 100]"""

    @volume.setter
    def volume(self, new_volume: float):
        """Set the current volume in [0, 100]"""

    @property
    @abstractmethod
    def rate(self) -> float:
        """Get the current speaking rate"""

    @rate.setter
    def rate(self, new_rate: float):
        """Set the current speaker rate"""

    def shutdown(self):
        """Called by the host program when the text to speech system should be stopped"""
        # Intentionally a no-op here; subclasses override when cleanup is needed.

    def __exit__(self, exc_type, exc_value, traceback):
        """Automatically call shutdown when context manager has exited"""
        self.shutdown()

    @abstractmethod
    def get_voices(self) -> typing.Iterable[Voice]:
        """Returns an iterable of available voices"""

    @abstractmethod
    def begin_utterance(self):
        """Begins a new utterance"""

    @abstractmethod
    def speak_text(self, text: str, text_language: typing.Optional[str] = None):
        """Speaks text using the underlying system's tokenization mechanism.

        Becomes an AudioResult in end_utterance()
        """

    @abstractmethod
    def speak_tokens(self, tokens: typing.Iterable[BaseToken]):
        """Speak user-defined tokens.

        Becomes an AudioResult in end_utterance()
        """

    @abstractmethod
    def add_break(self, time_ms: int):
        """Add milliseconds of silence to the current utterance.

        Becomes an AudioResult in end_utterance()
        """

    @abstractmethod
    def set_mark(self, name: str):
        """Set a named mark at this point in the utterance.

        Becomes a MarkResult in end_utterance()
        """

    @abstractmethod
    def end_utterance(self) -> typing.Iterable[BaseResult]:
        """Complete an utterance after begin_utterance().

        Returns an iterable of results (audio, marks, etc.)
        """

    def text_to_wav(
        self, text: str, text_language: typing.Optional[str] = None
    ) -> bytes:
        """Synthesize text with current voice settings and return WAV audio"""
        with io.BytesIO() as wav_io:
            wav_file: wave.Wave_write = wave.open(wav_io, "wb")
            wav_params_set = False
            with wav_file:
                try:
                    self.begin_utterance()
                    self.speak_text(text, text_language=text_language)
                    results = self.end_utterance()
                    for result in results:
                        # Add audio to existing WAV file
                        if isinstance(result, AudioResult):
                            # WAV header parameters are taken from the first
                            # audio result; later results are written as-is
                            # (presumably assumed compatible — verify callers).
                            if not wav_params_set:
                                wav_file.setframerate(result.sample_rate_hz)
                                wav_file.setsampwidth(result.sample_width_bytes)
                                wav_file.setnchannels(result.num_channels)
                                wav_params_set = True
                            wav_file.writeframes(result.audio_bytes)
                except Exception as e:
                    if not wav_params_set:
                        # Set default parameters so exception can propagate
                        # (closing the wave writer without parameters would
                        # itself raise and mask the original error)
                        wav_file.setframerate(22050)
                        wav_file.setsampwidth(2)
                        wav_file.setnchannels(1)
                    raise e
            # getvalue() is read after the wave writer is closed so the WAV
            # header (sizes) has been finalized.
            wav_bytes = wav_io.getvalue()
            return wav_bytes