From 668a46f16d066e4db2785db8b97251920d8d88fc Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Fri, 8 Apr 2022 11:23:51 -0400 Subject: [PATCH] Add in SSML --- README.md | 8 +- mimic3-tts/README.md | 6 ++ mimic3-tts/mimic3_tts/const.py | 2 + mimic3-tts/mimic3_tts/tts.py | 17 ++++ mimic3-tts/mimic3_tts/voices.json | 34 +++++++ opentts-abc/opentts_abc/__init__.py | 9 ++ opentts-abc/opentts_abc/ssml.py | 143 +++++++++++++++++++++++++++- 7 files changed, 213 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0f6b3c9..45018a7 100644 --- a/README.md +++ b/README.md @@ -234,15 +234,17 @@ For example: ``` xml - + Welcome to the world of speech synthesis. - + - This is a 2 voice. + + This is a 2 voice. + diff --git a/mimic3-tts/README.md b/mimic3-tts/README.md index 56cc9b4..6301c4a 100644 --- a/mimic3-tts/README.md +++ b/mimic3-tts/README.md @@ -172,6 +172,12 @@ A subset of [SSML](https://www.w3.org/TR/speech-synthesis11/) (Speech Synthesis * `voice` - name or language of voice * Name format is `tts:voice` (e.g., "glow-speak:en-us_mary_ann") or `tts:voice#speaker_id` (e.g., "coqui-tts:en_vctk#p228") * If one of the supported languages, a preferred voice is used (override with `--preferred-voice <lang> <voice>`) +* `<prosody attribute="value">` - change speaking attributes + * Supported `attribute` names: + * `volume` - speaking volume + * number in [0, 100] - 0 is silent, 100 is loudest (default) + * +X, -X, +X%, -X% - absolute/percent offset from current volume + * one of "default", "silent", "x-loud", "loud", "medium", "soft", "x-soft" * `<say-as interpret-as="">` - force interpretation of inner text * `interpret-as` one of "spell-out", "date", "number", "time", or "currency" * `format` - way to format text depending on `interpret-as` diff --git a/mimic3-tts/mimic3_tts/const.py b/mimic3-tts/mimic3_tts/const.py index b6b2153..d6aa290 100644 --- a/mimic3-tts/mimic3_tts/const.py +++ b/mimic3-tts/mimic3_tts/const.py @@ -23,3 +23,5 @@ DEFAULT_VOICES_URL_FORMAT = (
"https://github.com/MycroftAI/mimic3-voices/raw/master/voices/{lang}/{name}" ) DEFAULT_VOICES_DOWNLOAD_DIR = Path(XDG().XDG_DATA_HOME) / "mimic3" / "voices" + +DEFAULT_VOLUME = 100.0 diff --git a/mimic3-tts/mimic3_tts/tts.py b/mimic3-tts/mimic3_tts/tts.py index 71367f4..5b196c2 100644 --- a/mimic3-tts/mimic3_tts/tts.py +++ b/mimic3-tts/mimic3_tts/tts.py @@ -14,6 +14,7 @@ # along with this program. If not, see . # """Implementation of OpenTTS for Mimic 3""" +import audioop import itertools import logging import typing @@ -42,6 +43,7 @@ from .const import ( DEFAULT_VOICE, DEFAULT_VOICES_DOWNLOAD_DIR, DEFAULT_VOICES_URL_FORMAT, + DEFAULT_VOLUME, ) from .download import VoiceFile, download_voice from .voice import SPEAKER_TYPE, BreakType, Mimic3Voice @@ -108,6 +110,9 @@ class Mimic3Settings: share_onnx_models_between_threads: bool = True """If True, Onnx models are shared between threads""" + volume: float = DEFAULT_VOLUME + """Voice volume in [0, 100]""" + @dataclass class Mimic3Phonemes: @@ -292,6 +297,14 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem): def language(self, new_language: str): self.settings.language = new_language + @property + def volume(self) -> float: + return self.settings.volume + + @volume.setter + def volume(self, new_volume: float): + self.settings.volume = max(0, min(100, new_volume)) + def begin_utterance(self): pass @@ -433,6 +446,10 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem): ) audio_bytes = audio.tobytes() + + if settings.volume != DEFAULT_VOLUME: + audio_bytes = audioop.mul(audio_bytes, 2, settings.volume / 100.0) + return AudioResult( sample_rate_hz=voice.config.audio.sample_rate, audio_bytes=audio_bytes, diff --git a/mimic3-tts/mimic3_tts/voices.json b/mimic3-tts/mimic3_tts/voices.json index 9ba27ed..b0c1db7 100644 --- a/mimic3-tts/mimic3_tts/voices.json +++ b/mimic3-tts/mimic3_tts/voices.json @@ -490,6 +490,40 @@ "speakers": [], "properties": {} }, + "fr_FR/tom_low": { + "files": { + "LICENSE": { + "size_bytes": 24947, 
+ "sha256_sum": "635554793cdae1fbc549793a1565772c763e64a686dc674edeaa492c5b88e493" + }, + "README.md": { + "size_bytes": 186, + "sha256_sum": "5722e7135a6487a7e88158aac85a490e155742e85fa4953d426d2f7884359475" + }, + "SOURCE": { + "size_bytes": 50, + "sha256_sum": "96978fc4977928015e2999d4497f667edb562fd1a44211a31ac2c15c94ced664" + }, + "config.json": { + "size_bytes": 3634, + "sha256_sum": "203dc4bafb3d3dacc0cee09959a41494ac173e53c57b3c3b75e2af1593c3859a" + }, + "generator.onnx": { + "size_bytes": 62788375, + "sha256_sum": "0f9ae579eceea1dd908ad47a8196a5a1944c9f2848a89516abf8624917952d03" + }, + "phoneme_map.txt": { + "size_bytes": 15, + "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d" + }, + "phonemes.txt": { + "size_bytes": 232, + "sha256_sum": "711294d0b5a0ec08ec21ca8a75184e0fee3aba1e1adcf967fe5e1ef96f6c176e" + } + }, + "speakers": [], + "properties": {} + }, "hu_HU/diana-majlinger_low": { "files": { "LICENSE": { diff --git a/opentts-abc/opentts_abc/__init__.py b/opentts-abc/opentts_abc/__init__.py index aeb846d..0e3bf54 100644 --- a/opentts-abc/opentts_abc/__init__.py +++ b/opentts-abc/opentts_abc/__init__.py @@ -206,6 +206,15 @@ class TextToSpeechSystem(AbstractContextManager, metaclass=ABCMeta): def language(self, new_language: str): """Set the current voice language""" + @property + @abstractmethod + def volume(self) -> float: + """Get the current volume in [0, 100]""" + + @volume.setter + def volume(self, new_volume: float): + """Set the current volume in [0, 100]""" + def shutdown(self): """Called by the host program when the text to speech system should be stopped""" diff --git a/opentts-abc/opentts_abc/ssml.py b/opentts-abc/opentts_abc/ssml.py index 56b8aab..949fd7b 100644 --- a/opentts-abc/opentts_abc/ssml.py +++ b/opentts-abc/opentts_abc/ssml.py @@ -14,12 +14,13 @@ # along with this program. If not, see . 
# Loudest volume on the [0, 100] scale. It is both the upper clamp for parsed
# volumes and the volume in effect when no <prosody> element is active.
_DEFAULT_VOLUME: float = 100.0


@dataclass
class ProsodyState:
    """Prosody settings in effect at one nesting level of an SSML document.

    Attributes:
        volume: speaking volume on a [0, 100] scale (0 is silent).
    """

    volume: float = _DEFAULT_VOLUME


# -----------------------------------------------------------------------------

# Volume on the [0, 100] scale for each named level accepted by
# <prosody volume="...">. "default" and "x-loud" are both the maximum.
_DEFAULT_VOLUME_MAP: typing.Dict[str, float] = {
    "default": _DEFAULT_VOLUME,
    "x-loud": _DEFAULT_VOLUME,
    "loud": _DEFAULT_VOLUME * 0.8,
    "medium": _DEFAULT_VOLUME * 0.5,
    "soft": _DEFAULT_VOLUME * 0.3,
    "x-soft": _DEFAULT_VOLUME * 0.1,
    "silent": 0.0,
}


@dataclass
class SSMLSettings:
    """Settings that control how SSML is interpreted.

    Attributes:
        volume_map: volume on the [0, 100] scale for each named volume level.
            Defaults to a private copy of ``_DEFAULT_VOLUME_MAP``.
    """

    # Copy the defaults so every settings object owns its mapping. Returning
    # the module-level dict itself would let a change made through one
    # instance silently alter the defaults of every other instance.
    volume_map: typing.Mapping[str, float] = field(
        default_factory=lambda: dict(_DEFAULT_VOLUME_MAP)
    )
def _handle_begin_prosody(self, elem: etree.Element):
    """Handle the start of a <prosody> element.

    Copies the prosody settings currently in effect, applies the element's
    ``volume`` attribute (if present) on top of the copy, pushes the result
    on the prosody stack and sets the volume of the text to speech system.
    """
    LOG.debug("begin prosody")

    # Start from current settings so a nested element inherits everything
    # it does not override itself.
    new_prosody = ProsodyState(**dataclasses.asdict(self._prosody))

    volume_str = attrib_no_namespace(elem, "volume")
    if volume_str is not None:
        new_prosody.volume = self._parse_volume(
            volume_str, current_volume=self._prosody.volume
        )

    LOG.debug("prosody: %s", new_prosody)
    self._push_prosody(new_prosody)

    self.tts.volume = new_prosody.volume


def _handle_end_prosody(self):
    """Handle the end of a <prosody> element.

    Drops the settings of the element being closed and sets the volume of
    the text to speech system back to the settings that are now on top.
    """
    LOG.debug("end prosody")
    self._pop_prosody()

    LOG.debug("prosody: %s", self._prosody)

    # Volume of the enclosing <prosody>, or the default when none is left
    self.tts.volume = self._prosody.volume


@property
def _prosody(self) -> ProsodyState:
    """Get prosody settings at the top of the stack (default if empty)"""
    if self._prosody_stack:
        return self._prosody_stack[-1]

    return self._default_prosody


def _push_prosody(self, new_prosody: ProsodyState):
    """Push new prosody settings on to the stack"""
    self._prosody_stack.append(new_prosody)


def _pop_prosody(self) -> ProsodyState:
    """Pop prosody settings off the top of the stack (default if empty)"""
    if self._prosody_stack:
        return self._prosody_stack.pop()

    return self._default_prosody


def _parse_volume(
    self, volume_str: str, current_volume: float = _DEFAULT_VOLUME
) -> float:
    """Parse the ``volume`` attribute of <prosody> into a [0, 100] value.

    Accepted forms (case and surrounding whitespace are ignored):

    * a name from ``self.settings.volume_map`` (e.g. "loud", "silent")
    * ``N`` or ``N%`` - absolute volume, already on the [0, 100] scale
    * ``+N`` / ``-N`` - offset added to/subtracted from ``current_volume``
    * ``+N%`` / ``-N%`` - offset of N percent of ``current_volume``

    Args:
        volume_str: raw attribute value
        current_volume: volume that relative values are applied to

    Returns:
        The new volume, clamped to [0, 100]. An empty or unparseable value
        leaves ``current_volume`` in effect (still clamped); unparseable
        values are logged instead of raising.
    """
    volume = current_volume
    volume_str = volume_str.strip().lower()

    # Look up by name first
    named_volume = self.settings.volume_map.get(volume_str)
    if named_volume is not None:
        volume = named_volume
    elif volume_str:
        # sign is 0 for an absolute value, +1/-1 for a relative offset
        sign = 0
        number_str = volume_str
        if number_str[0] in "+-":
            sign = 1 if number_str[0] == "+" else -1
            number_str = number_str[1:]

        # endswith() rather than [-1]: a lone "+" or "-" leaves an empty
        # string here, which must not raise IndexError.
        is_percent = number_str.endswith("%")
        if is_percent:
            number_str = number_str[:-1]

        try:
            volume_value = float(number_str)
        except ValueError:
            # Not a number, e.g. a typo or the SSML 1.1 decibel form
            # ("+6dB"), which is not supported. One bad attribute should
            # not abort the whole document, so keep the current volume.
            LOG.warning("Ignoring invalid prosody volume: %r", volume_str)
        else:
            if sign == 0:
                # Absolute value; "N%" is already on a [0, 100] scale
                volume = volume_value
            elif is_percent:
                volume += sign * volume * (volume_value / 100.0)
            else:
                volume += sign * volume_value

    # Float bounds so the result is a float even when clamped to zero
    return max(0.0, min(_DEFAULT_VOLUME, volume))