Add <prosody volume="..."> in SSML

2022-04-08 11:23:51 -04:00 · 2022-04-08 11:23:51 -04:00 · 668a46f16d
commit 668a46f16d
parent e56b4579a6
7 changed files with 213 additions and 6 deletions
--- a/README.md
+++ b/README.md
@ -234,15 +234,17 @@ For example:

 ``` xml
 <speak>
-  <voice name="en_UK/apope">
+  <voice name="en_UK/apope_low">
    <s>
      Welcome to the world of speech synthesis.
    </s>
  </voice>
  <break time="3s" />
-  <voice name="en_US/cmu-arctic#slt">
+  <voice name="en_US/cmu-arctic_low#slt">
    <s>
-      This is a <say-as interpret-as="number" format="ordinal">2</say-as> voice.
+      <prosody volume="soft">
+        This is a <say-as interpret-as="number" format="ordinal">2</say-as> voice.
+      </prosody>
    </s>
  </voice>
 </speak>
--- a/mimic3-tts/README.md
+++ b/mimic3-tts/README.md
@ -172,6 +172,12 @@ A subset of [SSML](https://www.w3.org/TR/speech-synthesis11/) (Speech Synthesis
    * `voice` - name or language of voice
        * Name format is `tts:voice` (e.g., "glow-speak:en-us_mary_ann") or `tts:voice#speaker_id` (e.g., "coqui-tts:en_vctk#p228")
        * If one of the supported languages, a preferred voice is used (override with `--preferred-voice <lang> <voice>`)
+* `<prosody attribute="value">` - change speaking attributes
+    * Supported `attribute` names:
+        * `volume` - speaking volume
+            * number in [0, 100] - 0 is silent, 100 is loudest (default)
+            * `+X`, `-X` - absolute offset from current volume; `+X%`, `-X%` - offset by X percent of current volume
+            * one of "default", "silent", "x-loud", "loud", "medium", "soft", "x-soft"
 * `<say-as interpret-as="">` - force interpretation of inner text
    * `interpret-as` one of "spell-out", "date", "number", "time", or "currency"
    * `format` - way to format text depending on `interpret-as`
--- a/mimic3-tts/mimic3_tts/const.py
+++ b/mimic3-tts/mimic3_tts/const.py
@ -23,3 +23,5 @@ DEFAULT_VOICES_URL_FORMAT = (
    "https://github.com/MycroftAI/mimic3-voices/raw/master/voices/{lang}/{name}"
 )
 DEFAULT_VOICES_DOWNLOAD_DIR = Path(XDG().XDG_DATA_HOME) / "mimic3" / "voices"
+
+DEFAULT_VOLUME = 100.0
--- a/mimic3-tts/mimic3_tts/tts.py
+++ b/mimic3-tts/mimic3_tts/tts.py
@ -14,6 +14,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 """Implementation of OpenTTS for Mimic 3"""
+import audioop
 import itertools
 import logging
 import typing
@ -42,6 +43,7 @@ from .const import (
    DEFAULT_VOICE,
    DEFAULT_VOICES_DOWNLOAD_DIR,
    DEFAULT_VOICES_URL_FORMAT,
+    DEFAULT_VOLUME,
 )
 from .download import VoiceFile, download_voice
 from .voice import SPEAKER_TYPE, BreakType, Mimic3Voice
@ -108,6 +110,9 @@ class Mimic3Settings:
    share_onnx_models_between_threads: bool = True
    """If True, Onnx models are shared between threads"""

+    volume: float = DEFAULT_VOLUME
+    """Voice volume in [0, 100]"""
+

@dataclass
 class Mimic3Phonemes:
@ -292,6 +297,14 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
    def language(self, new_language: str):
        self.settings.language = new_language

+    @property
+    def volume(self) -> float:
+        return self.settings.volume
+
+    @volume.setter
+    def volume(self, new_volume: float):
+        self.settings.volume = max(0, min(100, new_volume))
+
    def begin_utterance(self):
        pass

@ -433,6 +446,10 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
        )

        audio_bytes = audio.tobytes()
+
+        if settings.volume != DEFAULT_VOLUME:
+            audio_bytes = audioop.mul(audio_bytes, 2, settings.volume / 100.0)
+
        return AudioResult(
            sample_rate_hz=voice.config.audio.sample_rate,
            audio_bytes=audio_bytes,
--- a/mimic3-tts/mimic3_tts/voices.json
+++ b/mimic3-tts/mimic3_tts/voices.json
@ -490,6 +490,40 @@
        "speakers": [],
        "properties": {}
    },
+    "fr_FR/tom_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 24947,
+                "sha256_sum": "635554793cdae1fbc549793a1565772c763e64a686dc674edeaa492c5b88e493"
+            },
+            "README.md": {
+                "size_bytes": 186,
+                "sha256_sum": "5722e7135a6487a7e88158aac85a490e155742e85fa4953d426d2f7884359475"
+            },
+            "SOURCE": {
+                "size_bytes": 50,
+                "sha256_sum": "96978fc4977928015e2999d4497f667edb562fd1a44211a31ac2c15c94ced664"
+            },
+            "config.json": {
+                "size_bytes": 3634,
+                "sha256_sum": "203dc4bafb3d3dacc0cee09959a41494ac173e53c57b3c3b75e2af1593c3859a"
+            },
+            "generator.onnx": {
+                "size_bytes": 62788375,
+                "sha256_sum": "0f9ae579eceea1dd908ad47a8196a5a1944c9f2848a89516abf8624917952d03"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 232,
+                "sha256_sum": "711294d0b5a0ec08ec21ca8a75184e0fee3aba1e1adcf967fe5e1ef96f6c176e"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
    "hu_HU/diana-majlinger_low": {
        "files": {
            "LICENSE": {
--- a/opentts-abc/opentts_abc/__init__.py
+++ b/opentts-abc/opentts_abc/__init__.py
@ -206,6 +206,15 @@ class TextToSpeechSystem(AbstractContextManager, metaclass=ABCMeta):
    def language(self, new_language: str):
        """Set the current voice language"""

+    @property
+    @abstractmethod
+    def volume(self) -> float:
+        """Get the current volume in [0, 100]"""
+
+    @volume.setter
+    def volume(self, new_volume: float):
+        """Set the current volume in [0, 100]"""
+
    def shutdown(self):
        """Called by the host program when the text to speech system should be stopped"""

--- a/opentts-abc/opentts_abc/ssml.py
+++ b/opentts-abc/opentts_abc/ssml.py
@ -14,12 +14,13 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 """Support for Speech Synthesis Markup Language (SSML)"""
+import dataclasses
 import enum
 import logging
 import re
 import typing
 import xml.etree.ElementTree as etree
-from dataclasses import dataclass
+from dataclasses import dataclass, field

 from opentts_abc import BaseResult, Phonemes, SayAs, TextToSpeechSystem, Word

@ -57,6 +58,39 @@ class ParsingState(int, enum.Enum):
    IN_SAY_AS = enum.auto()
    """Inside <say-as>"""

+    IN_PROSODY = enum.auto()
+    """Inside <prosody>"""
+
+
+_DEFAULT_VOLUME: float = 100.0
+
+
+@dataclass
+class ProsodyState:
+    """Current prosody settings"""
+
+    volume: float = _DEFAULT_VOLUME
+
+
+# -----------------------------------------------------------------------------
+
+_DEFAULT_VOLUME_MAP = {
+    "default": _DEFAULT_VOLUME,
+    "x-loud": _DEFAULT_VOLUME,
+    "loud": _DEFAULT_VOLUME * 0.8,
+    "medium": _DEFAULT_VOLUME * 0.5,
+    "soft": _DEFAULT_VOLUME * 0.3,
+    "x-soft": _DEFAULT_VOLUME * 0.1,
+    "silent": 0.0,
+}
+
+
+@dataclass
+class SSMLSettings:
+    volume_map: typing.Mapping[str, float] = field(
+        default_factory=lambda: _DEFAULT_VOLUME_MAP
+    )
+

 # -----------------------------------------------------------------------------

@ -67,17 +101,23 @@ class SSMLSpeaker:
    See: https://www.w3.org/TR/speech-synthesis11/
    """

-    def __init__(self, tts: TextToSpeechSystem):
+    def __init__(
+        self, tts: TextToSpeechSystem, settings: typing.Optional[SSMLSettings] = None
+    ):
+        self.tts = tts
+        self.settings = settings or SSMLSettings()
+
        self._state_stack: typing.List[ParsingState] = [ParsingState.DEFAULT]
        self._element_stack: typing.List[etree.Element] = []
        self._voice_stack: typing.List[str] = []
        self._lang_stack: typing.List[str] = []
        self._interpret_as: typing.Optional[str] = None
        self._say_as_format: typing.Optional[str] = None
-        self.tts = tts
+        self._prosody_stack: typing.List[ProsodyState] = []

        self._default_voice = self.tts.voice
        self._default_lang = self.tts.language
+        self._default_prosody = ProsodyState()

    def speak(
        self, ssml: typing.Union[str, etree.Element]
@ -120,6 +160,8 @@ class SSMLSpeaker:
                    self._handle_end_say_as()
                elif end_tag == "lang":
                    self._handle_end_lang()
+                elif end_tag == "prosody":
+                    self._handle_end_prosody()
                elif end_tag in {"sub"}:
                    # Handled in handle_text
                    pass
@ -163,6 +205,8 @@ class SSMLSpeaker:
                    self._handle_begin_say_as(elem)
                elif elem_tag == "lang":
                    self._handle_begin_lang(elem)
+                elif elem_tag == "prosody":
+                    self._handle_begin_prosody(elem)
                elif elem_tag in {"metadata", "meta"}:
                    self._handle_begin_metadata()
                else:
@ -392,6 +436,33 @@ class SSMLSpeaker:

        LOG.debug("language: %s", self._lang)

+    def _handle_begin_prosody(self, elem: etree.Element):
+        """Handle <prosody>"""
+        LOG.debug("begin prosody")
+
+        # Start from current settings
+        new_prosody = ProsodyState(**dataclasses.asdict(self._prosody))
+
+        volume_str = attrib_no_namespace(elem, "volume")
+        if volume_str is not None:
+            new_prosody.volume = self._parse_volume(
+                volume_str, current_volume=self._prosody.volume
+            )
+
+        LOG.debug("prosody: %s", new_prosody)
+        self._push_prosody(new_prosody)
+
+        self.tts.volume = new_prosody.volume
+
+    def _handle_end_prosody(self):
+        """Handle </prosody>"""
+        LOG.debug("end prosody")
+        self._pop_prosody()
+
+        LOG.debug("prosody: %s", self._prosody)
+
+        self.tts.volume = self._prosody.volume
+
    # -------------------------------------------------------------------------

    @property
@ -470,6 +541,72 @@ class SSMLSpeaker:

        return self._default_voice

+    @property
+    def _prosody(self) -> ProsodyState:
+        """Get prosody settings at the top of the stack"""
+        if self._prosody_stack:
+            return self._prosody_stack[-1]
+
+        return self._default_prosody
+
+    def _push_prosody(self, new_prosody: ProsodyState):
+        """Push new prosody settings on to the stack"""
+        self._prosody_stack.append(new_prosody)
+
+    def _pop_prosody(self) -> ProsodyState:
+        """Pop prosody settings off the top of the stack"""
+        if self._prosody_stack:
+            return self._prosody_stack.pop()
+
+        return self._default_prosody
+
+    def _parse_volume(
+        self, volume_str: str, current_volume: float = _DEFAULT_VOLUME
+    ) -> float:
+        """Parse SSML volume from <prosody> into [0, 100] value"""
+        volume = current_volume
+        volume_str = volume_str.strip().lower()
+
+        # Look up by name
+        maybe_volume = self.settings.volume_map.get(volume_str)
+        if maybe_volume is not None:
+            volume = maybe_volume
+        elif volume_str:
+            is_positive_offset = False
+            is_negative_offset = False
+            is_percent = False
+
+            if volume_str[0] in {"+", "-"}:
+                if volume_str[0] == "+":
+                    is_positive_offset = True
+                else:
+                    is_negative_offset = True
+
+                volume_str = volume_str[1:]
+
+            if volume_str[-1] == "%":
+                is_percent = True
+                volume_str = volume_str[:-1]
+
+            volume_value = float(volume_str)
+            if is_percent:
+                if is_positive_offset:
+                    volume += volume * (volume_value / 100.0)
+                elif is_negative_offset:
+                    volume -= volume * (volume_value / 100.0)
+                else:
+                    # Already on a [0, 100] scale
+                    volume = volume_value
+            elif is_positive_offset:
+                volume += volume_value
+            elif is_negative_offset:
+                volume -= volume_value
+            else:
+                # Absolute value
+                volume = volume_value
+
+        return max(0, min(_DEFAULT_VOLUME, volume))
+

 # -----------------------------------------------------------------------------