From 668a46f16d066e4db2785db8b97251920d8d88fc Mon Sep 17 00:00:00 2001
From: Michael Hansen <mike@rhasspy.org>
Date: Fri, 8 Apr 2022 11:23:51 -0400
Subject: [PATCH] Add <prosody volume="..."> in SSML

---
 README.md                           |   8 +-
 mimic3-tts/README.md                |   6 ++
 mimic3-tts/mimic3_tts/const.py      |   2 +
 mimic3-tts/mimic3_tts/tts.py        |  17 ++++
 mimic3-tts/mimic3_tts/voices.json   |  34 +++++++
 opentts-abc/opentts_abc/__init__.py |   9 ++
 opentts-abc/opentts_abc/ssml.py     | 143 +++++++++++++++++++++++++++-
 7 files changed, 213 insertions(+), 6 deletions(-)
diff --git a/README.md b/README.md
index 0f6b3c9..45018a7 100644
--- a/README.md
+++ b/README.md
@@ -234,15 +234,17 @@ For example:
 
 ``` xml
 <speak>
-  <voice name="en_UK/apope">
+  <voice name="en_UK/apope_low">
     <s>
       Welcome to the world of speech synthesis.
     </s>
   </voice>
   <break time="3s" />
-  <voice name="en_US/cmu-arctic#slt">
+  <voice name="en_US/cmu-arctic_low#slt">
     <s>
-      This is a <say-as interpret-as="number" format="ordinal">2</say-as> voice.
+      <prosody volume="soft">
+        This is a <say-as interpret-as="number" format="ordinal">2</say-as> voice.
+      </prosody>
     </s>
   </voice>
 </speak>
diff --git a/mimic3-tts/README.md b/mimic3-tts/README.md
index 56cc9b4..6301c4a 100644
--- a/mimic3-tts/README.md
+++ b/mimic3-tts/README.md
@@ -172,6 +172,12 @@ A subset of [SSML](https://www.w3.org/TR/speech-synthesis11/) (Speech Synthesis
     * `voice` - name or language of voice
         * Name format is `tts:voice` (e.g., "glow-speak:en-us_mary_ann") or `tts:voice#speaker_id` (e.g., "coqui-tts:en_vctk#p228")
         * If one of the supported languages, a preferred voice is used (override with `--preferred-voice <lang> <voice>`)
+* `<prosody attribute="value">` - change speaking attributes
+    * Supported `attribute` names:
+        * `volume` - speaking volume
+            * number in [0, 100] - 0 is silent, 100 is loudest (default)
+            * +X, -X, +X%, -X% - absolute (+X, -X) or percent (+X%, -X%) offset from current volume
+            * one of "default", "silent", "x-loud", "loud", "medium", "soft", "x-soft"
 * `<say-as interpret-as="">` - force interpretation of inner text
     * `interpret-as` one of "spell-out", "date", "number", "time", or "currency"
     * `format` - way to format text depending on `interpret-as`
diff --git a/mimic3-tts/mimic3_tts/const.py b/mimic3-tts/mimic3_tts/const.py
index b6b2153..d6aa290 100644
--- a/mimic3-tts/mimic3_tts/const.py
+++ b/mimic3-tts/mimic3_tts/const.py
@@ -23,3 +23,5 @@ DEFAULT_VOICES_URL_FORMAT = (
     "https://github.com/MycroftAI/mimic3-voices/raw/master/voices/{lang}/{name}"
 )
 DEFAULT_VOICES_DOWNLOAD_DIR = Path(XDG().XDG_DATA_HOME) / "mimic3" / "voices"
+
+DEFAULT_VOLUME = 100.0
diff --git a/mimic3-tts/mimic3_tts/tts.py b/mimic3-tts/mimic3_tts/tts.py
index 71367f4..5b196c2 100644
--- a/mimic3-tts/mimic3_tts/tts.py
+++ b/mimic3-tts/mimic3_tts/tts.py
@@ -14,6 +14,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 """Implementation of OpenTTS for Mimic 3"""
+import audioop
 import itertools
 import logging
 import typing
@@ -42,6 +43,7 @@ from .const import (
     DEFAULT_VOICE,
     DEFAULT_VOICES_DOWNLOAD_DIR,
     DEFAULT_VOICES_URL_FORMAT,
+    DEFAULT_VOLUME,
 )
 from .download import VoiceFile, download_voice
 from .voice import SPEAKER_TYPE, BreakType, Mimic3Voice
@@ -108,6 +110,9 @@ class Mimic3Settings:
     share_onnx_models_between_threads: bool = True
     """If True, Onnx models are shared between threads"""
 
+    volume: float = DEFAULT_VOLUME
+    """Voice volume in [0, 100]"""
+
 
 @dataclass
 class Mimic3Phonemes:
@@ -292,6 +297,14 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
     def language(self, new_language: str):
         self.settings.language = new_language
 
+    @property
+    def volume(self) -> float:
+        return self.settings.volume
+
+    @volume.setter
+    def volume(self, new_volume: float):
+        self.settings.volume = max(0, min(100, new_volume))
+
     def begin_utterance(self):
         pass
 
@@ -433,6 +446,10 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
         )
 
         audio_bytes = audio.tobytes()
+
+        if settings.volume != DEFAULT_VOLUME:
+            audio_bytes = audioop.mul(audio_bytes, 2, settings.volume / 100.0)
+
         return AudioResult(
             sample_rate_hz=voice.config.audio.sample_rate,
             audio_bytes=audio_bytes,
diff --git a/mimic3-tts/mimic3_tts/voices.json b/mimic3-tts/mimic3_tts/voices.json
index 9ba27ed..b0c1db7 100644
--- a/mimic3-tts/mimic3_tts/voices.json
+++ b/mimic3-tts/mimic3_tts/voices.json
@@ -490,6 +490,40 @@
         "speakers": [],
         "properties": {}
     },
+    "fr_FR/tom_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 24947,
+                "sha256_sum": "635554793cdae1fbc549793a1565772c763e64a686dc674edeaa492c5b88e493"
+            },
+            "README.md": {
+                "size_bytes": 186,
+                "sha256_sum": "5722e7135a6487a7e88158aac85a490e155742e85fa4953d426d2f7884359475"
+            },
+            "SOURCE": {
+                "size_bytes": 50,
+                "sha256_sum": "96978fc4977928015e2999d4497f667edb562fd1a44211a31ac2c15c94ced664"
+            },
+            "config.json": {
+                "size_bytes": 3634,
+                "sha256_sum": "203dc4bafb3d3dacc0cee09959a41494ac173e53c57b3c3b75e2af1593c3859a"
+            },
+            "generator.onnx": {
+                "size_bytes": 62788375,
+                "sha256_sum": "0f9ae579eceea1dd908ad47a8196a5a1944c9f2848a89516abf8624917952d03"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 232,
+                "sha256_sum": "711294d0b5a0ec08ec21ca8a75184e0fee3aba1e1adcf967fe5e1ef96f6c176e"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
     "hu_HU/diana-majlinger_low": {
         "files": {
             "LICENSE": {
diff --git a/opentts-abc/opentts_abc/__init__.py b/opentts-abc/opentts_abc/__init__.py
index aeb846d..0e3bf54 100644
--- a/opentts-abc/opentts_abc/__init__.py
+++ b/opentts-abc/opentts_abc/__init__.py
@@ -206,6 +206,15 @@ class TextToSpeechSystem(AbstractContextManager, metaclass=ABCMeta):
     def language(self, new_language: str):
         """Set the current voice language"""
 
+    @property
+    @abstractmethod
+    def volume(self) -> float:
+        """Get the current volume in [0, 100]"""
+
+    @volume.setter
+    def volume(self, new_volume: float):
+        """Set the current volume in [0, 100]"""
+
     def shutdown(self):
         """Called by the host program when the text to speech system should be stopped"""
 
diff --git a/opentts-abc/opentts_abc/ssml.py b/opentts-abc/opentts_abc/ssml.py
index 56b8aab..949fd7b 100644
--- a/opentts-abc/opentts_abc/ssml.py
+++ b/opentts-abc/opentts_abc/ssml.py
@@ -14,12 +14,13 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 """Support for Speech Synthesis Markup Language (SSML)"""
+import dataclasses
 import enum
 import logging
 import re
 import typing
 import xml.etree.ElementTree as etree
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 from opentts_abc import BaseResult, Phonemes, SayAs, TextToSpeechSystem, Word
 
@@ -57,6 +58,39 @@ class ParsingState(int, enum.Enum):
     IN_SAY_AS = enum.auto()
     """Inside <say-as>"""
 
+    IN_PROSODY = enum.auto()
+    """Inside <prosody>"""
+
+
+_DEFAULT_VOLUME: float = 100.0
+
+
+@dataclass
+class ProsodyState:
+    """Current prosody settings"""
+
+    volume: float = _DEFAULT_VOLUME
+
+
+# -----------------------------------------------------------------------------
+
+_DEFAULT_VOLUME_MAP = {
+    "default": _DEFAULT_VOLUME,
+    "x-loud": _DEFAULT_VOLUME,
+    "loud": _DEFAULT_VOLUME * 0.8,
+    "medium": _DEFAULT_VOLUME * 0.5,
+    "soft": _DEFAULT_VOLUME * 0.3,
+    "x-soft": _DEFAULT_VOLUME * 0.1,
+    "silent": 0.0,
+}
+
+
+@dataclass
+class SSMLSettings:
+    volume_map: typing.Mapping[str, float] = field(
+        default_factory=lambda: _DEFAULT_VOLUME_MAP
+    )
+
 
 # -----------------------------------------------------------------------------
 
@@ -67,17 +101,23 @@ class SSMLSpeaker:
     See: https://www.w3.org/TR/speech-synthesis11/
     """
 
-    def __init__(self, tts: TextToSpeechSystem):
+    def __init__(
+        self, tts: TextToSpeechSystem, settings: typing.Optional[SSMLSettings] = None
+    ):
+        self.tts = tts
+        self.settings = settings or SSMLSettings()
+
         self._state_stack: typing.List[ParsingState] = [ParsingState.DEFAULT]
         self._element_stack: typing.List[etree.Element] = []
         self._voice_stack: typing.List[str] = []
         self._lang_stack: typing.List[str] = []
         self._interpret_as: typing.Optional[str] = None
         self._say_as_format: typing.Optional[str] = None
-        self.tts = tts
+        self._prosody_stack: typing.List[ProsodyState] = []
 
         self._default_voice = self.tts.voice
         self._default_lang = self.tts.language
+        self._default_prosody = ProsodyState()
 
     def speak(
         self, ssml: typing.Union[str, etree.Element]
@@ -120,6 +160,8 @@ class SSMLSpeaker:
                     self._handle_end_say_as()
                 elif end_tag == "lang":
                     self._handle_end_lang()
+                elif end_tag == "prosody":
+                    self._handle_end_prosody()
                 elif end_tag in {"sub"}:
                     # Handled in handle_text
                     pass
@@ -163,6 +205,8 @@ class SSMLSpeaker:
                     self._handle_begin_say_as(elem)
                 elif elem_tag == "lang":
                     self._handle_begin_lang(elem)
+                elif elem_tag == "prosody":
+                    self._handle_begin_prosody(elem)
                 elif elem_tag in {"metadata", "meta"}:
                     self._handle_begin_metadata()
                 else:
@@ -392,6 +436,33 @@ class SSMLSpeaker:
 
         LOG.debug("language: %s", self._lang)
 
+    def _handle_begin_prosody(self, elem: etree.Element):
+        """Handle <prosody>"""
+        LOG.debug("begin prosody")
+
+        # Start from current settings
+        new_prosody = ProsodyState(**dataclasses.asdict(self._prosody))
+
+        volume_str = attrib_no_namespace(elem, "volume")
+        if volume_str is not None:
+            new_prosody.volume = self._parse_volume(
+                volume_str, current_volume=self._prosody.volume
+            )
+
+        LOG.debug("prosody: %s", new_prosody)
+        self._push_prosody(new_prosody)
+
+        self.tts.volume = new_prosody.volume
+
+    def _handle_end_prosody(self):
+        """Handle </prosody>"""
+        LOG.debug("end prosody")
+        self._pop_prosody()
+
+        LOG.debug("prosody: %s", self._prosody)
+
+        self.tts.volume = self._prosody.volume
+
     # -------------------------------------------------------------------------
 
     @property
@@ -470,6 +541,72 @@ class SSMLSpeaker:
 
         return self._default_voice
 
+    @property
+    def _prosody(self) -> ProsodyState:
+        """Get prosody settings at the top of the stack"""
+        if self._prosody_stack:
+            return self._prosody_stack[-1]
+
+        return self._default_prosody
+
+    def _push_prosody(self, new_prosody: ProsodyState):
+        """Push new prosody settings on to the stack"""
+        self._prosody_stack.append(new_prosody)
+
+    def _pop_prosody(self) -> ProsodyState:
+        """Pop prosody settings off the top of the stack"""
+        if self._prosody_stack:
+            return self._prosody_stack.pop()
+
+        return self._default_prosody
+
+    def _parse_volume(
+        self, volume_str: str, current_volume: float = _DEFAULT_VOLUME
+    ) -> float:
+        """Parse SSML volume from <prosody> into [0, 100] value"""
+        volume = current_volume
+        volume_str = volume_str.strip().lower()
+
+        # Look up by name
+        maybe_volume = self.settings.volume_map.get(volume_str)
+        if maybe_volume is not None:
+            volume = maybe_volume
+        elif volume_str:
+            is_positive_offset = False
+            is_negative_offset = False
+            is_percent = False
+
+            if volume_str[0] in {"+", "-"}:
+                if volume_str[0] == "+":
+                    is_positive_offset = True
+                else:
+                    is_negative_offset = True
+
+                volume_str = volume_str[1:]
+
+            if volume_str[-1] == "%":
+                is_percent = True
+                volume_str = volume_str[:-1]
+
+            volume_value = float(volume_str)
+            if is_percent:
+                if is_positive_offset:
+                    volume += volume * (volume_value / 100.0)
+                elif is_negative_offset:
+                    volume -= volume * (volume_value / 100.0)
+                else:
+                    # Already on a [0, 100] scale
+                    volume = volume_value
+            elif is_positive_offset:
+                volume += volume_value
+            elif is_negative_offset:
+                volume -= volume_value
+            else:
+                # Absolute value
+                volume = volume_value
+
+        return max(0, min(_DEFAULT_VOLUME, volume))
+
 
 # -----------------------------------------------------------------------------