diff --git a/README.md b/README.md
index 0f6b3c9..45018a7 100644
--- a/README.md
+++ b/README.md
@@ -234,15 +234,17 @@ For example:
``` xml
-
+
Welcome to the world of speech synthesis.
-
+
- This is a 2 voice.
+
+ This is a 2 voice.
+
diff --git a/mimic3-tts/README.md b/mimic3-tts/README.md
index 56cc9b4..6301c4a 100644
--- a/mimic3-tts/README.md
+++ b/mimic3-tts/README.md
@@ -172,6 +172,12 @@ A subset of [SSML](https://www.w3.org/TR/speech-synthesis11/) (Speech Synthesis
* `voice` - name or language of voice
* Name format is `tts:voice` (e.g., "glow-speak:en-us_mary_ann") or `tts:voice#speaker_id` (e.g., "coqui-tts:en_vctk#p228")
* If one of the supported languages, a preferred voice is used (override with `--preferred-voice `)
+* `<prosody attribute="value">` - change speaking attributes
+ * Supported `attribute` names:
+ * `volume` - speaking volume
+ * number in [0, 100] - 0 is silent, 100 is loudest (default)
+ * +X, -X, +X%, -X% - absolute/percent offset from current volume
+ * one of "default", "silent", "x-loud", "loud", "medium", "soft", "x-soft"
* `` - force interpretation of inner text
* `interpret-as` one of "spell-out", "date", "number", "time", or "currency"
* `format` - way to format text depending on `interpret-as`
diff --git a/mimic3-tts/mimic3_tts/const.py b/mimic3-tts/mimic3_tts/const.py
index b6b2153..d6aa290 100644
--- a/mimic3-tts/mimic3_tts/const.py
+++ b/mimic3-tts/mimic3_tts/const.py
@@ -23,3 +23,5 @@ DEFAULT_VOICES_URL_FORMAT = (
"https://github.com/MycroftAI/mimic3-voices/raw/master/voices/{lang}/{name}"
)
DEFAULT_VOICES_DOWNLOAD_DIR = Path(XDG().XDG_DATA_HOME) / "mimic3" / "voices"
+
+DEFAULT_VOLUME = 100.0
diff --git a/mimic3-tts/mimic3_tts/tts.py b/mimic3-tts/mimic3_tts/tts.py
index 71367f4..5b196c2 100644
--- a/mimic3-tts/mimic3_tts/tts.py
+++ b/mimic3-tts/mimic3_tts/tts.py
@@ -14,6 +14,7 @@
# along with this program. If not, see .
#
"""Implementation of OpenTTS for Mimic 3"""
+import audioop
import itertools
import logging
import typing
@@ -42,6 +43,7 @@ from .const import (
DEFAULT_VOICE,
DEFAULT_VOICES_DOWNLOAD_DIR,
DEFAULT_VOICES_URL_FORMAT,
+ DEFAULT_VOLUME,
)
from .download import VoiceFile, download_voice
from .voice import SPEAKER_TYPE, BreakType, Mimic3Voice
@@ -108,6 +110,9 @@ class Mimic3Settings:
share_onnx_models_between_threads: bool = True
"""If True, Onnx models are shared between threads"""
+ volume: float = DEFAULT_VOLUME
+ """Voice volume in [0, 100]"""
+
@dataclass
class Mimic3Phonemes:
@@ -292,6 +297,14 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
def language(self, new_language: str):
self.settings.language = new_language
+ @property
+ def volume(self) -> float:
+ return self.settings.volume
+
+ @volume.setter
+ def volume(self, new_volume: float):
+ self.settings.volume = max(0, min(100, new_volume))
+
def begin_utterance(self):
pass
@@ -433,6 +446,10 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
)
audio_bytes = audio.tobytes()
+
+ if settings.volume != DEFAULT_VOLUME:
+ audio_bytes = audioop.mul(audio_bytes, 2, settings.volume / 100.0)
+
return AudioResult(
sample_rate_hz=voice.config.audio.sample_rate,
audio_bytes=audio_bytes,
diff --git a/mimic3-tts/mimic3_tts/voices.json b/mimic3-tts/mimic3_tts/voices.json
index 9ba27ed..b0c1db7 100644
--- a/mimic3-tts/mimic3_tts/voices.json
+++ b/mimic3-tts/mimic3_tts/voices.json
@@ -490,6 +490,40 @@
"speakers": [],
"properties": {}
},
+ "fr_FR/tom_low": {
+ "files": {
+ "LICENSE": {
+ "size_bytes": 24947,
+ "sha256_sum": "635554793cdae1fbc549793a1565772c763e64a686dc674edeaa492c5b88e493"
+ },
+ "README.md": {
+ "size_bytes": 186,
+ "sha256_sum": "5722e7135a6487a7e88158aac85a490e155742e85fa4953d426d2f7884359475"
+ },
+ "SOURCE": {
+ "size_bytes": 50,
+ "sha256_sum": "96978fc4977928015e2999d4497f667edb562fd1a44211a31ac2c15c94ced664"
+ },
+ "config.json": {
+ "size_bytes": 3634,
+ "sha256_sum": "203dc4bafb3d3dacc0cee09959a41494ac173e53c57b3c3b75e2af1593c3859a"
+ },
+ "generator.onnx": {
+ "size_bytes": 62788375,
+ "sha256_sum": "0f9ae579eceea1dd908ad47a8196a5a1944c9f2848a89516abf8624917952d03"
+ },
+ "phoneme_map.txt": {
+ "size_bytes": 15,
+ "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+ },
+ "phonemes.txt": {
+ "size_bytes": 232,
+ "sha256_sum": "711294d0b5a0ec08ec21ca8a75184e0fee3aba1e1adcf967fe5e1ef96f6c176e"
+ }
+ },
+ "speakers": [],
+ "properties": {}
+ },
"hu_HU/diana-majlinger_low": {
"files": {
"LICENSE": {
diff --git a/opentts-abc/opentts_abc/__init__.py b/opentts-abc/opentts_abc/__init__.py
index aeb846d..0e3bf54 100644
--- a/opentts-abc/opentts_abc/__init__.py
+++ b/opentts-abc/opentts_abc/__init__.py
@@ -206,6 +206,15 @@ class TextToSpeechSystem(AbstractContextManager, metaclass=ABCMeta):
def language(self, new_language: str):
"""Set the current voice language"""
+ @property
+ @abstractmethod
+ def volume(self) -> float:
+ """Get the current volume in [0, 100]"""
+
+ @volume.setter
+ def volume(self, new_volume: float):
+ """Set the current volume in [0, 100]"""
+
def shutdown(self):
"""Called by the host program when the text to speech system should be stopped"""
diff --git a/opentts-abc/opentts_abc/ssml.py b/opentts-abc/opentts_abc/ssml.py
index 56b8aab..949fd7b 100644
--- a/opentts-abc/opentts_abc/ssml.py
+++ b/opentts-abc/opentts_abc/ssml.py
@@ -14,12 +14,13 @@
# along with this program. If not, see .
#
"""Support for Speech Synthesis Markup Language (SSML)"""
+import dataclasses
import enum
import logging
import re
import typing
import xml.etree.ElementTree as etree
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from opentts_abc import BaseResult, Phonemes, SayAs, TextToSpeechSystem, Word
@@ -57,6 +58,39 @@ class ParsingState(int, enum.Enum):
IN_SAY_AS = enum.auto()
"""Inside """
+ IN_PROSODY = enum.auto()
+ """Inside <prosody>"""
+
+
+_DEFAULT_VOLUME: float = 100.0
+
+
+@dataclass
+class ProsodyState:
+ """Current prosody settings"""
+
+ volume: float = _DEFAULT_VOLUME
+
+
+# -----------------------------------------------------------------------------
+
+_DEFAULT_VOLUME_MAP = {
+ "default": _DEFAULT_VOLUME,
+ "x-loud": _DEFAULT_VOLUME,
+ "loud": _DEFAULT_VOLUME * 0.8,
+ "medium": _DEFAULT_VOLUME * 0.5,
+ "soft": _DEFAULT_VOLUME * 0.3,
+ "x-soft": _DEFAULT_VOLUME * 0.1,
+ "silent": 0.0,
+}
+
+
+@dataclass
+class SSMLSettings:
+ volume_map: typing.Mapping[str, float] = field(
+ default_factory=lambda: _DEFAULT_VOLUME_MAP
+ )
+
# -----------------------------------------------------------------------------
@@ -67,17 +101,23 @@ class SSMLSpeaker:
See: https://www.w3.org/TR/speech-synthesis11/
"""
- def __init__(self, tts: TextToSpeechSystem):
+ def __init__(
+ self, tts: TextToSpeechSystem, settings: typing.Optional[SSMLSettings] = None
+ ):
+ self.tts = tts
+ self.settings = settings or SSMLSettings()
+
self._state_stack: typing.List[ParsingState] = [ParsingState.DEFAULT]
self._element_stack: typing.List[etree.Element] = []
self._voice_stack: typing.List[str] = []
self._lang_stack: typing.List[str] = []
self._interpret_as: typing.Optional[str] = None
self._say_as_format: typing.Optional[str] = None
- self.tts = tts
+ self._prosody_stack: typing.List[ProsodyState] = []
self._default_voice = self.tts.voice
self._default_lang = self.tts.language
+ self._default_prosody = ProsodyState()
def speak(
self, ssml: typing.Union[str, etree.Element]
@@ -120,6 +160,8 @@ class SSMLSpeaker:
self._handle_end_say_as()
elif end_tag == "lang":
self._handle_end_lang()
+ elif end_tag == "prosody":
+ self._handle_end_prosody()
elif end_tag in {"sub"}:
# Handled in handle_text
pass
@@ -163,6 +205,8 @@ class SSMLSpeaker:
self._handle_begin_say_as(elem)
elif elem_tag == "lang":
self._handle_begin_lang(elem)
+ elif elem_tag == "prosody":
+ self._handle_begin_prosody(elem)
elif elem_tag in {"metadata", "meta"}:
self._handle_begin_metadata()
else:
@@ -392,6 +436,33 @@ class SSMLSpeaker:
LOG.debug("language: %s", self._lang)
+ def _handle_begin_prosody(self, elem: etree.Element):
+ """Handle <prosody>"""
+ LOG.debug("begin prosody")
+
+ # Start from current settings
+ new_prosody = ProsodyState(**dataclasses.asdict(self._prosody))
+
+ volume_str = attrib_no_namespace(elem, "volume")
+ if volume_str is not None:
+ new_prosody.volume = self._parse_volume(
+ volume_str, current_volume=self._prosody.volume
+ )
+
+ LOG.debug("prosody: %s", new_prosody)
+ self._push_prosody(new_prosody)
+
+ self.tts.volume = new_prosody.volume
+
+ def _handle_end_prosody(self):
+ """Handle </prosody>"""
+ LOG.debug("end prosody")
+ self._pop_prosody()
+
+ LOG.debug("prosody: %s", self._prosody)
+
+ self.tts.volume = self._prosody.volume
+
# -------------------------------------------------------------------------
@property
@@ -470,6 +541,72 @@ class SSMLSpeaker:
return self._default_voice
+ @property
+ def _prosody(self) -> ProsodyState:
+ """Get prosody settings at the top of the stack"""
+ if self._prosody_stack:
+ return self._prosody_stack[-1]
+
+ return self._default_prosody
+
+ def _push_prosody(self, new_prosody: ProsodyState):
+ """Push new prosody settings on to the stack"""
+ self._prosody_stack.append(new_prosody)
+
+ def _pop_prosody(self) -> ProsodyState:
+ """Pop prosody settings off the top of the stack"""
+ if self._prosody_stack:
+ return self._prosody_stack.pop()
+
+ return self._default_prosody
+
+ def _parse_volume(
+ self, volume_str: str, current_volume: float = _DEFAULT_VOLUME
+ ) -> float:
+ """Parse SSML volume from <prosody> into [0, 100] value"""
+ volume = current_volume
+ volume_str = volume_str.strip().lower()
+
+ # Look up by name
+ maybe_volume = self.settings.volume_map.get(volume_str)
+ if maybe_volume is not None:
+ volume = maybe_volume
+ elif volume_str:
+ is_positive_offset = False
+ is_negative_offset = False
+ is_percent = False
+
+ if volume_str[0] in {"+", "-"}:
+ if volume_str[0] == "+":
+ is_positive_offset = True
+ else:
+ is_negative_offset = True
+
+ volume_str = volume_str[1:]
+
+ if volume_str[-1] == "%":
+ is_percent = True
+ volume_str = volume_str[:-1]
+
+ volume_value = float(volume_str)
+ if is_percent:
+ if is_positive_offset:
+ volume += volume * (volume_value / 100.0)
+ elif is_negative_offset:
+ volume -= volume * (volume_value / 100.0)
+ else:
+ # Already on a [0, 100] scale
+ volume = volume_value
+ elif is_positive_offset:
+ volume += volume_value
+ elif is_negative_offset:
+ volume -= volume_value
+ else:
+ # Absolute value
+ volume = volume_value
+
+ return max(0, min(_DEFAULT_VOLUME, volume))
+
# -----------------------------------------------------------------------------