Add <prosody volume="..."> in SSML

This commit is contained in:
Michael Hansen 2022-04-08 11:23:51 -04:00
commit 668a46f16d
7 changed files with 213 additions and 6 deletions

View file

@ -234,15 +234,17 @@ For example:
``` xml
<speak>
<voice name="en_UK/apope">
<voice name="en_UK/apope_low">
<s>
Welcome to the world of speech synthesis.
</s>
</voice>
<break time="3s" />
<voice name="en_US/cmu-arctic#slt">
<voice name="en_US/cmu-arctic_low#slt">
<s>
This is a <say-as interpret-as="number" format="ordinal">2</say-as> voice.
<prosody volume="soft">
This is a <say-as interpret-as="number" format="ordinal">2</say-as> voice.
</prosody>
</s>
</voice>
</speak>

View file

@ -172,6 +172,12 @@ A subset of [SSML](https://www.w3.org/TR/speech-synthesis11/) (Speech Synthesis
* `voice` - name or language of voice
* Name format is `tts:voice` (e.g., "glow-speak:en-us_mary_ann") or `tts:voice#speaker_id` (e.g., "coqui-tts:en_vctk#p228")
* If one of the supported languages, a preferred voice is used (override with `--preferred-voice <lang> <voice>`)
* `<prosody attribute="value">` - change speaking attributes
* Supported `attribute` names:
* `volume` - speaking volume
* number in [0, 100] - 0 is silent, 100 is loudest (default)
* +X, -X, +X%, -X% - offset from the current volume, either in absolute units (+X, -X) or as a percentage of the current volume (+X%, -X%)
* one of "default", "silent", "x-loud", "loud", "medium", "soft", "x-soft"
* `<say-as interpret-as="">` - force interpretation of inner text
* `interpret-as` one of "spell-out", "date", "number", "time", or "currency"
* `format` - way to format text depending on `interpret-as`

View file

@ -23,3 +23,5 @@ DEFAULT_VOICES_URL_FORMAT = (
"https://github.com/MycroftAI/mimic3-voices/raw/master/voices/{lang}/{name}"
)
DEFAULT_VOICES_DOWNLOAD_DIR = Path(XDG().XDG_DATA_HOME) / "mimic3" / "voices"
DEFAULT_VOLUME = 100.0

View file

@ -14,6 +14,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
"""Implementation of OpenTTS for Mimic 3"""
import audioop
import itertools
import logging
import typing
@ -42,6 +43,7 @@ from .const import (
DEFAULT_VOICE,
DEFAULT_VOICES_DOWNLOAD_DIR,
DEFAULT_VOICES_URL_FORMAT,
DEFAULT_VOLUME,
)
from .download import VoiceFile, download_voice
from .voice import SPEAKER_TYPE, BreakType, Mimic3Voice
@ -108,6 +110,9 @@ class Mimic3Settings:
share_onnx_models_between_threads: bool = True
"""If True, Onnx models are shared between threads"""
volume: float = DEFAULT_VOLUME
"""Voice volume in [0, 100]"""
@dataclass
class Mimic3Phonemes:
@ -292,6 +297,14 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
def language(self, new_language: str):
self.settings.language = new_language
@property
def volume(self) -> float:
    """Return the voice volume held in ``self.settings``"""
    settings = self.settings
    return settings.volume

@volume.setter
def volume(self, new_volume: float):
    """Store a new voice volume, limited to the range [0, 100]"""
    # Apply the upper bound first, then the lower bound
    capped = min(100, new_volume)
    self.settings.volume = max(0, capped)
def begin_utterance(self):
pass
@ -433,6 +446,10 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
)
audio_bytes = audio.tobytes()
if settings.volume != DEFAULT_VOLUME:
audio_bytes = audioop.mul(audio_bytes, 2, settings.volume / 100.0)
return AudioResult(
sample_rate_hz=voice.config.audio.sample_rate,
audio_bytes=audio_bytes,

View file

@ -490,6 +490,40 @@
"speakers": [],
"properties": {}
},
"fr_FR/tom_low": {
"files": {
"LICENSE": {
"size_bytes": 24947,
"sha256_sum": "635554793cdae1fbc549793a1565772c763e64a686dc674edeaa492c5b88e493"
},
"README.md": {
"size_bytes": 186,
"sha256_sum": "5722e7135a6487a7e88158aac85a490e155742e85fa4953d426d2f7884359475"
},
"SOURCE": {
"size_bytes": 50,
"sha256_sum": "96978fc4977928015e2999d4497f667edb562fd1a44211a31ac2c15c94ced664"
},
"config.json": {
"size_bytes": 3634,
"sha256_sum": "203dc4bafb3d3dacc0cee09959a41494ac173e53c57b3c3b75e2af1593c3859a"
},
"generator.onnx": {
"size_bytes": 62788375,
"sha256_sum": "0f9ae579eceea1dd908ad47a8196a5a1944c9f2848a89516abf8624917952d03"
},
"phoneme_map.txt": {
"size_bytes": 15,
"sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
},
"phonemes.txt": {
"size_bytes": 232,
"sha256_sum": "711294d0b5a0ec08ec21ca8a75184e0fee3aba1e1adcf967fe5e1ef96f6c176e"
}
},
"speakers": [],
"properties": {}
},
"hu_HU/diana-majlinger_low": {
"files": {
"LICENSE": {

View file

@ -206,6 +206,15 @@ class TextToSpeechSystem(AbstractContextManager, metaclass=ABCMeta):
def language(self, new_language: str):
"""Set the current voice language"""
@property
@abstractmethod
def volume(self) -> float:
    """Get the current volume in [0, 100]

    Abstract: every concrete text to speech system must provide this getter.
    """

@volume.setter
def volume(self, new_volume: float):
    """Set the current volume in [0, 100]

    The base implementation has no body beyond this docstring, so assigning
    a volume does nothing unless a subclass overrides the setter.
    """
def shutdown(self):
"""Called by the host program when the text to speech system should be stopped"""

View file

@ -14,12 +14,13 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
"""Support for Speech Synthesis Markup Language (SSML)"""
import dataclasses
import enum
import logging
import re
import typing
import xml.etree.ElementTree as etree
from dataclasses import dataclass
from dataclasses import dataclass, field
from opentts_abc import BaseResult, Phonemes, SayAs, TextToSpeechSystem, Word
@ -57,6 +58,39 @@ class ParsingState(int, enum.Enum):
IN_SAY_AS = enum.auto()
"""Inside <say-as>"""
IN_PROSODY = enum.auto()
"""Inside <prosody>"""
_DEFAULT_VOLUME: float = 100.0
@dataclass
class ProsodyState:
    """Current prosody settings"""

    # Speaking volume on the [0, 100] scale; starts at _DEFAULT_VOLUME
    volume: float = _DEFAULT_VOLUME
# -----------------------------------------------------------------------------
# Named volume levels accepted by <prosody volume="...">, each expressed as a
# fraction of _DEFAULT_VOLUME. Serves as the default for
# SSMLSettings.volume_map. "default" and "x-loud" both select full volume;
# "silent" is always 0.
_DEFAULT_VOLUME_MAP = {
    "default": _DEFAULT_VOLUME,
    "x-loud": _DEFAULT_VOLUME,
    "loud": _DEFAULT_VOLUME * 0.8,
    "medium": _DEFAULT_VOLUME * 0.5,
    "soft": _DEFAULT_VOLUME * 0.3,
    "x-soft": _DEFAULT_VOLUME * 0.1,
    "silent": 0.0,
}
@dataclass
class SSMLSettings:
    """Settings that control how SSML is interpreted"""

    # Named <prosody volume="..."> levels -> volume on the [0, 100] scale.
    # The factory copies the module-level table so each instance owns its
    # mapping: editing one instance's map can no longer change the defaults
    # seen by every other instance.
    volume_map: typing.Mapping[str, float] = field(
        default_factory=lambda: dict(_DEFAULT_VOLUME_MAP)
    )
# -----------------------------------------------------------------------------
@ -67,17 +101,23 @@ class SSMLSpeaker:
See: https://www.w3.org/TR/speech-synthesis11/
"""
def __init__(self, tts: TextToSpeechSystem):
def __init__(
self, tts: TextToSpeechSystem, settings: typing.Optional[SSMLSettings] = None
):
self.tts = tts
self.settings = settings or SSMLSettings()
self._state_stack: typing.List[ParsingState] = [ParsingState.DEFAULT]
self._element_stack: typing.List[etree.Element] = []
self._voice_stack: typing.List[str] = []
self._lang_stack: typing.List[str] = []
self._interpret_as: typing.Optional[str] = None
self._say_as_format: typing.Optional[str] = None
self.tts = tts
self._prosody_stack: typing.List[ProsodyState] = []
self._default_voice = self.tts.voice
self._default_lang = self.tts.language
self._default_prosody = ProsodyState()
def speak(
self, ssml: typing.Union[str, etree.Element]
@ -120,6 +160,8 @@ class SSMLSpeaker:
self._handle_end_say_as()
elif end_tag == "lang":
self._handle_end_lang()
elif end_tag == "prosody":
self._handle_end_prosody()
elif end_tag in {"sub"}:
# Handled in handle_text
pass
@ -163,6 +205,8 @@ class SSMLSpeaker:
self._handle_begin_say_as(elem)
elif elem_tag == "lang":
self._handle_begin_lang(elem)
elif elem_tag == "prosody":
self._handle_begin_prosody(elem)
elif elem_tag in {"metadata", "meta"}:
self._handle_begin_metadata()
else:
@ -392,6 +436,33 @@ class SSMLSpeaker:
LOG.debug("language: %s", self._lang)
def _handle_begin_prosody(self, elem: etree.Element):
    """Enter a <prosody> element.

    Copies the prosody settings currently in effect, applies the element's
    ``volume`` attribute (if present) on top of them, pushes the result on
    to the prosody stack and passes the resulting volume to the TTS system.
    """
    LOG.debug("begin prosody")

    # Nested elements inherit whatever the enclosing scope established
    inherited = self._prosody
    updated = ProsodyState(**dataclasses.asdict(inherited))

    raw_volume = attrib_no_namespace(elem, "volume")
    if raw_volume is not None:
        updated.volume = self._parse_volume(
            raw_volume, current_volume=inherited.volume
        )

    LOG.debug("prosody: %s", updated)

    self._push_prosody(updated)
    self.tts.volume = updated.volume
def _handle_end_prosody(self):
    """Leave a <prosody> element.

    Discards the settings pushed for the element and hands the volume of
    whatever is now on top of the stack back to the TTS system.
    """
    LOG.debug("end prosody")
    self._pop_prosody()

    restored = self._prosody
    LOG.debug("prosody: %s", restored)
    self.tts.volume = restored.volume
# -------------------------------------------------------------------------
@property
@ -470,6 +541,72 @@ class SSMLSpeaker:
return self._default_voice
@property
def _prosody(self) -> ProsodyState:
    """Prosody settings in effect: top of the stack, else the defaults"""
    stack = self._prosody_stack
    return stack[-1] if stack else self._default_prosody
def _push_prosody(self, new_prosody: ProsodyState):
    """Make ``new_prosody`` the active settings by placing it on the stack"""
    stack = self._prosody_stack
    stack.append(new_prosody)
def _pop_prosody(self) -> ProsodyState:
    """Pop prosody settings off the top of the stack.

    Returns the removed settings, or the default settings when the stack
    is already empty.
    """
    stack = self._prosody_stack
    if not stack:
        return self._default_prosody

    return stack.pop()
def _parse_volume(
    self, volume_str: str, current_volume: float = _DEFAULT_VOLUME
) -> float:
    """Parse SSML volume from <prosody> into [0, 100] value.

    Accepted forms (case and surrounding whitespace are ignored):

    * a name found in ``self.settings.volume_map`` (e.g. "soft")
    * ``X``   - used as the new volume
    * ``X%``  - used as the new volume (already on a [0, 100] scale)
    * ``+X`` / ``-X``   - added to / subtracted from ``current_volume``
    * ``+X%`` / ``-X%`` - raises / lowers ``current_volume`` by X percent

    Args:
        volume_str: raw text of the ``volume`` attribute.
        current_volume: volume in effect before this element; the base for
            relative offsets and the fallback when nothing can be parsed.

    Returns:
        The resulting volume clamped to [0, _DEFAULT_VOLUME]. A value that
        cannot be parsed (a lone sign or "%", non-numeric text) leaves
        ``current_volume`` in effect and logs a warning rather than
        raising, so one bad attribute does not abort the whole document.
    """
    volume = current_volume
    text = volume_str.strip().lower()

    # Look up by name
    named_volume = self.settings.volume_map.get(text)
    if named_volume is not None:
        volume = named_volume
    elif text:
        # sign is +1/-1 for a relative offset, 0 for an absolute value
        sign = 0
        if text[0] in {"+", "-"}:
            sign = 1 if text[0] == "+" else -1
            text = text[1:]

        # endswith() is safe on the empty string left behind by a lone sign,
        # where indexing the last character would raise IndexError
        is_percent = text.endswith("%")
        if is_percent:
            text = text[:-1]

        try:
            amount = float(text)
        except ValueError:
            LOG.warning("Ignoring invalid prosody volume: %r", volume_str)
        else:
            if sign == 0:
                # Absolute value, with or without "%"
                volume = amount
            elif is_percent:
                volume += sign * volume * (amount / 100.0)
            else:
                volume += sign * amount

    # Float bounds keep the return type a float even when clamped
    return max(0.0, min(_DEFAULT_VOLUME, volume))
# -----------------------------------------------------------------------------