Add <prosody volume="..."> in SSML
This commit is contained in:
parent
e56b4579a6
commit
668a46f16d
7 changed files with 213 additions and 6 deletions
|
|
@ -234,15 +234,17 @@ For example:
|
|||
|
||||
``` xml
|
||||
<speak>
|
||||
<voice name="en_UK/apope">
|
||||
<voice name="en_UK/apope_low">
|
||||
<s>
|
||||
Welcome to the world of speech synthesis.
|
||||
</s>
|
||||
</voice>
|
||||
<break time="3s" />
|
||||
<voice name="en_US/cmu-arctic#slt">
|
||||
<voice name="en_US/cmu-arctic_low#slt">
|
||||
<s>
|
||||
This is a <say-as interpret-as="number" format="ordinal">2</say-as> voice.
|
||||
<prosody volume="soft">
|
||||
This is a <say-as interpret-as="number" format="ordinal">2</say-as> voice.
|
||||
</prosody>
|
||||
</s>
|
||||
</voice>
|
||||
</speak>
|
||||
|
|
|
|||
|
|
@ -172,6 +172,12 @@ A subset of [SSML](https://www.w3.org/TR/speech-synthesis11/) (Speech Synthesis
|
|||
* `voice` - name or language of voice
|
||||
* Name format is `tts:voice` (e.g., "glow-speak:en-us_mary_ann") or `tts:voice#speaker_id` (e.g., "coqui-tts:en_vctk#p228")
|
||||
* If one of the supported languages, a preferred voice is used (override with `--preferred-voice <lang> <voice>`)
|
||||
* `<prosody attribute="value">` - change speaking attributes
|
||||
* Supported `attribute` names:
|
||||
* `volume` - speaking volume
|
||||
* number in [0, 100] - 0 is silent, 100 is loudest (default)
|
||||
* +X, -X, +X%, -X% - absolute/percent offset from current volume
|
||||
* one of "default", "silent", "x-loud", "loud", "medium", "soft", "x-soft"
|
||||
* `<say-as interpret-as="">` - force interpretation of inner text
|
||||
* `interpret-as` one of "spell-out", "date", "number", "time", or "currency"
|
||||
* `format` - way to format text depending on `interpret-as`
|
||||
|
|
|
|||
|
|
@ -23,3 +23,5 @@ DEFAULT_VOICES_URL_FORMAT = (
|
|||
"https://github.com/MycroftAI/mimic3-voices/raw/master/voices/{lang}/{name}"
|
||||
)
|
||||
DEFAULT_VOICES_DOWNLOAD_DIR = Path(XDG().XDG_DATA_HOME) / "mimic3" / "voices"
|
||||
|
||||
DEFAULT_VOLUME = 100.0
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@
|
|||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
"""Implementation of OpenTTS for Mimic 3"""
|
||||
import audioop
|
||||
import itertools
|
||||
import logging
|
||||
import typing
|
||||
|
|
@ -42,6 +43,7 @@ from .const import (
|
|||
DEFAULT_VOICE,
|
||||
DEFAULT_VOICES_DOWNLOAD_DIR,
|
||||
DEFAULT_VOICES_URL_FORMAT,
|
||||
DEFAULT_VOLUME,
|
||||
)
|
||||
from .download import VoiceFile, download_voice
|
||||
from .voice import SPEAKER_TYPE, BreakType, Mimic3Voice
|
||||
|
|
@ -108,6 +110,9 @@ class Mimic3Settings:
|
|||
share_onnx_models_between_threads: bool = True
|
||||
"""If True, Onnx models are shared between threads"""
|
||||
|
||||
volume: float = DEFAULT_VOLUME
|
||||
"""Voice volume in [0, 100]"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class Mimic3Phonemes:
|
||||
|
|
@ -292,6 +297,14 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
def language(self, new_language: str):
|
||||
self.settings.language = new_language
|
||||
|
||||
@property
def volume(self) -> float:
    """Return the voice volume stored in the current settings."""
    return self.settings.volume

@volume.setter
def volume(self, new_volume: float):
    """Store a new voice volume, clamped to the range [0, 100]."""
    # Apply the upper bound first, then the lower bound
    upper_bounded = min(100, new_volume)
    self.settings.volume = max(0, upper_bounded)
|
||||
|
||||
def begin_utterance(self):
|
||||
pass
|
||||
|
||||
|
|
@ -433,6 +446,10 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
)
|
||||
|
||||
audio_bytes = audio.tobytes()
|
||||
|
||||
if settings.volume != DEFAULT_VOLUME:
|
||||
audio_bytes = audioop.mul(audio_bytes, 2, settings.volume / 100.0)
|
||||
|
||||
return AudioResult(
|
||||
sample_rate_hz=voice.config.audio.sample_rate,
|
||||
audio_bytes=audio_bytes,
|
||||
|
|
|
|||
|
|
@ -490,6 +490,40 @@
|
|||
"speakers": [],
|
||||
"properties": {}
|
||||
},
|
||||
"fr_FR/tom_low": {
|
||||
"files": {
|
||||
"LICENSE": {
|
||||
"size_bytes": 24947,
|
||||
"sha256_sum": "635554793cdae1fbc549793a1565772c763e64a686dc674edeaa492c5b88e493"
|
||||
},
|
||||
"README.md": {
|
||||
"size_bytes": 186,
|
||||
"sha256_sum": "5722e7135a6487a7e88158aac85a490e155742e85fa4953d426d2f7884359475"
|
||||
},
|
||||
"SOURCE": {
|
||||
"size_bytes": 50,
|
||||
"sha256_sum": "96978fc4977928015e2999d4497f667edb562fd1a44211a31ac2c15c94ced664"
|
||||
},
|
||||
"config.json": {
|
||||
"size_bytes": 3634,
|
||||
"sha256_sum": "203dc4bafb3d3dacc0cee09959a41494ac173e53c57b3c3b75e2af1593c3859a"
|
||||
},
|
||||
"generator.onnx": {
|
||||
"size_bytes": 62788375,
|
||||
"sha256_sum": "0f9ae579eceea1dd908ad47a8196a5a1944c9f2848a89516abf8624917952d03"
|
||||
},
|
||||
"phoneme_map.txt": {
|
||||
"size_bytes": 15,
|
||||
"sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
|
||||
},
|
||||
"phonemes.txt": {
|
||||
"size_bytes": 232,
|
||||
"sha256_sum": "711294d0b5a0ec08ec21ca8a75184e0fee3aba1e1adcf967fe5e1ef96f6c176e"
|
||||
}
|
||||
},
|
||||
"speakers": [],
|
||||
"properties": {}
|
||||
},
|
||||
"hu_HU/diana-majlinger_low": {
|
||||
"files": {
|
||||
"LICENSE": {
|
||||
|
|
|
|||
|
|
@ -206,6 +206,15 @@ class TextToSpeechSystem(AbstractContextManager, metaclass=ABCMeta):
|
|||
def language(self, new_language: str):
|
||||
"""Set the current voice language"""
|
||||
|
||||
@property
@abstractmethod
def volume(self) -> float:
    """Return the current volume, a value in [0, 100].

    Abstract: concrete text to speech systems must provide this property.
    """

@volume.setter
def volume(self, new_volume: float):
    """Set the current volume to a value in [0, 100].

    This base implementation has no body; it does not store anything.
    """
|
||||
|
||||
def shutdown(self):
|
||||
"""Called by the host program when the text to speech system should be stopped"""
|
||||
|
||||
|
|
|
|||
|
|
@ -14,12 +14,13 @@
|
|||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
"""Support for Speech Synthesis Markup Language (SSML)"""
|
||||
import dataclasses
|
||||
import enum
|
||||
import logging
|
||||
import re
|
||||
import typing
|
||||
import xml.etree.ElementTree as etree
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from opentts_abc import BaseResult, Phonemes, SayAs, TextToSpeechSystem, Word
|
||||
|
||||
|
|
@ -57,6 +58,39 @@ class ParsingState(int, enum.Enum):
|
|||
IN_SAY_AS = enum.auto()
|
||||
"""Inside <say-as>"""
|
||||
|
||||
IN_PROSODY = enum.auto()
|
||||
"""Inside <prosody>"""
|
||||
|
||||
|
||||
_DEFAULT_VOLUME: float = 100.0


@dataclass
class ProsodyState:
    """Current prosody settings"""

    # Speaking volume; starts at the default (loudest) value
    volume: float = _DEFAULT_VOLUME


# -----------------------------------------------------------------------------

# Named SSML volume levels expressed on the same scale as _DEFAULT_VOLUME
_DEFAULT_VOLUME_MAP = {
    "default": _DEFAULT_VOLUME,
    "x-loud": _DEFAULT_VOLUME,
    "loud": _DEFAULT_VOLUME * 0.8,
    "medium": _DEFAULT_VOLUME * 0.5,
    "soft": _DEFAULT_VOLUME * 0.3,
    "x-soft": _DEFAULT_VOLUME * 0.1,
    "silent": 0.0,
}


@dataclass
class SSMLSettings:
    """Settings for interpreting SSML"""

    # Maps a volume name (e.g. "soft") to a numeric volume.
    # Each instance gets its own copy of the defaults so that customising one
    # settings object cannot alter other instances or the module-level table.
    volume_map: typing.Mapping[str, float] = field(
        default_factory=lambda: dict(_DEFAULT_VOLUME_MAP)
    )
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
|
@ -67,17 +101,23 @@ class SSMLSpeaker:
|
|||
See: https://www.w3.org/TR/speech-synthesis11/
|
||||
"""
|
||||
|
||||
def __init__(self, tts: TextToSpeechSystem):
|
||||
def __init__(
|
||||
self, tts: TextToSpeechSystem, settings: typing.Optional[SSMLSettings] = None
|
||||
):
|
||||
self.tts = tts
|
||||
self.settings = settings or SSMLSettings()
|
||||
|
||||
self._state_stack: typing.List[ParsingState] = [ParsingState.DEFAULT]
|
||||
self._element_stack: typing.List[etree.Element] = []
|
||||
self._voice_stack: typing.List[str] = []
|
||||
self._lang_stack: typing.List[str] = []
|
||||
self._interpret_as: typing.Optional[str] = None
|
||||
self._say_as_format: typing.Optional[str] = None
|
||||
self.tts = tts
|
||||
self._prosody_stack: typing.List[ProsodyState] = []
|
||||
|
||||
self._default_voice = self.tts.voice
|
||||
self._default_lang = self.tts.language
|
||||
self._default_prosody = ProsodyState()
|
||||
|
||||
def speak(
|
||||
self, ssml: typing.Union[str, etree.Element]
|
||||
|
|
@ -120,6 +160,8 @@ class SSMLSpeaker:
|
|||
self._handle_end_say_as()
|
||||
elif end_tag == "lang":
|
||||
self._handle_end_lang()
|
||||
elif end_tag == "prosody":
|
||||
self._handle_end_prosody()
|
||||
elif end_tag in {"sub"}:
|
||||
# Handled in handle_text
|
||||
pass
|
||||
|
|
@ -163,6 +205,8 @@ class SSMLSpeaker:
|
|||
self._handle_begin_say_as(elem)
|
||||
elif elem_tag == "lang":
|
||||
self._handle_begin_lang(elem)
|
||||
elif elem_tag == "prosody":
|
||||
self._handle_begin_prosody(elem)
|
||||
elif elem_tag in {"metadata", "meta"}:
|
||||
self._handle_begin_metadata()
|
||||
else:
|
||||
|
|
@ -392,6 +436,33 @@ class SSMLSpeaker:
|
|||
|
||||
LOG.debug("language: %s", self._lang)
|
||||
|
||||
def _handle_begin_prosody(self, elem: etree.Element):
    """Handle <prosody>"""
    LOG.debug("begin prosody")

    active = self._prosody

    # Copy the active settings so anything this element does not set is inherited
    updated = dataclasses.replace(active)

    raw_volume = attrib_no_namespace(elem, "volume")
    if raw_volume is not None:
        updated.volume = self._parse_volume(raw_volume, current_volume=active.volume)

    LOG.debug("prosody: %s", updated)
    self._push_prosody(updated)

    # Apply the new volume to the text to speech system
    self.tts.volume = updated.volume
|
||||
|
||||
def _handle_end_prosody(self):
    """Handle </prosody>"""
    LOG.debug("end prosody")
    self._pop_prosody()

    # Whatever is now on top of the stack (or the default) becomes active again
    restored = self._prosody
    LOG.debug("prosody: %s", restored)

    self.tts.volume = restored.volume
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
@property
|
||||
|
|
@ -470,6 +541,72 @@ class SSMLSpeaker:
|
|||
|
||||
return self._default_voice
|
||||
|
||||
@property
def _prosody(self) -> ProsodyState:
    """Active prosody settings: top of the stack, or the default when it is empty"""
    return self._prosody_stack[-1] if self._prosody_stack else self._default_prosody
|
||||
|
||||
def _push_prosody(self, new_prosody: ProsodyState):
    """Make new_prosody the active settings by placing it on top of the stack"""
    self._prosody_stack.append(new_prosody)
|
||||
|
||||
def _pop_prosody(self) -> ProsodyState:
    """Remove and return the prosody settings at the top of the stack.

    When the stack is empty nothing is removed and the default prosody
    settings are returned instead.
    """
    if not self._prosody_stack:
        return self._default_prosody

    return self._prosody_stack.pop()
|
||||
|
||||
def _parse_volume(
    self, volume_str: str, current_volume: float = _DEFAULT_VOLUME
) -> float:
    """Parse SSML volume from <prosody> into [0, 100] value

    Accepted forms (case-insensitive, surrounding whitespace ignored):

    * a name present in ``self.settings.volume_map``
    * ``X`` or ``X%`` - absolute value
    * ``+X`` / ``-X`` - offset added to / subtracted from ``current_volume``
    * ``+X%`` / ``-X%`` - offset of X percent of ``current_volume``

    An empty value leaves ``current_volume`` unchanged. A value that cannot
    be parsed as a number (including a bare "+", "-" or "%") is logged as a
    warning and also leaves ``current_volume`` unchanged instead of raising.

    Returns the resulting volume clamped to [0, _DEFAULT_VOLUME].
    """
    volume = current_volume
    volume_str = volume_str.strip().lower()

    # Look up by name
    named_volume = self.settings.volume_map.get(volume_str)
    if named_volume is not None:
        volume = named_volume
    elif volume_str:
        number_str = volume_str
        sign = ""

        if number_str[0] in {"+", "-"}:
            sign = number_str[0]
            number_str = number_str[1:]

        # endswith() is safe when only a sign was given and nothing remains
        is_percent = number_str.endswith("%")
        if is_percent:
            number_str = number_str[:-1]

        try:
            volume_value = float(number_str)
        except ValueError:
            # One bad attribute should not abort the whole document
            LOG.warning("Ignoring invalid prosody volume: %s", volume_str)
        else:
            if not sign:
                # Absolute value; an unsigned percentage is already on a
                # [0, 100] scale
                volume = volume_value
            else:
                if is_percent:
                    offset = volume * (volume_value / 100.0)
                else:
                    offset = volume_value

                if sign == "+":
                    volume += offset
                else:
                    volume -= offset

    return max(0.0, min(_DEFAULT_VOLUME, volume))
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue