Add SSML to web server

This commit is contained in:
Michael Hansen 2022-03-25 17:48:47 -04:00
commit 40a74df2e6
5 changed files with 56 additions and 27 deletions

7
mimic3-http/README.md Normal file
View file

@ -0,0 +1,7 @@
# Mimic 3 Web Server
## Server
## Client

View file

@ -30,7 +30,12 @@ from uuid import uuid4
import hypercorn
import quart_cors
from mimic3_tts import AudioResult, Mimic3Settings, Mimic3TextToSpeechSystem
from mimic3_tts import (
AudioResult,
Mimic3Settings,
Mimic3TextToSpeechSystem,
SSMLSpeaker,
)
from quart import (
Quart,
Response,
@ -40,21 +45,19 @@ from quart import (
send_from_directory,
)
from ._resources import _PACKAGE, __version__, _DIR
_LOGGER = logging.getLogger(__name__)
_MISSING = object()
_TEMP_DIR: typing.Optional[Path] = None
_PACKAGE = "mimic3_http"
_DIR = Path(__file__).parent
# -----------------------------------------------------------------------------
parser = argparse.ArgumentParser(prog=_PACKAGE)
parser.add_argument(
"--voices-dir",
action="append",
help="Directory with <language>/<voice> structure",
"--voices-dir", action="append", help="Directory with <language>/<voice> structure",
)
parser.add_argument("--voice", help="Default voice (name of model directory)")
parser.add_argument(
@ -65,9 +68,7 @@ parser.add_argument(
)
# type=int makes argparse reject non-numeric values, so only an id is accepted here
parser.add_argument("--speaker", type=int, help="Default speaker to use (id)")
parser.add_argument(
"--noise-scale",
type=float,
help="Noise scale [0-1], default is 0.667",
"--noise-scale", type=float, help="Noise scale [0-1], default is 0.667",
)
parser.add_argument(
"--length-scale",
@ -75,9 +76,7 @@ parser.add_argument(
help="Length scale (1.0 is default speed, 0.5 is 2x faster)",
)
parser.add_argument(
"--noise-w",
type=float,
help="Variation in cadence [0-1], default is 0.8",
"--noise-w", type=float, help="Variation in cadence [0-1], default is 0.8",
)
parser.add_argument(
"--cache-dir",
@ -182,21 +181,32 @@ def text_to_wav(params: TextToWavParams, no_cache: bool = False) -> bytes:
wav_params_set = False
with wav_file:
# TODO: SSML
mimic3.begin_utterance()
mimic3.speak_text(params.text, text_language=params.text_language)
results = mimic3.end_utterance()
try:
if params.ssml:
results = SSMLSpeaker(mimic3).speak(params.text)
else:
mimic3.begin_utterance()
mimic3.speak_text(params.text, text_language=params.text_language)
results = mimic3.end_utterance()
for result in results:
# TODO: Marks
if isinstance(result, AudioResult):
if not wav_params_set:
wav_file.setframerate(result.sample_rate_hz)
wav_file.setsampwidth(result.sample_width_bytes)
wav_file.setnchannels(result.num_channels)
wav_params_set = True
for result in results:
# TODO: Marks
if isinstance(result, AudioResult):
if not wav_params_set:
wav_file.setframerate(result.sample_rate_hz)
wav_file.setsampwidth(result.sample_width_bytes)
wav_file.setnchannels(result.num_channels)
wav_params_set = True
wav_file.writeframes(result.audio_bytes)
wav_file.writeframes(result.audio_bytes)
except Exception as e:
if not wav_params_set:
# Set default parameters so exception can propagate
wav_file.setframerate(22050)
wav_file.setsampwidth(2)
wav_file.setnchannels(1)
raise e
wav_bytes = wav_io.getvalue()
@ -305,7 +315,9 @@ async def app_tts() -> Response:
@app.route("/api/voices", methods=["GET"])
async def api_voices():
return jsonify([dataclasses.asdict(v) for v in mimic3.get_voices()])
voices_dict = {v.key: v for v in mimic3.get_voices()}
voices = sorted(voices_dict.values(), key=lambda v: v.key)
return jsonify([dataclasses.asdict(v) for v in voices])
@app.route("/process", methods=["GET", "POST"])

View file

View file

@ -66,6 +66,10 @@
<select id="speaker-list" name="speaker">
</select>
</div>
<div class="col-auto">
<input type="checkbox" id="ssml">
<label class="ml-1" for="ssml">SSML</label>
</div>
</div>
<div id="audio-message" class="row mt-3" hidden>
<div class="col">
@ -131,6 +135,7 @@
}
var textLanguage = q('#text-language').value || ''
// A checkbox's .value is the constant "on" whether or not it is ticked;
// .checked carries the actual state, so send that as an explicit true/false.
var ssml = q('#ssml').checked ? 'true' : 'false'
q('#audio-message').hidden = false
@ -142,6 +147,7 @@
'&noiseScale=' + encodeURIComponent(noiseScale) +
'&noiseW=' + encodeURIComponent(noiseW) +
'&lengthScale=' + encodeURIComponent(lengthScale) +
'&ssml=' + encodeURIComponent(ssml) +
'&textLanguage=' + encodeURIComponent(textLanguage),
{cache: 'no-cache'})

View file

@ -87,7 +87,11 @@ class SSMLSpeaker:
if isinstance(ssml, etree.Element):
root_element = ssml
else:
root_element = etree.fromstring(ssml)
try:
root_element = etree.fromstring(ssml)
except etree.ParseError:
# Try again wrapped in <speak>
root_element = etree.fromstring(f"<speak>{ssml}</speak>")
# Process sub-elements and text chunks
for elem_or_text in text_and_elements(root_element):