From 40a74df2e61d3525109048497bfdf9fb66235fb6 Mon Sep 17 00:00:00 2001
From: Michael Hansen <mike@rhasspy.org>
Date: Fri, 25 Mar 2022 17:48:47 -0400
Subject: [PATCH] Add SSML to web server

---
 mimic3-http/README.md                        |  7 +++
 mimic3-http/mimic3_http/__main__.py          | 64 ++++++++++++--------
 mimic3-http/mimic3_http/py.typed             |  0
 mimic3-http/mimic3_http/templates/index.html |  6 ++
 opentts-abc/opentts_abc/ssml.py              |  6 +-
 5 files changed, 56 insertions(+), 27 deletions(-)
 create mode 100644 mimic3-http/README.md
 create mode 100644 mimic3-http/mimic3_http/py.typed
diff --git a/mimic3-http/README.md b/mimic3-http/README.md
new file mode 100644
index 0000000..cfac3c6
--- /dev/null
+++ b/mimic3-http/README.md
@@ -0,0 +1,7 @@
+# Mimic 3 Web Server
+
+
+## Server
+
+
+## Client
diff --git a/mimic3-http/mimic3_http/__main__.py b/mimic3-http/mimic3_http/__main__.py
index c390dfc..e400681 100644
--- a/mimic3-http/mimic3_http/__main__.py
+++ b/mimic3-http/mimic3_http/__main__.py
@@ -30,7 +30,12 @@ from uuid import uuid4
 
 import hypercorn
 import quart_cors
-from mimic3_tts import AudioResult, Mimic3Settings, Mimic3TextToSpeechSystem
+from mimic3_tts import (
+    AudioResult,
+    Mimic3Settings,
+    Mimic3TextToSpeechSystem,
+    SSMLSpeaker,
+)
 from quart import (
     Quart,
     Response,
@@ -40,21 +45,19 @@ from quart import (
     send_from_directory,
 )
 
+from ._resources import _PACKAGE, __version__, _DIR
+
 _LOGGER = logging.getLogger(__name__)
 
 _MISSING = object()
 _TEMP_DIR: typing.Optional[Path] = None
 
-_PACKAGE = "mimic3_http"
-_DIR = Path(__file__).parent
 
 # -----------------------------------------------------------------------------
 
 parser = argparse.ArgumentParser(prog=_PACKAGE)
 parser.add_argument(
-    "--voices-dir",
-    action="append",
-    help="Directory with <language>/<voice> structure",
+    "--voices-dir", action="append", help="Directory with <language>/<voice> structure",
 )
 parser.add_argument("--voice", help="Default voice (name of model directory)")
 parser.add_argument(
@@ -65,9 +68,7 @@ parser.add_argument(
 )
 parser.add_argument("--speaker", type=int, help="Default speaker to use (name or id)")
 parser.add_argument(
-    "--noise-scale",
-    type=float,
-    help="Noise scale [0-1], default is 0.667",
+    "--noise-scale", type=float, help="Noise scale [0-1], default is 0.667",
 )
 parser.add_argument(
     "--length-scale",
@@ -75,9 +76,7 @@ parser.add_argument(
     help="Length scale (1.0 is default speed, 0.5 is 2x faster)",
 )
 parser.add_argument(
-    "--noise-w",
-    type=float,
-    help="Variation in cadence [0-1], default is 0.8",
+    "--noise-w", type=float, help="Variation in cadence [0-1], default is 0.8",
 )
 parser.add_argument(
     "--cache-dir",
@@ -182,21 +181,32 @@ def text_to_wav(params: TextToWavParams, no_cache: bool = False) -> bytes:
         wav_params_set = False
 
         with wav_file:
-            # TODO: SSML
-            mimic3.begin_utterance()
-            mimic3.speak_text(params.text, text_language=params.text_language)
-            results = mimic3.end_utterance()
+            try:
+                if params.ssml:
+                    results = SSMLSpeaker(mimic3).speak(params.text)
+                else:
+                    mimic3.begin_utterance()
+                    mimic3.speak_text(params.text, text_language=params.text_language)
+                    results = mimic3.end_utterance()
 
-            for result in results:
-                # TODO: Marks
-                if isinstance(result, AudioResult):
-                    if not wav_params_set:
-                        wav_file.setframerate(result.sample_rate_hz)
-                        wav_file.setsampwidth(result.sample_width_bytes)
-                        wav_file.setnchannels(result.num_channels)
-                        wav_params_set = True
+                for result in results:
+                    # TODO: Marks
+                    if isinstance(result, AudioResult):
+                        if not wav_params_set:
+                            wav_file.setframerate(result.sample_rate_hz)
+                            wav_file.setsampwidth(result.sample_width_bytes)
+                            wav_file.setnchannels(result.num_channels)
+                            wav_params_set = True
 
-                    wav_file.writeframes(result.audio_bytes)
+                        wav_file.writeframes(result.audio_bytes)
+            finally:
+                if not wav_params_set:
+                    # Wave_write.close() raises if parameters are unset; use
+                    # defaults so a pending exception propagates and an
+                    # utterance with no audio yields an empty WAV.
+                    wav_file.setframerate(22050)
+                    wav_file.setsampwidth(2)
+                    wav_file.setnchannels(1)
 
         wav_bytes = wav_io.getvalue()
 
@@ -305,7 +315,9 @@ async def app_tts() -> Response:
 
 @app.route("/api/voices", methods=["GET"])
 async def api_voices():
-    return jsonify([dataclasses.asdict(v) for v in mimic3.get_voices()])
+    voices_dict = {v.key: v for v in mimic3.get_voices()}
+    voices = sorted(voices_dict.values(), key=lambda v: v.key)
+    return jsonify([dataclasses.asdict(v) for v in voices])
 
 
 @app.route("/process", methods=["GET", "POST"])
diff --git a/mimic3-http/mimic3_http/py.typed b/mimic3-http/mimic3_http/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/mimic3-http/mimic3_http/templates/index.html b/mimic3-http/mimic3_http/templates/index.html
index 0df266c..508d305 100644
--- a/mimic3-http/mimic3_http/templates/index.html
+++ b/mimic3-http/mimic3_http/templates/index.html
@@ -66,6 +66,10 @@
                     <select id="speaker-list" name="speaker">
                     </select>
                 </div>
+                <div class="col-auto">
+                  <input type="checkbox" id="ssml">
+                  <label class="ml-1" for="ssml">SSML</label>
+                </div>
             </div>
             <div id="audio-message" class="row mt-3" hidden>
                 <div class="col">
@@ -131,6 +135,7 @@
              }
 
              var textLanguage = q('#text-language').value || ''
+             var ssml = q('#ssml').checked ? 'true' : 'false'  // checkbox .value is always "on"; .checked holds the state
 
              q('#audio-message').hidden = false
 
@@ -142,6 +147,7 @@
                  '&noiseScale=' + encodeURIComponent(noiseScale) +
                  '&noiseW=' + encodeURIComponent(noiseW) +
                  '&lengthScale=' + encodeURIComponent(lengthScale) +
+                 '&ssml=' + encodeURIComponent(ssml) +
                  '&textLanguage=' + encodeURIComponent(textLanguage),
                  {cache: 'no-cache'})
 
diff --git a/opentts-abc/opentts_abc/ssml.py b/opentts-abc/opentts_abc/ssml.py
index db3cbe5..56b8aab 100644
--- a/opentts-abc/opentts_abc/ssml.py
+++ b/opentts-abc/opentts_abc/ssml.py
@@ -87,7 +87,11 @@ class SSMLSpeaker:
         if isinstance(ssml, etree.Element):
             root_element = ssml
         else:
-            root_element = etree.fromstring(ssml)
+            try:
+                root_element = etree.fromstring(ssml)
+            except etree.ParseError:
+                # Try again wrapped in <speak>
+                root_element = etree.fromstring(f"<speak>{ssml}</speak>")
 
         # Process sub-elements and text chunks
         for elem_or_text in text_and_elements(root_element):