Add speaker names to web interface

2022-03-21 16:13:45 -04:00 · 2022-03-21 16:13:45 -04:00 · d4ef4744c7
commit d4ef4744c7
parent 50d4c2a6c9
5 changed files with 63 additions and 29 deletions
--- a/mimic3-http/mimic3_http/main.py
+++ b/mimic3-http/mimic3_http/main.py
@@ -21,6 +21,7 @@ import io
 import wave
 import tempfile
 import typing
+import dataclasses
 from dataclasses import dataclass
 from pathlib import Path
 from urllib.parse import parse_qs
@@ -62,9 +63,7 @@ parser.add_argument(
 parser.add_argument(
    "--port", type=int, default=59125, help="Port of HTTP server (default: 59125)"
 )
-parser.add_argument(
-    "--speaker-id", type=int, default=0, help="Default speaker id to use"
-)
+parser.add_argument("--speaker", type=lambda s: int(s) if s.isdigit() else s, help="Default speaker to use (name or id)")
 parser.add_argument(
    "--length-scale", type=float, default=1.0, help="Speed of speech (> 1 is slower)"
 )
@@ -114,7 +113,6 @@ _LOGGER.debug(args)
 class TextToWavParams:
    text: str
    voice: str = args.voice
-    speaker_id: int = args.speaker_id
    noise_scale: float = args.noise_scale
    noise_w: float = args.noise_w
    length_scale: float = args.length_scale
@@ -139,7 +137,7 @@ _WAV_CACHE: typing.Dict[TextToWavParams, Path] = {}
 mimic3 = Mimic3TextToSpeechSystem(
    Mimic3Settings(
        voice=args.voice,
-        speaker_id=args.speaker_id,
+        speaker=args.speaker,
        length_scale=args.length_scale,
        noise_scale=args.noise_scale,
        noise_w=args.noise_w,
@@ -160,7 +158,6 @@ def text_to_wav(params: TextToWavParams, no_cache: bool = False) -> bytes:
            return wav_bytes

    mimic3.voice = params.voice
-    mimic3.speaker_id = params.speaker_id

    mimic3.settings.length_scale = params.length_scale
    mimic3.settings.noise_scale = params.noise_scale
@@ -242,10 +239,6 @@ async def app_tts() -> Response:
    if voice is not None:
        tts_args["voice"] = str(voice)

-    speaker_id = request.args.get("speakerId")
-    if speaker_id is not None:
-        tts_args["speaker_id"] = int(speaker_id)
-
    # TTS settings
    noise_scale = request.args.get("noiseScale")
    if noise_scale is not None:
@@ -286,9 +279,7 @@ async def app_tts() -> Response:

 @app.route("/api/voices", methods=["GET"])
 async def api_voices():
-    voices = mimic3.get_voices()
-    voice_ids = sorted([v.name for v in voices])
-    return jsonify(voice_ids)
+    return jsonify([dataclasses.asdict(v) for v in mimic3.get_voices()])


 @app.route("/process", methods=["GET", "POST"])
@@ -308,20 +299,14 @@ async def api_process():

    voice = voice or args.voice

-    speaker_id = args.speaker_id
-    if "#" in voice:
-        voice, speaker_id_str = voice.split("#", maxsplit=1)
-        speaker_id = int(speaker_id_str)
-
    # Assume SSML if text begins with an angle bracket
    ssml = text.strip().startswith("<")

-    _LOGGER.debug("Speaking with voice '%s (speaker=%s)': %s", voice, speaker_id, text)
+    _LOGGER.debug("Speaking with voice '%s': %s", voice, text)
    wav_bytes = text_to_wav(
        TextToWavParams(
            text=text,
            voice=voice,
-            speaker_id=speaker_id,
            ssml=ssml,
            length_scale=args.length_scale,
            noise_scale=args.noise_scale,
--- a/mimic3-http/mimic3_http/templates/index.html
+++ b/mimic3-http/mimic3_http/templates/index.html
@@ -62,8 +62,9 @@
                    </select>
                </div>
                <div class="col-auto">
-                    <label for="speaker-id" title="Index of speaker">Speaker:</label>
-                    <input type="number" id="speaker-id" name="speaker_id" size="5" min="0" value="0">
+                    <label for="speaker" title="Name of speaker">Speaker:</label>
+                    <select id="speaker-list" name="speaker">
+                    </select>
                </div>
            </div>
            <div id="audio-message" class="row mt-3" hidden>
@@ -122,7 +123,13 @@
             var noiseScale = q('#noise-scale').value || '0.333'
             var noiseW = q('#noise-w').value || '1.0'
             var lengthScale = q('#length-scale').value || '1.0'
-             var speakerId = q('#speaker-id').value || '0'
+
+             var speakerList = q('#speaker-list')
+             var speaker = speakerList.options[speakerList.selectedIndex].value
+             if (speaker.length > 0) {
+                 voice = voice + "#" + speaker
+             }
+
             var textLanguage = q('#text-language').value || ''

             q('#audio-message').hidden = false
@@ -135,8 +142,7 @@
                 '&noiseScale=' + encodeURIComponent(noiseScale) +
                 '&noiseW=' + encodeURIComponent(noiseW) +
                 '&lengthScale=' + encodeURIComponent(lengthScale) +
-                 '&textLanguage=' + encodeURIComponent(textLanguage) +
-                 '&speakerId=' + encodeURIComponent(speakerId),
+                 '&textLanguage=' + encodeURIComponent(textLanguage),
                 {cache: 'no-cache'})

             if (res.ok) {
@@ -162,6 +168,29 @@
             q('#message').textContent = ''
             q('#audio').hidden = true
             q('#audio').autoplay = true
+
+             // Reset speakers
+             var speakerList = q('#speaker-list')
+             for (var i = speakerList.options.length - 1; i >= 0; i--) {
+                 speakerList.options[i].remove()
+             }
+
+             var voiceKey = voiceList.options[voiceList.selectedIndex].value
+             var voice = voicesInfo[voiceKey]
+
+             if (voice.speakers) {
+                 voice.speakers.forEach(function(speaker) {
+                    speakerList.insertAdjacentHTML(
+                        'beforeend', '<option value="' + speaker + '">' + speaker + '</option>'
+                    )
+                 })
+                 
+             } else {
+                 // Add default speaker
+                speakerList.insertAdjacentHTML(
+                    'beforeend', '<option value="">default</option>'
+                )
+             }
         }

         q('#voice-list').addEventListener('change', voiceChanged)
@@ -180,14 +209,15 @@
                     if (!res.ok) throw Error(res.statusText)
                     return res.json()
                 }).then(function(voices) {
-                     voicesInfo = voices
+                     voicesInfo = {}

                     // Populate select
                     var indexToSelect = -1

                     voices.forEach(function(voice) {
+                         voicesInfo[voice.key] = voice
                         voiceList.insertAdjacentHTML(
-                             'beforeend', '<option value="' + voice + '">' + voice + '</option>'
+                             'beforeend', '<option value="' + voice.key + '">' + voice.language + '/' + voice.name + '</option>'
                         )
                     })

--- a/mimic3-tts/mimic3_tts/tts.py
+++ b/mimic3-tts/mimic3_tts/tts.py
@@ -135,12 +135,25 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):

                    voice_lang = lang_dir.name
                    voice_name = voice_dir.name
+                    speakers: typing.Optional[typing.Sequence[str]] = None
+
+                    speakers_path = voice_dir / "speakers.txt"
+                    if speakers_path.is_file():
+                        speakers = []
+                        with open(
+                            speakers_path, "r", encoding="utf-8"
+                        ) as speakers_file:
+                            for line in speakers_file:
+                                line = line.strip()
+                                if line:
+                                    speakers.append(line)

                    yield Voice(
                        key=str(voice_dir.absolute()),
                        name=voice_name,
                        language=voice_lang,
                        description="",
+                        speakers=speakers,
                    )

    def begin_utterance(self):
--- a/mimic3-tts/mimic3_tts/voice.py
+++ b/mimic3-tts/mimic3_tts/voice.py
@@ -212,17 +212,18 @@ class Mimic3Voice(metaclass=ABCMeta):
            with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
                phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)

-        # id -> speaker | alias | alias ...
+        # id -> speaker
        speaker_map: typing.Optional[SPEAKER_MAP_TYPE] = None
        speaker_map_path = voice_dir / "speaker_map.csv"
        if speaker_map_path.is_file():
            _LOGGER.debug("Loading speaker map from %s", speaker_map_path)
            with open(speaker_map_path, "r", encoding="utf-8") as map_file:
+                # id | dataset | name | [alias] | [alias] ...
                reader = csv.reader(map_file, delimiter="|")
                speaker_map = {}
                for row in reader:
                    speaker_id = int(row[0])
-                    for alias in row[1:]:
+                    for alias in row[2:]:
                        speaker_map[alias] = speaker_id

        if config.phonemizer == Phonemizer.GRUUT:
--- a/opentts-abc/opentts_abc/__init__.py
+++ b/opentts-abc/opentts_abc/__init__.py
@@ -90,8 +90,13 @@ class Voice:
    name: str
    language: str
    description: str
+    speakers: typing.Optional[typing.Sequence[str]] = None
    properties: typing.Optional[typing.Mapping[str, typing.Any]] = None

+    @property
+    def is_multispeaker(self) -> bool:
+        return (self.speakers is not None) and (len(self.speakers) > 1)
+

 # @dataclass
 # class LexiconEntry: