diff --git a/mimic3-http/mimic3_http/__main__.py b/mimic3-http/mimic3_http/__main__.py
index 52ca4ff..58e3074 100644
--- a/mimic3-http/mimic3_http/__main__.py
+++ b/mimic3-http/mimic3_http/__main__.py
@@ -21,6 +21,7 @@ import io
import wave
import tempfile
import typing
+import dataclasses
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import parse_qs
@@ -62,9 +63,7 @@ parser.add_argument(
parser.add_argument(
"--port", type=int, default=59125, help="Port of HTTP server (default: 59125)"
)
-parser.add_argument(
- "--speaker-id", type=int, default=0, help="Default speaker id to use"
-)
+parser.add_argument("--speaker", help="Default speaker to use (name or id)")
parser.add_argument(
"--length-scale", type=float, default=1.0, help="Speed of speech (> 1 is slower)"
)
@@ -114,7 +113,6 @@ _LOGGER.debug(args)
class TextToWavParams:
text: str
voice: str = args.voice
- speaker_id: int = args.speaker_id
noise_scale: float = args.noise_scale
noise_w: float = args.noise_w
length_scale: float = args.length_scale
@@ -139,7 +137,7 @@ _WAV_CACHE: typing.Dict[TextToWavParams, Path] = {}
mimic3 = Mimic3TextToSpeechSystem(
Mimic3Settings(
voice=args.voice,
- speaker_id=args.speaker_id,
+ speaker=args.speaker,
length_scale=args.length_scale,
noise_scale=args.noise_scale,
noise_w=args.noise_w,
@@ -160,7 +158,6 @@ def text_to_wav(params: TextToWavParams, no_cache: bool = False) -> bytes:
return wav_bytes
mimic3.voice = params.voice
- mimic3.speaker_id = params.speaker_id
mimic3.settings.length_scale = params.length_scale
mimic3.settings.noise_scale = params.noise_scale
@@ -242,10 +239,6 @@ async def app_tts() -> Response:
if voice is not None:
tts_args["voice"] = str(voice)
- speaker_id = request.args.get("speakerId")
- if speaker_id is not None:
- tts_args["speaker_id"] = int(speaker_id)
-
# TTS settings
noise_scale = request.args.get("noiseScale")
if noise_scale is not None:
@@ -286,9 +279,7 @@ async def app_tts() -> Response:
@app.route("/api/voices", methods=["GET"])
async def api_voices():
- voices = mimic3.get_voices()
- voice_ids = sorted([v.name for v in voices])
- return jsonify(voice_ids)
+ return jsonify(list(map(dataclasses.asdict, mimic3.get_voices())))
@app.route("/process", methods=["GET", "POST"])
@@ -308,20 +299,14 @@ async def api_process():
voice = voice or args.voice
- speaker_id = args.speaker_id
- if "#" in voice:
- voice, speaker_id_str = voice.split("#", maxsplit=1)
- speaker_id = int(speaker_id_str)
-
# Assume SSML if text begins with an angle bracket
ssml = text.strip().startswith("<")
- _LOGGER.debug("Speaking with voice '%s (speaker=%s)': %s", voice, speaker_id, text)
+ _LOGGER.debug("Speaking with voice '%s': %s", voice, text)
wav_bytes = text_to_wav(
TextToWavParams(
text=text,
voice=voice,
- speaker_id=speaker_id,
ssml=ssml,
length_scale=args.length_scale,
noise_scale=args.noise_scale,
diff --git a/mimic3-http/mimic3_http/templates/index.html b/mimic3-http/mimic3_http/templates/index.html
index 03dafad..503f4c2 100644
--- a/mimic3-http/mimic3_http/templates/index.html
+++ b/mimic3-http/mimic3_http/templates/index.html
@@ -62,8 +62,9 @@
-
-
+
+
@@ -122,7 +123,13 @@
var noiseScale = q('#noise-scale').value || '0.333'
var noiseW = q('#noise-w').value || '1.0'
var lengthScale = q('#length-scale').value || '1.0'
- var speakerId = q('#speaker-id').value || '0'
+
+ var speakerList = q('#speaker-list')
+ var speaker = speakerList.options[speakerList.selectedIndex].value
+ if (speaker.length > 0) {
+ voice = voice + "#" + speaker
+ }
+
var textLanguage = q('#text-language').value || ''
q('#audio-message').hidden = false
@@ -135,8 +142,7 @@
'&noiseScale=' + encodeURIComponent(noiseScale) +
'&noiseW=' + encodeURIComponent(noiseW) +
'&lengthScale=' + encodeURIComponent(lengthScale) +
- '&textLanguage=' + encodeURIComponent(textLanguage) +
- '&speakerId=' + encodeURIComponent(speakerId),
+ '&textLanguage=' + encodeURIComponent(textLanguage),
{cache: 'no-cache'})
if (res.ok) {
@@ -162,6 +168,29 @@
q('#message').textContent = ''
q('#audio').hidden = true
q('#audio').autoplay = true
+
+ // Reset speakers
+ var speakerList = q('#speaker-list')
+ for (var i = speakerList.options.length - 1; i >= 0; i--) {
+ speakerList.options[i].remove()
+ }
+
+ var voiceKey = voiceList.options[voiceList.selectedIndex].value
+ var voice = voicesInfo[voiceKey]
+
+ if (voice.speakers) {
+ voice.speakers.forEach(function(speaker) {
+ speakerList.insertAdjacentHTML(
+ 'beforeend', ''
+ )
+ })
+
+ } else {
+ // Add default speaker
+ speakerList.insertAdjacentHTML(
+ 'beforeend', ''
+ )
+ }
}
q('#voice-list').addEventListener('change', voiceChanged)
@@ -180,14 +209,15 @@
if (!res.ok) throw Error(res.statusText)
return res.json()
}).then(function(voices) {
- voicesInfo = voices
+ voicesInfo = {}
// Populate select
var indexToSelect = -1
voices.forEach(function(voice) {
+ voicesInfo[voice.key] = voice
voiceList.insertAdjacentHTML(
- 'beforeend', ''
+ 'beforeend', ''
)
})
diff --git a/mimic3-tts/mimic3_tts/tts.py b/mimic3-tts/mimic3_tts/tts.py
index e26bb06..6b44c36 100644
--- a/mimic3-tts/mimic3_tts/tts.py
+++ b/mimic3-tts/mimic3_tts/tts.py
@@ -135,12 +135,25 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
voice_lang = lang_dir.name
voice_name = voice_dir.name
+ speakers: typing.Optional[typing.Sequence[str]] = None
+
+ speakers_path = voice_dir / "speakers.txt"
+ if speakers_path.is_file():
+ speakers = []
+ with open(
+ speakers_path, "r", encoding="utf-8"
+ ) as speakers_file:
+ for line in speakers_file:
+ line = line.strip()
+ if line:
+ speakers.append(line)
yield Voice(
key=str(voice_dir.absolute()),
name=voice_name,
language=voice_lang,
description="",
+ speakers=speakers,
)
def begin_utterance(self):
diff --git a/mimic3-tts/mimic3_tts/voice.py b/mimic3-tts/mimic3_tts/voice.py
index 8483ab6..a8b8bca 100644
--- a/mimic3-tts/mimic3_tts/voice.py
+++ b/mimic3-tts/mimic3_tts/voice.py
@@ -212,17 +212,18 @@ class Mimic3Voice(metaclass=ABCMeta):
with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)
- # id -> speaker | alias | alias ...
+ # speaker name or alias -> speaker id
speaker_map: typing.Optional[SPEAKER_MAP_TYPE] = None
speaker_map_path = voice_dir / "speaker_map.csv"
if speaker_map_path.is_file():
_LOGGER.debug("Loading speaker map from %s", speaker_map_path)
with open(speaker_map_path, "r", encoding="utf-8") as map_file:
+ # id | dataset | name | [alias] | [alias] ...
reader = csv.reader(map_file, delimiter="|")
speaker_map = {}
for row in reader:
speaker_id = int(row[0])
- for alias in row[1:]:
+ for alias in row[2:]:
speaker_map[alias] = speaker_id
if config.phonemizer == Phonemizer.GRUUT:
diff --git a/opentts-abc/opentts_abc/__init__.py b/opentts-abc/opentts_abc/__init__.py
index 89122d0..d19ffeb 100644
--- a/opentts-abc/opentts_abc/__init__.py
+++ b/opentts-abc/opentts_abc/__init__.py
@@ -90,8 +90,13 @@ class Voice:
name: str
language: str
description: str
+ speakers: typing.Optional[typing.Sequence[str]] = None
properties: typing.Optional[typing.Mapping[str, typing.Any]] = None
+ @property
+ def is_multispeaker(self) -> bool:
+ return len(self.speakers or ()) > 1
+
# @dataclass
# class LexiconEntry: