Add speaker names to web interface

This commit is contained in:
Michael Hansen 2022-03-21 16:13:45 -04:00
commit d4ef4744c7
5 changed files with 63 additions and 29 deletions

View file

@ -21,6 +21,7 @@ import io
import wave
import tempfile
import typing
import dataclasses
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import parse_qs
@ -62,9 +63,7 @@ parser.add_argument(
parser.add_argument(
"--port", type=int, default=59125, help="Port of HTTP server (default: 59125)"
)
parser.add_argument(
"--speaker-id", type=int, default=0, help="Default speaker id to use"
)
# The default speaker may be given as a name or as a numeric id. A plain
# type=int would make argparse reject every name, so only all-decimal
# values are converted to int; anything else is passed through as a string.
parser.add_argument(
    "--speaker",
    type=lambda value: int(value) if value.isdecimal() else value,
    help="Default speaker to use (name or id)",
)
parser.add_argument(
"--length-scale", type=float, default=1.0, help="Speed of speech (> 1 is slower)"
)
@ -114,7 +113,6 @@ _LOGGER.debug(args)
class TextToWavParams:
text: str
voice: str = args.voice
speaker_id: int = args.speaker_id
noise_scale: float = args.noise_scale
noise_w: float = args.noise_w
length_scale: float = args.length_scale
@ -139,7 +137,7 @@ _WAV_CACHE: typing.Dict[TextToWavParams, Path] = {}
mimic3 = Mimic3TextToSpeechSystem(
Mimic3Settings(
voice=args.voice,
speaker_id=args.speaker_id,
speaker=args.speaker,
length_scale=args.length_scale,
noise_scale=args.noise_scale,
noise_w=args.noise_w,
@ -160,7 +158,6 @@ def text_to_wav(params: TextToWavParams, no_cache: bool = False) -> bytes:
return wav_bytes
mimic3.voice = params.voice
mimic3.speaker_id = params.speaker_id
mimic3.settings.length_scale = params.length_scale
mimic3.settings.noise_scale = params.noise_scale
@ -242,10 +239,6 @@ async def app_tts() -> Response:
if voice is not None:
tts_args["voice"] = str(voice)
speaker_id = request.args.get("speakerId")
if speaker_id is not None:
tts_args["speaker_id"] = int(speaker_id)
# TTS settings
noise_scale = request.args.get("noiseScale")
if noise_scale is not None:
@ -286,9 +279,7 @@ async def app_tts() -> Response:
@app.route("/api/voices", methods=["GET"])
async def api_voices():
voices = mimic3.get_voices()
voice_ids = sorted([v.name for v in voices])
return jsonify(voice_ids)
return jsonify([dataclasses.asdict(v) for v in mimic3.get_voices()])
@app.route("/process", methods=["GET", "POST"])
@ -308,20 +299,14 @@ async def api_process():
voice = voice or args.voice
speaker_id = args.speaker_id
if "#" in voice:
voice, speaker_id_str = voice.split("#", maxsplit=1)
speaker_id = int(speaker_id_str)
# Assume SSML if text begins with an angle bracket
ssml = text.strip().startswith("<")
_LOGGER.debug("Speaking with voice '%s (speaker=%s)': %s", voice, speaker_id, text)
_LOGGER.debug("Speaking with voice '%s': %s", voice, text)
wav_bytes = text_to_wav(
TextToWavParams(
text=text,
voice=voice,
speaker_id=speaker_id,
ssml=ssml,
length_scale=args.length_scale,
noise_scale=args.noise_scale,

View file

@ -62,8 +62,9 @@
</select>
</div>
<div class="col-auto">
<label for="speaker-id" title="Index of speaker">Speaker:</label>
<input type="number" id="speaker-id" name="speaker_id" size="5" min="0" value="0">
<!-- Speaker picker: the label's "for" must match the select's id so that
     clicking the label focuses the control and assistive tech can name it. -->
<label for="speaker-list" title="Name of speaker">Speaker:</label>
<select id="speaker-list" name="speaker">
</select>
</div>
</div>
<div id="audio-message" class="row mt-3" hidden>
@ -122,7 +123,13 @@
var noiseScale = q('#noise-scale').value || '0.333'
var noiseW = q('#noise-w').value || '1.0'
var lengthScale = q('#length-scale').value || '1.0'
var speakerId = q('#speaker-id').value || '0'
// Read the chosen speaker through the select's own value, which is ''
// when nothing is selected. Indexing options[selectedIndex] would throw
// a TypeError when the list is empty (selectedIndex is -1).
var speakerList = q('#speaker-list')
var speaker = speakerList.value
if (speaker.length > 0) {
    // A non-empty speaker is appended to the voice as "voice#speaker"
    voice = voice + "#" + speaker
}
var textLanguage = q('#text-language').value || ''
q('#audio-message').hidden = false
@ -135,8 +142,7 @@
'&noiseScale=' + encodeURIComponent(noiseScale) +
'&noiseW=' + encodeURIComponent(noiseW) +
'&lengthScale=' + encodeURIComponent(lengthScale) +
'&textLanguage=' + encodeURIComponent(textLanguage) +
'&speakerId=' + encodeURIComponent(speakerId),
'&textLanguage=' + encodeURIComponent(textLanguage),
{cache: 'no-cache'})
if (res.ok) {
@ -162,6 +168,29 @@
q('#message').textContent = ''
q('#audio').hidden = true
q('#audio').autoplay = true
// Reset speakers
var speakerList = q('#speaker-list')
for (var i = speakerList.options.length - 1; i >= 0; i--) {
speakerList.options[i].remove()
}
var voiceKey = voiceList.options[voiceList.selectedIndex].value
var voice = voicesInfo[voiceKey]
// Build <option> elements through the DOM API instead of concatenating
// HTML, so speaker names containing quotes or angle brackets are shown
// literally rather than parsed as markup.
// The length check matters: an empty array is truthy in JavaScript, and
// would otherwise leave the list with no options at all.
if (voice && voice.speakers && voice.speakers.length > 0) {
    voice.speakers.forEach(function(speaker) {
        speakerList.add(new Option(speaker, speaker))
    })
} else {
    // Add default speaker (empty value, so no "#speaker" suffix is appended)
    speakerList.add(new Option('default', ''))
}
}
q('#voice-list').addEventListener('change', voiceChanged)
@ -180,14 +209,15 @@
if (!res.ok) throw Error(res.statusText)
return res.json()
}).then(function(voices) {
voicesInfo = voices
voicesInfo = {}
// Populate select
var indexToSelect = -1
voices.forEach(function(voice) {
voicesInfo[voice.key] = voice
voiceList.insertAdjacentHTML(
'beforeend', '<option value="' + voice + '">' + voice + '</option>'
'beforeend', '<option value="' + voice.key + '">' + voice.language + '/' + voice.name + '</option>'
)
})

View file

@ -135,12 +135,25 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
voice_lang = lang_dir.name
voice_name = voice_dir.name
speakers: typing.Optional[typing.Sequence[str]] = None
speakers_path = voice_dir / "speakers.txt"
if speakers_path.is_file():
speakers = []
with open(
speakers_path, "r", encoding="utf-8"
) as speakers_file:
for line in speakers_file:
line = line.strip()
if line:
speakers.append(line)
yield Voice(
key=str(voice_dir.absolute()),
name=voice_name,
language=voice_lang,
description="",
speakers=speakers,
)
def begin_utterance(self):

View file

@ -212,17 +212,18 @@ class Mimic3Voice(metaclass=ABCMeta):
with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)
# id -> speaker | alias | alias ...
# speaker name or alias -> speaker id
speaker_map: typing.Optional[SPEAKER_MAP_TYPE] = None
speaker_map_path = voice_dir / "speaker_map.csv"
if speaker_map_path.is_file():
_LOGGER.debug("Loading speaker map from %s", speaker_map_path)
with open(speaker_map_path, "r", encoding="utf-8") as map_file:
# id | dataset | name | [alias] | [alias] ...
reader = csv.reader(map_file, delimiter="|")
speaker_map = {}
for row in reader:
speaker_id = int(row[0])
for alias in row[1:]:
for alias in row[2:]:
speaker_map[alias] = speaker_id
if config.phonemizer == Phonemizer.GRUUT:

View file

@ -90,8 +90,13 @@ class Voice:
name: str
language: str
description: str
speakers: typing.Optional[typing.Sequence[str]] = None
properties: typing.Optional[typing.Mapping[str, typing.Any]] = None
@property
def is_multispeaker(self) -> bool:
    """Return True when more than one speaker is listed for this voice."""
    speakers = self.speakers
    if speakers is None:
        # No speaker list at all counts as not multi-speaker
        return False
    return len(speakers) > 1
# @dataclass
# class LexiconEntry: