Add speaker names to web interface
This commit is contained in:
parent
50d4c2a6c9
commit
d4ef4744c7
5 changed files with 63 additions and 29 deletions
|
|
@ -21,6 +21,7 @@ import io
|
|||
import wave
|
||||
import tempfile
|
||||
import typing
|
||||
import dataclasses
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from urllib.parse import parse_qs
|
||||
|
|
@ -62,9 +63,7 @@ parser.add_argument(
|
|||
parser.add_argument(
|
||||
"--port", type=int, default=59125, help="Port of HTTP server (default: 59125)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--speaker-id", type=int, default=0, help="Default speaker id to use"
|
||||
)
|
||||
# The help text promises "name or id", so the value must stay a string:
# with type=int argparse would reject any speaker name at parse time.
# NOTE(review): numeric ids now arrive as strings -- confirm the consumer of
# args.speaker converts them where an integer id is required.
parser.add_argument("--speaker", help="Default speaker to use (name or id)")
|
||||
parser.add_argument(
|
||||
"--length-scale", type=float, default=1.0, help="Speed of speech (> 1 is slower)"
|
||||
)
|
||||
|
|
@ -114,7 +113,6 @@ _LOGGER.debug(args)
|
|||
class TextToWavParams:
|
||||
text: str
|
||||
voice: str = args.voice
|
||||
speaker_id: int = args.speaker_id
|
||||
noise_scale: float = args.noise_scale
|
||||
noise_w: float = args.noise_w
|
||||
length_scale: float = args.length_scale
|
||||
|
|
@ -139,7 +137,7 @@ _WAV_CACHE: typing.Dict[TextToWavParams, Path] = {}
|
|||
mimic3 = Mimic3TextToSpeechSystem(
|
||||
Mimic3Settings(
|
||||
voice=args.voice,
|
||||
speaker_id=args.speaker_id,
|
||||
speaker=args.speaker,
|
||||
length_scale=args.length_scale,
|
||||
noise_scale=args.noise_scale,
|
||||
noise_w=args.noise_w,
|
||||
|
|
@ -160,7 +158,6 @@ def text_to_wav(params: TextToWavParams, no_cache: bool = False) -> bytes:
|
|||
return wav_bytes
|
||||
|
||||
mimic3.voice = params.voice
|
||||
mimic3.speaker_id = params.speaker_id
|
||||
|
||||
mimic3.settings.length_scale = params.length_scale
|
||||
mimic3.settings.noise_scale = params.noise_scale
|
||||
|
|
@ -242,10 +239,6 @@ async def app_tts() -> Response:
|
|||
if voice is not None:
|
||||
tts_args["voice"] = str(voice)
|
||||
|
||||
speaker_id = request.args.get("speakerId")
|
||||
if speaker_id is not None:
|
||||
tts_args["speaker_id"] = int(speaker_id)
|
||||
|
||||
# TTS settings
|
||||
noise_scale = request.args.get("noiseScale")
|
||||
if noise_scale is not None:
|
||||
|
|
@ -286,9 +279,7 @@ async def app_tts() -> Response:
|
|||
|
||||
@app.route("/api/voices", methods=["GET"])
|
||||
async def api_voices():
|
||||
voices = mimic3.get_voices()
|
||||
voice_ids = sorted([v.name for v in voices])
|
||||
return jsonify(voice_ids)
|
||||
return jsonify([dataclasses.asdict(v) for v in mimic3.get_voices()])
|
||||
|
||||
|
||||
@app.route("/process", methods=["GET", "POST"])
|
||||
|
|
@ -308,20 +299,14 @@ async def api_process():
|
|||
|
||||
voice = voice or args.voice
|
||||
|
||||
speaker_id = args.speaker_id
|
||||
if "#" in voice:
|
||||
voice, speaker_id_str = voice.split("#", maxsplit=1)
|
||||
speaker_id = int(speaker_id_str)
|
||||
|
||||
# Assume SSML if text begins with an angle bracket
|
||||
ssml = text.strip().startswith("<")
|
||||
|
||||
_LOGGER.debug("Speaking with voice '%s (speaker=%s)': %s", voice, speaker_id, text)
|
||||
_LOGGER.debug("Speaking with voice '%s': %s", voice, text)
|
||||
wav_bytes = text_to_wav(
|
||||
TextToWavParams(
|
||||
text=text,
|
||||
voice=voice,
|
||||
speaker_id=speaker_id,
|
||||
ssml=ssml,
|
||||
length_scale=args.length_scale,
|
||||
noise_scale=args.noise_scale,
|
||||
|
|
|
|||
|
|
@ -62,8 +62,9 @@
|
|||
</select>
|
||||
</div>
|
||||
<div class="col-auto">
|
||||
<label for="speaker-id" title="Index of speaker">Speaker:</label>
|
||||
<input type="number" id="speaker-id" name="speaker_id" size="5" min="0" value="0">
|
||||
<label for="speaker" title="Name of speaker">Speaker:</label>
|
||||
<select id="speaker-list" name="speaker">
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
<div id="audio-message" class="row mt-3" hidden>
|
||||
|
|
@ -122,7 +123,13 @@
|
|||
var noiseScale = q('#noise-scale').value || '0.333'
|
||||
var noiseW = q('#noise-w').value || '1.0'
|
||||
var lengthScale = q('#length-scale').value || '1.0'
|
||||
var speakerId = q('#speaker-id').value || '0'
|
||||
|
||||
var speakerList = q('#speaker-list')
|
||||
var speaker = speakerList.options[speakerList.selectedIndex].value
|
||||
if (speaker.length > 0) {
|
||||
voice = voice + "#" + speaker
|
||||
}
|
||||
|
||||
var textLanguage = q('#text-language').value || ''
|
||||
|
||||
q('#audio-message').hidden = false
|
||||
|
|
@ -135,8 +142,7 @@
|
|||
'&noiseScale=' + encodeURIComponent(noiseScale) +
|
||||
'&noiseW=' + encodeURIComponent(noiseW) +
|
||||
'&lengthScale=' + encodeURIComponent(lengthScale) +
|
||||
'&textLanguage=' + encodeURIComponent(textLanguage) +
|
||||
'&speakerId=' + encodeURIComponent(speakerId),
|
||||
'&textLanguage=' + encodeURIComponent(textLanguage),
|
||||
{cache: 'no-cache'})
|
||||
|
||||
if (res.ok) {
|
||||
|
|
@ -162,6 +168,29 @@
|
|||
q('#message').textContent = ''
|
||||
q('#audio').hidden = true
|
||||
q('#audio').autoplay = true
|
||||
|
||||
// Reset speakers
|
||||
var speakerList = q('#speaker-list')
|
||||
for (var i = speakerList.options.length - 1; i >= 0; i--) {
|
||||
speakerList.options[i].remove()
|
||||
}
|
||||
|
||||
var voiceKey = voiceList.options[voiceList.selectedIndex].value
|
||||
var voice = voicesInfo[voiceKey]
|
||||
|
||||
if (voice.speakers) {
|
||||
voice.speakers.forEach(function(speaker) {
|
||||
speakerList.insertAdjacentHTML(
|
||||
'beforeend', '<option value="' + speaker + '">' + speaker + '</option>'
|
||||
)
|
||||
})
|
||||
|
||||
} else {
|
||||
// Add default speaker
|
||||
speakerList.insertAdjacentHTML(
|
||||
'beforeend', '<option value="">default</option>'
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
q('#voice-list').addEventListener('change', voiceChanged)
|
||||
|
|
@ -180,14 +209,15 @@
|
|||
if (!res.ok) throw Error(res.statusText)
|
||||
return res.json()
|
||||
}).then(function(voices) {
|
||||
voicesInfo = voices
|
||||
voicesInfo = {}
|
||||
|
||||
// Populate select
|
||||
var indexToSelect = -1
|
||||
|
||||
voices.forEach(function(voice) {
|
||||
voicesInfo[voice.key] = voice
|
||||
voiceList.insertAdjacentHTML(
|
||||
'beforeend', '<option value="' + voice + '">' + voice + '</option>'
|
||||
'beforeend', '<option value="' + voice.key + '">' + voice.language + '/' + voice.name + '</option>'
|
||||
)
|
||||
})
|
||||
|
||||
|
|
|
|||
|
|
@ -135,12 +135,25 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
|
||||
voice_lang = lang_dir.name
|
||||
voice_name = voice_dir.name
|
||||
speakers: typing.Optional[typing.Sequence[str]] = None
|
||||
|
||||
speakers_path = voice_dir / "speakers.txt"
|
||||
if speakers_path.is_file():
|
||||
speakers = []
|
||||
with open(
|
||||
speakers_path, "r", encoding="utf-8"
|
||||
) as speakers_file:
|
||||
for line in speakers_file:
|
||||
line = line.strip()
|
||||
if line:
|
||||
speakers.append(line)
|
||||
|
||||
yield Voice(
|
||||
key=str(voice_dir.absolute()),
|
||||
name=voice_name,
|
||||
language=voice_lang,
|
||||
description="",
|
||||
speakers=speakers,
|
||||
)
|
||||
|
||||
def begin_utterance(self):
|
||||
|
|
|
|||
|
|
@ -212,17 +212,18 @@ class Mimic3Voice(metaclass=ABCMeta):
|
|||
with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
|
||||
phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)
|
||||
|
||||
# id -> speaker | alias | alias ...
|
||||
# speaker name or alias -> speaker id
|
||||
speaker_map: typing.Optional[SPEAKER_MAP_TYPE] = None
|
||||
speaker_map_path = voice_dir / "speaker_map.csv"
|
||||
if speaker_map_path.is_file():
|
||||
_LOGGER.debug("Loading speaker map from %s", speaker_map_path)
|
||||
with open(speaker_map_path, "r", encoding="utf-8") as map_file:
|
||||
# id | dataset | name | [alias] | [alias] ...
|
||||
reader = csv.reader(map_file, delimiter="|")
|
||||
speaker_map = {}
|
||||
for row in reader:
|
||||
speaker_id = int(row[0])
|
||||
for alias in row[1:]:
|
||||
for alias in row[2:]:
|
||||
speaker_map[alias] = speaker_id
|
||||
|
||||
if config.phonemizer == Phonemizer.GRUUT:
|
||||
|
|
|
|||
|
|
@ -90,8 +90,13 @@ class Voice:
|
|||
name: str
|
||||
language: str
|
||||
description: str
|
||||
speakers: typing.Optional[typing.Sequence[str]] = None
|
||||
properties: typing.Optional[typing.Mapping[str, typing.Any]] = None
|
||||
|
||||
@property
|
||||
def is_multispeaker(self) -> bool:
|
||||
return (self.speakers is not None) and (len(self.speakers) > 1)
|
||||
|
||||
|
||||
# @dataclass
|
||||
# class LexiconEntry:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue