Add espeak and symbol voices
This commit is contained in:
parent
1eae034d8c
commit
fb9cd71919
6 changed files with 543 additions and 237 deletions
|
|
@ -1,2 +1,3 @@
|
|||
from .tts import Mimic3TextToSpeechSystem, Mimic3Settings
|
||||
from opentts_abc import AudioResult, MarkResult
|
||||
|
||||
from .tts import Mimic3Settings, Mimic3TextToSpeechSystem
|
||||
|
|
|
|||
|
|
@ -5,13 +5,23 @@ import wave
|
|||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
from opentts_abc.ssml import SSMLSpeaker
|
||||
from mimic3_tts.tts import Mimic3TextToSpeechSystem, Mimic3Settings, AudioResult, MarkResult
|
||||
|
||||
settings = Mimic3Settings(length_scale=1.2, noise_w=0)
|
||||
from mimic3_tts.tts import (
|
||||
AudioResult,
|
||||
MarkResult,
|
||||
Mimic3Settings,
|
||||
Mimic3TextToSpeechSystem,
|
||||
)
|
||||
|
||||
settings = Mimic3Settings()
|
||||
tts = Mimic3TextToSpeechSystem(settings)
|
||||
|
||||
speaker = SSMLSpeaker(tts)
|
||||
ssml = '<speak><s><voice name="en_US/vctk_low#20">This is a test.</voice></s></speak>'
|
||||
# ssml = '<speak><voice name="el_GR/rapunzelina_low"><s><w>Το</w><w>αερόστρωμνό</w><w>μου</w><w>είναι</w><w>γεμάτο</w><w>χέλια.</w></s></voice></speak>'
|
||||
# ssml = '<speak><voice name="uk_UK/m-ailabs_low"><s><w>бажав</w></s></voice></speak>'
|
||||
# ssml = '<speak><s><w>Hello</w><w>World</w></s></speak>'
|
||||
# ssml = '<speak><s>Hello world</s></speak>'
|
||||
ssml = '<speak><s><voice name="el_GR/rapunzelina_low"><say-as interpret-as="characters">12</say-as></voice></s></speak>'
|
||||
|
||||
wav_file: wave.Wave_write = wave.open("out.wav", "wb")
|
||||
params_set = False
|
||||
|
|
|
|||
|
|
@ -1,18 +1,4 @@
|
|||
"""Configuration classes"""
|
||||
# Copyright 2021 Mycroft AI Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import collections
|
||||
import json
|
||||
import typing
|
||||
|
|
@ -20,6 +6,7 @@ from dataclasses import dataclass, field
|
|||
from enum import Enum
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from dataclasses_json import DataClassJsonMixin
|
||||
from gruut_ipa import IPA
|
||||
from phonemes2ids import BlankBetween
|
||||
|
|
@ -59,6 +46,51 @@ class AudioConfig(DataClassJsonMixin):
|
|||
if self.mel_fmax is not None:
|
||||
assert self.mel_fmax <= self.sample_rate // 2
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Normalization
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def normalize(self, mel_db: np.ndarray) -> np.ndarray:
|
||||
"""Put values in [0, max_norm] or [-max_norm, max_norm]"""
|
||||
mel_norm = ((mel_db - self.ref_level_db) - self.min_level_db) / (
|
||||
-self.min_level_db
|
||||
)
|
||||
if self.symmetric_norm:
|
||||
# Symmetric norm
|
||||
mel_norm = ((2 * self.max_norm) * mel_norm) - self.max_norm
|
||||
if self.clip_norm:
|
||||
mel_norm = np.clip(mel_norm, -self.max_norm, self.max_norm)
|
||||
else:
|
||||
# Asymmetric norm
|
||||
mel_norm = self.max_norm * mel_norm
|
||||
if self.clip_norm:
|
||||
mel_norm = np.clip(mel_norm, 0, self.max_norm)
|
||||
|
||||
return mel_norm
|
||||
|
||||
def denormalize(self, mel_db: np.ndarray) -> np.ndarray:
|
||||
"""Pull values out of [0, max_norm] or [-max_norm, max_norm]"""
|
||||
if self.symmetric_norm:
|
||||
# Symmetric norm
|
||||
if self.clip_norm:
|
||||
mel_denorm = np.clip(mel_db, -self.max_norm, self.max_norm)
|
||||
|
||||
mel_denorm = (
|
||||
(mel_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)
|
||||
) + self.min_level_db
|
||||
else:
|
||||
# Asymmetric norm
|
||||
if self.clip_norm:
|
||||
mel_denorm = np.clip(mel_db, 0, self.max_norm)
|
||||
|
||||
mel_denorm = (
|
||||
mel_denorm * -self.min_level_db / self.max_norm
|
||||
) + self.min_level_db
|
||||
|
||||
mel_denorm += self.ref_level_db
|
||||
|
||||
return mel_denorm
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelConfig(DataClassJsonMixin):
|
||||
|
|
@ -84,7 +116,7 @@ class ModelConfig(DataClassJsonMixin):
|
|||
upsample_kernel_sizes: typing.Tuple[int, ...] = (16, 16, 4, 4)
|
||||
n_layers_q: int = 3
|
||||
use_spectral_norm: bool = False
|
||||
gin_channels: int = 256
|
||||
gin_channels: int = 0 # single speaker
|
||||
use_sdp: bool = True # StochasticDurationPredictor
|
||||
|
||||
@property
|
||||
|
|
@ -100,7 +132,7 @@ class PhonemesConfig(DataClassJsonMixin):
|
|||
word_separator: str = "#"
|
||||
"""Separator between word phonemes in CSV input (must not match phoneme_separator)"""
|
||||
|
||||
phoneme_to_id: typing.Optional[typing.Mapping[str, int]] = None
|
||||
phoneme_to_id: typing.Optional[typing.Dict[str, int]] = None
|
||||
pad: typing.Optional[str] = "_"
|
||||
bos: typing.Optional[str] = None
|
||||
eos: typing.Optional[str] = None
|
||||
|
|
@ -110,15 +142,18 @@ class PhonemesConfig(DataClassJsonMixin):
|
|||
blank_at_start: bool = True
|
||||
blank_at_end: bool = True
|
||||
simple_punctuation: bool = True
|
||||
punctuation_map: typing.Optional[typing.Mapping[str, str]] = None
|
||||
punctuation_map: typing.Optional[typing.Dict[str, str]] = None
|
||||
separate: typing.Optional[typing.List[str]] = None
|
||||
separate_graphemes: bool = False
|
||||
separate_tones: bool = False
|
||||
tone_before: bool = False
|
||||
phoneme_map: typing.Optional[typing.Mapping[str, str]] = None
|
||||
phoneme_map: typing.Optional[typing.Dict[str, str]] = None
|
||||
auto_bos_eos: bool = False
|
||||
minor_break: typing.Optional[str] = IPA.BREAK_MINOR.value
|
||||
major_break: typing.Optional[str] = IPA.BREAK_MAJOR.value
|
||||
break_phonemes_into_graphemes: bool = False
|
||||
drop_stress: bool = False
|
||||
symbols: typing.Optional[typing.List[str]] = None
|
||||
|
||||
def split_word_phonemes(self, phonemes_str: str) -> typing.List[typing.List[str]]:
|
||||
"""Split phonemes string into a list of lists (outer is words, inner is individual phonemes in each word)"""
|
||||
|
|
@ -158,8 +193,7 @@ class MetadataFormat(str, Enum):
|
|||
@dataclass
|
||||
class DatasetConfig:
|
||||
name: str
|
||||
metadata_path: typing.Optional[typing.Union[str, Path]] = None
|
||||
train_path: typing.Optional[typing.Union[str, Path]] = None
|
||||
metadata_format: MetadataFormat = MetadataFormat.TEXT
|
||||
multispeaker: bool = False
|
||||
text_language: typing.Optional[str] = None
|
||||
audio_dir: typing.Optional[typing.Union[str, Path]] = None
|
||||
|
|
@ -183,6 +217,13 @@ class AlignerConfig:
|
|||
casing: typing.Optional[TextCasing] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class InferenceConfig:
|
||||
length_scale: float = 1.0
|
||||
noise_scale: float = 0.667
|
||||
noise_w: float = 0.8
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrainingConfig(DataClassJsonMixin):
|
||||
seed: int = 1234
|
||||
|
|
@ -206,6 +247,8 @@ class TrainingConfig(DataClassJsonMixin):
|
|||
min_spec_length: typing.Optional[int] = None
|
||||
max_spec_length: typing.Optional[int] = None
|
||||
|
||||
min_speaker_utterances: typing.Optional[int] = None
|
||||
|
||||
last_epoch: int = 1
|
||||
global_step: int = 1
|
||||
best_loss: typing.Optional[float] = None
|
||||
|
|
@ -216,22 +259,31 @@ class TrainingConfig(DataClassJsonMixin):
|
|||
text_language: typing.Optional[str] = None
|
||||
phonemizer: typing.Optional[Phonemizer] = None
|
||||
datasets: typing.List[DatasetConfig] = field(default_factory=list)
|
||||
dataset_format: MetadataFormat = MetadataFormat.TEXT
|
||||
inference: InferenceConfig = field(default_factory=InferenceConfig)
|
||||
|
||||
version: int = 1
|
||||
git_commit: str = ""
|
||||
|
||||
@property
|
||||
def is_multispeaker(self):
|
||||
return (
|
||||
self.model.is_multispeaker
|
||||
or any(d.multispeaker for d in self.datasets)
|
||||
)
|
||||
return self.model.is_multispeaker or any(d.multispeaker for d in self.datasets)
|
||||
|
||||
def save(self, config_file: typing.TextIO):
|
||||
"""Save config as JSON to a file"""
|
||||
json.dump(self.to_dict(), config_file, indent=4)
|
||||
|
||||
def get_speaker_id(self, dataset_name: str, speaker_name: str) -> int:
|
||||
if self.speaker_id_map is None:
|
||||
self.speaker_id_map = {}
|
||||
|
||||
full_speaker_name = f"{dataset_name}_{speaker_name}"
|
||||
speaker_id = self.speaker_id_map.get(full_speaker_name)
|
||||
if speaker_id is None:
|
||||
speaker_id = len(self.speaker_id_map)
|
||||
self.speaker_id_map[full_speaker_name] = speaker_id
|
||||
|
||||
return speaker_id
|
||||
|
||||
@staticmethod
|
||||
def load(config_file: typing.TextIO) -> "TrainingConfig":
|
||||
"""Load config from a JSON file"""
|
||||
|
|
|
|||
|
|
@ -1,11 +1,9 @@
|
|||
#!/usr/bin/env python3
|
||||
import dataclasses
|
||||
import logging
|
||||
import time
|
||||
import typing
|
||||
from abc import ABCMeta
|
||||
from dataclasses import dataclass, field
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from xml.sax.saxutils import escape as xmlescape
|
||||
|
||||
|
|
@ -14,22 +12,22 @@ import numpy as np
|
|||
import onnxruntime
|
||||
import phonemes2ids
|
||||
from gruut.const import LookupPhonemes, WordRole
|
||||
from gruut_ipa import guess_phonemes, IPA, Phonemes, Phoneme
|
||||
|
||||
from gruut_ipa import IPA, Phoneme, guess_phonemes
|
||||
from opentts_abc import (
|
||||
TextToSpeechSystem,
|
||||
Voice,
|
||||
BaseToken,
|
||||
BaseResult,
|
||||
MarkResult,
|
||||
AudioResult,
|
||||
Word,
|
||||
BaseResult,
|
||||
BaseToken,
|
||||
MarkResult,
|
||||
Phonemes,
|
||||
SayAs,
|
||||
TextToSpeechSystem,
|
||||
Voice,
|
||||
Word,
|
||||
)
|
||||
|
||||
from mimic3_tts.config import TrainingConfig
|
||||
from mimic3_tts.utils import audio_float_to_int16
|
||||
from mimic3_tts.voice import Mimic3Voice
|
||||
|
||||
_DIR = Path(__file__).parent
|
||||
|
||||
|
|
@ -51,20 +49,12 @@ class Mimic3Settings:
|
|||
voices_directories: typing.Optional[typing.Iterable[typing.Union[str, Path]]] = None
|
||||
speaker_id: typing.Optional[int] = None
|
||||
length_scale: float = 1.0
|
||||
noise_scale: float = 0.333
|
||||
noise_w: float = 1.0
|
||||
noise_scale: float = 0.667
|
||||
noise_w: float = 0.8
|
||||
text_language: typing.Optional[str] = None
|
||||
sample_rate: int = 22050
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoadedVoice:
|
||||
config: TrainingConfig
|
||||
onnx_model: onnxruntime.InferenceSession
|
||||
phoneme_to_id: typing.Mapping[str, int]
|
||||
phoneme_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class Mimic3Phonemes:
|
||||
current_settings: Mimic3Settings
|
||||
|
|
@ -80,12 +70,9 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
def __init__(self, settings: Mimic3Settings):
|
||||
self.settings = settings
|
||||
|
||||
# self._current_voice: typing.Optional[LoadedVoice] = None
|
||||
# self._current_settings = self.settings
|
||||
|
||||
self._results: typing.List[typing.Union[BaseResult, Mimic3Phonemes]] = []
|
||||
|
||||
self.loaded_voices: typing.Dict[str, LoadedVoice] = {}
|
||||
self.loaded_voices: typing.Dict[str, Mimic3Voice] = {}
|
||||
|
||||
@property
|
||||
def voice(self) -> str:
|
||||
|
|
@ -107,10 +94,6 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
# TODO: Use speaker map
|
||||
self.speaker_id = int(speaker_id_str)
|
||||
|
||||
# self._current_voice = self._get_or_load_voice(
|
||||
# self.settings.voice or DEFAULT_VOICE
|
||||
# )
|
||||
|
||||
@property
|
||||
def speaker_id(self) -> typing.Optional[int]:
|
||||
return self.settings.speaker_id
|
||||
|
|
@ -131,27 +114,6 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
def get_default_voices_directories() -> typing.List[Path]:
|
||||
return [_DIR.parent.parent / "voices"]
|
||||
|
||||
# @property
|
||||
# def text_lang(self) -> str:
|
||||
# return (
|
||||
# self.settings.text_language
|
||||
# or self.settings.language
|
||||
# or (
|
||||
# self._current_voice.config.text_language
|
||||
# if self._current_voice
|
||||
# else None
|
||||
# )
|
||||
# or "en_US"
|
||||
# )
|
||||
|
||||
# @property
|
||||
# def sample_rate(self) -> int:
|
||||
# return (
|
||||
# self._current_voice.config.audio.sample_rate
|
||||
# if self._current_voice
|
||||
# else self.settings.sample_rate
|
||||
# )
|
||||
|
||||
def get_voices(self) -> typing.Iterable[Voice]:
|
||||
voices_dirs = (
|
||||
self.settings.voices_directories
|
||||
|
|
@ -185,146 +147,71 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
|
||||
def begin_utterance(self):
|
||||
self._results.clear()
|
||||
# self._current_settings = deepcopy(self.settings)
|
||||
|
||||
def speak_text(self, text: str, text_language: typing.Optional[str] = None):
|
||||
text_language = text_language or self.language
|
||||
for sentence in gruut.sentences(text, lang=text_language):
|
||||
sent_phonemes = [w.phonemes for w in sentence if w.phonemes]
|
||||
voice = self._get_or_load_voice(self.voice)
|
||||
|
||||
for sent_phonemes in voice.text_to_phonemes(text, text_language=text_language):
|
||||
self._results.append(
|
||||
Mimic3Phonemes(
|
||||
current_settings=deepcopy(self.settings),
|
||||
phonemes=sent_phonemes,
|
||||
current_settings=deepcopy(self.settings), phonemes=sent_phonemes,
|
||||
)
|
||||
)
|
||||
|
||||
def _speak_sentence_phonemes(
|
||||
self,
|
||||
sent_phonemes,
|
||||
text: typing.Optional[str] = None,
|
||||
settings: typing.Optional[Mimic3Settings] = None,
|
||||
self, sent_phonemes, settings: typing.Optional[Mimic3Settings] = None,
|
||||
) -> AudioResult:
|
||||
settings = settings or self.settings
|
||||
current_voice = self._get_or_load_voice(settings.voice or DEFAULT_VOICE)
|
||||
voice = self._get_or_load_voice(settings.voice or self.voice)
|
||||
sent_phoneme_ids = voice.phonemes_to_ids(sent_phonemes)
|
||||
|
||||
config = current_voice.config
|
||||
onnx_model = current_voice.onnx_model
|
||||
phoneme_to_id = current_voice.phoneme_to_id
|
||||
phoneme_map = current_voice.phoneme_map or config.phonemes.phoneme_map
|
||||
_LOGGER.debug("phonemes=%s, ids=%s", sent_phonemes, sent_phoneme_ids)
|
||||
|
||||
sent_phoneme_ids = phonemes2ids.phonemes2ids(
|
||||
word_phonemes=sent_phonemes,
|
||||
phoneme_to_id=phoneme_to_id,
|
||||
pad=config.phonemes.pad,
|
||||
bos=config.phonemes.bos,
|
||||
eos=config.phonemes.eos,
|
||||
auto_bos_eos=config.phonemes.auto_bos_eos,
|
||||
blank=config.phonemes.blank,
|
||||
blank_word=config.phonemes.blank_word,
|
||||
blank_between=config.phonemes.blank_between,
|
||||
blank_at_start=config.phonemes.blank_at_start,
|
||||
blank_at_end=config.phonemes.blank_at_end,
|
||||
simple_punctuation=config.phonemes.simple_punctuation,
|
||||
punctuation_map=config.phonemes.punctuation_map,
|
||||
separate=config.phonemes.separate,
|
||||
separate_graphemes=config.phonemes.separate_graphemes,
|
||||
separate_tones=config.phonemes.separate_tones,
|
||||
tone_before=config.phonemes.tone_before,
|
||||
phoneme_map=phoneme_map,
|
||||
fail_on_missing=False,
|
||||
audio = voice.ids_to_audio(
|
||||
sent_phoneme_ids,
|
||||
speaker=self.speaker_id,
|
||||
length_scale=settings.length_scale,
|
||||
noise_scale=settings.noise_scale,
|
||||
noise_w=settings.noise_w,
|
||||
)
|
||||
|
||||
if text:
|
||||
_LOGGER.debug("%s %s %s", text, sent_phonemes, sent_phoneme_ids)
|
||||
else:
|
||||
_LOGGER.debug("%s %s", sent_phonemes, sent_phoneme_ids)
|
||||
|
||||
# Create model inputs
|
||||
text_array = np.expand_dims(np.array(sent_phoneme_ids, dtype=np.int64), 0)
|
||||
text_lengths_array = np.array([text_array.shape[1]], dtype=np.int64)
|
||||
scales_array = np.array(
|
||||
[
|
||||
settings.noise_scale,
|
||||
settings.length_scale,
|
||||
settings.noise_w,
|
||||
],
|
||||
dtype=np.float32,
|
||||
)
|
||||
|
||||
inputs = {
|
||||
"input": text_array,
|
||||
"input_lengths": text_lengths_array,
|
||||
"scales": scales_array,
|
||||
}
|
||||
|
||||
if config.is_multispeaker:
|
||||
speaker_id = settings.speaker_id if settings.speaker_id is not None else 0
|
||||
speaker_id_array = np.array([speaker_id], dtype=np.int64)
|
||||
inputs["sid"] = speaker_id_array
|
||||
|
||||
# Infer audio from phonemes
|
||||
start_time = time.perf_counter()
|
||||
audio = onnx_model.run(None, inputs)[0].squeeze()
|
||||
audio = audio_float_to_int16(audio)
|
||||
end_time = time.perf_counter()
|
||||
|
||||
# Compute real-time factor
|
||||
audio_duration_sec = audio.shape[-1] / config.audio.sample_rate
|
||||
infer_sec = end_time - start_time
|
||||
real_time_factor = (
|
||||
infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
|
||||
)
|
||||
|
||||
_LOGGER.debug("RTF: %s", real_time_factor)
|
||||
|
||||
audio_bytes = audio.tobytes()
|
||||
return AudioResult(
|
||||
sample_rate_hz=config.audio.sample_rate,
|
||||
sample_rate_hz=voice.config.audio.sample_rate,
|
||||
audio_bytes=audio_bytes,
|
||||
# 16-bit mono
|
||||
sample_width_bytes=2,
|
||||
num_channels=1,
|
||||
)
|
||||
|
||||
def speak_tokens(self, tokens: typing.Iterable[BaseToken]):
|
||||
def speak_tokens(
|
||||
self,
|
||||
tokens: typing.Iterable[BaseToken],
|
||||
text_language: typing.Optional[str] = None,
|
||||
):
|
||||
voice = self._get_or_load_voice(self.voice)
|
||||
token_phonemes: PHONEMES_LIST = []
|
||||
|
||||
for token in tokens:
|
||||
if isinstance(token, Word):
|
||||
word_role = xmlescape(token.role) if token.role else ""
|
||||
word_text = xmlescape(token.text)
|
||||
|
||||
sentence = next(
|
||||
iter(
|
||||
gruut.sentences(
|
||||
f'<w role="{word_role}">{word_text}</w>', ssml=True
|
||||
)
|
||||
)
|
||||
word_phonemes = voice.word_to_phonemes(
|
||||
token.text, word_role=token.role, text_language=text_language
|
||||
)
|
||||
token_phonemes.extend(w.phonemes for w in sentence if w.phonemes)
|
||||
token_phonemes.append(word_phonemes)
|
||||
elif isinstance(token, Phonemes):
|
||||
phoneme_str = token.text.strip()
|
||||
if " " in phoneme_str:
|
||||
token_phonemes.append(phoneme_str.split())
|
||||
else:
|
||||
token_phonemes.append(list(phoneme_str))
|
||||
token_phonemes.append(list(IPA.graphemes(phoneme_str)))
|
||||
elif isinstance(token, SayAs):
|
||||
word_text = xmlescape(token.text)
|
||||
interpret_as = xmlescape(token.interpret_as)
|
||||
format_attr = (
|
||||
f'format="{xmlescape(token.format)}"' if token.format else ""
|
||||
say_as_phonemes = voice.say_as_to_phonemes(
|
||||
token.text,
|
||||
interpret_as=token.interpret_as,
|
||||
say_format=token.format,
|
||||
text_language=text_language,
|
||||
)
|
||||
|
||||
sentence = next(
|
||||
iter(
|
||||
gruut.sentences(
|
||||
f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
|
||||
ssml=True,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
token_phonemes.extend(w.phonemes for w in sentence if w.phonemes)
|
||||
token_phonemes.extend(say_as_phonemes)
|
||||
|
||||
if token_phonemes:
|
||||
self._results.append(
|
||||
|
|
@ -379,7 +266,7 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
if sent_phonemes:
|
||||
yield self._speak_sentence_phonemes(sent_phonemes)
|
||||
|
||||
def _get_or_load_voice(self, voice_key: str) -> LoadedVoice:
|
||||
def _get_or_load_voice(self, voice_key: str) -> Mimic3Voice:
|
||||
existing_voice = self.loaded_voices.get(voice_key)
|
||||
if existing_voice is not None:
|
||||
return existing_voice
|
||||
|
|
@ -399,57 +286,7 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
|
||||
return existing_voice
|
||||
|
||||
_LOGGER.debug("Loading voice from %s", model_dir)
|
||||
|
||||
config_path = model_dir / "config.json"
|
||||
_LOGGER.debug("Loading model config from %s", config_path)
|
||||
|
||||
with open(config_path, "r", encoding="utf-8") as config_file:
|
||||
config = TrainingConfig.load(config_file)
|
||||
|
||||
# phoneme -> id
|
||||
phoneme_ids_path = model_dir / "phonemes.txt"
|
||||
_LOGGER.debug("Loading model phonemes from %s", phoneme_ids_path)
|
||||
with open(phoneme_ids_path, "r", encoding="utf-8") as ids_file:
|
||||
phoneme_to_id = phonemes2ids.load_phoneme_ids(ids_file)
|
||||
|
||||
generator_path = model_dir / "generator.onnx"
|
||||
_LOGGER.debug("Loading model from %s", generator_path)
|
||||
|
||||
sess_options = onnxruntime.SessionOptions()
|
||||
# sess_options.enable_cpu_mem_arena = False
|
||||
# sess_options.enable_mem_pattern = False
|
||||
# sess_options.enable_mem_reuse = False
|
||||
|
||||
onnx_model = onnxruntime.InferenceSession(
|
||||
str(generator_path), sess_options=sess_options
|
||||
)
|
||||
|
||||
voice = LoadedVoice(
|
||||
config=config, onnx_model=onnx_model, phoneme_to_id=phoneme_to_id
|
||||
)
|
||||
|
||||
# valid_phonemes = []
|
||||
# for phoneme_str in self._phoneme_to_id:
|
||||
# maybe_phoneme = Phoneme(phoneme_str)
|
||||
# if any(
|
||||
# [
|
||||
# maybe_phoneme.vowel,
|
||||
# maybe_phoneme.consonant,
|
||||
# maybe_phoneme.dipthong,
|
||||
# maybe_phoneme.schwa,
|
||||
# ]
|
||||
# ):
|
||||
# valid_phonemes.append(maybe_phoneme)
|
||||
|
||||
# self._voice_phonemes = Phonemes(phonemes=valid_phonemes)
|
||||
|
||||
# phoneme -> phoneme, phoneme, ...
|
||||
phoneme_map_path = model_dir / "phoneme_map.txt"
|
||||
if phoneme_map_path.is_file():
|
||||
_LOGGER.debug("Loading phoneme map from %s", phoneme_map_path)
|
||||
with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
|
||||
voice.phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)
|
||||
voice = Mimic3Voice.load_from_directory(model_dir)
|
||||
|
||||
_LOGGER.info("Loaded voice from %s", model_dir)
|
||||
|
||||
|
|
|
|||
403
mimic3-tts/mimic3_tts/voice.py
Normal file
403
mimic3-tts/mimic3_tts/voice.py
Normal file
|
|
@ -0,0 +1,403 @@
|
|||
#!/usr/bin/env python3
|
||||
import itertools
|
||||
import logging
|
||||
import time
|
||||
import typing
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from pathlib import Path
|
||||
from xml.sax.saxutils import escape as xmlescape
|
||||
|
||||
import espeak_phonemizer
|
||||
import gruut
|
||||
import numpy as np
|
||||
import onnxruntime
|
||||
import phonemes2ids
|
||||
from gruut_ipa import IPA
|
||||
|
||||
from mimic3_tts.config import Phonemizer, TrainingConfig
|
||||
from mimic3_tts.utils import audio_float_to_int16
|
||||
|
||||
PHONEME_TYPE = str
|
||||
PHONEME_ID_TYPE = int
|
||||
WORD_PHONEMES_TYPE = typing.List[typing.List[PHONEME_TYPE]]
|
||||
PHONEME_MAP_TYPE = typing.Dict[PHONEME_TYPE, typing.List[PHONEME_TYPE]]
|
||||
|
||||
SPEAKER_NAME_TYPE = str
|
||||
SPEAKER_ID_TYPE = int
|
||||
SPEAKER_MAP_TYPE = typing.Dict[SPEAKER_NAME_TYPE, SPEAKER_ID_TYPE]
|
||||
|
||||
DEFAULT_LANGUAGE = "en_US"
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class Mimic3Voice(metaclass=ABCMeta):
|
||||
def __init__(
|
||||
self,
|
||||
config: TrainingConfig,
|
||||
onnx_model: onnxruntime.InferenceSession,
|
||||
phoneme_to_id: typing.Dict[PHONEME_TYPE, int],
|
||||
phoneme_map: typing.Optional[PHONEME_MAP_TYPE] = None,
|
||||
speaker_map: typing.Optional[SPEAKER_MAP_TYPE] = None,
|
||||
):
|
||||
self.config = config
|
||||
self.onnx_model = onnx_model
|
||||
self.phoneme_to_id = phoneme_to_id
|
||||
self.phoneme_map = phoneme_map
|
||||
self.speaker_map = speaker_map
|
||||
|
||||
@abstractmethod
|
||||
def text_to_phonemes(
|
||||
self, text: str, text_language: typing.Optional[str] = None
|
||||
) -> typing.Iterable[WORD_PHONEMES_TYPE]:
|
||||
pass
|
||||
|
||||
def word_to_phonemes(
|
||||
self,
|
||||
word_text: str,
|
||||
word_role: typing.Optional[str] = None,
|
||||
text_language: typing.Optional[str] = None,
|
||||
) -> typing.List[PHONEME_TYPE]:
|
||||
word_phonemes = []
|
||||
for sent_phonemes in self.text_to_phonemes(
|
||||
word_text, text_language=text_language
|
||||
):
|
||||
for sent_word_phonemes in sent_phonemes:
|
||||
word_phonemes.extend(sent_word_phonemes)
|
||||
|
||||
return word_phonemes
|
||||
|
||||
def say_as_to_phonemes(
|
||||
self,
|
||||
text: str,
|
||||
interpret_as: str,
|
||||
say_format: typing.Optional[str] = None,
|
||||
text_language: typing.Optional[str] = None,
|
||||
) -> WORD_PHONEMES_TYPE:
|
||||
word_phonemes = []
|
||||
for sent_phonemes in self.text_to_phonemes(text, text_language=text_language):
|
||||
word_phonemes.extend(sent_phonemes)
|
||||
|
||||
return word_phonemes
|
||||
|
||||
def phonemes_to_ids(
|
||||
self, phonemes: WORD_PHONEMES_TYPE
|
||||
) -> typing.Sequence[PHONEME_ID_TYPE]:
|
||||
phoneme_map = self.phoneme_map or self.config.phonemes.phoneme_map
|
||||
|
||||
return phonemes2ids.phonemes2ids(
|
||||
word_phonemes=phonemes,
|
||||
phoneme_to_id=self.phoneme_to_id,
|
||||
pad=self.config.phonemes.pad,
|
||||
bos=self.config.phonemes.bos,
|
||||
eos=self.config.phonemes.eos,
|
||||
auto_bos_eos=self.config.phonemes.auto_bos_eos,
|
||||
blank=self.config.phonemes.blank,
|
||||
blank_word=self.config.phonemes.blank_word,
|
||||
blank_between=self.config.phonemes.blank_between,
|
||||
blank_at_start=self.config.phonemes.blank_at_start,
|
||||
blank_at_end=self.config.phonemes.blank_at_end,
|
||||
simple_punctuation=self.config.phonemes.simple_punctuation,
|
||||
punctuation_map=self.config.phonemes.punctuation_map,
|
||||
separate=self.config.phonemes.separate,
|
||||
separate_graphemes=self.config.phonemes.separate_graphemes,
|
||||
separate_tones=self.config.phonemes.separate_tones,
|
||||
tone_before=self.config.phonemes.tone_before,
|
||||
phoneme_map=phoneme_map,
|
||||
fail_on_missing=False,
|
||||
)
|
||||
|
||||
def ids_to_audio(
|
||||
self,
|
||||
phoneme_ids: typing.Sequence[PHONEME_ID_TYPE],
|
||||
speaker: typing.Optional[
|
||||
typing.Union[SPEAKER_NAME_TYPE, SPEAKER_ID_TYPE]
|
||||
] = None,
|
||||
length_scale: float = 1.0,
|
||||
noise_scale: float = 0.333,
|
||||
noise_w: float = 1.0,
|
||||
) -> np.ndarray:
|
||||
# Create model inputs
|
||||
text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
|
||||
text_lengths_array = np.array([text_array.shape[1]], dtype=np.int64)
|
||||
scales_array = np.array(
|
||||
[noise_scale, length_scale, noise_w,], dtype=np.float32,
|
||||
)
|
||||
|
||||
# TODO: Use settings from voice config
|
||||
inputs = {
|
||||
"input": text_array,
|
||||
"input_lengths": text_lengths_array,
|
||||
"scales": scales_array,
|
||||
}
|
||||
|
||||
if self.config.is_multispeaker:
|
||||
speaker_id = 0
|
||||
if isinstance(speaker, SPEAKER_NAME_TYPE):
|
||||
if self.speaker_map:
|
||||
speaker_id = self.speaker_map.get(speaker, speaker_id)
|
||||
elif speaker is not None:
|
||||
speaker_id = speaker
|
||||
|
||||
speaker_id_array = np.array([speaker_id], dtype=np.int64)
|
||||
inputs["sid"] = speaker_id_array
|
||||
|
||||
# Infer audio from phonemes
|
||||
start_time = time.perf_counter()
|
||||
audio = self.onnx_model.run(None, inputs)[0].squeeze()
|
||||
audio = audio_float_to_int16(audio)
|
||||
end_time = time.perf_counter()
|
||||
|
||||
# Compute real-time factor
|
||||
audio_duration_sec = audio.shape[-1] / self.config.audio.sample_rate
|
||||
infer_sec = end_time - start_time
|
||||
real_time_factor = (
|
||||
infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
|
||||
)
|
||||
|
||||
_LOGGER.debug("RTF: %s", real_time_factor)
|
||||
|
||||
return audio
|
||||
|
||||
@staticmethod
|
||||
def load_from_directory(
|
||||
voice_dir: typing.Union[str, Path],
|
||||
session_options: typing.Optional[onnxruntime.SessionOptions] = None,
|
||||
) -> "Mimic3Voice":
|
||||
voice_dir = Path(voice_dir)
|
||||
_LOGGER.debug("Loading voice from %s", voice_dir)
|
||||
|
||||
config_path = voice_dir / "config.json"
|
||||
_LOGGER.debug("Loading config from %s", config_path)
|
||||
|
||||
with open(config_path, "r", encoding="utf-8") as config_file:
|
||||
config = TrainingConfig.load(config_file)
|
||||
|
||||
# phoneme -> id
|
||||
phoneme_ids_path = voice_dir / "phonemes.txt"
|
||||
_LOGGER.debug("Loading model phonemes from %s", phoneme_ids_path)
|
||||
with open(phoneme_ids_path, "r", encoding="utf-8") as ids_file:
|
||||
phoneme_to_id = phonemes2ids.load_phoneme_ids(ids_file)
|
||||
|
||||
generator_path = voice_dir / "generator.onnx"
|
||||
_LOGGER.debug("Loading model from %s", generator_path)
|
||||
|
||||
# Load onnx model
|
||||
session_options = session_options or onnxruntime.SessionOptions()
|
||||
onnx_model = onnxruntime.InferenceSession(
|
||||
str(generator_path), sess_options=session_options
|
||||
)
|
||||
|
||||
# phoneme -> phoneme, phoneme, ...
|
||||
phoneme_map: typing.Optional[PHONEME_MAP_TYPE] = None
|
||||
phoneme_map_path = voice_dir / "phoneme_map.txt"
|
||||
if phoneme_map_path.is_file():
|
||||
_LOGGER.debug("Loading phoneme map from %s", phoneme_map_path)
|
||||
with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
|
||||
phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)
|
||||
|
||||
# TODO: Load speaker map
|
||||
|
||||
if config.phonemizer == Phonemizer.GRUUT:
|
||||
return GruutVoice(
|
||||
config=config,
|
||||
onnx_model=onnx_model,
|
||||
phoneme_to_id=phoneme_to_id,
|
||||
phoneme_map=phoneme_map,
|
||||
)
|
||||
|
||||
if config.phonemizer == Phonemizer.ESPEAK:
|
||||
return EspeakVoice(
|
||||
config=config,
|
||||
onnx_model=onnx_model,
|
||||
phoneme_to_id=phoneme_to_id,
|
||||
phoneme_map=phoneme_map,
|
||||
)
|
||||
if config.phonemizer == Phonemizer.SYMBOLS:
|
||||
return SymbolsVoice(
|
||||
config=config,
|
||||
onnx_model=onnx_model,
|
||||
phoneme_to_id=phoneme_to_id,
|
||||
phoneme_map=phoneme_map,
|
||||
)
|
||||
|
||||
raise ValueError(f"Unsupported phonemizer: {config.phonemizer}")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class GruutVoice(Mimic3Voice):
|
||||
def text_to_phonemes(
|
||||
self, text: str, text_language: typing.Optional[str] = None
|
||||
) -> typing.Iterable[WORD_PHONEMES_TYPE]:
|
||||
text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
|
||||
for sentence in gruut.sentences(text, lang=text_language):
|
||||
sent_phonemes = [w.phonemes for w in sentence if w.phonemes]
|
||||
if sent_phonemes:
|
||||
yield sent_phonemes
|
||||
|
||||
def word_to_phonemes(
|
||||
self,
|
||||
word_text: str,
|
||||
word_role: typing.Optional[str] = None,
|
||||
text_language: typing.Optional[str] = None,
|
||||
) -> typing.List[PHONEME_TYPE]:
|
||||
text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
|
||||
|
||||
word_role = xmlescape(word_role) if word_role else ""
|
||||
word_text = xmlescape(word_text)
|
||||
|
||||
sentence = next(
|
||||
iter(
|
||||
gruut.sentences(
|
||||
f'<w role="{word_role}">{word_text}</w>',
|
||||
ssml=True,
|
||||
lang=text_language,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
sentence_word = next(iter(sentence))
|
||||
|
||||
return sentence_word.phonemes
|
||||
|
||||
def say_as_to_phonemes(
|
||||
self,
|
||||
text: str,
|
||||
interpret_as: str,
|
||||
say_format: typing.Optional[str] = None,
|
||||
text_language: typing.Optional[str] = None,
|
||||
) -> WORD_PHONEMES_TYPE:
|
||||
text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
|
||||
|
||||
word_text = xmlescape(text)
|
||||
interpret_as = xmlescape(interpret_as)
|
||||
format_attr = f'format="{xmlescape(say_format)}"' if say_format else ""
|
||||
|
||||
sentences = gruut.sentences(
|
||||
f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
|
||||
ssml=True,
|
||||
lang=text_language,
|
||||
)
|
||||
|
||||
sent_phonemes: WORD_PHONEMES_TYPE = []
|
||||
|
||||
for sentence in sentences:
|
||||
sent_phonemes.extend(w.phonemes for w in sentence if w.phonemes)
|
||||
|
||||
return sent_phonemes
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class EspeakVoice(Mimic3Voice):
    """Voice that phonemizes text with espeak-ng via espeak_phonemizer."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # A single phonemizer instance is reused for every request
        self._phonemizer = espeak_phonemizer.Phonemizer()

    def text_to_phonemes(
        self, text: str, text_language: typing.Optional[str] = None
    ) -> typing.Iterable[WORD_PHONEMES_TYPE]:
        """Phonemize plain text, yielding one list of word phoneme lists."""
        word_sep = self.config.phonemes.word_separator
        lang = text_language or self.config.text_language or DEFAULT_LANGUAGE

        phoneme_str = self._phonemizer.phonemize(
            text,
            voice=self._language_to_voice(lang),
            keep_clause_breakers=True,
            phoneme_separator="",
            word_separator=word_sep,
            punctuation_separator="",
        )

        # Split the phonemizer output back into words, then each word
        # into individual IPA symbols.
        yield [
            list(IPA.graphemes(word_str))
            for word_str in phoneme_str.split(word_sep)
        ]

    def word_to_phonemes(
        self,
        word_text: str,
        word_role: typing.Optional[str] = None,
        text_language: typing.Optional[str] = None,
    ) -> typing.List[PHONEME_TYPE]:
        """Phonemize a single word, optionally with an SSML word role."""
        lang = text_language or self.config.text_language or DEFAULT_LANGUAGE

        # Escape caller-supplied text before embedding it in SSML
        word_role = xmlescape(word_role) if word_role else ""
        word_text = xmlescape(word_text)

        phoneme_str = self._phonemizer.phonemize(
            f'<w role="{word_role}">{word_text}</w>',
            voice=self._language_to_voice(lang),
            keep_clause_breakers=True,
            phoneme_separator="",
            punctuation_separator="",
            ssml=True,
        )

        return list(IPA.graphemes(phoneme_str))

    def say_as_to_phonemes(
        self,
        text: str,
        interpret_as: str,
        say_format: typing.Optional[str] = None,
        text_language: typing.Optional[str] = None,
    ) -> WORD_PHONEMES_TYPE:
        """Phonemize text through an SSML <say-as> element."""
        word_sep = self.config.phonemes.word_separator
        lang = text_language or self.config.text_language or DEFAULT_LANGUAGE

        # Escape caller-supplied values before embedding them in SSML
        word_text = xmlescape(text)
        interpret_as = xmlescape(interpret_as)
        format_attr = f'format="{xmlescape(say_format)}"' if say_format else ""

        phoneme_str = self._phonemizer.phonemize(
            f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
            voice=self._language_to_voice(lang),
            keep_clause_breakers=True,
            phoneme_separator="",
            word_separator=word_sep,
            punctuation_separator="",
            ssml=True,
        )

        return [
            list(IPA.graphemes(word_str))
            for word_str in phoneme_str.split(word_sep)
        ]

    def _language_to_voice(self, language: str) -> str:
        """Map a locale code to an espeak voice name (en_US -> en-us)."""
        return language.strip().lower().replace("_", "-")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class SymbolsVoice(Mimic3Voice):
|
||||
def text_to_phonemes(
    self, text: str, text_language: typing.Optional[str] = None
) -> typing.Iterable[WORD_PHONEMES_TYPE]:
    """Treat the input text as raw phoneme symbols.

    The text is split into words on the configured word separator and
    each word is split into individual IPA symbols; no phonemization
    is performed. ``text_language`` is accepted for interface
    compatibility but unused.
    """
    word_sep = self.config.phonemes.word_separator

    yield [
        list(IPA.graphemes(word_str))
        for word_str in text.split(word_sep)
    ]
|
||||
|
|
@ -2,3 +2,6 @@
|
|||
|
||||
[mypy-setuptools.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-onnxruntime.*]
|
||||
ignore_missing_imports = True
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue