Add espeak and symbol voices

This commit is contained in:
Michael Hansen 2022-03-18 17:04:56 -04:00
commit fb9cd71919
6 changed files with 543 additions and 237 deletions

View file

@ -1,2 +1,3 @@
from .tts import Mimic3TextToSpeechSystem, Mimic3Settings
from opentts_abc import AudioResult, MarkResult
from .tts import Mimic3Settings, Mimic3TextToSpeechSystem

View file

@ -5,13 +5,23 @@ import wave
logging.basicConfig(level=logging.DEBUG)
from opentts_abc.ssml import SSMLSpeaker
from mimic3_tts.tts import Mimic3TextToSpeechSystem, Mimic3Settings, AudioResult, MarkResult
settings = Mimic3Settings(length_scale=1.2, noise_w=0)
from mimic3_tts.tts import (
AudioResult,
MarkResult,
Mimic3Settings,
Mimic3TextToSpeechSystem,
)
settings = Mimic3Settings()
tts = Mimic3TextToSpeechSystem(settings)
speaker = SSMLSpeaker(tts)
ssml = '<speak><s><voice name="en_US/vctk_low#20">This is a test.</voice></s></speak>'
# ssml = '<speak><voice name="el_GR/rapunzelina_low"><s><w>Το</w><w>αερόστρωμνό</w><w>μου</w><w>είναι</w><w>γεμάτο</w><w>χέλια.</w></s></voice></speak>'
# ssml = '<speak><voice name="uk_UK/m-ailabs_low"><s><w>бажав</w></s></voice></speak>'
# ssml = '<speak><s><w>Hello</w><w>World</w></s></speak>'
# ssml = '<speak><s>Hello world</s></speak>'
ssml = '<speak><s><voice name="el_GR/rapunzelina_low"><say-as interpret-as="characters">12</say-as></voice></s></speak>'
wav_file: wave.Wave_write = wave.open("out.wav", "wb")
params_set = False

View file

@ -1,18 +1,4 @@
"""Configuration classes"""
# Copyright 2021 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import collections
import json
import typing
@ -20,6 +6,7 @@ from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
import numpy as np
from dataclasses_json import DataClassJsonMixin
from gruut_ipa import IPA
from phonemes2ids import BlankBetween
@ -59,6 +46,51 @@ class AudioConfig(DataClassJsonMixin):
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
# -------------------------------------------------------------------------
# Normalization
# -------------------------------------------------------------------------
def normalize(self, mel_db: np.ndarray) -> np.ndarray:
    """Put values in [0, max_norm] or [-max_norm, max_norm]."""
    # Shift by reference/minimum dB levels, then scale into [0, 1]
    scaled = ((mel_db - self.ref_level_db) - self.min_level_db) / (-self.min_level_db)

    if self.symmetric_norm:
        # Map [0, 1] -> [-max_norm, max_norm]
        result = (2 * self.max_norm) * scaled - self.max_norm
        if self.clip_norm:
            result = np.clip(result, -self.max_norm, self.max_norm)
    else:
        # Map [0, 1] -> [0, max_norm]
        result = self.max_norm * scaled
        if self.clip_norm:
            result = np.clip(result, 0, self.max_norm)

    return result
def denormalize(self, mel_db: np.ndarray) -> np.ndarray:
    """Pull values out of [0, max_norm] or [-max_norm, max_norm].

    Inverse of normalize(); returns values back on the dB scale.

    Bug fix: the original only assigned ``mel_denorm`` inside the
    ``clip_norm`` branch, raising UnboundLocalError whenever
    ``clip_norm`` was False.  Without clipping, the input is used as-is.
    """
    if self.symmetric_norm:
        # Symmetric norm: values came from [-max_norm, max_norm]
        mel_denorm = mel_db
        if self.clip_norm:
            mel_denorm = np.clip(mel_db, -self.max_norm, self.max_norm)

        mel_denorm = (
            (mel_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)
        ) + self.min_level_db
    else:
        # Asymmetric norm: values came from [0, max_norm]
        mel_denorm = mel_db
        if self.clip_norm:
            mel_denorm = np.clip(mel_db, 0, self.max_norm)

        mel_denorm = (
            mel_denorm * -self.min_level_db / self.max_norm
        ) + self.min_level_db

    # Undo the reference-level shift applied during normalization
    mel_denorm += self.ref_level_db

    return mel_denorm
@dataclass
class ModelConfig(DataClassJsonMixin):
@ -84,7 +116,7 @@ class ModelConfig(DataClassJsonMixin):
upsample_kernel_sizes: typing.Tuple[int, ...] = (16, 16, 4, 4)
n_layers_q: int = 3
use_spectral_norm: bool = False
gin_channels: int = 256
gin_channels: int = 0 # single speaker
use_sdp: bool = True # StochasticDurationPredictor
@property
@ -100,7 +132,7 @@ class PhonemesConfig(DataClassJsonMixin):
word_separator: str = "#"
"""Separator between word phonemes in CSV input (must not match phoneme_separator)"""
phoneme_to_id: typing.Optional[typing.Mapping[str, int]] = None
phoneme_to_id: typing.Optional[typing.Dict[str, int]] = None
pad: typing.Optional[str] = "_"
bos: typing.Optional[str] = None
eos: typing.Optional[str] = None
@ -110,15 +142,18 @@ class PhonemesConfig(DataClassJsonMixin):
blank_at_start: bool = True
blank_at_end: bool = True
simple_punctuation: bool = True
punctuation_map: typing.Optional[typing.Mapping[str, str]] = None
punctuation_map: typing.Optional[typing.Dict[str, str]] = None
separate: typing.Optional[typing.List[str]] = None
separate_graphemes: bool = False
separate_tones: bool = False
tone_before: bool = False
phoneme_map: typing.Optional[typing.Mapping[str, str]] = None
phoneme_map: typing.Optional[typing.Dict[str, str]] = None
auto_bos_eos: bool = False
minor_break: typing.Optional[str] = IPA.BREAK_MINOR.value
major_break: typing.Optional[str] = IPA.BREAK_MAJOR.value
break_phonemes_into_graphemes: bool = False
drop_stress: bool = False
symbols: typing.Optional[typing.List[str]] = None
def split_word_phonemes(self, phonemes_str: str) -> typing.List[typing.List[str]]:
"""Split phonemes string into a list of lists (outer is words, inner is individual phonemes in each word)"""
@ -158,8 +193,7 @@ class MetadataFormat(str, Enum):
@dataclass
class DatasetConfig:
name: str
metadata_path: typing.Optional[typing.Union[str, Path]] = None
train_path: typing.Optional[typing.Union[str, Path]] = None
metadata_format: MetadataFormat = MetadataFormat.TEXT
multispeaker: bool = False
text_language: typing.Optional[str] = None
audio_dir: typing.Optional[typing.Union[str, Path]] = None
@ -183,6 +217,13 @@ class AlignerConfig:
casing: typing.Optional[TextCasing] = None
@dataclass
class InferenceConfig:
    """Default inference-time synthesis settings stored with a trained model."""

    # Scale factors fed to the generator's "scales" input at synthesis time
    length_scale: float = 1.0
    noise_scale: float = 0.667
    noise_w: float = 0.8
@dataclass
class TrainingConfig(DataClassJsonMixin):
seed: int = 1234
@ -206,6 +247,8 @@ class TrainingConfig(DataClassJsonMixin):
min_spec_length: typing.Optional[int] = None
max_spec_length: typing.Optional[int] = None
min_speaker_utterances: typing.Optional[int] = None
last_epoch: int = 1
global_step: int = 1
best_loss: typing.Optional[float] = None
@ -216,22 +259,31 @@ class TrainingConfig(DataClassJsonMixin):
text_language: typing.Optional[str] = None
phonemizer: typing.Optional[Phonemizer] = None
datasets: typing.List[DatasetConfig] = field(default_factory=list)
dataset_format: MetadataFormat = MetadataFormat.TEXT
inference: InferenceConfig = field(default_factory=InferenceConfig)
version: int = 1
git_commit: str = ""
@property
def is_multispeaker(self):
    """True when the model itself or any configured dataset is multi-speaker."""
    if self.model.is_multispeaker:
        return True
    return any(d.multispeaker for d in self.datasets)
def save(self, config_file: typing.TextIO):
    """Serialize this config as indented JSON into an open text file."""
    config_file.write(json.dumps(self.to_dict(), indent=4))
def get_speaker_id(self, dataset_name: str, speaker_name: str) -> int:
    """Return a stable integer id for a dataset/speaker pair.

    Ids are assigned densely in order of first appearance and cached in
    ``speaker_id_map`` (created on first use).
    """
    if self.speaker_id_map is None:
        self.speaker_id_map = {}

    key = f"{dataset_name}_{speaker_name}"
    if key not in self.speaker_id_map:
        # New speaker: next free id is the current map size
        self.speaker_id_map[key] = len(self.speaker_id_map)

    return self.speaker_id_map[key]
@staticmethod
def load(config_file: typing.TextIO) -> "TrainingConfig":
"""Load config from a JSON file"""

View file

@ -1,11 +1,9 @@
#!/usr/bin/env python3
import dataclasses
import logging
import time
import typing
from abc import ABCMeta
from dataclasses import dataclass, field
from copy import deepcopy
from dataclasses import dataclass, field
from pathlib import Path
from xml.sax.saxutils import escape as xmlescape
@ -14,22 +12,22 @@ import numpy as np
import onnxruntime
import phonemes2ids
from gruut.const import LookupPhonemes, WordRole
from gruut_ipa import guess_phonemes, IPA, Phonemes, Phoneme
from gruut_ipa import IPA, Phoneme, guess_phonemes
from opentts_abc import (
TextToSpeechSystem,
Voice,
BaseToken,
BaseResult,
MarkResult,
AudioResult,
Word,
BaseResult,
BaseToken,
MarkResult,
Phonemes,
SayAs,
TextToSpeechSystem,
Voice,
Word,
)
from mimic3_tts.config import TrainingConfig
from mimic3_tts.utils import audio_float_to_int16
from mimic3_tts.voice import Mimic3Voice
_DIR = Path(__file__).parent
@ -51,20 +49,12 @@ class Mimic3Settings:
voices_directories: typing.Optional[typing.Iterable[typing.Union[str, Path]]] = None
speaker_id: typing.Optional[int] = None
length_scale: float = 1.0
noise_scale: float = 0.333
noise_w: float = 1.0
noise_scale: float = 0.667
noise_w: float = 0.8
text_language: typing.Optional[str] = None
sample_rate: int = 22050
@dataclass
class LoadedVoice:
config: TrainingConfig
onnx_model: onnxruntime.InferenceSession
phoneme_to_id: typing.Mapping[str, int]
phoneme_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None
@dataclass
class Mimic3Phonemes:
current_settings: Mimic3Settings
@ -80,12 +70,9 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
def __init__(self, settings: Mimic3Settings):
self.settings = settings
# self._current_voice: typing.Optional[LoadedVoice] = None
# self._current_settings = self.settings
self._results: typing.List[typing.Union[BaseResult, Mimic3Phonemes]] = []
self.loaded_voices: typing.Dict[str, LoadedVoice] = {}
self.loaded_voices: typing.Dict[str, Mimic3Voice] = {}
@property
def voice(self) -> str:
@ -107,10 +94,6 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
# TODO: Use speaker map
self.speaker_id = int(speaker_id_str)
# self._current_voice = self._get_or_load_voice(
# self.settings.voice or DEFAULT_VOICE
# )
@property
def speaker_id(self) -> typing.Optional[int]:
return self.settings.speaker_id
@ -131,27 +114,6 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
def get_default_voices_directories() -> typing.List[Path]:
return [_DIR.parent.parent / "voices"]
# @property
# def text_lang(self) -> str:
# return (
# self.settings.text_language
# or self.settings.language
# or (
# self._current_voice.config.text_language
# if self._current_voice
# else None
# )
# or "en_US"
# )
# @property
# def sample_rate(self) -> int:
# return (
# self._current_voice.config.audio.sample_rate
# if self._current_voice
# else self.settings.sample_rate
# )
def get_voices(self) -> typing.Iterable[Voice]:
voices_dirs = (
self.settings.voices_directories
@ -185,146 +147,71 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
def begin_utterance(self):
self._results.clear()
# self._current_settings = deepcopy(self.settings)
def speak_text(self, text: str, text_language: typing.Optional[str] = None):
text_language = text_language or self.language
for sentence in gruut.sentences(text, lang=text_language):
sent_phonemes = [w.phonemes for w in sentence if w.phonemes]
voice = self._get_or_load_voice(self.voice)
for sent_phonemes in voice.text_to_phonemes(text, text_language=text_language):
self._results.append(
Mimic3Phonemes(
current_settings=deepcopy(self.settings),
phonemes=sent_phonemes,
current_settings=deepcopy(self.settings), phonemes=sent_phonemes,
)
)
def _speak_sentence_phonemes(
self,
sent_phonemes,
text: typing.Optional[str] = None,
settings: typing.Optional[Mimic3Settings] = None,
self, sent_phonemes, settings: typing.Optional[Mimic3Settings] = None,
) -> AudioResult:
settings = settings or self.settings
current_voice = self._get_or_load_voice(settings.voice or DEFAULT_VOICE)
voice = self._get_or_load_voice(settings.voice or self.voice)
sent_phoneme_ids = voice.phonemes_to_ids(sent_phonemes)
config = current_voice.config
onnx_model = current_voice.onnx_model
phoneme_to_id = current_voice.phoneme_to_id
phoneme_map = current_voice.phoneme_map or config.phonemes.phoneme_map
_LOGGER.debug("phonemes=%s, ids=%s", sent_phonemes, sent_phoneme_ids)
sent_phoneme_ids = phonemes2ids.phonemes2ids(
word_phonemes=sent_phonemes,
phoneme_to_id=phoneme_to_id,
pad=config.phonemes.pad,
bos=config.phonemes.bos,
eos=config.phonemes.eos,
auto_bos_eos=config.phonemes.auto_bos_eos,
blank=config.phonemes.blank,
blank_word=config.phonemes.blank_word,
blank_between=config.phonemes.blank_between,
blank_at_start=config.phonemes.blank_at_start,
blank_at_end=config.phonemes.blank_at_end,
simple_punctuation=config.phonemes.simple_punctuation,
punctuation_map=config.phonemes.punctuation_map,
separate=config.phonemes.separate,
separate_graphemes=config.phonemes.separate_graphemes,
separate_tones=config.phonemes.separate_tones,
tone_before=config.phonemes.tone_before,
phoneme_map=phoneme_map,
fail_on_missing=False,
audio = voice.ids_to_audio(
sent_phoneme_ids,
speaker=self.speaker_id,
length_scale=settings.length_scale,
noise_scale=settings.noise_scale,
noise_w=settings.noise_w,
)
if text:
_LOGGER.debug("%s %s %s", text, sent_phonemes, sent_phoneme_ids)
else:
_LOGGER.debug("%s %s", sent_phonemes, sent_phoneme_ids)
# Create model inputs
text_array = np.expand_dims(np.array(sent_phoneme_ids, dtype=np.int64), 0)
text_lengths_array = np.array([text_array.shape[1]], dtype=np.int64)
scales_array = np.array(
[
settings.noise_scale,
settings.length_scale,
settings.noise_w,
],
dtype=np.float32,
)
inputs = {
"input": text_array,
"input_lengths": text_lengths_array,
"scales": scales_array,
}
if config.is_multispeaker:
speaker_id = settings.speaker_id if settings.speaker_id is not None else 0
speaker_id_array = np.array([speaker_id], dtype=np.int64)
inputs["sid"] = speaker_id_array
# Infer audio from phonemes
start_time = time.perf_counter()
audio = onnx_model.run(None, inputs)[0].squeeze()
audio = audio_float_to_int16(audio)
end_time = time.perf_counter()
# Compute real-time factor
audio_duration_sec = audio.shape[-1] / config.audio.sample_rate
infer_sec = end_time - start_time
real_time_factor = (
infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
)
_LOGGER.debug("RTF: %s", real_time_factor)
audio_bytes = audio.tobytes()
return AudioResult(
sample_rate_hz=config.audio.sample_rate,
sample_rate_hz=voice.config.audio.sample_rate,
audio_bytes=audio_bytes,
# 16-bit mono
sample_width_bytes=2,
num_channels=1,
)
def speak_tokens(self, tokens: typing.Iterable[BaseToken]):
def speak_tokens(
self,
tokens: typing.Iterable[BaseToken],
text_language: typing.Optional[str] = None,
):
voice = self._get_or_load_voice(self.voice)
token_phonemes: PHONEMES_LIST = []
for token in tokens:
if isinstance(token, Word):
word_role = xmlescape(token.role) if token.role else ""
word_text = xmlescape(token.text)
sentence = next(
iter(
gruut.sentences(
f'<w role="{word_role}">{word_text}</w>', ssml=True
)
)
word_phonemes = voice.word_to_phonemes(
token.text, word_role=token.role, text_language=text_language
)
token_phonemes.extend(w.phonemes for w in sentence if w.phonemes)
token_phonemes.append(word_phonemes)
elif isinstance(token, Phonemes):
phoneme_str = token.text.strip()
if " " in phoneme_str:
token_phonemes.append(phoneme_str.split())
else:
token_phonemes.append(list(phoneme_str))
token_phonemes.append(list(IPA.graphemes(phoneme_str)))
elif isinstance(token, SayAs):
word_text = xmlescape(token.text)
interpret_as = xmlescape(token.interpret_as)
format_attr = (
f'format="{xmlescape(token.format)}"' if token.format else ""
say_as_phonemes = voice.say_as_to_phonemes(
token.text,
interpret_as=token.interpret_as,
say_format=token.format,
text_language=text_language,
)
sentence = next(
iter(
gruut.sentences(
f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
ssml=True,
)
)
)
token_phonemes.extend(w.phonemes for w in sentence if w.phonemes)
token_phonemes.extend(say_as_phonemes)
if token_phonemes:
self._results.append(
@ -379,7 +266,7 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
if sent_phonemes:
yield self._speak_sentence_phonemes(sent_phonemes)
def _get_or_load_voice(self, voice_key: str) -> LoadedVoice:
def _get_or_load_voice(self, voice_key: str) -> Mimic3Voice:
existing_voice = self.loaded_voices.get(voice_key)
if existing_voice is not None:
return existing_voice
@ -399,57 +286,7 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
return existing_voice
_LOGGER.debug("Loading voice from %s", model_dir)
config_path = model_dir / "config.json"
_LOGGER.debug("Loading model config from %s", config_path)
with open(config_path, "r", encoding="utf-8") as config_file:
config = TrainingConfig.load(config_file)
# phoneme -> id
phoneme_ids_path = model_dir / "phonemes.txt"
_LOGGER.debug("Loading model phonemes from %s", phoneme_ids_path)
with open(phoneme_ids_path, "r", encoding="utf-8") as ids_file:
phoneme_to_id = phonemes2ids.load_phoneme_ids(ids_file)
generator_path = model_dir / "generator.onnx"
_LOGGER.debug("Loading model from %s", generator_path)
sess_options = onnxruntime.SessionOptions()
# sess_options.enable_cpu_mem_arena = False
# sess_options.enable_mem_pattern = False
# sess_options.enable_mem_reuse = False
onnx_model = onnxruntime.InferenceSession(
str(generator_path), sess_options=sess_options
)
voice = LoadedVoice(
config=config, onnx_model=onnx_model, phoneme_to_id=phoneme_to_id
)
# valid_phonemes = []
# for phoneme_str in self._phoneme_to_id:
# maybe_phoneme = Phoneme(phoneme_str)
# if any(
# [
# maybe_phoneme.vowel,
# maybe_phoneme.consonant,
# maybe_phoneme.dipthong,
# maybe_phoneme.schwa,
# ]
# ):
# valid_phonemes.append(maybe_phoneme)
# self._voice_phonemes = Phonemes(phonemes=valid_phonemes)
# phoneme -> phoneme, phoneme, ...
phoneme_map_path = model_dir / "phoneme_map.txt"
if phoneme_map_path.is_file():
_LOGGER.debug("Loading phoneme map from %s", phoneme_map_path)
with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
voice.phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)
voice = Mimic3Voice.load_from_directory(model_dir)
_LOGGER.info("Loaded voice from %s", model_dir)

View file

@ -0,0 +1,403 @@
#!/usr/bin/env python3
import itertools
import logging
import time
import typing
from abc import ABCMeta, abstractmethod
from pathlib import Path
from xml.sax.saxutils import escape as xmlescape
import espeak_phonemizer
import gruut
import numpy as np
import onnxruntime
import phonemes2ids
from gruut_ipa import IPA
from mimic3_tts.config import Phonemizer, TrainingConfig
from mimic3_tts.utils import audio_float_to_int16
# Type aliases for phoneme/speaker structures used throughout this module
PHONEME_TYPE = str
PHONEME_ID_TYPE = int

# Outer list is words; inner list is the phonemes within one word
WORD_PHONEMES_TYPE = typing.List[typing.List[PHONEME_TYPE]]

# phoneme -> replacement phoneme(s)
PHONEME_MAP_TYPE = typing.Dict[PHONEME_TYPE, typing.List[PHONEME_TYPE]]

SPEAKER_NAME_TYPE = str
SPEAKER_ID_TYPE = int

# speaker name -> speaker id (multispeaker models)
SPEAKER_MAP_TYPE = typing.Dict[SPEAKER_NAME_TYPE, SPEAKER_ID_TYPE]

# Fallback when neither the call nor the voice config specifies a language
DEFAULT_LANGUAGE = "en_US"

_LOGGER = logging.getLogger(__name__)
# -----------------------------------------------------------------------------
class Mimic3Voice(metaclass=ABCMeta):
    """A loaded Mimic 3 voice: a phonemizer front-end plus an onnx generator.

    Subclasses implement text_to_phonemes() for a particular phonemizer
    (gruut, espeak, raw symbols).  The shared methods here convert phonemes
    to model input ids (phonemes_to_ids) and run the onnx model to produce
    audio samples (ids_to_audio).
    """

    def __init__(
        self,
        config: TrainingConfig,
        onnx_model: onnxruntime.InferenceSession,
        phoneme_to_id: typing.Dict[PHONEME_TYPE, int],
        phoneme_map: typing.Optional[PHONEME_MAP_TYPE] = None,
        speaker_map: typing.Optional[SPEAKER_MAP_TYPE] = None,
    ):
        # Training config loaded from the voice's config.json
        self.config = config
        # onnxruntime session for the generator model
        self.onnx_model = onnx_model
        # phoneme string -> integer model input id
        self.phoneme_to_id = phoneme_to_id
        # optional phoneme -> replacement phoneme(s) map (overrides config's map)
        self.phoneme_map = phoneme_map
        # optional speaker name -> id map for multispeaker models
        self.speaker_map = speaker_map

    @abstractmethod
    def text_to_phonemes(
        self, text: str, text_language: typing.Optional[str] = None
    ) -> typing.Iterable[WORD_PHONEMES_TYPE]:
        """Yield one item per sentence: a list of words, each a list of phonemes."""
        pass

    def word_to_phonemes(
        self,
        word_text: str,
        word_role: typing.Optional[str] = None,
        text_language: typing.Optional[str] = None,
    ) -> typing.List[PHONEME_TYPE]:
        """Return the flattened phonemes of a single word.

        word_role is unused in this default implementation; subclasses may
        honor it.
        """
        word_phonemes = []
        for sent_phonemes in self.text_to_phonemes(
            word_text, text_language=text_language
        ):
            for sent_word_phonemes in sent_phonemes:
                word_phonemes.extend(sent_word_phonemes)

        return word_phonemes

    def say_as_to_phonemes(
        self,
        text: str,
        interpret_as: str,
        say_format: typing.Optional[str] = None,
        text_language: typing.Optional[str] = None,
    ) -> WORD_PHONEMES_TYPE:
        """Return word phonemes for SSML <say-as> text.

        This default implementation ignores interpret_as/say_format and
        phonemizes the text directly; subclasses may honor them.
        """
        word_phonemes = []
        for sent_phonemes in self.text_to_phonemes(text, text_language=text_language):
            word_phonemes.extend(sent_phonemes)

        return word_phonemes

    def phonemes_to_ids(
        self, phonemes: WORD_PHONEMES_TYPE
    ) -> typing.Sequence[PHONEME_ID_TYPE]:
        """Convert word phonemes to model input ids using the voice's phoneme settings."""
        # The voice-level map (see load_from_directory) takes precedence over
        # the map embedded in the training config
        phoneme_map = self.phoneme_map or self.config.phonemes.phoneme_map

        return phonemes2ids.phonemes2ids(
            word_phonemes=phonemes,
            phoneme_to_id=self.phoneme_to_id,
            pad=self.config.phonemes.pad,
            bos=self.config.phonemes.bos,
            eos=self.config.phonemes.eos,
            auto_bos_eos=self.config.phonemes.auto_bos_eos,
            blank=self.config.phonemes.blank,
            blank_word=self.config.phonemes.blank_word,
            blank_between=self.config.phonemes.blank_between,
            blank_at_start=self.config.phonemes.blank_at_start,
            blank_at_end=self.config.phonemes.blank_at_end,
            simple_punctuation=self.config.phonemes.simple_punctuation,
            punctuation_map=self.config.phonemes.punctuation_map,
            separate=self.config.phonemes.separate,
            separate_graphemes=self.config.phonemes.separate_graphemes,
            separate_tones=self.config.phonemes.separate_tones,
            tone_before=self.config.phonemes.tone_before,
            phoneme_map=phoneme_map,
            fail_on_missing=False,
        )

    def ids_to_audio(
        self,
        phoneme_ids: typing.Sequence[PHONEME_ID_TYPE],
        speaker: typing.Optional[
            typing.Union[SPEAKER_NAME_TYPE, SPEAKER_ID_TYPE]
        ] = None,
        length_scale: float = 1.0,
        noise_scale: float = 0.333,
        noise_w: float = 1.0,
    ) -> np.ndarray:
        """Run the onnx generator on phoneme ids and return 16-bit audio samples.

        speaker may be a name (looked up in speaker_map) or a raw id; it is
        only used for multispeaker models.
        """
        # Create model inputs
        text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        text_lengths_array = np.array([text_array.shape[1]], dtype=np.int64)
        scales_array = np.array(
            [noise_scale, length_scale, noise_w,], dtype=np.float32,
        )

        # TODO: Use settings from voice config
        inputs = {
            "input": text_array,
            "input_lengths": text_lengths_array,
            "scales": scales_array,
        }

        if self.config.is_multispeaker:
            # Fall back to speaker 0 when no (or an unknown) speaker is given
            speaker_id = 0

            if isinstance(speaker, SPEAKER_NAME_TYPE):
                if self.speaker_map:
                    speaker_id = self.speaker_map.get(speaker, speaker_id)
            elif speaker is not None:
                speaker_id = speaker

            speaker_id_array = np.array([speaker_id], dtype=np.int64)
            inputs["sid"] = speaker_id_array

        # Infer audio from phonemes
        start_time = time.perf_counter()
        audio = self.onnx_model.run(None, inputs)[0].squeeze()
        audio = audio_float_to_int16(audio)
        end_time = time.perf_counter()

        # Compute real-time factor (inference time / audio duration)
        audio_duration_sec = audio.shape[-1] / self.config.audio.sample_rate
        infer_sec = end_time - start_time
        real_time_factor = (
            infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
        )

        _LOGGER.debug("RTF: %s", real_time_factor)

        return audio

    @staticmethod
    def load_from_directory(
        voice_dir: typing.Union[str, Path],
        session_options: typing.Optional[onnxruntime.SessionOptions] = None,
    ) -> "Mimic3Voice":
        """Load a voice from a directory containing config.json, phonemes.txt,
        generator.onnx, and optionally phoneme_map.txt.

        Returns the phonemizer-specific subclass selected by the config.
        """
        voice_dir = Path(voice_dir)
        _LOGGER.debug("Loading voice from %s", voice_dir)

        config_path = voice_dir / "config.json"
        _LOGGER.debug("Loading config from %s", config_path)
        with open(config_path, "r", encoding="utf-8") as config_file:
            config = TrainingConfig.load(config_file)

        # phoneme -> id
        phoneme_ids_path = voice_dir / "phonemes.txt"
        _LOGGER.debug("Loading model phonemes from %s", phoneme_ids_path)
        with open(phoneme_ids_path, "r", encoding="utf-8") as ids_file:
            phoneme_to_id = phonemes2ids.load_phoneme_ids(ids_file)

        generator_path = voice_dir / "generator.onnx"
        _LOGGER.debug("Loading model from %s", generator_path)

        # Load onnx model
        session_options = session_options or onnxruntime.SessionOptions()
        onnx_model = onnxruntime.InferenceSession(
            str(generator_path), sess_options=session_options
        )

        # phoneme -> phoneme, phoneme, ...
        phoneme_map: typing.Optional[PHONEME_MAP_TYPE] = None
        phoneme_map_path = voice_dir / "phoneme_map.txt"
        if phoneme_map_path.is_file():
            _LOGGER.debug("Loading phoneme map from %s", phoneme_map_path)
            with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
                phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)

        # TODO: Load speaker map

        if config.phonemizer == Phonemizer.GRUUT:
            return GruutVoice(
                config=config,
                onnx_model=onnx_model,
                phoneme_to_id=phoneme_to_id,
                phoneme_map=phoneme_map,
            )

        if config.phonemizer == Phonemizer.ESPEAK:
            return EspeakVoice(
                config=config,
                onnx_model=onnx_model,
                phoneme_to_id=phoneme_to_id,
                phoneme_map=phoneme_map,
            )

        if config.phonemizer == Phonemizer.SYMBOLS:
            return SymbolsVoice(
                config=config,
                onnx_model=onnx_model,
                phoneme_to_id=phoneme_to_id,
                phoneme_map=phoneme_map,
            )

        raise ValueError(f"Unsupported phonemizer: {config.phonemizer}")
# -----------------------------------------------------------------------------
class GruutVoice(Mimic3Voice):
    """Voice that phonemizes text with gruut."""

    def text_to_phonemes(
        self, text: str, text_language: typing.Optional[str] = None
    ) -> typing.Iterable[WORD_PHONEMES_TYPE]:
        """Yield per-sentence word phonemes from gruut's sentence phonemizer."""
        text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE

        for sentence in gruut.sentences(text, lang=text_language):
            # Keep only words gruut produced phonemes for
            sent_phonemes = [w.phonemes for w in sentence if w.phonemes]

            if sent_phonemes:
                yield sent_phonemes

    def word_to_phonemes(
        self,
        word_text: str,
        word_role: typing.Optional[str] = None,
        text_language: typing.Optional[str] = None,
    ) -> typing.List[PHONEME_TYPE]:
        """Phonemize a single word via gruut SSML, honoring an optional word role."""
        text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE

        # Values are interpolated into SSML, so they must be XML-escaped
        word_role = xmlescape(word_role) if word_role else ""
        word_text = xmlescape(word_text)

        sentence = next(
            iter(
                gruut.sentences(
                    f'<w role="{word_role}">{word_text}</w>',
                    ssml=True,
                    lang=text_language,
                )
            )
        )

        sentence_word = next(iter(sentence))

        return sentence_word.phonemes

    def say_as_to_phonemes(
        self,
        text: str,
        interpret_as: str,
        say_format: typing.Optional[str] = None,
        text_language: typing.Optional[str] = None,
    ) -> WORD_PHONEMES_TYPE:
        """Phonemize text via gruut's SSML <say-as> handling."""
        text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE

        # Values are interpolated into SSML, so they must be XML-escaped
        word_text = xmlescape(text)
        interpret_as = xmlescape(interpret_as)
        format_attr = f'format="{xmlescape(say_format)}"' if say_format else ""

        sentences = gruut.sentences(
            f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
            ssml=True,
            lang=text_language,
        )

        sent_phonemes: WORD_PHONEMES_TYPE = []
        for sentence in sentences:
            sent_phonemes.extend(w.phonemes for w in sentence if w.phonemes)

        return sent_phonemes
# -----------------------------------------------------------------------------
class EspeakVoice(Mimic3Voice):
    """Voice that phonemizes text with espeak (via espeak_phonemizer)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # One phonemizer instance per voice, reused across calls
        self._phonemizer = espeak_phonemizer.Phonemizer()

    def text_to_phonemes(
        self, text: str, text_language: typing.Optional[str] = None
    ) -> typing.Iterable[WORD_PHONEMES_TYPE]:
        """Yield word phonemes for the text (a single item covering the whole text)."""
        # Phonemes within a word are not separated; words are joined by the
        # configured word separator so they can be split apart below
        phoneme_separator = ""
        word_separator = self.config.phonemes.word_separator

        text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
        voice = self._language_to_voice(text_language)

        phoneme_str = self._phonemizer.phonemize(
            text,
            voice=voice,
            keep_clause_breakers=True,
            phoneme_separator=phoneme_separator,
            word_separator=word_separator,
            punctuation_separator=phoneme_separator,
        )

        # Break each word's phoneme string into individual IPA graphemes
        word_phonemes = [
            list(IPA.graphemes(wp_str)) for wp_str in phoneme_str.split(word_separator)
        ]

        yield word_phonemes

    def word_to_phonemes(
        self,
        word_text: str,
        word_role: typing.Optional[str] = None,
        text_language: typing.Optional[str] = None,
    ) -> typing.List[PHONEME_TYPE]:
        """Phonemize a single word via espeak's SSML input, honoring an optional role."""
        phoneme_separator = ""

        text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE

        # Values are interpolated into SSML, so they must be XML-escaped
        word_role = xmlescape(word_role) if word_role else ""
        word_text = xmlescape(word_text)

        voice = self._language_to_voice(text_language)

        phoneme_str = self._phonemizer.phonemize(
            f'<w role="{word_role}">{word_text}</w>',
            voice=voice,
            keep_clause_breakers=True,
            phoneme_separator=phoneme_separator,
            punctuation_separator=phoneme_separator,
            ssml=True,
        )

        word_phonemes = list(IPA.graphemes(phoneme_str))

        return word_phonemes

    def say_as_to_phonemes(
        self,
        text: str,
        interpret_as: str,
        say_format: typing.Optional[str] = None,
        text_language: typing.Optional[str] = None,
    ) -> WORD_PHONEMES_TYPE:
        """Phonemize text via espeak's SSML <say-as> handling."""
        phoneme_separator = ""
        word_separator = self.config.phonemes.word_separator

        text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE

        # Values are interpolated into SSML, so they must be XML-escaped
        word_text = xmlescape(text)
        interpret_as = xmlescape(interpret_as)
        format_attr = f'format="{xmlescape(say_format)}"' if say_format else ""

        voice = self._language_to_voice(text_language)

        phoneme_str = self._phonemizer.phonemize(
            f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
            voice=voice,
            keep_clause_breakers=True,
            phoneme_separator=phoneme_separator,
            punctuation_separator=phoneme_separator,
            word_separator=word_separator,
            ssml=True,
        )

        word_phonemes = [
            list(IPA.graphemes(wp_str)) for wp_str in phoneme_str.split(word_separator)
        ]

        return word_phonemes

    def _language_to_voice(self, language: str) -> str:
        """Convert a locale-style language code to an espeak voice name."""
        # en_US -> en-us
        return language.strip().lower().replace("_", "-")
# -----------------------------------------------------------------------------
class SymbolsVoice(Mimic3Voice):
    """Voice whose input text is already a sequence of phoneme symbols."""

    def text_to_phonemes(
        self, text: str, text_language: typing.Optional[str] = None
    ) -> typing.Iterable[WORD_PHONEMES_TYPE]:
        """Split the text on the configured word separator; each chunk's IPA
        graphemes become that word's phonemes.  text_language is unused."""
        separator = self.config.phonemes.word_separator
        yield [list(IPA.graphemes(chunk)) for chunk in text.split(separator)]

View file

@ -2,3 +2,6 @@
[mypy-setuptools.*]
ignore_missing_imports = True
[mypy-onnxruntime.*]
ignore_missing_imports = True