diff --git a/mimic3-tts/mimic3_tts/__init__.py b/mimic3-tts/mimic3_tts/__init__.py index 012fd97..087a94a 100644 --- a/mimic3-tts/mimic3_tts/__init__.py +++ b/mimic3-tts/mimic3_tts/__init__.py @@ -1,2 +1,3 @@ -from .tts import Mimic3TextToSpeechSystem, Mimic3Settings from opentts_abc import AudioResult, MarkResult + +from .tts import Mimic3Settings, Mimic3TextToSpeechSystem diff --git a/mimic3-tts/mimic3_tts/__main__.py b/mimic3-tts/mimic3_tts/__main__.py index 1309492..72ed425 100644 --- a/mimic3-tts/mimic3_tts/__main__.py +++ b/mimic3-tts/mimic3_tts/__main__.py @@ -5,13 +5,23 @@ import wave logging.basicConfig(level=logging.DEBUG) from opentts_abc.ssml import SSMLSpeaker -from mimic3_tts.tts import Mimic3TextToSpeechSystem, Mimic3Settings, AudioResult, MarkResult -settings = Mimic3Settings(length_scale=1.2, noise_w=0) +from mimic3_tts.tts import ( + AudioResult, + MarkResult, + Mimic3Settings, + Mimic3TextToSpeechSystem, +) + +settings = Mimic3Settings() tts = Mimic3TextToSpeechSystem(settings) speaker = SSMLSpeaker(tts) -ssml = 'This is a test.' +# ssml = 'Τοαερόστρωμνόμουείναιγεμάτοχέλια.' +# ssml = 'бажав' +# ssml = 'HelloWorld' +# ssml = 'Hello world' +ssml = '12' wav_file: wave.Wave_write = wave.open("out.wav", "wb") params_set = False diff --git a/mimic3-tts/mimic3_tts/config.py b/mimic3-tts/mimic3_tts/config.py index d9f4f26..87edf5a 100644 --- a/mimic3-tts/mimic3_tts/config.py +++ b/mimic3-tts/mimic3_tts/config.py @@ -1,18 +1,4 @@ """Configuration classes""" -# Copyright 2021 Mycroft AI Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# -------------------------------------------------------------------------
# Normalization (methods of AudioConfig)
# -------------------------------------------------------------------------


def normalize(self, mel_db: np.ndarray) -> np.ndarray:
    """Put values in [0, max_norm] or [-max_norm, max_norm].

    The reference level is subtracted first, then the span from
    ``min_level_db`` up to 0 is mapped linearly onto [0, 1].  That unit
    range is stretched to [-max_norm, max_norm] when ``symmetric_norm`` is
    set, or to [0, max_norm] otherwise.  Values are clamped to the target
    range only when ``clip_norm`` is set.

    Args:
        mel_db: array of mel values (the arithmetic below treats them as dB
            levels relative to ``ref_level_db``).

    Returns:
        A new array of normalized values; ``mel_db`` is not modified.
    """
    # (mel_db - ref) == min_level_db -> 0.0 ; (mel_db - ref) == 0 -> 1.0
    mel_norm = ((mel_db - self.ref_level_db) - self.min_level_db) / (
        -self.min_level_db
    )

    if self.symmetric_norm:
        # Symmetric norm: [0, 1] -> [-max_norm, max_norm]
        mel_norm = ((2 * self.max_norm) * mel_norm) - self.max_norm
        if self.clip_norm:
            mel_norm = np.clip(mel_norm, -self.max_norm, self.max_norm)
    else:
        # Asymmetric norm: [0, 1] -> [0, max_norm]
        mel_norm = self.max_norm * mel_norm
        if self.clip_norm:
            mel_norm = np.clip(mel_norm, 0, self.max_norm)

    return mel_norm


def denormalize(self, mel_db: np.ndarray) -> np.ndarray:
    """Pull values out of [0, max_norm] or [-max_norm, max_norm].

    Inverse of ``normalize`` for in-range values.  When ``clip_norm`` is set
    the input is clamped to the normalized range before the mapping is
    undone; when it is not set the input is used as-is.

    Args:
        mel_db: array of *normalized* values (the parameter name is kept for
            interface compatibility even though the input is not in dB).

    Returns:
        A new array of de-normalized values; ``mel_db`` is not modified.
    """
    # Start from the input so the value is defined even when clipping is
    # disabled (previously it was only bound inside the clip_norm branch,
    # which raised UnboundLocalError for clip_norm=False).
    mel_denorm = mel_db

    if self.symmetric_norm:
        # Symmetric norm: [-max_norm, max_norm] -> [min_level_db, 0]
        if self.clip_norm:
            mel_denorm = np.clip(mel_denorm, -self.max_norm, self.max_norm)

        mel_denorm = (
            (mel_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)
        ) + self.min_level_db
    else:
        # Asymmetric norm: [0, max_norm] -> [min_level_db, 0]
        if self.clip_norm:
            mel_denorm = np.clip(mel_denorm, 0, self.max_norm)

        mel_denorm = (
            mel_denorm * -self.min_level_db / self.max_norm
        ) + self.min_level_db

    # Re-apply the reference level that normalize() removed
    return mel_denorm + self.ref_level_db
upsample_kernel_sizes: typing.Tuple[int, ...] = (16, 16, 4, 4) n_layers_q: int = 3 use_spectral_norm: bool = False - gin_channels: int = 256 + gin_channels: int = 0 # single speaker use_sdp: bool = True # StochasticDurationPredictor @property @@ -100,7 +132,7 @@ class PhonemesConfig(DataClassJsonMixin): word_separator: str = "#" """Separator between word phonemes in CSV input (must not match phoneme_separator)""" - phoneme_to_id: typing.Optional[typing.Mapping[str, int]] = None + phoneme_to_id: typing.Optional[typing.Dict[str, int]] = None pad: typing.Optional[str] = "_" bos: typing.Optional[str] = None eos: typing.Optional[str] = None @@ -110,15 +142,18 @@ class PhonemesConfig(DataClassJsonMixin): blank_at_start: bool = True blank_at_end: bool = True simple_punctuation: bool = True - punctuation_map: typing.Optional[typing.Mapping[str, str]] = None + punctuation_map: typing.Optional[typing.Dict[str, str]] = None separate: typing.Optional[typing.List[str]] = None separate_graphemes: bool = False separate_tones: bool = False tone_before: bool = False - phoneme_map: typing.Optional[typing.Mapping[str, str]] = None + phoneme_map: typing.Optional[typing.Dict[str, str]] = None auto_bos_eos: bool = False minor_break: typing.Optional[str] = IPA.BREAK_MINOR.value major_break: typing.Optional[str] = IPA.BREAK_MAJOR.value + break_phonemes_into_graphemes: bool = False + drop_stress: bool = False + symbols: typing.Optional[typing.List[str]] = None def split_word_phonemes(self, phonemes_str: str) -> typing.List[typing.List[str]]: """Split phonemes string into a list of lists (outer is words, inner is individual phonemes in each word)""" @@ -158,8 +193,7 @@ class MetadataFormat(str, Enum): @dataclass class DatasetConfig: name: str - metadata_path: typing.Optional[typing.Union[str, Path]] = None - train_path: typing.Optional[typing.Union[str, Path]] = None + metadata_format: MetadataFormat = MetadataFormat.TEXT multispeaker: bool = False text_language: typing.Optional[str] = None 
audio_dir: typing.Optional[typing.Union[str, Path]] = None @@ -183,6 +217,13 @@ class AlignerConfig: casing: typing.Optional[TextCasing] = None +@dataclass +class InferenceConfig: + length_scale: float = 1.0 + noise_scale: float = 0.667 + noise_w: float = 0.8 + + @dataclass class TrainingConfig(DataClassJsonMixin): seed: int = 1234 @@ -206,6 +247,8 @@ class TrainingConfig(DataClassJsonMixin): min_spec_length: typing.Optional[int] = None max_spec_length: typing.Optional[int] = None + min_speaker_utterances: typing.Optional[int] = None + last_epoch: int = 1 global_step: int = 1 best_loss: typing.Optional[float] = None @@ -216,22 +259,31 @@ class TrainingConfig(DataClassJsonMixin): text_language: typing.Optional[str] = None phonemizer: typing.Optional[Phonemizer] = None datasets: typing.List[DatasetConfig] = field(default_factory=list) - dataset_format: MetadataFormat = MetadataFormat.TEXT + inference: InferenceConfig = field(default_factory=InferenceConfig) version: int = 1 git_commit: str = "" @property def is_multispeaker(self): - return ( - self.model.is_multispeaker - or any(d.multispeaker for d in self.datasets) - ) + return self.model.is_multispeaker or any(d.multispeaker for d in self.datasets) def save(self, config_file: typing.TextIO): """Save config as JSON to a file""" json.dump(self.to_dict(), config_file, indent=4) + def get_speaker_id(self, dataset_name: str, speaker_name: str) -> int: + if self.speaker_id_map is None: + self.speaker_id_map = {} + + full_speaker_name = f"{dataset_name}_{speaker_name}" + speaker_id = self.speaker_id_map.get(full_speaker_name) + if speaker_id is None: + speaker_id = len(self.speaker_id_map) + self.speaker_id_map[full_speaker_name] = speaker_id + + return speaker_id + @staticmethod def load(config_file: typing.TextIO) -> "TrainingConfig": """Load config from a JSON file""" diff --git a/mimic3-tts/mimic3_tts/tts.py b/mimic3-tts/mimic3_tts/tts.py index 75a18a0..7fb3009 100644 --- a/mimic3-tts/mimic3_tts/tts.py +++ 
b/mimic3-tts/mimic3_tts/tts.py @@ -1,11 +1,9 @@ #!/usr/bin/env python3 -import dataclasses import logging import time import typing -from abc import ABCMeta -from dataclasses import dataclass, field from copy import deepcopy +from dataclasses import dataclass, field from pathlib import Path from xml.sax.saxutils import escape as xmlescape @@ -14,22 +12,22 @@ import numpy as np import onnxruntime import phonemes2ids from gruut.const import LookupPhonemes, WordRole -from gruut_ipa import guess_phonemes, IPA, Phonemes, Phoneme - +from gruut_ipa import IPA, Phoneme, guess_phonemes from opentts_abc import ( - TextToSpeechSystem, - Voice, - BaseToken, - BaseResult, - MarkResult, AudioResult, - Word, + BaseResult, + BaseToken, + MarkResult, Phonemes, SayAs, + TextToSpeechSystem, + Voice, + Word, ) from mimic3_tts.config import TrainingConfig from mimic3_tts.utils import audio_float_to_int16 +from mimic3_tts.voice import Mimic3Voice _DIR = Path(__file__).parent @@ -51,20 +49,12 @@ class Mimic3Settings: voices_directories: typing.Optional[typing.Iterable[typing.Union[str, Path]]] = None speaker_id: typing.Optional[int] = None length_scale: float = 1.0 - noise_scale: float = 0.333 - noise_w: float = 1.0 + noise_scale: float = 0.667 + noise_w: float = 0.8 text_language: typing.Optional[str] = None sample_rate: int = 22050 -@dataclass -class LoadedVoice: - config: TrainingConfig - onnx_model: onnxruntime.InferenceSession - phoneme_to_id: typing.Mapping[str, int] - phoneme_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None - - @dataclass class Mimic3Phonemes: current_settings: Mimic3Settings @@ -80,12 +70,9 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem): def __init__(self, settings: Mimic3Settings): self.settings = settings - # self._current_voice: typing.Optional[LoadedVoice] = None - # self._current_settings = self.settings - self._results: typing.List[typing.Union[BaseResult, Mimic3Phonemes]] = [] - self.loaded_voices: typing.Dict[str, LoadedVoice] = {} + 
self.loaded_voices: typing.Dict[str, Mimic3Voice] = {} @property def voice(self) -> str: @@ -107,10 +94,6 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem): # TODO: Use speaker map self.speaker_id = int(speaker_id_str) - # self._current_voice = self._get_or_load_voice( - # self.settings.voice or DEFAULT_VOICE - # ) - @property def speaker_id(self) -> typing.Optional[int]: return self.settings.speaker_id @@ -131,27 +114,6 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem): def get_default_voices_directories() -> typing.List[Path]: return [_DIR.parent.parent / "voices"] - # @property - # def text_lang(self) -> str: - # return ( - # self.settings.text_language - # or self.settings.language - # or ( - # self._current_voice.config.text_language - # if self._current_voice - # else None - # ) - # or "en_US" - # ) - - # @property - # def sample_rate(self) -> int: - # return ( - # self._current_voice.config.audio.sample_rate - # if self._current_voice - # else self.settings.sample_rate - # ) - def get_voices(self) -> typing.Iterable[Voice]: voices_dirs = ( self.settings.voices_directories @@ -185,146 +147,71 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem): def begin_utterance(self): self._results.clear() - # self._current_settings = deepcopy(self.settings) def speak_text(self, text: str, text_language: typing.Optional[str] = None): - text_language = text_language or self.language - for sentence in gruut.sentences(text, lang=text_language): - sent_phonemes = [w.phonemes for w in sentence if w.phonemes] + voice = self._get_or_load_voice(self.voice) + + for sent_phonemes in voice.text_to_phonemes(text, text_language=text_language): self._results.append( Mimic3Phonemes( - current_settings=deepcopy(self.settings), - phonemes=sent_phonemes, + current_settings=deepcopy(self.settings), phonemes=sent_phonemes, ) ) def _speak_sentence_phonemes( - self, - sent_phonemes, - text: typing.Optional[str] = None, - settings: typing.Optional[Mimic3Settings] = None, + self, 
sent_phonemes, settings: typing.Optional[Mimic3Settings] = None, ) -> AudioResult: settings = settings or self.settings - current_voice = self._get_or_load_voice(settings.voice or DEFAULT_VOICE) + voice = self._get_or_load_voice(settings.voice or self.voice) + sent_phoneme_ids = voice.phonemes_to_ids(sent_phonemes) - config = current_voice.config - onnx_model = current_voice.onnx_model - phoneme_to_id = current_voice.phoneme_to_id - phoneme_map = current_voice.phoneme_map or config.phonemes.phoneme_map + _LOGGER.debug("phonemes=%s, ids=%s", sent_phonemes, sent_phoneme_ids) - sent_phoneme_ids = phonemes2ids.phonemes2ids( - word_phonemes=sent_phonemes, - phoneme_to_id=phoneme_to_id, - pad=config.phonemes.pad, - bos=config.phonemes.bos, - eos=config.phonemes.eos, - auto_bos_eos=config.phonemes.auto_bos_eos, - blank=config.phonemes.blank, - blank_word=config.phonemes.blank_word, - blank_between=config.phonemes.blank_between, - blank_at_start=config.phonemes.blank_at_start, - blank_at_end=config.phonemes.blank_at_end, - simple_punctuation=config.phonemes.simple_punctuation, - punctuation_map=config.phonemes.punctuation_map, - separate=config.phonemes.separate, - separate_graphemes=config.phonemes.separate_graphemes, - separate_tones=config.phonemes.separate_tones, - tone_before=config.phonemes.tone_before, - phoneme_map=phoneme_map, - fail_on_missing=False, + audio = voice.ids_to_audio( + sent_phoneme_ids, + speaker=self.speaker_id, + length_scale=settings.length_scale, + noise_scale=settings.noise_scale, + noise_w=settings.noise_w, ) - if text: - _LOGGER.debug("%s %s %s", text, sent_phonemes, sent_phoneme_ids) - else: - _LOGGER.debug("%s %s", sent_phonemes, sent_phoneme_ids) - - # Create model inputs - text_array = np.expand_dims(np.array(sent_phoneme_ids, dtype=np.int64), 0) - text_lengths_array = np.array([text_array.shape[1]], dtype=np.int64) - scales_array = np.array( - [ - settings.noise_scale, - settings.length_scale, - settings.noise_w, - ], - dtype=np.float32, 
- ) - - inputs = { - "input": text_array, - "input_lengths": text_lengths_array, - "scales": scales_array, - } - - if config.is_multispeaker: - speaker_id = settings.speaker_id if settings.speaker_id is not None else 0 - speaker_id_array = np.array([speaker_id], dtype=np.int64) - inputs["sid"] = speaker_id_array - - # Infer audio from phonemes - start_time = time.perf_counter() - audio = onnx_model.run(None, inputs)[0].squeeze() - audio = audio_float_to_int16(audio) - end_time = time.perf_counter() - - # Compute real-time factor - audio_duration_sec = audio.shape[-1] / config.audio.sample_rate - infer_sec = end_time - start_time - real_time_factor = ( - infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0 - ) - - _LOGGER.debug("RTF: %s", real_time_factor) - audio_bytes = audio.tobytes() return AudioResult( - sample_rate_hz=config.audio.sample_rate, + sample_rate_hz=voice.config.audio.sample_rate, audio_bytes=audio_bytes, # 16-bit mono sample_width_bytes=2, num_channels=1, ) - def speak_tokens(self, tokens: typing.Iterable[BaseToken]): + def speak_tokens( + self, + tokens: typing.Iterable[BaseToken], + text_language: typing.Optional[str] = None, + ): + voice = self._get_or_load_voice(self.voice) token_phonemes: PHONEMES_LIST = [] for token in tokens: if isinstance(token, Word): - word_role = xmlescape(token.role) if token.role else "" - word_text = xmlescape(token.text) - - sentence = next( - iter( - gruut.sentences( - f'{word_text}', ssml=True - ) - ) + word_phonemes = voice.word_to_phonemes( + token.text, word_role=token.role, text_language=text_language ) - token_phonemes.extend(w.phonemes for w in sentence if w.phonemes) + token_phonemes.append(word_phonemes) elif isinstance(token, Phonemes): phoneme_str = token.text.strip() if " " in phoneme_str: token_phonemes.append(phoneme_str.split()) else: - token_phonemes.append(list(phoneme_str)) + token_phonemes.append(list(IPA.graphemes(phoneme_str))) elif isinstance(token, SayAs): - word_text = 
xmlescape(token.text) - interpret_as = xmlescape(token.interpret_as) - format_attr = ( - f'format="{xmlescape(token.format)}"' if token.format else "" + say_as_phonemes = voice.say_as_to_phonemes( + token.text, + interpret_as=token.interpret_as, + say_format=token.format, + text_language=text_language, ) - - sentence = next( - iter( - gruut.sentences( - f'{word_text}', - ssml=True, - ) - ) - ) - - token_phonemes.extend(w.phonemes for w in sentence if w.phonemes) + token_phonemes.extend(say_as_phonemes) if token_phonemes: self._results.append( @@ -379,7 +266,7 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem): if sent_phonemes: yield self._speak_sentence_phonemes(sent_phonemes) - def _get_or_load_voice(self, voice_key: str) -> LoadedVoice: + def _get_or_load_voice(self, voice_key: str) -> Mimic3Voice: existing_voice = self.loaded_voices.get(voice_key) if existing_voice is not None: return existing_voice @@ -399,57 +286,7 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem): return existing_voice - _LOGGER.debug("Loading voice from %s", model_dir) - - config_path = model_dir / "config.json" - _LOGGER.debug("Loading model config from %s", config_path) - - with open(config_path, "r", encoding="utf-8") as config_file: - config = TrainingConfig.load(config_file) - - # phoneme -> id - phoneme_ids_path = model_dir / "phonemes.txt" - _LOGGER.debug("Loading model phonemes from %s", phoneme_ids_path) - with open(phoneme_ids_path, "r", encoding="utf-8") as ids_file: - phoneme_to_id = phonemes2ids.load_phoneme_ids(ids_file) - - generator_path = model_dir / "generator.onnx" - _LOGGER.debug("Loading model from %s", generator_path) - - sess_options = onnxruntime.SessionOptions() - # sess_options.enable_cpu_mem_arena = False - # sess_options.enable_mem_pattern = False - # sess_options.enable_mem_reuse = False - - onnx_model = onnxruntime.InferenceSession( - str(generator_path), sess_options=sess_options - ) - - voice = LoadedVoice( - config=config, onnx_model=onnx_model, 
phoneme_to_id=phoneme_to_id - ) - - # valid_phonemes = [] - # for phoneme_str in self._phoneme_to_id: - # maybe_phoneme = Phoneme(phoneme_str) - # if any( - # [ - # maybe_phoneme.vowel, - # maybe_phoneme.consonant, - # maybe_phoneme.dipthong, - # maybe_phoneme.schwa, - # ] - # ): - # valid_phonemes.append(maybe_phoneme) - - # self._voice_phonemes = Phonemes(phonemes=valid_phonemes) - - # phoneme -> phoneme, phoneme, ... - phoneme_map_path = model_dir / "phoneme_map.txt" - if phoneme_map_path.is_file(): - _LOGGER.debug("Loading phoneme map from %s", phoneme_map_path) - with open(phoneme_map_path, "r", encoding="utf-8") as map_file: - voice.phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file) + voice = Mimic3Voice.load_from_directory(model_dir) _LOGGER.info("Loaded voice from %s", model_dir) diff --git a/mimic3-tts/mimic3_tts/voice.py b/mimic3-tts/mimic3_tts/voice.py new file mode 100644 index 0000000..140e6ce --- /dev/null +++ b/mimic3-tts/mimic3_tts/voice.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +import itertools +import logging +import time +import typing +from abc import ABCMeta, abstractmethod +from pathlib import Path +from xml.sax.saxutils import escape as xmlescape + +import espeak_phonemizer +import gruut +import numpy as np +import onnxruntime +import phonemes2ids +from gruut_ipa import IPA + +from mimic3_tts.config import Phonemizer, TrainingConfig +from mimic3_tts.utils import audio_float_to_int16 + +PHONEME_TYPE = str +PHONEME_ID_TYPE = int +WORD_PHONEMES_TYPE = typing.List[typing.List[PHONEME_TYPE]] +PHONEME_MAP_TYPE = typing.Dict[PHONEME_TYPE, typing.List[PHONEME_TYPE]] + +SPEAKER_NAME_TYPE = str +SPEAKER_ID_TYPE = int +SPEAKER_MAP_TYPE = typing.Dict[SPEAKER_NAME_TYPE, SPEAKER_ID_TYPE] + +DEFAULT_LANGUAGE = "en_US" + +_LOGGER = logging.getLogger(__name__) + +# ----------------------------------------------------------------------------- + + +class Mimic3Voice(metaclass=ABCMeta): + def __init__( + self, + config: TrainingConfig, + onnx_model: 
onnxruntime.InferenceSession, + phoneme_to_id: typing.Dict[PHONEME_TYPE, int], + phoneme_map: typing.Optional[PHONEME_MAP_TYPE] = None, + speaker_map: typing.Optional[SPEAKER_MAP_TYPE] = None, + ): + self.config = config + self.onnx_model = onnx_model + self.phoneme_to_id = phoneme_to_id + self.phoneme_map = phoneme_map + self.speaker_map = speaker_map + + @abstractmethod + def text_to_phonemes( + self, text: str, text_language: typing.Optional[str] = None + ) -> typing.Iterable[WORD_PHONEMES_TYPE]: + pass + + def word_to_phonemes( + self, + word_text: str, + word_role: typing.Optional[str] = None, + text_language: typing.Optional[str] = None, + ) -> typing.List[PHONEME_TYPE]: + word_phonemes = [] + for sent_phonemes in self.text_to_phonemes( + word_text, text_language=text_language + ): + for sent_word_phonemes in sent_phonemes: + word_phonemes.extend(sent_word_phonemes) + + return word_phonemes + + def say_as_to_phonemes( + self, + text: str, + interpret_as: str, + say_format: typing.Optional[str] = None, + text_language: typing.Optional[str] = None, + ) -> WORD_PHONEMES_TYPE: + word_phonemes = [] + for sent_phonemes in self.text_to_phonemes(text, text_language=text_language): + word_phonemes.extend(sent_phonemes) + + return word_phonemes + + def phonemes_to_ids( + self, phonemes: WORD_PHONEMES_TYPE + ) -> typing.Sequence[PHONEME_ID_TYPE]: + phoneme_map = self.phoneme_map or self.config.phonemes.phoneme_map + + return phonemes2ids.phonemes2ids( + word_phonemes=phonemes, + phoneme_to_id=self.phoneme_to_id, + pad=self.config.phonemes.pad, + bos=self.config.phonemes.bos, + eos=self.config.phonemes.eos, + auto_bos_eos=self.config.phonemes.auto_bos_eos, + blank=self.config.phonemes.blank, + blank_word=self.config.phonemes.blank_word, + blank_between=self.config.phonemes.blank_between, + blank_at_start=self.config.phonemes.blank_at_start, + blank_at_end=self.config.phonemes.blank_at_end, + simple_punctuation=self.config.phonemes.simple_punctuation, + 
punctuation_map=self.config.phonemes.punctuation_map, + separate=self.config.phonemes.separate, + separate_graphemes=self.config.phonemes.separate_graphemes, + separate_tones=self.config.phonemes.separate_tones, + tone_before=self.config.phonemes.tone_before, + phoneme_map=phoneme_map, + fail_on_missing=False, + ) + + def ids_to_audio( + self, + phoneme_ids: typing.Sequence[PHONEME_ID_TYPE], + speaker: typing.Optional[ + typing.Union[SPEAKER_NAME_TYPE, SPEAKER_ID_TYPE] + ] = None, + length_scale: float = 1.0, + noise_scale: float = 0.333, + noise_w: float = 1.0, + ) -> np.ndarray: + # Create model inputs + text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0) + text_lengths_array = np.array([text_array.shape[1]], dtype=np.int64) + scales_array = np.array( + [noise_scale, length_scale, noise_w,], dtype=np.float32, + ) + + # TODO: Use settings from voice config + inputs = { + "input": text_array, + "input_lengths": text_lengths_array, + "scales": scales_array, + } + + if self.config.is_multispeaker: + speaker_id = 0 + if isinstance(speaker, SPEAKER_NAME_TYPE): + if self.speaker_map: + speaker_id = self.speaker_map.get(speaker, speaker_id) + elif speaker is not None: + speaker_id = speaker + + speaker_id_array = np.array([speaker_id], dtype=np.int64) + inputs["sid"] = speaker_id_array + + # Infer audio from phonemes + start_time = time.perf_counter() + audio = self.onnx_model.run(None, inputs)[0].squeeze() + audio = audio_float_to_int16(audio) + end_time = time.perf_counter() + + # Compute real-time factor + audio_duration_sec = audio.shape[-1] / self.config.audio.sample_rate + infer_sec = end_time - start_time + real_time_factor = ( + infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0 + ) + + _LOGGER.debug("RTF: %s", real_time_factor) + + return audio + + @staticmethod + def load_from_directory( + voice_dir: typing.Union[str, Path], + session_options: typing.Optional[onnxruntime.SessionOptions] = None, + ) -> "Mimic3Voice": + voice_dir 
= Path(voice_dir) + _LOGGER.debug("Loading voice from %s", voice_dir) + + config_path = voice_dir / "config.json" + _LOGGER.debug("Loading config from %s", config_path) + + with open(config_path, "r", encoding="utf-8") as config_file: + config = TrainingConfig.load(config_file) + + # phoneme -> id + phoneme_ids_path = voice_dir / "phonemes.txt" + _LOGGER.debug("Loading model phonemes from %s", phoneme_ids_path) + with open(phoneme_ids_path, "r", encoding="utf-8") as ids_file: + phoneme_to_id = phonemes2ids.load_phoneme_ids(ids_file) + + generator_path = voice_dir / "generator.onnx" + _LOGGER.debug("Loading model from %s", generator_path) + + # Load onnx model + session_options = session_options or onnxruntime.SessionOptions() + onnx_model = onnxruntime.InferenceSession( + str(generator_path), sess_options=session_options + ) + + # phoneme -> phoneme, phoneme, ... + phoneme_map: typing.Optional[PHONEME_MAP_TYPE] = None + phoneme_map_path = voice_dir / "phoneme_map.txt" + if phoneme_map_path.is_file(): + _LOGGER.debug("Loading phoneme map from %s", phoneme_map_path) + with open(phoneme_map_path, "r", encoding="utf-8") as map_file: + phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file) + + # TODO: Load speaker map + + if config.phonemizer == Phonemizer.GRUUT: + return GruutVoice( + config=config, + onnx_model=onnx_model, + phoneme_to_id=phoneme_to_id, + phoneme_map=phoneme_map, + ) + + if config.phonemizer == Phonemizer.ESPEAK: + return EspeakVoice( + config=config, + onnx_model=onnx_model, + phoneme_to_id=phoneme_to_id, + phoneme_map=phoneme_map, + ) + if config.phonemizer == Phonemizer.SYMBOLS: + return SymbolsVoice( + config=config, + onnx_model=onnx_model, + phoneme_to_id=phoneme_to_id, + phoneme_map=phoneme_map, + ) + + raise ValueError(f"Unsupported phonemizer: {config.phonemizer}") + + +# ----------------------------------------------------------------------------- + + +class GruutVoice(Mimic3Voice): + def text_to_phonemes( + self, text: str, 
text_language: typing.Optional[str] = None + ) -> typing.Iterable[WORD_PHONEMES_TYPE]: + text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE + for sentence in gruut.sentences(text, lang=text_language): + sent_phonemes = [w.phonemes for w in sentence if w.phonemes] + if sent_phonemes: + yield sent_phonemes + + def word_to_phonemes( + self, + word_text: str, + word_role: typing.Optional[str] = None, + text_language: typing.Optional[str] = None, + ) -> typing.List[PHONEME_TYPE]: + text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE + + word_role = xmlescape(word_role) if word_role else "" + word_text = xmlescape(word_text) + + sentence = next( + iter( + gruut.sentences( + f'{word_text}', + ssml=True, + lang=text_language, + ) + ) + ) + + sentence_word = next(iter(sentence)) + + return sentence_word.phonemes + + def say_as_to_phonemes( + self, + text: str, + interpret_as: str, + say_format: typing.Optional[str] = None, + text_language: typing.Optional[str] = None, + ) -> WORD_PHONEMES_TYPE: + text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE + + word_text = xmlescape(text) + interpret_as = xmlescape(interpret_as) + format_attr = f'format="{xmlescape(say_format)}"' if say_format else "" + + sentences = gruut.sentences( + f'{word_text}', + ssml=True, + lang=text_language, + ) + + sent_phonemes: WORD_PHONEMES_TYPE = [] + + for sentence in sentences: + sent_phonemes.extend(w.phonemes for w in sentence if w.phonemes) + + return sent_phonemes + + +# ----------------------------------------------------------------------------- + + +class EspeakVoice(Mimic3Voice): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._phonemizer = espeak_phonemizer.Phonemizer() + + def text_to_phonemes( + self, text: str, text_language: typing.Optional[str] = None + ) -> typing.Iterable[WORD_PHONEMES_TYPE]: + phoneme_separator = "" + word_separator = 
self.config.phonemes.word_separator + + text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE + + voice = self._language_to_voice(text_language) + + phoneme_str = self._phonemizer.phonemize( + text, + voice=voice, + keep_clause_breakers=True, + phoneme_separator=phoneme_separator, + word_separator=word_separator, + punctuation_separator=phoneme_separator, + ) + + word_phonemes = [ + list(IPA.graphemes(wp_str)) for wp_str in phoneme_str.split(word_separator) + ] + + yield word_phonemes + + def word_to_phonemes( + self, + word_text: str, + word_role: typing.Optional[str] = None, + text_language: typing.Optional[str] = None, + ) -> typing.List[PHONEME_TYPE]: + phoneme_separator = "" + text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE + + word_role = xmlescape(word_role) if word_role else "" + word_text = xmlescape(word_text) + + voice = self._language_to_voice(text_language) + + phoneme_str = self._phonemizer.phonemize( + f'{word_text}', + voice=voice, + keep_clause_breakers=True, + phoneme_separator=phoneme_separator, + punctuation_separator=phoneme_separator, + ssml=True, + ) + + word_phonemes = list(IPA.graphemes(phoneme_str)) + + return word_phonemes + + def say_as_to_phonemes( + self, + text: str, + interpret_as: str, + say_format: typing.Optional[str] = None, + text_language: typing.Optional[str] = None, + ) -> WORD_PHONEMES_TYPE: + phoneme_separator = "" + word_separator = self.config.phonemes.word_separator + text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE + + word_text = xmlescape(text) + interpret_as = xmlescape(interpret_as) + format_attr = f'format="{xmlescape(say_format)}"' if say_format else "" + + voice = self._language_to_voice(text_language) + + phoneme_str = self._phonemizer.phonemize( + f'{word_text}', + voice=voice, + keep_clause_breakers=True, + phoneme_separator=phoneme_separator, + punctuation_separator=phoneme_separator, + word_separator=word_separator, + 
ssml=True, + ) + + word_phonemes = [ + list(IPA.graphemes(wp_str)) for wp_str in phoneme_str.split(word_separator) + ] + + return word_phonemes + + def _language_to_voice(self, language: str) -> str: + # en_US -> en-us + return language.strip().lower().replace("_", "-") + + +# ----------------------------------------------------------------------------- + + +class SymbolsVoice(Mimic3Voice): + def text_to_phonemes( + self, text: str, text_language: typing.Optional[str] = None + ) -> typing.Iterable[WORD_PHONEMES_TYPE]: + word_separator = self.config.phonemes.word_separator + word_phonemes = [ + list(IPA.graphemes(wp_str)) for wp_str in text.split(word_separator) + ] + yield word_phonemes diff --git a/mimic3-tts/mypy.ini b/mimic3-tts/mypy.ini index 7d81e3f..916fdd7 100644 --- a/mimic3-tts/mypy.ini +++ b/mimic3-tts/mypy.ini @@ -2,3 +2,6 @@ [mypy-setuptools.*] ignore_missing_imports = True + +[mypy-onnxruntime.*] +ignore_missing_imports = True