diff --git a/mimic3-tts/mimic3_tts/__init__.py b/mimic3-tts/mimic3_tts/__init__.py
index 012fd97..087a94a 100644
--- a/mimic3-tts/mimic3_tts/__init__.py
+++ b/mimic3-tts/mimic3_tts/__init__.py
@@ -1,2 +1,3 @@
-from .tts import Mimic3TextToSpeechSystem, Mimic3Settings
from opentts_abc import AudioResult, MarkResult
+
+from .tts import Mimic3Settings, Mimic3TextToSpeechSystem
diff --git a/mimic3-tts/mimic3_tts/__main__.py b/mimic3-tts/mimic3_tts/__main__.py
index 1309492..72ed425 100644
--- a/mimic3-tts/mimic3_tts/__main__.py
+++ b/mimic3-tts/mimic3_tts/__main__.py
@@ -5,13 +5,23 @@ import wave
logging.basicConfig(level=logging.DEBUG)
from opentts_abc.ssml import SSMLSpeaker
-from mimic3_tts.tts import Mimic3TextToSpeechSystem, Mimic3Settings, AudioResult, MarkResult
-settings = Mimic3Settings(length_scale=1.2, noise_w=0)
+from mimic3_tts.tts import (
+ AudioResult,
+ MarkResult,
+ Mimic3Settings,
+ Mimic3TextToSpeechSystem,
+)
+
+settings = Mimic3Settings()
tts = Mimic3TextToSpeechSystem(settings)
speaker = SSMLSpeaker(tts)
-ssml = 'This is a test.'
+# ssml = 'Τοαερόστρωμνόμουείναιγεμάτοχέλια.'
+# ssml = 'бажав'
+# ssml = 'HelloWorld'
+# ssml = 'Hello world'
+ssml = '12'
wav_file: wave.Wave_write = wave.open("out.wav", "wb")
params_set = False
diff --git a/mimic3-tts/mimic3_tts/config.py b/mimic3-tts/mimic3_tts/config.py
index d9f4f26..87edf5a 100644
--- a/mimic3-tts/mimic3_tts/config.py
+++ b/mimic3-tts/mimic3_tts/config.py
@@ -1,18 +1,4 @@
"""Configuration classes"""
-# Copyright 2021 Mycroft AI Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
import collections
import json
import typing
@@ -20,6 +6,7 @@ from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
+import numpy as np
from dataclasses_json import DataClassJsonMixin
from gruut_ipa import IPA
from phonemes2ids import BlankBetween
@@ -59,6 +46,51 @@ class AudioConfig(DataClassJsonMixin):
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
+ # -------------------------------------------------------------------------
+ # Normalization
+ # -------------------------------------------------------------------------
+
+ def normalize(self, mel_db: np.ndarray) -> np.ndarray:
+ """Put values in [0, max_norm] or [-max_norm, max_norm]"""
+ mel_norm = ((mel_db - self.ref_level_db) - self.min_level_db) / (
+ -self.min_level_db
+ )
+ if self.symmetric_norm:
+ # Symmetric norm
+ mel_norm = ((2 * self.max_norm) * mel_norm) - self.max_norm
+ if self.clip_norm:
+ mel_norm = np.clip(mel_norm, -self.max_norm, self.max_norm)
+ else:
+ # Asymmetric norm
+ mel_norm = self.max_norm * mel_norm
+ if self.clip_norm:
+ mel_norm = np.clip(mel_norm, 0, self.max_norm)
+
+ return mel_norm
+
+ def denormalize(self, mel_db: np.ndarray) -> np.ndarray:
+ """Pull values out of [0, max_norm] or [-max_norm, max_norm]"""
+ if self.symmetric_norm:
+ # Symmetric norm
+            mel_denorm = mel_db
+            if self.clip_norm:
+                mel_denorm = np.clip(mel_denorm, -self.max_norm, self.max_norm)
+            mel_denorm = (
+                (mel_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)
+            ) + self.min_level_db
+ else:
+ # Asymmetric norm
+            mel_denorm = mel_db
+            if self.clip_norm:
+                mel_denorm = np.clip(mel_denorm, 0, self.max_norm)
+            mel_denorm = (
+                mel_denorm * -self.min_level_db / self.max_norm
+            ) + self.min_level_db
+
+ mel_denorm += self.ref_level_db
+
+ return mel_denorm
+
@dataclass
class ModelConfig(DataClassJsonMixin):
@@ -84,7 +116,7 @@ class ModelConfig(DataClassJsonMixin):
upsample_kernel_sizes: typing.Tuple[int, ...] = (16, 16, 4, 4)
n_layers_q: int = 3
use_spectral_norm: bool = False
- gin_channels: int = 256
+ gin_channels: int = 0 # single speaker
use_sdp: bool = True # StochasticDurationPredictor
@property
@@ -100,7 +132,7 @@ class PhonemesConfig(DataClassJsonMixin):
word_separator: str = "#"
"""Separator between word phonemes in CSV input (must not match phoneme_separator)"""
- phoneme_to_id: typing.Optional[typing.Mapping[str, int]] = None
+ phoneme_to_id: typing.Optional[typing.Dict[str, int]] = None
pad: typing.Optional[str] = "_"
bos: typing.Optional[str] = None
eos: typing.Optional[str] = None
@@ -110,15 +142,18 @@ class PhonemesConfig(DataClassJsonMixin):
blank_at_start: bool = True
blank_at_end: bool = True
simple_punctuation: bool = True
- punctuation_map: typing.Optional[typing.Mapping[str, str]] = None
+ punctuation_map: typing.Optional[typing.Dict[str, str]] = None
separate: typing.Optional[typing.List[str]] = None
separate_graphemes: bool = False
separate_tones: bool = False
tone_before: bool = False
- phoneme_map: typing.Optional[typing.Mapping[str, str]] = None
+ phoneme_map: typing.Optional[typing.Dict[str, str]] = None
auto_bos_eos: bool = False
minor_break: typing.Optional[str] = IPA.BREAK_MINOR.value
major_break: typing.Optional[str] = IPA.BREAK_MAJOR.value
+ break_phonemes_into_graphemes: bool = False
+ drop_stress: bool = False
+ symbols: typing.Optional[typing.List[str]] = None
def split_word_phonemes(self, phonemes_str: str) -> typing.List[typing.List[str]]:
"""Split phonemes string into a list of lists (outer is words, inner is individual phonemes in each word)"""
@@ -158,8 +193,7 @@ class MetadataFormat(str, Enum):
@dataclass
class DatasetConfig:
name: str
- metadata_path: typing.Optional[typing.Union[str, Path]] = None
- train_path: typing.Optional[typing.Union[str, Path]] = None
+ metadata_format: MetadataFormat = MetadataFormat.TEXT
multispeaker: bool = False
text_language: typing.Optional[str] = None
audio_dir: typing.Optional[typing.Union[str, Path]] = None
@@ -183,6 +217,13 @@ class AlignerConfig:
casing: typing.Optional[TextCasing] = None
+@dataclass
+class InferenceConfig:
+ length_scale: float = 1.0
+ noise_scale: float = 0.667
+ noise_w: float = 0.8
+
+
@dataclass
class TrainingConfig(DataClassJsonMixin):
seed: int = 1234
@@ -206,6 +247,8 @@ class TrainingConfig(DataClassJsonMixin):
min_spec_length: typing.Optional[int] = None
max_spec_length: typing.Optional[int] = None
+ min_speaker_utterances: typing.Optional[int] = None
+
last_epoch: int = 1
global_step: int = 1
best_loss: typing.Optional[float] = None
@@ -216,22 +259,31 @@ class TrainingConfig(DataClassJsonMixin):
text_language: typing.Optional[str] = None
phonemizer: typing.Optional[Phonemizer] = None
datasets: typing.List[DatasetConfig] = field(default_factory=list)
- dataset_format: MetadataFormat = MetadataFormat.TEXT
+ inference: InferenceConfig = field(default_factory=InferenceConfig)
version: int = 1
git_commit: str = ""
@property
def is_multispeaker(self):
- return (
- self.model.is_multispeaker
- or any(d.multispeaker for d in self.datasets)
- )
+ return self.model.is_multispeaker or any(d.multispeaker for d in self.datasets)
def save(self, config_file: typing.TextIO):
"""Save config as JSON to a file"""
json.dump(self.to_dict(), config_file, indent=4)
+ def get_speaker_id(self, dataset_name: str, speaker_name: str) -> int:
+ if self.speaker_id_map is None:
+ self.speaker_id_map = {}
+
+ full_speaker_name = f"{dataset_name}_{speaker_name}"
+ speaker_id = self.speaker_id_map.get(full_speaker_name)
+ if speaker_id is None:
+ speaker_id = len(self.speaker_id_map)
+ self.speaker_id_map[full_speaker_name] = speaker_id
+
+ return speaker_id
+
@staticmethod
def load(config_file: typing.TextIO) -> "TrainingConfig":
"""Load config from a JSON file"""
diff --git a/mimic3-tts/mimic3_tts/tts.py b/mimic3-tts/mimic3_tts/tts.py
index 75a18a0..7fb3009 100644
--- a/mimic3-tts/mimic3_tts/tts.py
+++ b/mimic3-tts/mimic3_tts/tts.py
@@ -1,11 +1,9 @@
#!/usr/bin/env python3
-import dataclasses
import logging
import time
import typing
-from abc import ABCMeta
-from dataclasses import dataclass, field
from copy import deepcopy
+from dataclasses import dataclass, field
from pathlib import Path
from xml.sax.saxutils import escape as xmlescape
@@ -14,22 +12,22 @@ import numpy as np
import onnxruntime
import phonemes2ids
from gruut.const import LookupPhonemes, WordRole
-from gruut_ipa import guess_phonemes, IPA, Phonemes, Phoneme
-
+from gruut_ipa import IPA, Phoneme, guess_phonemes
from opentts_abc import (
- TextToSpeechSystem,
- Voice,
- BaseToken,
- BaseResult,
- MarkResult,
AudioResult,
- Word,
+ BaseResult,
+ BaseToken,
+ MarkResult,
Phonemes,
SayAs,
+ TextToSpeechSystem,
+ Voice,
+ Word,
)
from mimic3_tts.config import TrainingConfig
from mimic3_tts.utils import audio_float_to_int16
+from mimic3_tts.voice import Mimic3Voice
_DIR = Path(__file__).parent
@@ -51,20 +49,12 @@ class Mimic3Settings:
voices_directories: typing.Optional[typing.Iterable[typing.Union[str, Path]]] = None
speaker_id: typing.Optional[int] = None
length_scale: float = 1.0
- noise_scale: float = 0.333
- noise_w: float = 1.0
+ noise_scale: float = 0.667
+ noise_w: float = 0.8
text_language: typing.Optional[str] = None
sample_rate: int = 22050
-@dataclass
-class LoadedVoice:
- config: TrainingConfig
- onnx_model: onnxruntime.InferenceSession
- phoneme_to_id: typing.Mapping[str, int]
- phoneme_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None
-
-
@dataclass
class Mimic3Phonemes:
current_settings: Mimic3Settings
@@ -80,12 +70,9 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
def __init__(self, settings: Mimic3Settings):
self.settings = settings
- # self._current_voice: typing.Optional[LoadedVoice] = None
- # self._current_settings = self.settings
-
self._results: typing.List[typing.Union[BaseResult, Mimic3Phonemes]] = []
- self.loaded_voices: typing.Dict[str, LoadedVoice] = {}
+ self.loaded_voices: typing.Dict[str, Mimic3Voice] = {}
@property
def voice(self) -> str:
@@ -107,10 +94,6 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
# TODO: Use speaker map
self.speaker_id = int(speaker_id_str)
- # self._current_voice = self._get_or_load_voice(
- # self.settings.voice or DEFAULT_VOICE
- # )
-
@property
def speaker_id(self) -> typing.Optional[int]:
return self.settings.speaker_id
@@ -131,27 +114,6 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
def get_default_voices_directories() -> typing.List[Path]:
return [_DIR.parent.parent / "voices"]
- # @property
- # def text_lang(self) -> str:
- # return (
- # self.settings.text_language
- # or self.settings.language
- # or (
- # self._current_voice.config.text_language
- # if self._current_voice
- # else None
- # )
- # or "en_US"
- # )
-
- # @property
- # def sample_rate(self) -> int:
- # return (
- # self._current_voice.config.audio.sample_rate
- # if self._current_voice
- # else self.settings.sample_rate
- # )
-
def get_voices(self) -> typing.Iterable[Voice]:
voices_dirs = (
self.settings.voices_directories
@@ -185,146 +147,71 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
def begin_utterance(self):
self._results.clear()
- # self._current_settings = deepcopy(self.settings)
def speak_text(self, text: str, text_language: typing.Optional[str] = None):
- text_language = text_language or self.language
- for sentence in gruut.sentences(text, lang=text_language):
- sent_phonemes = [w.phonemes for w in sentence if w.phonemes]
+ voice = self._get_or_load_voice(self.voice)
+
+ for sent_phonemes in voice.text_to_phonemes(text, text_language=text_language):
self._results.append(
Mimic3Phonemes(
- current_settings=deepcopy(self.settings),
- phonemes=sent_phonemes,
+ current_settings=deepcopy(self.settings), phonemes=sent_phonemes,
)
)
def _speak_sentence_phonemes(
- self,
- sent_phonemes,
- text: typing.Optional[str] = None,
- settings: typing.Optional[Mimic3Settings] = None,
+ self, sent_phonemes, settings: typing.Optional[Mimic3Settings] = None,
) -> AudioResult:
settings = settings or self.settings
- current_voice = self._get_or_load_voice(settings.voice or DEFAULT_VOICE)
+ voice = self._get_or_load_voice(settings.voice or self.voice)
+ sent_phoneme_ids = voice.phonemes_to_ids(sent_phonemes)
- config = current_voice.config
- onnx_model = current_voice.onnx_model
- phoneme_to_id = current_voice.phoneme_to_id
- phoneme_map = current_voice.phoneme_map or config.phonemes.phoneme_map
+ _LOGGER.debug("phonemes=%s, ids=%s", sent_phonemes, sent_phoneme_ids)
- sent_phoneme_ids = phonemes2ids.phonemes2ids(
- word_phonemes=sent_phonemes,
- phoneme_to_id=phoneme_to_id,
- pad=config.phonemes.pad,
- bos=config.phonemes.bos,
- eos=config.phonemes.eos,
- auto_bos_eos=config.phonemes.auto_bos_eos,
- blank=config.phonemes.blank,
- blank_word=config.phonemes.blank_word,
- blank_between=config.phonemes.blank_between,
- blank_at_start=config.phonemes.blank_at_start,
- blank_at_end=config.phonemes.blank_at_end,
- simple_punctuation=config.phonemes.simple_punctuation,
- punctuation_map=config.phonemes.punctuation_map,
- separate=config.phonemes.separate,
- separate_graphemes=config.phonemes.separate_graphemes,
- separate_tones=config.phonemes.separate_tones,
- tone_before=config.phonemes.tone_before,
- phoneme_map=phoneme_map,
- fail_on_missing=False,
+ audio = voice.ids_to_audio(
+ sent_phoneme_ids,
+ speaker=self.speaker_id,
+ length_scale=settings.length_scale,
+ noise_scale=settings.noise_scale,
+ noise_w=settings.noise_w,
)
- if text:
- _LOGGER.debug("%s %s %s", text, sent_phonemes, sent_phoneme_ids)
- else:
- _LOGGER.debug("%s %s", sent_phonemes, sent_phoneme_ids)
-
- # Create model inputs
- text_array = np.expand_dims(np.array(sent_phoneme_ids, dtype=np.int64), 0)
- text_lengths_array = np.array([text_array.shape[1]], dtype=np.int64)
- scales_array = np.array(
- [
- settings.noise_scale,
- settings.length_scale,
- settings.noise_w,
- ],
- dtype=np.float32,
- )
-
- inputs = {
- "input": text_array,
- "input_lengths": text_lengths_array,
- "scales": scales_array,
- }
-
- if config.is_multispeaker:
- speaker_id = settings.speaker_id if settings.speaker_id is not None else 0
- speaker_id_array = np.array([speaker_id], dtype=np.int64)
- inputs["sid"] = speaker_id_array
-
- # Infer audio from phonemes
- start_time = time.perf_counter()
- audio = onnx_model.run(None, inputs)[0].squeeze()
- audio = audio_float_to_int16(audio)
- end_time = time.perf_counter()
-
- # Compute real-time factor
- audio_duration_sec = audio.shape[-1] / config.audio.sample_rate
- infer_sec = end_time - start_time
- real_time_factor = (
- infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
- )
-
- _LOGGER.debug("RTF: %s", real_time_factor)
-
audio_bytes = audio.tobytes()
return AudioResult(
- sample_rate_hz=config.audio.sample_rate,
+ sample_rate_hz=voice.config.audio.sample_rate,
audio_bytes=audio_bytes,
# 16-bit mono
sample_width_bytes=2,
num_channels=1,
)
- def speak_tokens(self, tokens: typing.Iterable[BaseToken]):
+ def speak_tokens(
+ self,
+ tokens: typing.Iterable[BaseToken],
+ text_language: typing.Optional[str] = None,
+ ):
+ voice = self._get_or_load_voice(self.voice)
token_phonemes: PHONEMES_LIST = []
for token in tokens:
if isinstance(token, Word):
- word_role = xmlescape(token.role) if token.role else ""
- word_text = xmlescape(token.text)
-
- sentence = next(
- iter(
- gruut.sentences(
-                        f'<w role="{word_role}">{word_text}</w>', ssml=True
- )
- )
+ word_phonemes = voice.word_to_phonemes(
+ token.text, word_role=token.role, text_language=text_language
)
- token_phonemes.extend(w.phonemes for w in sentence if w.phonemes)
+ token_phonemes.append(word_phonemes)
elif isinstance(token, Phonemes):
phoneme_str = token.text.strip()
if " " in phoneme_str:
token_phonemes.append(phoneme_str.split())
else:
- token_phonemes.append(list(phoneme_str))
+ token_phonemes.append(list(IPA.graphemes(phoneme_str)))
elif isinstance(token, SayAs):
- word_text = xmlescape(token.text)
- interpret_as = xmlescape(token.interpret_as)
- format_attr = (
- f'format="{xmlescape(token.format)}"' if token.format else ""
+ say_as_phonemes = voice.say_as_to_phonemes(
+ token.text,
+ interpret_as=token.interpret_as,
+ say_format=token.format,
+ text_language=text_language,
)
-
- sentence = next(
- iter(
- gruut.sentences(
-                        f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
- ssml=True,
- )
- )
- )
-
- token_phonemes.extend(w.phonemes for w in sentence if w.phonemes)
+ token_phonemes.extend(say_as_phonemes)
if token_phonemes:
self._results.append(
@@ -379,7 +266,7 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
if sent_phonemes:
yield self._speak_sentence_phonemes(sent_phonemes)
- def _get_or_load_voice(self, voice_key: str) -> LoadedVoice:
+ def _get_or_load_voice(self, voice_key: str) -> Mimic3Voice:
existing_voice = self.loaded_voices.get(voice_key)
if existing_voice is not None:
return existing_voice
@@ -399,57 +286,7 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
return existing_voice
- _LOGGER.debug("Loading voice from %s", model_dir)
-
- config_path = model_dir / "config.json"
- _LOGGER.debug("Loading model config from %s", config_path)
-
- with open(config_path, "r", encoding="utf-8") as config_file:
- config = TrainingConfig.load(config_file)
-
- # phoneme -> id
- phoneme_ids_path = model_dir / "phonemes.txt"
- _LOGGER.debug("Loading model phonemes from %s", phoneme_ids_path)
- with open(phoneme_ids_path, "r", encoding="utf-8") as ids_file:
- phoneme_to_id = phonemes2ids.load_phoneme_ids(ids_file)
-
- generator_path = model_dir / "generator.onnx"
- _LOGGER.debug("Loading model from %s", generator_path)
-
- sess_options = onnxruntime.SessionOptions()
- # sess_options.enable_cpu_mem_arena = False
- # sess_options.enable_mem_pattern = False
- # sess_options.enable_mem_reuse = False
-
- onnx_model = onnxruntime.InferenceSession(
- str(generator_path), sess_options=sess_options
- )
-
- voice = LoadedVoice(
- config=config, onnx_model=onnx_model, phoneme_to_id=phoneme_to_id
- )
-
- # valid_phonemes = []
- # for phoneme_str in self._phoneme_to_id:
- # maybe_phoneme = Phoneme(phoneme_str)
- # if any(
- # [
- # maybe_phoneme.vowel,
- # maybe_phoneme.consonant,
- # maybe_phoneme.dipthong,
- # maybe_phoneme.schwa,
- # ]
- # ):
- # valid_phonemes.append(maybe_phoneme)
-
- # self._voice_phonemes = Phonemes(phonemes=valid_phonemes)
-
- # phoneme -> phoneme, phoneme, ...
- phoneme_map_path = model_dir / "phoneme_map.txt"
- if phoneme_map_path.is_file():
- _LOGGER.debug("Loading phoneme map from %s", phoneme_map_path)
- with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
- voice.phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)
+ voice = Mimic3Voice.load_from_directory(model_dir)
_LOGGER.info("Loaded voice from %s", model_dir)
diff --git a/mimic3-tts/mimic3_tts/voice.py b/mimic3-tts/mimic3_tts/voice.py
new file mode 100644
index 0000000..140e6ce
--- /dev/null
+++ b/mimic3-tts/mimic3_tts/voice.py
@@ -0,0 +1,403 @@
+#!/usr/bin/env python3
+import itertools
+import logging
+import time
+import typing
+from abc import ABCMeta, abstractmethod
+from pathlib import Path
+from xml.sax.saxutils import escape as xmlescape
+
+import espeak_phonemizer
+import gruut
+import numpy as np
+import onnxruntime
+import phonemes2ids
+from gruut_ipa import IPA
+
+from mimic3_tts.config import Phonemizer, TrainingConfig
+from mimic3_tts.utils import audio_float_to_int16
+
+PHONEME_TYPE = str
+PHONEME_ID_TYPE = int
+WORD_PHONEMES_TYPE = typing.List[typing.List[PHONEME_TYPE]]
+PHONEME_MAP_TYPE = typing.Dict[PHONEME_TYPE, typing.List[PHONEME_TYPE]]
+
+SPEAKER_NAME_TYPE = str
+SPEAKER_ID_TYPE = int
+SPEAKER_MAP_TYPE = typing.Dict[SPEAKER_NAME_TYPE, SPEAKER_ID_TYPE]
+
+DEFAULT_LANGUAGE = "en_US"
+
+_LOGGER = logging.getLogger(__name__)
+
+# -----------------------------------------------------------------------------
+
+
+class Mimic3Voice(metaclass=ABCMeta):
+ def __init__(
+ self,
+ config: TrainingConfig,
+ onnx_model: onnxruntime.InferenceSession,
+ phoneme_to_id: typing.Dict[PHONEME_TYPE, int],
+ phoneme_map: typing.Optional[PHONEME_MAP_TYPE] = None,
+ speaker_map: typing.Optional[SPEAKER_MAP_TYPE] = None,
+ ):
+ self.config = config
+ self.onnx_model = onnx_model
+ self.phoneme_to_id = phoneme_to_id
+ self.phoneme_map = phoneme_map
+ self.speaker_map = speaker_map
+
+ @abstractmethod
+ def text_to_phonemes(
+ self, text: str, text_language: typing.Optional[str] = None
+ ) -> typing.Iterable[WORD_PHONEMES_TYPE]:
+ pass
+
+ def word_to_phonemes(
+ self,
+ word_text: str,
+ word_role: typing.Optional[str] = None,
+ text_language: typing.Optional[str] = None,
+ ) -> typing.List[PHONEME_TYPE]:
+ word_phonemes = []
+ for sent_phonemes in self.text_to_phonemes(
+ word_text, text_language=text_language
+ ):
+ for sent_word_phonemes in sent_phonemes:
+ word_phonemes.extend(sent_word_phonemes)
+
+ return word_phonemes
+
+ def say_as_to_phonemes(
+ self,
+ text: str,
+ interpret_as: str,
+ say_format: typing.Optional[str] = None,
+ text_language: typing.Optional[str] = None,
+ ) -> WORD_PHONEMES_TYPE:
+ word_phonemes = []
+ for sent_phonemes in self.text_to_phonemes(text, text_language=text_language):
+ word_phonemes.extend(sent_phonemes)
+
+ return word_phonemes
+
+ def phonemes_to_ids(
+ self, phonemes: WORD_PHONEMES_TYPE
+ ) -> typing.Sequence[PHONEME_ID_TYPE]:
+ phoneme_map = self.phoneme_map or self.config.phonemes.phoneme_map
+
+ return phonemes2ids.phonemes2ids(
+ word_phonemes=phonemes,
+ phoneme_to_id=self.phoneme_to_id,
+ pad=self.config.phonemes.pad,
+ bos=self.config.phonemes.bos,
+ eos=self.config.phonemes.eos,
+ auto_bos_eos=self.config.phonemes.auto_bos_eos,
+ blank=self.config.phonemes.blank,
+ blank_word=self.config.phonemes.blank_word,
+ blank_between=self.config.phonemes.blank_between,
+ blank_at_start=self.config.phonemes.blank_at_start,
+ blank_at_end=self.config.phonemes.blank_at_end,
+ simple_punctuation=self.config.phonemes.simple_punctuation,
+ punctuation_map=self.config.phonemes.punctuation_map,
+ separate=self.config.phonemes.separate,
+ separate_graphemes=self.config.phonemes.separate_graphemes,
+ separate_tones=self.config.phonemes.separate_tones,
+ tone_before=self.config.phonemes.tone_before,
+ phoneme_map=phoneme_map,
+ fail_on_missing=False,
+ )
+
+ def ids_to_audio(
+ self,
+ phoneme_ids: typing.Sequence[PHONEME_ID_TYPE],
+ speaker: typing.Optional[
+ typing.Union[SPEAKER_NAME_TYPE, SPEAKER_ID_TYPE]
+ ] = None,
+ length_scale: float = 1.0,
+ noise_scale: float = 0.333,
+ noise_w: float = 1.0,
+ ) -> np.ndarray:
+ # Create model inputs
+ text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+ text_lengths_array = np.array([text_array.shape[1]], dtype=np.int64)
+ scales_array = np.array(
+ [noise_scale, length_scale, noise_w,], dtype=np.float32,
+ )
+
+ # TODO: Use settings from voice config
+ inputs = {
+ "input": text_array,
+ "input_lengths": text_lengths_array,
+ "scales": scales_array,
+ }
+
+ if self.config.is_multispeaker:
+ speaker_id = 0
+ if isinstance(speaker, SPEAKER_NAME_TYPE):
+ if self.speaker_map:
+ speaker_id = self.speaker_map.get(speaker, speaker_id)
+ elif speaker is not None:
+ speaker_id = speaker
+
+ speaker_id_array = np.array([speaker_id], dtype=np.int64)
+ inputs["sid"] = speaker_id_array
+
+ # Infer audio from phonemes
+ start_time = time.perf_counter()
+ audio = self.onnx_model.run(None, inputs)[0].squeeze()
+ audio = audio_float_to_int16(audio)
+ end_time = time.perf_counter()
+
+ # Compute real-time factor
+ audio_duration_sec = audio.shape[-1] / self.config.audio.sample_rate
+ infer_sec = end_time - start_time
+ real_time_factor = (
+ infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
+ )
+
+ _LOGGER.debug("RTF: %s", real_time_factor)
+
+ return audio
+
+ @staticmethod
+ def load_from_directory(
+ voice_dir: typing.Union[str, Path],
+ session_options: typing.Optional[onnxruntime.SessionOptions] = None,
+ ) -> "Mimic3Voice":
+ voice_dir = Path(voice_dir)
+ _LOGGER.debug("Loading voice from %s", voice_dir)
+
+ config_path = voice_dir / "config.json"
+ _LOGGER.debug("Loading config from %s", config_path)
+
+ with open(config_path, "r", encoding="utf-8") as config_file:
+ config = TrainingConfig.load(config_file)
+
+ # phoneme -> id
+ phoneme_ids_path = voice_dir / "phonemes.txt"
+ _LOGGER.debug("Loading model phonemes from %s", phoneme_ids_path)
+ with open(phoneme_ids_path, "r", encoding="utf-8") as ids_file:
+ phoneme_to_id = phonemes2ids.load_phoneme_ids(ids_file)
+
+ generator_path = voice_dir / "generator.onnx"
+ _LOGGER.debug("Loading model from %s", generator_path)
+
+ # Load onnx model
+ session_options = session_options or onnxruntime.SessionOptions()
+ onnx_model = onnxruntime.InferenceSession(
+ str(generator_path), sess_options=session_options
+ )
+
+ # phoneme -> phoneme, phoneme, ...
+ phoneme_map: typing.Optional[PHONEME_MAP_TYPE] = None
+ phoneme_map_path = voice_dir / "phoneme_map.txt"
+ if phoneme_map_path.is_file():
+ _LOGGER.debug("Loading phoneme map from %s", phoneme_map_path)
+ with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
+ phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)
+
+ # TODO: Load speaker map
+
+ if config.phonemizer == Phonemizer.GRUUT:
+ return GruutVoice(
+ config=config,
+ onnx_model=onnx_model,
+ phoneme_to_id=phoneme_to_id,
+ phoneme_map=phoneme_map,
+ )
+
+ if config.phonemizer == Phonemizer.ESPEAK:
+ return EspeakVoice(
+ config=config,
+ onnx_model=onnx_model,
+ phoneme_to_id=phoneme_to_id,
+ phoneme_map=phoneme_map,
+ )
+ if config.phonemizer == Phonemizer.SYMBOLS:
+ return SymbolsVoice(
+ config=config,
+ onnx_model=onnx_model,
+ phoneme_to_id=phoneme_to_id,
+ phoneme_map=phoneme_map,
+ )
+
+ raise ValueError(f"Unsupported phonemizer: {config.phonemizer}")
+
+
+# -----------------------------------------------------------------------------
+
+
+class GruutVoice(Mimic3Voice):
+ def text_to_phonemes(
+ self, text: str, text_language: typing.Optional[str] = None
+ ) -> typing.Iterable[WORD_PHONEMES_TYPE]:
+ text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
+ for sentence in gruut.sentences(text, lang=text_language):
+ sent_phonemes = [w.phonemes for w in sentence if w.phonemes]
+ if sent_phonemes:
+ yield sent_phonemes
+
+ def word_to_phonemes(
+ self,
+ word_text: str,
+ word_role: typing.Optional[str] = None,
+ text_language: typing.Optional[str] = None,
+ ) -> typing.List[PHONEME_TYPE]:
+ text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
+
+ word_role = xmlescape(word_role) if word_role else ""
+ word_text = xmlescape(word_text)
+
+ sentence = next(
+ iter(
+ gruut.sentences(
+                f'<w role="{word_role}">{word_text}</w>',
+ ssml=True,
+ lang=text_language,
+ )
+ )
+ )
+
+ sentence_word = next(iter(sentence))
+
+ return sentence_word.phonemes
+
+ def say_as_to_phonemes(
+ self,
+ text: str,
+ interpret_as: str,
+ say_format: typing.Optional[str] = None,
+ text_language: typing.Optional[str] = None,
+ ) -> WORD_PHONEMES_TYPE:
+ text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
+
+ word_text = xmlescape(text)
+ interpret_as = xmlescape(interpret_as)
+ format_attr = f'format="{xmlescape(say_format)}"' if say_format else ""
+
+ sentences = gruut.sentences(
+            f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
+ ssml=True,
+ lang=text_language,
+ )
+
+ sent_phonemes: WORD_PHONEMES_TYPE = []
+
+ for sentence in sentences:
+ sent_phonemes.extend(w.phonemes for w in sentence if w.phonemes)
+
+ return sent_phonemes
+
+
+# -----------------------------------------------------------------------------
+
+
+class EspeakVoice(Mimic3Voice):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._phonemizer = espeak_phonemizer.Phonemizer()
+
+ def text_to_phonemes(
+ self, text: str, text_language: typing.Optional[str] = None
+ ) -> typing.Iterable[WORD_PHONEMES_TYPE]:
+ phoneme_separator = ""
+ word_separator = self.config.phonemes.word_separator
+
+ text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
+
+ voice = self._language_to_voice(text_language)
+
+ phoneme_str = self._phonemizer.phonemize(
+ text,
+ voice=voice,
+ keep_clause_breakers=True,
+ phoneme_separator=phoneme_separator,
+ word_separator=word_separator,
+ punctuation_separator=phoneme_separator,
+ )
+
+ word_phonemes = [
+ list(IPA.graphemes(wp_str)) for wp_str in phoneme_str.split(word_separator)
+ ]
+
+ yield word_phonemes
+
+ def word_to_phonemes(
+ self,
+ word_text: str,
+ word_role: typing.Optional[str] = None,
+ text_language: typing.Optional[str] = None,
+ ) -> typing.List[PHONEME_TYPE]:
+ phoneme_separator = ""
+ text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
+
+ word_role = xmlescape(word_role) if word_role else ""
+ word_text = xmlescape(word_text)
+
+ voice = self._language_to_voice(text_language)
+
+ phoneme_str = self._phonemizer.phonemize(
+            f'<w role="{word_role}">{word_text}</w>',
+ voice=voice,
+ keep_clause_breakers=True,
+ phoneme_separator=phoneme_separator,
+ punctuation_separator=phoneme_separator,
+ ssml=True,
+ )
+
+ word_phonemes = list(IPA.graphemes(phoneme_str))
+
+ return word_phonemes
+
+ def say_as_to_phonemes(
+ self,
+ text: str,
+ interpret_as: str,
+ say_format: typing.Optional[str] = None,
+ text_language: typing.Optional[str] = None,
+ ) -> WORD_PHONEMES_TYPE:
+ phoneme_separator = ""
+ word_separator = self.config.phonemes.word_separator
+ text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
+
+ word_text = xmlescape(text)
+ interpret_as = xmlescape(interpret_as)
+ format_attr = f'format="{xmlescape(say_format)}"' if say_format else ""
+
+ voice = self._language_to_voice(text_language)
+
+ phoneme_str = self._phonemizer.phonemize(
+            f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
+ voice=voice,
+ keep_clause_breakers=True,
+ phoneme_separator=phoneme_separator,
+ punctuation_separator=phoneme_separator,
+ word_separator=word_separator,
+ ssml=True,
+ )
+
+ word_phonemes = [
+ list(IPA.graphemes(wp_str)) for wp_str in phoneme_str.split(word_separator)
+ ]
+
+ return word_phonemes
+
+ def _language_to_voice(self, language: str) -> str:
+ # en_US -> en-us
+ return language.strip().lower().replace("_", "-")
+
+
+# -----------------------------------------------------------------------------
+
+
+class SymbolsVoice(Mimic3Voice):
+ def text_to_phonemes(
+ self, text: str, text_language: typing.Optional[str] = None
+ ) -> typing.Iterable[WORD_PHONEMES_TYPE]:
+ word_separator = self.config.phonemes.word_separator
+ word_phonemes = [
+ list(IPA.graphemes(wp_str)) for wp_str in text.split(word_separator)
+ ]
+ yield word_phonemes
diff --git a/mimic3-tts/mypy.ini b/mimic3-tts/mypy.ini
index 7d81e3f..916fdd7 100644
--- a/mimic3-tts/mypy.ini
+++ b/mimic3-tts/mypy.ini
@@ -2,3 +2,6 @@
[mypy-setuptools.*]
ignore_missing_imports = True
+
+[mypy-onnxruntime.*]
+ignore_missing_imports = True