Add espeak and symbol voices
This commit is contained in:
parent
1eae034d8c
commit
fb9cd71919
6 changed files with 543 additions and 237 deletions
|
|
@ -1,2 +1,3 @@
|
|||
from .tts import Mimic3TextToSpeechSystem, Mimic3Settings
|
||||
from opentts_abc import AudioResult, MarkResult
|
||||
|
||||
from .tts import Mimic3Settings, Mimic3TextToSpeechSystem
|
||||
|
|
|
|||
|
|
@ -5,13 +5,23 @@ import wave
|
|||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
from opentts_abc.ssml import SSMLSpeaker
|
||||
from mimic3_tts.tts import Mimic3TextToSpeechSystem, Mimic3Settings, AudioResult, MarkResult
|
||||
|
||||
settings = Mimic3Settings(length_scale=1.2, noise_w=0)
|
||||
from mimic3_tts.tts import (
|
||||
AudioResult,
|
||||
MarkResult,
|
||||
Mimic3Settings,
|
||||
Mimic3TextToSpeechSystem,
|
||||
)
|
||||
|
||||
settings = Mimic3Settings()
|
||||
tts = Mimic3TextToSpeechSystem(settings)
|
||||
|
||||
speaker = SSMLSpeaker(tts)
|
||||
ssml = '<speak><s><voice name="en_US/vctk_low#20">This is a test.</voice></s></speak>'
|
||||
# ssml = '<speak><voice name="el_GR/rapunzelina_low"><s><w>Το</w><w>αερόστρωμνό</w><w>μου</w><w>είναι</w><w>γεμάτο</w><w>χέλια.</w></s></voice></speak>'
|
||||
# ssml = '<speak><voice name="uk_UK/m-ailabs_low"><s><w>бажав</w></s></voice></speak>'
|
||||
# ssml = '<speak><s><w>Hello</w><w>World</w></s></speak>'
|
||||
# ssml = '<speak><s>Hello world</s></speak>'
|
||||
ssml = '<speak><s><voice name="el_GR/rapunzelina_low"><say-as interpret-as="characters">12</say-as></voice></s></speak>'
|
||||
|
||||
wav_file: wave.Wave_write = wave.open("out.wav", "wb")
|
||||
params_set = False
|
||||
|
|
|
|||
|
|
@ -1,18 +1,4 @@
|
|||
"""Configuration classes"""
|
||||
# Copyright 2021 Mycroft AI Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import collections
|
||||
import json
|
||||
import typing
|
||||
|
|
@ -20,6 +6,7 @@ from dataclasses import dataclass, field
|
|||
from enum import Enum
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from dataclasses_json import DataClassJsonMixin
|
||||
from gruut_ipa import IPA
|
||||
from phonemes2ids import BlankBetween
|
||||
|
|
@ -59,6 +46,51 @@ class AudioConfig(DataClassJsonMixin):
|
|||
if self.mel_fmax is not None:
|
||||
assert self.mel_fmax <= self.sample_rate // 2
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Normalization
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def normalize(self, mel_db: np.ndarray) -> np.ndarray:
|
||||
"""Put values in [0, max_norm] or [-max_norm, max_norm]"""
|
||||
mel_norm = ((mel_db - self.ref_level_db) - self.min_level_db) / (
|
||||
-self.min_level_db
|
||||
)
|
||||
if self.symmetric_norm:
|
||||
# Symmetric norm
|
||||
mel_norm = ((2 * self.max_norm) * mel_norm) - self.max_norm
|
||||
if self.clip_norm:
|
||||
mel_norm = np.clip(mel_norm, -self.max_norm, self.max_norm)
|
||||
else:
|
||||
# Asymmetric norm
|
||||
mel_norm = self.max_norm * mel_norm
|
||||
if self.clip_norm:
|
||||
mel_norm = np.clip(mel_norm, 0, self.max_norm)
|
||||
|
||||
return mel_norm
|
||||
|
||||
def denormalize(self, mel_db: np.ndarray) -> np.ndarray:
|
||||
"""Pull values out of [0, max_norm] or [-max_norm, max_norm]"""
|
||||
if self.symmetric_norm:
|
||||
# Symmetric norm
|
||||
if self.clip_norm:
|
||||
mel_denorm = np.clip(mel_db, -self.max_norm, self.max_norm)
|
||||
|
||||
mel_denorm = (
|
||||
(mel_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)
|
||||
) + self.min_level_db
|
||||
else:
|
||||
# Asymmetric norm
|
||||
if self.clip_norm:
|
||||
mel_denorm = np.clip(mel_db, 0, self.max_norm)
|
||||
|
||||
mel_denorm = (
|
||||
mel_denorm * -self.min_level_db / self.max_norm
|
||||
) + self.min_level_db
|
||||
|
||||
mel_denorm += self.ref_level_db
|
||||
|
||||
return mel_denorm
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelConfig(DataClassJsonMixin):
|
||||
|
|
@ -84,7 +116,7 @@ class ModelConfig(DataClassJsonMixin):
|
|||
upsample_kernel_sizes: typing.Tuple[int, ...] = (16, 16, 4, 4)
|
||||
n_layers_q: int = 3
|
||||
use_spectral_norm: bool = False
|
||||
gin_channels: int = 256
|
||||
gin_channels: int = 0 # single speaker
|
||||
use_sdp: bool = True # StochasticDurationPredictor
|
||||
|
||||
@property
|
||||
|
|
@ -100,7 +132,7 @@ class PhonemesConfig(DataClassJsonMixin):
|
|||
word_separator: str = "#"
|
||||
"""Separator between word phonemes in CSV input (must not match phoneme_separator)"""
|
||||
|
||||
phoneme_to_id: typing.Optional[typing.Mapping[str, int]] = None
|
||||
phoneme_to_id: typing.Optional[typing.Dict[str, int]] = None
|
||||
pad: typing.Optional[str] = "_"
|
||||
bos: typing.Optional[str] = None
|
||||
eos: typing.Optional[str] = None
|
||||
|
|
@ -110,15 +142,18 @@ class PhonemesConfig(DataClassJsonMixin):
|
|||
blank_at_start: bool = True
|
||||
blank_at_end: bool = True
|
||||
simple_punctuation: bool = True
|
||||
punctuation_map: typing.Optional[typing.Mapping[str, str]] = None
|
||||
punctuation_map: typing.Optional[typing.Dict[str, str]] = None
|
||||
separate: typing.Optional[typing.List[str]] = None
|
||||
separate_graphemes: bool = False
|
||||
separate_tones: bool = False
|
||||
tone_before: bool = False
|
||||
phoneme_map: typing.Optional[typing.Mapping[str, str]] = None
|
||||
phoneme_map: typing.Optional[typing.Dict[str, str]] = None
|
||||
auto_bos_eos: bool = False
|
||||
minor_break: typing.Optional[str] = IPA.BREAK_MINOR.value
|
||||
major_break: typing.Optional[str] = IPA.BREAK_MAJOR.value
|
||||
break_phonemes_into_graphemes: bool = False
|
||||
drop_stress: bool = False
|
||||
symbols: typing.Optional[typing.List[str]] = None
|
||||
|
||||
def split_word_phonemes(self, phonemes_str: str) -> typing.List[typing.List[str]]:
|
||||
"""Split phonemes string into a list of lists (outer is words, inner is individual phonemes in each word)"""
|
||||
|
|
@ -158,8 +193,7 @@ class MetadataFormat(str, Enum):
|
|||
@dataclass
|
||||
class DatasetConfig:
|
||||
name: str
|
||||
metadata_path: typing.Optional[typing.Union[str, Path]] = None
|
||||
train_path: typing.Optional[typing.Union[str, Path]] = None
|
||||
metadata_format: MetadataFormat = MetadataFormat.TEXT
|
||||
multispeaker: bool = False
|
||||
text_language: typing.Optional[str] = None
|
||||
audio_dir: typing.Optional[typing.Union[str, Path]] = None
|
||||
|
|
@ -183,6 +217,13 @@ class AlignerConfig:
|
|||
casing: typing.Optional[TextCasing] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class InferenceConfig:
|
||||
length_scale: float = 1.0
|
||||
noise_scale: float = 0.667
|
||||
noise_w: float = 0.8
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrainingConfig(DataClassJsonMixin):
|
||||
seed: int = 1234
|
||||
|
|
@ -206,6 +247,8 @@ class TrainingConfig(DataClassJsonMixin):
|
|||
min_spec_length: typing.Optional[int] = None
|
||||
max_spec_length: typing.Optional[int] = None
|
||||
|
||||
min_speaker_utterances: typing.Optional[int] = None
|
||||
|
||||
last_epoch: int = 1
|
||||
global_step: int = 1
|
||||
best_loss: typing.Optional[float] = None
|
||||
|
|
@ -216,22 +259,31 @@ class TrainingConfig(DataClassJsonMixin):
|
|||
text_language: typing.Optional[str] = None
|
||||
phonemizer: typing.Optional[Phonemizer] = None
|
||||
datasets: typing.List[DatasetConfig] = field(default_factory=list)
|
||||
dataset_format: MetadataFormat = MetadataFormat.TEXT
|
||||
inference: InferenceConfig = field(default_factory=InferenceConfig)
|
||||
|
||||
version: int = 1
|
||||
git_commit: str = ""
|
||||
|
||||
@property
|
||||
def is_multispeaker(self):
|
||||
return (
|
||||
self.model.is_multispeaker
|
||||
or any(d.multispeaker for d in self.datasets)
|
||||
)
|
||||
return self.model.is_multispeaker or any(d.multispeaker for d in self.datasets)
|
||||
|
||||
def save(self, config_file: typing.TextIO):
|
||||
"""Save config as JSON to a file"""
|
||||
json.dump(self.to_dict(), config_file, indent=4)
|
||||
|
||||
def get_speaker_id(self, dataset_name: str, speaker_name: str) -> int:
|
||||
if self.speaker_id_map is None:
|
||||
self.speaker_id_map = {}
|
||||
|
||||
full_speaker_name = f"{dataset_name}_{speaker_name}"
|
||||
speaker_id = self.speaker_id_map.get(full_speaker_name)
|
||||
if speaker_id is None:
|
||||
speaker_id = len(self.speaker_id_map)
|
||||
self.speaker_id_map[full_speaker_name] = speaker_id
|
||||
|
||||
return speaker_id
|
||||
|
||||
@staticmethod
|
||||
def load(config_file: typing.TextIO) -> "TrainingConfig":
|
||||
"""Load config from a JSON file"""
|
||||
|
|
|
|||
|
|
@ -1,11 +1,9 @@
|
|||
#!/usr/bin/env python3
|
||||
import dataclasses
|
||||
import logging
|
||||
import time
|
||||
import typing
|
||||
from abc import ABCMeta
|
||||
from dataclasses import dataclass, field
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from xml.sax.saxutils import escape as xmlescape
|
||||
|
||||
|
|
@ -14,22 +12,22 @@ import numpy as np
|
|||
import onnxruntime
|
||||
import phonemes2ids
|
||||
from gruut.const import LookupPhonemes, WordRole
|
||||
from gruut_ipa import guess_phonemes, IPA, Phonemes, Phoneme
|
||||
|
||||
from gruut_ipa import IPA, Phoneme, guess_phonemes
|
||||
from opentts_abc import (
|
||||
TextToSpeechSystem,
|
||||
Voice,
|
||||
BaseToken,
|
||||
BaseResult,
|
||||
MarkResult,
|
||||
AudioResult,
|
||||
Word,
|
||||
BaseResult,
|
||||
BaseToken,
|
||||
MarkResult,
|
||||
Phonemes,
|
||||
SayAs,
|
||||
TextToSpeechSystem,
|
||||
Voice,
|
||||
Word,
|
||||
)
|
||||
|
||||
from mimic3_tts.config import TrainingConfig
|
||||
from mimic3_tts.utils import audio_float_to_int16
|
||||
from mimic3_tts.voice import Mimic3Voice
|
||||
|
||||
_DIR = Path(__file__).parent
|
||||
|
||||
|
|
@ -51,20 +49,12 @@ class Mimic3Settings:
|
|||
voices_directories: typing.Optional[typing.Iterable[typing.Union[str, Path]]] = None
|
||||
speaker_id: typing.Optional[int] = None
|
||||
length_scale: float = 1.0
|
||||
noise_scale: float = 0.333
|
||||
noise_w: float = 1.0
|
||||
noise_scale: float = 0.667
|
||||
noise_w: float = 0.8
|
||||
text_language: typing.Optional[str] = None
|
||||
sample_rate: int = 22050
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoadedVoice:
|
||||
config: TrainingConfig
|
||||
onnx_model: onnxruntime.InferenceSession
|
||||
phoneme_to_id: typing.Mapping[str, int]
|
||||
phoneme_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class Mimic3Phonemes:
|
||||
current_settings: Mimic3Settings
|
||||
|
|
@ -80,12 +70,9 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
def __init__(self, settings: Mimic3Settings):
|
||||
self.settings = settings
|
||||
|
||||
# self._current_voice: typing.Optional[LoadedVoice] = None
|
||||
# self._current_settings = self.settings
|
||||
|
||||
self._results: typing.List[typing.Union[BaseResult, Mimic3Phonemes]] = []
|
||||
|
||||
self.loaded_voices: typing.Dict[str, LoadedVoice] = {}
|
||||
self.loaded_voices: typing.Dict[str, Mimic3Voice] = {}
|
||||
|
||||
@property
|
||||
def voice(self) -> str:
|
||||
|
|
@ -107,10 +94,6 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
# TODO: Use speaker map
|
||||
self.speaker_id = int(speaker_id_str)
|
||||
|
||||
# self._current_voice = self._get_or_load_voice(
|
||||
# self.settings.voice or DEFAULT_VOICE
|
||||
# )
|
||||
|
||||
@property
|
||||
def speaker_id(self) -> typing.Optional[int]:
|
||||
return self.settings.speaker_id
|
||||
|
|
@ -131,27 +114,6 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
def get_default_voices_directories() -> typing.List[Path]:
|
||||
return [_DIR.parent.parent / "voices"]
|
||||
|
||||
# @property
|
||||
# def text_lang(self) -> str:
|
||||
# return (
|
||||
# self.settings.text_language
|
||||
# or self.settings.language
|
||||
# or (
|
||||
# self._current_voice.config.text_language
|
||||
# if self._current_voice
|
||||
# else None
|
||||
# )
|
||||
# or "en_US"
|
||||
# )
|
||||
|
||||
# @property
|
||||
# def sample_rate(self) -> int:
|
||||
# return (
|
||||
# self._current_voice.config.audio.sample_rate
|
||||
# if self._current_voice
|
||||
# else self.settings.sample_rate
|
||||
# )
|
||||
|
||||
def get_voices(self) -> typing.Iterable[Voice]:
|
||||
voices_dirs = (
|
||||
self.settings.voices_directories
|
||||
|
|
@ -185,146 +147,71 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
|
||||
def begin_utterance(self):
|
||||
self._results.clear()
|
||||
# self._current_settings = deepcopy(self.settings)
|
||||
|
||||
def speak_text(self, text: str, text_language: typing.Optional[str] = None):
|
||||
text_language = text_language or self.language
|
||||
for sentence in gruut.sentences(text, lang=text_language):
|
||||
sent_phonemes = [w.phonemes for w in sentence if w.phonemes]
|
||||
voice = self._get_or_load_voice(self.voice)
|
||||
|
||||
for sent_phonemes in voice.text_to_phonemes(text, text_language=text_language):
|
||||
self._results.append(
|
||||
Mimic3Phonemes(
|
||||
current_settings=deepcopy(self.settings),
|
||||
phonemes=sent_phonemes,
|
||||
current_settings=deepcopy(self.settings), phonemes=sent_phonemes,
|
||||
)
|
||||
)
|
||||
|
||||
def _speak_sentence_phonemes(
|
||||
self,
|
||||
sent_phonemes,
|
||||
text: typing.Optional[str] = None,
|
||||
settings: typing.Optional[Mimic3Settings] = None,
|
||||
self, sent_phonemes, settings: typing.Optional[Mimic3Settings] = None,
|
||||
) -> AudioResult:
|
||||
settings = settings or self.settings
|
||||
current_voice = self._get_or_load_voice(settings.voice or DEFAULT_VOICE)
|
||||
voice = self._get_or_load_voice(settings.voice or self.voice)
|
||||
sent_phoneme_ids = voice.phonemes_to_ids(sent_phonemes)
|
||||
|
||||
config = current_voice.config
|
||||
onnx_model = current_voice.onnx_model
|
||||
phoneme_to_id = current_voice.phoneme_to_id
|
||||
phoneme_map = current_voice.phoneme_map or config.phonemes.phoneme_map
|
||||
_LOGGER.debug("phonemes=%s, ids=%s", sent_phonemes, sent_phoneme_ids)
|
||||
|
||||
sent_phoneme_ids = phonemes2ids.phonemes2ids(
|
||||
word_phonemes=sent_phonemes,
|
||||
phoneme_to_id=phoneme_to_id,
|
||||
pad=config.phonemes.pad,
|
||||
bos=config.phonemes.bos,
|
||||
eos=config.phonemes.eos,
|
||||
auto_bos_eos=config.phonemes.auto_bos_eos,
|
||||
blank=config.phonemes.blank,
|
||||
blank_word=config.phonemes.blank_word,
|
||||
blank_between=config.phonemes.blank_between,
|
||||
blank_at_start=config.phonemes.blank_at_start,
|
||||
blank_at_end=config.phonemes.blank_at_end,
|
||||
simple_punctuation=config.phonemes.simple_punctuation,
|
||||
punctuation_map=config.phonemes.punctuation_map,
|
||||
separate=config.phonemes.separate,
|
||||
separate_graphemes=config.phonemes.separate_graphemes,
|
||||
separate_tones=config.phonemes.separate_tones,
|
||||
tone_before=config.phonemes.tone_before,
|
||||
phoneme_map=phoneme_map,
|
||||
fail_on_missing=False,
|
||||
audio = voice.ids_to_audio(
|
||||
sent_phoneme_ids,
|
||||
speaker=self.speaker_id,
|
||||
length_scale=settings.length_scale,
|
||||
noise_scale=settings.noise_scale,
|
||||
noise_w=settings.noise_w,
|
||||
)
|
||||
|
||||
if text:
|
||||
_LOGGER.debug("%s %s %s", text, sent_phonemes, sent_phoneme_ids)
|
||||
else:
|
||||
_LOGGER.debug("%s %s", sent_phonemes, sent_phoneme_ids)
|
||||
|
||||
# Create model inputs
|
||||
text_array = np.expand_dims(np.array(sent_phoneme_ids, dtype=np.int64), 0)
|
||||
text_lengths_array = np.array([text_array.shape[1]], dtype=np.int64)
|
||||
scales_array = np.array(
|
||||
[
|
||||
settings.noise_scale,
|
||||
settings.length_scale,
|
||||
settings.noise_w,
|
||||
],
|
||||
dtype=np.float32,
|
||||
)
|
||||
|
||||
inputs = {
|
||||
"input": text_array,
|
||||
"input_lengths": text_lengths_array,
|
||||
"scales": scales_array,
|
||||
}
|
||||
|
||||
if config.is_multispeaker:
|
||||
speaker_id = settings.speaker_id if settings.speaker_id is not None else 0
|
||||
speaker_id_array = np.array([speaker_id], dtype=np.int64)
|
||||
inputs["sid"] = speaker_id_array
|
||||
|
||||
# Infer audio from phonemes
|
||||
start_time = time.perf_counter()
|
||||
audio = onnx_model.run(None, inputs)[0].squeeze()
|
||||
audio = audio_float_to_int16(audio)
|
||||
end_time = time.perf_counter()
|
||||
|
||||
# Compute real-time factor
|
||||
audio_duration_sec = audio.shape[-1] / config.audio.sample_rate
|
||||
infer_sec = end_time - start_time
|
||||
real_time_factor = (
|
||||
infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
|
||||
)
|
||||
|
||||
_LOGGER.debug("RTF: %s", real_time_factor)
|
||||
|
||||
audio_bytes = audio.tobytes()
|
||||
return AudioResult(
|
||||
sample_rate_hz=config.audio.sample_rate,
|
||||
sample_rate_hz=voice.config.audio.sample_rate,
|
||||
audio_bytes=audio_bytes,
|
||||
# 16-bit mono
|
||||
sample_width_bytes=2,
|
||||
num_channels=1,
|
||||
)
|
||||
|
||||
def speak_tokens(self, tokens: typing.Iterable[BaseToken]):
|
||||
def speak_tokens(
|
||||
self,
|
||||
tokens: typing.Iterable[BaseToken],
|
||||
text_language: typing.Optional[str] = None,
|
||||
):
|
||||
voice = self._get_or_load_voice(self.voice)
|
||||
token_phonemes: PHONEMES_LIST = []
|
||||
|
||||
for token in tokens:
|
||||
if isinstance(token, Word):
|
||||
word_role = xmlescape(token.role) if token.role else ""
|
||||
word_text = xmlescape(token.text)
|
||||
|
||||
sentence = next(
|
||||
iter(
|
||||
gruut.sentences(
|
||||
f'<w role="{word_role}">{word_text}</w>', ssml=True
|
||||
)
|
||||
)
|
||||
word_phonemes = voice.word_to_phonemes(
|
||||
token.text, word_role=token.role, text_language=text_language
|
||||
)
|
||||
token_phonemes.extend(w.phonemes for w in sentence if w.phonemes)
|
||||
token_phonemes.append(word_phonemes)
|
||||
elif isinstance(token, Phonemes):
|
||||
phoneme_str = token.text.strip()
|
||||
if " " in phoneme_str:
|
||||
token_phonemes.append(phoneme_str.split())
|
||||
else:
|
||||
token_phonemes.append(list(phoneme_str))
|
||||
token_phonemes.append(list(IPA.graphemes(phoneme_str)))
|
||||
elif isinstance(token, SayAs):
|
||||
word_text = xmlescape(token.text)
|
||||
interpret_as = xmlescape(token.interpret_as)
|
||||
format_attr = (
|
||||
f'format="{xmlescape(token.format)}"' if token.format else ""
|
||||
say_as_phonemes = voice.say_as_to_phonemes(
|
||||
token.text,
|
||||
interpret_as=token.interpret_as,
|
||||
say_format=token.format,
|
||||
text_language=text_language,
|
||||
)
|
||||
|
||||
sentence = next(
|
||||
iter(
|
||||
gruut.sentences(
|
||||
f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
|
||||
ssml=True,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
token_phonemes.extend(w.phonemes for w in sentence if w.phonemes)
|
||||
token_phonemes.extend(say_as_phonemes)
|
||||
|
||||
if token_phonemes:
|
||||
self._results.append(
|
||||
|
|
@ -379,7 +266,7 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
if sent_phonemes:
|
||||
yield self._speak_sentence_phonemes(sent_phonemes)
|
||||
|
||||
def _get_or_load_voice(self, voice_key: str) -> LoadedVoice:
|
||||
def _get_or_load_voice(self, voice_key: str) -> Mimic3Voice:
|
||||
existing_voice = self.loaded_voices.get(voice_key)
|
||||
if existing_voice is not None:
|
||||
return existing_voice
|
||||
|
|
@ -399,57 +286,7 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
|
|||
|
||||
return existing_voice
|
||||
|
||||
_LOGGER.debug("Loading voice from %s", model_dir)
|
||||
|
||||
config_path = model_dir / "config.json"
|
||||
_LOGGER.debug("Loading model config from %s", config_path)
|
||||
|
||||
with open(config_path, "r", encoding="utf-8") as config_file:
|
||||
config = TrainingConfig.load(config_file)
|
||||
|
||||
# phoneme -> id
|
||||
phoneme_ids_path = model_dir / "phonemes.txt"
|
||||
_LOGGER.debug("Loading model phonemes from %s", phoneme_ids_path)
|
||||
with open(phoneme_ids_path, "r", encoding="utf-8") as ids_file:
|
||||
phoneme_to_id = phonemes2ids.load_phoneme_ids(ids_file)
|
||||
|
||||
generator_path = model_dir / "generator.onnx"
|
||||
_LOGGER.debug("Loading model from %s", generator_path)
|
||||
|
||||
sess_options = onnxruntime.SessionOptions()
|
||||
# sess_options.enable_cpu_mem_arena = False
|
||||
# sess_options.enable_mem_pattern = False
|
||||
# sess_options.enable_mem_reuse = False
|
||||
|
||||
onnx_model = onnxruntime.InferenceSession(
|
||||
str(generator_path), sess_options=sess_options
|
||||
)
|
||||
|
||||
voice = LoadedVoice(
|
||||
config=config, onnx_model=onnx_model, phoneme_to_id=phoneme_to_id
|
||||
)
|
||||
|
||||
# valid_phonemes = []
|
||||
# for phoneme_str in self._phoneme_to_id:
|
||||
# maybe_phoneme = Phoneme(phoneme_str)
|
||||
# if any(
|
||||
# [
|
||||
# maybe_phoneme.vowel,
|
||||
# maybe_phoneme.consonant,
|
||||
# maybe_phoneme.dipthong,
|
||||
# maybe_phoneme.schwa,
|
||||
# ]
|
||||
# ):
|
||||
# valid_phonemes.append(maybe_phoneme)
|
||||
|
||||
# self._voice_phonemes = Phonemes(phonemes=valid_phonemes)
|
||||
|
||||
# phoneme -> phoneme, phoneme, ...
|
||||
phoneme_map_path = model_dir / "phoneme_map.txt"
|
||||
if phoneme_map_path.is_file():
|
||||
_LOGGER.debug("Loading phoneme map from %s", phoneme_map_path)
|
||||
with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
|
||||
voice.phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)
|
||||
voice = Mimic3Voice.load_from_directory(model_dir)
|
||||
|
||||
_LOGGER.info("Loaded voice from %s", model_dir)
|
||||
|
||||
|
|
|
|||
403
mimic3-tts/mimic3_tts/voice.py
Normal file
403
mimic3-tts/mimic3_tts/voice.py
Normal file
|
|
@ -0,0 +1,403 @@
|
|||
#!/usr/bin/env python3
|
||||
import itertools
|
||||
import logging
|
||||
import time
|
||||
import typing
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from pathlib import Path
|
||||
from xml.sax.saxutils import escape as xmlescape
|
||||
|
||||
import espeak_phonemizer
|
||||
import gruut
|
||||
import numpy as np
|
||||
import onnxruntime
|
||||
import phonemes2ids
|
||||
from gruut_ipa import IPA
|
||||
|
||||
from mimic3_tts.config import Phonemizer, TrainingConfig
|
||||
from mimic3_tts.utils import audio_float_to_int16
|
||||
|
||||
PHONEME_TYPE = str
|
||||
PHONEME_ID_TYPE = int
|
||||
WORD_PHONEMES_TYPE = typing.List[typing.List[PHONEME_TYPE]]
|
||||
PHONEME_MAP_TYPE = typing.Dict[PHONEME_TYPE, typing.List[PHONEME_TYPE]]
|
||||
|
||||
SPEAKER_NAME_TYPE = str
|
||||
SPEAKER_ID_TYPE = int
|
||||
SPEAKER_MAP_TYPE = typing.Dict[SPEAKER_NAME_TYPE, SPEAKER_ID_TYPE]
|
||||
|
||||
DEFAULT_LANGUAGE = "en_US"
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class Mimic3Voice(metaclass=ABCMeta):
|
||||
def __init__(
|
||||
self,
|
||||
config: TrainingConfig,
|
||||
onnx_model: onnxruntime.InferenceSession,
|
||||
phoneme_to_id: typing.Dict[PHONEME_TYPE, int],
|
||||
phoneme_map: typing.Optional[PHONEME_MAP_TYPE] = None,
|
||||
speaker_map: typing.Optional[SPEAKER_MAP_TYPE] = None,
|
||||
):
|
||||
self.config = config
|
||||
self.onnx_model = onnx_model
|
||||
self.phoneme_to_id = phoneme_to_id
|
||||
self.phoneme_map = phoneme_map
|
||||
self.speaker_map = speaker_map
|
||||
|
||||
@abstractmethod
|
||||
def text_to_phonemes(
|
||||
self, text: str, text_language: typing.Optional[str] = None
|
||||
) -> typing.Iterable[WORD_PHONEMES_TYPE]:
|
||||
pass
|
||||
|
||||
def word_to_phonemes(
|
||||
self,
|
||||
word_text: str,
|
||||
word_role: typing.Optional[str] = None,
|
||||
text_language: typing.Optional[str] = None,
|
||||
) -> typing.List[PHONEME_TYPE]:
|
||||
word_phonemes = []
|
||||
for sent_phonemes in self.text_to_phonemes(
|
||||
word_text, text_language=text_language
|
||||
):
|
||||
for sent_word_phonemes in sent_phonemes:
|
||||
word_phonemes.extend(sent_word_phonemes)
|
||||
|
||||
return word_phonemes
|
||||
|
||||
def say_as_to_phonemes(
|
||||
self,
|
||||
text: str,
|
||||
interpret_as: str,
|
||||
say_format: typing.Optional[str] = None,
|
||||
text_language: typing.Optional[str] = None,
|
||||
) -> WORD_PHONEMES_TYPE:
|
||||
word_phonemes = []
|
||||
for sent_phonemes in self.text_to_phonemes(text, text_language=text_language):
|
||||
word_phonemes.extend(sent_phonemes)
|
||||
|
||||
return word_phonemes
|
||||
|
||||
def phonemes_to_ids(
|
||||
self, phonemes: WORD_PHONEMES_TYPE
|
||||
) -> typing.Sequence[PHONEME_ID_TYPE]:
|
||||
phoneme_map = self.phoneme_map or self.config.phonemes.phoneme_map
|
||||
|
||||
return phonemes2ids.phonemes2ids(
|
||||
word_phonemes=phonemes,
|
||||
phoneme_to_id=self.phoneme_to_id,
|
||||
pad=self.config.phonemes.pad,
|
||||
bos=self.config.phonemes.bos,
|
||||
eos=self.config.phonemes.eos,
|
||||
auto_bos_eos=self.config.phonemes.auto_bos_eos,
|
||||
blank=self.config.phonemes.blank,
|
||||
blank_word=self.config.phonemes.blank_word,
|
||||
blank_between=self.config.phonemes.blank_between,
|
||||
blank_at_start=self.config.phonemes.blank_at_start,
|
||||
blank_at_end=self.config.phonemes.blank_at_end,
|
||||
simple_punctuation=self.config.phonemes.simple_punctuation,
|
||||
punctuation_map=self.config.phonemes.punctuation_map,
|
||||
separate=self.config.phonemes.separate,
|
||||
separate_graphemes=self.config.phonemes.separate_graphemes,
|
||||
separate_tones=self.config.phonemes.separate_tones,
|
||||
tone_before=self.config.phonemes.tone_before,
|
||||
phoneme_map=phoneme_map,
|
||||
fail_on_missing=False,
|
||||
)
|
||||
|
||||
def ids_to_audio(
|
||||
self,
|
||||
phoneme_ids: typing.Sequence[PHONEME_ID_TYPE],
|
||||
speaker: typing.Optional[
|
||||
typing.Union[SPEAKER_NAME_TYPE, SPEAKER_ID_TYPE]
|
||||
] = None,
|
||||
length_scale: float = 1.0,
|
||||
noise_scale: float = 0.333,
|
||||
noise_w: float = 1.0,
|
||||
) -> np.ndarray:
|
||||
# Create model inputs
|
||||
text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
|
||||
text_lengths_array = np.array([text_array.shape[1]], dtype=np.int64)
|
||||
scales_array = np.array(
|
||||
[noise_scale, length_scale, noise_w,], dtype=np.float32,
|
||||
)
|
||||
|
||||
# TODO: Use settings from voice config
|
||||
inputs = {
|
||||
"input": text_array,
|
||||
"input_lengths": text_lengths_array,
|
||||
"scales": scales_array,
|
||||
}
|
||||
|
||||
if self.config.is_multispeaker:
|
||||
speaker_id = 0
|
||||
if isinstance(speaker, SPEAKER_NAME_TYPE):
|
||||
if self.speaker_map:
|
||||
speaker_id = self.speaker_map.get(speaker, speaker_id)
|
||||
elif speaker is not None:
|
||||
speaker_id = speaker
|
||||
|
||||
speaker_id_array = np.array([speaker_id], dtype=np.int64)
|
||||
inputs["sid"] = speaker_id_array
|
||||
|
||||
# Infer audio from phonemes
|
||||
start_time = time.perf_counter()
|
||||
audio = self.onnx_model.run(None, inputs)[0].squeeze()
|
||||
audio = audio_float_to_int16(audio)
|
||||
end_time = time.perf_counter()
|
||||
|
||||
# Compute real-time factor
|
||||
audio_duration_sec = audio.shape[-1] / self.config.audio.sample_rate
|
||||
infer_sec = end_time - start_time
|
||||
real_time_factor = (
|
||||
infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
|
||||
)
|
||||
|
||||
_LOGGER.debug("RTF: %s", real_time_factor)
|
||||
|
||||
return audio
|
||||
|
||||
@staticmethod
|
||||
def load_from_directory(
|
||||
voice_dir: typing.Union[str, Path],
|
||||
session_options: typing.Optional[onnxruntime.SessionOptions] = None,
|
||||
) -> "Mimic3Voice":
|
||||
voice_dir = Path(voice_dir)
|
||||
_LOGGER.debug("Loading voice from %s", voice_dir)
|
||||
|
||||
config_path = voice_dir / "config.json"
|
||||
_LOGGER.debug("Loading config from %s", config_path)
|
||||
|
||||
with open(config_path, "r", encoding="utf-8") as config_file:
|
||||
config = TrainingConfig.load(config_file)
|
||||
|
||||
# phoneme -> id
|
||||
phoneme_ids_path = voice_dir / "phonemes.txt"
|
||||
_LOGGER.debug("Loading model phonemes from %s", phoneme_ids_path)
|
||||
with open(phoneme_ids_path, "r", encoding="utf-8") as ids_file:
|
||||
phoneme_to_id = phonemes2ids.load_phoneme_ids(ids_file)
|
||||
|
||||
generator_path = voice_dir / "generator.onnx"
|
||||
_LOGGER.debug("Loading model from %s", generator_path)
|
||||
|
||||
# Load onnx model
|
||||
session_options = session_options or onnxruntime.SessionOptions()
|
||||
onnx_model = onnxruntime.InferenceSession(
|
||||
str(generator_path), sess_options=session_options
|
||||
)
|
||||
|
||||
# phoneme -> phoneme, phoneme, ...
|
||||
phoneme_map: typing.Optional[PHONEME_MAP_TYPE] = None
|
||||
phoneme_map_path = voice_dir / "phoneme_map.txt"
|
||||
if phoneme_map_path.is_file():
|
||||
_LOGGER.debug("Loading phoneme map from %s", phoneme_map_path)
|
||||
with open(phoneme_map_path, "r", encoding="utf-8") as map_file:
|
||||
phoneme_map = phonemes2ids.utils.load_phoneme_map(map_file)
|
||||
|
||||
# TODO: Load speaker map
|
||||
|
||||
if config.phonemizer == Phonemizer.GRUUT:
|
||||
return GruutVoice(
|
||||
config=config,
|
||||
onnx_model=onnx_model,
|
||||
phoneme_to_id=phoneme_to_id,
|
||||
phoneme_map=phoneme_map,
|
||||
)
|
||||
|
||||
if config.phonemizer == Phonemizer.ESPEAK:
|
||||
return EspeakVoice(
|
||||
config=config,
|
||||
onnx_model=onnx_model,
|
||||
phoneme_to_id=phoneme_to_id,
|
||||
phoneme_map=phoneme_map,
|
||||
)
|
||||
if config.phonemizer == Phonemizer.SYMBOLS:
|
||||
return SymbolsVoice(
|
||||
config=config,
|
||||
onnx_model=onnx_model,
|
||||
phoneme_to_id=phoneme_to_id,
|
||||
phoneme_map=phoneme_map,
|
||||
)
|
||||
|
||||
raise ValueError(f"Unsupported phonemizer: {config.phonemizer}")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class GruutVoice(Mimic3Voice):
|
||||
def text_to_phonemes(
|
||||
self, text: str, text_language: typing.Optional[str] = None
|
||||
) -> typing.Iterable[WORD_PHONEMES_TYPE]:
|
||||
text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
|
||||
for sentence in gruut.sentences(text, lang=text_language):
|
||||
sent_phonemes = [w.phonemes for w in sentence if w.phonemes]
|
||||
if sent_phonemes:
|
||||
yield sent_phonemes
|
||||
|
||||
def word_to_phonemes(
|
||||
self,
|
||||
word_text: str,
|
||||
word_role: typing.Optional[str] = None,
|
||||
text_language: typing.Optional[str] = None,
|
||||
) -> typing.List[PHONEME_TYPE]:
|
||||
text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
|
||||
|
||||
word_role = xmlescape(word_role) if word_role else ""
|
||||
word_text = xmlescape(word_text)
|
||||
|
||||
sentence = next(
|
||||
iter(
|
||||
gruut.sentences(
|
||||
f'<w role="{word_role}">{word_text}</w>',
|
||||
ssml=True,
|
||||
lang=text_language,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
sentence_word = next(iter(sentence))
|
||||
|
||||
return sentence_word.phonemes
|
||||
|
||||
def say_as_to_phonemes(
|
||||
self,
|
||||
text: str,
|
||||
interpret_as: str,
|
||||
say_format: typing.Optional[str] = None,
|
||||
text_language: typing.Optional[str] = None,
|
||||
) -> WORD_PHONEMES_TYPE:
|
||||
text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE
|
||||
|
||||
word_text = xmlescape(text)
|
||||
interpret_as = xmlescape(interpret_as)
|
||||
format_attr = f'format="{xmlescape(say_format)}"' if say_format else ""
|
||||
|
||||
sentences = gruut.sentences(
|
||||
f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
|
||||
ssml=True,
|
||||
lang=text_language,
|
||||
)
|
||||
|
||||
sent_phonemes: WORD_PHONEMES_TYPE = []
|
||||
|
||||
for sentence in sentences:
|
||||
sent_phonemes.extend(w.phonemes for w in sentence if w.phonemes)
|
||||
|
||||
return sent_phonemes
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class EspeakVoice(Mimic3Voice):
    """Voice that phonemizes text with espeak-ng via espeak_phonemizer."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # A single phonemizer instance is reused for every request
        self._phonemizer = espeak_phonemizer.Phonemizer()

    def text_to_phonemes(
        self, text: str, text_language: typing.Optional[str] = None
    ) -> typing.Iterable[WORD_PHONEMES_TYPE]:
        """Phonemize plain text, yielding one list of word phoneme lists."""
        word_sep = self.config.phonemes.word_separator
        lang = text_language or self.config.text_language or DEFAULT_LANGUAGE

        phoneme_str = self._phonemizer.phonemize(
            text,
            voice=self._language_to_voice(lang),
            keep_clause_breakers=True,
            phoneme_separator="",
            word_separator=word_sep,
            punctuation_separator="",
        )

        # Split the phonemizer output back into words, then each word
        # into individual IPA symbols.
        yield [
            list(IPA.graphemes(word_str))
            for word_str in phoneme_str.split(word_sep)
        ]

    def word_to_phonemes(
        self,
        word_text: str,
        word_role: typing.Optional[str] = None,
        text_language: typing.Optional[str] = None,
    ) -> typing.List[PHONEME_TYPE]:
        """Phonemize a single word, optionally with an SSML word role."""
        lang = text_language or self.config.text_language or DEFAULT_LANGUAGE

        # Escape caller-supplied text before embedding it in SSML
        word_role = xmlescape(word_role) if word_role else ""
        word_text = xmlescape(word_text)

        phoneme_str = self._phonemizer.phonemize(
            f'<w role="{word_role}">{word_text}</w>',
            voice=self._language_to_voice(lang),
            keep_clause_breakers=True,
            phoneme_separator="",
            punctuation_separator="",
            ssml=True,
        )

        return list(IPA.graphemes(phoneme_str))

    def say_as_to_phonemes(
        self,
        text: str,
        interpret_as: str,
        say_format: typing.Optional[str] = None,
        text_language: typing.Optional[str] = None,
    ) -> WORD_PHONEMES_TYPE:
        """Phonemize text through an SSML <say-as> element."""
        word_sep = self.config.phonemes.word_separator
        lang = text_language or self.config.text_language or DEFAULT_LANGUAGE

        # Escape caller-supplied values before embedding them in SSML
        word_text = xmlescape(text)
        interpret_as = xmlescape(interpret_as)
        format_attr = f'format="{xmlescape(say_format)}"' if say_format else ""

        phoneme_str = self._phonemizer.phonemize(
            f'<say-as interpret-as="{interpret_as}" {format_attr}>{word_text}</say-as>',
            voice=self._language_to_voice(lang),
            keep_clause_breakers=True,
            phoneme_separator="",
            word_separator=word_sep,
            punctuation_separator="",
            ssml=True,
        )

        return [
            list(IPA.graphemes(word_str))
            for word_str in phoneme_str.split(word_sep)
        ]

    def _language_to_voice(self, language: str) -> str:
        """Map a locale code to an espeak voice name (en_US -> en-us)."""
        return language.strip().lower().replace("_", "-")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class SymbolsVoice(Mimic3Voice):
|
||||
def text_to_phonemes(
    self, text: str, text_language: typing.Optional[str] = None
) -> typing.Iterable[WORD_PHONEMES_TYPE]:
    """Treat the input text as raw phoneme symbols.

    The text is split into words on the configured word separator and
    each word is split into individual IPA symbols; no phonemization
    is performed. ``text_language`` is accepted for interface
    compatibility but unused.
    """
    word_sep = self.config.phonemes.word_separator

    yield [
        list(IPA.graphemes(word_str))
        for word_str in text.split(word_sep)
    ]
|
||||
|
|
@ -2,3 +2,6 @@
|
|||
|
||||
[mypy-setuptools.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-onnxruntime.*]
|
||||
ignore_missing_imports = True
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue