Add epitran voices

This commit is contained in:
Michael Hansen 2022-04-07 12:03:29 -04:00
commit 5dd49fbbb0
7 changed files with 190 additions and 2 deletions

View file

@ -33,9 +33,15 @@ args = get_args()
if args.debug:
logging.basicConfig(level=logging.DEBUG)
# Also set the root logger level explicitly: basicConfig() does nothing when
# the root logger already has handlers (presumably installed as a side effect
# of importing epitran -- TODO confirm).
logging.getLogger().setLevel(logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
# Same explicit root-level override as in the debug branch, for INFO.
logging.getLogger().setLevel(logging.INFO)
_LOGGER.debug(args)

View file

@ -299,6 +299,13 @@ Voices whose "phonemes" are characters from an alphabet, typically with some pun
For voices whose orthography (writing system) is close enough to the spoken form, character-based voices allow for skipping the phonemization step. However, these voices do not support text normalization, so numbers, dates, etc. must be written out.
### Epitran-based Voices
Voices that use [epitran](https://github.com/dmort27/epitran/) for phonemization.
epitran uses rules to generate phonetic pronunciations from text. It does not support text normalization, however, so numbers, dates, etc. must be written out.
## License
See [license file](LICENSE)

View file

@ -196,6 +196,7 @@ class Phonemizer(str, Enum):
SYMBOLS = "symbols"
GRUUT = "gruut"
ESPEAK = "espeak"
EPITRAN = "epitran"
class Aligner(str, Enum):

View file

@ -23,6 +23,7 @@ from enum import Enum
from pathlib import Path
from xml.sax.saxutils import escape as xmlescape
import epitran
import espeak_phonemizer
import gruut
import numpy as np
@ -310,6 +311,7 @@ class Mimic3Voice(metaclass=ABCMeta):
phoneme_map=phoneme_map,
speaker_map=speaker_map,
)
if config.phonemizer == Phonemizer.SYMBOLS:
# Phonemes are characters from an alphabet
return SymbolsVoice(
@ -320,6 +322,16 @@ class Mimic3Voice(metaclass=ABCMeta):
speaker_map=speaker_map,
)
if config.phonemizer == Phonemizer.EPITRAN:
# Phonemes are from epitran: https://github.com/dmort27/epitran/
return EpitranVoice(
config=config,
onnx_model=onnx_model,
phoneme_to_id=phoneme_to_id,
phoneme_map=phoneme_map,
speaker_map=speaker_map,
)
raise ValueError(f"Unsupported phonemizer: {config.phonemizer}")
@ -525,3 +537,51 @@ class SymbolsVoice(Mimic3Voice):
list(IPA.graphemes(wp_str)) for wp_str in text.split(word_separator)
]
yield word_phonemes, BreakType.NONE
# -----------------------------------------------------------------------------
class EpitranVoice(Mimic3Voice):
    """Voice phonemized with epitran (https://github.com/dmort27/epitran/).

    One ``epitran.Epitran`` transliterator is created per language code on
    first use and reused for later calls.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Language code -> transliterator, filled lazily in text_to_phonemes
        self._epis: typing.Dict[str, epitran.Epitran] = {}

    def text_to_phonemes(
        self, text: str, text_language: typing.Optional[str] = None
    ) -> TEXT_TO_PHONEMES_TYPE:
        """Transliterate text with epitran and yield phonemes with break types.

        The language is ``text_language`` if given, otherwise the voice
        config's ``text_language``, otherwise ``DEFAULT_LANGUAGE``.

        The transliterated string is split on whitespace and every word is
        split into graphemes. If the voice config defines a minor or major
        break phoneme, words are grouped and a group is yielded each time a
        word ends with one of those phonemes (``BreakType.MINOR`` or
        ``BreakType.MAJOR``); leftover words are yielded as a final
        ``BreakType.MAJOR`` group. If neither break phoneme is defined, all
        words are yielded at once with ``BreakType.UTTERANCE``.
        """
        language = text_language or self.config.text_language or DEFAULT_LANGUAGE

        # Build the transliterator for this language only once
        if language not in self._epis:
            self._epis[language] = epitran.Epitran(language)

        transliterated = self._epis[language].transliterate(text)
        words = [list(IPA.graphemes(word)) for word in transliterated.split()]

        minor_break = self.config.phonemes.minor_break
        major_break = self.config.phonemes.major_break

        if not (minor_break or major_break):
            # No split
            yield words, BreakType.UTTERANCE
            return

        # Split on breaks
        pending = []
        for word in words:
            pending.append(word)
            final_phoneme = word[-1]

            if minor_break and (final_phoneme == minor_break):
                break_type = BreakType.MINOR
            elif major_break and (final_phoneme == major_break):
                break_type = BreakType.MAJOR
            else:
                # Word does not end a group; keep accumulating
                continue

            yield pending, break_type

            # Start a fresh list so the group just yielded is left untouched
            pending = []

        if pending:
            # Trailing words without a closing break phoneme
            yield pending, BreakType.MAJOR

View file

@ -2,8 +2,8 @@
"de_DE/thorsten_low": {
"files": {
"LICENSE": {
"size_bytes": 6557,
"sha256_sum": "434e11b12f4a3f3096032bc35c5189afe7827b726212b2406a28189598d9c4cf"
"size_bytes": 6555,
"sha256_sum": "36ffd9dc085d529a7e60e1276d73ae5a030b020313e6c5408593a6ae2af39673"
},
"README.md": {
"size_bytes": 193,
@ -73,6 +73,10 @@
"size_bytes": 155,
"sha256_sum": "2e6c39454c35910c6b48518523429ec2e4f27ae8eb763e311877a017db23d4da"
},
"SOURCE": {
"size_bytes": 18,
"sha256_sum": "8197ffe96f3b6772797357e007d63cde409573a0bd3fe174489e01a5faa95553"
},
"config.json": {
"size_bytes": 3434,
"sha256_sum": "1fdaa1124e02cc177eb776fbc6e08c838b56bd2e86c82d8d7fe434d9337806b0"
@ -404,6 +408,54 @@
"speakers": [],
"properties": {}
},
"fr_FR/m-ailabs_low": {
"files": {
"LICENSE": {
"size_bytes": 1372,
"sha256_sum": "fdd78a909fb9384d869363522b967557bc9e28e5b65874921f24e48cbb82f38c"
},
"README.md": {
"size_bytes": 192,
"sha256_sum": "cea70c7d84bd5f85607efd4545825a36f6117106e2deb8b00cdb923f550670e8"
},
"SOURCE": {
"size_bytes": 61,
"sha256_sum": "841520f6a8cc616e307a92552355691f8c3087fadda2e9b7a03a7863b2d0cf6a"
},
"config.json": {
"size_bytes": 3608,
"sha256_sum": "db66a1051ae131d82ebb9257faeab5d1403ddcf33342eb84ac8e3fe4cf9c7dc9"
},
"generator.onnx": {
"size_bytes": 76330079,
"sha256_sum": "cf1de519bf1d02e6d8d2685f64192783b9ca6d7a8bd101b03e20f5c5ceee28bb"
},
"phoneme_map.txt": {
"size_bytes": 15,
"sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
},
"phonemes.txt": {
"size_bytes": 232,
"sha256_sum": "711294d0b5a0ec08ec21ca8a75184e0fee3aba1e1adcf967fe5e1ef96f6c176e"
},
"speaker_map.csv": {
"size_bytes": 119,
"sha256_sum": "25df8f995e663206cb06a203a084051552557108d2e165f42476787cd6e1ee5e"
},
"speakers.txt": {
"size_bytes": 64,
"sha256_sum": "01e93046b04e95815d5397ee759bba055e80a5e6bdcabc117ff801c1da841d20"
}
},
"speakers": [
"ezwa",
"nadine_eckert_boulet",
"bernard",
"zeckou",
"gilles_g_le_blanc"
],
"properties": {}
},
"fr_FR/siwis_low": {
"files": {
"LICENSE": {
@ -602,6 +654,14 @@
},
"ru_RU/multi_low": {
"files": {
"README.md": {
"size_bytes": 266,
"sha256_sum": "f2a1a1aecc439fefb59879481dc659eb532d7cbf4e417829f68543e5379cabdc"
},
"SOURCE": {
"size_bytes": 132,
"sha256_sum": "9c3b3fd5fe49e2e1d4e5daa5c643a713da955415d06fccf50add1185301f351a"
},
"config.json": {
"size_bytes": 3923,
"sha256_sum": "314e0fdd09183942d2f7393d4b950a12823849c0f72d22e62dc9858a6b4886c6"
@ -618,6 +678,14 @@
"size_bytes": 76335199,
"sha256_sum": "cb84b12479fc619943cb8fbb56827f7fd95f5ffcbebf2c220606b3a9750bf2ca"
},
"m-ailabs/LICENSE": {
"size_bytes": 1372,
"sha256_sum": "fdd78a909fb9384d869363522b967557bc9e28e5b65874921f24e48cbb82f38c"
},
"m-ailabs/SOURCE": {
"size_bytes": 61,
"sha256_sum": "841520f6a8cc616e307a92552355691f8c3087fadda2e9b7a03a7863b2d0cf6a"
},
"phoneme_map.txt": {
"size_bytes": 15,
"sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
@ -668,6 +736,48 @@
"speakers": [],
"properties": {}
},
"te_IN/cmu-indic_low": {
"files": {
"LICENSE": {
"size_bytes": 960,
"sha256_sum": "244ff21a910baf28bcb27b1975620a79d2be8611815ecc599f08eb06dd6f000e"
},
"README.md": {
"size_bytes": 162,
"sha256_sum": "19ee27d0ac6dbcc337a39ff442a13cd04302594e648877c82ed6a01f43400f3f"
},
"SOURCE": {
"size_bytes": 30,
"sha256_sum": "d79737ae72d64666485e1899ec32badf1f1043be2619463139d502d5f88f4167"
},
"config.json": {
"size_bytes": 3647,
"sha256_sum": "aaaee2aa729ccdf98776690187bce96e63a835d91b4683ecf2951e8ad32fc485"
},
"generator.onnx": {
"size_bytes": 76331359,
"sha256_sum": "c8abcadc961fba7369f5fff4672eba194a105ecaf044ca4e3fd0a5b3c81c11b6"
},
"phonemes.txt": {
"size_bytes": 282,
"sha256_sum": "ec8b51eb56fde1a81c7eb4442cb7cf7604596501b63093b441c04fbc943c895c"
},
"speaker_map.csv": {
"size_bytes": 49,
"sha256_sum": "0ce2ff1aa2d78aed066e36b204df647ac9c565b4bd7bdb3bb657e334d7a0e4ab"
},
"speakers.txt": {
"size_bytes": 13,
"sha256_sum": "7d9c38da91ac4289c23f20d7cb0b6efb69fc8ca4807d62f0a5d7f0a4e0cf77e4"
}
},
"speakers": [
"ss",
"sk",
"kpn"
],
"properties": {}
},
"uk_UK/m-ailabs_low": {
"files": {
"LICENSE": {

View file

@ -3,6 +3,9 @@
[mypy-setuptools.*]
ignore_missing_imports = True
[mypy-epitran.*]
ignore_missing_imports = True
[mypy-onnxruntime.*]
ignore_missing_imports = True

View file

@ -1,4 +1,5 @@
dataclasses-json<1.0
epitran==1.17
espeak-phonemizer>=1.0,<2.0
gruut>=2.3.0,<3.0
numpy<2.0