Add epitran voices
This commit is contained in:
parent
3ccf2c7ab3
commit
5dd49fbbb0
7 changed files with 190 additions and 2 deletions
|
|
@ -33,9 +33,15 @@ args = get_args()
|
|||
|
||||
# Pick the verbosity once; --debug switches from INFO to DEBUG.
log_level = logging.DEBUG if args.debug else logging.INFO
logging.basicConfig(level=log_level)

# Override epitran
# NOTE(review): the root logger level is set explicitly in addition to
# basicConfig(), presumably because importing epitran has already touched
# the logging configuration -- confirm.
logging.getLogger().setLevel(log_level)
|
||||
|
||||
|
||||
_LOGGER.debug(args)
|
||||
|
||||
|
|
|
|||
|
|
@ -299,6 +299,13 @@ Voices whose "phonemes" are characters from an alphabet, typically with some pun
|
|||
For languages whose orthography (writing system) is close enough to the spoken form, character-based voices allow the phonemization step to be skipped. However, these voices do not support text normalization, so numbers, dates, etc. must be written out.
|
||||
|
||||
|
||||
### Epitran-based Voices
|
||||
|
||||
Voices that use [epitran](https://github.com/dmort27/epitran/) for phonemization.
|
||||
|
||||
epitran uses rules to generate phonetic pronunciations from text. It does not support text normalization, however, so numbers, dates, etc. must be written out.
|
||||
|
||||
|
||||
## License
|
||||
|
||||
See [license file](LICENSE)
|
||||
|
|
|
|||
|
|
@ -196,6 +196,7 @@ class Phonemizer(str, Enum):
|
|||
SYMBOLS = "symbols"
|
||||
GRUUT = "gruut"
|
||||
ESPEAK = "espeak"
|
||||
EPITRAN = "epitran"
|
||||
|
||||
|
||||
class Aligner(str, Enum):
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ from enum import Enum
|
|||
from pathlib import Path
|
||||
from xml.sax.saxutils import escape as xmlescape
|
||||
|
||||
import epitran
|
||||
import espeak_phonemizer
|
||||
import gruut
|
||||
import numpy as np
|
||||
|
|
@ -310,6 +311,7 @@ class Mimic3Voice(metaclass=ABCMeta):
|
|||
phoneme_map=phoneme_map,
|
||||
speaker_map=speaker_map,
|
||||
)
|
||||
|
||||
if config.phonemizer == Phonemizer.SYMBOLS:
|
||||
# Phonemes are characters from an alphabet
|
||||
return SymbolsVoice(
|
||||
|
|
@ -320,6 +322,16 @@ class Mimic3Voice(metaclass=ABCMeta):
|
|||
speaker_map=speaker_map,
|
||||
)
|
||||
|
||||
if config.phonemizer == Phonemizer.EPITRAN:
|
||||
# Phonemes are from epitran: https://github.com/dmort27/epitran/
|
||||
return EpitranVoice(
|
||||
config=config,
|
||||
onnx_model=onnx_model,
|
||||
phoneme_to_id=phoneme_to_id,
|
||||
phoneme_map=phoneme_map,
|
||||
speaker_map=speaker_map,
|
||||
)
|
||||
|
||||
raise ValueError(f"Unsupported phonemizer: {config.phonemizer}")
|
||||
|
||||
|
||||
|
|
@ -525,3 +537,51 @@ class SymbolsVoice(Mimic3Voice):
|
|||
list(IPA.graphemes(wp_str)) for wp_str in text.split(word_separator)
|
||||
]
|
||||
yield word_phonemes, BreakType.NONE
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class EpitranVoice(Mimic3Voice):
    """Voice that gets its phonemes from epitran (https://github.com/dmort27/epitran/)"""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # One epitran transliterator per language code, created on first use
        self._epis: typing.Dict[str, epitran.Epitran] = {}

    def text_to_phonemes(
        self, text: str, text_language: typing.Optional[str] = None
    ) -> TEXT_TO_PHONEMES_TYPE:
        """Transliterate text with epitran and yield (word phonemes, break type) pairs.

        The language is the first truthy value of ``text_language``, the
        voice config's ``text_language``, and ``DEFAULT_LANGUAGE``; it is
        passed unchanged to ``epitran.Epitran``.

        The transliterated string is split on whitespace into words, and each
        word is split into graphemes. If the voice config defines a minor or
        major break symbol, a group of words is yielded each time a word's
        last grapheme equals one of those symbols; any words left over at the
        end are yielded as a major break. Otherwise all words are yielded
        together as a single utterance.
        """
        language = text_language or self.config.text_language or DEFAULT_LANGUAGE

        # Build and remember the transliterator the first time a language is seen
        if language not in self._epis:
            self._epis[language] = epitran.Epitran(language)

        transliterated = self._epis[language].transliterate(text)
        words = [list(IPA.graphemes(word_str)) for word_str in transliterated.split()]

        minor = self.config.phonemes.minor_break
        major = self.config.phonemes.major_break

        if not (minor or major):
            # No split
            yield words, BreakType.UTTERANCE
            return

        # Split on breaks
        pending = []
        for word in words:
            pending.append(word)
            final_grapheme = word[-1]

            if minor and (final_grapheme == minor):
                break_type = BreakType.MINOR
            elif major and (final_grapheme == major):
                break_type = BreakType.MAJOR
            else:
                # Word does not end a group; keep accumulating
                continue

            yield pending, break_type
            pending = []

        if pending:
            # Trailing words with no closing break symbol
            yield pending, BreakType.MAJOR
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
"de_DE/thorsten_low": {
|
||||
"files": {
|
||||
"LICENSE": {
|
||||
"size_bytes": 6557,
|
||||
"sha256_sum": "434e11b12f4a3f3096032bc35c5189afe7827b726212b2406a28189598d9c4cf"
|
||||
"size_bytes": 6555,
|
||||
"sha256_sum": "36ffd9dc085d529a7e60e1276d73ae5a030b020313e6c5408593a6ae2af39673"
|
||||
},
|
||||
"README.md": {
|
||||
"size_bytes": 193,
|
||||
|
|
@ -73,6 +73,10 @@
|
|||
"size_bytes": 155,
|
||||
"sha256_sum": "2e6c39454c35910c6b48518523429ec2e4f27ae8eb763e311877a017db23d4da"
|
||||
},
|
||||
"SOURCE": {
|
||||
"size_bytes": 18,
|
||||
"sha256_sum": "8197ffe96f3b6772797357e007d63cde409573a0bd3fe174489e01a5faa95553"
|
||||
},
|
||||
"config.json": {
|
||||
"size_bytes": 3434,
|
||||
"sha256_sum": "1fdaa1124e02cc177eb776fbc6e08c838b56bd2e86c82d8d7fe434d9337806b0"
|
||||
|
|
@ -404,6 +408,54 @@
|
|||
"speakers": [],
|
||||
"properties": {}
|
||||
},
|
||||
"fr_FR/m-ailabs_low": {
|
||||
"files": {
|
||||
"LICENSE": {
|
||||
"size_bytes": 1372,
|
||||
"sha256_sum": "fdd78a909fb9384d869363522b967557bc9e28e5b65874921f24e48cbb82f38c"
|
||||
},
|
||||
"README.md": {
|
||||
"size_bytes": 192,
|
||||
"sha256_sum": "cea70c7d84bd5f85607efd4545825a36f6117106e2deb8b00cdb923f550670e8"
|
||||
},
|
||||
"SOURCE": {
|
||||
"size_bytes": 61,
|
||||
"sha256_sum": "841520f6a8cc616e307a92552355691f8c3087fadda2e9b7a03a7863b2d0cf6a"
|
||||
},
|
||||
"config.json": {
|
||||
"size_bytes": 3608,
|
||||
"sha256_sum": "db66a1051ae131d82ebb9257faeab5d1403ddcf33342eb84ac8e3fe4cf9c7dc9"
|
||||
},
|
||||
"generator.onnx": {
|
||||
"size_bytes": 76330079,
|
||||
"sha256_sum": "cf1de519bf1d02e6d8d2685f64192783b9ca6d7a8bd101b03e20f5c5ceee28bb"
|
||||
},
|
||||
"phoneme_map.txt": {
|
||||
"size_bytes": 15,
|
||||
"sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
|
||||
},
|
||||
"phonemes.txt": {
|
||||
"size_bytes": 232,
|
||||
"sha256_sum": "711294d0b5a0ec08ec21ca8a75184e0fee3aba1e1adcf967fe5e1ef96f6c176e"
|
||||
},
|
||||
"speaker_map.csv": {
|
||||
"size_bytes": 119,
|
||||
"sha256_sum": "25df8f995e663206cb06a203a084051552557108d2e165f42476787cd6e1ee5e"
|
||||
},
|
||||
"speakers.txt": {
|
||||
"size_bytes": 64,
|
||||
"sha256_sum": "01e93046b04e95815d5397ee759bba055e80a5e6bdcabc117ff801c1da841d20"
|
||||
}
|
||||
},
|
||||
"speakers": [
|
||||
"ezwa",
|
||||
"nadine_eckert_boulet",
|
||||
"bernard",
|
||||
"zeckou",
|
||||
"gilles_g_le_blanc"
|
||||
],
|
||||
"properties": {}
|
||||
},
|
||||
"fr_FR/siwis_low": {
|
||||
"files": {
|
||||
"LICENSE": {
|
||||
|
|
@ -602,6 +654,14 @@
|
|||
},
|
||||
"ru_RU/multi_low": {
|
||||
"files": {
|
||||
"README.md": {
|
||||
"size_bytes": 266,
|
||||
"sha256_sum": "f2a1a1aecc439fefb59879481dc659eb532d7cbf4e417829f68543e5379cabdc"
|
||||
},
|
||||
"SOURCE": {
|
||||
"size_bytes": 132,
|
||||
"sha256_sum": "9c3b3fd5fe49e2e1d4e5daa5c643a713da955415d06fccf50add1185301f351a"
|
||||
},
|
||||
"config.json": {
|
||||
"size_bytes": 3923,
|
||||
"sha256_sum": "314e0fdd09183942d2f7393d4b950a12823849c0f72d22e62dc9858a6b4886c6"
|
||||
|
|
@ -618,6 +678,14 @@
|
|||
"size_bytes": 76335199,
|
||||
"sha256_sum": "cb84b12479fc619943cb8fbb56827f7fd95f5ffcbebf2c220606b3a9750bf2ca"
|
||||
},
|
||||
"m-ailabs/LICENSE": {
|
||||
"size_bytes": 1372,
|
||||
"sha256_sum": "fdd78a909fb9384d869363522b967557bc9e28e5b65874921f24e48cbb82f38c"
|
||||
},
|
||||
"m-ailabs/SOURCE": {
|
||||
"size_bytes": 61,
|
||||
"sha256_sum": "841520f6a8cc616e307a92552355691f8c3087fadda2e9b7a03a7863b2d0cf6a"
|
||||
},
|
||||
"phoneme_map.txt": {
|
||||
"size_bytes": 15,
|
||||
"sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
|
||||
|
|
@ -668,6 +736,48 @@
|
|||
"speakers": [],
|
||||
"properties": {}
|
||||
},
|
||||
"te_IN/cmu-indic_low": {
|
||||
"files": {
|
||||
"LICENSE": {
|
||||
"size_bytes": 960,
|
||||
"sha256_sum": "244ff21a910baf28bcb27b1975620a79d2be8611815ecc599f08eb06dd6f000e"
|
||||
},
|
||||
"README.md": {
|
||||
"size_bytes": 162,
|
||||
"sha256_sum": "19ee27d0ac6dbcc337a39ff442a13cd04302594e648877c82ed6a01f43400f3f"
|
||||
},
|
||||
"SOURCE": {
|
||||
"size_bytes": 30,
|
||||
"sha256_sum": "d79737ae72d64666485e1899ec32badf1f1043be2619463139d502d5f88f4167"
|
||||
},
|
||||
"config.json": {
|
||||
"size_bytes": 3647,
|
||||
"sha256_sum": "aaaee2aa729ccdf98776690187bce96e63a835d91b4683ecf2951e8ad32fc485"
|
||||
},
|
||||
"generator.onnx": {
|
||||
"size_bytes": 76331359,
|
||||
"sha256_sum": "c8abcadc961fba7369f5fff4672eba194a105ecaf044ca4e3fd0a5b3c81c11b6"
|
||||
},
|
||||
"phonemes.txt": {
|
||||
"size_bytes": 282,
|
||||
"sha256_sum": "ec8b51eb56fde1a81c7eb4442cb7cf7604596501b63093b441c04fbc943c895c"
|
||||
},
|
||||
"speaker_map.csv": {
|
||||
"size_bytes": 49,
|
||||
"sha256_sum": "0ce2ff1aa2d78aed066e36b204df647ac9c565b4bd7bdb3bb657e334d7a0e4ab"
|
||||
},
|
||||
"speakers.txt": {
|
||||
"size_bytes": 13,
|
||||
"sha256_sum": "7d9c38da91ac4289c23f20d7cb0b6efb69fc8ca4807d62f0a5d7f0a4e0cf77e4"
|
||||
}
|
||||
},
|
||||
"speakers": [
|
||||
"ss",
|
||||
"sk",
|
||||
"kpn"
|
||||
],
|
||||
"properties": {}
|
||||
},
|
||||
"uk_UK/m-ailabs_low": {
|
||||
"files": {
|
||||
"LICENSE": {
|
||||
|
|
|
|||
|
|
@ -3,6 +3,9 @@
|
|||
[mypy-setuptools.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-epitran.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-onnxruntime.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
dataclasses-json<1.0
|
||||
epitran==1.17
|
||||
espeak-phonemizer>=1.0,<2.0
|
||||
gruut>=2.3.0,<3.0
|
||||
numpy<2.0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue