From 5dd49fbbb00666fb7f402339ff3cf022e8a177e9 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Thu, 7 Apr 2022 12:03:29 -0400 Subject: [PATCH] Add epitran voices --- mimic3-http/mimic3_http/__main__.py | 6 ++ mimic3-tts/README.md | 7 ++ mimic3-tts/mimic3_tts/config.py | 1 + mimic3-tts/mimic3_tts/voice.py | 60 +++++++++++++++ mimic3-tts/mimic3_tts/voices.json | 114 +++++++++++++++++++++++++++- mimic3-tts/mypy.ini | 3 + mimic3-tts/requirements.txt | 1 + 7 files changed, 190 insertions(+), 2 deletions(-) diff --git a/mimic3-http/mimic3_http/__main__.py b/mimic3-http/mimic3_http/__main__.py index e6919ba..fc36bf0 100644 --- a/mimic3-http/mimic3_http/__main__.py +++ b/mimic3-http/mimic3_http/__main__.py @@ -33,9 +33,15 @@ args = get_args() if args.debug: logging.basicConfig(level=logging.DEBUG) + + # Override epitran + logging.getLogger().setLevel(logging.DEBUG) else: logging.basicConfig(level=logging.INFO) + # Override epitran + logging.getLogger().setLevel(logging.INFO) + _LOGGER.debug(args) diff --git a/mimic3-tts/README.md b/mimic3-tts/README.md index 226d523..2dc0bab 100644 --- a/mimic3-tts/README.md +++ b/mimic3-tts/README.md @@ -299,6 +299,13 @@ Voices whose "phonemes" are characters from an alphabet, typically with some pun For voices whose orthography (writing system) is close enough to its spoken form, character-based voices allow for skipping the phonemization step. However, these voices do not support text normalization, so numbers, dates, etc. must be written out. +### Epitran-based Voices + +Voices that use [epitran](https://github.com/dmort27/epitran/) for phonemization. + +epitran uses rules to generate phonetic pronunciations from text. It does not support text normalization, however, so numbers, dates, etc. must be written out. + + ## License See [license file](LICENSE) diff --git a/mimic3-tts/mimic3_tts/config.py b/mimic3-tts/mimic3_tts/config.py index b2c2d5b..72dc11a 100644 --- a/mimic3-tts/mimic3_tts/config.py +++ b/mimic3-tts/mimic3_tts/config.py @@ -196,6 +196,7 @@ class Phonemizer(str, Enum): SYMBOLS = "symbols" GRUUT = "gruut" ESPEAK = "espeak" + EPITRAN = "epitran" class Aligner(str, Enum): diff --git a/mimic3-tts/mimic3_tts/voice.py b/mimic3-tts/mimic3_tts/voice.py index d24db86..d0f8c1d 100644 --- a/mimic3-tts/mimic3_tts/voice.py +++ b/mimic3-tts/mimic3_tts/voice.py @@ -23,6 +23,7 @@ from enum import Enum from pathlib import Path from xml.sax.saxutils import escape as xmlescape +import epitran import espeak_phonemizer import gruut import numpy as np @@ -310,6 +311,7 @@ class Mimic3Voice(metaclass=ABCMeta): phoneme_map=phoneme_map, speaker_map=speaker_map, ) + if config.phonemizer == Phonemizer.SYMBOLS: # Phonemes are characters from an alphabet return SymbolsVoice( @@ -320,6 +322,16 @@ class Mimic3Voice(metaclass=ABCMeta): speaker_map=speaker_map, ) + if config.phonemizer == Phonemizer.EPITRAN: + # Phonemes are from epitran: https://github.com/dmort27/epitran/ + return EpitranVoice( + config=config, + onnx_model=onnx_model, + phoneme_to_id=phoneme_to_id, + phoneme_map=phoneme_map, + speaker_map=speaker_map, + ) + raise ValueError(f"Unsupported phonemizer: {config.phonemizer}") @@ -525,3 +537,51 @@ class SymbolsVoice(Mimic3Voice): list(IPA.graphemes(wp_str)) for wp_str in text.split(word_separator) ] yield word_phonemes, BreakType.NONE + + +# ----------------------------------------------------------------------------- + + +class EpitranVoice(Mimic3Voice): + """Voice whose phonemes come from epitran (https://github.com/dmort27/epitran/)""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._epis: typing.Dict[str, epitran.Epitran] = {} + + def text_to_phonemes( + self, text: str, text_language: typing.Optional[str] = None + ) -> TEXT_TO_PHONEMES_TYPE: + text_language = text_language or self.config.text_language or DEFAULT_LANGUAGE + + epi = self._epis.get(text_language) + if epi is None: + epi = epitran.Epitran(text_language) + self._epis[text_language] = epi + + phoneme_str = epi.transliterate(text) + all_word_phonemes = [ + list(IPA.graphemes(wp_str)) for wp_str in phoneme_str.split() + ] + + minor_break = self.config.phonemes.minor_break + major_break = self.config.phonemes.major_break + + if minor_break or major_break: + # Split on breaks + sent_phonemes = [] + for word_phonemes in all_word_phonemes: + sent_phonemes.append(word_phonemes) + + if minor_break and (word_phonemes[-1] == minor_break): + yield sent_phonemes, BreakType.MINOR + sent_phonemes = [] + elif major_break and (word_phonemes[-1] == major_break): + yield sent_phonemes, BreakType.MAJOR + sent_phonemes = [] + + if sent_phonemes: + yield sent_phonemes, BreakType.MAJOR + else: + # No split + yield all_word_phonemes, BreakType.UTTERANCE diff --git a/mimic3-tts/mimic3_tts/voices.json b/mimic3-tts/mimic3_tts/voices.json index a4fe065..9ba27ed 100644 --- a/mimic3-tts/mimic3_tts/voices.json +++ b/mimic3-tts/mimic3_tts/voices.json @@ -2,8 +2,8 @@ "de_DE/thorsten_low": { "files": { "LICENSE": { - "size_bytes": 6557, - "sha256_sum": "434e11b12f4a3f3096032bc35c5189afe7827b726212b2406a28189598d9c4cf" + "size_bytes": 6555, + "sha256_sum": "36ffd9dc085d529a7e60e1276d73ae5a030b020313e6c5408593a6ae2af39673" }, "README.md": { "size_bytes": 193, @@ -73,6 +73,10 @@ "size_bytes": 155, "sha256_sum": "2e6c39454c35910c6b48518523429ec2e4f27ae8eb763e311877a017db23d4da" }, + "SOURCE": { + "size_bytes": 18, + "sha256_sum": "8197ffe96f3b6772797357e007d63cde409573a0bd3fe174489e01a5faa95553" + }, "config.json": { "size_bytes": 3434, "sha256_sum": "1fdaa1124e02cc177eb776fbc6e08c838b56bd2e86c82d8d7fe434d9337806b0" @@ -404,6 +408,54 @@ "speakers": [], "properties": {} }, + "fr_FR/m-ailabs_low": { + "files": { + "LICENSE": { + "size_bytes": 1372, + "sha256_sum": "fdd78a909fb9384d869363522b967557bc9e28e5b65874921f24e48cbb82f38c" + }, + "README.md": { + "size_bytes": 192, + "sha256_sum": "cea70c7d84bd5f85607efd4545825a36f6117106e2deb8b00cdb923f550670e8" + }, + "SOURCE": { + "size_bytes": 61, + "sha256_sum": "841520f6a8cc616e307a92552355691f8c3087fadda2e9b7a03a7863b2d0cf6a" + }, + "config.json": { + "size_bytes": 3608, + "sha256_sum": "db66a1051ae131d82ebb9257faeab5d1403ddcf33342eb84ac8e3fe4cf9c7dc9" + }, + "generator.onnx": { + "size_bytes": 76330079, + "sha256_sum": "cf1de519bf1d02e6d8d2685f64192783b9ca6d7a8bd101b03e20f5c5ceee28bb" + }, + "phoneme_map.txt": { + "size_bytes": 15, + "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d" + }, + "phonemes.txt": { + "size_bytes": 232, + "sha256_sum": "711294d0b5a0ec08ec21ca8a75184e0fee3aba1e1adcf967fe5e1ef96f6c176e" + }, + "speaker_map.csv": { + "size_bytes": 119, + "sha256_sum": "25df8f995e663206cb06a203a084051552557108d2e165f42476787cd6e1ee5e" + }, + "speakers.txt": { + "size_bytes": 64, + "sha256_sum": "01e93046b04e95815d5397ee759bba055e80a5e6bdcabc117ff801c1da841d20" + } + }, + "speakers": [ + "ezwa", + "nadine_eckert_boulet", + "bernard", + "zeckou", + "gilles_g_le_blanc" + ], + "properties": {} + }, "fr_FR/siwis_low": { "files": { "LICENSE": { @@ -602,6 +654,14 @@ }, "ru_RU/multi_low": { "files": { + "README.md": { + "size_bytes": 266, + "sha256_sum": "f2a1a1aecc439fefb59879481dc659eb532d7cbf4e417829f68543e5379cabdc" + }, + "SOURCE": { + "size_bytes": 132, + "sha256_sum": "9c3b3fd5fe49e2e1d4e5daa5c643a713da955415d06fccf50add1185301f351a" + }, "config.json": { "size_bytes": 3923, "sha256_sum": "314e0fdd09183942d2f7393d4b950a12823849c0f72d22e62dc9858a6b4886c6" @@ -618,6 +678,14 @@ "size_bytes": 76335199, "sha256_sum": "cb84b12479fc619943cb8fbb56827f7fd95f5ffcbebf2c220606b3a9750bf2ca" }, + "m-ailabs/LICENSE": { + "size_bytes": 1372, + "sha256_sum": "fdd78a909fb9384d869363522b967557bc9e28e5b65874921f24e48cbb82f38c" + }, + "m-ailabs/SOURCE": { + "size_bytes": 61, + "sha256_sum": "841520f6a8cc616e307a92552355691f8c3087fadda2e9b7a03a7863b2d0cf6a" + }, "phoneme_map.txt": { "size_bytes": 15, "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d" @@ -668,6 +736,48 @@ "speakers": [], "properties": {} }, + "te_IN/cmu-indic_low": { + "files": { + "LICENSE": { + "size_bytes": 960, + "sha256_sum": "244ff21a910baf28bcb27b1975620a79d2be8611815ecc599f08eb06dd6f000e" + }, + "README.md": { + "size_bytes": 162, + "sha256_sum": "19ee27d0ac6dbcc337a39ff442a13cd04302594e648877c82ed6a01f43400f3f" + }, + "SOURCE": { + "size_bytes": 30, + "sha256_sum": "d79737ae72d64666485e1899ec32badf1f1043be2619463139d502d5f88f4167" + }, + "config.json": { + "size_bytes": 3647, + "sha256_sum": "aaaee2aa729ccdf98776690187bce96e63a835d91b4683ecf2951e8ad32fc485" + }, + "generator.onnx": { + "size_bytes": 76331359, + "sha256_sum": "c8abcadc961fba7369f5fff4672eba194a105ecaf044ca4e3fd0a5b3c81c11b6" + }, + "phonemes.txt": { + "size_bytes": 282, + "sha256_sum": "ec8b51eb56fde1a81c7eb4442cb7cf7604596501b63093b441c04fbc943c895c" + }, + "speaker_map.csv": { + "size_bytes": 49, + "sha256_sum": "0ce2ff1aa2d78aed066e36b204df647ac9c565b4bd7bdb3bb657e334d7a0e4ab" + }, + "speakers.txt": { + "size_bytes": 13, + "sha256_sum": "7d9c38da91ac4289c23f20d7cb0b6efb69fc8ca4807d62f0a5d7f0a4e0cf77e4" + } + }, + "speakers": [ + "ss", + "sk", + "kpn" + ], + "properties": {} + }, "uk_UK/m-ailabs_low": { "files": { "LICENSE": { diff --git a/mimic3-tts/mypy.ini b/mimic3-tts/mypy.ini index bbe7a0e..51b8cb3 100644 --- a/mimic3-tts/mypy.ini +++ b/mimic3-tts/mypy.ini @@ -3,6 +3,9 @@ [mypy-setuptools.*] ignore_missing_imports = True +[mypy-epitran.*] +ignore_missing_imports = True + [mypy-onnxruntime.*] ignore_missing_imports = True diff --git a/mimic3-tts/requirements.txt b/mimic3-tts/requirements.txt index 14f2ee1..7f6210b 100644 --- a/mimic3-tts/requirements.txt +++ b/mimic3-tts/requirements.txt @@ -1,4 +1,5 @@ dataclasses-json<1.0 +epitran==1.17 espeak-phonemizer>=1.0,<2.0 gruut>=2.3.0,<3.0 numpy<2.0