Add phoneme counts

This commit is contained in:
Michael Hansen 2021-08-05 16:40:47 -04:00
commit 297921d683
3 changed files with 92 additions and 3 deletions

View file

@ -15,7 +15,7 @@ class Phonemizer:
espeakCHARS_AUTO = 0
espeakPHONEMES = 0x100
LANG_SWITCH_FLAG = re.compile(r"\([^)]+\)")
LANG_SWITCH_FLAG = re.compile(r"\([^)]*\)")
DEFAULT_CLAUSE_BREAKERS = {",", ";", ":", ".", "!", "?"}

View file

@ -1,8 +1,11 @@
import argparse
import csv
import itertools
import logging
import os
import sys
import typing
from collections import Counter
_LOGGER = logging.getLogger("phoneme_ids")
@ -62,10 +65,26 @@ def main():
action="store_true",
help="Pull primary/secondary stress out as separate phonemes",
)
parser.add_argument(
"--write-phoneme-counts", help="Path to write phoneme counts observed in input"
)
parser.add_argument(
"-m",
"--map",
nargs=2,
action="append",
help="Map from observed phoneme to desired phonemes",
)
args = parser.parse_args()
logging.basicConfig(level=logging.INFO)
# Map from observed phonemes to desired
phoneme_map = {}
if args.map:
for from_phoneme, to_phoneme in args.map:
phoneme_map[from_phoneme] = to_phoneme
phoneme_to_id = {}
if args.read_phonemes:
@ -124,6 +143,7 @@ def main():
# Read all input and get set of phonemes
all_phonemes = set(phoneme_to_id.keys())
all_phoneme_counts = Counter()
if args.simple_punctuation:
# Add , and .
@ -163,7 +183,10 @@ def main():
if args.simple_punctuation:
phoneme = _PUNCTUATION_MAP.get(phoneme, phoneme)
phoneme = phoneme_map.get(phoneme, phoneme)
all_phonemes.add(phoneme)
all_phoneme_counts[phoneme] += 1
# Assign phonemes to ids in sorted order
for phoneme in sorted(all_phonemes):
@ -191,7 +214,7 @@ def main():
if args.separate_stress:
# Split stress out
while phoneme and (phoneme[0] in _STRESS):
stress = phoneme[0]
stress = phoneme_map.get(phoneme[0], phoneme[0])
word_ids.append(phoneme_to_id[stress])
phoneme = phoneme[1:]
@ -199,6 +222,7 @@ def main():
if args.simple_punctuation:
phoneme = _PUNCTUATION_MAP.get(phoneme, phoneme)
phoneme = phoneme_map.get(phoneme, phoneme)
word_ids.append(phoneme_to_id[phoneme])
if word_ids:
@ -237,6 +261,72 @@ def main():
):
print(phoneme_id, phoneme, file=phonemes_file)
if args.write_phoneme_counts:
# Write file with PHONEME<space>COUNT format
with open(args.write_phoneme_counts, "w") as phoneme_counts_file:
for phoneme, phoneme_count in all_phoneme_counts.most_common():
print(phoneme, phoneme_count, file=phoneme_counts_file)
# -----------------------------------------------------------------------------
def phonemes_to_ids(
    word_phonemes: typing.List[typing.List[str]],
    phoneme_to_id: typing.Mapping[str, int],
    pad: typing.Optional[str] = None,
    bos: typing.Optional[str] = None,
    eos: typing.Optional[str] = None,
    blank: typing.Optional[str] = None,
    simple_punctuation: bool = False,
    separate_stress: bool = False,
    phoneme_map: typing.Optional[typing.Mapping[str, str]] = None,
) -> typing.List[int]:
    """Convert a sentence of per-word phonemes into one flat list of ids.

    Args:
        word_phonemes: One list of phoneme strings per word.
        phoneme_to_id: Lookup table from phoneme (after mapping) to integer id.
        pad: Accepted for interface compatibility; not used by this function.
        bos: Symbol whose id is emitted first, if given and non-empty.
        eos: Symbol whose id is emitted last, if given and non-empty.
        blank: Symbol whose id is interleaved as a separator, if given and
            non-empty.
        simple_punctuation: When true, each phoneme is passed through the
            module-level ``_PUNCTUATION_MAP`` before ``phoneme_map``.
        separate_stress: When true, leading characters found in the
            module-level ``_STRESS`` collection are peeled off a phoneme and
            emitted as ids of their own, ahead of the remaining phoneme.
        phoneme_map: Optional observed-phoneme -> desired-phoneme
            substitutions, applied just before the id lookup.

    Returns:
        The ids of all emitted symbols, in order, as a single list.

    Raises:
        KeyError: If a symbol that has to be emitted is missing from
            ``phoneme_to_id`` (when that mapping raises on missing keys,
            as a ``dict`` does).
    """
    substitutions = {} if phoneme_map is None else phoneme_map
    separator_id: typing.Optional[int] = phoneme_to_id[blank] if blank else None

    flat_ids: typing.List[int] = []

    # NOTE(review): this function was reconstructed from a diff view that had
    # lost its indentation. The blank separator is assumed to follow the BOS
    # symbol and each non-empty word (i.e. to be nested under those checks) --
    # confirm against the repository.
    if bos:
        flat_ids.append(phoneme_to_id[bos])
        if separator_id is not None:
            flat_ids.append(separator_id)

    for word in word_phonemes:
        ids_for_word: typing.List[int] = []

        for symbol in word:
            if separate_stress:
                # Emit every leading stress mark as its own id.
                while symbol and (symbol[0] in _STRESS):
                    mark = symbol[0]
                    ids_for_word.append(
                        phoneme_to_id[substitutions.get(mark, mark)]
                    )
                    symbol = symbol[1:]

            if not symbol:
                # Nothing left (empty phoneme, or it was stress marks only).
                continue

            if simple_punctuation:
                symbol = _PUNCTUATION_MAP.get(symbol, symbol)

            ids_for_word.append(phoneme_to_id[substitutions.get(symbol, symbol)])

        if not ids_for_word:
            # Words that produced no ids contribute nothing to the output.
            continue

        flat_ids.extend(ids_for_word)
        if separator_id is not None:
            flat_ids.append(separator_id)

    if eos:
        flat_ids.append(phoneme_to_id[eos])

    return flat_ids
# -----------------------------------------------------------------------------

View file

@ -30,7 +30,6 @@ setuptools.setup(
url="https://github.com/synesthesiam/espeak-phonemizer",
packages=setuptools.find_packages(),
package_data={"espeak_phonemizer": ["VERSION", "py.typed"]},
install_requires=requirements,
entry_points={
"console_scripts": [
"espeak-phonemizer = espeak_phonemizer.__main__:main",