# Resolve the ``files`` helper used to locate resources bundled with the package.
try:
    from importlib.resources import files
except ImportError:
    # Python < 3.9 has no importlib.resources.files; fall back to the backport
    from importlib_resources import files  # type: ignore

# Name of the package whose bundled resources are exposed here
_PACKAGE = "mimic3_http"

# Directory of the installed package, treated as a regular filesystem path
_package_root = typing.cast(os.PathLike, files(_PACKAGE))
_DIR = Path(_package_root)

# Version string stored in the VERSION file shipped inside the package
_version_path = _DIR / "VERSION"
__version__ = _version_path.read_text(encoding="utf-8").strip()
opentts_abc.ssml import SSMLSpeaker from ._resources import __version__ diff --git a/mimic3-tts/mimic3_tts/__main__.py b/mimic3-tts/mimic3_tts/__main__.py index edf45ec..f6a140c 100644 --- a/mimic3-tts/mimic3_tts/__main__.py +++ b/mimic3-tts/mimic3_tts/__main__.py @@ -1,50 +1,536 @@ #!/usr/bin/env python3 +# Copyright 2022 Mycroft AI Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +import argparse +import csv +import io import logging +import os +import string +import sys +import tempfile +import threading +import time +import typing import wave +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from queue import Queue -from opentts_abc.ssml import SSMLSpeaker +from ._resources import _PACKAGE -from .tts import AudioResult, MarkResult, Mimic3Settings, Mimic3TextToSpeechSystem - -logging.basicConfig(level=logging.DEBUG) - -settings = Mimic3Settings() -tts = Mimic3TextToSpeechSystem(settings) - -speaker = SSMLSpeaker(tts) -# ssml = 'Τοαερόστρωμνόμουείναιγεμάτοχέλια.' -# ssml = 'бажав' -# ssml = 'HelloWorld' -# ssml = 'Hello world' -# ssml = '12' -ssml = """ - - - Today is a test. - This is another test. - +if typing.TYPE_CHECKING: + from . import BaseResult, Mimic3TextToSpeechSystem # noqa: F401 - - - Soy el 1. 
_LOGGER = logging.getLogger(_PACKAGE)

# -----------------------------------------------------------------------------


@dataclass
class ResultToProcess:
    """A synthesis result queued for the output thread."""

    result: "BaseResult"

    # Text that was spoken to produce the result
    line: str

    # Identifier parsed from an "id<delimiter>text" line (empty if ids are unused)
    line_id: str = ""


@dataclass
class CommandLineInterfaceState:
    """Mutable state shared by the command-line stages and the output thread."""

    args: argparse.Namespace
    texts: typing.Optional[typing.Iterable[str]] = None
    mark_writer: typing.Optional[typing.TextIO] = None
    tts: typing.Optional["Mimic3TextToSpeechSystem"] = None
    text_from_stdin: bool = False

    # Audio accumulated across results when it is emitted as one WAV at the end
    all_audio: bytes = field(default_factory=bytes)
    sample_rate_hz: int = 22050
    sample_width_bytes: int = 2
    num_channels: int = 1

    # Queue feeding the output thread; None is the shutdown sentinel
    result_queue: typing.Optional["Queue[typing.Optional[ResultToProcess]]"] = None
    result_thread: typing.Optional[threading.Thread] = None


class OutputNaming(str, Enum):
    """Format used for output file names"""

    TEXT = "text"
    TIME = "time"
    ID = "id"


class StdinFormat(str, Enum):
    """Format of standard input"""

    AUTO = "auto"
    """Choose based on SSML state"""

    LINES = "lines"
    """Each line is a separate sentence/document"""

    DOCUMENT = "document"
    """Entire input is one document"""


# -----------------------------------------------------------------------------


def main():
    """Main entry point"""
    args = get_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    if args.version:
        # Print version and exit
        from . import __version__

        print(__version__)
        sys.exit(0)

    state = CommandLineInterfaceState(args=args)

    try:
        initialize_args(state)
        initialize_tts(state)

        if args.voices:
            # Print voices and exit
            print_voices(state)
        else:
            # Only prompt when text is really being typed into a terminal
            if state.text_from_stdin and sys.stdin.isatty():
                print("Reading text from stdin...", file=sys.stderr)

            process_lines(state)
    finally:
        shutdown_tts(state)
        _close_mark_writer(state)


def _close_mark_writer(state: CommandLineInterfaceState):
    """Close the mark file if one was opened (never closes stdout/stderr)."""
    writer = state.mark_writer
    if (writer is not None) and (writer is not sys.stdout) and (writer is not sys.stderr):
        writer.close()

    state.mark_writer = None


def _combine_until_blank_line(lines: typing.Iterable[str]) -> typing.Iterator[str]:
    """Join consecutive non-blank lines into one text per blank-line-separated group.

    Good for line-wrapped books where sentences are broken up across multiple
    lines. The last group is yielded even if the input does not end with a
    blank line.
    """
    parts: typing.List[str] = []
    for line in lines:
        line = line.strip()
        if line:
            parts.append(line)
        elif parts:
            yield " ".join(parts)
            parts = []

    if parts:
        # Input ended without a trailing blank line
        yield " ".join(parts)


def initialize_args(state: CommandLineInterfaceState):
    """Normalize parsed arguments and set up the text source and mark writer.

    Creates the output directory, opens the mark file (or selects
    stdout/stderr), seeds numpy's random generator when requested, and stores
    the iterable of input texts in ``state.texts``.
    """
    args = state.args

    # Create output directory
    if args.output_dir:
        args.output_dir = Path(args.output_dir)
        args.output_dir.mkdir(parents=True, exist_ok=True)

    # Writer for the names of SSML marks.
    # Each name is printed on a single line.
    if args.mark_file:
        args.mark_file = Path(args.mark_file)
        args.mark_file.parent.mkdir(parents=True, exist_ok=True)

        # Closed by main() once processing has finished
        state.mark_writer = open(  # pylint: disable=consider-using-with
            args.mark_file, "w", encoding="utf-8"
        )
    elif args.stdout:
        # stdout carries audio, so mark names go to stderr
        state.mark_writer = sys.stderr
    else:
        state.mark_writer = sys.stdout

    if args.seed is not None:
        import numpy as np

        _LOGGER.debug("Setting random seed to %s", args.seed)
        np.random.seed(args.seed)

    if args.csv:
        args.output_naming = "id"

    # Read text from stdin or arguments
    if args.text:
        # Use arguments
        state.texts = args.text
    else:
        # Use stdin
        state.text_from_stdin = True

        if args.stdin_format == StdinFormat.AUTO:
            # Assume SSML input is an entire document, plain text is line-based
            stdin_format = StdinFormat.DOCUMENT if args.ssml else StdinFormat.LINES
        else:
            # Honor the format explicitly requested by the user
            stdin_format = StdinFormat(args.stdin_format)

        if stdin_format == StdinFormat.DOCUMENT:
            # One big line
            state.texts = [sys.stdin.read()]
        else:
            # Multiple lines
            state.texts = sys.stdin

    assert state.texts is not None

    if args.process_on_blank_line:
        # Combine text until a blank line is encountered
        state.texts = _combine_until_blank_line(state.texts)


def initialize_tts(state: CommandLineInterfaceState):
    """Create the text to speech system and start the output thread.

    When only listing voices, nothing beyond the TTS object is initialized.
    """
    from mimic3_tts import Mimic3Settings, Mimic3TextToSpeechSystem  # noqa: F811

    args = state.args

    state.tts = Mimic3TextToSpeechSystem(Mimic3Settings())

    if args.voices:
        # Don't bother with the rest of the initialization
        return

    if args.voice:
        # Set default voice
        state.tts.voice = args.voice

    for voice_key in args.preload_voice or []:
        _LOGGER.debug("Preloading voice: %s", voice_key)
        state.tts.preload_voice(voice_key)

    state.result_queue = Queue(maxsize=args.result_queue_size)

    state.result_thread = threading.Thread(
        target=process_result, daemon=True, args=(state,)
    )
    state.result_thread.start()


def _output_file_name(args: argparse.Namespace, line: str, line_id: str) -> str:
    """Return the WAV file name (without extension) for a line, or "" if none."""
    naming = args.output_naming

    if naming == OutputNaming.TEXT:
        # Use text itself, with spaces as underscores and punctuation removed
        file_name = line.strip().replace(" ", "_")
        return file_name.translate(
            str.maketrans("", "", string.punctuation.replace("_", ""))
        )

    if naming == OutputNaming.TIME:
        # Use timestamp
        return str(time.time())

    if naming == OutputNaming.ID:
        return line_id

    return ""


def _handle_audio_result(state: CommandLineInterfaceState, result_todo: ResultToProcess):
    """Play, write, or accumulate the audio of a single result."""
    args = state.args
    result = result_todo.result
    line = result_todo.line

    if not (args.interactive or args.output_dir):
        # Combine all audio and output it at the end
        state.all_audio += result.audio_bytes
        state.sample_rate_hz = result.sample_rate_hz
        state.sample_width_bytes = result.sample_width_bytes
        state.num_channels = result.num_channels
        return

    # WAV conversion is done at most once and shared by playback and file output
    wav_bytes: typing.Optional[bytes] = None

    if args.interactive:
        if args.stdout:
            # Write audio to stdout
            sys.stdout.buffer.write(result.audio_bytes)
            sys.stdout.buffer.flush()
        else:
            # Play sound
            wav_bytes = result.to_wav_bytes()
            if wav_bytes:
                play_wav_bytes(wav_bytes)

    if args.output_dir:
        if not wav_bytes:
            wav_bytes = result.to_wav_bytes()

        file_name = _output_file_name(args, line, result_todo.line_id)
        if not file_name:
            raise ValueError(f"No file name for text: {line}")

        wav_path = args.output_dir / (file_name + ".wav")
        wav_path.write_bytes(wav_bytes)

        _LOGGER.debug("Wrote %s", wav_path)


def process_result(state: CommandLineInterfaceState):
    """Output thread: consume queued results until the ``None`` sentinel arrives.

    Audio results are played, written, or accumulated; mark results have their
    name printed to the mark writer. A failure on one result is logged and
    does not stop the thread.
    """
    try:
        from mimic3_tts import AudioResult, MarkResult

        assert state.result_queue is not None

        while True:
            result_todo = state.result_queue.get()
            if result_todo is None:
                break

            try:
                result = result_todo.result

                if isinstance(result, AudioResult):
                    _handle_audio_result(state, result_todo)
                elif isinstance(result, MarkResult):
                    if state.mark_writer:
                        print(result.name, file=state.mark_writer)
            except Exception:
                # Keep the thread alive for the remaining results
                _LOGGER.exception("Error processing result")
    except Exception:
        _LOGGER.exception("process_result")


def process_line(
    line: str,
    state: CommandLineInterfaceState,
    line_id: str = "",
):
    """Synthesize one line (plain text or SSML) and queue its results."""
    from mimic3_tts import SSMLSpeaker

    assert state.tts is not None
    assert state.result_queue is not None

    args = state.args

    if args.ssml:
        results = SSMLSpeaker(state.tts).speak(line)
    else:
        state.tts.begin_utterance()

        # TODO: text language
        state.tts.speak_text(line)

        results = state.tts.end_utterance()

    for result in results:
        state.result_queue.put(
            ResultToProcess(
                result=result,
                line=line,
                line_id=line_id,
            )
        )


def _write_wav(state: CommandLineInterfaceState, wav_target: typing.Any):
    """Write the accumulated audio as a single WAV to a path or binary file object."""
    wav_file: wave.Wave_write = wave.open(wav_target, "wb")
    with wav_file:
        wav_file.setframerate(state.sample_rate_hz)
        wav_file.setsampwidth(state.sample_width_bytes)
        wav_file.setnchannels(state.num_channels)

        # Written in one call so the header length is known up front
        wav_file.writeframes(state.all_audio)


def process_lines(state: CommandLineInterfaceState):
    """Synthesize every input text, then wait for output and emit combined audio.

    Blank lines are skipped. With id-based output naming each line must have
    the form ``id<delimiter>text``; lines without the delimiter are logged and
    skipped. Ctrl-C discards pending results instead of outputting them.
    """
    assert state.texts is not None

    args = state.args

    try:
        for line in state.texts:
            line_id = ""
            line = line.strip()
            if not line:
                continue

            if args.output_naming == OutputNaming.ID:
                # Line has the format id|text instead of just text
                line_id, delimiter, text = line.partition(args.id_delimiter)
                if not delimiter:
                    _LOGGER.error(
                        "Skipping line without id delimiter %r: %s",
                        args.id_delimiter,
                        line,
                    )
                    continue

                line = text

            process_line(line, state, line_id=line_id)

    except KeyboardInterrupt:
        if state.result_queue is not None:
            from queue import Empty

            # Drain audio playback queue.
            # get_nowait avoids blocking if the output thread takes the last item.
            try:
                while True:
                    state.result_queue.get_nowait()
            except Empty:
                pass
    finally:
        # Tell the output thread to stop and wait for it to finish
        if state.result_queue is not None:
            state.result_queue.put(None)

        if state.result_thread is not None:
            print("Waiting for audio to finish...", file=sys.stderr)
            state.result_thread.join()

    # -------------------------------------------------------------------------

    # Write combined audio to stdout
    if state.all_audio:
        _LOGGER.debug("Writing WAV audio to stdout")

        if sys.stdout.isatty() and (not state.args.stdout):
            # Don't dump binary data on a terminal: play the audio instead
            with io.BytesIO() as wav_io:
                _write_wav(state, wav_io)
                play_wav_bytes(wav_io.getvalue())
        else:
            # Write output directly to stdout
            _write_wav(state, sys.stdout.buffer)
            sys.stdout.buffer.flush()


def shutdown_tts(state: CommandLineInterfaceState):
    """Shut down the text to speech system if it was created."""
    if state.tts is not None:
        state.tts.shutdown()
        state.tts = None


def play_wav_bytes(wav_bytes: bytes):
    """Play WAV audio by writing it to a temporary file for playsound."""
    from playsound import playsound

    with tempfile.NamedTemporaryFile(mode="wb+", suffix=".wav") as wav_file:
        wav_file.write(wav_bytes)

        # Seeking flushes buffered data so the file is complete when opened by name
        wav_file.seek(0)

        _LOGGER.debug("Playing WAV file: %s", wav_file.name)
        playsound(wav_file.name)


def print_voices(state: CommandLineInterfaceState):
    """Print available voices to stdout as tab-separated rows sorted by key."""
    assert state.tts is not None

    voices = sorted(state.tts.get_voices(), key=lambda v: v.key)

    writer = csv.writer(sys.stdout, delimiter="\t")
    writer.writerow(("KEY", "LANGUAGE", "NAME", "DESCRIPTION", "LOCATION"))
    for voice in voices:
        writer.writerow(
            (voice.key, voice.language, voice.name, voice.description, voice.location)
        )


# -----------------------------------------------------------------------------


def get_args():
    """Parse command-line arguments"""
    parser = argparse.ArgumentParser(prog=_PACKAGE)
    # parser.add_argument(
    #     "--language", help="Gruut language for text input (en-us, etc.)"
    # )
    parser.add_argument(
        "text", nargs="*", help="Text to convert to speech (default: stdin)"
    )
    parser.add_argument(
        "--stdin-format",
        choices=[str(v.value) for v in StdinFormat],
        default=StdinFormat.AUTO,
        help="Format of stdin text (default: auto)",
    )
    parser.add_argument(
        "--voice",
        "-v",
        help="Name of voice (expected in /)",
    )
    # parser.add_argument(
    #     "--voices-dir",
    #     help="Directory with voices (format is /)",
    # )
    parser.add_argument("--voices", action="store_true", help="List available voices")
    parser.add_argument("--output-dir", help="Directory to write WAV file(s)")
    parser.add_argument(
        "--output-naming",
        choices=[v.value for v in OutputNaming],
        default="text",
        help="Naming scheme for output WAV files (requires --output-dir)",
    )
    parser.add_argument(
        "--id-delimiter",
        default="|",
        help="Delimiter between id and text in lines (default: |). Requires --output-naming id",
    )
    parser.add_argument(
        "--interactive",
        action="store_true",
        help="Play audio after each input line (see --play-command)",
    )
    parser.add_argument("--csv", action="store_true", help="Input format is id|text")
    parser.add_argument(
        "--mark-file",
        help="File to write mark names to as they're encountered (--ssml only)",
    )

    parser.add_argument(
        "--noise-scale",
        type=float,
        help="Noise scale [0-1], default is 0.667",
    )
    parser.add_argument(
        "--length-scale",
        type=float,
        help="Length scale (1.0 is default speed, 0.5 is 2x faster)",
    )
    parser.add_argument(
        "--noise-w",
        type=float,
        help="Variation in cadence [0-1], default is 0.8",
    )

    # Miscellaneous
    parser.add_argument(
        "--result-queue-size",
        # Queue compares maxsize with integers, so the value must not stay a string
        type=int,
        default=5,
        help="Maximum number of sentences to maintain in output queue (default: 5)",
    )
    parser.add_argument(
        "--process-on-blank-line",
        action="store_true",
        help="Process text only after encountering a blank line",
    )
    parser.add_argument("--ssml", action="store_true", help="Input text is SSML")
    # parser.add_argument(
    #     "--optimizations",
    #     choices=["auto", "on", "off"],
    #     default="auto",
    #     help="Enable/disable Onnx optimizations (auto=disable on armv7l)",
    # )

    parser.add_argument(
        "--stdout",
        action="store_true",
        help="Force audio output to stdout even if a tty is detected",
    )
    parser.add_argument(
        "--preload-voice", action="append", help="Preload voice when starting up"
    )
    parser.add_argument("--seed", type=int, help="Set random seed (default: not set)")
    parser.add_argument("--version", action="store_true", help="Print version and exit")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )

    return parser.parse_args()


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
_LOGGER = logging.getLogger(__name__)

# -----------------------------------------------------------------------------


class VoiceDownloadError(Exception):
    """Occurs when a voice fails to download"""


def _first_subdir(parent: Path) -> typing.Optional[Path]:
    """Return the first sub-directory of parent in name order, or None."""
    return next((path for path in sorted(parent.iterdir()) if path.is_dir()), None)


def download_voice(voices_dir: typing.Union[str, Path], link: str) -> Path:
    """Download and extract a voice (or vocoder)

    The archive at ``link`` must contain ``<language>/<voice_name>/``. That
    voice directory is installed to ``voices_dir/<language>/<voice_name>``,
    replacing any existing directory of the same name.

    Args:
        voices_dir: Directory where voices are installed (created if missing).
        link: URL of the voice archive.

    Returns:
        Path to the installed voice directory.

    Raises:
        VoiceDownloadError: If the server returns an HTTP error or the archive
            does not have the expected layout.
    """
    import tarfile

    try:
        from tqdm.auto import tqdm as make_progress_bar
    except ImportError:
        # The progress bar is cosmetic; download without it
        make_progress_bar = None  # type: ignore

    voice_name = link.split("/")[-1]
    voices_dir = Path(voices_dir)
    voices_dir.mkdir(parents=True, exist_ok=True)

    _LOGGER.debug("Downloading voice to %s from %s", voices_dir, link)

    try:
        # The archive and its extracted files live in one scratch directory.
        # The archive is closed before unpacking, so it can be reopened by name
        # on every platform.
        with tempfile.TemporaryDirectory() as temp_dir_str:
            temp_dir = Path(temp_dir_str)
            archive_path = temp_dir / "voice.tar.gz"
            extract_dir = temp_dir / "extracted"
            extract_dir.mkdir()

            with urllib.request.urlopen(link) as response, open(
                archive_path, "wb"
            ) as archive_file:
                progress = None
                if make_progress_bar is not None:
                    progress = make_progress_bar(
                        unit="B",
                        unit_scale=True,
                        unit_divisor=1024,
                        miniters=1,
                        desc=voice_name,
                        total=int(response.headers.get("content-length", 0)),
                    )

                try:
                    chunk = response.read(4096)
                    while chunk:
                        archive_file.write(chunk)
                        if progress is not None:
                            progress.update(len(chunk))

                        chunk = response.read(4096)
                finally:
                    if progress is not None:
                        progress.close()

            # Extract
            _LOGGER.debug("Extracting %s to %s", archive_path, extract_dir)
            unpack_kwargs: typing.Dict[str, typing.Any] = {}
            if hasattr(tarfile, "data_filter"):
                # The archive is untrusted: refuse members that would be
                # written outside of the extraction directory
                unpack_kwargs["filter"] = "data"

            shutil.unpack_archive(str(archive_path), str(extract_dir), **unpack_kwargs)

            # Expecting <language>/<voice_name>
            lang_dir = _first_subdir(extract_dir)
            voice_dir = _first_subdir(lang_dir) if lang_dir is not None else None
            if (lang_dir is None) or (voice_dir is None):
                raise VoiceDownloadError(
                    f"Unexpected archive layout for voice {voice_name} from {link}"
                )

            # Copy to destination
            dest_lang_dir = voices_dir / lang_dir.name
            dest_lang_dir.mkdir(parents=True, exist_ok=True)

            dest_voice_dir = dest_lang_dir / voice_dir.name
            if dest_voice_dir.is_dir():
                # Delete existing files
                shutil.rmtree(str(dest_voice_dir))

            # Move files
            _LOGGER.debug("Moving %s to %s", voice_dir, dest_voice_dir)
            shutil.move(str(voice_dir), str(dest_voice_dir))

            _LOGGER.info("Installed %s to %s", link, dest_voice_dir)

            return dest_voice_dir
    except HTTPError as e:
        _LOGGER.exception("download_voice")
        raise VoiceDownloadError(
            f"Failed to download voice {voice_name} from {link}: {e}"
        ) from e


# -----------------------------------------------------------------------------


def main():
    """Main entry point

    Without --url or --name, prints the known voices as JSON and exits.
    Otherwise downloads every requested voice into the output directory and
    exits with status 1 if a voice name is unknown or a download fails.
    """
    default_voices_dir = Path(XDG().XDG_DATA_HOME) / "mimic3"

    parser = argparse.ArgumentParser(prog=f"{_PACKAGE}.download")
    parser.add_argument("--url", action="append", help="URL of voice to download")
    parser.add_argument(
        "--name",
        action="append",
        help="Name of voice to download (e.g., en_US/vctk_low)",
    )
    parser.add_argument(
        "--output-dir",
        default=default_voices_dir,
        help=f"Path to output directory (default: {default_voices_dir})",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    _LOGGER.debug(args)

    args.output_dir = Path(args.output_dir)
    args.url = args.url or []
    args.name = args.name or []

    with open(_DIR / "voices.json", "r", encoding="utf-8") as voices_file:
        voices_by_name = json.load(voices_file)

    if (not args.url) and (not args.name):
        # Print available voices and exit
        json.dump(voices_by_name, sys.stdout, indent=4, ensure_ascii=False)
        sys.exit(0)

    # Copy so the parsed arguments are not modified below
    urls_to_download = list(args.url)

    # Gather URLs for voices by name
    for voice_name in args.name:
        voice_info = voices_by_name.get(voice_name)
        if not voice_info:
            _LOGGER.critical("Voice not found: %s", voice_name)
            sys.exit(1)

        urls_to_download.append(voice_info["url"])

    args.output_dir.mkdir(parents=True, exist_ok=True)

    for url in urls_to_download:
        try:
            download_voice(args.output_dir, url)
        except VoiceDownloadError as e:
            _LOGGER.critical("%s", e)
            sys.exit(1)


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
# extra name => [requirements]
# These entries are passed directly to extras_require in setup(), so each key
# must be the name of the extra and each value the packages that it installs.
extras = {}

# Create language-specific extras (e.g., mimic3_tts[de] installs gruut[de])
for lang in [
    "de",
    "es",
    "fr",
    "it",
    "nl",
    "pt",
    "ru",
    "sv",
    "sw",
]:
    extras[lang] = [f"gruut[{lang}]"]
["importlib_resources"]}, + extras_require={':python_version<"3.9"': ["importlib_resources"], **extras}, + entry_points={"console_scripts": ["mimic3 = mimic3_cli.__main__:main"]}, classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", diff --git a/opentts-abc/.gitignore b/opentts-abc/.gitignore index 82f869f..f4edcec 100644 --- a/opentts-abc/.gitignore +++ b/opentts-abc/.gitignore @@ -12,3 +12,5 @@ htmlcov __pycache__/ .mypy_cache/ *.egg-info/ + +flycheck_*.py diff --git a/opentts-abc/opentts_abc/__init__.py b/opentts-abc/opentts_abc/__init__.py index d18dcfd..aeb846d 100644 --- a/opentts-abc/opentts_abc/__init__.py +++ b/opentts-abc/opentts_abc/__init__.py @@ -156,6 +156,9 @@ class Voice: description: str """Human-readable description of the voice""" + location: str + """File path or URI where the voice exists""" + speakers: typing.Optional[typing.Sequence[str]] = None """List of speakers within the voice model if multi-speaker"""