Add ability to download voices

2022-03-29 16:18:50 -04:00 · 2022-03-29 16:18:50 -04:00 · dd04ebd6f8
commit dd04ebd6f8
parent 9277373c20
8 changed files with 950 additions and 119 deletions
--- a/2
+++ b/2
@ -67,7 +67,7 @@ RUN --mount=type=cache,id=apt-run,target=/var/cache/apt \
    mkdir -p /var/cache/apt/${TARGETARCH}${TARGETVARIANT}/archives/partial && \
    apt-get update && \
    apt-get install --yes --no-install-recommends \
-        python3 ca-certificates
+        python3 ca-certificates libespeak-ng1

 RUN useradd -ms /bin/bash mimic3

--- a/mimic3-tts/download.sh
+++ b/mimic3-tts/download.sh
@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+# Directory of *this* script
+this_dir="$( cd "$( dirname "$0" )" && pwd )"
+
+# Kebab to snake case
+module_name="$(basename "${this_dir}" | sed -e 's/-/_/g')"
+src_dir="${this_dir}/${module_name}"
+
+# Path to virtual environment
+: "${venv:=${this_dir}/.venv}"
+
+if [ -d "${venv}" ]; then
+    # Activate virtual environment if available
+    source "${venv}/bin/activate"
+fi
+
+export PYTHONPATH="${this_dir}"
+python3 -m "${module_name}.download" "$@"
--- a/mimic3-tts/mimic3_tts/main.py
+++ b/mimic3-tts/mimic3_tts/main.py
@ -126,6 +126,7 @@ def main():


 def initialize_args(state: CommandLineInterfaceState):
+    """Initialize CLI state from command-line arguments"""
    import numpy as np

    args = state.args
@ -201,11 +202,14 @@ def initialize_args(state: CommandLineInterfaceState):


 def initialize_tts(state: CommandLineInterfaceState):
+    """Create Mimic 3 TTS from command-line arguments"""
    from mimic3_tts import Mimic3Settings, Mimic3TextToSpeechSystem  # noqa: F811

    args = state.args

-    state.tts = Mimic3TextToSpeechSystem(Mimic3Settings())
+    state.tts = Mimic3TextToSpeechSystem(
+        Mimic3Settings(voices_directories=args.voices_dir, speaker=args.speaker)
+    )

    if args.voices:
        # Don't bother with the rest of the initialization
@ -433,9 +437,6 @@ def print_voices(state: CommandLineInterfaceState):
 def get_args():
    """Parse command-line arguments"""
    parser = argparse.ArgumentParser(prog=_PACKAGE)
-    # parser.add_argument(
-    #     "--language", help="Gruut language for text input (en-us, etc.)"
-    # )
    parser.add_argument(
        "text", nargs="*", help="Text to convert to speech (default: stdin)"
    )
@ -450,10 +451,16 @@ def get_args():
        "-v",
        help="Name of voice (expected in <voices-dir>/<language>)",
    )
-    # parser.add_argument(
-    #     "--voices-dir",
-    #     help="Directory with voices (format is <language>/<name_model-type>)",
-    # )
+    parser.add_argument(
+        "--speaker",
+        "-s",
+        help="Name or number of speaker (default: first speaker)",
+    )
+    parser.add_argument(
+        "--voices-dir",
+        action="append",
+        help="Directory with voices (format is <language>/<voice_name>)",
+    )
    parser.add_argument("--voices", action="store_true", help="List available voices")
    parser.add_argument("--output-dir", help="Directory to write WAV file(s)")
    parser.add_argument(
@ -506,13 +513,6 @@ def get_args():
        help="Process text only after encountering a blank line",
    )
    parser.add_argument("--ssml", action="store_true", help="Input text is SSML")
-    # parser.add_argument(
-    #     "--optimizations",
-    #     choices=["auto", "on", "off"],
-    #     default="auto",
-    #     help="Enable/disable Onnx optimizations (auto=disable on armv7l)",
-    # )
-
    parser.add_argument(
        "--stdout",
        action="store_true",
--- a/mimic3-tts/mimic3_tts/_resources.py
+++ b/mimic3-tts/mimic3_tts/_resources.py
@ -14,6 +14,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 """Shared access to package resources"""
+import json
 import os
 import typing
 from pathlib import Path
@ -32,3 +33,19 @@ _PACKAGE = "mimic3_tts"
 _DIR = Path(typing.cast(os.PathLike, files(_PACKAGE)))

 __version__ = (_DIR / "VERSION").read_text(encoding="utf-8").strip()
+
+# Load voices.json
+# {
+#   "<lang>/<voice>": {
+#     "files": {
+#       "relative/path": {
+#         "size_bytes": size in bytes,
+#         "sha256_sum": sha256 hash
+#       }
+#     },
+#     "speakers": [],
+#     "properties": {}
+#   }
+# }
+with open(_DIR / "voices.json", "r", encoding="utf-8") as voices_file:
+    _VOICES = json.load(voices_file)
--- a/mimic3-tts/mimic3_tts/const.py
+++ b/mimic3-tts/mimic3_tts/const.py
@ -0,0 +1,23 @@
+# Copyright 2022 Mycroft AI Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+from pathlib import Path
+
+from xdgenvpy import XDG
+
+DEFAULT_VOICE = "en_US/vctk_low"
+DEFAULT_LANGUAGE = "en_US"
+DEFAULT_VOICES_URL_FORMAT = "https://github.com/MycroftAI/mimic3-voices/raw/master/{lang}/{name}"
+DEFAULT_VOICES_DOWNLOAD_DIR = Path(XDG().XDG_DATA_HOME) / "mimic3"
--- a/mimic3-tts/mimic3_tts/download.py
+++ b/mimic3-tts/mimic3_tts/download.py
@ -16,17 +16,15 @@
 import argparse
 import json
 import logging
-import shutil
 import sys
-import tempfile
 import typing
 import urllib.request
+from dataclasses import dataclass
 from pathlib import Path
 from urllib.error import HTTPError

-from xdgenvpy import XDG
-
-from ._resources import _DIR, _PACKAGE
+from ._resources import _PACKAGE, _VOICES
+from .const import DEFAULT_VOICES_DOWNLOAD_DIR, DEFAULT_VOICES_URL_FORMAT

 _LOGGER = logging.getLogger(__name__)

@ -37,69 +35,61 @@ class VoiceDownloadError(Exception):
    """Occurs when a voice fails to download"""


-def download_voice(voices_dir: typing.Union[str, Path], link: str) -> Path:
-    """Download and extract a voice (or vocoder)"""
+@dataclass
+class VoiceFile:
+    """File associated with a voice to download"""
+
+    relative_path: str
+    size_bytes: typing.Optional[int] = None
+    sha256_sum: typing.Optional[str] = None
+
+
+def download_voice(
+    voice_key: str,
+    url_base: str,
+    voice_files: typing.Iterable[VoiceFile],
+    voices_dir: typing.Union[str, Path],
+    chunk_bytes: int = 4096,
+):
+    """Downloads a voice to a directory"""
    from tqdm.auto import tqdm

-    voice_name = link.split("/")[-1]
-    voices_dir = Path(voices_dir)
-    voices_dir.mkdir(parents=True, exist_ok=True)
+    if url_base.endswith("/"):
+        # Remove final slash
+        url_base = url_base[:-1]

-    _LOGGER.debug("Downloading voice to %s from %s", voices_dir, link)
+    voice_dir = Path(voices_dir) / voice_key
+    voice_dir.mkdir(parents=True, exist_ok=True)

-    try:
-        with urllib.request.urlopen(link) as response:
-            with tempfile.NamedTemporaryFile(mode="wb+", suffix=".tar.gz") as temp_file:
-                with tqdm(
-                    unit="B",
-                    unit_scale=True,
-                    unit_divisor=1024,
-                    miniters=1,
-                    desc=voice_name,
-                    total=int(response.headers.get("content-length", 0)),
-                ) as pbar:
-                    chunk = response.read(4096)
-                    while chunk:
-                        temp_file.write(chunk)
-                        pbar.update(len(chunk))
-                        chunk = response.read(4096)
+    _LOGGER.debug("Downloading voice %s to %s", voice_key, voice_dir)

-                temp_file.seek(0)
+    for voice_file in voice_files:
+        file_url = f"{url_base}/{voice_file.relative_path}"
+        file_path = voice_dir / voice_file.relative_path

-                # Extract
-                with tempfile.TemporaryDirectory() as temp_dir_str:
-                    temp_dir = Path(temp_dir_str)
-                    _LOGGER.debug("Extracting %s to %s", temp_file.name, temp_dir_str)
-                    shutil.unpack_archive(temp_file.name, temp_dir_str)
+        try:
+            with urllib.request.urlopen(file_url) as response:
+                with open(file_path, mode="wb") as dest_file:
+                    with tqdm(
+                        unit="B",
+                        unit_scale=True,
+                        unit_divisor=1024,
+                        miniters=1,
+                        desc=voice_file.relative_path,
+                        total=int(response.headers.get("content-length", 0)),
+                    ) as pbar:
+                        chunk = response.read(chunk_bytes)
+                        while chunk:
+                            dest_file.write(chunk)
+                            pbar.update(len(chunk))
+                            chunk = response.read(chunk_bytes)

-                    # Expecting <language>/<voice_name>
-                    lang_dir = next(temp_dir.iterdir())
-                    assert lang_dir.is_dir()
-
-                    voice_dir = next(lang_dir.iterdir())
-                    assert voice_dir.is_dir()
-
-                    # Copy to destination
-                    dest_lang_dir = voices_dir / lang_dir.name
-                    dest_lang_dir.mkdir(parents=True, exist_ok=True)
-
-                    dest_voice_dir = voices_dir / lang_dir.name / voice_dir.name
-                    if dest_voice_dir.is_dir():
-                        # Delete existing files
-                        shutil.rmtree(str(dest_voice_dir))
-
-                    # Move files
-                    _LOGGER.debug("Moving %s to %s", voice_dir, dest_voice_dir)
-                    shutil.move(str(voice_dir), str(dest_voice_dir))
-
-                    _LOGGER.info("Installed %s to %s", link, dest_voice_dir)
-
-                    return dest_voice_dir
-    except HTTPError as e:
-        _LOGGER.exception("download_voice")
-        raise VoiceDownloadError(
-            f"Failed to download voice {voice_name} from {link}: {e}"
-        ) from e
+            _LOGGER.debug("Downloaded %s", file_path)
+        except HTTPError as e:
+            _LOGGER.exception("download_voice")
+            raise VoiceDownloadError(
+                f"Failed to download file for voice {voice_key} from {file_url}: {e}"
+            ) from e


 # -----------------------------------------------------------------------------
@ -107,19 +97,21 @@ def download_voice(voices_dir: typing.Union[str, Path], link: str) -> Path:

 def main():
    """Main entry point"""
-    default_voices_dir = Path(XDG().XDG_DATA_HOME) / "mimic3"
-
    parser = argparse.ArgumentParser(prog=f"{_PACKAGE}.download")
-    parser.add_argument("--url", action="append", help="URL of voice to download")
    parser.add_argument(
-        "--name",
-        action="append",
-        help="Name of voice to download (e.g., en_US/vctk_low)",
+        "key",
+        nargs="*",
+        help="Keys of voices to download (e.g., en_US/vctk_low)",
    )
    parser.add_argument(
        "--output-dir",
-        default=default_voices_dir,
-        help=f"Path to output directory (default: {default_voices_dir})",
+        default=DEFAULT_VOICES_DOWNLOAD_DIR,
+        help="Path to output directory",
+    )
+    parser.add_argument(
+        "--url-format",
+        default=DEFAULT_VOICES_URL_FORMAT,
+        help="URL format string for voices (contains {key}, {lang}, {name})",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
@ -134,34 +126,28 @@ def main():
    _LOGGER.debug(args)

    args.output_dir = Path(args.output_dir)
-    args.url = args.url or []
-    args.name = args.name or []
+    args.key = args.key or []

-    with open(_DIR / "voices.json", "r", encoding="utf-8") as voices_file:
-        voices_by_name = json.load(voices_file)
-
-    if (not args.url) and (not args.name):
+    if not args.key:
        # Print available voices and exit
-        json.dump(voices_by_name, sys.stdout, indent=4, ensure_ascii=False)
+        json.dump(_VOICES, sys.stdout, indent=4, ensure_ascii=False)
        sys.exit(0)

-    urls_to_download = args.url
-
-    if args.name:
-        # Gather URLs for voices by name
-
-        for voice_name in args.name:
-            voice_info = voices_by_name.get(voice_name)
-            if not voice_info:
-                _LOGGER.fatal("Voice not found: %s", voice_name)
-                sys.exit(1)
-
-            urls_to_download.append(voice_info["url"])
-
    args.output_dir.mkdir(parents=True, exist_ok=True)

-    for url in urls_to_download:
-        download_voice(args.output_dir, url)
+    for voice_key in args.key:
+        voice_lang, voice_name = voice_key.split("/", maxsplit=1)
+        voice_info = _VOICES[voice_key]
+        voice_url = str.format(
+            args.url_format, key=voice_key, lang=voice_lang, name=voice_name
+        )
+        voice_files = voice_info["files"]
+        download_voice(
+            voice_key=voice_key,
+            url_base=voice_url,
+            voice_files=[VoiceFile(file_key) for file_key in voice_files.keys()],
+            voices_dir=args.output_dir,
+        )


 # -----------------------------------------------------------------------------
--- a/mimic3-tts/mimic3_tts/tts.py
+++ b/mimic3-tts/mimic3_tts/tts.py
@ -35,7 +35,15 @@ from opentts_abc import (
 )
 from xdgenvpy import XDG

+from ._resources import _VOICES
 from .config import TrainingConfig
+from .const import (
+    DEFAULT_LANGUAGE,
+    DEFAULT_VOICE,
+    DEFAULT_VOICES_DOWNLOAD_DIR,
+    DEFAULT_VOICES_URL_FORMAT,
+)
+from .download import VoiceFile, download_voice
 from .voice import SPEAKER_TYPE, Mimic3Voice

 _DIR = Path(__file__).parent
@ -44,9 +52,6 @@ _LOGGER = logging.getLogger(__name__)

 PHONEMES_LIST_TYPE = typing.List[typing.List[str]]

-DEFAULT_VOICE = "en_US/vctk_low"
-DEFAULT_LANGUAGE = "en_US"
-

 # -----------------------------------------------------------------------------

@ -64,6 +69,15 @@ class Mimic3Settings:
    voices_directories: typing.Optional[typing.Iterable[typing.Union[str, Path]]] = None
    """Directories to search for voices (<lang>/<voice>)"""

+    voices_url_format: str = DEFAULT_VOICES_URL_FORMAT
+    """URL format string for a voice directory.
+
+    May contain:
+      * {key} - unique voice key
+      * {lang} - voice language
+      * {name} - voice name
+    """
+
    speaker: typing.Optional[SPEAKER_TYPE] = None
    """Default speaker name or id"""

@ -82,6 +96,12 @@ class Mimic3Settings:
    sample_rate: int = 22050
    """Sample rate of silence from add_break() in Hertz"""

+    voices_download_dir: typing.Union[str, Path] = DEFAULT_VOICES_DOWNLOAD_DIR
+    """Directory to download voices to"""
+
+    no_download: bool = False
+    """Do not download voices automatically"""
+

 @dataclass
 class Mimic3Phonemes:
@ -125,8 +145,7 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
            - /usr/local/share/mimic3
            - /usr/share/mimic3
        """
-        data_dirs = [Path(d) / "mimic3" for d in XDG().XDG_DATA_DIRS.split(":")]
-        return [_DIR.parent.parent / "voices"] + data_dirs
+        return [Path(d) / "mimic3" for d in XDG().XDG_DATA_DIRS.split(":")]

    def get_voices(self) -> typing.Iterable[Voice]:
        """Returns an iterable of all available voices"""
@ -137,29 +156,34 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
        if self.settings.voices_directories is not None:
            voices_dirs = itertools.chain(self.settings.voices_directories, voices_dirs)

+        known_voices = set(_VOICES.keys())
+
        # voices/<language>/<voice>/
        for voices_dir in voices_dirs:
            voices_dir = Path(voices_dir)

-            if not voices_dir.is_dir():
+            if not voices_dir.is_dir() or voices_dir.name.startswith("."):
                _LOGGER.debug("Skipping voice directory %s", voices_dir)
                continue

            _LOGGER.debug("Searching %s for voices", voices_dir)

            for lang_dir in voices_dir.iterdir():
-                if not lang_dir.is_dir():
+                if not lang_dir.is_dir() or lang_dir.name.startswith("."):
                    continue

                for voice_dir in lang_dir.iterdir():
-                    if not voice_dir.is_dir():
+                    if not voice_dir.is_dir() or voice_dir.name.startswith("."):
+                        continue
+
+                    config_path = voice_dir / "config.json"
+                    if not config_path.is_file():
                        continue

                    _LOGGER.debug("Voice found in %s", voice_dir)
                    voice_lang = lang_dir.name

                    # Load config
-                    config_path = voice_dir / "config.json"
                    _LOGGER.debug("Loading config from %s", config_path)

                    with open(config_path, "r", encoding="utf-8") as config_file:
@ -186,8 +210,10 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
                                if line:
                                    speakers.append(line)

+                    voice_key = f"{voice_lang}/{voice_name}"
+
                    yield Voice(
-                        key=f"{voice_lang}/{voice_name}",
+                        key=voice_key,
                        name=voice_name,
                        language=voice_lang,
                        description="",
@ -196,6 +222,30 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
                        properties=properties,
                    )

+                    known_voices.discard(voice_key)
+
+        # Yield voices that haven't yet been downloaded
+        for voice_key in known_voices:
+            voice_lang, voice_name = voice_key.split("/", maxsplit=1)
+            voice_info = _VOICES.get(voice_key, {})
+            speakers = voice_info.get("speakers", [])
+            properties = voice_info.get("properties", {})
+
+            yield Voice(
+                key=voice_key,
+                name=voice_name,
+                language=voice_lang,
+                description="",
+                speakers=speakers,
+                location=str.format(
+                    self.settings.voices_url_format,
+                    lang=voice_lang,
+                    name=voice_name,
+                    key=voice_key,
+                ),
+                properties=properties,
+            )
+
    def preload_voice(self, voice_key: str):
        """Ensure voice is loaded in memory before synthesis"""
        self._get_or_load_voice(voice_key)
@ -381,8 +431,16 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
        model_dir: typing.Optional[Path] = None
        for maybe_voice in self.get_voices():
            if maybe_voice.key.endswith(voice_key):
-                model_dir = Path(maybe_voice.location)
-                break
+                maybe_model_dir = Path(maybe_voice.location)
+
+                if (not maybe_model_dir.is_dir()) and (not self.settings.no_download):
+                    # Download voice
+                    maybe_model_dir = self._download_voice(voice_key)
+
+                if maybe_model_dir.is_dir():
+                    # Voice found
+                    model_dir = maybe_model_dir
+                    break

        if model_dir is None:
            raise VoiceNotFoundError(voice_key)
@ -407,3 +465,25 @@ class Mimic3TextToSpeechSystem(TextToSpeechSystem):
        self._loaded_voices[canonical_key] = voice

        return voice
+
+    def _download_voice(self, voice_key: str) -> Path:
+        """Downloads a voice by key"""
+        voice_lang, voice_name = voice_key.split("/", maxsplit=1)
+        voice_info = _VOICES[voice_key]
+        voice_url = str.format(
+            self.settings.voices_url_format,
+            key=voice_key,
+            lang=voice_lang,
+            name=voice_name,
+        )
+        voice_files = voice_info["files"]
+        download_voice(
+            voice_key=voice_key,
+            url_base=voice_url,
+            voice_files=[VoiceFile(file_key) for file_key in voice_files.keys()],
+            voices_dir=self.settings.voices_download_dir,
+        )
+
+        voice_dir = Path(self.settings.voices_download_dir) / voice_key
+
+        return voice_dir
--- a/mimic3-tts/mimic3_tts/voices.json
+++ b/mimic3-tts/mimic3_tts/voices.json
@ -0,0 +1,705 @@
+{
+    "de_DE/thorsten_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 6557,
+                "sha256_sum": "434e11b12f4a3f3096032bc35c5189afe7827b726212b2406a28189598d9c4cf"
+            },
+            "README.md": {
+                "size_bytes": 193,
+                "sha256_sum": "e341ebb38ee231c19fc6b132058398725060bc62871a3de39552dea92e2282b2"
+            },
+            "SOURCE": {
+                "size_bytes": 61,
+                "sha256_sum": "5913b6f0cf4fc4d751aade453924bfa05413245075d3a294f6c70e6497e7e01c"
+            },
+            "config.json": {
+                "size_bytes": 3736,
+                "sha256_sum": "27ad9d2e36d3beaf2fd797537edf0b2243b73795eb57742b2aa69525258dd088"
+            },
+            "generator.onnx": {
+                "size_bytes": 62798359,
+                "sha256_sum": "166146bf2705b3c280d3ca6b29f1f3315fe474feb58b47db5152bf78a28af4d0"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 340,
+                "sha256_sum": "530fed94716cbb8ebe88700028257f2ce39566e6e37e62da3a9e9ce4fc8a90d5"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "el_GR/rapunzelina_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 6384,
+                "sha256_sum": "e052310c1e6d75057abe231ba94b7f2eedee1aec4a0c5c658c8151f6f8c05fd8"
+            },
+            "README.md": {
+                "size_bytes": 199,
+                "sha256_sum": "9a7979350469d0819cb7cdd293f63a99ed29643782c5e435e70f8599c02a565a"
+            },
+            "SOURCE": {
+                "size_bytes": 69,
+                "sha256_sum": "c3d41e924e28a9a5d6384af1be84a140ff3ab957f338f56680a076fef07d12b3"
+            },
+            "config.json": {
+                "size_bytes": 3397,
+                "sha256_sum": "5d4da9a6d55500c067a66b29d21aa14df4d6fe53e9e5ce5b3ee1b2d8ecbb98fc"
+            },
+            "generator.onnx": {
+                "size_bytes": 62787607,
+                "sha256_sum": "f364132e32a8160b7a5945e7f52fd25fa4f8413c8826de07d6b21ec4222bd0d6"
+            },
+            "phonemes.txt": {
+                "size_bytes": 215,
+                "sha256_sum": "0deecbaabd16fa94b58375c4bfb1ee66da6567cc56507d52b9c32d0d9553f642"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "en_US/cmu-arctic_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 960,
+                "sha256_sum": "244ff21a910baf28bcb27b1975620a79d2be8611815ecc599f08eb06dd6f000e"
+            },
+            "README.md": {
+                "size_bytes": 181,
+                "sha256_sum": "3d5ad2368b2e61a31679400322924eeb312c7b97e68a4fc127461bb6ef18bae5"
+            },
+            "SOURCE": {
+                "size_bytes": 35,
+                "sha256_sum": "234919f888057ce202730f2ce9e87ab526c6db4b410047a3c9ca52b1cf51de2b"
+            },
+            "config.json": {
+                "size_bytes": 3550,
+                "sha256_sum": "e98bf4210293be786fc219612f6a0ac1a67b40bb2f5fa5f7c7ddbd595638c193"
+            },
+            "generator.onnx": {
+                "size_bytes": 76359777,
+                "sha256_sum": "366fd96a96c7ee81ce932973b9c457d13b99696c1a98eda117395e7c882695b0"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 263,
+                "sha256_sum": "8f9c3e6ced14d7fc5426e4e1bc7f7cc1037a20a645ca34110abcb76148fa8bfd"
+            },
+            "speaker_map.csv": {
+                "size_bytes": 332,
+                "sha256_sum": "30409b44f0d4413ef99a146c86849844086cf7aa97c645660473dbe094ca2565"
+            },
+            "speakers.txt": {
+                "size_bytes": 90,
+                "sha256_sum": "f8d46538e6058f2f7d58b0bfd996cfb1bd9a4e6c81a1b6764ff9bb49fd48cdf0"
+            }
+        },
+        "speakers": [
+            "awb",
+            "rms",
+            "slt",
+            "ksp",
+            "clb",
+            "aew",
+            "bdl",
+            "lnh",
+            "jmk",
+            "rxr",
+            "fem",
+            "ljm",
+            "slp",
+            "ahw",
+            "axb",
+            "aup",
+            "eey",
+            "gka"
+        ],
+        "properties": {}
+    },
+    "en_US/ljspeech_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 42,
+                "sha256_sum": "2a380bafa00cc11ecae80f4a1c21f3873361bc9af1f23c8eecc255b143cdaf68"
+            },
+            "README.md": {
+                "size_bytes": 183,
+                "sha256_sum": "43e5814f58fb743862bc7381d3a233b9060d766f8e5ef8336b3f5c4afc38e12e"
+            },
+            "SOURCE": {
+                "size_bytes": 40,
+                "sha256_sum": "f72dc7596d10484aea8dbd1b907728ff332acf8899a38dbca468197a26c3c5d9"
+            },
+            "config.json": {
+                "size_bytes": 3495,
+                "sha256_sum": "7f89388f366789ede1a32756d98b576a18e410f0f1a9af2ce64d0fbbcd0d971f"
+            },
+            "generator.onnx": {
+                "size_bytes": 62792219,
+                "sha256_sum": "d178e03b43b41da49f337626a7024826e79fe7deb7db102a5deedb027f9caa37"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 263,
+                "sha256_sum": "8f9c3e6ced14d7fc5426e4e1bc7f7cc1037a20a645ca34110abcb76148fa8bfd"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "en_US/vctk_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 17417,
+                "sha256_sum": "b351fdf5bbec1e011fd4c09ed1af05df6fd7de2e679fd7a92e6ec4398c38e3ff"
+            },
+            "README.md": {
+                "size_bytes": 179,
+                "sha256_sum": "7e482c32766c0f0612ade79a7255b39da2852cba14d8ad170458fe8b0816e449"
+            },
+            "SOURCE": {
+                "size_bytes": 45,
+                "sha256_sum": "fe147d22acd80ce096d7c3069bb66ece887db8b72fb5f38ac6017f7aa98a9698"
+            },
+            "config.json": {
+                "size_bytes": 3555,
+                "sha256_sum": "ab38b8df74db751dc89d43c17f238ee7a5e56d8e26f59673e272ea4802d275a7"
+            },
+            "generator.onnx": {
+                "size_bytes": 76546145,
+                "sha256_sum": "c958303de83a59fac937a91009c9081b5f2f7369890b9969e05141e56e867d2b"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 263,
+                "sha256_sum": "8f9c3e6ced14d7fc5426e4e1bc7f7cc1037a20a645ca34110abcb76148fa8bfd"
+            },
+            "speaker_map.csv": {
+                "size_bytes": 1523,
+                "sha256_sum": "8ecc8b46e35edcb4664bc5804e77b807bf66fa155a9871f8e51b56f1c63d380b"
+            },
+            "speakers.txt": {
+                "size_bytes": 652,
+                "sha256_sum": "c26aab76774111665e6ce4092b9ae40e18ca2dc048a300325f03f674c398f547"
+            }
+        },
+        "speakers": [
+            "p239",
+            "p236",
+            "p264",
+            "p250",
+            "p259",
+            "p247",
+            "p261",
+            "p263",
+            "p283",
+            "p274",
+            "p286",
+            "p276",
+            "p270",
+            "p281",
+            "p277",
+            "p231",
+            "p238",
+            "p271",
+            "p257",
+            "p273",
+            "p284",
+            "p329",
+            "p361",
+            "p287",
+            "p360",
+            "p374",
+            "p376",
+            "p310",
+            "p304",
+            "p340",
+            "p347",
+            "p330",
+            "p308",
+            "p314",
+            "p317",
+            "p339",
+            "p311",
+            "p294",
+            "p305",
+            "p266",
+            "p335",
+            "p334",
+            "p318",
+            "p323",
+            "p351",
+            "p333",
+            "p313",
+            "p316",
+            "p244",
+            "p307",
+            "p363",
+            "p336",
+            "p312",
+            "p267",
+            "p297",
+            "p275",
+            "p295",
+            "p288",
+            "p258",
+            "p301",
+            "p232",
+            "p292",
+            "p272",
+            "p278",
+            "p280",
+            "p341",
+            "p268",
+            "p298",
+            "p299",
+            "p279",
+            "p285",
+            "p326",
+            "p300",
+            "s5",
+            "p230",
+            "p254",
+            "p269",
+            "p293",
+            "p252",
+            "p345",
+            "p262",
+            "p243",
+            "p227",
+            "p343",
+            "p255",
+            "p229",
+            "p240",
+            "p248",
+            "p253",
+            "p233",
+            "p228",
+            "p251",
+            "p282",
+            "p246",
+            "p234",
+            "p226",
+            "p260",
+            "p245",
+            "p241",
+            "p303",
+            "p265",
+            "p306",
+            "p237",
+            "p249",
+            "p256",
+            "p302",
+            "p364",
+            "p225",
+            "p362"
+        ],
+        "properties": {}
+    },
+    "es_ES/carlfm_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 14,
+                "sha256_sum": "f5b244982699ca9fe5cc8fa8a7c08cf5dee5d3a0c8896892899e5df13316e1b7"
+            },
+            "README.md": {
+                "size_bytes": 192,
+                "sha256_sum": "2140442eaefadcc0162caae3db531fcd6a8070068087499101ec7f7d49f236bb"
+            },
+            "SOURCE": {
+                "size_bytes": 47,
+                "sha256_sum": "afb36ff925af99bf47b97ff5f753a4dc1402e4a3f3e491a7898ad38791b7920c"
+            },
+            "config.json": {
+                "size_bytes": 3401,
+                "sha256_sum": "e4da85d44a84c729310d8bffe81a4452bd86b7a1d3874ce3243f0e1d494e704f"
+            },
+            "generator.onnx": {
+                "size_bytes": 62786839,
+                "sha256_sum": "8cfd9a91a68b5c62e52a7483c61e1f5e8e09c0e21c0ac77a991f1ab123d0e260"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 217,
+                "sha256_sum": "dd24b95ded6ff32b410390bb4aa56bcd96042762add3c166588136096c4890e0"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "fi_FI/harri-tapani-ylilammi_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 6384,
+                "sha256_sum": "e052310c1e6d75057abe231ba94b7f2eedee1aec4a0c5c658c8151f6f8c05fd8"
+            },
+            "README.md": {
+                "size_bytes": 215,
+                "sha256_sum": "061335c2aa8f9f2126a80ecd8f6635e85e8ace7a9a37950f9b808420f8233345"
+            },
+            "SOURCE": {
+                "size_bytes": 71,
+                "sha256_sum": "f5d064abd622989907fbf4116caa1d9914c30dc11b2fd83447d2ba9da8cec1f7"
+            },
+            "config.json": {
+                "size_bytes": 3399,
+                "sha256_sum": "2c7ddf22d83670542cf21b17b8177f53cdcf4e57d581a7fc92ff3c266985c826"
+            },
+            "generator.onnx": {
+                "size_bytes": 62782999,
+                "sha256_sum": "31ab7f2200e3246d50eebb8eefc108f08a709d63b89122a25ae443d9c1d9c82b"
+            },
+            "phonemes.txt": {
+                "size_bytes": 179,
+                "sha256_sum": "986029f7b8967e438c3ff901d328bf2fc9d3f4164f8b72def149208082513b26"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "fr_FR/siwis_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 17416,
+                "sha256_sum": "b34e17103bfb246f2549fc82a279e6ba28834e0cb42f76a92efc14b72e3a3723"
+            },
+            "README.md": {
+                "size_bytes": 174,
+                "sha256_sum": "00ea65658e20fd3301f95f0909eb46c8c54ba377bb6b00032e318ede7c543dca"
+            },
+            "SOURCE": {
+                "size_bytes": 48,
+                "sha256_sum": "e81bea943c3a359cef9dafdd66dd29ec6af41bed92dfc2de28879ffa44ba5c84"
+            },
+            "config.json": {
+                "size_bytes": 3390,
+                "sha256_sum": "9545c9bdda9692175fae0658c754d2cd1cc786615a6ca673eae3cd6eb1b567ed"
+            },
+            "generator.onnx": {
+                "size_bytes": 62788375,
+                "sha256_sum": "5154cc50d87fa6b15c6c5b0eb1597cba15162a8c143baed898ffb55240ba8a4c"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 232,
+                "sha256_sum": "711294d0b5a0ec08ec21ca8a75184e0fee3aba1e1adcf967fe5e1ef96f6c176e"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "hu_HU/diana-majlinger_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 6384,
+                "sha256_sum": "e052310c1e6d75057abe231ba94b7f2eedee1aec4a0c5c658c8151f6f8c05fd8"
+            },
+            "README.md": {
+                "size_bytes": 215,
+                "sha256_sum": "766b65cc2fd22f02cf3e220876b25dfc7354a25a055f43ec1b9efb4f6eab8726"
+            },
+            "SOURCE": {
+                "size_bytes": 73,
+                "sha256_sum": "b8d62d6f483feb8fb6a6ccac36bdc4080a4771cbbf38ab3c4f6f9d0e93a977d6"
+            },
+            "config.json": {
+                "size_bytes": 3396,
+                "sha256_sum": "f91d65016d5d2b8f3c1076552344121db024c3e49c11fcadb8dcdad65e9a192f"
+            },
+            "generator.onnx": {
+                "size_bytes": 62786071,
+                "sha256_sum": "6feb6a2307f2342a4a9ad43d9ae06d0e8778a6f4486b43dc34d8c62bf523c53c"
+            },
+            "phonemes.txt": {
+                "size_bytes": 202,
+                "sha256_sum": "51da9a9ac03277d0f057de50f92fe178ad4980a1e8493a6f0c65040bf660beb0"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "it_IT/riccardo-fasol_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 1372,
+                "sha256_sum": "fdd78a909fb9384d869363522b967557bc9e28e5b65874921f24e48cbb82f38c"
+            },
+            "README.md": {
+                "size_bytes": 201,
+                "sha256_sum": "9b7125e8de3ad26f4f81acf4cbfa8f27b7f991a91d6750a6653d117c9e25775b"
+            },
+            "SOURCE": {
+                "size_bytes": 61,
+                "sha256_sum": "841520f6a8cc616e307a92552355691f8c3087fadda2e9b7a03a7863b2d0cf6a"
+            },
+            "config.json": {
+                "size_bytes": 3417,
+                "sha256_sum": "e60ee585ce2c1709c47d8bf21af6ba7fcc75e50fd533ce479ba2b00408630f8f"
+            },
+            "generator.onnx": {
+                "size_bytes": 62785303,
+                "sha256_sum": "b3e06e391e8b056460f64db6c5f3c3e4107a5a53257808fa88d6dcc43b11f3f6"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 210,
+                "sha256_sum": "282837161676bffa5b304cbb878eace1c8da670a46e08e8e800515f924ecfde3"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "ko_KO/kss_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 6384,
+                "sha256_sum": "e052310c1e6d75057abe231ba94b7f2eedee1aec4a0c5c658c8151f6f8c05fd8"
+            },
+            "README.md": {
+                "size_bytes": 191,
+                "sha256_sum": "91fe70211181048b0afba60aae1e7fd90661e90176590ffbcf4f868f3d6608d3"
+            },
+            "SOURCE": {
+                "size_bytes": 70,
+                "sha256_sum": "0c424cc057609d0547bf29e94a9c6dbda619787fde8ff21cc7e404d1c62d562c"
+            },
+            "config.json": {
+                "size_bytes": 3357,
+                "sha256_sum": "2433ba5cefa3dd957dc1276b7a501ab7e8f4a867b6ea8daa3543a5582560157c"
+            },
+            "generator.onnx": {
+                "size_bytes": 62792983,
+                "sha256_sum": "9198b939b5b713c7b59e7ba28163ed2546dc49691fea82c6614dc0b8d5612c51"
+            },
+            "phonemes.txt": {
+                "size_bytes": 256,
+                "sha256_sum": "d8d8f00e1b855c92cbc53f442166f2b4c20d898777f8d754a93eb074d0b218e0"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "nl/rdh_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 7049,
+                "sha256_sum": "7179683e8000e6bdc9bbc60d85edf0a4ac8e76f951857f54fcb775d5886f1309"
+            },
+            "README.md": {
+                "size_bytes": 167,
+                "sha256_sum": "4260521a828b09a25a29fb3ffa3ff57b816452e925b9feae528796ffb5d1f0f8"
+            },
+            "SOURCE": {
+                "size_bytes": 37,
+                "sha256_sum": "e4874bd5a71c42ef3f963944571e107734928b9db960f17dfadb6e2afce2956b"
+            },
+            "config.json": {
+                "size_bytes": 3378,
+                "sha256_sum": "1b9a50cd5e70e44c3aac6fa01bf4b1607627973814fc4ce7e0d4bf1166ce4305"
+            },
+            "generator.onnx": {
+                "size_bytes": 62800663,
+                "sha256_sum": "2082891212f3f399097be4ea540ee397912238f777b1c2bced5986060700e268"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 336,
+                "sha256_sum": "355389fee04f97557232cdde7fb8d4cf03ae2aabd7b0b26ed5978ebbf6575dd4"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "pt_BR/edresson_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 18652,
+                "sha256_sum": "cce5d01fa4a83b794271bd2c28cffdf99afd43c803e6ddefddae39b591ea7448"
+            },
+            "SOURCE": {
+                "size_bytes": 50,
+                "sha256_sum": "1ba21abad312197fbe4c9c0d449e16bad57f4c2e3e8e37e31e2d50b413faab04"
+            },
+            "config.json": {
+                "size_bytes": 3586,
+                "sha256_sum": "d19b81d56f90344e110426d5830e5b27a3af178bccd44dd6b072d811cdade750"
+            },
+            "generator.onnx": {
+                "size_bytes": 62796055,
+                "sha256_sum": "142f4a8268549a8fa148066182e548335eb60826c751228f0c311e8d49d0d938"
+            },
+            "phonemes.txt": {
+                "size_bytes": 282,
+                "sha256_sum": "270d2d069b677555c8d703afa3e3883e43e905e993ebb3e85f3481b60fe9f638"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "ru_RU/multi_low": {
+        "files": {
+            "config.json": {
+                "size_bytes": 3923,
+                "sha256_sum": "314e0fdd09183942d2f7393d4b950a12823849c0f72d22e62dc9858a6b4886c6"
+            },
+            "css10/LICENSE": {
+                "size_bytes": 6384,
+                "sha256_sum": "e052310c1e6d75057abe231ba94b7f2eedee1aec4a0c5c658c8151f6f8c05fd8"
+            },
+            "css10/SOURCE": {
+                "size_bytes": 71,
+                "sha256_sum": "7edef4a18d5ea07a79f374a9ffdf7e5e5dfc347466feac994d87e9bda9be05ec"
+            },
+            "generator.onnx": {
+                "size_bytes": 76335199,
+                "sha256_sum": "cb84b12479fc619943cb8fbb56827f7fd95f5ffcbebf2c220606b3a9750bf2ca"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 326,
+                "sha256_sum": "63b030e0fc9ebd79f93c82a906c34910a23a8de46e372b48bab1119fd28ec2fa"
+            },
+            "speaker_map.csv": {
+                "size_bytes": 61,
+                "sha256_sum": "b6e1b09bfc4358b66e93dbb3f647f572341969fd66bb403fff519e5f540119a0"
+            },
+            "speakers.txt": {
+                "size_bytes": 29,
+                "sha256_sum": "f985c0983cb587acb11e9f33538ae9827c9eae0138acc3dae1def1e7780b3211"
+            }
+        },
+        "speakers": [
+            "hajdurova",
+            "minaev",
+            "nikolaev"
+        ],
+        "properties": {}
+    },
+    "sv_SE/talesyntese_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 51,
+                "sha256_sum": "bd1a963f2c77481f0a658b5fa7fe77c2515e73be3972f1e991741b72f6fd7d31"
+            },
+            "README.md": {
+                "size_bytes": 203,
+                "sha256_sum": "f574e3807bec86b91caa0d70b1ac8c4ef85ecc297afb49b62642f9944554cbaa"
+            },
+            "SOURCE": {
+                "size_bytes": 63,
+                "sha256_sum": "295e2c2e47edb2f156c10808efb9439714d227c5b45b60a4f8ec3adc33451a6b"
+            },
+            "config.json": {
+                "size_bytes": 3376,
+                "sha256_sum": "8e5a29c1a0ae655c9d0d56df025f22286e81ce323d1b68d07977b90bf61ee33e"
+            },
+            "generator.onnx": {
+                "size_bytes": 62802967,
+                "sha256_sum": "bd9a50a8b0d35116c0d543681c2384bb738087ea771f9abee805feb53aa5f708"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 360,
+                "sha256_sum": "b4d2422bcc2b2f3ea739ce3f59019e499b966a74836aa54f6300921c4fc7ae76"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "sw/lanfrica_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 0,
+                "sha256_sum": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+            },
+            "SOURCE": {
+                "size_bytes": 48,
+                "sha256_sum": "1553a74483d9094830d5d249ed6db286eb52f0e7057f0b903efd8c45656bcfeb"
+            },
+            "config.json": {
+                "size_bytes": 3387,
+                "sha256_sum": "3b8f3876f998624fff4760015cbdf7cf7e8110eed9753e45a99479fdc8ba8817"
+            },
+            "generator.onnx": {
+                "size_bytes": 62787607,
+                "sha256_sum": "b470bf4b042ea96d2272162e9efaa8bd48bae4bc771d4a9996631f645e740e80"
+            },
+            "phoneme_map.txt": {
+                "size_bytes": 15,
+                "sha256_sum": "4003f421fc91ed1d5a343442659db6cf9d58bd1c6d8d771abc1999cc24d7694d"
+            },
+            "phonemes.txt": {
+                "size_bytes": 245,
+                "sha256_sum": "4784d6c095a3937b09a6f1fa292df160409033ec1d763d90d9b95ac5a42bf42d"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    },
+    "uk_UK/m-ailabs_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 1372,
+                "sha256_sum": "fdd78a909fb9384d869363522b967557bc9e28e5b65874921f24e48cbb82f38c"
+            },
+            "README.md": {
+                "size_bytes": 198,
+                "sha256_sum": "d399789ee16b4610af50b3316cb1a9281f37002728fc1185e0690840be2bd58b"
+            },
+            "SOURCE": {
+                "size_bytes": 61,
+                "sha256_sum": "841520f6a8cc616e307a92552355691f8c3087fadda2e9b7a03a7863b2d0cf6a"
+            },
+            "config.json": {
+                "size_bytes": 5197,
+                "sha256_sum": "bbd2c66d5920d9e54771d480a982e801cb23ad7de5848625d1ebc82c6b1c7752"
+            },
+            "generator.onnx": {
+                "size_bytes": 76355935,
+                "sha256_sum": "ee409d02d0e02d3bf92c3ee1f7403328213dda7515c17246b2b9ca2f005c09d3"
+            },
+            "phonemes.txt": {
+                "size_bytes": 426,
+                "sha256_sum": "9a4d708ae3ddffc67709c83c608ce6acdeff511ad288b2f037d41ea2ec3867ee"
+            },
+            "speaker_map.csv": {
+                "size_bytes": 118,
+                "sha256_sum": "f74765e11fca2ac205b2acb1213bdaa3bd3f6c9235ebcab5479160bfef1b7aa0"
+            }
+        },
+        "speakers": [],
+        "properties": {}
+    }
+}