Add --csv-voice

2022-03-31 16:50:17 -04:00 · 2022-03-31 16:50:17 -04:00 · 02f02efc77
commit 02f02efc77
parent 7efc3da385
1 changed files with 43 additions and 21 deletions
--- a/mimic3-tts/mimic3_tts/main.py
+++ b/mimic3-tts/mimic3_tts/main.py
@ -142,7 +142,7 @@ def initialize_args(state: CommandLineInterfaceState):

    # Open file for writing the names from <mark> tags in SSML.
    # Each name is printed on a single line.
-    if args.mark_file:
+    if args.mark_file and (args.mark_file != "-"):
        args.mark_file = Path(args.mark_file)
        args.mark_file.parent.mkdir(parents=True, exist_ok=True)
        state.mark_writer = open(  # pylint: disable=consider-using-with
@ -157,8 +157,15 @@ def initialize_args(state: CommandLineInterfaceState):
        _LOGGER.debug("Setting random seed to %s", args.seed)
        np.random.seed(args.seed)

+    if args.csv_voice:
+        # --csv-voice implies --csv
+        args.csv = True
+
    if args.csv:
-        args.output_naming = "id"
+        args.output_naming = OutputNaming.ID
+    elif args.ssml:
+        # Avoid text mangling when using SSML
+        args.output_naming = OutputNaming.TIME

    # Read text from stdin or arguments
    if args.text:
@ -313,6 +320,7 @@ def process_line(
    line: str,
    state: CommandLineInterfaceState,
    line_id: str = "",
+    line_voice: typing.Optional[str] = None,
 ):
    from mimic3_tts import SSMLSpeaker

@ -321,6 +329,14 @@ def process_line(

    args = state.args

+    if line_voice:
+        if line_voice.startswith("#"):
+            # Same voice, but different speaker
+            state.tts.speaker = line_voice[1:]
+        else:
+            # Different voice
+            state.tts.voice = line_voice
+
    if args.ssml:
        results = SSMLSpeaker(state.tts).speak(line)
    else:
@ -333,13 +349,13 @@ def process_line(

    for result in results:
        state.result_queue.put(
-            ResultToProcess(
-                result=result,
-                line=line,
-                line_id=line_id,
-            )
+            ResultToProcess(result=result, line=line, line_id=line_id,)
        )

+    # Restore voice/speaker
+    state.tts.voice = args.voice
+    state.tts.speaker = args.speaker
+

 def process_lines(state: CommandLineInterfaceState):
    assert state.texts is not None
@ -350,6 +366,7 @@ def process_lines(state: CommandLineInterfaceState):
        result_idx = 0

        for line in state.texts:
+            line_voice: typing.Optional[str] = None
            line_id = ""
            line = line.strip()
            if not line:
@ -357,9 +374,14 @@ def process_lines(state: CommandLineInterfaceState):

            if args.output_naming == OutputNaming.ID:
                # Line has the format id|text instead of just text
-                line_id, line = line.split(args.id_delimiter, maxsplit=1)
+                with io.StringIO(line) as line_io:
+                    reader = csv.reader(line_io, delimiter=args.csv_delimiter)
+                    row = next(reader)
+                    line_id, line = row[0], row[-1]
+                    if args.csv_voice:
+                        line_voice = row[1]

-            process_line(line, state, line_id=line_id)
+            process_line(line, state, line_id=line_id, line_voice=line_voice)
            result_idx += 1

    except KeyboardInterrupt:
@ -455,14 +477,10 @@ def get_args():
        help="Format of stdin text (default: auto)",
    )
    parser.add_argument(
-        "--voice",
-        "-v",
-        help="Name of voice (expected in <voices-dir>/<language>)",
+        "--voice", "-v", help="Name of voice (expected in <voices-dir>/<language>)",
    )
    parser.add_argument(
-        "--speaker",
-        "-s",
-        help="Name or number of speaker (default: first speaker)",
+        "--speaker", "-s", help="Name or number of speaker (default: first speaker)",
    )
    parser.add_argument(
        "--voices-dir",
@ -488,15 +506,21 @@ def get_args():
        help="Play audio after each input line (see --play-program)",
    )
    parser.add_argument("--csv", action="store_true", help="Input format is id|text")
+    parser.add_argument(
+        "--csv-delimiter", default="|", help="Delimiter used with --csv (default: |)"
+    )
+    parser.add_argument(
+        "--csv-voice",
+        action="store_true",
+        help="Input format is id|voice|text or id|#speaker|text",
+    )
    parser.add_argument(
        "--mark-file",
        help="File to write mark names to as they're encountered (--ssml only)",
    )

    parser.add_argument(
-        "--noise-scale",
-        type=float,
-        help="Noise scale [0-1], default is 0.667",
+        "--noise-scale", type=float, help="Noise scale [0-1], default is 0.667",
    )
    parser.add_argument(
        "--length-scale",
@ -504,9 +528,7 @@ def get_args():
        help="Length scale (1.0 is default speed, 0.5 is 2x faster)",
    )
    parser.add_argument(
-        "--noise-w",
-        type=float,
-        help="Variation in cadence [0-1], default is 0.8",
+        "--noise-w", type=float, help="Variation in cadence [0-1], default is 0.8",
    )

    # Miscellaneous