From 02f02efc779dec2f229c94c7cafcf571c6e8d76b Mon Sep 17 00:00:00 2001
From: Michael Hansen <mike@rhasspy.org>
Date: Thu, 31 Mar 2022 16:50:17 -0400
Subject: [PATCH] Add --csv-voice

---
 mimic3-tts/mimic3_tts/__main__.py | 64 +++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 21 deletions(-)
diff --git a/mimic3-tts/mimic3_tts/__main__.py b/mimic3-tts/mimic3_tts/__main__.py
index e679ae3..f60a8b1 100644
--- a/mimic3-tts/mimic3_tts/__main__.py
+++ b/mimic3-tts/mimic3_tts/__main__.py
@@ -142,7 +142,7 @@ def initialize_args(state: CommandLineInterfaceState):
 
     # Open file for writing the names from <mark> tags in SSML.
     # Each name is printed on a single line.
-    if args.mark_file:
+    if args.mark_file and (args.mark_file != "-"):
         args.mark_file = Path(args.mark_file)
         args.mark_file.parent.mkdir(parents=True, exist_ok=True)
         state.mark_writer = open(  # pylint: disable=consider-using-with
@@ -157,8 +157,15 @@ def initialize_args(state: CommandLineInterfaceState):
         _LOGGER.debug("Setting random seed to %s", args.seed)
         np.random.seed(args.seed)
 
+    if args.csv_voice:
+        # --csv-voice implies --csv
+        args.csv = True
+
     if args.csv:
-        args.output_naming = "id"
+        args.output_naming = OutputNaming.ID
+    elif args.ssml:
+        # Avoid text mangling when using SSML
+        args.output_naming = OutputNaming.TIME
 
     # Read text from stdin or arguments
     if args.text:
@@ -313,6 +320,7 @@ def process_line(
     line: str,
     state: CommandLineInterfaceState,
     line_id: str = "",
+    line_voice: typing.Optional[str] = None,
 ):
     from mimic3_tts import SSMLSpeaker
 
@@ -321,6 +329,14 @@ def process_line(
 
     args = state.args
 
+    if line_voice:
+        if line_voice.startswith("#"):
+            # Same voice, but different speaker
+            state.tts.speaker = line_voice[1:]
+        else:
+            # Different voice
+            state.tts.voice = line_voice
+
     if args.ssml:
         results = SSMLSpeaker(state.tts).speak(line)
     else:
@@ -333,13 +349,13 @@ def process_line(
 
     for result in results:
         state.result_queue.put(
-            ResultToProcess(
-                result=result,
-                line=line,
-                line_id=line_id,
-            )
+            ResultToProcess(result=result, line=line, line_id=line_id,)
         )
 
+    # Restore voice/speaker
+    state.tts.voice = args.voice
+    state.tts.speaker = args.speaker
+
 
 def process_lines(state: CommandLineInterfaceState):
     assert state.texts is not None
@@ -350,6 +366,7 @@ def process_lines(state: CommandLineInterfaceState):
         result_idx = 0
 
         for line in state.texts:
+            line_voice: typing.Optional[str] = None
             line_id = ""
             line = line.strip()
             if not line:
@@ -357,9 +374,14 @@ def process_lines(state: CommandLineInterfaceState):
 
             if args.output_naming == OutputNaming.ID:
                 # Line has the format id|text instead of just text
-                line_id, line = line.split(args.id_delimiter, maxsplit=1)
+                with io.StringIO(line) as line_io:
+                    reader = csv.reader(line_io, delimiter=args.csv_delimiter)
+                    row = next(reader)
+                    line_id, line = row[0], row[-1]
+                    if args.csv_voice:
+                        line_voice = row[1]
 
-            process_line(line, state, line_id=line_id)
+            process_line(line, state, line_id=line_id, line_voice=line_voice)
             result_idx += 1
 
     except KeyboardInterrupt:
@@ -455,14 +477,10 @@ def get_args():
         help="Format of stdin text (default: auto)",
     )
     parser.add_argument(
-        "--voice",
-        "-v",
-        help="Name of voice (expected in <voices-dir>/<language>)",
+        "--voice", "-v", help="Name of voice (expected in <voices-dir>/<language>)",
     )
     parser.add_argument(
-        "--speaker",
-        "-s",
-        help="Name or number of speaker (default: first speaker)",
+        "--speaker", "-s", help="Name or number of speaker (default: first speaker)",
     )
     parser.add_argument(
         "--voices-dir",
@@ -488,15 +506,21 @@ def get_args():
         help="Play audio after each input line (see --play-program)",
     )
     parser.add_argument("--csv", action="store_true", help="Input format is id|text")
+    parser.add_argument(
+        "--csv-delimiter", default="|", help="Delimiter used with --csv (default: |)"
+    )
+    parser.add_argument(
+        "--csv-voice",
+        action="store_true",
+        help="Input format is id|voice|text or id|#speaker|text",
+    )
     parser.add_argument(
         "--mark-file",
         help="File to write mark names to as they're encountered (--ssml only)",
     )
 
     parser.add_argument(
-        "--noise-scale",
-        type=float,
-        help="Noise scale [0-1], default is 0.667",
+        "--noise-scale", type=float, help="Noise scale [0-1], default is 0.667",
     )
     parser.add_argument(
         "--length-scale",
@@ -504,9 +528,7 @@ def get_args():
         help="Length scale (1.0 is default speed, 0.5 is 2x faster)",
     )
     parser.add_argument(
-        "--noise-w",
-        type=float,
-        help="Variation in cadence [0-1], default is 0.8",
+        "--noise-w", type=float, help="Variation in cadence [0-1], default is 0.8",
     )
 
     # Miscellaneous