From 02f02efc779dec2f229c94c7cafcf571c6e8d76b Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Thu, 31 Mar 2022 16:50:17 -0400 Subject: [PATCH] Add --csv-voice --- mimic3-tts/mimic3_tts/__main__.py | 64 +++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/mimic3-tts/mimic3_tts/__main__.py b/mimic3-tts/mimic3_tts/__main__.py index e679ae3..f60a8b1 100644 --- a/mimic3-tts/mimic3_tts/__main__.py +++ b/mimic3-tts/mimic3_tts/__main__.py @@ -142,7 +142,7 @@ def initialize_args(state: CommandLineInterfaceState): # Open file for writing the names from tags in SSML. # Each name is printed on a single line. - if args.mark_file: + if args.mark_file and (args.mark_file != "-"): args.mark_file = Path(args.mark_file) args.mark_file.parent.mkdir(parents=True, exist_ok=True) state.mark_writer = open( # pylint: disable=consider-using-with @@ -157,8 +157,15 @@ def initialize_args(state: CommandLineInterfaceState): _LOGGER.debug("Setting random seed to %s", args.seed) np.random.seed(args.seed) + if args.csv_voice: + # --csv-voice implies --csv + args.csv = True + if args.csv: - args.output_naming = "id" + args.output_naming = OutputNaming.ID + elif args.ssml: + # Avoid text mangling when using SSML + args.output_naming = OutputNaming.TIME # Read text from stdin or arguments if args.text: @@ -313,6 +320,7 @@ def process_line( line: str, state: CommandLineInterfaceState, line_id: str = "", + line_voice: typing.Optional[str] = None, ): from mimic3_tts import SSMLSpeaker @@ -321,6 +329,14 @@ def process_line( args = state.args + if line_voice: + if line_voice.startswith("#"): + # Same voice, but different speaker + state.tts.speaker = line_voice[1:] + else: + # Different voice + state.tts.voice = line_voice + if args.ssml: results = SSMLSpeaker(state.tts).speak(line) else: @@ -333,13 +349,13 @@ def process_line( for result in results: state.result_queue.put( - ResultToProcess( - result=result, - line=line, - line_id=line_id, - ) + ResultToProcess(result=result, line=line, line_id=line_id,) ) + # Restore voice/speaker + state.tts.voice = args.voice + state.tts.speaker = args.speaker + def process_lines(state: CommandLineInterfaceState): assert state.texts is not None @@ -350,6 +366,7 @@ def process_lines(state: CommandLineInterfaceState): result_idx = 0 for line in state.texts: + line_voice: typing.Optional[str] = None line_id = "" line = line.strip() if not line: @@ -357,9 +374,14 @@ def process_lines(state: CommandLineInterfaceState): if args.output_naming == OutputNaming.ID: # Line has the format id|text instead of just text - line_id, line = line.split(args.id_delimiter, maxsplit=1) + with io.StringIO(line) as line_io: + reader = csv.reader(line_io, delimiter=args.csv_delimiter) + row = next(reader) + line_id, line = row[0], row[-1] + if args.csv_voice: + line_voice = row[1] - process_line(line, state, line_id=line_id) + process_line(line, state, line_id=line_id, line_voice=line_voice) result_idx += 1 except KeyboardInterrupt: @@ -455,14 +477,10 @@ def get_args(): help="Format of stdin text (default: auto)", ) parser.add_argument( - "--voice", - "-v", - help="Name of voice (expected in /)", + "--voice", "-v", help="Name of voice (expected in /)", ) parser.add_argument( - "--speaker", - "-s", - help="Name or number of speaker (default: first speaker)", + "--speaker", "-s", help="Name or number of speaker (default: first speaker)", ) parser.add_argument( "--voices-dir", @@ -488,15 +506,21 @@ def get_args(): help="Play audio after each input line (see --play-program)", ) parser.add_argument("--csv", action="store_true", help="Input format is id|text") + parser.add_argument( + "--csv-delimiter", default="|", help="Delimiter used with --csv (default: |)" + ) + parser.add_argument( + "--csv-voice", + action="store_true", + help="Input format is id|voice|text or id|#speaker|text", + ) parser.add_argument( "--mark-file", help="File to write mark names to as they're encountered (--ssml only)", ) parser.add_argument( - "--noise-scale", - type=float, - help="Noise scale [0-1], default is 0.667", + "--noise-scale", type=float, help="Noise scale [0-1], default is 0.667", ) parser.add_argument( "--length-scale", @@ -504,9 +528,7 @@ def get_args(): help="Length scale (1.0 is default speed, 0.5 is 2x faster)", ) parser.add_argument( - "--noise-w", - type=float, - help="Variation in cadence [0-1], default is 0.8", + "--noise-w", type=float, help="Variation in cadence [0-1], default is 0.8", ) # Miscellaneous