Extensive fixes to dialogue tag punctuation/capitalization

See the newly added code comment for details. This involves a first pass in convert.js, followed by lots of fixups and corrections in substitutions.json.
This commit is contained in:
Domenic Denicola 2017-10-31 23:07:16 -07:00
commit 1fcdd9611d
2 changed files with 1033 additions and 27 deletions

View file

@ -21,9 +21,7 @@ async function convertChapter(chapter, cachePath, contentPath) {
const filename = chapter.filename;
const filePath = path.resolve(cachePath, filename);
console.log(`- Reading ${filename}`);
const contents = await fs.readFile(filePath, { encoding: "utf-8" });
console.log(`- Read ${filename}`);
const rawChapterJSDOM = new JSDOM(contents);
const output = getChapterString(chapter, rawChapterJSDOM.window.document);
@ -199,6 +197,22 @@ function getBodyXML(chapter, contentEl) {
// Fix extra periods at the end of paragraphs
xml = xml.replace(/\.\.<\/p>/g, ".</p>");
// The author often fails to terminate a sentence, instead using a comma after a dialogue tag. For example,
// > “I didn’t get much done,” Greg said, “I got distracted by...
// This should instead be
// > “I didn’t get much done,” Greg said. “I got distracted by...
//
// Our heuristic is to try to automatically fix this if the dialogue tag is two words (X said/admitted/sighed/etc.).
//
// This sometimes overcorrects, as in the following example:
// > “Basically,” Alec said, “For your powers to manifest, ...
// Here instead we should lowercase the "f". We handle that via one-offs in substitutions.json.
//
// This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of
// times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the
// capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized.
xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/g, ",” $1. “$2");
// Replace single-word <i>s with <em>s. Other <i>s are probably erroneous too, but these are known-bad.
xml = xml.replace(/<i>([A-Za-z]+)<\/i>/g, "<em>$1</em>");

File diff suppressed because it is too large Load diff