Extensive fixes to dialogue tag punctuation/capitalization
See the newly added code comment for details. This involves a first pass in convert.js, followed by many fixups and corrections in substitutions.json.
This commit is contained in:
parent
8f437eadd1
commit
1fcdd9611d
2 changed files with 1033 additions and 27 deletions
|
|
@ -21,9 +21,7 @@ async function convertChapter(chapter, cachePath, contentPath) {
|
|||
const filename = chapter.filename;
|
||||
const filePath = path.resolve(cachePath, filename);
|
||||
|
||||
console.log(`- Reading ${filename}`);
|
||||
const contents = await fs.readFile(filePath, { encoding: "utf-8" });
|
||||
console.log(`- Read ${filename}`);
|
||||
|
||||
const rawChapterJSDOM = new JSDOM(contents);
|
||||
const output = getChapterString(chapter, rawChapterJSDOM.window.document);
|
||||
|
|
@ -199,6 +197,22 @@ function getBodyXML(chapter, contentEl) {
|
|||
// Fix extra periods at the end of paragraphs
|
||||
xml = xml.replace(/\.\.<\/p>/g, ".</p>");
|
||||
|
||||
// The author often fails to terminate a sentence, instead using a comma after a dialogue tag. For example,
|
||||
// > “I didn’t get much done,” Greg said, “I got distracted by...
|
||||
// This should instead be
|
||||
// > “I didn’t get much done,” Greg said. “I got distracted by...
|
||||
//
|
||||
// Our heuristic is to try to automatically fix this if the dialogue tag is two words (X said/admitted/sighed/etc.) and the continued quotation starts with a capital letter.
|
||||
//
|
||||
// This sometimes overcorrects, as in the following example:
|
||||
// > “Basically,” Alec said, “For your powers to manifest, ...
|
||||
// Here we should instead keep the comma and lowercase the "F". We handle that via one-offs in substitutions.json.
|
||||
//
|
||||
// This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of
|
||||
// times, it definitely pays for itself. For most of the instances we have to correct back, we also need to fix the
|
||||
// capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized.
|
||||
xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/g, ",” $1. “$2");
|
||||
|
||||
// Replace single-word <i>s with <em>s. Other <i>s are probably erroneous too, but these are known-bad.
|
||||
xml = xml.replace(/<i>([A-Za-z]+)<\/i>/g, "<em>$1</em>");
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue