From 33d88eb52307c530095852499dd844bd2aa18319 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Sat, 7 Nov 2020 18:37:35 -0500 Subject: [PATCH] Refactor convert-worker.js a bit This introduces a few fixes around italics. --- lib/convert-worker.js | 353 +++++++++++++++++++++++------------------ lib/substitutions.json | 20 ++- 2 files changed, 214 insertions(+), 159 deletions(-) diff --git a/lib/convert-worker.js b/lib/convert-worker.js index 9959711..f69df99 100644 --- a/lib/convert-worker.js +++ b/lib/convert-worker.js @@ -135,6 +135,10 @@ function getBodyXML(chapter, book, contentEl) { // Fix recurring strange pattern of extra
in

......
\n

xml = xml.replace(/
\s*<\/em><\/p>/g, "

"); + // Replace single-word s with s. Other s are probably erroneous too, but these are known-bad. + xml = xml.replace(/([^ ]+)<\/i>/g, "$1"); + xml = xml.replace(/([^ ]+)( +)<\/i>/g, "$1$2"); + // There are way too many nonbreaking spaces where they don't belong. // If they show up three in a row, then let them live. Otherwise, they die. // Also remove any run of them after a period. @@ -192,8 +196,6 @@ function getBodyXML(chapter, book, contentEl) { fixEms(); fixQuotesAndApostrophes(); fixEms(); - fixQuotesAndApostrophes(); - fixEms(); xml = xml.replace(/I”m/g, "I’m"); // Similar problems occur in Ward with and as do in Worm with s @@ -206,9 +208,155 @@ function getBodyXML(chapter, book, contentEl) { xml = xml.replace(/(\s*)<\/strong>/g, "$1"); // No need for line breaks before paragraph ends - // These often occur with the
s inside / fixed above. + // These often occur with the
s inside /// fixed above. xml = xml.replace(/
\s*<\/p>/g, "

"); + // Fix missing spaces after commas + xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/g, "$1, $2"); + + // Fix bad periods and spacing/markup surrounding them + xml = xml.replace(/\.\.<\/p>/g, ".

"); + xml = xml.replace(/\.\.”<\/p>/g, ".”

"); + xml = xml.replace(/ \. /g, ". "); + xml = xml.replace(/ \.<\/p>/g, ".

"); + xml = xml.replace(/\.\.\./g, "…"); + + // Fix extra spaces + xml = xml.replace(/ ? <\/p>/g, "

"); + xml = xml.replace(/([a-z]) ,/g, "$1,"); + + xml = fixDialogueTags(xml); + xml = fixForeignNames(xml); + xml = fixEmDashes(xml); + xml = enDashJointNames(xml); + xml = fixPossessives(xml); + xml = cleanSceneBreaks(xml); + xml = fixCapitalization(xml, book); + xml = fixMispellings(xml); + xml = fixHyphens(xml); + xml = standardizeSpellings(xml); + + // One-off fixes + for (const substitution of substitutions[chapter.url] || []) { + if (substitution.before) { + const indexOf = xml.indexOf(substitution.before); + if (indexOf === -1) { + warnings.push(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` + + `updated at the source, in which case, you should edit substitutions.json.`); + } + if (indexOf !== xml.lastIndexOf(substitution.before)) { + warnings.push(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` + + `Update substitutions.json for a more precise substitution.`); + } + + xml = xml.replace(new RegExp(escapeRegExp(substitution.before)), substitution.after); + } else if (substitution.regExp) { + xml = xml.replace(new RegExp(substitution.regExp, "g"), substitution.replacement); + } else { + warnings.push(`Invalid substitution specified for ${chapter.url}`); + } + } + + // Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a . + // Use this opportunity to insert a comment pointing to the original URL, for reference. + xml = xml.replace( + //, + `\n\n`); + + return { xml, warnings }; +} + +function fixDialogueTags(xml) { + // Fix recurring miscapitalization with questions + xml = xml.replace(/\?”\s\s?She asked/g, "?” she asked"); + xml = xml.replace(/\?”\s\s?He asked/g, "?” he asked"); + + // The author often fails to terminate a sentence, instead using a comma after a dialogue tag. For example, + // > “I didn’t get much done,” Greg said, “I got distracted by... + // This should instead be + // > “I didn’t get much done,” Greg said. “I got distracted by... + // + // Our heuristic is to try to automatically fix this if the dialogue tag is two words (X said/admitted/sighed/etc.). + // + // This sometimes overcorrects, as in the following example: + // > “Basically,” Alec said, “For your powers to manifest, ... + // Here instead we should lowercase the "f". We handle that via one-offs in substitutions.json. + // + // This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of + // times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the + // capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized. + xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/g, ",” $1. “$2"); + + return xml; +} + +function fixForeignNames(xml) { + // This is consistently missing diacritics + xml = xml.replace(/Yangban/g, "Yàngbǎn"); + + // These are usually not italicized, but sometimes are. Other foreign-language names (like Yàngbǎn) are not + // italicized, so we go in the direction of removing the italics. + xml = xml.replace(/Garama<\/em>/g, "Garama"); + xml = xml.replace(/Thanda<\/em>/g, "Thanda"); + xml = xml.replace(/Sifara([^<]*)<\/em>/g, "Sifara$1"); + xml = xml.replace(/Moord Nag([^<]*)<\/em>/g, "Moord Nag$1"); + xml = xml.replace(/Califa de Perro([^<]*)<\/em>/g, "Califa de Perro$1"); + xml = xml.replace(/Turanta([^<]*)<\/em>/g, "Turanta$1"); + + return xml; +} + +function fixEmDashes(xml) { + xml = xml.replace(/ – /g, "—"); + xml = xml.replace(/“((?:)?)-/g, "“$1—"); + xml = xml.replace(/-[,.]?”/g, "—”"); + xml = xml.replace(/-(!|\?)”/g, "—$1”"); + xml = xml.replace(/-[,.]?<\/em>”/g, "—”"); + xml = xml.replace(/-“/g, "—”"); + xml = xml.replace(/

-/g, "

—"); + xml = xml.replace(/-<\/p>/g, "—

"); + xml = xml.replace(/-<\/em><\/p>/g, "—

"); + xml = xml.replace(/\s?\s?–\s?\s?/g, "—"); + xml = xml.replace(/-\s\s?/g, "—"); + xml = xml.replace(/\s?\s-/g, "—"); + xml = xml.replace(/\s+—”/g, "—”"); + xml = xml.replace(/I-I/g, "I—I"); + xml = xml.replace(/I-uh/g, "I—uh"); + + return xml; +} + +function enDashJointNames(xml) { + // Joint names should use en dashes + xml = xml.replace(/Dallon-Pelham/g, "Dallon–Pelham"); + xml = xml.replace(/Bet-Gimel/g, "Bet–Gimel"); + xml = xml.replace(/Tristan-Capricorn/g, "Tristan–Capricorn"); + xml = xml.replace(/Capricorn-Byron/g, "Capricorn–Byron"); + xml = xml.replace(/Tristan-Byron/g, "Tristan–Byron"); + xml = xml.replace(/Gimel-Europe/g, "Gimel–Europe"); + xml = xml.replace(/G-N/g, "G–N"); + xml = xml.replace(/Imp-Damsel/g, "Imp–Damsel"); + xml = xml.replace(/Damsel-Ashley/g, "Damsel–Ashley"); + xml = xml.replace(/Antares-Anelace/g, "Antares–Anelace"); + xml = xml.replace(/Challenger-Gallant/g, "Challenger–Gallant"); + xml = xml.replace(/Undersider(s?)-(Breakthrough|Ambassador)/g, "Undersider$1–$2"); + xml = xml.replace(/Norwalk-Fairfield/g, "Norwalk–Fairfield"); + xml = xml.replace(/East-West/g, "east–west"); + xml = xml.replace(/(Green|Yellow)-Black/g, "$1–Black"); + xml = xml.replace(/Creutzfeldt-Jakob/g, "Creutzfeldt–Jakob"); + xml = xml.replace(/Astaroth-Nidhug/g, "Astaroth–Nidhug"); + xml = xml.replace(/Capulet-Montague/g, "Capulet–Montague"); + xml = xml.replace(/Weaver-Clockblocker/g, "Weaver–Clockblocker"); + xml = xml.replace(/Alexandria-Pretender/g, "Alexandria–Pretender"); + xml = xml.replace(/Night Hag-Nyx/g, "Night Hag–Nyx"); + xml = xml.replace(/Crawler-Breed/g, "Crawler–Breed"); + xml = xml.replace(/Simurgh-Myrddin-plant/g, "Simurgh–Myrddin–plant"); + xml = xml.replace(/Armsmaster-Defiant/g, "Armsmaster–Defiant"); + + return xml; +} + +function fixPossessives(xml) { // Fix possessive of names ending in "s" // Note: if the "s" is unvoiced, as in Marquis, then it doesn't get the second "s". xml = xml.replace(/([^‘])Judas’([^s])/g, "$1Judas’s$2"); @@ -246,62 +394,15 @@ function getBodyXML(chapter, book, contentEl) { xml = xml.replace(/([^‘])Ms. Stillons’([^s])/g, "$1Ms. Stillons’s$2"); xml = xml.replace(/([^‘])Chuckles’([^s])/g, "$1Chuckles’s$2"); - // Fixes dashes - xml = xml.replace(/ – /g, "—"); - xml = xml.replace(/“((?:)?)-/g, "“$1—"); - xml = xml.replace(/-[,.]?”/g, "—”"); - xml = xml.replace(/-(!|\?)”/g, "—$1”"); - xml = xml.replace(/-[,.]?<\/em>”/g, "—”"); - xml = xml.replace(/-“/g, "—”"); - xml = xml.replace(/

-/g, "

—"); - xml = xml.replace(/-<\/p>/g, "—

"); - xml = xml.replace(/-<\/em><\/p>/g, "—

"); - xml = xml.replace(/\s?\s?–\s?\s?/g, "—"); - xml = xml.replace(/-\s\s?/g, "—"); - xml = xml.replace(/\s?\s-/g, "—"); - xml = xml.replace(/\s+—”/g, "—”"); - xml = xml.replace(/I-I/g, "I—I"); - xml = xml.replace(/I-uh/g, "I—uh"); - - // "X-year-old" should use hyphens; all grammar guides agree. The books are very inconsistent but most often omit - // them. - xml = xml.replace(/(\w+)[ -]year[ -]old(s?)(?!\w)/g, "$1-year-old$2"); - xml = xml.replace(/(\w+) or (\w+)-year-old/g, "$1- or $2-year-old"); - - // Fix missing spaces after commas - xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/g, "$1, $2"); - - // Joint names should use em dashes - xml = xml.replace(/Dallon-Pelham/g, "Dallon–Pelham"); - xml = xml.replace(/Bet-Gimel/g, "Bet–Gimel"); - xml = xml.replace(/Tristan-Capricorn/g, "Tristan–Capricorn"); - xml = xml.replace(/Capricorn-Byron/g, "Capricorn–Byron"); - xml = xml.replace(/Tristan-Byron/g, "Tristan–Byron"); - xml = xml.replace(/Gimel-Europe/g, "Gimel–Europe"); - xml = xml.replace(/G-N/g, "G–N"); - xml = xml.replace(/Imp-Damsel/g, "Imp–Damsel"); - xml = xml.replace(/Damsel-Ashley/g, "Damsel–Ashley"); - xml = xml.replace(/Antares-Anelace/g, "Antares–Anelace"); - xml = xml.replace(/Challenger-Gallant/g, "Challenger–Gallant"); - xml = xml.replace(/Undersider(s?)-(Breakthrough|Ambassador)/g, "Undersider$1–$2"); - xml = xml.replace(/Norwalk-Fairfield/g, "Norwalk–Fairfield"); - xml = xml.replace(/East-West/g, "east–west"); - xml = xml.replace(/(Green|Yellow)-Black/g, "$1–Black"); - xml = xml.replace(/Creutzfeldt-Jakob/g, "Creutzfeldt–Jakob"); - xml = xml.replace(/Astaroth-Nidhug/g, "Astaroth–Nidhug"); - xml = xml.replace(/Capulet-Montague/g, "Capulet–Montague"); - xml = xml.replace(/Weaver-Clockblocker/g, "Weaver–Clockblocker"); - xml = xml.replace(/Alexandria-Pretender/g, "Alexandria–Pretender"); - xml = xml.replace(/Night Hag-Nyx/g, "Night Hag–Nyx"); - xml = xml.replace(/Crawler-Breed/g, "Crawler–Breed"); - xml = xml.replace(/Simurgh-Myrddin-plant/g, "Simurgh–Myrddin–plant"); - xml = xml.replace(/Armsmaster-Defiant/g, "Armsmaster–Defiant"); + return xml; +} +function cleanSceneBreaks(xml) { // Normalize scene breaks.
would be more semantically appropriate, but loses the author's intent. This is // especially the case in Ward, which uses a variety of different scene breaks. + xml = xml.replace(/]*)>■<\/p>/g, `

`); - xml = xml.replace(/

⊙<\/p>/g, `

`); xml = xml.replace(/

⊙<\/strong><\/p>/g, `

`); xml = xml.replace(/

⊙<\/strong><\/em><\/p>/g, `

`); @@ -311,40 +412,10 @@ function getBodyXML(chapter, book, contentEl) { xml = xml.replace(/

⊙ *⊙ *⊙ *⊙ *⊙<\/strong><\/p>/g, `

⊙ ⊙ ⊙ ⊙ ⊙

`); - // Fix recurring miscapitalization with questions - xml = xml.replace(/\?”\s\s?She asked/g, "?” she asked"); - xml = xml.replace(/\?”\s\s?He asked/g, "?” he asked"); - - // Fix bad periods and spacing/markup surrounding them - xml = xml.replace(/\.\.<\/p>/g, ".

"); - xml = xml.replace(/\.\.”<\/p>/g, ".”

"); - xml = xml.replace(/ \. /g, ". "); - xml = xml.replace(/ \.<\/p>/g, ".

"); - xml = xml.replace(/\.\.\./g, "…"); - - // Fix extra spaces - xml = xml.replace(/ ? <\/p>/g, "

"); - xml = xml.replace(/([a-z]) ,/g, "$1,"); - - // The author often fails to terminate a sentence, instead using a comma after a dialogue tag. For example, - // > “I didn’t get much done,” Greg said, “I got distracted by... - // This should instead be - // > “I didn’t get much done,” Greg said. “I got distracted by... - // - // Our heuristic is to try to automatically fix this if the dialogue tag is two words (X said/admitted/sighed/etc.). - // - // This sometimes overcorrects, as in the following example: - // > “Basically,” Alec said, “For your powers to manifest, ... - // Here instead we should lowercase the "f". We handle that via one-offs in substitutions.json. - // - // This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of - // times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the - // capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized. - xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/g, ",” $1. “$2"); - - // Replace single-word s with s. Other s are probably erroneous too, but these are known-bad. - xml = xml.replace(/([A-Za-z]+)<\/i>/g, "$1"); + return xml; +} +function fixCapitalization(xml, book) { // This occurs enough times it's better to do here than in one-off fixes. We correct the single instance where // it's incorrect to capitalize in the one-off fixes. // Note that Ward contains much talk of "the clairvoyants", so we don't want to capitalize plurals. @@ -366,13 +437,6 @@ function getBodyXML(chapter, book, contentEl) { // This is sometimes missing its capitalization. xml = xml.replace(/the birdcage/g, "the Birdcage"); - // This is usually spelled "TV" but sometimes the other ways. Normalize. - xml = xml.replace(/(\b)tv(\b)/g, "$1TV$2"); - xml = xml.replace(/t\.v\./ig, "TV"); - - // This is commonly misspelled. - xml = xml.replace(/([Ss])houlderblade/g, "$1houlder blade"); - // There's no reason why these should be capitalized. (Note that they never appear at the beginning of any sentences.) xml = xml.replace(/Halberd/g, "halberd"); xml = xml.replace(/Loft/g, "loft"); @@ -397,9 +461,6 @@ function getBodyXML(chapter, book, contentEl) { "$1–$2" ); - // This is consistently missing accents - xml = xml.replace(/Yangban/g, "Yàngbǎn"); - // Place names need to always be capitalized xml = xml.replace(/North end/g, "North End"); xml = xml.replace(/(Stonemast|Shale) avenue/g, "$1 Avenue"); @@ -409,14 +470,48 @@ function getBodyXML(chapter, book, contentEl) { xml = xml.replace(/the megalopolis/g, "the Megalopolis"); xml = xml.replace(/earths(?![a-z])/g, "Earths"); - // These are usually not italicized, but sometimes are. Other foreign-language names (like Yàngbǎn) are not - // italicized, so we go in the direction of removing the italics. - xml = xml.replace(/Garama<\/em>/g, "Garama"); - xml = xml.replace(/Thanda<\/em>/g, "Thanda"); - xml = xml.replace(/Sifara([^<]*)<\/em>/g, "Sifara$1"); - xml = xml.replace(/Moord Nag([^<]*)<\/em>/g, "Moord Nag$1"); - xml = xml.replace(/Califa de Perro([^<]*)<\/em>/g, "Califa de Perro$1"); - xml = xml.replace(/Turanta([^<]*)<\/em>/g, "Turanta$1"); + // "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of + // instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in + // substitutions.json. + xml = xml.replace(/(? { - if (substitution.before) { - const indexOf = xml.indexOf(substitution.before); - if (indexOf === -1) { - warnings.push(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` + - `updated at the source, in which case, you should edit substitutions.json.`); - } - if (indexOf !== xml.lastIndexOf(substitution.before)) { - warnings.push(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` + - `Update substitutions.json for a more precise substitution.`); - } - - xml = xml.replace(new RegExp(escapeRegExp(substitution.before)), substitution.after); - } else if (substitution.regExp) { - xml = xml.replace(new RegExp(substitution.regExp, "g"), substitution.replacement); - } else { - warnings.push(`Invalid substitution specified for ${chapter.url}`); - } - }); - - // Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a . - // Use this opportunity to insert a comment pointing to the original URL, for reference. - xml = xml.replace( - //, - `\n\n`); - - return { xml, warnings }; + return xml; } function isEmptyOrGarbage(el) { diff --git a/lib/substitutions.json b/lib/substitutions.json index dcfbf93..ef2bc4f 100644 --- a/lib/substitutions.json +++ b/lib/substitutions.json @@ -2448,12 +2448,6 @@ "after": "all the way up to the nape of my neck" } ], - "https://parahumans.wordpress.com/2012/12/13/interlude-16-donation-bonus-2/": [ - { - "before": "", - "after": "”" - } - ], "https://parahumans.wordpress.com/2012/12/15/monarch-16-7/": [ { "before": "Brockton bay", @@ -2516,6 +2510,10 @@ { "before": "ten,” I asked. “Just", "after": "ten,” I asked, “just" + }, + { + "before": "weeks, months. Anticipating", + "after": "weeks, months. Anticipating" } ], "https://parahumans.wordpress.com/2013/01/08/migration-17-1/": [ @@ -2654,7 +2652,7 @@ ], "https://parahumans.wordpress.com/2013/01/15/migration-17-8/": [ { - "before": "replied. Need to talk about being more secure with our names. “What’s going on?", + "before": "replied. Need to talk about being more secure with our names. “What’s going on?”", "after": "replied. Need to talk about being more secure with our names. “What’s going on?”" }, { @@ -3356,6 +3354,10 @@ { "before": "aren’t allies.", "after": "aren’t allies.”" + }, + { + "before": "Blameful? “Guilty", + "after": "Blameful? “Guilty" } ], "https://parahumans.wordpress.com/2013/06/20/crushed-24-5/": [ @@ -4547,6 +4549,10 @@ { "before": "dangerous?” The recluse asked", "after": "dangerous?” the recluse asked" + }, + { + "before": "Stop. Please.", + "after": "Stop. Please." } ], "https://www.parahumans.net/2018/02/08/shade-4-4/": [