Refactor convert-worker.js a bit

This introduces a few fixes around italics.
This commit is contained in:
Domenic Denicola 2020-11-07 18:37:35 -05:00
commit 33d88eb523
2 changed files with 214 additions and 159 deletions

View file

@ -135,6 +135,10 @@ function getBodyXML(chapter, book, contentEl) {
// Fix recurring strange pattern of extra <br> in <p>...<em>...<br>\n</em></p>
xml = xml.replace(/<br \/>\s*<\/em><\/p>/g, "</em></p>");
// Replace single-word <i>s with <em>s. Other <i>s are probably erroneous too, but these are known-bad.
xml = xml.replace(/<i>([^ ]+)<\/i>/g, "<em>$1</em>");
xml = xml.replace(/<i>([^ ]+)( +)<\/i>/g, "<em>$1</em>$2");
// There are way too many nonbreaking spaces where they don't belong.
// If they show up three in a row, then let them live. Otherwise, they die.
// Also remove any run of them after a period.
@ -192,8 +196,6 @@ function getBodyXML(chapter, book, contentEl) {
fixEms();
fixQuotesAndApostrophes();
fixEms();
fixQuotesAndApostrophes();
fixEms();
xml = xml.replace(/I”m/g, "Im");
// Similar problems occur in Ward with <b> and <strong> as do in Worm with <em>s
@ -206,9 +208,155 @@ function getBodyXML(chapter, book, contentEl) {
xml = xml.replace(/(\s*)<\/strong>/g, "</strong>$1");
// No need for line breaks before paragraph ends
// These often occur with the <br>s inside <b>/<strong> fixed above.
// These often occur with the <br>s inside <b>/<strong>/<em>/<i> fixed above.
xml = xml.replace(/<br \/>\s*<\/p>/g, "</p>");
// Fix missing spaces after commas
xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/g, "$1, $2");
// Fix bad periods and spacing/markup surrounding them
xml = xml.replace(/\.\.<\/p>/g, ".</p>");
xml = xml.replace(/\.\.”<\/p>/g, ".”</p>");
xml = xml.replace(/ \. /g, ". ");
xml = xml.replace(/ \.<\/p>/g, ".</p>");
xml = xml.replace(/\.<em>\.\./g, "<em>…");
// Fix extra spaces
xml = xml.replace(/ ? <\/p>/g, "</p>");
xml = xml.replace(/([a-z]) ,/g, "$1,");
xml = fixDialogueTags(xml);
xml = fixForeignNames(xml);
xml = fixEmDashes(xml);
xml = enDashJointNames(xml);
xml = fixPossessives(xml);
xml = cleanSceneBreaks(xml);
xml = fixCapitalization(xml, book);
xml = fixMispellings(xml);
xml = fixHyphens(xml);
xml = standardizeSpellings(xml);
// One-off fixes
for (const substitution of substitutions[chapter.url] || []) {
if (substitution.before) {
const indexOf = xml.indexOf(substitution.before);
if (indexOf === -1) {
warnings.push(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` +
`updated at the source, in which case, you should edit substitutions.json.`);
}
if (indexOf !== xml.lastIndexOf(substitution.before)) {
warnings.push(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` +
`Update substitutions.json for a more precise substitution.`);
}
xml = xml.replace(new RegExp(escapeRegExp(substitution.before)), substitution.after);
} else if (substitution.regExp) {
xml = xml.replace(new RegExp(substitution.regExp, "g"), substitution.replacement);
} else {
warnings.push(`Invalid substitution specified for ${chapter.url}`);
}
}
// Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a <html>.
// Use this opportunity to insert a comment pointing to the original URL, for reference.
xml = xml.replace(
/<body xmlns="http:\/\/www.w3.org\/1999\/xhtml">/,
`<body>\n<!-- ${chapter.url} -->\n`);
return { xml, warnings };
}
/**
 * Repairs recurring punctuation/capitalization mistakes around dialogue tags.
 *
 * @param {string} xml - The serialized chapter markup.
 * @returns {string} The markup with the dialogue-tag fixes applied.
 */
function fixDialogueTags(xml) {
  // Rules are applied in order, each one operating on the output of the previous one.
  const rules = [
    // A dialogue tag following a question is often miscapitalized ("She asked" instead of "she asked"). This also
    // collapses a doubled whitespace character after the closing quote into a single space.
    [/\?”\s\s?She asked/g, "?” she asked"],
    [/\?”\s\s?He asked/g, "?” he asked"],

    // The author often ends a dialogue tag with a comma where the sentence should have been terminated, e.g.
    // > “I didn’t get much done,” Greg said, “I got distracted by...
    // which should read
    // > “I didn’t get much done,” Greg said. “I got distracted by...
    //
    // The heuristic: fix this automatically whenever the dialogue tag is exactly two words
    // (X said/admitted/sighed/etc.) and the resumed quote starts with a capital letter.
    //
    // This sometimes overcorrects, e.g.
    // > “Basically,” Alec said, “For your powers to manifest, ...
    // where the right fix is to lowercase the "f" instead. Those cases are reverted via one-offs in
    // substitutions.json.
    //
    // This rule hits ~800 instances, so it pays for itself despite the reverts. Most of the reverted instances need
    // a capitalization fix anyway, which is harder to automate since proper names/"I"/etc. stay capitalized.
    [/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/g, ",” $1. “$2"]
  ];

  return rules.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), xml);
}
/**
 * Normalizes how certain foreign-language names are written.
 *
 * @param {string} xml - The serialized chapter markup.
 * @returns {string} The markup with diacritics restored and stray italics removed from the known names.
 */
function fixForeignNames(xml) {
  // The source text consistently omits the diacritics on this name.
  let result = xml.replace(/Yangban/g, "Yàngbǎn");

  // These names are usually set in roman type but are occasionally italicized. Other foreign-language names (like
  // Yàngbǎn) are not italicized, so strip the <em> wrapper. Where a capture group is present, it keeps any
  // tag-free text that follows the name inside the same <em>.
  const italicizedNames = [
    [/<em>Garama<\/em>/g, "Garama"],
    [/<em>Thanda<\/em>/g, "Thanda"],
    [/<em>Sifara([^<]*)<\/em>/g, "Sifara$1"],
    [/<em>Moord Nag([^<]*)<\/em>/g, "Moord Nag$1"],
    [/<em>Califa de Perro([^<]*)<\/em>/g, "Califa de Perro$1"],
    [/<em>Turanta([^<]*)<\/em>/g, "Turanta$1"]
  ];

  for (const [pattern, plain] of italicizedNames) {
    result = result.replace(pattern, plain);
  }

  return result;
}
function fixEmDashes(xml) {
xml = xml.replace(/ /g, "—");
xml = xml.replace(/“((?:<em>)?)-/g, "“$1—");
xml = xml.replace(/-[,.]?”/g, "—”");
xml = xml.replace(/-(!|\?)”/g, "—$1”");
xml = xml.replace(/-[,.]?<\/em>”/g, "—</em>”");
xml = xml.replace(/-“/g, "—”");
xml = xml.replace(/<p>-/g, "<p>—");
xml = xml.replace(/-<\/p>/g, "—</p>");
xml = xml.replace(/-<\/em><\/p>/g, "—</em></p>");
xml = xml.replace(/\s?\s?\s?\s?/g, "—");
xml = xml.replace(/-\s\s?/g, "—");
xml = xml.replace(/\s?\s-/g, "—");
xml = xml.replace(/\s+—”/g, "—”");
xml = xml.replace(/I-I/g, "I—I");
xml = xml.replace(/I-uh/g, "I—uh");
return xml;
}
/**
 * Joins known compound ("joint") names with an en dash instead of a hyphen.
 *
 * @param {string} xml - The serialized chapter markup.
 * @returns {string} The markup with the listed joint names using en dashes.
 */
function enDashJointNames(xml) {
  // NOTE(review): the replacements previously contained no dash at all (e.g. "DallonPelham"), so the hyphen was
  // deleted rather than converted. The en dash (U+2013) is written as a \u2013 escape so that it cannot be silently
  // dropped and is distinguishable from a hyphen at a glance.
  //
  // Order matters: overlapping names (e.g. "Tristan-Capricorn-Byron") are handled by consecutive rules.
  const jointNames = [
    [/Dallon-Pelham/g, "Dallon\u2013Pelham"],
    [/Bet-Gimel/g, "Bet\u2013Gimel"],
    [/Tristan-Capricorn/g, "Tristan\u2013Capricorn"],
    [/Capricorn-Byron/g, "Capricorn\u2013Byron"],
    [/Tristan-Byron/g, "Tristan\u2013Byron"],
    [/Gimel-Europe/g, "Gimel\u2013Europe"],
    [/G-N/g, "G\u2013N"],
    [/Imp-Damsel/g, "Imp\u2013Damsel"],
    [/Damsel-Ashley/g, "Damsel\u2013Ashley"],
    [/Antares-Anelace/g, "Antares\u2013Anelace"],
    [/Challenger-Gallant/g, "Challenger\u2013Gallant"],
    [/Undersider(s?)-(Breakthrough|Ambassador)/g, "Undersider$1\u2013$2"],
    [/Norwalk-Fairfield/g, "Norwalk\u2013Fairfield"],
    // This one is also lowercased, not just re-dashed.
    [/East-West/g, "east\u2013west"],
    [/(Green|Yellow)-Black/g, "$1\u2013Black"],
    [/Creutzfeldt-Jakob/g, "Creutzfeldt\u2013Jakob"],
    [/Astaroth-Nidhug/g, "Astaroth\u2013Nidhug"],
    [/Capulet-Montague/g, "Capulet\u2013Montague"],
    [/Weaver-Clockblocker/g, "Weaver\u2013Clockblocker"],
    [/Alexandria-Pretender/g, "Alexandria\u2013Pretender"],
    [/Night Hag-Nyx/g, "Night Hag\u2013Nyx"],
    [/Crawler-Breed/g, "Crawler\u2013Breed"],
    [/Simurgh-Myrddin-plant/g, "Simurgh\u2013Myrddin\u2013plant"],
    [/Armsmaster-Defiant/g, "Armsmaster\u2013Defiant"]
  ];

  for (const [pattern, replacement] of jointNames) {
    xml = xml.replace(pattern, replacement);
  }

  return xml;
}
function fixPossessives(xml) {
// Fix possessive of names ending in "s"
// Note: if the "s" is unvoiced, as in Marquis, then it doesn't get the second "s".
xml = xml.replace(/([^])Judas([^s])/g, "$1Judass$2");
@ -246,62 +394,15 @@ function getBodyXML(chapter, book, contentEl) {
xml = xml.replace(/([^])Ms. Stillons([^s])/g, "$1Ms. Stillonss$2");
xml = xml.replace(/([^])Chuckles([^s])/g, "$1Chuckless$2");
// Fixes dashes
xml = xml.replace(/ /g, "—");
xml = xml.replace(/“((?:<em>)?)-/g, "“$1—");
xml = xml.replace(/-[,.]?”/g, "—”");
xml = xml.replace(/-(!|\?)”/g, "—$1”");
xml = xml.replace(/-[,.]?<\/em>”/g, "—</em>”");
xml = xml.replace(/-“/g, "—”");
xml = xml.replace(/<p>-/g, "<p>—");
xml = xml.replace(/-<\/p>/g, "—</p>");
xml = xml.replace(/-<\/em><\/p>/g, "—</em></p>");
xml = xml.replace(/\s?\s?\s?\s?/g, "—");
xml = xml.replace(/-\s\s?/g, "—");
xml = xml.replace(/\s?\s-/g, "—");
xml = xml.replace(/\s+—”/g, "—”");
xml = xml.replace(/I-I/g, "I—I");
xml = xml.replace(/I-uh/g, "I—uh");
// "X-year-old" should use hyphens; all grammar guides agree. The books are very inconsistent but most often omit
// them.
xml = xml.replace(/(\w+)[ -]year[ -]old(s?)(?!\w)/g, "$1-year-old$2");
xml = xml.replace(/(\w+) or (\w+)-year-old/g, "$1- or $2-year-old");
// Fix missing spaces after commas
xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/g, "$1, $2");
// Joint names should use em dashes
xml = xml.replace(/Dallon-Pelham/g, "DallonPelham");
xml = xml.replace(/Bet-Gimel/g, "BetGimel");
xml = xml.replace(/Tristan-Capricorn/g, "TristanCapricorn");
xml = xml.replace(/Capricorn-Byron/g, "CapricornByron");
xml = xml.replace(/Tristan-Byron/g, "TristanByron");
xml = xml.replace(/Gimel-Europe/g, "GimelEurope");
xml = xml.replace(/G-N/g, "GN");
xml = xml.replace(/Imp-Damsel/g, "ImpDamsel");
xml = xml.replace(/Damsel-Ashley/g, "DamselAshley");
xml = xml.replace(/Antares-Anelace/g, "AntaresAnelace");
xml = xml.replace(/Challenger-Gallant/g, "ChallengerGallant");
xml = xml.replace(/Undersider(s?)-(Breakthrough|Ambassador)/g, "Undersider$1$2");
xml = xml.replace(/Norwalk-Fairfield/g, "NorwalkFairfield");
xml = xml.replace(/East-West/g, "eastwest");
xml = xml.replace(/(Green|Yellow)-Black/g, "$1Black");
xml = xml.replace(/Creutzfeldt-Jakob/g, "CreutzfeldtJakob");
xml = xml.replace(/Astaroth-Nidhug/g, "AstarothNidhug");
xml = xml.replace(/Capulet-Montague/g, "CapuletMontague");
xml = xml.replace(/Weaver-Clockblocker/g, "WeaverClockblocker");
xml = xml.replace(/Alexandria-Pretender/g, "AlexandriaPretender");
xml = xml.replace(/Night Hag-Nyx/g, "Night HagNyx");
xml = xml.replace(/Crawler-Breed/g, "CrawlerBreed");
xml = xml.replace(/Simurgh-Myrddin-plant/g, "SimurghMyrddinplant");
xml = xml.replace(/Armsmaster-Defiant/g, "ArmsmasterDefiant");
return xml;
}
function cleanSceneBreaks(xml) {
// Normalize scene breaks. <hr> would be more semantically appropriate, but loses the author's intent. This is
// especially the case in Ward, which uses a variety of different scene breaks.
xml = xml.replace(/<p(?:[^>]*)>■<\/p>/g, `<p style="text-align: center;">■</p>`);
xml = xml.replace(/<p style="text-align: center;">⊙<\/p>/g, `<p style="text-align: center;">⊙</p>`);
xml = xml.replace(/<p style="text-align: center;"><strong>⊙<\/strong><\/p>/g, `<p style="text-align: center;">⊙</p>`);
xml = xml.replace(/<p style="text-align: center;"><em><strong>⊙<\/strong><\/em><\/p>/g,
`<p style="text-align: center;">⊙</p>`);
@ -311,40 +412,10 @@ function getBodyXML(chapter, book, contentEl) {
xml = xml.replace(/<p style="text-align: center;"><strong>⊙ *⊙ *⊙ *⊙ *⊙<\/strong><\/p>/g,
`<p style="text-align: center;">⊙ ⊙ ⊙ ⊙ ⊙</p>`);
// Fix recurring miscapitalization with questions
xml = xml.replace(/\?”\s\s?She asked/g, "?” she asked");
xml = xml.replace(/\?”\s\s?He asked/g, "?” he asked");
// Fix bad periods and spacing/markup surrounding them
xml = xml.replace(/\.\.<\/p>/g, ".</p>");
xml = xml.replace(/\.\.”<\/p>/g, ".”</p>");
xml = xml.replace(/ \. /g, ". ");
xml = xml.replace(/ \.<\/p>/g, ".</p>");
xml = xml.replace(/\.<em>\.\./g, "<em>…");
// Fix extra spaces
xml = xml.replace(/ ? <\/p>/g, "</p>");
xml = xml.replace(/([a-z]) ,/g, "$1,");
// The author often fails to terminate a sentence, instead using a comma after a dialogue tag. For example,
// > “I didnt get much done,” Greg said, “I got distracted by...
// This should instead be
// > “I didnt get much done,” Greg said. “I got distracted by...
//
// Our heuristic is to try to automatically fix this if the dialogue tag is two words (X said/admitted/sighed/etc.).
//
// This sometimes overcorrects, as in the following example:
// > “Basically,” Alec said, “For your powers to manifest, ...
// Here instead we should lowercase the "f". We handle that via one-offs in substitutions.json.
//
// This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of
// times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the
// capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized.
xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/g, ",” $1. “$2");
// Replace single-word <i>s with <em>s. Other <i>s are probably erroneous too, but these are known-bad.
xml = xml.replace(/<i>([A-Za-z]+)<\/i>/g, "<em>$1</em>");
return xml;
}
function fixCapitalization(xml, book) {
// This occurs enough times it's better to do here than in one-off fixes. We correct the single instance where
// it's incorrect to capitalize in the one-off fixes.
// Note that Ward contains much talk of "the clairvoyants", so we don't want to capitalize plurals.
@ -366,13 +437,6 @@ function getBodyXML(chapter, book, contentEl) {
// This is sometimes missing its capitalization.
xml = xml.replace(/the birdcage/g, "the Birdcage");
// This is usually spelled "TV" but sometimes the other ways. Normalize.
xml = xml.replace(/(\b)tv(\b)/g, "$1TV$2");
xml = xml.replace(/t\.v\./ig, "TV");
// This is commonly misspelled.
xml = xml.replace(/([Ss])houlderblade/g, "$1houlder blade");
// There's no reason why these should be capitalized. (Note that they never appear at the beginning of any sentences.)
xml = xml.replace(/Halberd/g, "halberd");
xml = xml.replace(/Loft/g, "loft");
@ -397,9 +461,6 @@ function getBodyXML(chapter, book, contentEl) {
"$1$2"
);
// This is consistently missing accents
xml = xml.replace(/Yangban/g, "Yàngbǎn");
// Place names need to always be capitalized
xml = xml.replace(/North end/g, "North End");
xml = xml.replace(/(Stonemast|Shale) avenue/g, "$1 Avenue");
@ -409,14 +470,48 @@ function getBodyXML(chapter, book, contentEl) {
xml = xml.replace(/the megalopolis/g, "the Megalopolis");
xml = xml.replace(/earths(?![a-z])/g, "Earths");
// These are usually not italicized, but sometimes are. Other foreign-language names (like Yàngbǎn) are not
// italicized, so we go in the direction of removing the italics.
xml = xml.replace(/<em>Garama<\/em>/g, "Garama");
xml = xml.replace(/<em>Thanda<\/em>/g, "Thanda");
xml = xml.replace(/<em>Sifara([^<]*)<\/em>/g, "Sifara$1");
xml = xml.replace(/<em>Moord Nag([^<]*)<\/em>/g, "Moord Nag$1");
xml = xml.replace(/<em>Califa de Perro([^<]*)<\/em>/g, "Califa de Perro$1");
xml = xml.replace(/<em>Turanta([^<]*)<\/em>/g, "Turanta$1");
// "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of
// instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in
// substitutions.json.
xml = xml.replace(/(?<!mom), dad(?![a-z])/g, ", Dad");
xml = xml.replace(/, mom(?![a-z-])/g, ", Mom");
return xml;
}
/**
 * Corrects a handful of recurring misspellings.
 *
 * @param {string} xml - The serialized chapter markup.
 * @returns {string} The markup with the known misspellings corrected.
 */
function fixMispellings(xml) {
  return xml
    // "shoulder blade" is two words; the capture group preserves the case of the leading letter.
    .replace(/([Ss])houlderblade/g, "$1houlder blade")
    // Preemptive(ly) is often (though not always) hyphenated in the source. It should not be.
    .replace(/([Pp])re-emptive/g, "$1reemptive");
}
/**
 * Inserts hyphens into compound modifiers that the source text tends to leave unhyphenated.
 *
 * @param {string} xml - The serialized chapter markup.
 * @returns {string} The markup with the hyphenation fixes applied.
 */
function fixHyphens(xml) {
  // Applied in order; the second rule depends on the output of the first.
  const hyphenationRules = [
    // "X-year-old" takes hyphens according to all grammar guides. The books are very inconsistent about this and
    // most often leave them out. The negative lookahead avoids touching e.g. "a year older".
    [/(\w+)[ -]year[ -]old(s?)(?!\w)/g, "$1-year-old$2"],
    // Suspended hyphen for ranges: "five or six-year-old" becomes "five- or six-year-old".
    [/(\w+) or (\w+)-year-old/g, "$1- or $2-year-old"],

    // Compounds that are consistently missing their hyphen.
    [/self destruct/g, "self-destruct"],
    [/life threatening/g, "life-threatening"],
    [/hard headed/g, "hard-headed"],
    [/shoulder mounted/g, "shoulder-mounted"],
    [/golden skinned/g, "golden-skinned"],
    [/creepy crawl/g, "creepy-crawl"],
    [/well armed/g, "well-armed"],
    [/able bodied/g, "able-bodied"]
  ];

  return hyphenationRules.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), xml);
}
function standardizeSpellings(xml) {
// This is usually spelled "TV" but sometimes the other ways. Normalize.
xml = xml.replace(/(\b)tv(\b)/g, "$1TV$2");
xml = xml.replace(/t\.v\./ig, "TV");
// "okay" is preferred to "ok" or "o.k.". This sometimes gets changed back via substitutions.json when people are
// writing notes and thus probably the intention was to be less formal. Also it seems per
@ -438,53 +533,7 @@ function getBodyXML(chapter, book, contentEl) {
// "gray" is the majority spelling, except for "greyhound"
xml = xml.replace(/(G|g)rey(?!hound)/g, "$1ray");
// "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of
// instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in
// substitutions.json.
xml = xml.replace(/(?<!mom), dad(?![a-z])/g, ", Dad");
xml = xml.replace(/, mom(?![a-z-])/g, ", Mom");
// These are consistently missing hyphens.
xml = xml.replace(/self destruct/g, "self-destruct");
xml = xml.replace(/life threatening/g, "life-threatening");
xml = xml.replace(/hard headed/g, "hard-headed");
xml = xml.replace(/shoulder mounted/g, "shoulder-mounted");
xml = xml.replace(/golden skinned/g, "golden-skinned");
xml = xml.replace(/creepy crawl/g, "creepy-crawl");
xml = xml.replace(/well armed/g, "well-armed");
xml = xml.replace(/able bodied/g, "able-bodied");
// Preemptive(ly) is often hyphenated (not always). It should not be.
xml = xml.replace(/([Pp])re-emptive/g, "$1reemptive");
// One-off fixes
(substitutions[chapter.url] || []).forEach(substitution => {
if (substitution.before) {
const indexOf = xml.indexOf(substitution.before);
if (indexOf === -1) {
warnings.push(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` +
`updated at the source, in which case, you should edit substitutions.json.`);
}
if (indexOf !== xml.lastIndexOf(substitution.before)) {
warnings.push(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` +
`Update substitutions.json for a more precise substitution.`);
}
xml = xml.replace(new RegExp(escapeRegExp(substitution.before)), substitution.after);
} else if (substitution.regExp) {
xml = xml.replace(new RegExp(substitution.regExp, "g"), substitution.replacement);
} else {
warnings.push(`Invalid substitution specified for ${chapter.url}`);
}
});
// Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a <html>.
// Use this opportunity to insert a comment pointing to the original URL, for reference.
xml = xml.replace(
/<body xmlns="http:\/\/www.w3.org\/1999\/xhtml">/,
`<body>\n<!-- ${chapter.url} -->\n`);
return { xml, warnings };
return xml;
}
function isEmptyOrGarbage(el) {

View file

@ -2448,12 +2448,6 @@
"after": "all the way up to the nape of my neck"
}
],
"https://parahumans.wordpress.com/2012/12/13/interlude-16-donation-bonus-2/": [
{
"before": "<i>“</i>",
"after": "”"
}
],
"https://parahumans.wordpress.com/2012/12/15/monarch-16-7/": [
{
"before": "Brockton bay",
@ -2516,6 +2510,10 @@
{
"before": "ten,” I asked. “Just",
"after": "ten,” I asked, “just"
},
{
"before": "weeks, <i>months. </i>Anticipating",
"after": "weeks, <em>months</em>. Anticipating"
}
],
"https://parahumans.wordpress.com/2013/01/08/migration-17-1/": [
@ -2654,7 +2652,7 @@
],
"https://parahumans.wordpress.com/2013/01/15/migration-17-8/": [
{
"before": "replied. <i>Need to talk about being more secure with our names. “</i>Whats going on?<i>“</i>",
"before": "replied. <i>Need to talk about being more secure with our names. “</i>Whats going on?",
"after": "replied. <em>Need to talk about being more secure with our names.</em> “Whats going on?”"
},
{
@ -3356,6 +3354,10 @@
{
"before": "arent allies.",
"after": "arent allies.”"
},
{
"before": "<i>Blameful? </i>“Guilty",
"after": "<em>Blameful?</em> “Guilty"
}
],
"https://parahumans.wordpress.com/2013/06/20/crushed-24-5/": [
@ -4547,6 +4549,10 @@
{
"before": "dangerous?” The recluse asked",
"after": "dangerous?” the recluse asked"
},
{
"before": "<i>Stop. </i>Please.",
"after": "<em>Stop.</em> Please."
}
],
"https://www.parahumans.net/2018/02/08/shade-4-4/": [