Moord Nag([^<]*)<\/em>/g, "Moord Nag$1");
+ xml = xml.replace(/Califa de Perro([^<]*)<\/em>/g, "Califa de Perro$1");
+ xml = xml.replace(/Turanta([^<]*)<\/em>/g, "Turanta$1");
+
+ return xml;
+}
+
+function fixEmDashes(xml) { // Rewrites hyphens and en dashes that are used as dashes into em dashes (—).
+ xml = xml.replace(/ – /g, "—"); // a spaced en dash becomes an em dash with no surrounding spaces
+ xml = xml.replace(/“((?:)?)-/g, "“$1—");
+ xml = xml.replace(/-[,.]?”/g, "—”"); // hyphen before a closing quote; a comma or period in between is dropped
+ xml = xml.replace(/-(!|\?)”/g, "—$1”"); // same position, but a "!" or "?" before the closing quote is kept
+ xml = xml.replace(/-[,.]?<\/em>”/g, "—”");
+ xml = xml.replace(/-“/g, "—”"); // NOTE(review): the opening quote comes out as a closing quote — presumably intended; confirm
+ xml = xml.replace(/-/g, "
—");
+ xml = xml.replace(/-<\/p>/g, "—
");
+ xml = xml.replace(/-<\/em><\/p>/g, "—");
+ xml = xml.replace(/\s?\s?–\s?\s?/g, "—"); // any remaining en dash, absorbing up to two whitespace characters per side
+ xml = xml.replace(/-\s\s?/g, "—"); // hyphen followed by one or two whitespace characters
+ xml = xml.replace(/\s?\s-/g, "—"); // hyphen preceded by one or two whitespace characters
+ xml = xml.replace(/\s+—”/g, "—”"); // drop whitespace in front of an em dash that ends a quotation
+ xml = xml.replace(/I-I/g, "I—I"); // the literal sequence "I-I" gets an em dash
+ xml = xml.replace(/I-uh/g, "I—uh"); // likewise for "I-uh"
+
+ return xml; // the text after all replacements above have run, in order
+}
+
+function enDashJointNames(xml) {
+ // Joint names should use en dashes (–) rather than hyphens. Each line below rewrites one known pairing.
+ xml = xml.replace(/Dallon-Pelham/g, "Dallon–Pelham");
+ xml = xml.replace(/Bet-Gimel/g, "Bet–Gimel");
+ xml = xml.replace(/Tristan-Capricorn/g, "Tristan–Capricorn");
+ xml = xml.replace(/Capricorn-Byron/g, "Capricorn–Byron");
+ xml = xml.replace(/Tristan-Byron/g, "Tristan–Byron");
+ xml = xml.replace(/Gimel-Europe/g, "Gimel–Europe");
+ xml = xml.replace(/G-N/g, "G–N"); // NOTE(review): no word boundaries, so "G-N" is rewritten wherever it occurs — confirm
+ xml = xml.replace(/Imp-Damsel/g, "Imp–Damsel");
+ xml = xml.replace(/Damsel-Ashley/g, "Damsel–Ashley");
+ xml = xml.replace(/Antares-Anelace/g, "Antares–Anelace");
+ xml = xml.replace(/Challenger-Gallant/g, "Challenger–Gallant");
+ xml = xml.replace(/Undersider(s?)-(Breakthrough|Ambassador)/g, "Undersider$1–$2"); // keeps the optional "s" and the matched name
+ xml = xml.replace(/Norwalk-Fairfield/g, "Norwalk–Fairfield");
+ xml = xml.replace(/East-West/g, "east–west"); // also lowercases both words
+ xml = xml.replace(/(Green|Yellow)-Black/g, "$1–Black"); // $1 carries over whichever colour matched
+ xml = xml.replace(/Creutzfeldt-Jakob/g, "Creutzfeldt–Jakob");
+ xml = xml.replace(/Astaroth-Nidhug/g, "Astaroth–Nidhug");
+ xml = xml.replace(/Capulet-Montague/g, "Capulet–Montague");
+ xml = xml.replace(/Weaver-Clockblocker/g, "Weaver–Clockblocker");
+ xml = xml.replace(/Alexandria-Pretender/g, "Alexandria–Pretender");
+ xml = xml.replace(/Night Hag-Nyx/g, "Night Hag–Nyx");
+ xml = xml.replace(/Crawler-Breed/g, "Crawler–Breed");
+ xml = xml.replace(/Simurgh-Myrddin-plant/g, "Simurgh–Myrddin–plant"); // three-part name: both hyphens are replaced
+ xml = xml.replace(/Armsmaster-Defiant/g, "Armsmaster–Defiant");
+
+ return xml; // the text with every listed pairing rewritten
+}
+
+function fixPossessives(xml) {
// Fix possessive of names ending in "s"
// Note: if the "s" is unvoiced, as in Marquis, then it doesn't get the second "s".
xml = xml.replace(/([^‘])Judas’([^s])/g, "$1Judas’s$2");
@@ -246,62 +394,15 @@ function getBodyXML(chapter, book, contentEl) {
xml = xml.replace(/([^‘])Ms. Stillons’([^s])/g, "$1Ms. Stillons’s$2");
xml = xml.replace(/([^‘])Chuckles’([^s])/g, "$1Chuckles’s$2");
- // Fixes dashes
- xml = xml.replace(/ – /g, "—");
- xml = xml.replace(/“((?:)?)-/g, "“$1—");
- xml = xml.replace(/-[,.]?”/g, "—”");
- xml = xml.replace(/-(!|\?)”/g, "—$1”");
- xml = xml.replace(/-[,.]?<\/em>”/g, "—”");
- xml = xml.replace(/-“/g, "—”");
- xml = xml.replace(/-/g, "
—");
- xml = xml.replace(/-<\/p>/g, "—
");
- xml = xml.replace(/-<\/em><\/p>/g, "—");
- xml = xml.replace(/\s?\s?–\s?\s?/g, "—");
- xml = xml.replace(/-\s\s?/g, "—");
- xml = xml.replace(/\s?\s-/g, "—");
- xml = xml.replace(/\s+—”/g, "—”");
- xml = xml.replace(/I-I/g, "I—I");
- xml = xml.replace(/I-uh/g, "I—uh");
-
- // "X-year-old" should use hyphens; all grammar guides agree. The books are very inconsistent but most often omit
- // them.
- xml = xml.replace(/(\w+)[ -]year[ -]old(s?)(?!\w)/g, "$1-year-old$2");
- xml = xml.replace(/(\w+) or (\w+)-year-old/g, "$1- or $2-year-old");
-
- // Fix missing spaces after commas
- xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/g, "$1, $2");
-
- // Joint names should use em dashes
- xml = xml.replace(/Dallon-Pelham/g, "Dallon–Pelham");
- xml = xml.replace(/Bet-Gimel/g, "Bet–Gimel");
- xml = xml.replace(/Tristan-Capricorn/g, "Tristan–Capricorn");
- xml = xml.replace(/Capricorn-Byron/g, "Capricorn–Byron");
- xml = xml.replace(/Tristan-Byron/g, "Tristan–Byron");
- xml = xml.replace(/Gimel-Europe/g, "Gimel–Europe");
- xml = xml.replace(/G-N/g, "G–N");
- xml = xml.replace(/Imp-Damsel/g, "Imp–Damsel");
- xml = xml.replace(/Damsel-Ashley/g, "Damsel–Ashley");
- xml = xml.replace(/Antares-Anelace/g, "Antares–Anelace");
- xml = xml.replace(/Challenger-Gallant/g, "Challenger–Gallant");
- xml = xml.replace(/Undersider(s?)-(Breakthrough|Ambassador)/g, "Undersider$1–$2");
- xml = xml.replace(/Norwalk-Fairfield/g, "Norwalk–Fairfield");
- xml = xml.replace(/East-West/g, "east–west");
- xml = xml.replace(/(Green|Yellow)-Black/g, "$1–Black");
- xml = xml.replace(/Creutzfeldt-Jakob/g, "Creutzfeldt–Jakob");
- xml = xml.replace(/Astaroth-Nidhug/g, "Astaroth–Nidhug");
- xml = xml.replace(/Capulet-Montague/g, "Capulet–Montague");
- xml = xml.replace(/Weaver-Clockblocker/g, "Weaver–Clockblocker");
- xml = xml.replace(/Alexandria-Pretender/g, "Alexandria–Pretender");
- xml = xml.replace(/Night Hag-Nyx/g, "Night Hag–Nyx");
- xml = xml.replace(/Crawler-Breed/g, "Crawler–Breed");
- xml = xml.replace(/Simurgh-Myrddin-plant/g, "Simurgh–Myrddin–plant");
- xml = xml.replace(/Armsmaster-Defiant/g, "Armsmaster–Defiant");
+ return xml;
+}
+function cleanSceneBreaks(xml) {
// Normalize scene breaks. <hr> would be more semantically appropriate, but loses the author's intent. This is
// especially the case in Ward, which uses a variety of different scene breaks.
+
xml = xml.replace(/]*)>■<\/p>/g, `
■
`);
- xml = xml.replace(/⊙<\/p>/g, `
⊙
`);
xml = xml.replace(/⊙<\/strong><\/p>/g, `⊙
`);
xml = xml.replace(/⊙<\/strong><\/em><\/p>/g,
`⊙
`);
@@ -311,40 +412,10 @@ function getBodyXML(chapter, book, contentEl) {
xml = xml.replace(/⊙ *⊙ *⊙ *⊙ *⊙<\/strong><\/p>/g,
`⊙ ⊙ ⊙ ⊙ ⊙
`);
- // Fix recurring miscapitalization with questions
- xml = xml.replace(/\?”\s\s?She asked/g, "?” she asked");
- xml = xml.replace(/\?”\s\s?He asked/g, "?” he asked");
-
- // Fix bad periods and spacing/markup surrounding them
- xml = xml.replace(/\.\.<\/p>/g, ".
");
- xml = xml.replace(/\.\.”<\/p>/g, ".”
");
- xml = xml.replace(/ \. /g, ". ");
- xml = xml.replace(/ \.<\/p>/g, ".
");
- xml = xml.replace(/\.\.\./g, "…");
-
- // Fix extra spaces
- xml = xml.replace(/ ? <\/p>/g, "");
- xml = xml.replace(/([a-z]) ,/g, "$1,");
-
- // The author often fails to terminate a sentence, instead using a comma after a dialogue tag. For example,
- // > “I didn’t get much done,” Greg said, “I got distracted by...
- // This should instead be
- // > “I didn’t get much done,” Greg said. “I got distracted by...
- //
- // Our heuristic is to try to automatically fix this if the dialogue tag is two words (X said/admitted/sighed/etc.).
- //
- // This sometimes overcorrects, as in the following example:
- // > “Basically,” Alec said, “For your powers to manifest, ...
- // Here instead we should lowercase the "f". We handle that via one-offs in substitutions.json.
- //
- // This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of
- // times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the
- // capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized.
- xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/g, ",” $1. “$2");
-
- // Replace single-word <i>s with <em>s. Other <i>s are probably erroneous too, but these are known-bad.
- xml = xml.replace(/([A-Za-z]+)<\/i>/g, "$1");
+ return xml;
+}
+function fixCapitalization(xml, book) {
// This occurs enough times it's better to do here than in one-off fixes. We correct the single instance where
// it's incorrect to capitalize in the one-off fixes.
// Note that Ward contains much talk of "the clairvoyants", so we don't want to capitalize plurals.
@@ -366,13 +437,6 @@ function getBodyXML(chapter, book, contentEl) {
// This is sometimes missing its capitalization.
xml = xml.replace(/the birdcage/g, "the Birdcage");
- // This is usually spelled "TV" but sometimes the other ways. Normalize.
- xml = xml.replace(/(\b)tv(\b)/g, "$1TV$2");
- xml = xml.replace(/t\.v\./ig, "TV");
-
- // This is commonly misspelled.
- xml = xml.replace(/([Ss])houlderblade/g, "$1houlder blade");
-
// There's no reason why these should be capitalized. (Note that they never appear at the beginning of any sentences.)
xml = xml.replace(/Halberd/g, "halberd");
xml = xml.replace(/Loft/g, "loft");
@@ -397,9 +461,6 @@ function getBodyXML(chapter, book, contentEl) {
"$1–$2"
);
- // This is consistently missing accents
- xml = xml.replace(/Yangban/g, "Yàngbǎn");
-
// Place names need to always be capitalized
xml = xml.replace(/North end/g, "North End");
xml = xml.replace(/(Stonemast|Shale) avenue/g, "$1 Avenue");
@@ -409,14 +470,48 @@ function getBodyXML(chapter, book, contentEl) {
xml = xml.replace(/the megalopolis/g, "the Megalopolis");
xml = xml.replace(/earths(?![a-z])/g, "Earths");
- // These are usually not italicized, but sometimes are. Other foreign-language names (like Yàngbǎn) are not
- // italicized, so we go in the direction of removing the italics.
- xml = xml.replace(/Garama<\/em>/g, "Garama");
- xml = xml.replace(/Thanda<\/em>/g, "Thanda");
- xml = xml.replace(/Sifara([^<]*)<\/em>/g, "Sifara$1");
- xml = xml.replace(/Moord Nag([^<]*)<\/em>/g, "Moord Nag$1");
- xml = xml.replace(/Califa de Perro([^<]*)<\/em>/g, "Califa de Perro$1");
- xml = xml.replace(/Turanta([^<]*)<\/em>/g, "Turanta$1");
+ // "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of
+ // instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in
+ // substitutions.json.
+ xml = xml.replace(/(? {
- if (substitution.before) {
- const indexOf = xml.indexOf(substitution.before);
- if (indexOf === -1) {
- warnings.push(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` +
- `updated at the source, in which case, you should edit substitutions.json.`);
- }
- if (indexOf !== xml.lastIndexOf(substitution.before)) {
- warnings.push(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` +
- `Update substitutions.json for a more precise substitution.`);
- }
-
- xml = xml.replace(new RegExp(escapeRegExp(substitution.before)), substitution.after);
- } else if (substitution.regExp) {
- xml = xml.replace(new RegExp(substitution.regExp, "g"), substitution.replacement);
- } else {
- warnings.push(`Invalid substitution specified for ${chapter.url}`);
- }
- });
-
- // Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a .
- // Use this opportunity to insert a comment pointing to the original URL, for reference.
- xml = xml.replace(
- //,
- `\n\n`);
-
- return { xml, warnings };
+ return xml;
}
function isEmptyOrGarbage(el) {
diff --git a/lib/substitutions.json b/lib/substitutions.json
index dcfbf93..ef2bc4f 100644
--- a/lib/substitutions.json
+++ b/lib/substitutions.json
@@ -2448,12 +2448,6 @@
"after": "all the way up to the nape of my neck"
}
],
- "https://parahumans.wordpress.com/2012/12/13/interlude-16-donation-bonus-2/": [
- {
- "before": "“",
- "after": "”"
- }
- ],
"https://parahumans.wordpress.com/2012/12/15/monarch-16-7/": [
{
"before": "Brockton bay",
@@ -2516,6 +2510,10 @@
{
"before": "ten,” I asked. “Just",
"after": "ten,” I asked, “just"
+ },
+ {
+ "before": "weeks, months. Anticipating",
+ "after": "weeks, months. Anticipating"
}
],
"https://parahumans.wordpress.com/2013/01/08/migration-17-1/": [
@@ -2654,7 +2652,7 @@
],
"https://parahumans.wordpress.com/2013/01/15/migration-17-8/": [
{
- "before": "replied. Need to talk about being more secure with our names. “What’s going on?“",
+ "before": "replied. Need to talk about being more secure with our names. “What’s going on?”",
"after": "replied. Need to talk about being more secure with our names. “What’s going on?”"
},
{
@@ -3356,6 +3354,10 @@
{
"before": "aren’t allies.",
"after": "aren’t allies.”"
+ },
+ {
+ "before": "Blameful? “Guilty",
+ "after": "Blameful? “Guilty"
}
],
"https://parahumans.wordpress.com/2013/06/20/crushed-24-5/": [
@@ -4547,6 +4549,10 @@
{
"before": "dangerous?” The recluse asked",
"after": "dangerous?” the recluse asked"
+ },
+ {
+ "before": "Stop. Please.",
+ "after": "Stop. Please."
}
],
"https://www.parahumans.net/2018/02/08/shade-4-4/": [