Compare commits

...

9 commits

Author SHA1 Message Date
Domenic Denicola
8f73d57a5a 5.1.0 2022-05-28 18:05:25 -04:00
Santiago Arambillete
b25a0d3a65
Add some fixes for Worm 2022-05-28 18:02:34 -04:00
Domenic Denicola
752c4c3916 Update dependencies 2022-05-28 17:21:28 -04:00
Domenic Denicola
43a80d5dd9 5.0.0 2022-01-22 18:50:33 -05:00
Domenic Denicola
7c9d96578b Use minipass-fetch instead of requisition
Requisition appears to be unmaintained and causes security alerts upon installation.
2022-01-22 18:50:18 -05:00
Domenic Denicola
ac56d1c4c7 Update dependencies and minimum Node 2022-01-22 18:33:02 -05:00
Evan Young
12394da334
Fix failure on convert in modern Node.js
Closes #30.
2022-01-22 13:39:12 -05:00
Domenic Denicola
c860873d78 Update dependencies 2021-02-15 22:05:38 -05:00
Domenic Denicola
360f108c1c Use @domenic/eslint-config 2021-02-15 22:05:38 -05:00
10 changed files with 3000 additions and 1618 deletions

View file

@ -1,259 +1,10 @@
{ {
"root": true, "root": true,
"extends": "@domenic",
"env": { "env": {
"node": true, "node": true
"es6": true
},
"parserOptions": {
"ecmaVersion": 2019
}, },
"rules": { "rules": {
// Possible errors "no-console": "off"
"no-await-in-loop": "off",
"comma-dangle": ["error", "never"],
"no-cond-assign": ["error", "except-parens"],
"no-console": "off",
"no-constant-condition": "error",
"no-control-regex": "error",
"no-debugger": "error",
"no-dupe-args": "error",
"no-dupe-keys": "error",
"no-duplicate-case": "error",
"no-empty": "error",
"no-empty-character-class": "error",
"no-ex-assign": "error",
"no-extra-boolean-cast": "error",
"no-extra-parens": ["error", "all", { "conditionalAssign": false, "nestedBinaryExpressions": false }],
"no-extra-semi": "error",
"no-func-assign": "error",
"no-inner-declarations": "off",
"no-invalid-regexp": "error",
"no-irregular-whitespace": "error",
"no-obj-calls": "error",
"no-prototype-builtins": "error",
"no-regex-spaces": "error",
"no-sparse-arrays": "error",
"no-template-curly-in-string": "error",
"no-unexpected-multiline": "error",
"no-unreachable": "error",
"no-unsafe-finally": "off",
"no-unsafe-negation": "error",
"use-isnan": "error",
"valid-jsdoc": "off",
"valid-typeof": "error",
// Best practices
"accessor-pairs": "error",
"array-callback-return": "error",
"block-scoped-var": "off",
"class-methods-use-this": "error",
"complexity": "off",
"consistent-return": "error",
"curly": ["error", "all"],
"default-case": "off",
"dot-location": ["error", "property"],
"dot-notation": "error",
"eqeqeq": "error",
"guard-for-in": "off",
"no-alert": "error",
"no-caller": "error",
"no-case-declarations": "error",
"no-div-regex": "off",
"no-else-return": "error",
"no-empty-function": "error",
"no-empty-pattern": "error",
"no-eq-null": "error",
"no-eval": "error",
"no-extend-native": "error",
"no-extra-bind": "error",
"no-extra-label": "error",
"no-fallthrough": "error",
"no-floating-decimal": "error",
"no-global-assign": "error",
"no-implicit-coercion": "error",
"no-implicit-globals": "error",
"no-implied-eval": "off",
"no-invalid-this": "error",
"no-iterator": "error",
"no-labels": ["error", { "allowLoop": true }],
"no-lone-blocks": "error",
"no-loop-func": "off",
"no-magic-numbers": "off",
"no-multi-spaces": "error",
"no-multi-str": "error",
"no-new": "error",
"no-new-func": "error",
"no-new-wrappers": "error",
"no-octal": "error",
"no-octal-escape": "error",
"no-param-reassign": "off",
"no-process-env": "error",
"no-proto": "error",
"no-redeclare": "error",
"no-restricted-properties": "off",
"no-return-assign": ["error", "except-parens"],
"no-return-await": "error",
"no-script-url": "off",
"no-self-assign": "error",
"no-self-compare": "error",
"no-sequences": "error",
"no-throw-literal": "error",
"no-unmodified-loop-condition": "error",
"no-unused-expressions": "error",
"no-unused-labels": "error",
"no-useless-call": "error",
"no-useless-concat": "error",
"no-useless-escape": "error",
"no-useless-return": "error",
"no-void": "error",
"no-warning-comments": "off",
"no-with": "error",
"radix": ["error", "as-needed"],
"require-await": "error",
"vars-on-top": "off",
"wrap-iife": ["error", "outside"],
"yoda": ["error", "never"],
// Strict Mode
"strict": ["error", "global"],
// Variables
"init-declarations": "off",
"no-catch-shadow": "error",
"no-delete-var": "error",
"no-label-var": "error",
"no-restricted-globals": "off",
"no-shadow": "error",
"no-shadow-restricted-names": "error",
"no-undef": "error",
"no-undef-init": "error",
"no-undefined": "off",
"no-unused-vars": "error",
"no-use-before-define": ["error", "nofunc"],
// Node.js and CommonJS
"callback-return": "off",
"global-require": "error",
"handle-callback-err": "error",
"no-mixed-requires": ["error", true],
"no-new-require": "error",
"no-path-concat": "error",
"no-process-exit": "error",
"no-restricted-imports": "off",
"no-restricted-modules": "off",
"no-sync": "off",
// Stylistic Issues
"array-bracket-spacing": ["error", "never"],
"block-spacing": ["error", "always"],
"brace-style": ["error", "1tbs", { "allowSingleLine": false }],
"camelcase": ["error", { "properties": "always" }],
"capitalized-comments": ["error", "always", { "ignoreConsecutiveComments": true }],
"comma-spacing": ["error", { "before": false, "after": true }],
"comma-style": ["error", "last"],
"computed-property-spacing": ["error", "never"],
"consistent-this": "off",
"eol-last": "error",
"func-call-spacing": ["error", "never"],
"func-name-matching": ["error", "always"],
"func-names": ["error", "never"],
"func-style": ["error", "declaration"],
"id-blacklist": "off",
"id-length": "off",
"id-match": "off",
"indent": ["error", 2, { "SwitchCase": 1 }],
"jsx-quotes": "off",
"key-spacing": ["error", { "beforeColon": false, "afterColon": true, "mode": "strict" }],
"keyword-spacing": ["error", { "before": true, "after": true }],
"line-comment-position": "off",
"linebreak-style": ["error", "unix"],
"lines-around-comment": "off",
"lines-around-directive": "off",
"max-depth": "off",
"max-len": ["error", 120, { "ignoreUrls": true }],
"max-lines": "off",
"max-nested-callbacks": "off",
"max-params": "off",
"max-statements": "off",
"max-statements-per-line": ["error", { "max": 1 }],
"multiline-ternary": "off",
"new-cap": "error",
"new-parens": "error",
"newline-after-var": "off",
"newline-before-return": "off",
"newline-per-chained-call": "off",
"no-array-constructor": "error",
"no-bitwise": "off",
"no-continue": "off",
"no-inline-comments": "off",
"no-lonely-if": "error",
"no-mixed-operators": "error",
"no-mixed-spaces-and-tabs": "error",
"no-multiple-empty-lines": "error",
"no-negated-condition": "off",
"no-nested-ternary": "error",
"no-new-object": "error",
"no-plusplus": "off",
"no-restricted-syntax": "off",
"no-tabs": "error",
"no-ternary": "off",
"no-trailing-spaces": "error",
"no-underscore-dangle": "off",
"no-unneeded-ternary": "error",
"no-whitespace-before-property": "error",
"object-curly-newline": ["error", { "multiline": true }],
"object-curly-spacing": ["error", "always"],
"object-property-newline": "off",
"one-var": ["error", { "initialized": "never" }],
"one-var-declaration-per-line": ["error", "initializations"],
"operator-assignment": ["error", "always"],
"operator-linebreak": ["error", "after"],
"padded-blocks": ["error", "never"],
"quote-props": ["error", "as-needed"],
"quotes": ["error", "double", { "avoidEscape": true, "allowTemplateLiterals": true }],
"require-jsdoc": "off",
"semi": ["error", "always"],
"semi-spacing": "error",
"sort-keys": "off",
"sort-vars": "off",
"space-before-blocks": ["error", "always"],
"space-before-function-paren": ["error", { "anonymous": "always", "named": "never" }],
"space-in-parens": ["error", "never"],
"space-infix-ops": "error",
"space-unary-ops": ["error", { "words": true, "nonwords": false }],
"spaced-comment": ["error", "always", { "markers": ["///"] }],
"unicode-bom": ["error", "never"],
"wrap-regex": "off",
// ECMAScript 6
"arrow-body-style": "off", // meh
"arrow-parens": ["error", "as-needed"],
"arrow-spacing": "error",
"constructor-super": "error",
"generator-star-spacing": ["error", "after"],
"no-class-assign": "error",
"no-confusing-arrow": "off",
"no-const-assign": "error",
"no-dupe-class-members": "error",
"no-duplicate-imports": "error",
"no-new-symbol": "error",
"no-this-before-super": "error",
"no-useless-computed-key": "error",
"no-useless-constructor": "error",
"no-useless-rename": "error",
"no-var": "error",
"object-shorthand": "error",
"prefer-arrow-callback": "error",
"prefer-const": "error",
"prefer-numeric-literals": "error",
"prefer-rest-params": "error",
"prefer-spread": "error",
"prefer-template": "off",
"require-yield": "error",
"rest-spread-spacing": ["error", "never"],
"sort-imports": "off",
"symbol-description": "error",
"template-curly-spacing": ["error", "never"],
"yield-star-spacing": ["error", "after"]
} }
} }

View file

@ -13,9 +13,9 @@ jobs:
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
- uses: actions/setup-node@v1 - uses: actions/setup-node@v2
with: with:
node-version: 14 node-version: 16
- run: npm install - run: npm install

View file

@ -4,7 +4,7 @@ Scrapes the web serial [_Worm_](https://parahumans.wordpress.com/) and its seque
## How to use ## How to use
First you'll need a modern version of [Node.js](https://nodejs.org/en/). Install whatever is current (not LTS); at least v12.10.0 is necessary. First you'll need a modern version of [Node.js](https://nodejs.org/en/). At least v16.13.2 is necessary.
Then, open a terminal ([Mac documentation](http://blog.teamtreehouse.com/introduction-to-the-mac-os-x-command-line), [Windows documentation](http://www.howtogeek.com/235101/10-ways-to-open-the-command-prompt-in-windows-10/)) and install the program by typing Then, open a terminal ([Mac documentation](http://blog.teamtreehouse.com/introduction-to-the-mac-os-x-command-line), [Windows documentation](http://www.howtogeek.com/235101/10-ways-to-open-the-command-prompt-in-windows-10/)) and install the program by typing

View file

@ -142,124 +142,125 @@ function getBodyXML(chapter, book, contentEl) {
let xml = xmlSerializer.serializeToString(bodyEl); let xml = xmlSerializer.serializeToString(bodyEl);
// Fix recurring strange pattern of extra <br> in <p>...<em>...<br>\n</em></p> // Fix recurring strange pattern of extra <br> in <p>...<em>...<br>\n</em></p>
xml = xml.replace(/<br \/>\s*<\/em><\/p>/g, "</em></p>"); xml = xml.replace(/<br \/>\s*<\/em><\/p>/ug, "</em></p>");
// Replace single-word <i>s with <em>s. Other <i>s are probably erroneous too, but these are known-bad. // Replace single-word <i>s with <em>s. Other <i>s are probably erroneous too, but these are known-bad.
xml = xml.replace(/<i>([^ ]+)<\/i>/g, "<em>$1</em>"); xml = xml.replace(/<i>([^ ]+)<\/i>/ug, "<em>$1</em>");
xml = xml.replace(/<i>([^ ]+)( +)<\/i>/g, "<em>$1</em>$2"); xml = xml.replace(/<i>([^ ]+)( +)<\/i>/ug, "<em>$1</em>$2");
// There are way too many nonbreaking spaces where they don't belong. If they show up three in a row, then let them // There are way too many nonbreaking spaces where they don't belong. If they show up three in a row, then let them
// live; they're maybe being used for alignment or something. Otherwise, they die. // live; they're maybe being used for alignment or something. Otherwise, they die.
// //
// Also, normalize spaces after a period/quote mark to two (normal) spaces. The second one is invisible when // Also, normalize spaces after a period/quote mark to two (normal) spaces. The second one is invisible when
// rendered, but it helps future heuristics detect end of sentences. // rendered, but it helps future heuristics detect end of sentences.
xml = xml.replace(/\xA0{1,2}(?!\x20\xA0)/g, " "); xml = xml.replace(/\xA0{1,2}(?!\x20\xA0)/ug, " ");
xml = xml.replace(/([.”])\x20*\xA0[\xA0\x20]*/g, "$1 "); xml = xml.replace(/([.”])\x20*\xA0[\xA0\x20]*/ug, "$1 ");
xml = xml.replace(/([.”])\x20{3,}/g, "$1 "); xml = xml.replace(/([.”])\x20{3,}/ug, "$1 ");
function fixEms() { function fixEms() {
// Fix recurring broken-up or erroneous <em>s // Fix recurring broken-up or erroneous <em>s
xml = xml.replace(/<\/em>s/g, "s</em>"); xml = xml.replace(/<\/em>s/ug, "s</em>");
xml = xml.replace(/<em><\/em>/g, ""); xml = xml.replace(/<em><\/em>/ug, "");
xml = xml.replace(/<\/em><em>/g, ""); xml = xml.replace(/<\/em><em>/ug, "");
xml = xml.replace(/<em>(\s?\s?[^A-Za-z]\s?\s?)<\/em>/g, "$1"); xml = xml.replace(/<em>(\s?\s?[^A-Za-z]\s?\s?)<\/em>/ug, "$1");
xml = xml.replace(/<\/em>(\s?\s?[^A-Za-z]\s?\s?)<em>/g, "$1"); xml = xml.replace(/<\/em>(\s?\s?[^A-Za-z]\s?\s?)<em>/ug, "$1");
xml = xml.replace(/“<em>([^>]+)<\/em>(!|\?|\.)”/g, "“<em>$1$2</em>”"); xml = xml.replace(/“<em>([^>]+)<\/em>(!|\?|\.)”/ug, "“<em>$1$2</em>”");
xml = xml.replace(/<p><em>([^>]+)<\/em>(!|\?|\.)<\/p>/g, "<p><em>$1$2</em></p>"); xml = xml.replace(/<p><em>([^>]+)<\/em>(!|\?|\.)<\/p>/ug, "<p><em>$1$2</em></p>");
xml = xml.replace(/(!|\?|\.)\s{2}<\/em><\/p>/g, "$1</em></p>"); xml = xml.replace(/(!|\?|\.)\s{2}<\/em><\/p>/ug, "$1</em></p>");
xml = xml.replace(/<em>([a-z]+)(\?|\.)<\/em>/g, "<em>$1</em>$2"); xml = xml.replace(/<em>([a-z]+)(\?|\.)<\/em>/ug, "<em>$1</em>$2");
xml = xml.replace(/<em>([^>]+?)( +)<\/em>/g, "<em>$1</em>$2"); xml = xml.replace(/<em>([^>]+?)( +)<\/em>/ug, "<em>$1</em>$2");
xml = xml.replace(/<em> ([a-zA-Z]+)<\/em>/g, " <em>$1</em>"); xml = xml.replace(/<em> ([a-zA-Z]+)<\/em>/ug, " <em>$1</em>");
xml = xml.replace(/<em>\s*([^<]+)\s*<\/em>/g, "<em>$1</em>"); xml = xml.replace(/<em>\s*([^<]+)\s*<\/em>/ug, "<em>$1</em>");
xml = xml.replace(/<em>\s*([^<]+)\s*<\/em>\s*/g, "<em>$1</em>"); xml = xml.replace(/<em>\s*([^<]+)\s*<\/em>\s*/ug, "<em>$1</em>");
xml = xml.replace(/\s*<em>\s*([^<]+)\s*<\/em>/g, "<em>$1</em>"); xml = xml.replace(/\s*<em>\s*([^<]+)\s*<\/em>/ug, "<em>$1</em>");
xml = xml.replace(/<em>“\s*([^<”]+)\s*”<\/em>/g, "“<em>$1</em>”"); xml = xml.replace(/<em>“\s*([^<”]+)\s*”<\/em>/ug, "“<em>$1</em>”");
xml = xml.replace(/<em>“\s*([^<”]+)\s*<\/em>\s*”/g, "“<em>$1</em>”"); xml = xml.replace(/<em>“\s*([^<”]+)\s*<\/em>\s*”/ug, "“<em>$1</em>”");
xml = xml.replace(/“\s*<em>\s*([^<”]+)\s*”<\/em>/g, "“<em>$1</em>”"); xml = xml.replace(/“\s*<em>\s*([^<”]+)\s*”<\/em>/ug, "“<em>$1</em>”");
xml = xml.replace(/([^\n>])<em> ?/g, "$1 <em>"); xml = xml.replace(/([^\n>])<em> ?/ug, "$1 <em>");
xml = xml.replace(/ ?<\/em>/g, "</em> "); xml = xml.replace(/ ?<\/em>/ug, "</em> ");
xml = xml.replace(/<p([^>]+)> <em>/g, "<p$1><em>"); xml = xml.replace(/<p([^>]+)> <em>/ug, "<p$1><em>");
xml = xml.replace(/<\/em> <\/p>/g, "</em></p>"); xml = xml.replace(/<\/em> <\/p>/ug, "</em></p>");
xml = xml.replace(/<em>([a-z]+),<\/em>/g, "<em>$1</em>,"); xml = xml.replace(/<em>([a-z]+),<\/em>/ug, "<em>$1</em>,");
} }
// These quote/apostrophe/em fixes interact with each other. TODO: try to disentangle so we don't repeat all of // These quote/apostrophe/em fixes interact with each other. TODO: try to disentangle so we don't repeat all of
// fixEms. // fixEms.
xml = xml.replace(/,” <\/em>/g, "</em>,” "); xml = xml.replace(/,” <\/em>/ug, "</em>,” ");
fixEms(); fixEms();
xml = xml.replace(/<p>”/g, "<p>“"); xml = xml.replace(/<p>”/ug, "<p>“");
xml = xml.replace(/“\s*<\/p>/g, "”</p>"); xml = xml.replace(/“\s*<\/p>/ug, "”</p>");
xml = xml.replace(/“\s*<\/em><\/p>/g, "</em>”</p>"); xml = xml.replace(/“\s*<\/em><\/p>/ug, "</em>”</p>");
xml = xml.replace(/\s*<\/p>/g, "</p>"); xml = xml.replace(/\s*<\/p>/ug, "</p>");
xml = xml.replace(/\s*<\/em><\/p>/g, "</em></p>"); xml = xml.replace(/\s*<\/em><\/p>/ug, "</em></p>");
xml = xml.replace(/,” <\/em>/g, "</em>,” "); xml = xml.replace(/,” <\/em>/ug, "</em>,” ");
xml = xml.replace(//g, ""); xml = xml.replace(//ug, "");
xml = xml.replace(/″/g, "”"); xml = xml.replace(/″/ug, "”");
xml = xml.replace(/([A-Za-z])s(\s?)/g, "$1s$2"); xml = xml.replace(/([A-Za-z])s(\s?)/ug, "$1s$2");
xml = xml.replace(/Im/g, "Im"); xml = xml.replace(/Im/ug, "Im");
xml = xml.replace(/<p>“\s+/g, "<p>“"); xml = xml.replace(/<p>“\s+/ug, "<p>“");
xml = xml.replace(/\s+”/g, "”"); xml = xml.replace(/\s+”/ug, "”");
xml = xml.replace(/'/g, ""); xml = xml.replace(/'/ug, "");
xml = xml.replace(/([A-Za-z]+)/g, "$1"); xml = xml.replace(/([A-Za-z]+)/ug, "$1");
xml = xml.replace(/([a-z])”<\/p>/g, "$1.”</p>"); xml = xml.replace(/([a-z])”<\/p>/ug, "$1.”</p>");
fixEms(); fixEms();
xml = xml.replace(/<em>([^<]+)<\/em>/g, "<em>$1</em>"); xml = xml.replace(/<em>([^<]+)<\/em>/ug, "<em>$1</em>");
xml = xml.replace(/<em>([a-z]+)!<\/em>/g, "<em>$1</em>!"); xml = xml.replace(/<em>([a-z]+)!<\/em>/ug, "<em>$1</em>!");
xml = xml.replace(/(?<! {2})<em>([\w ]+)([!.?])”<\/em>/g, "<em>$1</em>$2”"); xml = xml.replace(/(?<! {2})<em>([\w ]+)([!.?])”<\/em>/ug, "<em>$1</em>$2”");
xml = xml.replace(/<em>([\w ]+[!.?])”<\/em>/g, "<em>$1</em>”"); xml = xml.replace(/<em>([\w ]+[!.?])”<\/em>/ug, "<em>$1</em>”");
xml = xml.replace(/I”(m|ll)/g, "I$1"); xml = xml.replace(/I”(m|ll)/ug, "I$1");
xml = xml.replace(/””<\/p>/g, "”</p>"); xml = xml.replace(/””<\/p>/ug, "”</p>");
xml = xml.replace(/^([^“]+?) ?”(?![ —<])/gm, "$1 “"); xml = xml.replace(/^([^“]+?) ?”(?![ —<])/ugm, "$1 “");
xml = xml.replace(/(?<!“)<em>([A-Za-z]+),<\/em>(?!”| +[A-Za-z]+ thought)/, "<em>$1</em>,"); xml = xml.replace(/(?<!“)<em>([A-Za-z]+),<\/em>(?!”| +[A-Za-z]+ thought)/u, "<em>$1</em>,");
xml = xml.replace(/([Kk])ay(?!)/g, "$1ay"); xml = xml.replace(/([Kk])ay(?!)/ug, "$1ay");
xml = xml.replace(/<em>(Why|What|Who|How|Where|When)<\/em>\?/g, "<em>$1?</em>"); xml = xml.replace(/<em>(Why|What|Who|How|Where|When)<\/em>\?/ug, "<em>$1?</em>");
xml = xml.replace(/,<\/em>/g, "</em>,"); xml = xml.replace(/,<\/em>/ug, "</em>,");
xml = xml.replace(/,”<\/p>/g, ".”</p>"); xml = xml.replace(/,”<\/p>/ug, ".”</p>");
xml = xml.replace(/<p>(.*),<\/p>/g, "<p>$1.</p>"); xml = xml.replace(/<p>(.*),<\/p>/ug, "<p>$1.</p>");
xml = xml.replace(/(\w+)(\w+)/g, "$1$2"); xml = xml.replace(/(\w+)(\w+)/ug, "$1$2");
xml = xml.replace(/<em>([a-z]+), ([a-z]+)<\/em>/g, "<em>$1</em>, <em>$2</em>"); xml = xml.replace(/<em>([a-z]+), ([a-z]+)<\/em>/ug, "<em>$1</em>, <em>$2</em>");
// Similar problems occur in Ward with <b> and <strong> as do in Worm with <em>s // Similar problems occur in Ward with <b> and <strong> as do in Worm with <em>s
xml = xml.replace(/<b \/>/g, ""); xml = xml.replace(/<b \/>/ug, "");
xml = xml.replace(/<b>(\s*<br \/>\s*)<\/b>/g, "$1"); xml = xml.replace(/<b>(\s*<br \/>\s*)<\/b>/ug, "$1");
xml = xml.replace(/<strong>(\s*<br \/>\s*)<\/strong>/g, "$1"); xml = xml.replace(/<strong>(\s*<br \/>\s*)<\/strong>/ug, "$1");
xml = xml.replace(/<\/strong>(\s*)<strong>/g, "$1"); xml = xml.replace(/<\/strong>(\s*)<strong>/ug, "$1");
xml = xml.replace(/<strong>@<\/strong>/g, "@"); xml = xml.replace(/<strong>@<\/strong>/ug, "@");
xml = xml.replace(/<br \/>(\s*)<\/strong>/g, "</strong><br />$1"); xml = xml.replace(/<br \/>(\s*)<\/strong>/ug, "</strong><br />$1");
xml = xml.replace(/(\s*)<\/strong>/g, "</strong>$1"); xml = xml.replace(/(\s*)<\/strong>/ug, "</strong>$1");
xml = xml.replace(/><strong>(.*)<\/strong>:</g, "><strong>$1:</strong><"); xml = xml.replace(/><strong>(.*)<\/strong>:</ug, "><strong>$1:</strong><");
// No need for line breaks before paragraph ends or after paragraph starts // No need for line breaks before paragraph ends or after paragraph starts
// These often occur with the <br>s inside <b>/<strong>/<em>/<i> fixed above. // These often occur with the <br>s inside <b>/<strong>/<em>/<i> fixed above.
xml = xml.replace(/<br \/>\s*<\/p>/g, "</p>"); xml = xml.replace(/<br \/>\s*<\/p>/ug, "</p>");
xml = xml.replace(/<p><br \/>\s*/g, "<p>"); xml = xml.replace(/<p><br \/>\s*/ug, "<p>");
// This is another quote fix but it needs to happen after the line break deletion... so entangled, ugh. // This is another quote fix but it needs to happen after the line break deletion... so entangled, ugh.
xml = xml.replace(/<\/em>\s*“\s*<\/p>/g, "</em>”</p>"); xml = xml.replace(/<\/em>\s*“\s*<\/p>/ug, "</em>”</p>");
// Fix missing spaces after commas // Fix missing spaces after commas
xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/g, "$1, $2"); xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/ug, "$1, $2");
// Fix bad periods and spacing/markup surrounding them // Fix bad periods and spacing/markup surrounding them
xml = xml.replace(/\.\.<\/p>/g, ".</p>"); xml = xml.replace(/\.\.<\/p>/ug, ".</p>");
xml = xml.replace(/\.\.”<\/p>/g, ".”</p>"); xml = xml.replace(/\.\.”<\/p>/ug, ".”</p>");
xml = xml.replace(/ \. /g, ". "); xml = xml.replace(/ \. /ug, ". ");
xml = xml.replace(/ \.<\/p>/g, ".</p>"); xml = xml.replace(/ \.<\/p>/ug, ".</p>");
xml = xml.replace(/\.<em>\.\./g, "<em>…"); xml = xml.replace(/\.<em>\.\./ug, "<em>…");
xml = xml.replace(/\.\. {2}/g, ". "); xml = xml.replace(/\.\. {2}/ug, ". ");
xml = xml.replace(/\.\./g, "…"); xml = xml.replace(/\.\./ug, "…");
xml = xml.replace(/(?<!Mr|Ms|Mrs)…\./g, "…"); xml = xml.replace(/(?<!Mr|Ms|Mrs)…\./ug, "…");
xml = xml.replace(/(?<=Mr|Ms|Mrs)…\./g, ".…"); xml = xml.replace(/(?<=Mr|Ms|Mrs)…\./ug, ".…");
// Fix extra spaces // Fix extra spaces
xml = xml.replace(/ ? <\/p>/g, "</p>"); xml = xml.replace(/ ? <\/p>/ug, "</p>");
xml = xml.replace(/([a-z]) ,/g, "$1,"); xml = xml.replace(/([a-z]) ,/ug, "$1,");
// Use actual emojis instead of images // Use actual emojis instead of images
xml = xml.replace( xml = xml.replace(
// eslint-disable-next-line max-len // eslint-disable-next-line max-len
/<img width="16" height="16" class="wp-smiley emoji" draggable="false" alt="O_o" src="https:\/\/s1.wp.com\/wp-content\/mu-plugins\/wpcom-smileys\/o_O.svg" style="height: 1em; max-height: 1em;" \/>/g, /<img width="16" height="16" class="wp-smiley emoji" draggable="false" alt="O_o" src="https:\/\/s1.wp.com\/wp-content\/mu-plugins\/wpcom-smileys\/o_O.svg" style="height: 1em; max-height: 1em;" \/>/ug,
"🤨"); "🤨"
);
xml = fixTruncatedWords(xml); xml = fixTruncatedWords(xml);
xml = fixDialogueTags(xml); xml = fixDialogueTags(xml);
@ -288,9 +289,9 @@ function getBodyXML(chapter, book, contentEl) {
`Update substitutions.json for a more precise substitution.`); `Update substitutions.json for a more precise substitution.`);
} }
xml = xml.replace(new RegExp(escapeRegExp(substitution.before)), substitution.after); xml = xml.replace(new RegExp(escapeRegExp(substitution.before), "u"), substitution.after);
} else if (substitution.regExp) { } else if (substitution.regExp) {
xml = xml.replace(new RegExp(substitution.regExp, "g"), substitution.replacement); xml = xml.replace(new RegExp(substitution.regExp, "ug"), substitution.replacement);
} else { } else {
warnings.push(`Invalid substitution specified for ${chapter.url}`); warnings.push(`Invalid substitution specified for ${chapter.url}`);
} }
@ -299,30 +300,31 @@ function getBodyXML(chapter, book, contentEl) {
// Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a <html>. // Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a <html>.
// Use this opportunity to insert a comment pointing to the original URL, for reference. // Use this opportunity to insert a comment pointing to the original URL, for reference.
xml = xml.replace( xml = xml.replace(
/<body xmlns="http:\/\/www.w3.org\/1999\/xhtml">/, /<body xmlns="http:\/\/www.w3.org\/1999\/xhtml">/u,
`<body>\n<!-- ${chapter.url} -->\n`); `<body>\n<!-- ${chapter.url} -->\n`
);
return { xml, warnings }; return { xml, warnings };
} }
function fixTruncatedWords(xml) { function fixTruncatedWords(xml) {
xml = xml.replace(/Sup/g, "Sup"); xml = xml.replace(/Sup/ug, "Sup");
xml = xml.replace(/cuz/g, "cuz"); xml = xml.replace(/cuz/ug, "cuz");
// Short for "Sidepiece" // Short for "Sidepiece"
xml = xml.replace(/[][Pp]iece(?![a-z])/g, "Piece"); xml = xml.replace(/[][Pp]iece(?![a-z])/ug, "Piece");
// Short for "Disjoint" // Short for "Disjoint"
xml = xml.replace(/[][Jj]oint(?![a-z])/g, "Joint"); xml = xml.replace(/[][Jj]oint(?![a-z])/ug, "Joint");
// Short for "Contender" // Short for "Contender"
xml = xml.replace(/[][Tt]end(?![a-z])/g, "Tend"); xml = xml.replace(/[][Tt]end(?![a-z])/ug, "Tend");
// Short for "Anelace" // Short for "Anelace"
xml = xml.replace(/[][Ll]ace(?![a-z])/g, "Lace"); xml = xml.replace(/[][Ll]ace(?![a-z])/ug, "Lace");
// Short for "Birdcage" // Short for "Birdcage"
xml = xml.replace(/[][Cc]age(?![a-z])/g, "Cage"); xml = xml.replace(/[][Cc]age(?![a-z])/ug, "Cage");
// We can't do "Clear" (short for Crystalclear) here because it appears too much as a normal word preceded by an // We can't do "Clear" (short for Crystalclear) here because it appears too much as a normal word preceded by an
// open quote, so we do that in substitutions.json. // open quote, so we do that in substitutions.json.
@ -332,8 +334,8 @@ function fixTruncatedWords(xml) {
function fixDialogueTags(xml) { function fixDialogueTags(xml) {
// Fix recurring miscapitalization with questions // Fix recurring miscapitalization with questions
xml = xml.replace(/\?”\s\s?She asked/g, "?” she asked"); xml = xml.replace(/\?”\s\s?She asked/ug, "?” she asked");
xml = xml.replace(/\?”\s\s?He asked/g, "?” he asked"); xml = xml.replace(/\?”\s\s?He asked/ug, "?” he asked");
// The author often fails to terminate a sentence, instead using a comma after a dialogue tag. For example, // The author often fails to terminate a sentence, instead using a comma after a dialogue tag. For example,
// > “I didn’t get much done,” Greg said, “I got distracted by... // > “I didn’t get much done,” Greg said, “I got distracted by...
@ -349,105 +351,108 @@ function fixDialogueTags(xml) {
// This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of // This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of
// times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the // times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the
// capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized. // capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized.
xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/g, ",” $1. “$2"); xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/ug, ",” $1. “$2");
return xml; return xml;
} }
function fixForeignNames(xml) { function fixForeignNames(xml) {
// This is consistently missing diacritics // This is consistently missing diacritics
xml = xml.replace(/Yangban/g, "Yàngbǎn"); xml = xml.replace(/Yangban/ug, "Yàngbǎn");
// These are usually not italicized, but sometimes are. Other foreign-language names (like Yàngbǎn) are not // These are usually not italicized, but sometimes are. Other foreign-language names (like Yàngbǎn) are not
// italicized, so we go in the direction of removing the italics. // italicized, so we go in the direction of removing the italics.
xml = xml.replace(/<em>Garama<\/em>/g, "Garama"); xml = xml.replace(/<em>Garama<\/em>/ug, "Garama");
xml = xml.replace(/<em>Thanda<\/em>/g, "Thanda"); xml = xml.replace(/<em>Thanda<\/em>/ug, "Thanda");
xml = xml.replace(/<em>Sifara([^<]*)<\/em>/g, "Sifara$1"); xml = xml.replace(/<em>Sifara([^<]*)<\/em>/ug, "Sifara$1");
xml = xml.replace(/<em>Moord Nag([^<]*)<\/em>/g, "Moord Nag$1"); xml = xml.replace(/<em>Moord Nag([^<]*)<\/em>/ug, "Moord Nag$1");
xml = xml.replace(/<em>Califa de Perro([^<]*)<\/em>/g, "Califa de Perro$1"); xml = xml.replace(/<em>Califa de Perro([^<]*)<\/em>/ug, "Califa de Perro$1");
xml = xml.replace(/<em>Turanta([^<]*)<\/em>/g, "Turanta$1"); xml = xml.replace(/<em>Turanta([^<]*)<\/em>/ug, "Turanta$1");
return xml; return xml;
} }
function standardizeNames(xml) { function standardizeNames(xml) {
// 197 instances of "Mrs." to 21 of "Ms." // 197 instances of "Mrs." to 21 of "Ms."
xml = xml.replace(/Ms\. Yamada/g, "Mrs. Yamada"); xml = xml.replace(/Ms\. Yamada/ug, "Mrs. Yamada");
// 25 instances of "Amias" to 3 of "Amais" // 25 instances of "Amias" to 3 of "Amais"
xml = xml.replace(/Amais/g, "Amias"); xml = xml.replace(/Amais/ug, "Amias");
// 185 instances of Juliette to 4 of Juliet // 185 instances of Juliette to 4 of Juliet
xml = xml.replace(/Juliet(?=\b)/g, "Juliette"); xml = xml.replace(/Juliet(?=\b)/ug, "Juliette");
// Earlier chapters have a space; later ones do not. They're separate words, so side with the earlier chapters. // Earlier chapters have a space; later ones do not. They're separate words, so side with the earlier chapters.
// One location is missing the "k". // One location is missing the "k".
xml = xml.replace(/Crock? o[]Shit/g, "Crock o Shit"); xml = xml.replace(/Crock? o[]Shit/ug, "Crock o Shit");
// 5 instances of "Jotun" to 2 of "Jotunn" // 5 instances of "Jotun" to 2 of "Jotunn"
xml = xml.replace(/Jotunn/g, "Jotun"); xml = xml.replace(/Jotunn/ug, "Jotun");
// 13 instances of Elman to 1 of Elmann // 13 instances of Elman to 1 of Elmann
xml = xml.replace(/Elmann/g, "Elman"); xml = xml.replace(/Elmann/ug, "Elman");
// Thousands of instances of Tattletale to 4 instances of Tatteltale // Thousands of instances of Tattletale to 4 instances of Tatteltale
xml = xml.replace(/Tatteltale/g, "Tattletale"); xml = xml.replace(/Tatteltale/ug, "Tattletale");
// 73 instances of Über to 2 of Uber
xml = xml.replace(/Uber/ug, "Über");
return xml; return xml;
} }
function fixEmDashes(xml) { function fixEmDashes(xml) {
xml = xml.replace(/ /g, "—"); xml = xml.replace(/ /ug, "—");
xml = xml.replace(/“((?:<em>)?)-/g, "“$1—"); xml = xml.replace(/“((?:<em>)?)-/ug, "“$1—");
xml = xml.replace(/-[,.]?”/g, "—”"); xml = xml.replace(/-[,.]?”/ug, "—”");
xml = xml.replace(/-(!|\?)”/g, "—$1”"); xml = xml.replace(/-(!|\?)”/ug, "—$1”");
xml = xml.replace(/-[,.]?<\/([a-z]+)>”/g, "—</$1>”"); xml = xml.replace(/-[,.]?<\/([a-z]+)>”/ug, "—</$1>”");
xml = xml.replace(/-“/g, "—”"); xml = xml.replace(/-“/ug, "—”");
xml = xml.replace(/<p>-/g, "<p>—"); xml = xml.replace(/<p>-/ug, "<p>—");
xml = xml.replace(/-<\/p>/g, "—</p>"); xml = xml.replace(/-<\/p>/ug, "—</p>");
xml = xml.replace(/-<br \/>/g, "—<br />"); xml = xml.replace(/-<br \/>/ug, "—<br />");
xml = xml.replace(/-<\/([a-z]+)><\/p>/g, "—</$1></p>"); xml = xml.replace(/-<\/([a-z]+)><\/p>/ug, "—</$1></p>");
xml = xml.replace(/\s?\s?\s?\s?/g, "—"); xml = xml.replace(/\s?\s?\s?\s?/ug, "—");
xml = xml.replace(/-\s\s?/g, "—"); xml = xml.replace(/-\s\s?/ug, "—");
xml = xml.replace(/\s?\s-/g, "—"); xml = xml.replace(/\s?\s-/ug, "—");
xml = xml.replace(/\s+—”/g, "—”"); xml = xml.replace(/\s+—”/ug, "—”");
xml = xml.replace(/I-I/g, "I—I"); xml = xml.replace(/I-I/ug, "I—I");
xml = xml.replace(/I-uh/g, "I—uh"); xml = xml.replace(/I-uh/ug, "I—uh");
xml = xml.replace(/-\?/g, "—?"); xml = xml.replace(/-\?/ug, "—?");
return xml; return xml;
} }
/**
 * Replaces the hyphen in known joint names (two names or terms of equal weight joined together) with an en dash.
 *
 * The list is explicit rather than pattern-based because ordinary hyphenated words must be left alone.
 *
 * NOTE(review): the en dash is written as `\u2013` so that it survives tooling that drops non-ASCII punctuation;
 * confirm it matches the character used upstream.
 *
 * @param {string} xml - The chapter markup to clean up.
 * @returns {string} The markup with joint names using en dashes.
 */
function enDashJointNames(xml) {
  // Applied in order; each entry is [pattern, replacement].
  const jointNames = [
    [/Dallon-Pelham/ug, "Dallon\u2013Pelham"],
    [/Bet-Gimel/ug, "Bet\u2013Gimel"],
    // This previously produced "Bet…Gimel", silently renaming the first half of the pair.
    [/Cheit-Gimel/ug, "Cheit\u2013Gimel"],
    [/Tristan-Capricorn/ug, "Tristan\u2013Capricorn"],
    [/Capricorn-Byron/ug, "Capricorn\u2013Byron"],
    [/Tristan-Byron/ug, "Tristan\u2013Byron"],
    [/Gimel-Europe/ug, "Gimel\u2013Europe"],
    [/G-N/ug, "G\u2013N"],
    [/Imp-Damsel/ug, "Imp\u2013Damsel"],
    [/Damsel-Ashley/ug, "Damsel\u2013Ashley"],
    [/Antares-Anelace/ug, "Antares\u2013Anelace"],
    [/Challenger-Gallant/ug, "Challenger\u2013Gallant"],
    [/Undersider(s?)-(Breakthrough|Ambassador)/ug, "Undersider$1\u2013$2"],
    [/Norwalk-Fairfield/ug, "Norwalk\u2013Fairfield"],
    // NOTE(review): this one also lowercases the phrase; kept as-is on the assumption that it is deliberate.
    [/East-West/ug, "east\u2013west"],
    [/Creutzfeldt-Jakob/ug, "Creutzfeldt\u2013Jakob"],
    [/Astaroth-Nidhug/ug, "Astaroth\u2013Nidhug"],
    [/Capulet-Montague/ug, "Capulet\u2013Montague"],
    [/Weaver-Clockblocker/ug, "Weaver\u2013Clockblocker"],
    [/Alexandria-Pretender/ug, "Alexandria\u2013Pretender"],
    [/Night Hag-Nyx/ug, "Night Hag\u2013Nyx"],
    [/Crawler-Breed/ug, "Crawler\u2013Breed"],
    [/Simurgh-Myrddin-plant/ug, "Simurgh\u2013Myrddin\u2013plant"],
    [/Armsmaster-Defiant/ug, "Armsmaster\u2013Defiant"],
    [/Matryoshka-Valentin/ug, "Matryoshka\u2013Valentin"],
    [/Gaea-Eden/ug, "Gaea\u2013Eden"],
    [/([Aa])gent-parahuman/ug, "$1gent\u2013parahuman"],
    [/([Pp])arahuman-agent/ug, "$1arahuman\u2013agent"]
  ];

  for (const [pattern, replacement] of jointNames) {
    xml = xml.replace(pattern, replacement);
  }

  return xml;
}
@ -456,15 +461,15 @@ function fixPossessives(xml) {
// Fix possessive of names ending in "s". // Fix possessive of names ending in "s".
xml = xml.replace( xml = xml.replace(
// eslint-disable-next-line max-len // eslint-disable-next-line max-len
/(?<!)(Judas|Brutus|Jess|Aegis|Dauntless|Circus|Sirius|Brooks|Genesis|Atlas|Lucas|Gwerrus|Chris|Eligos|Animos|Mags|Huntress|Hephaestus|Lord of Loss|John Combs|Mama Mathers|Monokeros|Goddess|Boundless|Paris|Tress|Harris|Antares|Nieves|Backwoods|Midas|Mrs. Sims|Ms. Stillons|Chuckles|Amias|Semiramis|Mother of Mothers)(?!s)/g, /(?<!)(Judas|Brutus|Jess|Aegis|Dauntless|Circus|Sirius|Brooks|Genesis|Atlas|Lucas|Gwerrus|Chris|Eligos|Animos|Mags|Huntress|Hephaestus|Lord of Loss|John Combs|Mama Mathers|Monokeros|Goddess|Boundless|Paris|Tress|Harris|Antares|Nieves|Backwoods|Midas|Mrs. Sims|Ms. Stillons|Chuckles|Amias|Semiramis|Mother of Mothers)(?!s)/ug,
"$1s" "$1s"
); );
// Note: if the "s" is unvoiced, as in Marquis, then it doesn't get the second "s". // Note: if the "s" is unvoiced, as in Marquis, then it doesn't get the second "s".
xml = xml.replace(/Marquiss/g, "Marquis"); xml = xml.replace(/Marquiss/ug, "Marquis");
// This one is not just missing the extra "s"; it's often misplaced. // This one is not just missing the extra "s"; it's often misplaced.
xml = xml.replace(/Wardens/g, "Wardens"); xml = xml.replace(/Wardens/ug, "Wardens");
return xml; return xml;
} }
@ -473,16 +478,25 @@ function cleanSceneBreaks(xml) {
// Normalize scene breaks. <hr> would be more semantically appropriate, but loses the author's intent. This is // Normalize scene breaks. <hr> would be more semantically appropriate, but loses the author's intent. This is
// especially the case in Ward, which uses a variety of different scene breaks. // especially the case in Ward, which uses a variety of different scene breaks.
xml = xml.replace(/<p(?:[^>]*)>■<\/p>/g, `<p style="text-align: center;">■</p>`); xml = xml.replace(/<p(?:[^>]*)>■<\/p>/ug, `<p style="text-align: center;">■</p>`);
xml = xml.replace(/<p style="text-align: center;"><strong>⊙<\/strong><\/p>/g, `<p style="text-align: center;">⊙</p>`); xml = xml.replace(
xml = xml.replace(/<p style="text-align: center;"><em><strong>⊙<\/strong><\/em><\/p>/g, /<p style="text-align: center;"><strong>⊙<\/strong><\/p>/ug,
`<p style="text-align: center;">⊙</p>`); `<p style="text-align: center;">⊙</p>`
xml = xml.replace(/<p style="text-align: center;"><strong>⊙⊙<\/strong><\/p>/g, );
`<p style="text-align: center;">⊙</p>`); xml = xml.replace(
/<p style="text-align: center;"><em><strong>⊙<\/strong><\/em><\/p>/ug,
`<p style="text-align: center;">⊙</p>`
);
xml = xml.replace(
/<p style="text-align: center;"><strong>⊙⊙<\/strong><\/p>/ug,
`<p style="text-align: center;">⊙</p>`
);
xml = xml.replace(/<p style="text-align: center;"><strong>⊙ *⊙ *⊙ *⊙ *⊙<\/strong><\/p>/g, xml = xml.replace(
`<p style="text-align: center;">⊙ ⊙ ⊙ ⊙ ⊙</p>`); /<p style="text-align: center;"><strong>⊙ *⊙ *⊙ *⊙ *⊙<\/strong><\/p>/ug,
`<p style="text-align: center;">⊙ ⊙ ⊙ ⊙ ⊙</p>`
);
return xml; return xml;
} }
@ -491,57 +505,59 @@ function fixCapitalization(xml, book) {
// This occurs enough times it's better to do here than in one-off fixes. We correct the single instance where // This occurs enough times it's better to do here than in one-off fixes. We correct the single instance where
// it's incorrect to capitalize in the one-off fixes. // it's incorrect to capitalize in the one-off fixes.
// Note that Ward contains much talk of "the clairvoyants", so we don't want to capitalize plurals. // Note that Ward contains much talk of "the clairvoyants", so we don't want to capitalize plurals.
xml = xml.replace(/([Tt])he clairvoyant(?!s)/g, "$1he Clairvoyant"); xml = xml.replace(/([Tt])he clairvoyant(?!s)/ug, "$1he Clairvoyant");
// ReSound's name is sometimes miscapitalized. The word is never used in a non-name context. // ReSound's name is sometimes miscapitalized. The word is never used in a non-name context.
xml = xml.replace(/Resound/g, "ReSound"); xml = xml.replace(/Resound/ug, "ReSound");
// The Speedrunners team name is missing its capitalization a couple times. // The Speedrunners team name is missing its capitalization a couple times.
xml = xml.replace(/speedrunners/g, "Speedrunners"); xml = xml.replace(/speedrunners/ug, "Speedrunners");
// The Machine Army is missing its capitalization a couple times. // The Machine Army is missing its capitalization a couple times.
xml = xml.replace(/machine army/g, "Machine Army"); xml = xml.replace(/machine army/ug, "Machine Army");
// "patrol block" is capitalized three different ways: "patrol block", "Patrol block", and "Patrol Block". "patrol // "patrol block" is capitalized three different ways: "patrol block", "Patrol block", and "Patrol Block". "patrol
// group" is always lowercased. It seems like "Patrol" is a proper name, and is used as a capitalized modifier in // group" is always lowercased. It seems like "Patrol" is a proper name, and is used as a capitalized modifier in
// other contexts (e.g. Patrol leader). So let's standardize on "Patrol <lowercase>". // other contexts (e.g. Patrol leader). So let's standardize on "Patrol <lowercase>".
xml = xml.replace(/patrol (block|group|leader|guard|student|uniform|squad|soldier|officer|crew|girl|bus|training)/ig, xml = xml.replace(
(_, $1) => `Patrol ${$1.toLowerCase()}`); /patrol (block|group|leader|guard|student|uniform|squad|soldier|officer|crew|girl|bus|training)/uig,
(_, $1) => `Patrol ${$1.toLowerCase()}`
);
// This usually works in Ward (some instances corrected back in substitutions.json), and has a few false positives in // This usually works in Ward (some instances corrected back in substitutions.json), and has a few false positives in
// Worm, where it is never needed: // Worm, where it is never needed:
if (book === "ward") { if (book === "ward") {
xml = xml.replace(/the patrol(?!s|ling)/g, "the Patrol"); xml = xml.replace(/the patrol(?!s|ling)/ug, "the Patrol");
} }
// This is sometimes missing its capitalization. // This is sometimes missing its capitalization.
xml = xml.replace(/the birdcage/g, "the Birdcage"); xml = xml.replace(/the birdcage/ug, "the Birdcage");
// There's no reason why these should be capitalized. // There's no reason why these should be capitalized.
xml = xml.replace(/(?<! {2}|“|>)Halberd/g, "halberd"); xml = xml.replace(/(?<! {2}|“|>)Halberd/ug, "halberd");
xml = xml.replace(/(?<! {2}|“|>)Loft/g, "loft"); xml = xml.replace(/(?<! {2}|“|>)Loft/ug, "loft");
// These are treated as common nouns and not traditionally capitalized. "Krav Maga" remains capitalized, // These are treated as common nouns and not traditionally capitalized. "Krav Maga" remains capitalized,
// interestingly (according to dictionaries and Wikipedia). // interestingly (according to dictionaries and Wikipedia).
xml = xml.replace(/(?<! {2}|“|>)Judo/g, "judo"); xml = xml.replace(/(?<! {2}|“|>)Judo/ug, "judo");
xml = xml.replace(/(?<! {2}|“|>)Aikido/g, "aikido"); xml = xml.replace(/(?<! {2}|“|>)Aikido/ug, "aikido");
xml = xml.replace(/(?<! {2}|“|>)Karate/g, "karate"); xml = xml.replace(/(?<! {2}|“|>)Karate/ug, "karate");
xml = xml.replace(/(?<! {2}|“|>)Tae Kwon Do/g, "tae kwon do"); xml = xml.replace(/(?<! {2}|“|>)Tae Kwon Do/ug, "tae kwon do");
// There's no reason why university should be capitalized in most contexts, although sometimes it's used as part of // There's no reason why university should be capitalized in most contexts, although sometimes it's used as part of
// a compound noun or at the beginning of a sentence. // a compound noun or at the beginning of a sentence.
xml = xml.replace(/(?<! {2}|“|>|Cornell |Nilles )University(?! Road)/, "university"); xml = xml.replace(/(?<! {2}|“|>|Cornell |Nilles )University(?! Road)/ug, "university");
// Organ names (e.g. brain, arm) or scientific names are not capitalized, so the "corona pollentia" and friends should // Organ names (e.g. brain, arm) or scientific names are not capitalized, so the "corona pollentia" and friends should
// not be either. The books are inconsistent. // not be either. The books are inconsistent.
xml = xml.replace(/(?<! {2}|“|>|-)Corona/g, "corona"); xml = xml.replace(/(?<! {2}|“|>|-)Corona/ug, "corona");
xml = xml.replace(/Pollentia/g, "pollentia"); xml = xml.replace(/Pollentia/ug, "pollentia");
xml = xml.replace(/Radiata/g, "radiata"); xml = xml.replace(/Radiata/ug, "radiata");
xml = xml.replace(/Gemma/g, "gemma"); xml = xml.replace(/Gemma/ug, "gemma");
// We de-capitalize Valkyrie's "flock", since most uses are de-capitalized (e.g. the many instances in Gleaming // We de-capitalize Valkyrie's "flock", since most uses are de-capitalized (e.g. the many instances in Gleaming
// Interlude 9, or Dying 15.z). This is a bit surprising; it seems like an organization name. But I guess it's // Interlude 9, or Dying 15.z). This is a bit surprising; it seems like an organization name. But I guess it's
// informal. // informal.
xml = xml.replace(/(?<! {2}|“|>)Flock/g, "flock"); xml = xml.replace(/(?<! {2}|“|>)Flock/ug, "flock");
// Especially early in Worm, PRT designations are capitalized; they should not be. This fixes the cases where we // Especially early in Worm, PRT designations are capitalized; they should not be. This fixes the cases where we
// can be reasonably sure they don't start a sentence, although more specific instances are done in // can be reasonably sure they don't start a sentence, although more specific instances are done in
@ -553,64 +569,64 @@ function fixCapitalization(xml, book) {
// This also over-de-capitalizes "The Stranger" in Ward (a titan name). Those also get fixed in substitutions.json. // This also over-de-capitalizes "The Stranger" in Ward (a titan name). Those also get fixed in substitutions.json.
xml = xml.replace( xml = xml.replace(
// eslint-disable-next-line max-len // eslint-disable-next-line max-len
/(?<! {2}|“|>|\n|: )(Mover|Shaker|Brute|Breaker|Tinker|Blaster|Thinker|Striker|Changer|Trump|Stranger|Shifter|Shaper)(?! [A-Z])/g, /(?<! {2}|“|>|\n|: )(Mover|Shaker|Brute|Breaker|Tinker|Blaster|Thinker|Striker|Changer|Trump|Stranger|Shifter|Shaper)(?! [A-Z])/ug,
(_, designation) => designation.toLowerCase() (_, designation) => designation.toLowerCase()
); );
xml = xml.replace( xml = xml.replace(
/(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)-(\d+)/gi, /(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)-(\d+)/ugi,
"$1 $2" "$1 $2"
); );
xml = xml.replace( xml = xml.replace(
// eslint-disable-next-line max-len // eslint-disable-next-line max-len
/(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)[ -/](mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)/gi, /(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)[ -/](mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)/ugi,
"$1$2" "$1$2"
); );
// Capitalization is inconsistent, but shard names seems to usually be capitalized. // Capitalization is inconsistent, but shard names seems to usually be capitalized.
xml = xml.replace(/Grasping self/g, "Grasping Self"); xml = xml.replace(/Grasping self/ug, "Grasping Self");
xml = xml.replace(/Cloven stranger/g, "Cloven Stranger"); xml = xml.replace(/Cloven stranger/ug, "Cloven Stranger");
xml = xml.replace(/Princess shaper/g, "Princess Shaper"); xml = xml.replace(/Princess shaper/ug, "Princess Shaper");
xml = xml.replace(/Fragile one/g, "Fragile One"); xml = xml.replace(/Fragile one/ug, "Fragile One");
// Place names need to always be capitalized // Place names need to always be capitalized
xml = xml.replace(/North end/g, "North End"); xml = xml.replace(/North end/ug, "North End");
xml = xml.replace(/(Stonemast|Shale) avenue/g, "$1 Avenue"); xml = xml.replace(/(Stonemast|Shale) avenue/ug, "$1 Avenue");
xml = xml.replace(/(Lord|Slater) street/g, "$1 Street"); xml = xml.replace(/(Lord|Slater) street/ug, "$1 Street");
xml = xml.replace(/(Hollow|Cedar) point/g, "$1 Point"); xml = xml.replace(/(Hollow|Cedar) point/ug, "$1 Point");
xml = xml.replace(/(Norwalk|Fenway|Stratford) station/g, "$1 Station"); xml = xml.replace(/(Norwalk|Fenway|Stratford) station/ug, "$1 Station");
xml = xml.replace(/the megalopolis/g, "the Megalopolis"); xml = xml.replace(/the megalopolis/ug, "the Megalopolis");
xml = xml.replace(/earths(?![a-z])/g, "Earths"); xml = xml.replace(/earths(?![a-z])/ug, "Earths");
if (book === "ward") { if (book === "ward") {
xml = xml.replace(/the bunker/g, "the Bunker"); xml = xml.replace(/the bunker/ug, "the Bunker");
xml = xml.replace(/bunker/g, "Bunker"); xml = xml.replace(/bunker/ug, "Bunker");
} }
// "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of // "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of
// instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in // instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in
// substitutions.json. // substitutions.json.
xml = xml.replace(/(?<!mom), dad(?![a-z])/g, ", Dad"); xml = xml.replace(/(?<!mom), dad(?![a-z])/ug, ", Dad");
xml = xml.replace(/, mom(?![a-z-])/g, ", Mom"); xml = xml.replace(/, mom(?![a-z-])/ug, ", Mom");
// Similarly, specific aunts and uncles get capitalized when used as a title. These are often missed. // Similarly, specific aunts and uncles get capitalized when used as a title. These are often missed.
xml = xml.replace(/aunt Sarah/g, "Aunt Sarah"); xml = xml.replace(/aunt Sarah/ug, "Aunt Sarah");
xml = xml.replace(/aunt Fleur/g, "Aunt Fleur"); xml = xml.replace(/aunt Fleur/ug, "Aunt Fleur");
xml = xml.replace(/uncle Neil/g, "Uncle Neil"); xml = xml.replace(/uncle Neil/ug, "Uncle Neil");
// The majority of "Wardens headquarters" is lowercased, and always prefixed with "the", indicating it's not a proper // The majority of "Wardens headquarters" is lowercased, and always prefixed with "the", indicating it's not a proper
// place name. So we remove the capitalization in the few places where it does appear. // place name. So we remove the capitalization in the few places where it does appear.
xml = xml.replace(/Wardens Headquarters/g, "Wardens headquarters"); xml = xml.replace(/Wardens Headquarters/ug, "Wardens headquarters");
// Some style guides try to reserve capitalized "Nazi" for historical discussions of members of the Nazi party. This // Some style guides try to reserve capitalized "Nazi" for historical discussions of members of the Nazi party. This
// seems fuzzy when it comes to phrases like "neo-Nazi", and doesn't seem to be what the author is doing; the books // seems fuzzy when it comes to phrases like "neo-Nazi", and doesn't seem to be what the author is doing; the books
// are just plain inconsistent. So, let's standardize on always uppercasing. // are just plain inconsistent. So, let's standardize on always uppercasing.
xml = xml.replace(/(?<![a-z])nazi/g, "Nazi"); xml = xml.replace(/(?<![a-z])nazi/ug, "Nazi");
xml = xml.replace(/ Neo-/g, " neo-"); xml = xml.replace(/ Neo-/ug, " neo-");
// Style guides disagree on whether items like "english muffin", "french toast", and "french kiss" need their // Style guides disagree on whether items like "english muffin", "french toast", and "french kiss" need their
// adjective capitalized. The books mostly use lowercase, so let's stick with that. (substitutions.json corrects one // adjective capitalized. The books mostly use lowercase, so let's stick with that. (substitutions.json corrects one
// case of "French toast".) // case of "French toast".)
xml = xml.replace(/english(?! muffin)/g, "English"); xml = xml.replace(/english(?! muffin)/ug, "English");
xml = xml.replace(/(?<! {2})English muffin/g, "english muffin"); xml = xml.replace(/(?<! {2})English muffin/ug, "english muffin");
// I was very torn on what to do with capitalization for "Titan" and "Titans". In general you don't capitalize species // I was very torn on what to do with capitalization for "Titan" and "Titans". In general you don't capitalize species
// names or other classifications, e.g. style guides are quite clear you don't capitalize "gods". The author // names or other classifications, e.g. style guides are quite clear you don't capitalize "gods". The author
@ -622,34 +638,34 @@ function fixCapitalization(xml, book) {
// or "Kronos Titan".) // or "Kronos Titan".)
if (book === "ward") { if (book === "ward") {
// All plural discussions of "Titans" are after Sundown 17.y. // All plural discussions of "Titans" are after Sundown 17.y.
xml = xml.replace(/titans/g, "Titans"); xml = xml.replace(/titans/ug, "Titans");
// Since we can't safely change all instances of "titan", most are in substitutions.json. We can do a few here, // Since we can't safely change all instances of "titan", most are in substitutions.json. We can do a few here,
// though. // though.
xml = xml.replace(/dauntless titan/ig, "Dauntless Titan"); // Sometimes "Dauntless" isn't even capitalized. xml = xml.replace(/dauntless titan/uig, "Dauntless Titan"); // Sometimes "Dauntless" isn't even capitalized.
xml = xml.replace(/Kronos titan/g, "Kronos Titan"); xml = xml.replace(/Kronos titan/ug, "Kronos Titan");
} }
// For the giants, the prevailing usage seems to be to keep the term lowercase, but capitalize when used as a name. // For the giants, the prevailing usage seems to be to keep the term lowercase, but capitalize when used as a name.
xml = xml.replace(/(?<=Mathers |Goddess )giant/g, "Giant"); xml = xml.replace(/(?<=Mathers |Goddess )giant/ug, "Giant");
xml = xml.replace(/mother giant/ig, "Mother Giant"); xml = xml.replace(/mother giant/uig, "Mother Giant");
xml = xml.replace(/(?<! {2}|“|>)Giants/g, "giants"); xml = xml.replace(/(?<! {2}|“|>)Giants/ug, "giants");
return xml; return xml;
} }
/**
 * Corrects a handful of words and idioms that are misspelled in the source text.
 *
 * @param {string} xml - The chapter markup to clean up.
 * @returns {string} The markup with the misspellings corrected.
 */
function fixMispellings(xml) {
  // This is commonly misspelled. The capture keeps the original capitalization of the first letter.
  xml = xml.replace(/([Ss])houlderblade/ug, "$1houlder blade");

  // All dictionaries agree this is capitalized.
  xml = xml.replace(/u-turn/ug, "U-turn");

  // https://www.dictionary.com/browse/scot-free
  xml = xml.replace(/scott[ -]free/ug, "scot-free");

  // https://grammarist.com/idiom/change-tack/
  xml = xml.replace(/changed tacks/ug, "changed tack");

  return xml;
}
@ -657,110 +673,111 @@ function fixMispellings(xml) {
/**
 * Adds hyphens where compound words need them and removes hyphens where they do not belong.
 *
 * The passes run in order. Several are heuristics tuned to the books; they can over-correct, and the comments below
 * note where substitutions.json is expected to correct individual cases back.
 *
 * @param {string} xml - The chapter markup to clean up.
 * @returns {string} The markup with hyphenation normalized.
 */
function fixHyphens(xml) {
  // "X-year-old" should use hyphens; all grammar guides agree. The books are very inconsistent but most often omit
  // them.
  xml = xml.replace(/(\w+)[ -]year[ -]old(s?)(?!\w)/ug, "$1-year-old$2");
  xml = xml.replace(/(\w+) or (\w+)-year-old/ug, "$1- or $2-year-old");

  // Compound numbers from 21 through 99 must be hyphenated, but others should not be. Both "forty" and the
  // misspelling "fourty" are accepted; only the latter was matched before, so "forty two" was never hyphenated.
  xml = xml.replace(
    /(?<!\w)(twenty|thirty|forty|fourty|fifty|sixty|seventy|eighty|ninety) (one|two|three|four|five|six|seven|eight|nine)/uig,
    "$1-$2"
  );
  xml = xml.replace(/[- ]hundred-and-/ug, " hundred and ");
  xml = xml.replace(/(?<!-)(one|two|three|four|five|six|seven|eight|nine|twelve)-hundred/ug, "$1 hundred");
  xml = xml.replace(/(hundred|ninety)-percent(?!-)/ug, "$1 percent");

  // "red-haired", "long-haired", etc.: they all need hyphens
  xml = xml.replace(/ haired/ug, "-haired");

  // These are consistently missing hyphens. The capture preserves the capitalization of the first word; the
  // "life-threatening" rule used to lowercase it unconditionally.
  xml = xml.replace(/([Ll]ife) threatening/ug, "$1-threatening");
  xml = xml.replace(/([Hh]ard) headed/ug, "$1-headed");
  xml = xml.replace(/([Ss]houlder) mounted/ug, "$1-mounted");
  xml = xml.replace(/([Gg]olden) skinned/ug, "$1-skinned");
  xml = xml.replace(/([Cc]reepy) crawl/ug, "$1-crawl");
  xml = xml.replace(/([Ww]ell) armed/ug, "$1-armed");
  xml = xml.replace(/([Aa]ble) bodied/ug, "$1-bodied");
  xml = xml.replace(/([Ll]evel) headed/ug, "$1-headed");
  xml = xml.replace(/([Cc]lear) cut/ug, "$1-cut");
  xml = xml.replace(/([Vv]at) grown/ug, "$1-grown");
  xml = xml.replace(/([Ss]hell) shocked/ug, "$1-shocked");
  xml = xml.replace(/([Dd]og) tired/ug, "$1-tired");
  xml = xml.replace(/([Nn]ightmare) filled/ug, "$1-filled");
  xml = xml.replace(/([Oo]ne) sided/ug, "$1-sided");
  xml = xml.replace(/([Mm]edium) sized/ug, "$1-sized");
  xml = xml.replace(/([Tt]eary) eyed/ug, "$1-eyed");
  xml = xml.replace(/([Ww]orst) case scenario/ug, "$1-case scenario");
  xml = xml.replace(/([Ss]elf) (conscious|esteem|loathing|harm|destruct|preservation)/ug, "$1-$2");
  xml = xml.replace(/([Oo]ne|[Tt]wo|[Tt]hree|[Ff]our|[Ff]ourth) dimensional/ug, "$1-dimensional");
  xml = xml.replace(/\b([Oo]ne) on one\b/ug, "$1-on-one");

  // Preemptive(ly) is often hyphenated (not always). It should not be.
  xml = xml.replace(/([Pp])re-emptive/ug, "$1reemptive");

  // These should be hyphenated only when used as a verb. We correct those cases back in substitutions.json.
  xml = xml.replace(/fist-bump/ug, "fist bump");
  xml = xml.replace(/high-five/ug, "high five");

  // This should be hyphenated when used as an adjective (instead of an adverb or noun). I.e. it should be
  // "hand-to-hand combat", but "passed from hand to hand", and "capable in hand to hand". The following heuristic
  // works in the books.
  xml = xml.replace(/hand to hand(?= [a-z])/ug, "hand-to-hand");

  // This is usually wrong but sometimes correct. The lookarounds avoid specific cases where it's referring to an
  // actual second in a series of guesses.
  xml = xml.replace(/(?<!my |that )([Ss]econd) guess(?!es)/ug, "$1-guess");

  // When used as a phrase "just in case" gets no hyphens. When used as a noun or adjective it does. A couple of the
  // noun cases are missing one or both hyphens.
  xml = xml.replace(/([Aa]) just[ -]in case/ug, "$1 just-in-case");

  // When used as an adjective, it's hyphenated. It turns out most cases are as an adverb, so we only hyphenate
  // before the specific nouns it modifies in the books.
  xml = xml.replace(
    /face to face(?= meeting| hang-out| interaction| contact| conversation| confrontation| fight)/ug,
    "face-to-face"
  );

  // When used as an adjective, it's hyphenated. This heuristic works in the books.
  xml = xml.replace(/fight or flight(?= [a-z])/ug, "fight-or-flight");

  // This is usually correct but sometimes wrong.
  xml = xml.replace(/neo /ug, "neo-");

  return xml;
}
/**
 * Picks one spelling for words and abbreviations that the source text spells several different ways.
 *
 * @param {string} xml - The chapter markup to clean up.
 * @returns {string} The markup with spellings standardized.
 */
function standardizeSpellings(xml) {
  // This is usually spelled "TV" but sometimes the other ways. Normalize.
  xml = xml.replace(/\btv\b/ug, "TV");
  xml = xml.replace(/t\.v\./uig, "TV");

  // "okay" is preferred to "ok" or "o.k.". This sometimes gets changed back via substitutions.json when people are
  // writing notes and thus probably the intention was to be less formal. Also it seems per
  // https://en.wikipedia.org/wiki/A-ok the "A" in "A-okay" should be capitalized.
  //
  // Lookarounds are used instead of consuming the neighboring characters, so that back-to-back occurrences
  // ("ok ok") and occurrences at the very start or end of the input are all converted.
  xml = xml.replace(/Ok([,. ])/ug, "Okay$1");
  xml = xml.replace(/(?<![a-zA-Z])ok(?!a)/ug, "okay");
  xml = xml.replace(/(?<![a-zA-Z])o\.k\.(?!a)/ug, "okay");
  xml = xml.replace(/a-okay/ug, "A-okay");

  // Signal(l)ing/signal(l)ed are spelled both ways. Both are acceptable in English. Let's standardize on single-L.
  xml = xml.replace(/([Ss])ignall/ug, "$1ignal");

  // Clich(e|é) is spelled both ways. Let's standardize on including the accent.
  xml = xml.replace(/cliche/ug, "cliché");

  // T-shirt is usually spelled lowercase ("t-shirt"). Normalize the remaining instances, except after a double
  // space (presumably a sentence start, where the capital is kept).
  xml = xml.replace(/(?<! {2})T-shirt/ug, "t-shirt");

  // "gray" is the majority spelling, except for "greyhound"
  xml = xml.replace(/([Gg])rey(?!hound)/ug, "$1ray");

  // 12 instances of "Dragon-craft", 12 instances of "Dragon craft", 1 instance of "dragon craft"
  xml = xml.replace(/[Dd]ragon[ -](craft|mech)/ug, "Dragon-$1");

  // 88 instances of "A.I." to four of "AI"
  xml = xml.replace(/\bAI\b/ug, "A.I.");

  // 2 instances of "G.M." to one of "GM"
  xml = xml.replace(/\bGM\b/ug, "G.M.");

  return xml;
}
@ -780,12 +797,14 @@ function fixCaseNumbers(xml) {
// We standardize on "Case Fifty-Three"; although it isn't the most common, it seems best to treat these as proper // We standardize on "Case Fifty-Three"; although it isn't the most common, it seems best to treat these as proper
// nouns. // nouns.
xml = xml.replace(/case[ -](?:fifty[ -]three|53)(?!’)/ig, "Case Fifty-Three"); xml = xml.replace(/case[ -](?:fifty[ -]three|53)(?!’)/uig, "Case Fifty-Three");
xml = xml.replace(/case[ -](?:thirty[ -]two|53)(?!’)/ig, "Case Thirty-Two"); xml = xml.replace(/case[ -](?:thirty[ -]two|53)(?!’)/uig, "Case Thirty-Two");
xml = xml.replace(/case[ -](?:sixty[ -]nine|53)(?!’)/ig, "Case Sixty-Nine"); xml = xml.replace(/case[ -](?:sixty[ -]nine|53)(?!’)/uig, "Case Sixty-Nine");
xml = xml.replace(/(?<!in )case[ -](zero|one|two|three|four|twelve|fifteen|seventy|ninety)(?!-)/ig, xml = xml.replace(
(_, caseNumber) => "Case " + caseNumber[0].toUpperCase() + caseNumber.substring(1)); /(?<!in )case[ -](zero|one|two|three|four|twelve|fifteen|seventy|ninety)(?!-)/uig,
(_, caseNumber) => `Case ${caseNumber[0].toUpperCase()}${caseNumber.substring(1)}`
);
return xml; return xml;
} }
@ -799,7 +818,7 @@ function isEmptyOrGarbage(el) {
} }
function escapeRegExp(str) { function escapeRegExp(str) {
return str.replace(/[-[\]/{}()*+?.\\^$|]/g, "\\$&"); return str.replace(/[[\]/{}()*+?.\\^$|]/ug, "\\$&");
} }
function decodeCloudFlareEmail(hash) { function decodeCloudFlareEmail(hash) {

View file

@ -34,8 +34,8 @@ module.exports = async (cachePath, manifestPath, contentPath, book, concurrentJo
warnings.push(...await pool.exec("convertChapter", [chapter, book, inputPath, outputPath])); warnings.push(...await pool.exec("convertChapter", [chapter, book, inputPath, outputPath]));
const time = String(Math.round((performance.now() - start) / 1000)).padStart(3) + " s"; const seconds = String(Math.round((performance.now() - start) / 1000)).padStart(3);
progress.increment({ time }); progress.increment({ time: `${seconds} s` });
})); }));
pool.terminate(); pool.terminate();

View file

@ -1,7 +1,7 @@
"use strict"; "use strict";
const path = require("path"); const path = require("path");
const fs = require("fs").promises; const fs = require("fs").promises;
const request = require("requisition"); const fetch = require("minipass-fetch");
const { JSDOM } = require("jsdom"); const { JSDOM } = require("jsdom");
const FILENAME_PREFIX = "chapter"; const FILENAME_PREFIX = "chapter";
@ -87,7 +87,7 @@ function getChapterTitle(rawChapterDoc) {
// issues down the line where we remove spaces around em dashes during conversion.) In the future it might be nice to // issues down the line where we remove spaces around em dashes during conversion.) In the future it might be nice to
// have proper chapter titles, e.g. sections per arc with title pages and then just "1" or similar for the chapter. // have proper chapter titles, e.g. sections per arc with title pages and then just "1" or similar for the chapter.
// Until then this is reasonable and uniform. // Until then this is reasonable and uniform.
return rawChapterDoc.querySelector("h1.entry-title").textContent.replace(/ /, " "); return rawChapterDoc.querySelector("h1.entry-title").textContent.replace(/ /u, " ");
} }
function retry(times, fn) { function retry(times, fn) {
@ -113,7 +113,7 @@ async function downloadChapter(startingURL) {
const refreshMeta = dom.window.document.querySelector("meta[http-equiv=refresh]"); const refreshMeta = dom.window.document.querySelector("meta[http-equiv=refresh]");
if (refreshMeta) { if (refreshMeta) {
[, urlToFollow] = /\d+;url=(.*)/i.exec(refreshMeta.content); [, urlToFollow] = /\d+;url=(.*)/ui.exec(refreshMeta.content);
process.stdout.write(`\n Redirected to ${urlToFollow}... `); process.stdout.write(`\n Redirected to ${urlToFollow}... `);
dom.window.close(); dom.window.close();
} else { } else {
@ -126,7 +126,7 @@ async function downloadChapter(startingURL) {
function downloadWithRetry(url) { function downloadWithRetry(url) {
return retry(3, async () => { return retry(3, async () => {
const response = await request(url).redirects(10); const response = await fetch(url);
if (response.status !== 200) { if (response.status !== 200) {
throw new Error(`Response status for ${url} was ${response.status}`); throw new Error(`Response status for ${url} was ${response.status}`);
} }

View file

@ -101,6 +101,10 @@
"before": "I felt painfully conspicuous", "before": "I felt painfully conspicuous",
"after": "I felt painfully conspicuous." "after": "I felt painfully conspicuous."
}, },
{
"before": "Justin Beiber",
"after": "Justin Bieber"
},
{ {
"before": "Lisa said. “Entire", "before": "Lisa said. “Entire",
"after": "Lisa said, “entire" "after": "Lisa said, “entire"
@ -861,6 +865,10 @@
} }
], ],
"https://parahumans.wordpress.com/2012/03/08/interlude-7%C2%BD-bonus/": [ "https://parahumans.wordpress.com/2012/03/08/interlude-7%C2%BD-bonus/": [
{
"before": "she been blossoming",
"after": "she had been blossoming"
},
{ {
"before": "Get Ready!", "before": "Get Ready!",
"after": "Get ready!" "after": "Get ready!"
@ -1808,6 +1816,10 @@
"before": "Charlotte,” I frowned. “Look", "before": "Charlotte,” I frowned. “Look",
"after": "Charlotte,” I frowned, “look" "after": "Charlotte,” I frowned, “look"
}, },
{
"before": "non-sequitor",
"after": "non-sequitur"
},
{ {
"before": "Did they… was he", "before": "Did they… was he",
"after": "Did they… Was he" "after": "Did they… Was he"
@ -2177,6 +2189,11 @@
{ {
"before": "Grue,” Trickster said. “Get", "before": "Grue,” Trickster said. “Get",
"after": "Grue,” Trickster said, “get" "after": "Grue,” Trickster said, “get"
},
{
"before": "Nothing she can’t do outside the bubble",
"after": "Nothing she can do outside the bubble",
"_comment": "This doesn't make sense logically with 'can’t'."
} }
], ],
"https://parahumans.wordpress.com/2012/08/18/snare-13-7/": [ "https://parahumans.wordpress.com/2012/08/18/snare-13-7/": [
@ -2308,6 +2325,11 @@
} }
], ],
"https://parahumans.wordpress.com/2012/10/20/colony-15-2/": [ "https://parahumans.wordpress.com/2012/10/20/colony-15-2/": [
{
"before": "turned something",
"after": "taken something",
"_comment": "'turned' is repeated later in the sentence."
},
{ {
"before": "on,” Tattletale said. “Lets", "before": "on,” Tattletale said. “Lets",
"after": "on,” Tattletale said, “lets" "after": "on,” Tattletale said, “lets"
@ -2318,6 +2340,11 @@
} }
], ],
"https://parahumans.wordpress.com/2012/10/23/colony-15-3/": [ "https://parahumans.wordpress.com/2012/10/23/colony-15-3/": [
{
"before": "whether or not I agreed or not",
"after": "whether I agreed or not",
"_comment": "Alternatively, the second 'or not' could be deleted instead of the first."
},
{ {
"before": "Woah,” Regent said. “Relax", "before": "Woah,” Regent said. “Relax",
"after": "Woah,” Regent said, “relax" "after": "Woah,” Regent said, “relax"
@ -2329,6 +2356,11 @@
{ {
"before": "the street</p>", "before": "the street</p>",
"after": "the street.</p>" "after": "the street.</p>"
},
{
"before": "dishonest members",
"after": "dishonest member",
"_comment": "He's talking only about Skitter here."
} }
], ],
"https://parahumans.wordpress.com/2012/10/27/colony-15-4/": [ "https://parahumans.wordpress.com/2012/10/27/colony-15-4/": [
@ -2407,10 +2439,6 @@
{ {
"before": "better pictures of mom", "before": "better pictures of mom",
"after": "better pictures of Mom" "after": "better pictures of Mom"
},
{
"before": "five-hundred percent",
"after": "five hundred percent"
} }
], ],
"https://parahumans.wordpress.com/2012/11/20/interlude-15/": [ "https://parahumans.wordpress.com/2012/11/20/interlude-15/": [
@ -2489,6 +2517,11 @@
{ {
"before": "decisions: Holding back", "before": "decisions: Holding back",
"after": "decisions: holding back" "after": "decisions: holding back"
},
{
"before": "their positioning, they had planned this",
"after": "their positioning… They had planned this",
"_comment": "A comma seems like the wrong choice here."
} }
], ],
"https://parahumans.wordpress.com/2012/12/22/monarch-16-9/": [ "https://parahumans.wordpress.com/2012/12/22/monarch-16-9/": [
@ -2597,6 +2630,11 @@
{ {
"before": "GWER-.</p>", "before": "GWER-.</p>",
"after": "GWER—.</p>" "after": "GWER—.</p>"
},
{
"before": "the Birdcage",
"after": "the birdcage",
"_comment": "A literal birdcage, not the parahuman prison."
} }
], ],
"https://parahumans.wordpress.com/2013/01/12/migration-17-5/": [ "https://parahumans.wordpress.com/2013/01/12/migration-17-5/": [
@ -2762,6 +2800,10 @@
} }
], ],
"https://parahumans.wordpress.com/2013/02/05/monarch-18-6/": [ "https://parahumans.wordpress.com/2013/02/05/monarch-18-6/": [
{
"before": "he encouragement",
"after": "the encouragement"
},
{ {
"before": "okay", "before": "okay",
"after": "OK", "after": "OK",
@ -3036,6 +3078,10 @@
} }
], ],
"https://parahumans.wordpress.com/2013/03/23/chrysalis-20-2/": [ "https://parahumans.wordpress.com/2013/03/23/chrysalis-20-2/": [
{
"before": "something together something",
"after": "something together"
},
{ {
"before": "of anxiety</em>.", "before": "of anxiety</em>.",
"after": "of anxiety.</em>" "after": "of anxiety.</em>"
@ -3309,6 +3355,10 @@
} }
], ],
"https://parahumans.wordpress.com/2013/05/21/interlude-22-donation-bonus-1/": [ "https://parahumans.wordpress.com/2013/05/21/interlude-22-donation-bonus-1/": [
{
"before": "Sumimasen deshita",
"after": "<i lang=\"ja-JP\">Sumimasen deshita</i>"
},
{ {
"before": "“is it reassuring", "before": "“is it reassuring",
"after": "“Is it reassuring" "after": "“Is it reassuring"
@ -3341,6 +3391,10 @@
"before": "of the ship. Its", "before": "of the ship. Its",
"after": "of the ship. Its" "after": "of the ship. Its"
}, },
{
"before": "supervillain-turned hero",
"after": "supervillain-turned-hero"
},
{ {
"before": "Las Vegas Rogue", "before": "Las Vegas Rogue",
"after": "Las Vegas rogue" "after": "Las Vegas rogue"
@ -3363,6 +3417,10 @@
} }
], ],
"https://parahumans.wordpress.com/2013/05/30/drone-23-3/": [ "https://parahumans.wordpress.com/2013/05/30/drone-23-3/": [
{
"before": "nerf",
"after": "Nerf"
},
{ {
"before": "Nine,” Glenn said. “I", "before": "Nine,” Glenn said. “I",
"after": "Nine,” Glenn said, “I" "after": "Nine,” Glenn said, “I"
@ -3374,6 +3432,10 @@
} }
], ],
"https://parahumans.wordpress.com/2013/06/01/drone-23-4/": [ "https://parahumans.wordpress.com/2013/06/01/drone-23-4/": [
{
"before": "FIrst",
"after": "First"
},
{ {
"before": "said. Someone", "before": "said. Someone",
"after": "said. “Someone" "after": "said. “Someone"
@ -3390,6 +3452,10 @@
} }
], ],
"https://parahumans.wordpress.com/2013/06/06/interlude-23/": [ "https://parahumans.wordpress.com/2013/06/06/interlude-23/": [
{
"before": "one—or two-word",
"after": "one- or two-word"
},
{ {
"before": "her mouth: A click", "before": "her mouth: A click",
"after": "her mouth: a click" "after": "her mouth: a click"
@ -3489,6 +3555,15 @@
} }
], ],
"https://parahumans.wordpress.com/2013/07/02/scarab-25-2/": [ "https://parahumans.wordpress.com/2013/07/02/scarab-25-2/": [
{
"before": "Enough or three",
"after": "Enough for three"
},
{
"before": "—3",
"after": "-3",
"_comment": "This is a computer display and thus probably uses a regular hyphen for the minus sign."
},
{ {
"before": "said. Except without", "before": "said. Except without",
"after": "said. “Except without" "after": "said. “Except without"
@ -3564,6 +3639,11 @@
{ {
"before": "<span style=\"text-decoration:underline;\"><strong>Lord</strong></span><span style=\"text-decoration:underline;\"> <strong>Walston</strong></span>", "before": "<span style=\"text-decoration:underline;\"><strong>Lord</strong></span><span style=\"text-decoration:underline;\"> <strong>Walston</strong></span>",
"after": "<span style=\"text-decoration:underline;\"><strong>Lord Walston</strong></span>" "after": "<span style=\"text-decoration:underline;\"><strong>Lord Walston</strong></span>"
},
{
"before": "—16",
"after": "-16",
"_comment": "This is a computer display and thus probably uses a regular hyphen for the minus sign."
} }
], ],
"https://parahumans.wordpress.com/2013/07/18/sting-26-1/": [ "https://parahumans.wordpress.com/2013/07/18/sting-26-1/": [
@ -3581,6 +3661,11 @@
} }
], ],
"https://parahumans.wordpress.com/2013/07/20/sting-26-2/": [ "https://parahumans.wordpress.com/2013/07/20/sting-26-2/": [
{
"before": "It didn’t hurt that",
"after": "It didn’t help that",
"_comment": "This makes more sense with 'help'"
},
{ {
"before": "And gauntlet</em>, to reply to the last one", "before": "And gauntlet</em>, to reply to the last one",
"after": "And gauntlet, to reply to the last one</em>" "after": "And gauntlet, to reply to the last one</em>"
@ -3792,6 +3877,10 @@
} }
], ],
"https://parahumans.wordpress.com/2013/08/31/cockroaches-28-1/": [ "https://parahumans.wordpress.com/2013/08/31/cockroaches-28-1/": [
{
"before": "“No, I said.",
"after": "“No,” I said."
},
{ {
"before": "could save issue", "before": "could save issue",
"after": "could say issue" "after": "could say issue"
@ -3860,6 +3949,10 @@
"before": "I’m <em>not? Fuck</em>. There’s", "before": "I’m <em>not? Fuck</em>. There’s",
"after": "I’m <em>not</em>? <em>Fuck.</em> There’s" "after": "I’m <em>not</em>? <em>Fuck.</em> There’s"
}, },
{
"before": "I <em>don’t know</em>,”",
"after": "“I <em>don’t know</em>,”"
},
{ {
"before": "I suspect It’s a", "before": "I suspect It’s a",
"after": "I suspect it’s a" "after": "I suspect it’s a"
@ -4386,7 +4479,7 @@
}, },
{ {
"before": "that because I cant isnt an", "before": "that because I cant isnt an",
"after": "that because I cant isnt an" "after": "that because I can isnt an"
} }
], ],
"https://parahumans.wordpress.com/2013/11/19/interlude-end/": [ "https://parahumans.wordpress.com/2013/11/19/interlude-end/": [

View file

@ -14,7 +14,7 @@ const zip = require("./zip.js");
const OUTPUT_DEFAULT = "(Book name).epub"; const OUTPUT_DEFAULT = "(Book name).epub";
const argv = yargs const { argv } = yargs
.usage(`${packageJson.description}\n\n${packageJson.name} [<command1> [<command2> [<command3> ...]]]\n\n` + .usage(`${packageJson.description}\n\n${packageJson.name} [<command1> [<command2> [<command3> ...]]]\n\n` +
"Each command will fail if the previously-listed one has not yet been run (with matching options).\n\n" + "Each command will fail if the previously-listed one has not yet been run (with matching options).\n\n" +
"Running with no commands is equivalent to running download convert scaffold zip.") "Running with no commands is equivalent to running download convert scaffold zip.")
@ -61,8 +61,7 @@ const argv = yargs
}) })
.recommendCommands() .recommendCommands()
.help() .help()
.version() .version();
.argv;
const outputFilename = argv.out === OUTPUT_DEFAULT ? `${books[argv.book].title}.epub` : argv.out; const outputFilename = argv.out === OUTPUT_DEFAULT ? `${books[argv.book].title}.epub` : argv.out;
@ -82,13 +81,13 @@ if (argv._.length === 0) {
} }
if (argv._.includes("download")) { if (argv._.includes("download")) {
const startURL = books[argv.book].startURL; const { startURL } = books[argv.book];
commands.push(() => download(startURL, cachePath, manifestPath)); commands.push(() => download(startURL, cachePath, manifestPath));
} }
if (argv._.includes("convert")) { if (argv._.includes("convert")) {
commands.push(() => { commands.push(() => {
return fs.rmdir(chaptersPath, { recursive: true, maxRetries: 3 }) return fs.rm(chaptersPath, { force: true, recursive: true, maxRetries: 3 })
.then(() => fs.mkdir(chaptersPath, { recursive: true })) .then(() => fs.mkdir(chaptersPath, { recursive: true }))
.then(() => convert(cachePath, manifestPath, chaptersPath, argv.book, argv.jobs)); .then(() => convert(cachePath, manifestPath, chaptersPath, argv.book, argv.jobs));
}); });
@ -97,7 +96,13 @@ if (argv._.includes("convert")) {
if (argv._.includes("scaffold")) { if (argv._.includes("scaffold")) {
const bookInfo = books[argv.book]; const bookInfo = books[argv.book];
commands.push(() => scaffold( commands.push(() => scaffold(
scaffoldingPath, coverPath, stagingPath, contentPath, chaptersPath, manifestPath, bookInfo scaffoldingPath,
coverPath,
stagingPath,
contentPath,
chaptersPath,
manifestPath,
bookInfo
)); ));
} }

3625
npm-shrinkwrap.json generated

File diff suppressed because it is too large Load diff

View file

@ -8,7 +8,7 @@
"parahuman", "parahuman",
"scraper" "scraper"
], ],
"version": "4.12.1", "version": "5.1.0",
"author": "Domenic Denicola <d@domenic.me> (https://domenic.me/)", "author": "Domenic Denicola <d@domenic.me> (https://domenic.me/)",
"license": "WTFPL", "license": "WTFPL",
"repository": "domenic/worm-scraper", "repository": "domenic/worm-scraper",
@ -23,18 +23,19 @@
"lint": "eslint lib" "lint": "eslint lib"
}, },
"dependencies": { "dependencies": {
"archiver": "^5.1.0", "archiver": "^5.3.1",
"cli-progress": "^3.8.2", "cli-progress": "^3.11.1",
"cpr": "^3.0.1", "cpr": "^3.0.1",
"jsdom": "^16.4.0", "jsdom": "^19.0.0",
"requisition": "^1.5.0", "minipass-fetch": "^2.1.0",
"workerpool": "^6.0.3", "workerpool": "^6.2.1",
"yargs": "^16.2.0" "yargs": "^17.5.1"
}, },
"devDependencies": { "devDependencies": {
"eslint": "^7.16.0" "@domenic/eslint-config": "^2.0.0",
"eslint": "^8.16.0"
}, },
"engines": { "engines": {
"node": ">=12.10.0" "node": ">=16.13.2"
} }
} }