Compare commits

...

152 commits

Author SHA1 Message Date
Domenic Denicola
8f73d57a5a 5.1.0 2022-05-28 18:05:25 -04:00
Santiago Arambillete
b25a0d3a65
Add some fixes for Worm 2022-05-28 18:02:34 -04:00
Domenic Denicola
752c4c3916 Update dependencies 2022-05-28 17:21:28 -04:00
Domenic Denicola
43a80d5dd9 5.0.0 2022-01-22 18:50:33 -05:00
Domenic Denicola
7c9d96578b Use minipass-fetch instead of requisition
Requisition appears to be unmaintained and causes security alerts upon installation.
2022-01-22 18:50:18 -05:00
Domenic Denicola
ac56d1c4c7 Update dependencies and minimum Node 2022-01-22 18:33:02 -05:00
Evan Young
12394da334
Fix failure on convert in modern Node.js
Closes #30.
2022-01-22 13:39:12 -05:00
Domenic Denicola
c860873d78 Update dependencies 2021-02-15 22:05:38 -05:00
Domenic Denicola
360f108c1c Use @domenic/eslint-config 2021-02-15 22:05:38 -05:00
Domenic Denicola
f3893e2f3f 4.12.1 2021-01-17 17:04:56 -05:00
Domenic Denicola
32c76a59a5 Update CI cache key for source updates 2021-01-17 17:04:37 -05:00
Domenic Denicola
ed070daf9f Fix Ward substitutions for source updates
URLs seem to be relative now.
2021-01-17 17:04:37 -05:00
Domenic Denicola
96e3e837e6 Fix broken Ward last chapter downloading
It's not clear when this started happening, but the "Next Chapter" link in Last 20.e6 no longer works.
2021-01-17 17:04:37 -05:00
Domenic Denicola
bfdb9eadde Follow redirects during downloads
Fixes #25.
2021-01-17 17:04:37 -05:00
Domenic Denicola
48400e6f96
Run EPUBCheck on CI
This should help detect any issues such as those seen in #22.
2021-01-01 17:48:41 -05:00
Domenic Denicola
b94c33ea6f 4.12.0 2021-01-01 16:27:03 -05:00
Domenic Denicola
ba387d3555 Improve deletion of empty-ish elements
The previous heuristic of replacing them with a space character caused spaces to be inserted in the middle of words. Also, various cases were missed. This should help.
2021-01-01 16:24:33 -05:00
Domenic Denicola
a405adf6b7 Fix more "changed tack" misspellings 2021-01-01 16:23:05 -05:00
Domenic Denicola
0efaf38170 Fix more "scot-free" misspellings 2021-01-01 16:23:05 -05:00
Elaina Martineau
ab226686ae Spot fixes for all of Worm 2021-01-01 16:23:05 -05:00
Domenic Denicola
66f7856a0f 4.11.0 2020-12-31 14:22:50 -05:00
Domenic Denicola
848b090b0d Fix bad XHTML in Ward Sundown 17.6
Closes #22.
2020-12-31 14:22:10 -05:00
Domenic Denicola
db1b3d9e97 Do not compress the mimetype file
This is the last error (for Worm) noted via epubcheck, per #22.
2020-12-31 14:22:10 -05:00
Domenic Denicola
c6f7460b82 Tweaks to cover-related EPUB stuff
Helps with the validation errors noted in #22.
2020-12-31 14:22:10 -05:00
Domenic Denicola
3865ae0f5b Fix bad XHTML in Worm Cockroaches 28.2
Part of #22.
2020-12-31 14:22:10 -05:00
Domenic Denicola
d4663ed2e3 Replace WordPress <img>s with emoji
Noticed via the validation discussed in #22, which was failing on the draggable attribute.
2020-12-31 14:22:10 -05:00
Domenic Denicola
4a2d33f968 Fix bad XHTML in Worm Interlude 8 (Bonus)
Part of #22.
2020-12-31 14:22:09 -05:00
Domenic Denicola
07f3011423 4.10.0 2020-12-26 18:03:22 -05:00
Domenic Denicola
e1f9320524 Capitalize "Wildbow" in book metadata
Per Q2 on https://www.parahumans.net/f-a-q/ it seems to be capitalized like a name, not just at the beginning of sentences.
2020-12-26 17:58:59 -05:00
Domenic Denicola
18874a335c Add cover credits 2020-12-26 17:58:09 -05:00
Domenic Denicola
d5f3b58f0b Slightly improve program output
Most notably reduce the number of output lines per chapter in the download step.
2020-12-26 17:54:19 -05:00
Domenic Denicola
0be098ff16 Allow running with no commands 2020-12-26 17:43:42 -05:00
Domenic Denicola
54dc72b182 Update dependencies 2020-12-26 17:38:16 -05:00
Domenic Denicola
28d4c6927a Add a new cover for Worm 2020-12-26 17:31:16 -05:00
Domenic Denicola
cc2db87b58 4.9.0 2020-12-26 16:55:45 -05:00
Domenic Denicola
a89414392e Fix some em dashes in Ward Last 20.e5 2020-12-26 16:54:04 -05:00
Domenic Denicola
89789724d1 Hyphenate "worst-case scenario" 2020-12-26 16:54:04 -05:00
Domenic Denicola
5a25df658b Hyphenate "teary-eyed" 2020-12-26 16:54:04 -05:00
Domenic Denicola
f3366e8346 Hyphenate "medium-sized" 2020-12-26 16:54:04 -05:00
Domenic Denicola
0b4af123ab Hyphenate "one-sided" 2020-12-26 16:54:04 -05:00
Domenic Denicola
f5f0ba8e61 Spot fixes for Ward through Last 20.end 2020-12-26 16:54:04 -05:00
Domenic Denicola
08e0d0d9a8 De-italicize some commas 2020-12-26 16:54:04 -05:00
Domenic Denicola
5d9a031c02 De-capitalize "giants" but capitalize "X Giant" 2020-12-26 16:54:04 -05:00
Domenic Denicola
b652e3812b De-capitalize "flock" to match prevailing usage 2020-12-26 16:54:04 -05:00
Domenic Denicola
26a2b9c9b5 Hyphenate "fight or flight" when appropriate 2020-12-26 16:54:04 -05:00
Domenic Denicola
2fb075128a Hyphenate "face-to-face" when appropriate 2020-12-26 16:54:04 -05:00
Domenic Denicola
ed392d8b98 De-capitalize judo, aikido, karate, and tae kwon do 2020-12-26 16:54:04 -05:00
Domenic Denicola
d0c23d86fd Hyphenate "a just-in-case" 2020-12-26 16:54:04 -05:00
Domenic Denicola
369714f3d1 Spot fixes for Ward through Infrared 19.10 2020-12-26 16:54:04 -05:00
Domenic Denicola
06e43dcf16 Hyphenate "dog-tired" 2020-12-26 16:54:04 -05:00
Domenic Denicola
e1b59994f8 Capitalize "Uncle Neil" and "Aunt Fleur" 2020-12-26 16:54:04 -05:00
Domenic Denicola
b67f4032f5 Hyphenate "built-in" when appropriate 2020-12-26 16:54:04 -05:00
Domenic Denicola
651944b4da Hyphenate "second-guess" and derivatives, when appropriate 2020-12-26 16:54:04 -05:00
Domenic Denicola
44f2cc3c7b Hyphenate "shell-shocked" 2020-12-26 16:54:04 -05:00
Domenic Denicola
294cbb2e71 Hyphenate "vat-grown" 2020-12-26 16:54:04 -05:00
Domenic Denicola
6256b332cb Apply hyphenation fixes even to capitalized phrases 2020-12-26 16:54:03 -05:00
Domenic Denicola
bf8a1b325c Hyphenate self-preservation 2020-12-26 16:54:03 -05:00
Domenic Denicola
6f51bc6c9a Spot fixes for Ward through Infrared 19.8 2020-12-26 16:54:03 -05:00
Domenic Denicola
877beda733 4.8.0 2020-12-19 17:21:59 -05:00
Domenic Denicola
6ddde06817 De-capitalize "parahumans" 2020-12-19 17:19:36 -05:00
Domenic Denicola
aa9fc197e9 Hyphenate "hand-to-hand" 2020-12-19 17:19:36 -05:00
Domenic Denicola
a1c7f00b42 Capitalize "Machine Army" 2020-12-19 17:19:36 -05:00
Domenic Denicola
b9d396f6f5 Fix hyphenation for high five and fist bump 2020-12-19 17:19:36 -05:00
Domenic Denicola
1a0780bd7b Capitalize "Aunt Sarah" 2020-12-19 17:19:36 -05:00
Domenic Denicola
5b7ec80750 Spot fixes for Ward through Infrared 19.4 2020-12-19 17:19:36 -05:00
Domenic Denicola
442d245e2d Remove more non-breaking spaces
Also normalizes after-sentence spaces to two (normal) spaces, but that's not visible to readers.
2020-12-19 17:19:36 -05:00
Domenic Denicola
9fc36b813f Hyphenate "clear-cut" 2020-12-19 17:19:36 -05:00
Domenic Denicola
631417a530 Capitalize "Fragile One" 2020-12-19 17:19:36 -05:00
Domenic Denicola
8a9562e10e Hyphenate "level-headed" 2020-12-19 17:19:36 -05:00
Domenic Denicola
5ff6621b31 Hyphenate a variety of words starting with "self-"
Previously we only did self-conscious; this brings along self-esteem, self-loathing, and self-harm.
2020-12-19 17:19:36 -05:00
Domenic Denicola
3e06358fa2 Spot fixes for Ward through Sundown 17.z 2020-12-19 17:19:30 -05:00
Domenic Denicola
3aece3e05e Fix misspellings of "Tattletale" 2020-12-19 17:19:26 -05:00
Domenic Denicola
4908956f0c Consistently capitalize "Titan" 2020-12-19 17:18:38 -05:00
Domenic Denicola
730cc512e3 Capitalize "Titans" 2020-12-19 17:18:38 -05:00
Domenic Denicola
cc151355fd Re-capitalize "the Stranger" 2020-12-19 17:18:38 -05:00
Domenic Denicola
a86e21b846 Re-capitalize "Stranger Titan" 2020-12-19 17:18:38 -05:00
Domenic Denicola
fc641af4f5 4.7.0 2020-11-29 17:26:34 -05:00
Domenic Denicola
6e59181524 Standardize on "Juliette" instead of "Juliet" 2020-11-29 17:25:46 -05:00
Domenic Denicola
ecea0f5660 Hyphenate day-to-day when appropriate 2020-11-29 17:25:46 -05:00
Domenic Denicola
e3ef2e254a Always hyphenate "one-on-one" 2020-11-29 17:25:46 -05:00
Domenic Denicola
54f2f82650 De-capitalize "corona pollentia" and friends 2020-11-29 17:25:46 -05:00
Domenic Denicola
0c22f7df11 Hyphenate N-dimensional 2020-11-29 17:25:46 -05:00
Domenic Denicola
d12c48976b Fix opening quotes inside quotes to apostrophes instead 2020-11-29 17:25:46 -05:00
Domenic Denicola
2e4a3e56dc Hyphenate compound words ending in "haired" 2020-11-29 17:25:45 -05:00
Domenic Denicola
5c80327fef Spot fixes for Ward through From Within 16.z 2020-11-29 17:25:45 -05:00
Domenic Denicola
2412fa0375 Capitalize U-turn 2020-11-29 17:25:45 -05:00
Domenic Denicola
32817eb255 Fix capitalization and apostrophes for ’Cage 2020-11-29 17:25:45 -05:00
Domenic Denicola
c128712bb4 Change treatment of Teacher color team names
Previously I was treating them as "conflict or connection", trying to give them an en dash. But I think they're just compound adjectives, so let's revert to a hyphen-minus. Also fixes missing capitalization for one of them.
2020-11-29 17:25:45 -05:00
Domenic Denicola
d62739f6cd De-capitalize math 2020-11-29 17:25:45 -05:00
Domenic Denicola
f3063854ff De-capitalize season names 2020-11-29 17:25:45 -05:00
Domenic Denicola
1328dfd8e3 Standardize on "A.I." instead of "AI" 2020-11-29 17:25:45 -05:00
Domenic Denicola
5310672cc2 De-capitalize aunt and uncle where appropriate 2020-11-29 17:25:45 -05:00
Domenic Denicola
79efec7080 Fix end-of-line commas that should be periods. 2020-11-29 17:25:45 -05:00
Domenic Denicola
ee76715935 Stop over-italicizing commas 2020-11-29 17:25:45 -05:00
Domenic Denicola
2a09195471 Italicize question marks in single-question-word sentences 2020-11-29 17:25:45 -05:00
Domenic Denicola
4c1c7cd03d Remove extra spaces before closing quote marks 2020-11-29 17:25:45 -05:00
Domenic Denicola
de83a47bfa Spot fixes through Dying 15.z 2020-11-29 17:25:45 -05:00
Domenic Denicola
3ec6e36e34 Un-capitalize "Church" when appropriate 2020-11-29 17:25:45 -05:00
Domenic Denicola
70e1ff0281 Always hyphenate self-conscious and derivatives 2020-11-29 17:25:45 -05:00
Domenic Denicola
6751136bec Fix some instances of the possessive of Marquis 2020-11-29 17:25:45 -05:00
Domenic Denicola
1da99791d3 Replace hyphen-minus with em dash when preceding a question mark 2020-11-29 17:25:45 -05:00
Domenic Denicola
ea19dbb6c5 Standardize on "Jotun" instead of "Jotunn" 2020-11-29 17:25:45 -05:00
Domenic Denicola
424f13f169 Always capitalize "the Bunker" in Ward 2020-11-29 17:25:45 -05:00
Domenic Denicola
b613514994 Remove hyphenation around "hundred" and "percent" 2020-11-29 17:25:45 -05:00
Domenic Denicola
16ef1836da Hyphenate spelled out numbers from 11 through 99 2020-11-29 17:25:45 -05:00
Domenic Denicola
85b5e142da Spot fixes for Ward through Breaking 14.z 2020-11-29 17:25:45 -05:00
Domenic Denicola
2eafeee814 Fix possessive of Semiramis 2020-11-29 17:25:45 -05:00
Domenic Denicola
af03064221 Standardize on "Crock o’ Shit" 2020-11-29 17:25:45 -05:00
Domenic Denicola
22fbff008e Capitalize a few more alternate "Earth"s 2020-11-29 17:25:45 -05:00
Domenic Denicola
c9d51787e2 Fix capitalization and apostrophes for ’Lace 2020-11-29 17:25:45 -05:00
Domenic Denicola
53f7307daa Capitalize "English" but de-capitalize "english muffin"
Also lowercase one instance of "french toast"
2020-11-29 17:25:45 -05:00
Domenic Denicola
20d91d37df Use apostrophes instead of opening quotes for ’kay 2020-11-29 17:25:45 -05:00
Domenic Denicola
60709f54c2 Always capitalize "Nazi"
Also fix hyphenation and capitalization around "neo-Nazi".
2020-11-29 17:25:45 -05:00
Domenic Denicola
06c7b3adf2 Standardize on "Dragon-craft" and "Dragon-mech" 2020-11-29 17:25:45 -05:00
Domenic Denicola
3284c04f8b Always capitalize "Dauntless Titan" 2020-11-29 17:25:45 -05:00
Domenic Denicola
121ab01243 Spot fixes for Ward through Heavens 12.none 2020-11-29 17:25:45 -05:00
Domenic Denicola
abca01b1d6 Fix a few incomplete or misplaced ellipses 2020-11-29 17:25:45 -05:00
Domenic Denicola
b113b240cc 4.6.1 2020-11-17 10:39:12 -05:00
Domenic Denicola
d69484af8e Fix messed up chapters with text conversations
aea63ba055 introduced a bug where these chapters would contain ill-formed XML, and so at least some eBook readers would refuse to display them.
2020-11-17 10:39:03 -05:00
Domenic Denicola
21bded5650 4.6.0 2020-11-14 20:27:22 -05:00
Domenic Denicola
b20182ad0d Un-italicize various commas 2020-11-14 20:26:44 -05:00
Domenic Denicola
b017ab54d1 Fix a variety of wrong closing quotes
E.g. doubles, "I”ll", backwards
2020-11-14 20:26:44 -05:00
Domenic Denicola
54093109bf Spot fixes for Ward through Heavens 12.4 2020-11-14 20:26:44 -05:00
Domenic Denicola
aea63ba055 Fix inconsistently-bolded colons in Ward text conversations 2020-11-14 20:26:44 -05:00
Domenic Denicola
8daeebe507 Remove overcapitalization of "university" 2020-11-14 20:26:44 -05:00
Domenic Denicola
41566a380e Refactor possessive-fixing regular expressions 2020-11-14 20:26:44 -05:00
Domenic Denicola
584a52fc27 Standardize on "Amias", not "Amais" 2020-11-14 20:26:44 -05:00
Domenic Denicola
075db7f446 Standardize on "Mrs." Yamada, not "Ms." 2020-11-14 20:26:44 -05:00
Domenic Denicola
d2402b1bb2 Fix many erroneously-italicized closing quotes 2020-11-14 20:26:44 -05:00
Domenic Denicola
05a9abab91 Fix many erroneously-italicized exclamation points 2020-11-14 20:26:44 -05:00
Domenic Denicola
9da59c9104 Fix a variety of missing periods 2020-11-14 20:26:44 -05:00
Domenic Denicola
631897b011 Capitalize "Heartbroken" when it's a proper noun 2020-11-14 20:26:44 -05:00
Domenic Denicola
1a2b6de78e Standardize on one style for "Case Fifty-Three" and friends 2020-11-14 20:26:44 -05:00
Domenic Denicola
15e34e47a3 Fix double-periods in Ward
It appears all the ones in Worm, at least detectable via this pattern, were already caught.
2020-11-14 20:26:44 -05:00
Domenic Denicola
c5b13e7cc1 Fix capitalization and apostrophes for truncated names 2020-11-14 20:26:44 -05:00
Domenic Denicola
71d5352eb1 Spot fixes for Ward through Blinding 11.5 2020-11-14 20:26:44 -05:00
Domenic Denicola
13b25835ba Always lowercase "the Wardens’ headquarters" 2020-11-14 20:26:44 -05:00
Domenic Denicola
326ec148c9 Fix instances of "Warden’s" which should be "Wardens’" 2020-11-14 20:26:44 -05:00
Domenic Denicola
6c6b360dac Fix a couple instances of "Hardboil" to be "Hard Boil" 2020-11-14 20:26:44 -05:00
Domenic Denicola
6ee5fb9ef6 Fix backward closing single quotation marks 2020-11-14 20:26:44 -05:00
Domenic Denicola
33d88eb523 Refactor convert-worker.js a bit
This introduces a few fixes around italics.
2020-11-14 20:26:44 -05:00
Domenic Denicola
f70ba64629 Measure and output the time conversion takes 2020-11-14 20:26:44 -05:00
Domenic Denicola
58b0856deb Stop failing the convert step due to busy filesystems
This often happens with virus scanners, etc.
2020-11-14 20:26:44 -05:00
Domenic Denicola
7d5167c952 4.5.0 2020-11-07 18:09:19 -05:00
Domenic Denicola
481e9cc3a8 Capitalize "the Pharmacist" from Ward Gleaming 9.10 onward
In Gleaming 9.10, it starts being capitalized consistently, presumably reflecting the shift from thinking of "the pharmacist" as a job title, to thinking of "the Pharmacist" as a cape name. So, for subsequent chapters, let's be sure to retain that.
2020-11-07 18:05:49 -05:00
Domenic Denicola
0053545444 Correctly hyphenate "X-year-old" 2020-11-07 18:05:48 -05:00
Domenic Denicola
56a38609ca Fix a variety of misplaced or extraneous quotes 2020-11-07 18:05:48 -05:00
Domenic Denicola
6cdf486858 Fix missing spaces after commas 2020-11-07 18:05:48 -05:00
Domenic Denicola
e364cb2b22 Normalize a few instances of "T-shirt" to "t-shirt"
The latter is overwhelmingly more common.
2020-11-07 18:05:48 -05:00
Domenic Denicola
b85e8a68b1 Spot fixes for Ward through Polarize 10.x 2020-11-07 18:05:48 -05:00
Domenic Denicola
547b7e9518 Restore original scene breaks, instead of using <hr> 2020-11-07 18:05:48 -05:00
Domenic Denicola
b4454f8432 Fix warning output
This has been broken since the progress bar addition.
2020-11-07 18:05:48 -05:00
18 changed files with 6599 additions and 1977 deletions

View file

@ -1,259 +1,10 @@
{ {
"root": true, "root": true,
"extends": "@domenic",
"env": { "env": {
"node": true, "node": true
"es6": true
},
"parserOptions": {
"ecmaVersion": 2019
}, },
"rules": { "rules": {
// Possible errors "no-console": "off"
"no-await-in-loop": "off",
"comma-dangle": ["error", "never"],
"no-cond-assign": ["error", "except-parens"],
"no-console": "off",
"no-constant-condition": "error",
"no-control-regex": "error",
"no-debugger": "error",
"no-dupe-args": "error",
"no-dupe-keys": "error",
"no-duplicate-case": "error",
"no-empty": "error",
"no-empty-character-class": "error",
"no-ex-assign": "error",
"no-extra-boolean-cast": "error",
"no-extra-parens": ["error", "all", { "conditionalAssign": false, "nestedBinaryExpressions": false }],
"no-extra-semi": "error",
"no-func-assign": "error",
"no-inner-declarations": "off",
"no-invalid-regexp": "error",
"no-irregular-whitespace": "error",
"no-obj-calls": "error",
"no-prototype-builtins": "error",
"no-regex-spaces": "error",
"no-sparse-arrays": "error",
"no-template-curly-in-string": "error",
"no-unexpected-multiline": "error",
"no-unreachable": "error",
"no-unsafe-finally": "off",
"no-unsafe-negation": "error",
"use-isnan": "error",
"valid-jsdoc": "off",
"valid-typeof": "error",
// Best practices
"accessor-pairs": "error",
"array-callback-return": "error",
"block-scoped-var": "off",
"class-methods-use-this": "error",
"complexity": "off",
"consistent-return": "error",
"curly": ["error", "all"],
"default-case": "off",
"dot-location": ["error", "property"],
"dot-notation": "error",
"eqeqeq": "error",
"guard-for-in": "off",
"no-alert": "error",
"no-caller": "error",
"no-case-declarations": "error",
"no-div-regex": "off",
"no-else-return": "error",
"no-empty-function": "error",
"no-empty-pattern": "error",
"no-eq-null": "error",
"no-eval": "error",
"no-extend-native": "error",
"no-extra-bind": "error",
"no-extra-label": "error",
"no-fallthrough": "error",
"no-floating-decimal": "error",
"no-global-assign": "error",
"no-implicit-coercion": "error",
"no-implicit-globals": "error",
"no-implied-eval": "off",
"no-invalid-this": "error",
"no-iterator": "error",
"no-labels": ["error", { "allowLoop": true }],
"no-lone-blocks": "error",
"no-loop-func": "off",
"no-magic-numbers": "off",
"no-multi-spaces": "error",
"no-multi-str": "error",
"no-new": "error",
"no-new-func": "error",
"no-new-wrappers": "error",
"no-octal": "error",
"no-octal-escape": "error",
"no-param-reassign": "off",
"no-process-env": "error",
"no-proto": "error",
"no-redeclare": "error",
"no-restricted-properties": "off",
"no-return-assign": ["error", "except-parens"],
"no-return-await": "error",
"no-script-url": "off",
"no-self-assign": "error",
"no-self-compare": "error",
"no-sequences": "error",
"no-throw-literal": "error",
"no-unmodified-loop-condition": "error",
"no-unused-expressions": "error",
"no-unused-labels": "error",
"no-useless-call": "error",
"no-useless-concat": "error",
"no-useless-escape": "error",
"no-useless-return": "error",
"no-void": "error",
"no-warning-comments": "off",
"no-with": "error",
"radix": ["error", "as-needed"],
"require-await": "error",
"vars-on-top": "off",
"wrap-iife": ["error", "outside"],
"yoda": ["error", "never"],
// Strict Mode
"strict": ["error", "global"],
// Variables
"init-declarations": "off",
"no-catch-shadow": "error",
"no-delete-var": "error",
"no-label-var": "error",
"no-restricted-globals": "off",
"no-shadow": "error",
"no-shadow-restricted-names": "error",
"no-undef": "error",
"no-undef-init": "error",
"no-undefined": "off",
"no-unused-vars": "error",
"no-use-before-define": ["error", "nofunc"],
// Node.js and CommonJS
"callback-return": "off",
"global-require": "error",
"handle-callback-err": "error",
"no-mixed-requires": ["error", true],
"no-new-require": "error",
"no-path-concat": "error",
"no-process-exit": "error",
"no-restricted-imports": "off",
"no-restricted-modules": "off",
"no-sync": "off",
// Stylistic Issues
"array-bracket-spacing": ["error", "never"],
"block-spacing": ["error", "always"],
"brace-style": ["error", "1tbs", { "allowSingleLine": false }],
"camelcase": ["error", { "properties": "always" }],
"capitalized-comments": ["error", "always", { "ignoreConsecutiveComments": true }],
"comma-spacing": ["error", { "before": false, "after": true }],
"comma-style": ["error", "last"],
"computed-property-spacing": ["error", "never"],
"consistent-this": "off",
"eol-last": "error",
"func-call-spacing": ["error", "never"],
"func-name-matching": ["error", "always"],
"func-names": ["error", "never"],
"func-style": ["error", "declaration"],
"id-blacklist": "off",
"id-length": "off",
"id-match": "off",
"indent": ["error", 2, { "SwitchCase": 1 }],
"jsx-quotes": "off",
"key-spacing": ["error", { "beforeColon": false, "afterColon": true, "mode": "strict" }],
"keyword-spacing": ["error", { "before": true, "after": true }],
"line-comment-position": "off",
"linebreak-style": ["error", "unix"],
"lines-around-comment": "off",
"lines-around-directive": "off",
"max-depth": "off",
"max-len": ["error", 120, { "ignoreUrls": true }],
"max-lines": "off",
"max-nested-callbacks": "off",
"max-params": "off",
"max-statements": "off",
"max-statements-per-line": ["error", { "max": 1 }],
"multiline-ternary": "off",
"new-cap": "error",
"new-parens": "error",
"newline-after-var": "off",
"newline-before-return": "off",
"newline-per-chained-call": "off",
"no-array-constructor": "error",
"no-bitwise": "off",
"no-continue": "off",
"no-inline-comments": "off",
"no-lonely-if": "error",
"no-mixed-operators": "error",
"no-mixed-spaces-and-tabs": "error",
"no-multiple-empty-lines": "error",
"no-negated-condition": "off",
"no-nested-ternary": "error",
"no-new-object": "error",
"no-plusplus": "off",
"no-restricted-syntax": "off",
"no-tabs": "error",
"no-ternary": "off",
"no-trailing-spaces": "error",
"no-underscore-dangle": "off",
"no-unneeded-ternary": "error",
"no-whitespace-before-property": "error",
"object-curly-newline": ["error", { "multiline": true }],
"object-curly-spacing": ["error", "always"],
"object-property-newline": "off",
"one-var": ["error", "never"],
"one-var-declaration-per-line": ["error", "initializations"],
"operator-assignment": ["error", "always"],
"operator-linebreak": ["error", "after"],
"padded-blocks": ["error", "never"],
"quote-props": ["error", "as-needed"],
"quotes": ["error", "double", { "avoidEscape": true, "allowTemplateLiterals": true }],
"require-jsdoc": "off",
"semi": ["error", "always"],
"semi-spacing": "error",
"sort-keys": "off",
"sort-vars": "off",
"space-before-blocks": ["error", "always"],
"space-before-function-paren": ["error", { "anonymous": "always", "named": "never" }],
"space-in-parens": ["error", "never"],
"space-infix-ops": "error",
"space-unary-ops": ["error", { "words": true, "nonwords": false }],
"spaced-comment": ["error", "always", { "markers": ["///"] }],
"unicode-bom": ["error", "never"],
"wrap-regex": "off",
// ECMAScript 6
"arrow-body-style": "off", // meh
"arrow-parens": ["error", "as-needed"],
"arrow-spacing": "error",
"constructor-super": "error",
"generator-star-spacing": ["error", "after"],
"no-class-assign": "error",
"no-confusing-arrow": "off",
"no-const-assign": "error",
"no-dupe-class-members": "error",
"no-duplicate-imports": "error",
"no-new-symbol": "error",
"no-this-before-super": "error",
"no-useless-computed-key": "error",
"no-useless-constructor": "error",
"no-useless-rename": "error",
"no-var": "error",
"object-shorthand": "error",
"prefer-arrow-callback": "error",
"prefer-const": "error",
"prefer-numeric-literals": "error",
"prefer-rest-params": "error",
"prefer-spread": "error",
"prefer-template": "off",
"require-yield": "error",
"rest-spread-spacing": ["error", "never"],
"sort-imports": "off",
"symbol-description": "error",
"template-curly-spacing": ["error", "never"],
"yield-star-spacing": ["error", "after"]
} }
} }

50
.github/workflows/test.yml vendored Normal file
View file

@ -0,0 +1,50 @@
name: Test
on:
pull_request:
branches:
- master
push:
branches:
- master
jobs:
test:
name: Test
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: actions/setup-node@v2
with:
node-version: 16
- run: npm install
- run: npm run lint
# CI would take too long if we did the download every time too. So, we cache it. This does mean we're vulnerable to
# source changes exposing problems in our code, but those are pretty infrequent. If they occur, we need to bump the
# cache key.
- uses: actions/cache@v2
with:
key: worm-ward-cache-2021-01-17
path: ./cache
- run: node ./lib/worm-scraper.js --book=worm
- run: node ./lib/worm-scraper.js --book=ward
- uses: actions/setup-java@v1
with:
java-version: 15
java-package: jre
- name: Get EPUBCheck
run: |
curl https://github.com/w3c/epubcheck/releases/download/v4.2.4/epubcheck-4.2.4.zip --location --output epubcheck.zip
unzip epubcheck.zip
- name: Check Worm.epub
run: java -jar epubcheck-4.2.4/epubcheck.jar --failonwarnings Worm.epub
- name: Check Ward.epub
run: java -jar epubcheck-4.2.4/epubcheck.jar --failonwarnings Ward.epub

View file

@ -1,6 +0,0 @@
language: node_js
node_js:
- 10
- stable
script:
npm run lint

View file

@ -4,7 +4,7 @@ Scrapes the web serial [_Worm_](https://parahumans.wordpress.com/) and its seque
## How to use ## How to use
First you'll need a modern version of [Node.js](https://nodejs.org/en/). Install whatever is current (not LTS); at least v12.10.0 is necessary. First you'll need a modern version of [Node.js](https://nodejs.org/en/). At least v16.13.2 is necessary.
Then, open a terminal ([Mac documentation](http://blog.teamtreehouse.com/introduction-to-the-mac-os-x-command-line), [Windows documentation](http://www.howtogeek.com/235101/10-ways-to-open-the-command-prompt-in-windows-10/)) and install the program by typing Then, open a terminal ([Mac documentation](http://blog.teamtreehouse.com/introduction-to-the-mac-os-x-command-line), [Windows documentation](http://www.howtogeek.com/235101/10-ways-to-open-the-command-prompt-in-windows-10/)) and install the program by typing
@ -21,7 +21,7 @@ worm-scraper --help
If this outputs some help documentation, then the installation process went smoothly. You can move on to assemble the eBook by typing If this outputs some help documentation, then the installation process went smoothly. You can move on to assemble the eBook by typing
```bash ```bash
worm-scraper download convert scaffold zip worm-scraper
``` ```
This will take a while, but will eventually produce a `Worm.epub` file! This will take a while, but will eventually produce a `Worm.epub` file!
@ -29,7 +29,7 @@ This will take a while, but will eventually produce a `Worm.epub` file!
If you'd like to get _Ward_ instead of _Worm_, use `--book=ward`, e.g. If you'd like to get _Ward_ instead of _Worm_, use `--book=ward`, e.g.
```bash ```bash
worm-scraper download convert scaffold zip --book=ward worm-scraper --book=ward
``` ```
## EPUB vs. other formats ## EPUB vs. other formats

11
covers/README.md Normal file
View file

@ -0,0 +1,11 @@
# Cover credits
The _Worm_ cover is assembled from:
- [Ari Ibarra's fanart](https://www.instagram.com/p/B1wSi1Ynaze/) on Instagram
- The "Wildbow's Past Works" image for _Worm_ on [parahumans.net](https://www.parahumans.net/)
The _Ward_ cover is assembled from:
- [zearoe's fanart](https://www.reddit.com/r/Parahumans/comments/b8n7o0/fanartrepost_antares/) on Reddit
- The header image on [parahumans.net](https://www.parahumans.net/)

View file

@ -3,7 +3,7 @@
<html xmlns="http://www.w3.org/1999/xhtml"> <html xmlns="http://www.w3.org/1999/xhtml">
<head> <head>
<title>Cover</title> <title>Cover</title>
<style> <style type="text/css">
body { body {
text-align: center; text-align: center;
margin: 0; margin: 0;
@ -17,6 +17,8 @@
</style> </style>
</head> </head>
<body> <body>
<img src="cover.jpg" alt=""/> <div>
<img src="cover.jpg" alt=""/>
</div>
</body> </body>
</html> </html>

BIN
covers/worm/cover.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 374 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 332 KiB

View file

@ -3,7 +3,7 @@
<html xmlns="http://www.w3.org/1999/xhtml"> <html xmlns="http://www.w3.org/1999/xhtml">
<head> <head>
<title>Cover</title> <title>Cover</title>
<style> <style type="text/css">
body { body {
text-align: center; text-align: center;
margin: 0; margin: 0;
@ -17,6 +17,8 @@
</style> </style>
</head> </head>
<body> <body>
<img src="cover.png" alt=""/> <div>
<img src="cover.jpg" alt=""/>
</div>
</body> </body>
</html> </html>

View file

@ -10,29 +10,35 @@ function convertChapter(chapter, book, inputPath, outputPath) {
const contents = fs.readFileSync(inputPath, { encoding: "utf-8" }); const contents = fs.readFileSync(inputPath, { encoding: "utf-8" });
const rawChapterJSDOM = new JSDOM(contents); const rawChapterJSDOM = new JSDOM(contents);
const output = getChapterString(chapter, book, rawChapterJSDOM.window.document); const { output, warnings } = getChapterString(chapter, book, rawChapterJSDOM.window.document);
// TODO: this should probably not be necessary... jsdom bug I guess!? // TODO: this should probably not be necessary... jsdom bug I guess!?
rawChapterJSDOM.window.close(); rawChapterJSDOM.window.close();
fs.writeFileSync(outputPath, output); fs.writeFileSync(outputPath, output);
return warnings;
} }
function getChapterString(chapter, book, rawChapterDoc) { function getChapterString(chapter, book, rawChapterDoc) {
const body = getBodyXML(chapter, book, rawChapterDoc.querySelector(".entry-content")); const { xml, warnings } =
getBodyXML(chapter, book, rawChapterDoc.querySelector(".entry-content"));
return `<?xml version="1.0" encoding="UTF-8" ?> const output = `<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head> <head>
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8" /> <meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8" />
<title>${chapter.title}</title> <title>${chapter.title}</title>
</head> </head>
${body} ${xml}
</html>`; </html>`;
return { output, warnings };
} }
function getBodyXML(chapter, book, contentEl) { function getBodyXML(chapter, book, contentEl) {
const warnings = [];
// Remove initial Next Chapter and Previous Chapter <p> // Remove initial Next Chapter and Previous Chapter <p>
contentEl.firstElementChild.remove(); contentEl.firstElementChild.remove();
@ -65,18 +71,27 @@ function getBodyXML(chapter, book, contentEl) {
// Worm uses 30px; Ward mostly uses 40px but sometimes uses 30px/60px. Let's standardize on 30px. // Worm uses 30px; Ward mostly uses 40px but sometimes uses 30px/60px. Let's standardize on 30px.
if (style === "text-align:left;padding-left:30px;" || if (style === "text-align:left;padding-left:30px;" ||
style === "text-align: left;padding-left: 40px;" || style === "text-align: left;padding-left: 40px;" ||
style === "text-align: left; padding-left: 40px;" ||
style === "padding-left: 40px;") { style === "padding-left: 40px;") {
child.setAttribute("style", "padding-left: 30px;"); child.setAttribute("style", "padding-left: 30px;");
} }
} }
// Remove empty <em>s and <i>s // Remove empty inline elements.
// Remove style attributes from them, as they're always messed up. // Remove style attributes from inline elements, as they're always messed up.
for (const em of contentEl.querySelectorAll("em, i")) { for (const el of contentEl.querySelectorAll("em, i, strong, b")) {
if (em.textContent.trim() === "") { const { textContent } = el;
em.replaceWith(contentEl.ownerDocument.createTextNode(" "));
if (textContent === "") {
el.remove();
} else if (textContent.trim() === "") {
if (el.childElementCount === 0) {
el.replaceWith(" ");
} else if (el.childElementCount === 1 && el.children[0].localName === "br") {
el.outerHTML = "<br />\n";
}
} else { } else {
em.removeAttribute("style"); el.removeAttribute("style");
} }
} }
@ -127,185 +142,200 @@ function getBodyXML(chapter, book, contentEl) {
let xml = xmlSerializer.serializeToString(bodyEl); let xml = xmlSerializer.serializeToString(bodyEl);
// Fix recurring strange pattern of extra <br> in <p>...<em>...<br>\n</em></p> // Fix recurring strange pattern of extra <br> in <p>...<em>...<br>\n</em></p>
xml = xml.replace(/<br \/>\s*<\/em><\/p>/g, "</em></p>"); xml = xml.replace(/<br \/>\s*<\/em><\/p>/ug, "</em></p>");
// There are way too many nonbreaking spaces where they don't belong. // Replace single-word <i>s with <em>s. Other <i>s are probably erroneous too, but these are known-bad.
// If they show up three in a row, then let them live. Otherwise, they die. xml = xml.replace(/<i>([^ ]+)<\/i>/ug, "<em>$1</em>");
// Also remove any run of them after a period. xml = xml.replace(/<i>([^ ]+)( +)<\/i>/ug, "<em>$1</em>$2");
xml = xml.replace(/([^\xA0])\xA0\xA0?([^\xA0])/g, "$1 $2");
xml = xml.replace(/\.\x20*\xA0[\xA0\x20]*/, ". "); // There are way too many nonbreaking spaces where they don't belong. If they show up three in a row, then let them
// live; they're maybe being used for alignment or something. Otherwise, they die.
//
// Also, normalize spaces after a period/quote mark to two (normal) spaces. The second one is invisible when
// rendered, but it helps future heuristics detect end of sentences.
xml = xml.replace(/\xA0{1,2}(?!\x20\xA0)/ug, " ");
xml = xml.replace(/([.”])\x20*\xA0[\xA0\x20]*/ug, "$1 ");
xml = xml.replace(/([.”])\x20{3,}/ug, "$1 ");
function fixEms() { function fixEms() {
// Fix recurring broken-up or erroneous <em>s // Fix recurring broken-up or erroneous <em>s
xml = xml.replace(/<\/em>s/g, "s</em>"); xml = xml.replace(/<\/em>s/ug, "s</em>");
xml = xml.replace(/<em><\/em>/g, ""); xml = xml.replace(/<em><\/em>/ug, "");
xml = xml.replace(/<\/em><em>/g, ""); xml = xml.replace(/<\/em><em>/ug, "");
xml = xml.replace(/<em>(\s?\s?[^A-Za-z]\s?\s?)<\/em>/g, "$1"); xml = xml.replace(/<em>(\s?\s?[^A-Za-z]\s?\s?)<\/em>/ug, "$1");
xml = xml.replace(/<\/em>(\s?\s?[^A-Za-z]\s?\s?)<em>/g, "$1"); xml = xml.replace(/<\/em>(\s?\s?[^A-Za-z]\s?\s?)<em>/ug, "$1");
xml = xml.replace(/“<em>([^>]+)<\/em>(!|\?|\.)”/g, "“<em>$1$2</em>”"); xml = xml.replace(/“<em>([^>]+)<\/em>(!|\?|\.)”/ug, "“<em>$1$2</em>”");
xml = xml.replace(/<p><em>([^>]+)<\/em>(!|\?|\.)<\/p>/g, "<p><em>$1$2</em></p>"); xml = xml.replace(/<p><em>([^>]+)<\/em>(!|\?|\.)<\/p>/ug, "<p><em>$1$2</em></p>");
xml = xml.replace(/(!|\?|\.)\s{2}<\/em><\/p>/g, "$1</em></p>"); xml = xml.replace(/(!|\?|\.)\s{2}<\/em><\/p>/ug, "$1</em></p>");
xml = xml.replace(/<em>([a-z]+)(\?|\.)<\/em>/g, "<em>$1</em>$2"); xml = xml.replace(/<em>([a-z]+)(\?|\.)<\/em>/ug, "<em>$1</em>$2");
xml = xml.replace(/<em>([^>]+?)( +)<\/em>/g, "<em>$1</em>$2"); xml = xml.replace(/<em>([^>]+?)( +)<\/em>/ug, "<em>$1</em>$2");
xml = xml.replace(/<em> ([a-zA-Z]+)<\/em>/g, " <em>$1</em>"); xml = xml.replace(/<em> ([a-zA-Z]+)<\/em>/ug, " <em>$1</em>");
xml = xml.replace(/<em>\s*([^<]+)\s*<\/em>/g, "<em>$1</em>"); xml = xml.replace(/<em>\s*([^<]+)\s*<\/em>/ug, "<em>$1</em>");
xml = xml.replace(/<em>\s*([^<]+)\s*<\/em>\s*/g, "<em>$1</em>"); xml = xml.replace(/<em>\s*([^<]+)\s*<\/em>\s*/ug, "<em>$1</em>");
xml = xml.replace(/\s*<em>\s*([^<]+)\s*<\/em>/g, "<em>$1</em>"); xml = xml.replace(/\s*<em>\s*([^<]+)\s*<\/em>/ug, "<em>$1</em>");
xml = xml.replace(/<em>“\s*([^<]+)\s*”<\/em>/g, "“<em>$1</em>”"); xml = xml.replace(/<em>“\s*([^<]+)\s*”<\/em>/ug, "“<em>$1</em>”");
xml = xml.replace(/<em>“\s*([^<]+)\s*<\/em>\s*”/g, "“<em>$1</em>”"); xml = xml.replace(/<em>“\s*([^<]+)\s*<\/em>\s*”/ug, "“<em>$1</em>”");
xml = xml.replace(/“\s*<em>\s*([^<]+)\s*”<\/em>/g, "“<em>$1</em>”"); xml = xml.replace(/“\s*<em>\s*([^<]+)\s*”<\/em>/ug, "“<em>$1</em>”");
xml = xml.replace(/([^\n>])<em> ?/g, "$1 <em>"); xml = xml.replace(/([^\n>])<em> ?/ug, "$1 <em>");
xml = xml.replace(/ ?<\/em>/g, "</em> "); xml = xml.replace(/ ?<\/em>/ug, "</em> ");
xml = xml.replace(/<p([^>]+)> <em>/g, "<p$1><em>"); xml = xml.replace(/<p([^>]+)> <em>/ug, "<p$1><em>");
xml = xml.replace(/<\/em> <\/p>/g, "</em></p>"); xml = xml.replace(/<\/em> <\/p>/ug, "</em></p>");
xml = xml.replace(/<em>([a-z]+),<\/em>/g, "<em>$1</em>,"); xml = xml.replace(/<em>([a-z]+),<\/em>/ug, "<em>$1</em>,");
} }
function fixQuotesAndApostrophes() { // These quote/apostrophe/em fixes interact with each other. TODO: try to disentangle so we don't repeat all of
// Fix recurring poor quotes and apostrophes // fixEms.
xml = xml.replace(/<p>”/g, "<p>“"); xml = xml.replace(/,” <\/em>/ug, "</em>,” ");
xml = xml.replace(/“\s*<\/p>/g, "”</p>");
xml = xml.replace(/“\s*<\/em><\/p>/g, "</em>”</p>");
xml = xml.replace(/\s*<\/p>/g, "</p>");
xml = xml.replace(/\s*<\/em><\/p>/g, "</em></p>");
xml = xml.replace(/,” <\/em>/g, "</em>,” ");
xml = xml.replace(//g, "");
xml = xml.replace(/″/g, "”");
xml = xml.replace(/([A-Za-z])s(\s?)/g, "$1s$2");
xml = xml.replace(/Im/g, "Im");
xml = xml.replace(/<p>“\s+/g, "<p>“");
xml = xml.replace(/'/g, "");
xml = xml.replace(/([A-Za-z]+)/g, "$1");
xml = xml.replace(/Sup/g, "Sup");
xml = xml.replace(/cuz/g, "cuz");
xml = xml.replace(/([a-z])”<\/p>/g, "$1.”</p>");
}
// These interact with each other, so do them a few times.
xml = xml.replace(/,” <\/em>/g, "</em>,” ");
fixEms(); fixEms();
fixQuotesAndApostrophes(); xml = xml.replace(/<p>”/ug, "<p>“");
fixEms(); xml = xml.replace(/“\s*<\/p>/ug, "”</p>");
fixQuotesAndApostrophes(); xml = xml.replace(/“\s*<\/em><\/p>/ug, "</em>”</p>");
xml = xml.replace(/\s*<\/p>/ug, "</p>");
xml = xml.replace(/\s*<\/em><\/p>/ug, "</em></p>");
xml = xml.replace(/,” <\/em>/ug, "</em>,” ");
xml = xml.replace(//ug, "");
xml = xml.replace(/″/ug, "”");
xml = xml.replace(/([A-Za-z])s(\s?)/ug, "$1s$2");
xml = xml.replace(/Im/ug, "Im");
xml = xml.replace(/<p>“\s+/ug, "<p>“");
xml = xml.replace(/\s+”/ug, "”");
xml = xml.replace(/'/ug, "");
xml = xml.replace(/([A-Za-z]+)/ug, "$1");
xml = xml.replace(/([a-z])”<\/p>/ug, "$1.”</p>");
fixEms(); fixEms();
xml = xml.replace(/<em>([^<]+)<\/em>/ug, "<em>$1</em>");
xml = xml.replace(/<em>([a-z]+)!<\/em>/ug, "<em>$1</em>!");
xml = xml.replace(/(?<! {2})<em>([\w ]+)([!.?])”<\/em>/ug, "<em>$1</em>$2”");
xml = xml.replace(/<em>([\w ]+[!.?])”<\/em>/ug, "<em>$1</em>”");
xml = xml.replace(/I”(m|ll)/ug, "I$1");
xml = xml.replace(/””<\/p>/ug, "”</p>");
xml = xml.replace(/^([^“]+?) ?”(?![ —<])/ugm, "$1 “");
xml = xml.replace(/(?<!“)<em>([A-Za-z]+),<\/em>(?!”| +[A-Za-z]+ thought)/u, "<em>$1</em>,");
xml = xml.replace(/([Kk])ay(?!)/ug, "$1ay");
xml = xml.replace(/<em>(Why|What|Who|How|Where|When)<\/em>\?/ug, "<em>$1?</em>");
xml = xml.replace(/,<\/em>/ug, "</em>,");
xml = xml.replace(/,”<\/p>/ug, ".”</p>");
xml = xml.replace(/<p>(.*),<\/p>/ug, "<p>$1.</p>");
xml = xml.replace(/(\w+)(\w+)/ug, "$1$2");
xml = xml.replace(/<em>([a-z]+), ([a-z]+)<\/em>/ug, "<em>$1</em>, <em>$2</em>");
// Similar problems occur in Ward with <b> and <strong> as do in Worm with <em>s // Similar problems occur in Ward with <b> and <strong> as do in Worm with <em>s
xml = xml.replace(/<b \/>/g, ""); xml = xml.replace(/<b \/>/ug, "");
xml = xml.replace(/<b>(\s*<br \/>\s*)<\/b>/g, "$1"); xml = xml.replace(/<b>(\s*<br \/>\s*)<\/b>/ug, "$1");
xml = xml.replace(/<strong>(\s*<br \/>\s*)<\/strong>/g, "$1"); xml = xml.replace(/<strong>(\s*<br \/>\s*)<\/strong>/ug, "$1");
xml = xml.replace(/<\/strong>(\s*)<strong>/g, "$1"); xml = xml.replace(/<\/strong>(\s*)<strong>/ug, "$1");
xml = xml.replace(/<strong>@<\/strong>/g, "@"); xml = xml.replace(/<strong>@<\/strong>/ug, "@");
xml = xml.replace(/<br \/>(\s*)<\/strong>/g, "</strong><br />$1"); xml = xml.replace(/<br \/>(\s*)<\/strong>/ug, "</strong><br />$1");
xml = xml.replace(/(\s*)<\/strong>/g, "</strong>$1"); xml = xml.replace(/(\s*)<\/strong>/ug, "</strong>$1");
xml = xml.replace(/><strong>(.*)<\/strong>:</ug, "><strong>$1:</strong><");
// No need for line breaks before paragraph ends // No need for line breaks before paragraph ends or after paragraph starts
// These often occur with the <br>s inside <b>/<strong> fixed above. // These often occur with the <br>s inside <b>/<strong>/<em>/<i> fixed above.
xml = xml.replace(/<br \/>\s*<\/p>/g, "</p>"); xml = xml.replace(/<br \/>\s*<\/p>/ug, "</p>");
xml = xml.replace(/<p><br \/>\s*/ug, "<p>");
// Fix possessive of names ending in "s" // This is another quote fix but it needs to happen after the line break deletion... so entangled, ugh.
// Note: if the "s" is unvoiced, as in Marquis, then it doesn't get the second "s". xml = xml.replace(/<\/em>\s*“\s*<\/p>/ug, "</em>”</p>");
xml = xml.replace(/([^])Judas([^s])/g, "$1Judass$2");
xml = xml.replace(/([^])Brutus([^s])/g, "$1Brutuss$2");
xml = xml.replace(/([^])Jess([^s])/g, "$1Jesss$2");
xml = xml.replace(/([^])Aegis([^s])/g, "$1Aegiss$2");
xml = xml.replace(/([^])Dauntless([^s])/g, "$1Dauntlesss$2");
xml = xml.replace(/([^])Circus([^s])/g, "$1Circuss$2");
xml = xml.replace(/([^])Sirius([^s])/g, "$1Siriuss$2");
xml = xml.replace(/([^])Brooks([^s])/g, "$1Brookss$2");
xml = xml.replace(/([^])Genesis([^s])/g, "$1Genesiss$2");
xml = xml.replace(/([^])Atlas([^s])/g, "$1Atlass$2");
xml = xml.replace(/([^])Lucas([^s])/g, "$1Lucass$2");
xml = xml.replace(/([^])Gwerrus([^s])/g, "$1Gwerruss$2");
xml = xml.replace(/([^])Chris([^s])/g, "$1Chriss$2");
xml = xml.replace(/([^])Eligos([^s])/g, "$1Eligoss$2");
xml = xml.replace(/([^])Animos([^s])/g, "$1Animoss$2");
xml = xml.replace(/([^])Mags([^s])/g, "$1Magss$2");
xml = xml.replace(/([^])Huntress([^s])/g, "$1Huntresss$2");
xml = xml.replace(/([^])Hephaestus([^s])/g, "$1Hephaestuss$2");
xml = xml.replace(/([^])Lord of Loss([^s])/g, "$1Lord of Losss$2");
xml = xml.replace(/([^])John Combs([^s])/g, "$1John Combss$2");
xml = xml.replace(/([^])Mama Mathers([^s])/g, "$1Mama Matherss$2");
xml = xml.replace(/([^])Monokeros([^s])/g, "$1Monokeross$2");
xml = xml.replace(/([^])Goddess([^s])/g, "$1Goddesss$2");
xml = xml.replace(/([^])Boundless([^s])/g, "$1Boundlesss$2");
xml = xml.replace(/([^])Paris([^s])/g, "$1Pariss$2");
xml = xml.replace(/([^])Tress([^s])/g, "$1Tresss$2");
xml = xml.replace(/([^])Harris([^s])/g, "$1Harriss$2");
xml = xml.replace(/([^])Antares([^s])/g, "$1Antaress$2");
xml = xml.replace(/([^])Nieves([^s])/g, "$1Nievess$2");
xml = xml.replace(/([^])Backwoods([^s])/g, "$1Backwoodss$2");
xml = xml.replace(/([^])Midas([^s])/g, "$1Midass$2");
xml = xml.replace(/([^])Mrs. Sims([^s])/g, "$1Mrs. Simss$2");
xml = xml.replace(/([^])Ms. Stillons([^s])/g, "$1Ms. Stillonss$2");
xml = xml.replace(/([^])Chuckles([^s])/g, "$1Chuckless$2");
// Fixes dashes // Fix missing spaces after commas
xml = xml.replace(/ /g, "—"); xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/ug, "$1, $2");
xml = xml.replace(/“((?:<em>)?)-/g, "“$1—");
xml = xml.replace(/-[,.]?”/g, "—”");
xml = xml.replace(/-(!|\?)”/g, "—$1”");
xml = xml.replace(/-[,.]?<\/em>”/g, "—</em>”");
xml = xml.replace(/-“/g, "—”");
xml = xml.replace(/<p>-/g, "<p>—");
xml = xml.replace(/-<\/p>/g, "—</p>");
xml = xml.replace(/-<\/em><\/p>/g, "—</em></p>");
xml = xml.replace(/\s?\s?\s?\s?/g, "—");
xml = xml.replace(/-\s\s?/g, "—");
xml = xml.replace(/\s?\s-/g, "—");
xml = xml.replace(/\s+—”/g, "—”");
xml = xml.replace(/I-I/g, "I—I");
xml = xml.replace(/I-uh/g, "I—uh");
// Joint names should use em dashes
xml = xml.replace(/Dallon-Pelham/g, "DallonPelham");
xml = xml.replace(/Bet-Gimel/g, "BetGimel");
xml = xml.replace(/Tristan-Capricorn/g, "TristanCapricorn");
xml = xml.replace(/Capricorn-Byron/g, "CapricornByron");
xml = xml.replace(/Tristan-Byron/g, "TristanByron");
xml = xml.replace(/Gimel-Europe/g, "GimelEurope");
xml = xml.replace(/G-N/g, "GN");
xml = xml.replace(/Imp-Damsel/g, "ImpDamsel");
xml = xml.replace(/Damsel-Ashley/g, "DamselAshley");
xml = xml.replace(/Antares-Anelace/g, "AntaresAnelace");
xml = xml.replace(/Challenger-Gallant/g, "ChallengerGallant");
xml = xml.replace(/Undersider(s?)-(Breakthrough|Ambassador)/g, "Undersider$1$2");
xml = xml.replace(/Norwalk-Fairfield/g, "NorwalkFairfield");
xml = xml.replace(/East-West/g, "eastwest");
xml = xml.replace(/(Green|Yellow)-Black/g, "$1Black");
xml = xml.replace(/Creutzfeldt-Jakob/g, "CreutzfeldtJakob");
xml = xml.replace(/Astaroth-Nidhug/g, "AstarothNidhug");
xml = xml.replace(/Capulet-Montague/g, "CapuletMontague");
xml = xml.replace(/Weaver-Clockblocker/g, "WeaverClockblocker");
xml = xml.replace(/Alexandria-Pretender/g, "AlexandriaPretender");
xml = xml.replace(/Night Hag-Nyx/g, "Night HagNyx");
xml = xml.replace(/Crawler-Breed/g, "CrawlerBreed");
xml = xml.replace(/Simurgh-Myrddin-plant/g, "SimurghMyrddinplant");
xml = xml.replace(/Armsmaster-Defiant/g, "ArmsmasterDefiant");
// Use <hr> for separators
// https://www.parahumans.net/2019/12/21/interlude-18-z-radiation/ has "super-separators" ("⊙ ⊙ ⊙ ⊙ ⊙") which we
// leave untouched for now.
xml = xml.replace(/<p>■<\/p>/g, "<hr />");
xml = xml.replace(/<p style="text-align:center;">■<\/p>/g, "<hr />");
xml = xml.replace(/<p style="text-align: center;">⊙<\/p>/g, "<hr />");
xml = xml.replace(/<p style="text-align: center;"><strong>⊙<\/strong><\/p>/g, "<hr />");
xml = xml.replace(/<p style="text-align: center;"><em><strong>⊙<\/strong><\/em><\/p>/g, "<hr />");
xml = xml.replace(/<p style="text-align: center;"><strong>⊙⊙<\/strong><\/p>/g, "<hr />");
// Fix recurring miscapitalization with questions
xml = xml.replace(/\?”\s\s?She asked/g, "?” she asked");
xml = xml.replace(/\?”\s\s?He asked/g, "?” he asked");
// Fix bad periods and spacing/markup surrounding them // Fix bad periods and spacing/markup surrounding them
xml = xml.replace(/\.\.<\/p>/g, ".</p>"); xml = xml.replace(/\.\.<\/p>/ug, ".</p>");
xml = xml.replace(/\.\.”<\/p>/g, ".”</p>"); xml = xml.replace(/\.\.”<\/p>/ug, ".”</p>");
xml = xml.replace(/ \. /g, ". "); xml = xml.replace(/ \. /ug, ". ");
xml = xml.replace(/ \.<\/p>/g, ".</p>"); xml = xml.replace(/ \.<\/p>/ug, ".</p>");
xml = xml.replace(/\.<em>\.\./g, "<em>…"); xml = xml.replace(/\.<em>\.\./ug, "<em>…");
xml = xml.replace(/\.\. {2}/ug, ". ");
xml = xml.replace(/\.\./ug, "…");
xml = xml.replace(/(?<!Mr|Ms|Mrs)…\./ug, "…");
xml = xml.replace(/(?<=Mr|Ms|Mrs)…\./ug, ".…");
// Fix extra spaces // Fix extra spaces
xml = xml.replace(/ ? <\/p>/g, "</p>"); xml = xml.replace(/ ? <\/p>/ug, "</p>");
xml = xml.replace(/([a-z]) ,/g, "$1,"); xml = xml.replace(/([a-z]) ,/ug, "$1,");
// Use actual emojis instead of images
xml = xml.replace(
// eslint-disable-next-line max-len
/<img width="16" height="16" class="wp-smiley emoji" draggable="false" alt="O_o" src="https:\/\/s1.wp.com\/wp-content\/mu-plugins\/wpcom-smileys\/o_O.svg" style="height: 1em; max-height: 1em;" \/>/ug,
"🤨"
);
xml = fixTruncatedWords(xml);
xml = fixDialogueTags(xml);
xml = fixForeignNames(xml);
xml = standardizeNames(xml);
xml = fixEmDashes(xml);
xml = enDashJointNames(xml);
xml = fixPossessives(xml);
xml = cleanSceneBreaks(xml);
xml = fixCapitalization(xml, book);
xml = fixMispellings(xml);
xml = fixHyphens(xml);
xml = standardizeSpellings(xml);
xml = fixCaseNumbers(xml);
// One-off fixes
for (const substitution of substitutions[chapter.url] || []) {
if (substitution.before) {
const indexOf = xml.indexOf(substitution.before);
if (indexOf === -1) {
warnings.push(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` +
`updated at the source, in which case, you should edit substitutions.json.`);
}
if (indexOf !== xml.lastIndexOf(substitution.before)) {
warnings.push(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` +
`Update substitutions.json for a more precise substitution.`);
}
xml = xml.replace(new RegExp(escapeRegExp(substitution.before), "u"), substitution.after);
} else if (substitution.regExp) {
xml = xml.replace(new RegExp(substitution.regExp, "ug"), substitution.replacement);
} else {
warnings.push(`Invalid substitution specified for ${chapter.url}`);
}
}
// Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a <html>.
// Use this opportunity to insert a comment pointing to the original URL, for reference.
xml = xml.replace(
/<body xmlns="http:\/\/www.w3.org\/1999\/xhtml">/u,
`<body>\n<!-- ${chapter.url} -->\n`
);
return { xml, warnings };
}
// Gives words whose beginning has been dropped a proper apostrophe. The text marks the dropped part with a left
// single quotation mark (U+2018), which is wrong: it should be an apostrophe (U+2019).
//
// The quote characters are written as \u escapes so they cannot be silently lost by an encoding-unaware tool,
// which would turn every pattern here into a no-op.
//
// Takes the serialized chapter XML as a string and returns the corrected string.
function fixTruncatedWords(xml) {
  xml = xml.replace(/\u2018Sup/ug, "\u2019Sup");
  xml = xml.replace(/\u2018cuz/ug, "\u2019cuz");

  // The nicknames below are also capitalized. The (?![a-z]) lookahead keeps us from touching longer words that
  // merely start with the same letters (e.g. "pieces", "cages").

  // Short for "Sidepiece"
  xml = xml.replace(/[\u2018\u2019][Pp]iece(?![a-z])/ug, "\u2019Piece");

  // Short for "Disjoint"
  xml = xml.replace(/[\u2018\u2019][Jj]oint(?![a-z])/ug, "\u2019Joint");

  // Short for "Contender"
  xml = xml.replace(/[\u2018\u2019][Tt]end(?![a-z])/ug, "\u2019Tend");

  // Short for "Anelace"
  xml = xml.replace(/[\u2018\u2019][Ll]ace(?![a-z])/ug, "\u2019Lace");

  // Short for "Birdcage"
  xml = xml.replace(/[\u2018\u2019][Cc]age(?![a-z])/ug, "\u2019Cage");

  // We can't do "Clear" (short for Crystalclear) here because it appears too much as a normal word preceded by an
  // open quote, so we do that in substitutions.json.

  return xml;
}
function fixDialogueTags(xml) {
// Fix recurring miscapitalization with questions
xml = xml.replace(/\?”\s\s?She asked/ug, "?” she asked");
xml = xml.replace(/\?”\s\s?He asked/ug, "?” he asked");
// The author often fails to terminate a sentence, instead using a comma after a dialogue tag. For example, // The author often fails to terminate a sentence, instead using a comma after a dialogue tag. For example,
// > “I didnt get much done,” Greg said, “I got distracted by... // > “I didnt get much done,” Greg said, “I got distracted by...
@ -321,146 +351,460 @@ function getBodyXML(chapter, book, contentEl) {
// This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of // This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of
// times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the // times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the
// capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized. // capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized.
xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/g, ",” $1. “$2"); xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/ug, ",” $1. “$2");
// Replace single-word <i>s with <em>s. Other <i>s are probably erroneous too, but these are known-bad. return xml;
xml = xml.replace(/<i>([A-Za-z]+)<\/i>/g, "<em>$1</em>"); }
// Normalizes how foreign-language names are rendered in the serialized chapter XML string, and returns the result.
function fixForeignNames(xml) {
  const rewrites = [
    // This is consistently missing diacritics
    [/Yangban/ug, "Yàngbǎn"],

    // These are usually not italicized, but sometimes are. Other foreign-language names (like Yàngbǎn) are not
    // italicized, so we go in the direction of removing the italics. Where a capture group is present, any text
    // that follows the name inside the same <em> is kept, just without the emphasis.
    [/<em>Garama<\/em>/ug, "Garama"],
    [/<em>Thanda<\/em>/ug, "Thanda"],
    [/<em>Sifara([^<]*)<\/em>/ug, "Sifara$1"],
    [/<em>Moord Nag([^<]*)<\/em>/ug, "Moord Nag$1"],
    [/<em>Califa de Perro([^<]*)<\/em>/ug, "Califa de Perro$1"],
    [/<em>Turanta([^<]*)<\/em>/ug, "Turanta$1"]
  ];

  let result = xml;
  for (const [pattern, replacement] of rewrites) {
    result = result.replace(pattern, replacement);
  }
  return result;
}
// Collapses variant spellings of character/group names onto the majority spelling. Takes the serialized chapter XML
// as a string and returns the corrected string.
function standardizeNames(xml) {
  // 197 instances of "Mrs." to 21 of "Ms."
  xml = xml.replace(/Ms\. Yamada/ug, "Mrs. Yamada");

  // 25 instances of "Amias" to 3 of "Amais"
  xml = xml.replace(/Amais/ug, "Amias");

  // 185 instances of Juliette to 4 of Juliet. The word boundary keeps existing "Juliette"s untouched.
  xml = xml.replace(/Juliet\b/ug, "Juliette");

  // Earlier chapters have a space; later ones do not. They're separate words, so side with the earlier chapters.
  // One location is missing the "k".
  // The character class accepts either single quotation mark (U+2018 or U+2019); the output always uses the
  // apostrophe (U+2019). They are written as escapes so the class can never end up empty (which matches nothing).
  xml = xml.replace(/Crock? o[\u2018\u2019]Shit/ug, "Crock o\u2019 Shit");

  // 5 instances of "Jotun" to 2 of "Jotunn"
  xml = xml.replace(/Jotunn/ug, "Jotun");

  // 13 instances of Elman to 1 of Elmann
  xml = xml.replace(/Elmann/ug, "Elman");

  // Thousands of instances of Tattletale to 4 instances of Tatteltale
  xml = xml.replace(/Tatteltale/ug, "Tattletale");

  // 73 instances of Über to 2 of Uber
  xml = xml.replace(/Uber/ug, "Über");

  return xml;
}
// Replaces hyphens and en dashes that are standing in for em dashes with real em dashes. Takes the serialized
// chapter XML as a string and returns the corrected string.
//
// The en dash is written as \u2013 so that it cannot be silently dropped from the patterns. Without it, the first
// pattern degenerates to "every space" and the \s?\s?...\s?\s? pattern matches the empty string at every position.
//
// The statements are order-dependent: the context-specific hyphen rules run before the generic "hyphen next to
// whitespace" rules.
function fixEmDashes(xml) {
  // A spaced en dash is an em dash.
  xml = xml.replace(/ \u2013 /ug, "—");

  // Hyphens at the start or end of dialogue, optionally with punctuation or a closing tag in between.
  xml = xml.replace(/“((?:<em>)?)-/ug, "“$1—");
  xml = xml.replace(/-[,.]?”/ug, "—”");
  xml = xml.replace(/-(!|\?)”/ug, "—$1”");
  xml = xml.replace(/-[,.]?<\/([a-z]+)>”/ug, "—</$1>”");
  // A hyphen followed by an opening quote is taken to be a mistyped closing quote after an interruption.
  xml = xml.replace(/-“/ug, "—”");

  // Hyphens at the start or end of a paragraph or line.
  xml = xml.replace(/<p>-/ug, "<p>—");
  xml = xml.replace(/-<\/p>/ug, "—</p>");
  xml = xml.replace(/-<br \/>/ug, "—<br />");
  xml = xml.replace(/-<\/([a-z]+)><\/p>/ug, "—</$1></p>");

  // Any remaining en dash, along with up to two whitespace characters on either side.
  xml = xml.replace(/\s?\s?\u2013\s?\s?/ug, "—");

  // Hyphens adjacent to whitespace.
  xml = xml.replace(/-\s\s?/ug, "—");
  xml = xml.replace(/\s?\s-/ug, "—");
  xml = xml.replace(/\s+—”/ug, "—”");

  // Stutters and trailing-off questions.
  xml = xml.replace(/I-I/ug, "I—I");
  xml = xml.replace(/I-uh/ug, "I—uh");
  xml = xml.replace(/-\?/ug, "—?");

  return xml;
}
// Joint names should use en dashes instead of hyphens. Takes the serialized chapter XML as a string and returns the
// corrected string.
//
// The en dash is written as \u2013 in every replacement so it cannot be silently dropped, which would make this
// function delete the hyphen and fuse the two names together.
function enDashJointNames(xml) {
  // Applied in order. Capture groups carry over optional plurals, alternatives, and the original capitalization.
  const jointNames = [
    [/Dallon-Pelham/ug, "Dallon\u2013Pelham"],
    [/Bet-Gimel/ug, "Bet\u2013Gimel"],
    // Only the dash changes here; the name itself must stay "Cheit".
    [/Cheit-Gimel/ug, "Cheit\u2013Gimel"],
    [/Tristan-Capricorn/ug, "Tristan\u2013Capricorn"],
    [/Capricorn-Byron/ug, "Capricorn\u2013Byron"],
    [/Tristan-Byron/ug, "Tristan\u2013Byron"],
    [/Gimel-Europe/ug, "Gimel\u2013Europe"],
    [/G-N/ug, "G\u2013N"],
    [/Imp-Damsel/ug, "Imp\u2013Damsel"],
    [/Damsel-Ashley/ug, "Damsel\u2013Ashley"],
    [/Antares-Anelace/ug, "Antares\u2013Anelace"],
    [/Challenger-Gallant/ug, "Challenger\u2013Gallant"],
    [/Undersider(s?)-(Breakthrough|Ambassador)/ug, "Undersider$1\u2013$2"],
    [/Norwalk-Fairfield/ug, "Norwalk\u2013Fairfield"],
    // This one is also lowercased.
    [/East-West/ug, "east\u2013west"],
    [/Creutzfeldt-Jakob/ug, "Creutzfeldt\u2013Jakob"],
    [/Astaroth-Nidhug/ug, "Astaroth\u2013Nidhug"],
    [/Capulet-Montague/ug, "Capulet\u2013Montague"],
    [/Weaver-Clockblocker/ug, "Weaver\u2013Clockblocker"],
    [/Alexandria-Pretender/ug, "Alexandria\u2013Pretender"],
    [/Night Hag-Nyx/ug, "Night Hag\u2013Nyx"],
    [/Crawler-Breed/ug, "Crawler\u2013Breed"],
    [/Simurgh-Myrddin-plant/ug, "Simurgh\u2013Myrddin\u2013plant"],
    [/Armsmaster-Defiant/ug, "Armsmaster\u2013Defiant"],
    [/Matryoshka-Valentin/ug, "Matryoshka\u2013Valentin"],
    [/Gaea-Eden/ug, "Gaea\u2013Eden"],
    [/([Aa])gent-parahuman/ug, "$1gent\u2013parahuman"],
    [/([Pp])arahuman-agent/ug, "$1arahuman\u2013agent"]
  ];

  for (const [pattern, replacement] of jointNames) {
    xml = xml.replace(pattern, replacement);
  }

  return xml;
}
// Fixes possessives of names ending in "s". Takes the serialized chapter XML as a string and returns the corrected
// string.
//
// The quote characters are written as escapes (\u2018 = left single quotation mark, \u2019 = apostrophe) so they
// cannot be silently dropped. Without them the lookbehind becomes empty, which can never succeed, and the other
// replacements become identities.
function fixPossessives(xml) {
  // Names that take "’s" even though they already end in "s". Entries are regular expression source text, so the
  // periods in the titles are escaped.
  const names = [
    "Judas", "Brutus", "Jess", "Aegis", "Dauntless", "Circus", "Sirius", "Brooks", "Genesis", "Atlas", "Lucas",
    "Gwerrus", "Chris", "Eligos", "Animos", "Mags", "Huntress", "Hephaestus", "Lord of Loss", "John Combs",
    "Mama Mathers", "Monokeros", "Goddess", "Boundless", "Paris", "Tress", "Harris", "Antares", "Nieves",
    "Backwoods", "Midas", "Mrs\\. Sims", "Ms\\. Stillons", "Chuckles", "Amias", "Semiramis", "Mother of Mothers"
  ];

  // The lookbehind skips names that open a single-quoted phrase, where the trailing mark is a closing quote and not
  // an apostrophe. The lookahead skips possessives that already have their "s".
  const bareApostrophe = new RegExp(`(?<!\u2018)(${names.join("|")})\u2019(?!s)`, "ug");
  xml = xml.replace(bareApostrophe, "$1\u2019s");

  // Note: if the "s" is unvoiced, as in Marquis, then it doesn't get the second "s".
  xml = xml.replace(/Marquis\u2019s/ug, "Marquis\u2019");

  // This one is not just missing the extra "s"; it's often misplaced.
  // NOTE(review): the apostrophe positions on this line are reconstructed (singular "Warden’s" rewritten to the
  // plural possessive "Wardens’") — confirm against the upstream source.
  xml = xml.replace(/Warden\u2019s/ug, "Wardens\u2019");

  return xml;
}
// Normalize scene breaks. <hr> would be more semantically appropriate, but loses the author's intent. This is
// especially the case in Ward, which uses a variety of different scene breaks.
//
// Every recognized variant is rewritten to a plain centered paragraph containing only the break glyph(s). Takes the
// serialized chapter XML as a string and returns the normalized string.
function cleanSceneBreaks(xml) {
  const centered = glyphs => `<p style="text-align: center;">${glyphs}</p>`;

  // Applied in order; each entry is [variant to find, normalized form].
  const rules = [
    // Any <p>, whatever its attributes, that contains only the square.
    [/<p(?:[^>]*)>■<\/p>/ug, centered("■")],
    // Bold and bold-italic dots lose their inline markup.
    [/<p style="text-align: center;"><strong>⊙<\/strong><\/p>/ug, centered("⊙")],
    [/<p style="text-align: center;"><em><strong>⊙<\/strong><\/em><\/p>/ug, centered("⊙")],
    // A doubled dot becomes a single one.
    [/<p style="text-align: center;"><strong>⊙⊙<\/strong><\/p>/ug, centered("⊙")],
    // Five dots, however they are spaced, become five single-spaced dots.
    [/<p style="text-align: center;"><strong>⊙ *⊙ *⊙ *⊙ *⊙<\/strong><\/p>/ug, centered("⊙ ⊙ ⊙ ⊙ ⊙")]
  ];

  return rules.reduce((text, [variant, normalized]) => text.replace(variant, normalized), xml);
}
function fixCapitalization(xml, book) {
// This occurs enough times it's better to do here than in one-off fixes. We correct the single instance where // This occurs enough times it's better to do here than in one-off fixes. We correct the single instance where
// it's incorrect to capitalize in the one-off fixes. // it's incorrect to capitalize in the one-off fixes.
// Note that Ward contains much talk of "the clairvoyants", so we don't want to capitalize plurals. // Note that Ward contains much talk of "the clairvoyants", so we don't want to capitalize plurals.
xml = xml.replace(/([Tt])he clairvoyant([^s])/g, "$1he Clairvoyant$2"); xml = xml.replace(/([Tt])he clairvoyant(?!s)/ug, "$1he Clairvoyant");
// ReSound's name is sometimes miscapitalized. The word is never used in a non-name context. // ReSound's name is sometimes miscapitalized. The word is never used in a non-name context.
xml = xml.replace(/Resound/g, "ReSound"); xml = xml.replace(/Resound/ug, "ReSound");
// The Speedrunners team name is missing its capitalization a couple times.
xml = xml.replace(/speedrunners/ug, "Speedrunners");
// The Machine Army is missing its capitalization a couple times.
xml = xml.replace(/machine army/ug, "Machine Army");
// "patrol block" is capitalized three different ways: "patrol block", "Patrol block", and "Patrol Block". "patrol // "patrol block" is capitalized three different ways: "patrol block", "Patrol block", and "Patrol Block". "patrol
// group" is always lowercased. It seems like "Patrol" is a proper name, and is used as a capitalized modifier in // group" is always lowercased. It seems like "Patrol" is a proper name, and is used as a capitalized modifier in
// other contexts (e.g. Patrol leader). So let's standardize on "Patrol <lowercase>". // other contexts (e.g. Patrol leader). So let's standardize on "Patrol <lowercase>".
xml = xml.replace(/patrol (block|group|leader|guard|student|uniform|squad|soldier|officer|crew|girl)/ig, xml = xml.replace(
(_, $1) => `Patrol ${$1.toLowerCase()}`); /patrol (block|group|leader|guard|student|uniform|squad|soldier|officer|crew|girl|bus|training)/uig,
// This always works in Ward and has a few false positives in Worm, where it is never needed: (_, $1) => `Patrol ${$1.toLowerCase()}`
);
// This usually works in Ward (some instances corrected back in substitutions.json), and has a few false positives in
// Worm, where it is never needed:
if (book === "ward") { if (book === "ward") {
xml = xml.replace(/the patrol/g, "the Patrol"); xml = xml.replace(/the patrol(?!s|ling)/ug, "the Patrol");
} }
// This is sometimes missing its capitalization. // This is sometimes missing its capitalization.
xml = xml.replace(/the birdcage/g, "the Birdcage"); xml = xml.replace(/the birdcage/ug, "the Birdcage");
// This is usually spelled "TV" but sometimes the other ways. Normalize. // There's no reason why these should be capitalized.
xml = xml.replace(/(\b)tv(\b)/g, "$1TV$2"); xml = xml.replace(/(?<! {2}|“|>)Halberd/ug, "halberd");
xml = xml.replace(/t\.v\./ig, "TV"); xml = xml.replace(/(?<! {2}|“|>)Loft/ug, "loft");
// This is commonly misspelled. // These are treated as common nouns and not traditionally capitalized. "Krav Maga" remains capitalized,
xml = xml.replace(/([Ss])houlderblade/g, "$1houlder blade"); // interestingly (according to dictionaries and Wikipedia).
xml = xml.replace(/(?<! {2}|“|>)Judo/ug, "judo");
xml = xml.replace(/(?<! {2}|“|>)Aikido/ug, "aikido");
xml = xml.replace(/(?<! {2}|“|>)Karate/ug, "karate");
xml = xml.replace(/(?<! {2}|“|>)Tae Kwon Do/ug, "tae kwon do");
// There's no reason why these should be capitalized. (Note that they never appear at the beginning of any sentences.) // There's no reason why university should be capitalized in most contexts, although sometimes it's used as part of
xml = xml.replace(/Halberd/g, "halberd"); // a compound noun or at the beginning of a sentence.
xml = xml.replace(/Loft/g, "loft"); xml = xml.replace(/(?<! {2}|“|>|Cornell |Nilles )University(?! Road)/ug, "university");
// Especially early in the story, PRT designations are capitalized; they should not be. This fixes the cases where we // Organ names (e.g. brain, arm) or scientific names are not capitalized, so the "corona pollentia" and friends should
// not be either. The books are inconsistent.
xml = xml.replace(/(?<! {2}|“|>|-)Corona/ug, "corona");
xml = xml.replace(/Pollentia/ug, "pollentia");
xml = xml.replace(/Radiata/ug, "radiata");
xml = xml.replace(/Gemma/ug, "gemma");
// We de-capitalize Valkyrie's "flock", since most uses are de-capitalized (e.g. the many instances in Gleaming
// Interlude 9, or Dying 15.z). This is a bit surprising; it seems like an organization name. But I guess it's
// informal.
xml = xml.replace(/(?<! {2}|“|>)Flock/ug, "flock");
// Especially early in Worm, PRT designations are capitalized; they should not be. This fixes the cases where we
// can be reasonably sure they don't start a sentence, although more specific instances are done in // can be reasonably sure they don't start a sentence, although more specific instances are done in
// substitutions.json, and some need to be back-corrected. // substitutions.json, and some need to be back-corrected.
// //
// Note: "Master" is specifically omitted because it fails poorly on Interlude 4. Other instances need to be // Note: "Master" is specifically omitted because it fails poorly on Worm Interlude 4. Other instances need to be
// corrected via substitutions.json. // corrected via substitutions.json.
//
// This also over-de-capitalizes "The Stranger" in Ward (a titan name). Those also get fixed in substitutions.json.
xml = xml.replace( xml = xml.replace(
/([a-zA-Z,] |\/)(Mover|Shaker|Brute|Breaker|Tinker|Blaster|Thinker|Striker|Changer|Trump|Stranger|Shifter|Shaper)/g, // eslint-disable-next-line max-len
(_, prefix, designation) => prefix + designation.toLowerCase() /(?<! {2}|“|>|\n|: )(Mover|Shaker|Brute|Breaker|Tinker|Blaster|Thinker|Striker|Changer|Trump|Stranger|Shifter|Shaper)(?! [A-Z])/ug,
(_, designation) => designation.toLowerCase()
); );
xml = xml.replace( xml = xml.replace(
/(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)-(\d+)/gi, /(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)-(\d+)/ugi,
"$1 $2" "$1 $2"
); );
xml = xml.replace( xml = xml.replace(
// eslint-disable-next-line max-len // eslint-disable-next-line max-len
/(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)[ -/](mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)/gi, /(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)[ -/](mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)/ugi,
"$1$2" "$1$2"
); );
// This is consistently missing accents // Capitalization is inconsistent, but shard names seems to usually be capitalized.
xml = xml.replace(/Yangban/g, "Yàngbǎn"); xml = xml.replace(/Grasping self/ug, "Grasping Self");
xml = xml.replace(/Cloven stranger/ug, "Cloven Stranger");
xml = xml.replace(/Princess shaper/ug, "Princess Shaper");
xml = xml.replace(/Fragile one/ug, "Fragile One");
// Place names need to always be capitalized // Place names need to always be capitalized
xml = xml.replace(/North end/g, "North End"); xml = xml.replace(/North end/ug, "North End");
xml = xml.replace(/(Stonemast|Shale) avenue/g, "$1 Avenue"); xml = xml.replace(/(Stonemast|Shale) avenue/ug, "$1 Avenue");
xml = xml.replace(/(Lord|Slater) street/g, "$1 Street"); xml = xml.replace(/(Lord|Slater) street/ug, "$1 Street");
xml = xml.replace(/(Hollow|Cedar) point/g, "$1 Point"); xml = xml.replace(/(Hollow|Cedar) point/ug, "$1 Point");
xml = xml.replace(/(Norwalk|Fenway|Stratford) station/g, "$1 Station"); xml = xml.replace(/(Norwalk|Fenway|Stratford) station/ug, "$1 Station");
xml = xml.replace(/the megalopolis/g, "the Megalopolis"); xml = xml.replace(/the megalopolis/ug, "the Megalopolis");
xml = xml.replace(/earths(?![a-z])/g, "Earths"); xml = xml.replace(/earths(?![a-z])/ug, "Earths");
if (book === "ward") {
// These are usually not italicized, but sometimes are. Other foreign-language names (like Yàngbǎn) are not xml = xml.replace(/the bunker/ug, "the Bunker");
// italicized, so we go in the direction of removing the italics. xml = xml.replace(/bunker/ug, "Bunker");
xml = xml.replace(/<em>Garama<\/em>/g, "Garama"); }
xml = xml.replace(/<em>Thanda<\/em>/g, "Thanda");
xml = xml.replace(/<em>Sifara([^<]*)<\/em>/g, "Sifara$1");
xml = xml.replace(/<em>Moord Nag([^<]*)<\/em>/g, "Moord Nag$1");
xml = xml.replace(/<em>Califa de Perro([^<]*)<\/em>/g, "Califa de Perro$1");
xml = xml.replace(/<em>Turanta([^<]*)<\/em>/g, "Turanta$1");
// "okay" is preferred to "ok" or "o.k.". This sometimes gets changed back via substitutions.json when people are
// writing notes and thus probably the intention was to be less formal. Also it seems per
// https://en.wikipedia.org/wiki/A-ok the "A" in "A-okay" should be capitalized.
xml = xml.replace(/Ok([,. ])/g, "Okay$1");
xml = xml.replace(/([^a-zA-Z])ok([^a])/g, "$1okay$2");
xml = xml.replace(/([^a-zA-Z])o\.k\.([^a])/g, "$1okay$2");
xml = xml.replace(/a-okay/g, "A-okay");
// Signal(l)ing/signal(l)ed are spelled both ways. Both are acceptable in English. Let's standardize on single-L.
xml = xml.replace(/(S|s)ignall/g, "$1ignal");
// Clich(e|é) is spelled both ways. Let's standardize on including the accent.
xml = xml.replace(/cliche/g, "cliché");
// "gray" is the majority spelling, except for "greyhound"
xml = xml.replace(/(G|g)rey(?!hound)/g, "$1ray");
// "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of // "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of
// instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in // instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in
// substitutions.json. // substitutions.json.
xml = xml.replace(/(?<!mom), dad(?![a-z])/g, ", Dad"); xml = xml.replace(/(?<!mom), dad(?![a-z])/ug, ", Dad");
xml = xml.replace(/, mom(?![a-z-])/g, ", Mom"); xml = xml.replace(/, mom(?![a-z-])/ug, ", Mom");
// Similarly, specific aunts and uncles get capitalized when used as a title. These are often missed.
xml = xml.replace(/aunt Sarah/ug, "Aunt Sarah");
xml = xml.replace(/aunt Fleur/ug, "Aunt Fleur");
xml = xml.replace(/uncle Neil/ug, "Uncle Neil");
// The majority of "Wardens headquarters" is lowercased, and always prefixed with "the", indicating it's not a proper
// place name. So we remove the capitalization in the few places where it does appear.
xml = xml.replace(/Wardens Headquarters/ug, "Wardens headquarters");
// Some style guides try to reserve capitalized "Nazi" for historical discussions of members of the Nazi party. This
// seems fuzzy when it comes to phrases like "neo-Nazi", and doesn't seem to be what the author is doing; the books
// are just plain inconsistent. So, let's standardize on always uppercasing.
xml = xml.replace(/(?<![a-z])nazi/ug, "Nazi");
xml = xml.replace(/ Neo-/ug, " neo-");
// Style guides disagree on whether items like "english muffin", "french toast", and "french kiss" need their
// adjective capitalized. The books mostly use lowercase, so let's stick with that. (substitutions.json corrects one
// case of "French toast".)
xml = xml.replace(/english(?! muffin)/ug, "English");
xml = xml.replace(/(?<! {2})English muffin/ug, "english muffin");
// I was very torn on what to do with capitalization for "Titan" and "Titans". In general you don't capitalize species
// names or other classifications, e.g. style guides are quite clear you don't capitalize "gods". The author
// capitalizes them more often than not (e.g., 179 raw "Titans" to 49 "titans"), but is quite inconsistent.
//
// In the end, I decided against de-capitalization, based on the precedent set by "Endbringers" (which are
// conceptually paired with Titans several times in the text). However, we only capitalize the class after they are
// _introduced_ as a class in Sundown 17.y. (Before then we still capitalize individual names like "Dauntless Titan"
// or "Kronos Titan".)
if (book === "ward") {
// All plural discussions of "Titans" are after Sundown 17.y.
xml = xml.replace(/titans/ug, "Titans");
// Since we can't safely change all instances of "titan", most are in substitutions.json. We can do a few here,
// though.
xml = xml.replace(/dauntless titan/uig, "Dauntless Titan"); // Sometimes "Dauntless" isn't even capitalized.
xml = xml.replace(/Kronos titan/ug, "Kronos Titan");
}
// For the giants, the prevailing usage seems to be to keep the term lowercase, but capitalize when used as a name.
xml = xml.replace(/(?<=Mathers |Goddess )giant/ug, "Giant");
xml = xml.replace(/mother giant/uig, "Mother Giant");
xml = xml.replace(/(?<! {2}|“|>)Giants/ug, "giants");
return xml;
}
/**
 * Corrects a handful of recurring misspellings in a chapter's serialized markup.
 *
 * @param {string} xml - The chapter markup to clean up.
 * @returns {string} The markup with every occurrence of each known misspelling replaced.
 */
function fixMispellings(xml) {
  // Each entry is [pattern, replacement]. Entries are applied in order, to every occurrence.
  const corrections = [
    // "shoulder blade" is two words; the capture preserves the original case of the leading "s".
    [/([Ss])houlderblade/ug, "$1houlder blade"],
    // Dictionaries capitalize the "U".
    [/u-turn/ug, "U-turn"],
    // The idiom is "scot-free": https://www.dictionary.com/browse/scot-free
    [/scott(?: |-)free/ug, "scot-free"],
    // The idiom is "change tack", singular: https://grammarist.com/idiom/change-tack/
    [/changed tacks/ug, "changed tack"]
  ];

  return corrections.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), xml);
}
// Normalizes hyphenation in a chapter's serialized markup: adds hyphens to compound modifiers and compound numbers,
// and removes them from phrases that should be written open. Takes the markup string and returns the updated string.
// The replacements are book-specific heuristics applied in sequence.
// NOTE(review): in this listing several lines below also carry text that appears to come from the pre-change column
// of a side-by-side diff (a one-off substitutions loop and an xmlns rewrite that reference `chapter`); that text does
// not look like part of fixHyphens -- confirm against the real file.
function fixHyphens(xml) {
// "X-year-old" should use hyphens; all grammar guides agree. The books are very inconsistent but most often omit
// them.
xml = xml.replace(/(\w+)[ -]year[ -]old(s?)(?!\w)/ug, "$1-year-old$2");
xml = xml.replace(/(\w+) or (\w+)-year-old/ug, "$1- or $2-year-old");
// Compound numbers from 11 through 99 must be hyphenated, but others should not be.
// NOTE(review): the tens list below spells "fourty"; the standard spelling is "forty", so "forty two" is presumably
// not hyphenated by this rule -- confirm whether "fourty" is intentional.
xml = xml.replace(
/(?<!\w)(twenty|thirty|fourty|fifty|sixty|seventy|eighty|ninety) (one|two|three|four|five|six|seven|eight|nine)/uig,
"$1-$2"
);
xml = xml.replace(/[- ]hundred-and-/ug, " hundred and ");
xml = xml.replace(/(?<!-)(one|two|three|four|five|six|seven|eight|nine|twelve)-hundred/ug, "$1 hundred");
xml = xml.replace(/(hundred|ninety)-percent(?!-)/ug, "$1 percent");
// "red-haired", "long-haired", etc.: they all need hyphens
xml = xml.replace(/ haired/ug, "-haired");
// These are consistently missing hyphens. // These are consistently missing hyphens.
// NOTE(review): the "([Ll]ife) threatening" rule captures the first word but its replacement is the literal
// "life-threatening", so a capitalized "Life threatening" would be lowercased; the sibling rules use "$1-...".
xml = xml.replace(/self destruct/g, "self-destruct"); xml = xml.replace(/([Ll]ife) threatening/ug, "life-threatening");
xml = xml.replace(/life threatening/g, "life-threatening"); xml = xml.replace(/([Hh]ard) headed/ug, "$1-headed");
xml = xml.replace(/hard headed/g, "hard-headed"); xml = xml.replace(/([Ss]houlder) mounted/ug, "$1-mounted");
xml = xml.replace(/shoulder mounted/g, "shoulder-mounted"); xml = xml.replace(/([Gg]olden) skinned/ug, "$1-skinned");
xml = xml.replace(/golden skinned/g, "golden-skinned"); xml = xml.replace(/([Cc]reepy) crawl/ug, "$1-crawl");
xml = xml.replace(/creepy crawl/g, "creepy-crawl"); xml = xml.replace(/([Ww]ell) armed/ug, "$1-armed");
xml = xml.replace(/well armed/g, "well-armed"); xml = xml.replace(/([Aa]ble) bodied/ug, "$1-bodied");
xml = xml.replace(/able bodied/g, "able-bodied"); xml = xml.replace(/([Ll]evel) headed/ug, "$1-headed");
xml = xml.replace(/([Cc]lear) cut/ug, "$1-cut");
xml = xml.replace(/([Vv]at) grown/ug, "$1-grown");
xml = xml.replace(/([Ss]hell) shocked/ug, "$1-shocked");
xml = xml.replace(/([Dd]og) tired/ug, "$1-tired");
xml = xml.replace(/([Nn]ightmare) filled/ug, "$1-filled");
xml = xml.replace(/([Oo]ne) sided/ug, "$1-sided");
xml = xml.replace(/([Mm]edium) sized/ug, "$1-sized");
xml = xml.replace(/([Tt]eary) eyed/ug, "$1-eyed");
xml = xml.replace(/([Ww]orst) case scenario/ug, "$1-case scenario");
xml = xml.replace(/([Ss]elf) (conscious|esteem|loathing|harm|destruct|preservation)/ug, "$1-$2");
xml = xml.replace(/([Oo]ne|[Tt]wo|[Tt]hree|[Ff]our|[Ff]ourth) dimensional/ug, "$1-dimensional");
xml = xml.replace(/(?<=\b)([Oo]ne) on one(?=\b)/ug, "$1-on-one");
// Preemptive(ly) is often hyphenated (not always). It should not be. // Preemptive(ly) is often hyphenated (not always). It should not be.
xml = xml.replace(/([Pp])re-emptive/g, "$1reemptive"); xml = xml.replace(/([Pp])re-emptive/ug, "$1reemptive");
// One-off fixes // These should be hyphenated only when used as a verb. We correct those cases back in substitutions.json.
(substitutions[chapter.url] || []).forEach(substitution => { xml = xml.replace(/fist-bump/ug, "fist bump");
if (substitution.before) { xml = xml.replace(/high-five/ug, "high five");
const indexOf = xml.indexOf(substitution.before);
if (indexOf === -1) {
console.warn(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` +
`updated at the source, in which case, you should edit substitutions.json.`);
}
if (indexOf !== xml.lastIndexOf(substitution.before)) {
console.warn(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` +
`Update substitutions.json for a more precise substitution.`);
}
xml = xml.replace(new RegExp(escapeRegExp(substitution.before)), substitution.after); // This should be hyphenated when used as an adjective (instead of an adverb or noun). I.e. it should be
} else if (substitution.regExp) { // "hand-to-hand combat", but "passed from hand to hand", and "capable in hand to hand". The following heuristic works
xml = xml.replace(new RegExp(substitution.regExp, "g"), substitution.replacement); // in the books.
} else { xml = xml.replace(/hand to hand(?= [a-z])/ug, "hand-to-hand");
console.warn(`Invalid substitution specified for ${chapter.url}`);
}
});
// Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a <html>. // This is usually wrong but sometimes correct. The lookarounds avoid specific cases where it's referring to an actual
// Use this opportunity to insert a comment pointing to the original URL, for reference. // second in a series of guesses.
xml = xml.replace(/(?<!my |that )([Ss]econd) guess(?!es)/ug, "$1-guess");
// When used as a phrase "just in case" gets no hyphens. When used as a noun or adjective it does. A couple of the
// noun cases are missing one or both hyphens.
xml = xml.replace(/([Aa]) just[ -]in case/ug, "$1 just-in-case");
// When used as an adjective, it's hyphenated. It turns out most cases are as an adverb, so we go with this approach:
xml = xml.replace( xml = xml.replace(
/<body xmlns="http:\/\/www.w3.org\/1999\/xhtml">/, /face to face(?= meeting| hang-out| interaction| contact| conversation| confrontation| fight)/ug,
`<body>\n<!-- ${chapter.url} -->\n`); "face-to-face"
);
// When used as an adjective, it's hyphenated. This heuristic works in the books.
xml = xml.replace(/fight or flight(?= [a-z])/ug, "fight-or-flight");
// This is usually correct but sometimes wrong.
xml = xml.replace(/neo /ug, "neo-");
return xml;
}
/**
 * Normalizes words that the books spell in more than one way to a single preferred spelling.
 *
 * @param {string} xml - The chapter markup to normalize.
 * @returns {string} The markup with each variant spelling replaced by the preferred one.
 */
function standardizeSpellings(xml) {
  // [pattern, replacement] pairs. Order matters: e.g. "a-ok" first becomes "a-okay" and is only then capitalized.
  const rules = [
    // "TV" is the usual spelling; fold in the lowercase and dotted variants.
    [/(\b)tv(\b)/ug, "$1TV$2"],
    [/t\.v\./uig, "TV"],

    // "okay" is preferred over "ok" and "o.k.". (substitutions.json sometimes reverts this for informal notes.)
    // Per https://en.wikipedia.org/wiki/A-ok the "A" in "A-okay" is capitalized.
    [/Ok([,. ])/ug, "Okay$1"],
    [/([^a-zA-Z])ok([^a])/ug, "$1okay$2"],
    [/([^a-zA-Z])o\.k\.([^a])/ug, "$1okay$2"],
    [/a-okay/ug, "A-okay"],

    // Both "signalling" and "signaling" are acceptable English; settle on the single-L form.
    [/(S|s)ignall/ug, "$1ignal"],

    // Settle on the accented "cliché".
    [/cliche/ug, "cliché"],

    // "t-shirt" is usually lowercase; lowercase the rest, except where two spaces immediately precede it.
    [/(?<! {2})T-shirt/ug, "t-shirt"],

    // "gray" is the majority spelling; "greyhound" is left alone.
    [/(G|g)rey(?!hound)/ug, "$1ray"],

    // 12 instances of "Dragon-craft", 12 of "Dragon craft", 1 of "dragon craft": use the hyphenated, capitalized form.
    [/[Dd]ragon[ -](craft|mech)/ug, "Dragon-$1"],

    // 88 instances of "A.I." to four of "AI".
    [/(?<=\b)AI(?=\b)/ug, "A.I."],

    // 2 instances of "G.M." to one of "GM".
    [/(?<=\b)GM(?=\b)/ug, "G.M."]
  ];

  let result = xml;
  for (const [pattern, replacement] of rules) {
    result = result.replace(pattern, replacement);
  }
  return result;
}
/**
 * Rewrites the many spellings of the numbered "case" designations into title-cased proper nouns, e.g.
 * "case-53" and "case fifty three" both become "Case Fifty-Three".
 *
 * @param {string} xml - The chapter markup to normalize.
 * @returns {string} The markup with case designations standardized.
 */
function fixCaseNumbers(xml) {
  // Case numbers are very inconsistent. For "Case Fifty-Three", the breakdown is:
  // * 9 Case-53
  // * 6 Case 53
  // * 2 case-53
  // * 1 Case-Fifty-Three
  // * 41 Case Fifty-Three
  // * 1 Case Fifty Three
  // * 13 Case fifty-three
  // * 119 case fifty-three
  // * 4 case-fifty-three
  // * 1 case fifty three
  // We standardize on "Case Fifty-Three"; although it isn't the most common, it seems best to treat these as proper
  // nouns.
  //
  // Each pattern accepts the spelled-out number or its own numeral. The trailing (?!\d) stops the numeral from
  // matching the front of a longer number (e.g. "case 530"). An empty lookahead "(?!)" can never succeed, so it must
  // not be used here: it would turn the whole replacement into a no-op.
  xml = xml.replace(/case[ -](?:fifty[ -]three|53)(?!\d)/uig, "Case Fifty-Three");
  xml = xml.replace(/case[ -](?:thirty[ -]two|32)(?!\d)/uig, "Case Thirty-Two");
  xml = xml.replace(/case[ -](?:sixty[ -]nine|69)(?!\d)/uig, "Case Sixty-Nine");

  // Remaining single-word designations: capitalize "case" and the first letter of the number. "in case one ..." is
  // ordinary prose, and a following hyphen means the number continues (e.g. "seventy-two"), so both are skipped.
  xml = xml.replace(
    /(?<!in )case[ -](zero|one|two|three|four|twelve|fifteen|seventy|ninety)(?!-)/uig,
    (_, caseNumber) => `Case ${caseNumber[0].toUpperCase()}${caseNumber.substring(1)}`
  );

  return xml;
}
@ -474,7 +818,7 @@ function isEmptyOrGarbage(el) {
} }
function escapeRegExp(str) { function escapeRegExp(str) {
return str.replace(/[-[\]/{}()*+?.\\^$|]/g, "\\$&"); return str.replace(/[[\]/{}()*+?.\\^$|]/ug, "\\$&");
} }
function decodeCloudFlareEmail(hash) { function decodeCloudFlareEmail(hash) {

View file

@ -1,6 +1,7 @@
"use strict"; "use strict";
const path = require("path"); const path = require("path");
const fs = require("fs").promises; const fs = require("fs").promises;
const { performance } = require("perf_hooks");
const workerpool = require("workerpool"); const workerpool = require("workerpool");
const cliProgress = require("cli-progress"); const cliProgress = require("cli-progress");
@ -11,9 +12,12 @@ module.exports = async (cachePath, manifestPath, contentPath, book, concurrentJo
console.log("Converting raw downloaded HTML to EPUB chapters"); console.log("Converting raw downloaded HTML to EPUB chapters");
const progress = new cliProgress.SingleBar({ const progress = new cliProgress.SingleBar({
stopOnComplete: true, stopOnComplete: true,
clearOnComplete: true clearOnComplete: true,
format: " {bar} {percentage}% | {time} | {value}/{total}"
}, cliProgress.Presets.shades_classic); }, cliProgress.Presets.shades_classic);
progress.start(chapters.length, 0);
const start = performance.now();
progress.start(chapters.length, 0, { time: " " });
const poolOptions = {}; const poolOptions = {};
if (concurrentJobs !== undefined) { if (concurrentJobs !== undefined) {
@ -21,18 +25,24 @@ module.exports = async (cachePath, manifestPath, contentPath, book, concurrentJo
} }
const pool = workerpool.pool(path.resolve(__dirname, "convert-worker.js"), poolOptions); const pool = workerpool.pool(path.resolve(__dirname, "convert-worker.js"), poolOptions);
const warnings = [];
await Promise.all(chapters.map(async chapter => { await Promise.all(chapters.map(async chapter => {
const inputPath = path.resolve(cachePath, chapter.filename); const inputPath = path.resolve(cachePath, chapter.filename);
const destFileName = `${path.basename(chapter.filename, ".html")}.xhtml`; const destFileName = `${path.basename(chapter.filename, ".html")}.xhtml`;
const outputPath = path.resolve(contentPath, destFileName); const outputPath = path.resolve(contentPath, destFileName);
await pool.exec("convertChapter", [chapter, book, inputPath, outputPath]); warnings.push(...await pool.exec("convertChapter", [chapter, book, inputPath, outputPath]));
progress.increment(); const seconds = String(Math.round((performance.now() - start) / 1000)).padStart(3);
progress.increment({ time: `${seconds} s` });
})); }));
pool.terminate(); pool.terminate();
console.log("All chapters converted"); for (const warning of warnings) {
console.warn(warning);
}
console.log(`All chapters converted in ${Math.round((performance.now() - start) / 100) / 10} seconds`);
}; };

View file

@ -1,7 +1,7 @@
"use strict"; "use strict";
const path = require("path"); const path = require("path");
const fs = require("fs").promises; const fs = require("fs").promises;
const request = require("requisition"); const fetch = require("minipass-fetch");
const { JSDOM } = require("jsdom"); const { JSDOM } = require("jsdom");
const FILENAME_PREFIX = "chapter"; const FILENAME_PREFIX = "chapter";
@ -39,34 +39,21 @@ async function downloadAllChapters(manifest, startChapterURL, cachePath, manifes
while (currentChapter !== null) { while (currentChapter !== null) {
const filename = `${FILENAME_PREFIX}${chapterIndex.toString().padStart(3, "0")}.html`; const filename = `${FILENAME_PREFIX}${chapterIndex.toString().padStart(3, "0")}.html`;
console.log(`Downloading ${currentChapter}`); process.stdout.write(`Downloading ${currentChapter}... `);
const response = await downloadChapter(currentChapter); const { contents, dom, url } = await downloadChapter(currentChapter);
const contents = await response.text(); const title = getChapterTitle(dom.window.document);
console.log("- Response body received"); currentChapter = getNextChapterURL(dom.window.document);
const rawChapterJSDOM = new JSDOM(contents, { url: currentChapter });
console.log("- Response body parsed into DOM");
const chapterURLToSave = currentChapter; dom.window.close();
const chapterTitle = getChapterTitle(rawChapterJSDOM.window.document);
currentChapter = getNextChapterURL(rawChapterJSDOM.window.document);
// TODO: this should probably not be necessary... jsdom bug I guess!?
rawChapterJSDOM.window.close();
manifest.push({
url: chapterURLToSave,
title: chapterTitle,
filename
});
manifest.push({ url, title, filename });
await fs.writeFile(path.resolve(cachePath, filename), contents); await fs.writeFile(path.resolve(cachePath, filename), contents);
console.log("- Response text saved to cache file");
// Incrementally update the manifest after every successful download, instead of waiting until the end. // Incrementally update the manifest after every successful download, instead of waiting until the end.
const newManifestContents = JSON.stringify(manifest, undefined, 2); const newManifestContents = JSON.stringify(manifest, undefined, 2);
await fs.writeFile(manifestPath, newManifestContents); await fs.writeFile(manifestPath, newManifestContents);
console.log("- Manifest updated"); process.stdout.write("done\n");
++chapterIndex; ++chapterIndex;
} }
@ -78,14 +65,21 @@ function getNextChapterURL(rawChapterDoc) {
// - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/ // - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
// So instead search for the first <a> within the main content area starting with "Next", trimmed. // So instead search for the first <a> within the main content area starting with "Next", trimmed.
let result = null;
const aEls = rawChapterDoc.querySelectorAll(".entry-content a"); const aEls = rawChapterDoc.querySelectorAll(".entry-content a");
for (let i = 0; i < aEls.length; ++i) { for (let i = 0; i < aEls.length; ++i) {
if (aEls[i].textContent.trim().startsWith("Next")) { if (aEls[i].textContent.trim().startsWith("Next")) {
return aEls[i].href; result = aEls[i].href;
break;
} }
} }
return null; // Except, this doesn't always work, because the "Next Chapter" link in
// https://www.parahumans.net/2020/04/28/last-20-e6/ is just broken for some reason. We hard-code that.
if (result === "https://www.parahumans.net/?p=3365&preview=true") {
return "https://www.parahumans.net/2020/05/02/last-20-end/";
}
return result;
} }
function getChapterTitle(rawChapterDoc) { function getChapterTitle(rawChapterDoc) {
@ -93,7 +87,7 @@ function getChapterTitle(rawChapterDoc) {
// issues down the line where we remove spaces around em dashes during conversion.) In the future it might be nice to // issues down the line where we remove spaces around em dashes during conversion.) In the future it might be nice to
// have proper chapter titles, e.g. sections per arc with title pages and then just "1" or similar for the chapter. // have proper chapter titles, e.g. sections per arc with title pages and then just "1" or similar for the chapter.
// Until then this is reasonable and uniform. // Until then this is reasonable and uniform.
return rawChapterDoc.querySelector("h1.entry-title").textContent.replace(/ /, " "); return rawChapterDoc.querySelector("h1.entry-title").textContent.replace(/ /u, " ");
} }
function retry(times, fn) { function retry(times, fn) {
@ -106,9 +100,33 @@ function retry(times, fn) {
}); });
} }
/**
 * Downloads a chapter page, following any `<meta http-equiv="refresh">` redirects until a page without one is
 * reached.
 *
 * @param {string} startingURL - The URL to begin downloading from.
 * @returns {Promise<{url: string, contents: string, dom: JSDOM}>} The URL that was finally fetched, its response
 *   text, and the JSDOM parsed from that text. The returned `dom` is left open; the caller closes `dom.window`.
 * @throws {Error} If a refresh tag has no recognizable `url=` target, or if too many redirects are chained.
 */
async function downloadChapter(startingURL) {
  // Guards against pages that refresh to each other forever.
  const MAX_META_REDIRECTS = 10;

  let urlToFollow = startingURL;
  let redirectsFollowed = 0;
  let url, contents, dom;

  while (urlToFollow !== null) {
    const response = await downloadWithRetry(urlToFollow);

    url = urlToFollow;
    contents = await response.text();
    dom = new JSDOM(contents, { url });

    const refreshMeta = dom.window.document.querySelector("meta[http-equiv=refresh]");
    if (refreshMeta) {
      // Accepts both "0;url=..." and "0; url=...".
      const match = /\d+;\s*url=(.*)/ui.exec(refreshMeta.content);

      // This intermediate page is discarded either way, so release its window before continuing or failing.
      dom.window.close();

      if (match === null) {
        throw new Error(`Could not find a redirect target in the refresh tag of ${url}: "${refreshMeta.content}"`);
      }

      ++redirectsFollowed;
      if (redirectsFollowed > MAX_META_REDIRECTS) {
        throw new Error(`Too many redirects (more than ${MAX_META_REDIRECTS}) while downloading ${startingURL}`);
      }

      [, urlToFollow] = match;
      process.stdout.write(`\n Redirected to ${urlToFollow}... `);
    } else {
      urlToFollow = null;
    }
  }

  return { url, contents, dom };
}
function downloadWithRetry(url) {
return retry(3, async () => { return retry(3, async () => {
const response = await request(url).redirects(10); const response = await fetch(url);
if (response.status !== 200) { if (response.status !== 200) {
throw new Error(`Response status for ${url} was ${response.status}`); throw new Error(`Response status for ${url} was ${response.status}`);
} }

View file

@ -4,7 +4,7 @@ const path = require("path");
const cpr = require("util").promisify(require("cpr")); const cpr = require("util").promisify(require("cpr"));
const BOOK_PUBLISHER = "Domenic Denicola"; const BOOK_PUBLISHER = "Domenic Denicola";
const BOOK_AUTHOR = "wildbow"; const BOOK_AUTHOR = "Wildbow";
const NCX_FILENAME = "toc.ncx"; const NCX_FILENAME = "toc.ncx";
@ -22,6 +22,8 @@ module.exports = async (scaffoldingPath, coverPath, bookPath, contentPath, chapt
]); ]);
}) })
]); ]);
console.log(`EPUB contents assembled into ${scaffoldingPath}`);
}; };
function noThumbs(filePath) { function noThumbs(filePath) {
@ -54,7 +56,7 @@ function writeOPF(chapters, contentPath, coverFiles, bookInfo) {
<manifest> <manifest>
<item id="ncx" href="${NCX_FILENAME}" media-type="application/x-dtbncx+xml"/> <item id="ncx" href="${NCX_FILENAME}" media-type="application/x-dtbncx+xml"/>
<item id="cover" href="${coverFiles.xhtml}" media-type="application/xhtml+xml"/> <item id="cover" href="${coverFiles.xhtml}" media-type="application/xhtml+xml"/>
<item id="cover-image" properties="cover-image" href="${coverFiles.image}" media-type="${coverFiles.imageMimeType}"/> <item id="cover-image" href="${coverFiles.image}" media-type="${coverFiles.imageMimeType}"/>
${manifestChapters} ${manifestChapters}
</manifest> </manifest>

File diff suppressed because it is too large Load diff

View file

@ -14,9 +14,10 @@ const zip = require("./zip.js");
const OUTPUT_DEFAULT = "(Book name).epub"; const OUTPUT_DEFAULT = "(Book name).epub";
const argv = yargs const { argv } = yargs
.usage(`${packageJson.description}\n\n${packageJson.name} [<command1> [<command2> [<command3> ...]]]\n\n` + .usage(`${packageJson.description}\n\n${packageJson.name} [<command1> [<command2> [<command3> ...]]]\n\n` +
"Each command will fail if the previously-listed one has not yet been run (with matching options).") "Each command will fail if the previously-listed one has not yet been run (with matching options).\n\n" +
"Running with no commands is equivalent to running download convert scaffold zip.")
.command("download", "download all chapters into the cache") .command("download", "download all chapters into the cache")
.command("convert", "convert the raw HTML into cleaned-up ebook chapters") .command("convert", "convert the raw HTML into cleaned-up ebook chapters")
.command("scaffold", "assemble the table of contents, etc.") .command("scaffold", "assemble the table of contents, etc.")
@ -58,11 +59,9 @@ const argv = yargs
requiresArg: true, requiresArg: true,
global: true global: true
}) })
.demandCommand(1) // TODO remove and allow all
.recommendCommands() .recommendCommands()
.help() .help()
.version() .version();
.argv;
const outputFilename = argv.out === OUTPUT_DEFAULT ? `${books[argv.book].title}.epub` : argv.out; const outputFilename = argv.out === OUTPUT_DEFAULT ? `${books[argv.book].title}.epub` : argv.out;
@ -77,14 +76,18 @@ const chaptersPath = path.resolve(contentPath, "chapters");
const commands = []; const commands = [];
if (argv._.length === 0) {
argv._ = ["download", "convert", "scaffold", "zip"];
}
if (argv._.includes("download")) { if (argv._.includes("download")) {
const startURL = books[argv.book].startURL; const { startURL } = books[argv.book];
commands.push(() => download(startURL, cachePath, manifestPath)); commands.push(() => download(startURL, cachePath, manifestPath));
} }
if (argv._.includes("convert")) { if (argv._.includes("convert")) {
commands.push(() => { commands.push(() => {
return fs.rmdir(chaptersPath, { recursive: true }) return fs.rm(chaptersPath, { force: true, recursive: true, maxRetries: 3 })
.then(() => fs.mkdir(chaptersPath, { recursive: true })) .then(() => fs.mkdir(chaptersPath, { recursive: true }))
.then(() => convert(cachePath, manifestPath, chaptersPath, argv.book, argv.jobs)); .then(() => convert(cachePath, manifestPath, chaptersPath, argv.book, argv.jobs));
}); });
@ -93,7 +96,13 @@ if (argv._.includes("convert")) {
if (argv._.includes("scaffold")) { if (argv._.includes("scaffold")) {
const bookInfo = books[argv.book]; const bookInfo = books[argv.book];
commands.push(() => scaffold( commands.push(() => scaffold(
scaffoldingPath, coverPath, stagingPath, contentPath, chaptersPath, manifestPath, bookInfo scaffoldingPath,
coverPath,
stagingPath,
contentPath,
chaptersPath,
manifestPath,
bookInfo
)); ));
} }
@ -106,8 +115,6 @@ if (argv._.includes("zip")) {
for (const command of commands) { for (const command of commands) {
await command(); await command();
} }
console.log("All done!");
} catch (e) { } catch (e) {
console.error(e.stack); console.error(e.stack);
process.exit(1); process.exit(1);

View file

@ -21,7 +21,7 @@ module.exports = (bookPath, contentPath, outPath) => {
archive.pipe(destStream); archive.pipe(destStream);
// Order matters; mimetype must be first for a valid EPUB // Order matters; mimetype must be first for a valid EPUB
archive.file(path.resolve(bookPath, "mimetype"), { name: "mimetype" }); archive.file(path.resolve(bookPath, "mimetype"), { name: "mimetype", store: true });
archive.directory(contentPath, "OEBPS", { name: "OEBPS" }); archive.directory(contentPath, "OEBPS", { name: "OEBPS" });
archive.directory(path.resolve(bookPath, "META-INF"), "META-INF", { name: "META-INF" }); archive.directory(path.resolve(bookPath, "META-INF"), "META-INF", { name: "META-INF" });

3812
npm-shrinkwrap.json generated

File diff suppressed because it is too large Load diff

View file

@ -8,7 +8,7 @@
"parahuman", "parahuman",
"scraper" "scraper"
], ],
"version": "4.4.0", "version": "5.1.0",
"author": "Domenic Denicola <d@domenic.me> (https://domenic.me/)", "author": "Domenic Denicola <d@domenic.me> (https://domenic.me/)",
"license": "WTFPL", "license": "WTFPL",
"repository": "domenic/worm-scraper", "repository": "domenic/worm-scraper",
@ -23,18 +23,19 @@
"lint": "eslint lib" "lint": "eslint lib"
}, },
"dependencies": { "dependencies": {
"archiver": "^5.0.2", "archiver": "^5.3.1",
"cli-progress": "^3.8.2", "cli-progress": "^3.11.1",
"cpr": "^3.0.1", "cpr": "^3.0.1",
"jsdom": "^16.4.0", "jsdom": "^19.0.0",
"requisition": "^1.5.0", "minipass-fetch": "^2.1.0",
"workerpool": "^6.0.2", "workerpool": "^6.2.1",
"yargs": "^16.1.0" "yargs": "^17.5.1"
}, },
"devDependencies": { "devDependencies": {
"eslint": "^7.11.0" "@domenic/eslint-config": "^2.0.0",
"eslint": "^8.16.0"
}, },
"engines": { "engines": {
"node": ">=12.10.0" "node": ">=16.13.2"
} }
} }