Compare commits
225 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8f73d57a5a | ||
|
|
b25a0d3a65 |
||
|
|
752c4c3916 | ||
|
|
43a80d5dd9 | ||
|
|
7c9d96578b | ||
|
|
ac56d1c4c7 | ||
|
|
12394da334 |
||
|
|
c860873d78 | ||
|
|
360f108c1c | ||
|
|
f3893e2f3f | ||
|
|
32c76a59a5 | ||
|
|
ed070daf9f | ||
|
|
96e3e837e6 | ||
|
|
bfdb9eadde | ||
|
|
48400e6f96 |
||
|
|
b94c33ea6f | ||
|
|
ba387d3555 | ||
|
|
a405adf6b7 | ||
|
|
0efaf38170 | ||
|
|
ab226686ae | ||
|
|
66f7856a0f | ||
|
|
848b090b0d | ||
|
|
db1b3d9e97 | ||
|
|
c6f7460b82 | ||
|
|
3865ae0f5b | ||
|
|
d4663ed2e3 | ||
|
|
4a2d33f968 | ||
|
|
07f3011423 | ||
|
|
e1f9320524 | ||
|
|
18874a335c | ||
|
|
d5f3b58f0b | ||
|
|
0be098ff16 | ||
|
|
54dc72b182 | ||
|
|
28d4c6927a | ||
|
|
cc2db87b58 | ||
|
|
a89414392e | ||
|
|
89789724d1 | ||
|
|
5a25df658b | ||
|
|
f3366e8346 | ||
|
|
0b4af123ab | ||
|
|
f5f0ba8e61 | ||
|
|
08e0d0d9a8 | ||
|
|
5d9a031c02 | ||
|
|
b652e3812b | ||
|
|
26a2b9c9b5 | ||
|
|
2fb075128a | ||
|
|
ed392d8b98 | ||
|
|
d0c23d86fd | ||
|
|
369714f3d1 | ||
|
|
06e43dcf16 | ||
|
|
e1b59994f8 | ||
|
|
b67f4032f5 | ||
|
|
651944b4da | ||
|
|
44f2cc3c7b | ||
|
|
294cbb2e71 | ||
|
|
6256b332cb | ||
|
|
bf8a1b325c | ||
|
|
6f51bc6c9a | ||
|
|
877beda733 | ||
|
|
6ddde06817 | ||
|
|
aa9fc197e9 | ||
|
|
a1c7f00b42 | ||
|
|
b9d396f6f5 | ||
|
|
1a0780bd7b | ||
|
|
5b7ec80750 | ||
|
|
442d245e2d | ||
|
|
9fc36b813f | ||
|
|
631417a530 | ||
|
|
8a9562e10e | ||
|
|
5ff6621b31 | ||
|
|
3e06358fa2 | ||
|
|
3aece3e05e | ||
|
|
4908956f0c | ||
|
|
730cc512e3 | ||
|
|
cc151355fd | ||
|
|
a86e21b846 | ||
|
|
fc641af4f5 | ||
|
|
6e59181524 | ||
|
|
ecea0f5660 | ||
|
|
e3ef2e254a | ||
|
|
54f2f82650 | ||
|
|
0c22f7df11 | ||
|
|
d12c48976b | ||
|
|
2e4a3e56dc | ||
|
|
5c80327fef | ||
|
|
2412fa0375 | ||
|
|
32817eb255 | ||
|
|
c128712bb4 | ||
|
|
d62739f6cd | ||
|
|
f3063854ff | ||
|
|
1328dfd8e3 | ||
|
|
5310672cc2 | ||
|
|
79efec7080 | ||
|
|
ee76715935 | ||
|
|
2a09195471 | ||
|
|
4c1c7cd03d | ||
|
|
de83a47bfa | ||
|
|
3ec6e36e34 | ||
|
|
70e1ff0281 | ||
|
|
6751136bec | ||
|
|
1da99791d3 | ||
|
|
ea19dbb6c5 | ||
|
|
424f13f169 | ||
|
|
b613514994 | ||
|
|
16ef1836da | ||
|
|
85b5e142da | ||
|
|
2eafeee814 | ||
|
|
af03064221 | ||
|
|
22fbff008e | ||
|
|
c9d51787e2 | ||
|
|
53f7307daa | ||
|
|
20d91d37df | ||
|
|
60709f54c2 | ||
|
|
06c7b3adf2 | ||
|
|
3284c04f8b | ||
|
|
121ab01243 | ||
|
|
abca01b1d6 | ||
|
|
b113b240cc | ||
|
|
d69484af8e | ||
|
|
21bded5650 | ||
|
|
b20182ad0d | ||
|
|
b017ab54d1 | ||
|
|
54093109bf | ||
|
|
aea63ba055 | ||
|
|
8daeebe507 | ||
|
|
41566a380e | ||
|
|
584a52fc27 | ||
|
|
075db7f446 | ||
|
|
d2402b1bb2 | ||
|
|
05a9abab91 | ||
|
|
9da59c9104 | ||
|
|
631897b011 | ||
|
|
1a2b6de78e | ||
|
|
15e34e47a3 | ||
|
|
c5b13e7cc1 | ||
|
|
71d5352eb1 | ||
|
|
13b25835ba | ||
|
|
326ec148c9 | ||
|
|
6c6b360dac | ||
|
|
6ee5fb9ef6 | ||
|
|
33d88eb523 | ||
|
|
f70ba64629 | ||
|
|
58b0856deb | ||
|
|
7d5167c952 | ||
|
|
481e9cc3a8 | ||
|
|
0053545444 | ||
|
|
56a38609ca | ||
|
|
6cdf486858 | ||
|
|
e364cb2b22 | ||
|
|
b85e8a68b1 | ||
|
|
547b7e9518 | ||
|
|
b4454f8432 | ||
|
|
411defc153 | ||
|
|
24809f607b | ||
|
|
dd2b5b15ce | ||
|
|
e24bbc462b | ||
|
|
ff09d1fd76 | ||
|
|
3810138e15 | ||
|
|
056dc9d514 | ||
|
|
bcca56866c | ||
|
|
699c9be71f | ||
|
|
fb93322823 | ||
|
|
50dce93587 | ||
|
|
0fc22d064d | ||
|
|
4fbaa0e8db | ||
|
|
6b5bedcbc7 | ||
|
|
ab28acfd9a | ||
|
|
6406a0f370 | ||
|
|
367b69221a | ||
|
|
695bb24d3c | ||
|
|
3905965134 | ||
|
|
e8921bbbfe | ||
|
|
304955df72 | ||
|
|
3e2bc4a447 | ||
|
|
5cbbd8a6fb | ||
|
|
b6bce1244d | ||
|
|
e2277751c0 | ||
|
|
ad2c0b0ca5 | ||
|
|
6848456bc9 | ||
|
|
eebd6cb669 | ||
|
|
68eabd0ca0 | ||
|
|
d4c67d73df | ||
|
|
4d9e556431 | ||
|
|
ae5b179d7c | ||
|
|
51520c11be | ||
|
|
e13ca0de74 | ||
|
|
24d2a0a1ab | ||
|
|
7e9da35803 | ||
|
|
5a6f3e8cfb | ||
|
|
f300159db3 | ||
|
|
bb5c652a3d | ||
|
|
697f8a76ea | ||
|
|
97228bf88d | ||
|
|
e71da44ca2 | ||
|
|
4e59937e19 | ||
|
|
84e0d83bd1 | ||
|
|
26b4cb07af | ||
|
|
24b01e4292 | ||
|
|
4091530bb2 | ||
|
|
cac9a924dc | ||
|
|
eec513bb8d | ||
|
|
91f02bd029 | ||
|
|
176d88eea2 | ||
|
|
367b9e6c5c | ||
|
|
36502fe3f6 | ||
|
|
bd4360a793 | ||
|
|
84a9540497 | ||
|
|
6ba3246598 | ||
|
|
851543f44d | ||
|
|
2d856c1de8 | ||
|
|
60c23f0240 | ||
|
|
27eae8860b | ||
|
|
aafa10d874 | ||
|
|
ff991028af | ||
|
|
9e112adb81 | ||
|
|
559681e4ec | ||
|
|
2593540551 | ||
|
|
354fb46819 | ||
|
|
a700e29299 | ||
|
|
fd914b43f5 | ||
|
|
27e14360c9 | ||
|
|
ecdadf8fd9 | ||
|
|
de47bd36e2 | ||
|
|
f8d7e03a40 | ||
|
|
4386d48021 |
22 changed files with 9350 additions and 2375 deletions
255
.eslintrc.json
255
.eslintrc.json
|
|
@ -1,259 +1,10 @@
|
|||
{
|
||||
"root": true,
|
||||
"extends": "@domenic",
|
||||
"env": {
|
||||
"node": true,
|
||||
"es6": true
|
||||
},
|
||||
"parserOptions": {
|
||||
"ecmaVersion": 2017
|
||||
"node": true
|
||||
},
|
||||
"rules": {
|
||||
// Possible errors
|
||||
"no-await-in-loop": "off",
|
||||
"comma-dangle": ["error", "never"],
|
||||
"no-cond-assign": ["error", "except-parens"],
|
||||
"no-console": "off",
|
||||
"no-constant-condition": "error",
|
||||
"no-control-regex": "error",
|
||||
"no-debugger": "error",
|
||||
"no-dupe-args": "error",
|
||||
"no-dupe-keys": "error",
|
||||
"no-duplicate-case": "error",
|
||||
"no-empty": "error",
|
||||
"no-empty-character-class": "error",
|
||||
"no-ex-assign": "error",
|
||||
"no-extra-boolean-cast": "error",
|
||||
"no-extra-parens": ["error", "all", { "conditionalAssign": false, "nestedBinaryExpressions": false }],
|
||||
"no-extra-semi": "error",
|
||||
"no-func-assign": "error",
|
||||
"no-inner-declarations": "off",
|
||||
"no-invalid-regexp": "error",
|
||||
"no-irregular-whitespace": "error",
|
||||
"no-obj-calls": "error",
|
||||
"no-prototype-builtins": "error",
|
||||
"no-regex-spaces": "error",
|
||||
"no-sparse-arrays": "error",
|
||||
"no-template-curly-in-string": "error",
|
||||
"no-unexpected-multiline": "error",
|
||||
"no-unreachable": "error",
|
||||
"no-unsafe-finally": "off",
|
||||
"no-unsafe-negation": "error",
|
||||
"use-isnan": "error",
|
||||
"valid-jsdoc": "off",
|
||||
"valid-typeof": "error",
|
||||
|
||||
// Best practices
|
||||
"accessor-pairs": "error",
|
||||
"array-callback-return": "error",
|
||||
"block-scoped-var": "off",
|
||||
"class-methods-use-this": "error",
|
||||
"complexity": "off",
|
||||
"consistent-return": "error",
|
||||
"curly": ["error", "all"],
|
||||
"default-case": "off",
|
||||
"dot-location": ["error", "property"],
|
||||
"dot-notation": "error",
|
||||
"eqeqeq": "error",
|
||||
"guard-for-in": "off",
|
||||
"no-alert": "error",
|
||||
"no-caller": "error",
|
||||
"no-case-declarations": "error",
|
||||
"no-div-regex": "off",
|
||||
"no-else-return": "error",
|
||||
"no-empty-function": "error",
|
||||
"no-empty-pattern": "error",
|
||||
"no-eq-null": "error",
|
||||
"no-eval": "error",
|
||||
"no-extend-native": "error",
|
||||
"no-extra-bind": "error",
|
||||
"no-extra-label": "error",
|
||||
"no-fallthrough": "error",
|
||||
"no-floating-decimal": "error",
|
||||
"no-global-assign": "error",
|
||||
"no-implicit-coercion": "error",
|
||||
"no-implicit-globals": "error",
|
||||
"no-implied-eval": "off",
|
||||
"no-invalid-this": "error",
|
||||
"no-iterator": "error",
|
||||
"no-labels": ["error", { "allowLoop": true }],
|
||||
"no-lone-blocks": "error",
|
||||
"no-loop-func": "off",
|
||||
"no-magic-numbers": "off",
|
||||
"no-multi-spaces": "error",
|
||||
"no-multi-str": "error",
|
||||
"no-new": "error",
|
||||
"no-new-func": "error",
|
||||
"no-new-wrappers": "error",
|
||||
"no-octal": "error",
|
||||
"no-octal-escape": "error",
|
||||
"no-param-reassign": "off",
|
||||
"no-process-env": "error",
|
||||
"no-proto": "error",
|
||||
"no-redeclare": "error",
|
||||
"no-restricted-properties": "off",
|
||||
"no-return-assign": ["error", "except-parens"],
|
||||
"no-return-await": "error",
|
||||
"no-script-url": "off",
|
||||
"no-self-assign": "error",
|
||||
"no-self-compare": "error",
|
||||
"no-sequences": "error",
|
||||
"no-throw-literal": "error",
|
||||
"no-unmodified-loop-condition": "error",
|
||||
"no-unused-expressions": "error",
|
||||
"no-unused-labels": "error",
|
||||
"no-useless-call": "error",
|
||||
"no-useless-concat": "error",
|
||||
"no-useless-escape": "error",
|
||||
"no-useless-return": "error",
|
||||
"no-void": "error",
|
||||
"no-warning-comments": "off",
|
||||
"no-with": "error",
|
||||
"radix": ["error", "as-needed"],
|
||||
"require-await": "error",
|
||||
"vars-on-top": "off",
|
||||
"wrap-iife": ["error", "outside"],
|
||||
"yoda": ["error", "never"],
|
||||
|
||||
// Strict Mode
|
||||
"strict": ["error", "global"],
|
||||
|
||||
// Variables
|
||||
"init-declarations": "off",
|
||||
"no-catch-shadow": "error",
|
||||
"no-delete-var": "error",
|
||||
"no-label-var": "error",
|
||||
"no-restricted-globals": "off",
|
||||
"no-shadow": "error",
|
||||
"no-shadow-restricted-names": "error",
|
||||
"no-undef": "error",
|
||||
"no-undef-init": "error",
|
||||
"no-undefined": "off",
|
||||
"no-unused-vars": "error",
|
||||
"no-use-before-define": ["error", "nofunc"],
|
||||
|
||||
// Node.js and CommonJS
|
||||
"callback-return": "off",
|
||||
"global-require": "error",
|
||||
"handle-callback-err": "error",
|
||||
"no-mixed-requires": ["error", true],
|
||||
"no-new-require": "error",
|
||||
"no-path-concat": "error",
|
||||
"no-process-exit": "error",
|
||||
"no-restricted-imports": "off",
|
||||
"no-restricted-modules": "off",
|
||||
"no-sync": "off",
|
||||
|
||||
// Stylistic Issues
|
||||
"array-bracket-spacing": ["error", "never"],
|
||||
"block-spacing": ["error", "always"],
|
||||
"brace-style": ["error", "1tbs", { "allowSingleLine": false }],
|
||||
"camelcase": ["error", { "properties": "always" }],
|
||||
"capitalized-comments": ["error", "always", { "ignoreConsecutiveComments": true }],
|
||||
"comma-spacing": ["error", { "before": false, "after": true }],
|
||||
"comma-style": ["error", "last"],
|
||||
"computed-property-spacing": ["error", "never"],
|
||||
"consistent-this": "off",
|
||||
"eol-last": "error",
|
||||
"func-call-spacing": ["error", "never"],
|
||||
"func-name-matching": ["error", "always"],
|
||||
"func-names": ["error", "never"],
|
||||
"func-style": ["error", "declaration"],
|
||||
"id-blacklist": "off",
|
||||
"id-length": "off",
|
||||
"id-match": "off",
|
||||
"indent": ["error", 2, { "SwitchCase": 1 }],
|
||||
"jsx-quotes": "off",
|
||||
"key-spacing": ["error", { "beforeColon": false, "afterColon": true, "mode": "strict" }],
|
||||
"keyword-spacing": ["error", { "before": true, "after": true }],
|
||||
"line-comment-position": "off",
|
||||
"linebreak-style": ["error", "unix"],
|
||||
"lines-around-comment": "off",
|
||||
"lines-around-directive": "off",
|
||||
"max-depth": "off",
|
||||
"max-len": ["error", 120, { "ignoreUrls": true }],
|
||||
"max-lines": "off",
|
||||
"max-nested-callbacks": "off",
|
||||
"max-params": "off",
|
||||
"max-statements": "off",
|
||||
"max-statements-per-line": ["error", { "max": 1 }],
|
||||
"multiline-ternary": "off",
|
||||
"new-cap": "error",
|
||||
"new-parens": "error",
|
||||
"newline-after-var": "off",
|
||||
"newline-before-return": "off",
|
||||
"newline-per-chained-call": "off",
|
||||
"no-array-constructor": "error",
|
||||
"no-bitwise": "off",
|
||||
"no-continue": "off",
|
||||
"no-inline-comments": "off",
|
||||
"no-lonely-if": "error",
|
||||
"no-mixed-operators": "error",
|
||||
"no-mixed-spaces-and-tabs": "error",
|
||||
"no-multiple-empty-lines": "error",
|
||||
"no-negated-condition": "off",
|
||||
"no-nested-ternary": "error",
|
||||
"no-new-object": "error",
|
||||
"no-plusplus": "off",
|
||||
"no-restricted-syntax": "off",
|
||||
"no-tabs": "error",
|
||||
"no-ternary": "off",
|
||||
"no-trailing-spaces": "error",
|
||||
"no-underscore-dangle": "off",
|
||||
"no-unneeded-ternary": "error",
|
||||
"no-whitespace-before-property": "error",
|
||||
"object-curly-newline": ["error", { "multiline": true }],
|
||||
"object-curly-spacing": ["error", "always"],
|
||||
"object-property-newline": "off",
|
||||
"one-var": ["error", "never"],
|
||||
"one-var-declaration-per-line": ["error", "initializations"],
|
||||
"operator-assignment": ["error", "always"],
|
||||
"operator-linebreak": ["error", "after"],
|
||||
"padded-blocks": ["error", "never"],
|
||||
"quote-props": ["error", "as-needed"],
|
||||
"quotes": ["error", "double", { "avoidEscape": true, "allowTemplateLiterals": true }],
|
||||
"require-jsdoc": "off",
|
||||
"semi": ["error", "always"],
|
||||
"semi-spacing": "error",
|
||||
"sort-keys": "off",
|
||||
"sort-vars": "off",
|
||||
"space-before-blocks": ["error", "always"],
|
||||
"space-before-function-paren": ["error", { "anonymous": "always", "named": "never" }],
|
||||
"space-in-parens": ["error", "never"],
|
||||
"space-infix-ops": "error",
|
||||
"space-unary-ops": ["error", { "words": true, "nonwords": false }],
|
||||
"spaced-comment": ["error", "always", { "markers": ["///"] }],
|
||||
"unicode-bom": ["error", "never"],
|
||||
"wrap-regex": "off",
|
||||
|
||||
// ECMAScript 6
|
||||
"arrow-body-style": "off", // meh
|
||||
"arrow-parens": ["error", "as-needed"],
|
||||
"arrow-spacing": "error",
|
||||
"constructor-super": "error",
|
||||
"generator-star-spacing": ["error", "after"],
|
||||
"no-class-assign": "error",
|
||||
"no-confusing-arrow": "off",
|
||||
"no-const-assign": "error",
|
||||
"no-dupe-class-members": "error",
|
||||
"no-duplicate-imports": "error",
|
||||
"no-new-symbol": "error",
|
||||
"no-this-before-super": "error",
|
||||
"no-useless-computed-key": "error",
|
||||
"no-useless-constructor": "error",
|
||||
"no-useless-rename": "error",
|
||||
"no-var": "error",
|
||||
"object-shorthand": "error",
|
||||
"prefer-arrow-callback": "error",
|
||||
"prefer-const": "error",
|
||||
"prefer-numeric-literals": "error",
|
||||
"prefer-rest-params": "error",
|
||||
"prefer-spread": "error",
|
||||
"prefer-template": "off",
|
||||
"require-yield": "error",
|
||||
"rest-spread-spacing": ["error", "never"],
|
||||
"sort-imports": "off",
|
||||
"symbol-description": "error",
|
||||
"template-curly-spacing": ["error", "never"],
|
||||
"yield-star-spacing": ["error", "after"]
|
||||
"no-console": "off"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
50
.github/workflows/test.yml
vendored
Normal file
50
.github/workflows/test.yml
vendored
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
name: Test
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- master
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
jobs:
|
||||
test:
|
||||
name: Test
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- uses: actions/setup-node@v2
|
||||
with:
|
||||
node-version: 16
|
||||
|
||||
- run: npm install
|
||||
|
||||
- run: npm run lint
|
||||
|
||||
# CI would take too long if we did the download every time too. So, we cache it. This does mean we're vulnerable to
|
||||
# source changes exposing problems in our code, but those are pretty infrequent. If they occur, we need to bump the
|
||||
# cache key.
|
||||
- uses: actions/cache@v2
|
||||
with:
|
||||
key: worm-ward-cache-2021-01-17
|
||||
path: ./cache
|
||||
|
||||
- run: node ./lib/worm-scraper.js --book=worm
|
||||
|
||||
- run: node ./lib/worm-scraper.js --book=ward
|
||||
|
||||
- uses: actions/setup-java@v1
|
||||
with:
|
||||
java-version: 15
|
||||
java-package: jre
|
||||
|
||||
- name: Get EPUBCheck
|
||||
run: |
|
||||
curl https://github.com/w3c/epubcheck/releases/download/v4.2.4/epubcheck-4.2.4.zip --location --output epubcheck.zip
|
||||
unzip epubcheck.zip
|
||||
|
||||
- name: Check Worm.epub
|
||||
run: java -jar epubcheck-4.2.4/epubcheck.jar --failonwarnings Worm.epub
|
||||
|
||||
- name: Check Ward.epub
|
||||
run: java -jar epubcheck-4.2.4/epubcheck.jar --failonwarnings Ward.epub
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -2,5 +2,5 @@
|
|||
/npm-debug.log
|
||||
|
||||
cache/
|
||||
book/
|
||||
Worm.epub
|
||||
staging/
|
||||
*.epub
|
||||
|
|
|
|||
|
|
@ -1,5 +0,0 @@
|
|||
language: node_js
|
||||
node_js:
|
||||
- stable
|
||||
script:
|
||||
npm run lint
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
Copyright © 2015 Domenic Denicola <d@domenic.me>
|
||||
Copyright © Domenic Denicola <d@domenic.me>
|
||||
|
||||
This work is free. You can redistribute it and/or modify it under the
|
||||
terms of the Do What The Fuck You Want To Public License, Version 2,
|
||||
|
|
|
|||
24
README.md
24
README.md
|
|
@ -1,33 +1,45 @@
|
|||
# _Worm_ Scraper
|
||||
|
||||
Scrapes the web serial [_Worm_](https://parahumans.wordpress.com/) into an eBook format.
|
||||
Scrapes the web serial [_Worm_](https://parahumans.wordpress.com/) and its sequel [_Ward_](https://www.parahumans.net/) into an eBook format.
|
||||
|
||||
## How to use
|
||||
|
||||
First you'll need a modern version of [Node.js](https://nodejs.org/en/). Install whatever is current (not LTS); at least v8.x is necessary.
|
||||
First you'll need a modern version of [Node.js](https://nodejs.org/en/). At least v16.13.2 is necessary.
|
||||
|
||||
Then, open a terminal ([Mac documentation](http://blog.teamtreehouse.com/introduction-to-the-mac-os-x-command-line), [Windows documentation](http://www.howtogeek.com/235101/10-ways-to-open-the-command-prompt-in-windows-10/)) and install the program by typing
|
||||
|
||||
```
|
||||
```bash
|
||||
npm install -g worm-scraper
|
||||
```
|
||||
|
||||
This will take a while as it downloads this program and its dependencies from the internet. Once it's done, try to run it, by typing:
|
||||
|
||||
```
|
||||
```bash
|
||||
worm-scraper --help
|
||||
```
|
||||
|
||||
If this outputs some help documentation, then the installation process went smoothly. You can move on to assemble the eBook by typing
|
||||
|
||||
```
|
||||
worm-scraper download convert scaffold zip
|
||||
```bash
|
||||
worm-scraper
|
||||
```
|
||||
|
||||
This will take a while, but will eventually produce a `Worm.epub` file!
|
||||
|
||||
If you'd like to get _Ward_ instead of _Worm_, use `--book=ward`, e.g.
|
||||
|
||||
```bash
|
||||
worm-scraper --book=ward
|
||||
```
|
||||
|
||||
## EPUB vs. other formats
|
||||
|
||||
EPUB is one of the primary eBook formats, but not every reader supports it; most Amazon Kindle devices, for example, do not. You can use an online converter or other tool to convert the EPUB to Kindle MOBI or any other format.
|
||||
|
||||
Alternately, if you are a developer, a pull request adding support for MOBI output would be appreciated; please open an issue to discuss how you plan to proceed.
|
||||
|
||||
## Text fixups
|
||||
|
||||
This project makes a lot of fixups to the original text, mostly around typos, punctuation, capitalization, and consistency. You can get a more specific idea of what these are via the code; there's [`convert-worker.js`](https://github.com/domenic/worm-scraper/blob/master/lib/convert-worker.js), where some things are handled generally, and [`substitutions.json`](https://github.com/domenic/worm-scraper/blob/master/lib/substitutions.json), for one-off fixes.
|
||||
|
||||
This process is designed to be extensible, so if you notice any problems with the original text that you think should be fixed, file an issue to let me know, and we can update the fixup code so that the resulting eBook is improved. (Or better yet, send a pull request!)
|
||||
|
|
|
|||
11
covers/README.md
Normal file
11
covers/README.md
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
# Cover credits
|
||||
|
||||
The _Worm_ cover is assembled from:
|
||||
|
||||
- [Ari Ibarra's fanart](https://www.instagram.com/p/B1wSi1Ynaze/) on Instagram
|
||||
- The "Wildbow's Past Works" image for _Worm_ on [parahumans.net](https://www.parahumans.net/)
|
||||
|
||||
The _Ward_ cover is assembled from:
|
||||
|
||||
- [zearoe's fanart](https://www.reddit.com/r/Parahumans/comments/b8n7o0/fanartrepost_antares/) on Reddit
|
||||
- The header image on [parahumans.net](https://www.parahumans.net/)
|
||||
BIN
covers/ward/cover.jpg
Normal file
BIN
covers/ward/cover.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 768 KiB |
|
|
@ -3,9 +3,11 @@
|
|||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>Cover</title>
|
||||
<style>
|
||||
<style type="text/css">
|
||||
body {
|
||||
text-align: center;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
img {
|
||||
max-width: 100%;
|
||||
|
|
@ -15,6 +17,8 @@
|
|||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<img src="cover.png" alt="Worm"/>
|
||||
<div>
|
||||
<img src="cover.jpg" alt=""/>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
BIN
covers/worm/cover.jpg
Normal file
BIN
covers/worm/cover.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 374 KiB |
24
covers/worm/cover.xhtml
Normal file
24
covers/worm/cover.xhtml
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>Cover</title>
|
||||
<style type="text/css">
|
||||
body {
|
||||
text-align: center;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
img {
|
||||
max-width: 100%;
|
||||
height: 100%;
|
||||
margin: 0 auto;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div>
|
||||
<img src="cover.jpg" alt=""/>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
24
lib/books.js
Normal file
24
lib/books.js
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
"use strict";
|
||||
|
||||
// Metadata record for _Worm_: the URL of chapter 1.1, the book title, a UUID-shaped identifier, and a description.
// NOTE(review): how `startURL` and `id` are consumed is not visible from this file — presumably the crawl entry point
// and the eBook's unique identifier, respectively; confirm against the callers.
exports.worm = {
|
||||
startURL: "https://parahumans.wordpress.com/2011/06/11/1-1/",
|
||||
title: "Worm",
|
||||
id: "e7f3532d-8db6-4888-be80-1976166b7059",
|
||||
|
||||
// First paragraph of https://parahumans.wordpress.com/about/
|
||||
// eslint-disable-next-line max-len
|
||||
description: `An introverted teenage girl with an unconventional superpower, Taylor goes out in costume to find escape from a deeply unhappy and frustrated civilian life. Her first attempt at taking down a supervillain sees her mistaken for one, thrusting her into the midst of the local ‘cape’ scene’s politics, unwritten rules, and ambiguous morals. As she risks life and limb, Taylor faces the dilemma of having to do the wrong things for the right reasons.`
|
||||
};
|
||||
|
||||
// Metadata record for _Ward_: the URL of chapter "Daybreak 1.1", the book title, a UUID-shaped identifier, and a
// two-paragraph description.
// NOTE(review): how `startURL` and `id` are consumed is not visible from this file — presumably the crawl entry point
// and the eBook's unique identifier, respectively; confirm against the callers.
exports.ward = {
|
||||
startURL: "https://www.parahumans.net/2017/09/11/daybreak-1-1/",
|
||||
title: "Ward",
|
||||
id: "a6b6b156-2f17-43c0-8bb1-bfa91f3ef62a",
|
||||
|
||||
// Synopsis from https://www.parahumans.net/
|
||||
/* eslint-disable max-len */
|
||||
description: `The unwritten rules that govern the fights and outright wars between ‘capes’ have been amended: everyone gets their second chance. It’s an uneasy thing to come to terms with when notorious supervillains and even monsters are playing at being hero. The world ended two years ago, and as humanity straddles the old world and the new, there aren’t records, witnesses, or facilities to answer the villains’ past actions in the present. One of many compromises, uneasy truces and deceptions that are starting to splinter as humanity rebuilds.
|
||||
|
||||
None feel the injustice of this new status quo or the lack of established footing more than the past residents of the parahuman asylums. The facilities hosted parahumans and their victims, but the facilities are ruined or gone; one of many fragile ex-patients is left to find a place in a fractured world. She’s perhaps the person least suited to have anything to do with this tenuous peace or to stand alongside these false heroes. She’s put in a position to make the decision: will she compromise to help forge what they call, with dark sentiment, a second golden age? Or will she stand tall as a gilded dark age dawns?`
|
||||
/* eslint-enable max-len */
|
||||
};
|
||||
833
lib/convert-worker.js
Normal file
833
lib/convert-worker.js
Normal file
|
|
@ -0,0 +1,833 @@
|
|||
"use strict";
|
||||
const workerpool = require("workerpool");
|
||||
const fs = require("fs");
|
||||
const { JSDOM } = require("jsdom");
|
||||
const substitutions = require("./substitutions.json");
|
||||
|
||||
workerpool.worker({ convertChapter });
|
||||
|
||||
/**
 * Converts one raw chapter HTML file into an XHTML chapter file.
 *
 * Reads `inputPath`, parses it with jsdom, builds the chapter document with `getChapterString()`, and writes the
 * result to `outputPath`.
 *
 * @param {object} chapter - Chapter record; passed through unchanged to `getChapterString()`.
 * @param {*} book - Book selector; passed through unchanged to `getChapterString()`.
 * @param {string} inputPath - Path of the raw chapter HTML to read (decoded as UTF-8).
 * @param {string} outputPath - Path the generated XHTML is written to.
 * @returns {Array} The warnings returned by `getChapterString()`.
 */
function convertChapter(chapter, book, inputPath, outputPath) {
  const contents = fs.readFileSync(inputPath, { encoding: "utf-8" });

  const rawChapterJSDOM = new JSDOM(contents);

  let result;
  try {
    result = getChapterString(chapter, book, rawChapterJSDOM.window.document);
  } finally {
    // TODO: this should probably not be necessary... jsdom bug I guess!?
    // Closing in `finally` releases the window even when the conversion throws, so a failing chapter does not leave
    // an open jsdom window behind.
    rawChapterJSDOM.window.close();
  }

  fs.writeFileSync(outputPath, result.output);
  return result.warnings;
}
|
||||
|
||||
/**
 * Builds the complete XHTML document for one chapter.
 *
 * The `.entry-content` element of `rawChapterDoc` is handed to `getBodyXML()`, and the XML it returns is wrapped in
 * an XHTML 1.1 skeleton whose `<title>` is the chapter title.
 *
 * @param {object} chapter - Chapter record; `chapter.title` is used as the document title.
 * @param {*} book - Book selector; passed through unchanged to `getBodyXML()`.
 * @param {Document} rawChapterDoc - Parsed document of the raw chapter page.
 * @returns {{ output: string, warnings: Array }} The serialized document and the warnings from `getBodyXML()`.
 */
function getChapterString(chapter, book, rawChapterDoc) {
  const { xml, warnings } =
    getBodyXML(chapter, book, rawChapterDoc.querySelector(".entry-content"));

  // The title is plain text, so it must be escaped before being spliced into markup; otherwise a title containing
  // "&", "<" or ">" would make the generated XHTML malformed. "&" is replaced first so the entities produced by the
  // later replacements are not escaped a second time.
  const escapedTitle = String(chapter.title)
    .replace(/&/ug, "&amp;")
    .replace(/</ug, "&lt;")
    .replace(/>/ug, "&gt;");

  const output = `<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8" />
<title>${escapedTitle}</title>
</head>
${xml}
</html>`;

  return { output, warnings };
}
|
||||
|
||||
function getBodyXML(chapter, book, contentEl) {
|
||||
const warnings = [];
|
||||
|
||||
// Remove initial Next Chapter and Previous Chapter <p>
|
||||
contentEl.firstElementChild.remove();
|
||||
|
||||
// Remove everything after the last <p> (e.g. analytics <div>s)
|
||||
const lastP = contentEl.querySelector("p:last-of-type");
|
||||
while (contentEl.lastElementChild !== lastP) {
|
||||
contentEl.lastElementChild.remove();
|
||||
}
|
||||
|
||||
// Remove empty <p>s or Last Chapter/Next Chapter <p>s
|
||||
while (isEmptyOrGarbage(contentEl.lastElementChild)) {
|
||||
contentEl.lastElementChild.remove();
|
||||
}
|
||||
|
||||
// Remove redundant attributes and style
|
||||
for (const child of contentEl.children) {
|
||||
if (child.getAttribute("dir") === "ltr") {
|
||||
child.removeAttribute("dir");
|
||||
}
|
||||
|
||||
// Only ever appears with align="LEFT" (useless) or align="CENTER" overridden by style="text-align: left;" (also
|
||||
// useless)
|
||||
child.removeAttribute("align");
|
||||
|
||||
const style = child.getAttribute("style");
|
||||
if (style === "text-align:left;" || style === "text-align: left;") {
|
||||
child.removeAttribute("style");
|
||||
}
|
||||
|
||||
// Worm uses 30px; Ward mostly uses 40px but sometimes uses 30px/60px. Let's standardize on 30px.
|
||||
if (style === "text-align:left;padding-left:30px;" ||
|
||||
style === "text-align: left;padding-left: 40px;" ||
|
||||
style === "text-align: left; padding-left: 40px;" ||
|
||||
style === "padding-left: 40px;") {
|
||||
child.setAttribute("style", "padding-left: 30px;");
|
||||
}
|
||||
}
|
||||
|
||||
// Remove empty inline elements.
|
||||
// Remove style attributes from inline elements, as they're always messed up.
|
||||
for (const el of contentEl.querySelectorAll("em, i, strong, b")) {
|
||||
const { textContent } = el;
|
||||
|
||||
if (textContent === "") {
|
||||
el.remove();
|
||||
} else if (textContent.trim() === "") {
|
||||
if (el.childElementCount === 0) {
|
||||
el.replaceWith(" ");
|
||||
} else if (el.childElementCount === 1 && el.children[0].localName === "br") {
|
||||
el.outerHTML = "<br />\n";
|
||||
}
|
||||
} else {
|
||||
el.removeAttribute("style");
|
||||
}
|
||||
}
|
||||
|
||||
// In https://parahumans.wordpress.com/2013/01/05/monarch-16-13/ there are some <address>s that should be <p>s O_o
|
||||
for (const address of contentEl.querySelectorAll("address")) {
|
||||
const p = contentEl.ownerDocument.createElement("p");
|
||||
p.innerHTML = address.innerHTML;
|
||||
address.replaceWith(p);
|
||||
}
|
||||
|
||||
// Every <span> except underline ones is pointless at best and frequently messed up. (Weird font size, line spacing,
|
||||
// etc.)
|
||||
for (const span of contentEl.querySelectorAll("span")) {
|
||||
if (span.getAttribute("style") === "text-decoration:underline;") {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (span.textContent.trim() === "") {
|
||||
span.remove();
|
||||
} else {
|
||||
const docFrag = contentEl.ownerDocument.createDocumentFragment();
|
||||
while (span.firstChild) {
|
||||
docFrag.appendChild(span.firstChild);
|
||||
}
|
||||
span.replaceWith(docFrag);
|
||||
}
|
||||
}
|
||||
|
||||
// In Ward, CloudFlare email protection obfuscates the email addresses:
|
||||
// https://usamaejaz.com/cloudflare-email-decoding/
|
||||
for (const emailEl of contentEl.querySelectorAll("[data-cfemail]")) {
|
||||
const decoded = decodeCloudFlareEmail(emailEl.dataset.cfemail);
|
||||
emailEl.replaceWith(contentEl.ownerDocument.createTextNode(decoded));
|
||||
}
|
||||
|
||||
// Synthesize a <body> tag to serialize
|
||||
const bodyEl = contentEl.ownerDocument.createElement("body");
|
||||
|
||||
const h1El = contentEl.ownerDocument.createElement("h1");
|
||||
h1El.textContent = chapter.title;
|
||||
bodyEl.appendChild(h1El);
|
||||
|
||||
while (contentEl.firstChild) {
|
||||
bodyEl.appendChild(contentEl.firstChild);
|
||||
}
|
||||
|
||||
const xmlSerializer = new contentEl.ownerDocument.defaultView.XMLSerializer();
|
||||
let xml = xmlSerializer.serializeToString(bodyEl);
|
||||
|
||||
// Fix recurring strange pattern of extra <br> in <p>...<em>...<br>\n</em></p>
|
||||
xml = xml.replace(/<br \/>\s*<\/em><\/p>/ug, "</em></p>");
|
||||
|
||||
// Replace single-word <i>s with <em>s. Other <i>s are probably erroneous too, but these are known-bad.
|
||||
xml = xml.replace(/<i>([^ ]+)<\/i>/ug, "<em>$1</em>");
|
||||
xml = xml.replace(/<i>([^ ]+)( +)<\/i>/ug, "<em>$1</em>$2");
|
||||
|
||||
// There are way too many nonbreaking spaces where they don't belong. If they show up three in a row, then let them
|
||||
// live; they're maybe being used for alignment or something. Otherwise, they die.
|
||||
//
|
||||
// Also, normalize spaces after a period/quote mark to two (normal) spaces. The second one is invisible when
|
||||
// rendered, but it helps future heuristics detect end of sentences.
|
||||
xml = xml.replace(/\xA0{1,2}(?!\x20\xA0)/ug, " ");
|
||||
xml = xml.replace(/([.”])\x20*\xA0[\xA0\x20]*/ug, "$1 ");
|
||||
xml = xml.replace(/([.”])\x20{3,}/ug, "$1 ");
|
||||
|
||||
function fixEms() {
|
||||
// Fix recurring broken-up or erroneous <em>s
|
||||
xml = xml.replace(/<\/em>‘s/ug, "’s</em>");
|
||||
xml = xml.replace(/<em><\/em>/ug, "");
|
||||
xml = xml.replace(/<\/em><em>/ug, "");
|
||||
xml = xml.replace(/<em>(\s?\s?[^A-Za-z]\s?\s?)<\/em>/ug, "$1");
|
||||
xml = xml.replace(/<\/em>(\s?\s?[^A-Za-z]\s?\s?)<em>/ug, "$1");
|
||||
xml = xml.replace(/“<em>([^>]+)<\/em>(!|\?|\.)”/ug, "“<em>$1$2</em>”");
|
||||
xml = xml.replace(/<p><em>([^>]+)<\/em>(!|\?|\.)<\/p>/ug, "<p><em>$1$2</em></p>");
|
||||
xml = xml.replace(/(!|\?|\.)\s{2}<\/em><\/p>/ug, "$1</em></p>");
|
||||
xml = xml.replace(/<em>([a-z]+)(\?|\.)<\/em>/ug, "<em>$1</em>$2");
|
||||
xml = xml.replace(/<em>([^>]+?)( +)<\/em>/ug, "<em>$1</em>$2");
|
||||
xml = xml.replace(/<em> ([a-zA-Z]+)<\/em>/ug, " <em>$1</em>");
|
||||
xml = xml.replace(/<em>‘\s*([^<]+)\s*’<\/em>/ug, "‘<em>$1</em>’");
|
||||
xml = xml.replace(/<em>‘\s*([^<]+)\s*<\/em>\s*’/ug, "‘<em>$1</em>’");
|
||||
xml = xml.replace(/‘\s*<em>\s*([^<]+)\s*’<\/em>/ug, "‘<em>$1</em>’");
|
||||
xml = xml.replace(/<em>“\s*([^<”]+)\s*”<\/em>/ug, "“<em>$1</em>”");
|
||||
xml = xml.replace(/<em>“\s*([^<”]+)\s*<\/em>\s*”/ug, "“<em>$1</em>”");
|
||||
xml = xml.replace(/“\s*<em>\s*([^<”]+)\s*”<\/em>/ug, "“<em>$1</em>”");
|
||||
xml = xml.replace(/([^\n>])<em> ?/ug, "$1 <em>");
|
||||
xml = xml.replace(/ ?<\/em>/ug, "</em> ");
|
||||
xml = xml.replace(/<p([^>]+)> <em>/ug, "<p$1><em>");
|
||||
xml = xml.replace(/<\/em> <\/p>/ug, "</em></p>");
|
||||
xml = xml.replace(/<em>([a-z]+),<\/em>/ug, "<em>$1</em>,");
|
||||
}
|
||||
|
||||
// These quote/apostrophe/em fixes interact with each other. TODO: try to disentangle so we don't repeat all of
|
||||
// fixEms.
|
||||
xml = xml.replace(/,” <\/em>/ug, "</em>,” ");
|
||||
fixEms();
|
||||
xml = xml.replace(/<p>”/ug, "<p>“");
|
||||
xml = xml.replace(/“\s*<\/p>/ug, "”</p>");
|
||||
xml = xml.replace(/“\s*<\/em><\/p>/ug, "</em>”</p>");
|
||||
xml = xml.replace(/‘\s*<\/p>/ug, "’</p>");
|
||||
xml = xml.replace(/‘\s*<\/em><\/p>/ug, "’</em></p>");
|
||||
xml = xml.replace(/,” <\/em>/ug, "</em>,” ");
|
||||
xml = xml.replace(/′/ug, "’");
|
||||
xml = xml.replace(/″/ug, "”");
|
||||
xml = xml.replace(/([A-Za-z])‘s(\s?)/ug, "$1’s$2");
|
||||
xml = xml.replace(/I‘m/ug, "I’m");
|
||||
xml = xml.replace(/<p>“\s+/ug, "<p>“");
|
||||
xml = xml.replace(/\s+”/ug, "”");
|
||||
xml = xml.replace(/'/ug, "’");
|
||||
xml = xml.replace(/’([A-Za-z]+)’/ug, "‘$1’");
|
||||
xml = xml.replace(/([a-z])”<\/p>/ug, "$1.”</p>");
|
||||
fixEms();
|
||||
xml = xml.replace(/‘<em>([^<]+)<\/em>‘/ug, "‘<em>$1</em>’");
|
||||
xml = xml.replace(/<em>([a-z]+)!<\/em>/ug, "<em>$1</em>!");
|
||||
xml = xml.replace(/(?<! {2})<em>([\w ’]+)([!.?])”<\/em>/ug, "<em>$1</em>$2”");
|
||||
xml = xml.replace(/<em>([\w ’]+[!.?])”<\/em>/ug, "<em>$1</em>”");
|
||||
xml = xml.replace(/I”(m|ll)/ug, "I’$1");
|
||||
xml = xml.replace(/””<\/p>/ug, "”</p>");
|
||||
xml = xml.replace(/^([^“]+?) ?”(?![ —<])/ugm, "$1 “");
|
||||
xml = xml.replace(/(?<!“)<em>([A-Za-z]+),<\/em>(?!”| +[A-Za-z]+ thought)/u, "<em>$1</em>,");
|
||||
xml = xml.replace(/‘([Kk])ay(?!’)/ug, "’$1ay");
|
||||
xml = xml.replace(/<em>(Why|What|Who|How|Where|When)<\/em>\?/ug, "<em>$1?</em>");
|
||||
xml = xml.replace(/,<\/em>/ug, "</em>,");
|
||||
xml = xml.replace(/,”<\/p>/ug, ".”</p>");
|
||||
xml = xml.replace(/<p>(.*),<\/p>/ug, "<p>$1.</p>");
|
||||
xml = xml.replace(/‘(\w+)‘(\w+)’/ug, "‘$1’$2’");
|
||||
xml = xml.replace(/<em>([a-z]+), ([a-z]+)<\/em>/ug, "<em>$1</em>, <em>$2</em>");
|
||||
|
||||
// Similar problems occur in Ward with <b> and <strong> as do in Worm with <em>s
|
||||
xml = xml.replace(/<b \/>/ug, "");
|
||||
xml = xml.replace(/<b>(\s*<br \/>\s*)<\/b>/ug, "$1");
|
||||
xml = xml.replace(/<strong>(\s*<br \/>\s*)<\/strong>/ug, "$1");
|
||||
xml = xml.replace(/<\/strong>(\s*)<strong>/ug, "$1");
|
||||
xml = xml.replace(/<strong>@<\/strong>/ug, "@");
|
||||
xml = xml.replace(/<br \/>(\s*)<\/strong>/ug, "</strong><br />$1");
|
||||
xml = xml.replace(/(\s*)<\/strong>/ug, "</strong>$1");
|
||||
xml = xml.replace(/><strong>(.*)<\/strong>:</ug, "><strong>$1:</strong><");
|
||||
|
||||
// No need for line breaks before paragraph ends or after paragraph starts
|
||||
// These often occur with the <br>s inside <b>/<strong>/<em>/<i> fixed above.
|
||||
xml = xml.replace(/<br \/>\s*<\/p>/ug, "</p>");
|
||||
xml = xml.replace(/<p><br \/>\s*/ug, "<p>");
|
||||
|
||||
// This is another quote fix but it needs to happen after the line break deletion... so entangled, ugh.
|
||||
xml = xml.replace(/<\/em>\s*“\s*<\/p>/ug, "</em>”</p>");
|
||||
|
||||
// Fix missing spaces after commas
|
||||
xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/ug, "$1, $2");
|
||||
|
||||
// Fix bad periods and spacing/markup surrounding them
|
||||
xml = xml.replace(/\.\.<\/p>/ug, ".</p>");
|
||||
xml = xml.replace(/\.\.”<\/p>/ug, ".”</p>");
|
||||
xml = xml.replace(/ \. /ug, ". ");
|
||||
xml = xml.replace(/ \.<\/p>/ug, ".</p>");
|
||||
xml = xml.replace(/\.<em>\.\./ug, "<em>…");
|
||||
xml = xml.replace(/\.\. {2}/ug, ". ");
|
||||
xml = xml.replace(/\.\./ug, "…");
|
||||
xml = xml.replace(/(?<!Mr|Ms|Mrs)…\./ug, "…");
|
||||
xml = xml.replace(/(?<=Mr|Ms|Mrs)…\./ug, ".…");
|
||||
|
||||
// Fix extra spaces
|
||||
xml = xml.replace(/ ? <\/p>/ug, "</p>");
|
||||
xml = xml.replace(/([a-z]) ,/ug, "$1,");
|
||||
|
||||
// Use actual emojis instead of images
|
||||
xml = xml.replace(
|
||||
// eslint-disable-next-line max-len
|
||||
/<img width="16" height="16" class="wp-smiley emoji" draggable="false" alt="O_o" src="https:\/\/s1.wp.com\/wp-content\/mu-plugins\/wpcom-smileys\/o_O.svg" style="height: 1em; max-height: 1em;" \/>/ug,
|
||||
"🤨"
|
||||
);
|
||||
|
||||
xml = fixTruncatedWords(xml);
|
||||
xml = fixDialogueTags(xml);
|
||||
xml = fixForeignNames(xml);
|
||||
xml = standardizeNames(xml);
|
||||
xml = fixEmDashes(xml);
|
||||
xml = enDashJointNames(xml);
|
||||
xml = fixPossessives(xml);
|
||||
xml = cleanSceneBreaks(xml);
|
||||
xml = fixCapitalization(xml, book);
|
||||
xml = fixMispellings(xml);
|
||||
xml = fixHyphens(xml);
|
||||
xml = standardizeSpellings(xml);
|
||||
xml = fixCaseNumbers(xml);
|
||||
|
||||
// One-off fixes
|
||||
for (const substitution of substitutions[chapter.url] || []) {
|
||||
if (substitution.before) {
|
||||
const indexOf = xml.indexOf(substitution.before);
|
||||
if (indexOf === -1) {
|
||||
warnings.push(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` +
|
||||
`updated at the source, in which case, you should edit substitutions.json.`);
|
||||
}
|
||||
if (indexOf !== xml.lastIndexOf(substitution.before)) {
|
||||
warnings.push(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` +
|
||||
`Update substitutions.json for a more precise substitution.`);
|
||||
}
|
||||
|
||||
xml = xml.replace(new RegExp(escapeRegExp(substitution.before), "u"), substitution.after);
|
||||
} else if (substitution.regExp) {
|
||||
xml = xml.replace(new RegExp(substitution.regExp, "ug"), substitution.replacement);
|
||||
} else {
|
||||
warnings.push(`Invalid substitution specified for ${chapter.url}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a <html>.
|
||||
// Use this opportunity to insert a comment pointing to the original URL, for reference.
|
||||
xml = xml.replace(
|
||||
/<body xmlns="http:\/\/www.w3.org\/1999\/xhtml">/u,
|
||||
`<body>\n<!-- ${chapter.url} -->\n`
|
||||
);
|
||||
|
||||
return { xml, warnings };
|
||||
}
|
||||
|
||||
// Normalizes the apostrophe in front of recurring truncated words and clipped names.
// Takes the chapter markup as a string and returns the corrected string.
function fixTruncatedWords(xml) {
  const truncationRules = [
    // An opening single quote was used where an apostrophe belongs
    [/‘Sup/ug, "’Sup"],
    [/‘cuz/ug, "’cuz"],

    // Clipped names: either quote mark and either initial case become apostrophe + capitalized form.
    // The lookahead keeps longer words (e.g. "cages") untouched.
    [/[‘’][Pp]iece(?![a-z])/ug, "’Piece"], // Sidepiece
    [/[‘’][Jj]oint(?![a-z])/ug, "’Joint"], // Disjoint
    [/[‘’][Tt]end(?![a-z])/ug, "’Tend"], // Contender
    [/[‘’][Ll]ace(?![a-z])/ug, "’Lace"], // Anelace
    [/[‘’][Cc]age(?![a-z])/ug, "’Cage"] // Birdcage

    // "’Clear" (Crystalclear) is deliberately absent: "clear" follows an opening quote as an ordinary word too often,
    // so that one is handled in substitutions.json.
  ];

  return truncationRules.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), xml);
}
|
||||
|
||||
// Fixes recurring punctuation/capitalization mistakes around dialogue tags.
// Takes the chapter markup as a string and returns the corrected string.
function fixDialogueTags(xml) {
  return xml
    // A dialogue tag following a question is frequently miscapitalized.
    .replace(/\?”\s\s?She asked/ug, "?” she asked")
    .replace(/\?”\s\s?He asked/ug, "?” he asked")

    // A sentence is often left unterminated, with a comma after the dialogue tag:
    //   “I didn’t get much done,” Greg said, “I got distracted by...
    // which should read
    //   “I didn’t get much done,” Greg said. “I got distracted by...
    //
    // Heuristic: fix it automatically when the tag is exactly two words (X said/admitted/sighed/etc.) and the next
    // quote starts with a capital letter.
    //
    // This sometimes overcorrects, e.g.
    //   “Basically,” Alec said, “For your powers to manifest, ...
    // where the right fix is to lowercase the "f". Those are reverted as one-offs in substitutions.json.
    //
    // The rule applies to ~800 instances, so it pays for itself despite the reverts. Most reverted instances need
    // a capitalization fix anyway, which is harder to automate since proper names/"I"/etc. stay capitalized.
    .replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/ug, ",” $1. “$2");
}
|
||||
|
||||
// Normalizes foreign-language names: restores missing diacritics and strips inconsistent italics.
// Takes the chapter markup as a string and returns the corrected string.
function fixForeignNames(xml) {
  // This name is consistently missing its diacritics.
  let result = xml.replace(/Yangban/ug, "Yàngbǎn");

  // These names are usually not italicized, but sometimes are. Other foreign-language names (like Yàngbǎn) are
  // never italicized, so the italics are removed rather than added.
  const deItalicizeRules = [
    [/<em>Garama<\/em>/ug, "Garama"],
    [/<em>Thanda<\/em>/ug, "Thanda"],
    [/<em>Sifara([^<]*)<\/em>/ug, "Sifara$1"],
    [/<em>Moord Nag([^<]*)<\/em>/ug, "Moord Nag$1"],
    [/<em>Califa de Perro([^<]*)<\/em>/ug, "Califa de Perro$1"],
    [/<em>Turanta([^<]*)<\/em>/ug, "Turanta$1"]
  ];
  for (const [pattern, replacement] of deItalicizeRules) {
    result = result.replace(pattern, replacement);
  }

  return result;
}
|
||||
|
||||
// Rewrites minority spellings of character names to the majority spelling used in the text.
// Takes the chapter markup as a string and returns the corrected string.
function standardizeNames(xml) {
  const nameRules = [
    // 197 instances of "Mrs." to 21 of "Ms."
    [/Ms\. Yamada/ug, "Mrs. Yamada"],

    // 25 instances of "Amias" to 3 of "Amais"
    [/Amais/ug, "Amias"],

    // 185 instances of Juliette to 4 of Juliet (the boundary check leaves existing "Juliette" alone)
    [/Juliet(?=\b)/ug, "Juliette"],

    // Earlier chapters have a space after the "o’"; later ones do not. They are separate words, so side with the
    // earlier chapters. One location is also missing the "k".
    [/Crock? o[‘’]Shit/ug, "Crock o’ Shit"],

    // 5 instances of "Jotun" to 2 of "Jotunn"
    [/Jotunn/ug, "Jotun"],

    // 13 instances of Elman to 1 of Elmann
    [/Elmann/ug, "Elman"],

    // Thousands of instances of Tattletale to 4 of Tatteltale
    [/Tatteltale/ug, "Tattletale"],

    // 73 instances of Über to 2 of Uber
    [/Uber/ug, "Über"]
  ];

  let result = xml;
  for (const [pattern, replacement] of nameRules) {
    result = result.replace(pattern, replacement);
  }
  return result;
}
|
||||
|
||||
// Converts hyphens and en dashes that are standing in for em dashes into real em dashes, and removes the
// whitespace around them. Takes the chapter markup as a string and returns the corrected string.
function fixEmDashes(xml) {
  // Applied strictly in this order: the specific contexts (quotes, tags, paragraph edges) must be handled before
  // the generic whitespace-adjacent rules further down.
  const dashRules = [
    [/ – /ug, "—"],

    // Dashes at the start or end of quoted speech
    [/“((?:<em>)?)-/ug, "“$1—"],
    [/-[,.]?”/ug, "—”"],
    [/-(!|\?)”/ug, "—$1”"],
    [/-[,.]?<\/([a-z]+)>”/ug, "—</$1>”"],
    [/-“/ug, "—”"],

    // Dashes at paragraph/line edges
    [/<p>-/ug, "<p>—"],
    [/-<\/p>/ug, "—</p>"],
    [/-<br \/>/ug, "—<br />"],
    [/-<\/([a-z]+)><\/p>/ug, "—</$1></p>"],

    // Any remaining en dash, and any hyphen that touches whitespace
    [/\s?\s?–\s?\s?/ug, "—"],
    [/-\s\s?/ug, "—"],
    [/\s?\s-/ug, "—"],
    [/\s+—”/ug, "—”"],

    // Stutters and interrupted questions
    [/I-I/ug, "I—I"],
    [/I-uh/ug, "I—uh"],
    [/-\?/ug, "—?"]
  ];

  return dashRules.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), xml);
}
|
||||
|
||||
// Joint names (two names/terms of equal weight joined together) should use en dashes instead of hyphens.
// Takes the chapter markup as a string and returns the corrected string.
function enDashJointNames(xml) {
  // Names that only need their hyphen(s) swapped for en dashes. The replacement is derived from the name itself
  // so the two can never drift apart (a hand-written replacement previously turned "Cheit-Gimel" into "Bet–Gimel").
  const jointNames = [
    "Dallon-Pelham",
    "Bet-Gimel",
    "Cheit-Gimel",
    "Tristan-Capricorn",
    "Capricorn-Byron",
    "Tristan-Byron",
    "Gimel-Europe",
    "G-N",
    "Imp-Damsel",
    "Damsel-Ashley",
    "Antares-Anelace",
    "Challenger-Gallant",
    "Norwalk-Fairfield",
    "Creutzfeldt-Jakob",
    "Astaroth-Nidhug",
    "Capulet-Montague",
    "Weaver-Clockblocker",
    "Alexandria-Pretender",
    "Night Hag-Nyx",
    "Crawler-Breed",
    "Simurgh-Myrddin-plant",
    "Armsmaster-Defiant",
    "Matryoshka-Valentin",
    "Gaea-Eden"
  ];

  let result = xml;
  for (const name of jointNames) {
    // split/join is a literal (non-regex) replace-all.
    result = result.split(name).join(name.replace(/-/ug, "–"));
  }

  // Cases that need more than a literal swap: optional plural, case normalization, or either initial case.
  result = result.replace(/Undersider(s?)-(Breakthrough|Ambassador)/ug, "Undersider$1–$2");
  result = result.replace(/East-West/ug, "east–west");
  result = result.replace(/([Aa])gent-parahuman/ug, "$1gent–parahuman");
  result = result.replace(/([Pp])arahuman-agent/ug, "$1arahuman–agent");

  return result;
}
|
||||
|
||||
// Normalizes possessives of names ending in "s". Takes the chapter markup as a string and returns the corrected
// string.
function fixPossessives(xml) {
  // Names ending in a voiced "s" get "’s". The lookbehind skips names that open a single-quoted phrase (where the
  // trailing "’" is a closing quote, not an apostrophe); the lookahead skips names that already have the "s".
  // The periods in "Mrs." / "Ms." are escaped so they only match a literal period.
  xml = xml.replace(
    // eslint-disable-next-line max-len
    /(?<!‘)(Judas|Brutus|Jess|Aegis|Dauntless|Circus|Sirius|Brooks|Genesis|Atlas|Lucas|Gwerrus|Chris|Eligos|Animos|Mags|Huntress|Hephaestus|Lord of Loss|John Combs|Mama Mathers|Monokeros|Goddess|Boundless|Paris|Tress|Harris|Antares|Nieves|Backwoods|Midas|Mrs\. Sims|Ms\. Stillons|Chuckles|Amias|Semiramis|Mother of Mothers)’(?!s)/ug,
    "$1’s"
  );

  // Note: if the "s" is unvoiced, as in Marquis, then it doesn't get the second "s".
  xml = xml.replace(/Marquis’s/ug, "Marquis’");

  // This one is not just missing the extra "s"; the apostrophe is often misplaced.
  xml = xml.replace(/Warden’s/ug, "Wardens’");

  return xml;
}
|
||||
|
||||
// Normalizes scene-break paragraphs to a plain centered paragraph containing only the break symbol(s).
// <hr> would be more semantically appropriate, but loses the author's intent; this matters most in Ward, which
// uses a variety of different scene breaks. Takes the chapter markup as a string and returns the corrected string.
function cleanSceneBreaks(xml) {
  const centered = symbols => `<p style="text-align: center;">${symbols}</p>`;

  const sceneBreakRules = [
    // ■ with any (or no) attributes on the paragraph
    [/<p(?:[^>]*)>■<\/p>/ug, centered("■")],

    // ⊙ wrapped in bold and/or italics, or accidentally doubled
    [/<p style="text-align: center;"><strong>⊙<\/strong><\/p>/ug, centered("⊙")],
    [/<p style="text-align: center;"><em><strong>⊙<\/strong><\/em><\/p>/ug, centered("⊙")],
    [/<p style="text-align: center;"><strong>⊙⊙<\/strong><\/p>/ug, centered("⊙")],

    // Row of five ⊙ with inconsistent spacing
    [/<p style="text-align: center;"><strong>⊙ *⊙ *⊙ *⊙ *⊙<\/strong><\/p>/ug, centered("⊙ ⊙ ⊙ ⊙ ⊙")]
  ];

  return sceneBreakRules.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), xml);
}
|
||||
|
||||
// Normalizes inconsistent capitalization of names, places, and terms.
//
// - xml: the chapter markup, as a string.
// - book: the book identifier; the string "ward" enables rules that only apply there.
//
// Returns the corrected string. Many rules use the lookbehind (?<! {2}|“|>) to skip words that (probably) start a
// sentence, a quote, or an element, where the capital letter is legitimate.
function fixCapitalization(xml, book) {
  // This occurs enough times it's better to do here than in one-off fixes. We correct the single instance where
  // it's incorrect to capitalize in the one-off fixes.
  // Note that Ward contains much talk of "the clairvoyants", so we don't want to capitalize plurals.
  xml = xml.replace(/([Tt])he clairvoyant(?!s)/ug, "$1he Clairvoyant");

  // ReSound's name is sometimes miscapitalized. The word is never used in a non-name context.
  xml = xml.replace(/Resound/ug, "ReSound");

  // The Speedrunners team name is missing its capitalization a couple times.
  xml = xml.replace(/speedrunners/ug, "Speedrunners");

  // The Machine Army is missing its capitalization a couple times.
  xml = xml.replace(/machine army/ug, "Machine Army");

  // "patrol block" is capitalized three different ways: "patrol block", "Patrol block", and "Patrol Block". "patrol
  // group" is always lowercased. It seems like "Patrol" is a proper name, and is used as a capitalized modifier in
  // other contexts (e.g. Patrol leader). So let's standardize on "Patrol <lowercase>".
  xml = xml.replace(
    /patrol (block|group|leader|guard|student|uniform|squad|soldier|officer|crew|girl|bus|training)/uig,
    (_, $1) => `Patrol ${$1.toLowerCase()}`
  );
  // This usually works in Ward (some instances corrected back in substitutions.json), and has a few false positives in
  // Worm, where it is never needed:
  if (book === "ward") {
    xml = xml.replace(/the patrol(?!s|ling)/ug, "the Patrol");
  }

  // This is sometimes missing its capitalization.
  xml = xml.replace(/the birdcage/ug, "the Birdcage");

  // There's no reason why these should be capitalized.
  xml = xml.replace(/(?<! {2}|“|>)Halberd/ug, "halberd");
  xml = xml.replace(/(?<! {2}|“|>)Loft/ug, "loft");

  // These are treated as common nouns and not traditionally capitalized. "Krav Maga" remains capitalized,
  // interestingly (according to dictionaries and Wikipedia).
  xml = xml.replace(/(?<! {2}|“|>)Judo/ug, "judo");
  xml = xml.replace(/(?<! {2}|“|>)Aikido/ug, "aikido");
  xml = xml.replace(/(?<! {2}|“|>)Karate/ug, "karate");
  xml = xml.replace(/(?<! {2}|“|>)Tae Kwon Do/ug, "tae kwon do");

  // There's no reason why university should be capitalized in most contexts, although sometimes it's used as part of
  // a compound noun or at the beginning of a sentence.
  xml = xml.replace(/(?<! {2}|“|>|Cornell |Nilles )University(?! Road)/ug, "university");

  // Organ names (e.g. brain, arm) or scientific names are not capitalized, so the "corona pollentia" and friends should
  // not be either. The books are inconsistent.
  xml = xml.replace(/(?<! {2}|“|>|-)Corona/ug, "corona");
  xml = xml.replace(/Pollentia/ug, "pollentia");
  xml = xml.replace(/Radiata/ug, "radiata");
  xml = xml.replace(/Gemma/ug, "gemma");

  // We de-capitalize Valkyrie's "flock", since most uses are de-capitalized (e.g. the many instances in Gleaming
  // Interlude 9, or Dying 15.z). This is a bit surprising; it seems like an organization name. But I guess it's
  // informal.
  xml = xml.replace(/(?<! {2}|“|>)Flock/ug, "flock");

  // Especially early in Worm, PRT designations are capitalized; they should not be. This fixes the cases where we
  // can be reasonably sure they don't start a sentence, although more specific instances are done in
  // substitutions.json, and some need to be back-corrected.
  //
  // Note: "Master" is specifically omitted because it fails poorly on Worm Interlude 4. Other instances need to be
  // corrected via substitutions.json.
  //
  // This also over-de-capitalizes "The Stranger" in Ward (a titan name). Those also get fixed in substitutions.json.
  xml = xml.replace(
    // eslint-disable-next-line max-len
    /(?<! {2}|“|>|\n|: )(Mover|Shaker|Brute|Breaker|Tinker|Blaster|Thinker|Striker|Changer|Trump|Stranger|Shifter|Shaper)(?! [A-Z])/ug,
    (_, designation) => designation.toLowerCase()
  );
  // "tinker-5" => "tinker 5"
  xml = xml.replace(
    /(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)-(\d+)/ugi,
    "$1 $2"
  );
  // Two designations joined by a space, slash, or hyphen become an en-dashed pair. The hyphen is placed last in the
  // character class so that it is a literal hyphen; written as "[ -/]" it is a range from space to "/" and also
  // matches ".", ",", "*", "'", etc.
  xml = xml.replace(
    // eslint-disable-next-line max-len
    /(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)[ /-](mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)/ugi,
    "$1–$2"
  );

  // Capitalization is inconsistent, but shard names seems to usually be capitalized.
  xml = xml.replace(/Grasping self/ug, "Grasping Self");
  xml = xml.replace(/Cloven stranger/ug, "Cloven Stranger");
  xml = xml.replace(/Princess shaper/ug, "Princess Shaper");
  xml = xml.replace(/Fragile one/ug, "Fragile One");

  // Place names need to always be capitalized
  xml = xml.replace(/North end/ug, "North End");
  xml = xml.replace(/(Stonemast|Shale) avenue/ug, "$1 Avenue");
  xml = xml.replace(/(Lord|Slater) street/ug, "$1 Street");
  xml = xml.replace(/(Hollow|Cedar) point/ug, "$1 Point");
  xml = xml.replace(/(Norwalk|Fenway|Stratford) station/ug, "$1 Station");
  xml = xml.replace(/the megalopolis/ug, "the Megalopolis");
  // The lookbehind keeps words that merely end in "earths" (e.g. "hearths") from becoming "hEarths".
  xml = xml.replace(/(?<![A-Za-z])earths(?![a-z])/ug, "Earths");
  if (book === "ward") {
    xml = xml.replace(/the bunker/ug, "the Bunker");
    xml = xml.replace(/‘bunker’/ug, "‘Bunker’");
  }

  // "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of
  // instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in
  // substitutions.json.
  xml = xml.replace(/(?<!mom), dad(?![a-z])/ug, ", Dad");
  xml = xml.replace(/, mom(?![a-z-])/ug, ", Mom");

  // Similarly, specific aunts and uncles get capitalized when used as a title. These are often missed.
  xml = xml.replace(/aunt Sarah/ug, "Aunt Sarah");
  xml = xml.replace(/aunt Fleur/ug, "Aunt Fleur");
  xml = xml.replace(/uncle Neil/ug, "Uncle Neil");

  // The majority of "Wardens’ headquarters" is lowercased, and always prefixed with "the", indicating it's not a proper
  // place name. So we remove the capitalization in the few places where it does appear.
  xml = xml.replace(/Wardens’ Headquarters/ug, "Wardens’ headquarters");

  // Some style guides try to reserve capitalized "Nazi" for historical discussions of members of the Nazi party. This
  // seems fuzzy when it comes to phrases like "neo-Nazi", and doesn't seem to be what the author is doing; the books
  // are just plain inconsistent. So, let's standardize on always uppercasing.
  xml = xml.replace(/(?<![a-z])nazi/ug, "Nazi");
  xml = xml.replace(/ Neo-/ug, " neo-");

  // Style guides disagree on whether items like "english muffin", "french toast", and "french kiss" need their
  // adjective capitalized. The books mostly use lowercase, so let's stick with that. (substitutions.json corrects one
  // case of "French toast".)
  xml = xml.replace(/english(?! muffin)/ug, "English");
  xml = xml.replace(/(?<! {2})English muffin/ug, "english muffin");

  // I was very torn on what to do with capitalization for "Titan" and "Titans". In general you don't capitalize species
  // names or other classifications, e.g. style guides are quite clear you don't capitalize "gods". The author
  // capitalizes them more often than not (e.g., 179 raw "Titans" to 49 "titans"), but is quite inconsistent.
  //
  // In the end, I decided against de-capitalization, based on the precedent set by "Endbringers" (which are
  // conceptually paired with Titans several times in the text). However, we only capitalize the class after they are
  // _introduced_ as a class in Sundown 17.y. (Before then we still capitalize individual names like "Dauntless Titan"
  // or "Kronos Titan".)
  if (book === "ward") {
    // All plural discussions of "Titans" are after Sundown 17.y.
    xml = xml.replace(/titans/ug, "Titans");

    // Since we can't safely change all instances of "titan", most are in substitutions.json. We can do a few here,
    // though.
    xml = xml.replace(/dauntless titan/uig, "Dauntless Titan"); // Sometimes "Dauntless" isn't even capitalized.
    xml = xml.replace(/Kronos titan/ug, "Kronos Titan");
  }

  // For the giants, the prevailing usage seems to be to keep the term lowercase, but capitalize when used as a name.
  xml = xml.replace(/(?<=Mathers |Goddess )giant/ug, "Giant");
  xml = xml.replace(/mother giant/uig, "Mother Giant");
  xml = xml.replace(/(?<! {2}|“|>)Giants/ug, "giants");

  return xml;
}
|
||||
|
||||
// Corrects a handful of recurring misspellings. Takes the chapter markup as a string and returns the corrected
// string.
function fixMispellings(xml) {
  return xml
    // Commonly written as one word; it should be two.
    .replace(/([Ss])houlderblade/ug, "$1houlder blade")

    // All dictionaries agree the "U" is capitalized.
    .replace(/u-turn/ug, "U-turn")

    // One "t", hyphenated: https://www.dictionary.com/browse/scot-free
    .replace(/scott(?: |-)free/ug, "scot-free")

    // The idiom is singular: https://grammarist.com/idiom/change-tack/
    .replace(/changed tacks/ug, "changed tack");
}
|
||||
|
||||
// Adds missing hyphens to compound words and removes hyphens that should not be there.
// Takes the chapter markup as a string and returns the corrected string.
function fixHyphens(xml) {
  // "X-year-old" should use hyphens; all grammar guides agree. The books are very inconsistent but most often omit
  // them. The second rule depends on the first having already hyphenated the right-hand side.
  xml = xml.replace(/(\w+)[ -]year[ -]old(s?)(?!\w)/ug, "$1-year-old$2");
  xml = xml.replace(/(\w+) or (\w+)-year-old/ug, "$1- or $2-year-old");

  // Compound numbers from 21 through 99 must be hyphenated, but others should not be.
  // "fou?rty" accepts both the correct spelling "forty" and the misspelling "fourty".
  xml = xml.replace(
    /(?<!\w)(twenty|thirty|fou?rty|fifty|sixty|seventy|eighty|ninety) (one|two|three|four|five|six|seven|eight|nine)/uig,
    "$1-$2"
  );
  xml = xml.replace(/[- ]hundred-and-/ug, " hundred and ");
  xml = xml.replace(/(?<!-)(one|two|three|four|five|six|seven|eight|nine|twelve)-hundred/ug, "$1 hundred");
  xml = xml.replace(/(hundred|ninety)-percent(?!-)/ug, "$1 percent");

  // "red-haired", "long-haired", etc.: they all need hyphens
  xml = xml.replace(/ haired/ug, "-haired");

  // These are consistently missing hyphens. Each rule keeps the original capitalization of the first word via $1.
  const missingHyphenRules = [
    [/([Ll]ife) threatening/ug, "$1-threatening"],
    [/([Hh]ard) headed/ug, "$1-headed"],
    [/([Ss]houlder) mounted/ug, "$1-mounted"],
    [/([Gg]olden) skinned/ug, "$1-skinned"],
    [/([Cc]reepy) crawl/ug, "$1-crawl"],
    [/([Ww]ell) armed/ug, "$1-armed"],
    [/([Aa]ble) bodied/ug, "$1-bodied"],
    [/([Ll]evel) headed/ug, "$1-headed"],
    [/([Cc]lear) cut/ug, "$1-cut"],
    [/([Vv]at) grown/ug, "$1-grown"],
    [/([Ss]hell) shocked/ug, "$1-shocked"],
    [/([Dd]og) tired/ug, "$1-tired"],
    [/([Nn]ightmare) filled/ug, "$1-filled"],
    [/([Oo]ne) sided/ug, "$1-sided"],
    [/([Mm]edium) sized/ug, "$1-sized"],
    [/([Tt]eary) eyed/ug, "$1-eyed"],
    [/([Ww]orst) case scenario/ug, "$1-case scenario"],
    [/([Ss]elf) (conscious|esteem|loathing|harm|destruct|preservation)/ug, "$1-$2"],
    [/([Oo]ne|[Tt]wo|[Tt]hree|[Ff]our|[Ff]ourth) dimensional/ug, "$1-dimensional"],
    [/(?<=\b)([Oo]ne) on one(?=\b)/ug, "$1-on-one"]
  ];
  for (const [pattern, replacement] of missingHyphenRules) {
    xml = xml.replace(pattern, replacement);
  }

  // Preemptive(ly) is often hyphenated (not always). It should not be.
  xml = xml.replace(/([Pp])re-emptive/ug, "$1reemptive");

  // These should be hyphenated only when used as a verb. We correct those cases back in substitutions.json.
  xml = xml.replace(/fist-bump/ug, "fist bump");
  xml = xml.replace(/high-five/ug, "high five");

  // This should be hyphenated when used as an adjective (instead of an adverb or noun). I.e. it should be
  // "hand-to-hand combat", but "passed from hand to hand", and "capable in hand to hand". The following heuristic works
  // in the books.
  xml = xml.replace(/hand to hand(?= [a-z])/ug, "hand-to-hand");

  // This is usually wrong but sometimes correct. The lookarounds avoid specific cases where it's referring to an actual
  // second in a series of guesses.
  xml = xml.replace(/(?<!my |that )([Ss]econd) guess(?!es)/ug, "$1-guess");

  // When used as a phrase "just in case" gets no hyphens. When used as a noun or adjective it does. A couple of the
  // noun cases are missing one or both hyphens.
  xml = xml.replace(/([Aa]) just[ -]in case/ug, "$1 just-in-case");

  // When used as an adjective, it's hyphenated. It turns out most cases are as an adverb, so we go with this approach:
  xml = xml.replace(
    /face to face(?= meeting| hang-out| interaction| contact| conversation| confrontation| fight)/ug,
    "face-to-face"
  );

  // When used as an adjective, it's hyphenated. This heuristic works in the books.
  xml = xml.replace(/fight or flight(?= [a-z])/ug, "fight-or-flight");

  // This is usually correct but sometimes wrong.
  xml = xml.replace(/neo /ug, "neo-");

  return xml;
}
|
||||
|
||||
// Picks one spelling for words the text spells several ways. Takes the chapter markup as a string and returns the
// corrected string.
function standardizeSpellings(xml) {
  const spellingRules = [
    // Usually "TV", occasionally "tv" or "t.v."
    [/(\b)tv(\b)/ug, "$1TV$2"],
    [/t\.v\./uig, "TV"],

    // "okay" is preferred to "ok" or "o.k.". substitutions.json sometimes reverts this where people are writing
    // notes and the informality was probably intended. Per https://en.wikipedia.org/wiki/A-ok the "A" in "A-okay"
    // should be capitalized.
    [/Ok([,. ])/ug, "Okay$1"],
    [/([^a-zA-Z])ok([^a])/ug, "$1okay$2"],
    [/([^a-zA-Z])o\.k\.([^a])/ug, "$1okay$2"],
    [/a-okay/ug, "A-okay"],

    // Signal(l)ing/signal(l)ed: both are acceptable English; standardize on single-L.
    [/(S|s)ignall/ug, "$1ignal"],

    // Clich(e|é): standardize on including the accent.
    [/cliche/ug, "cliché"],

    // "t-shirt" is usually lowercase; normalize the rest, except after a double space.
    [/(?<! {2})T-shirt/ug, "t-shirt"],

    // "gray" is the majority spelling, except for "greyhound"
    [/(G|g)rey(?!hound)/ug, "$1ray"],

    // 12 instances of "Dragon-craft", 12 of "Dragon craft", 1 of "dragon craft"
    [/[Dd]ragon[ -](craft|mech)/ug, "Dragon-$1"],

    // 88 instances of "A.I." to four of "AI"
    [/(?<=\b)AI(?=\b)/ug, "A.I."],

    // 2 instances of "G.M." to one of "GM"
    [/(?<=\b)GM(?=\b)/ug, "G.M."]
  ];

  let result = xml;
  for (const [pattern, replacement] of spellingRules) {
    result = result.replace(pattern, replacement);
  }
  return result;
}
|
||||
|
||||
// Standardizes how PRT case numbers are written. Takes the chapter markup as a string and returns the corrected
// string.
//
// Case numbers are very inconsistent. For "Case Fifty-Three", the breakdown is:
// * 9 Case-53
// * 6 Case 53
// * 2 case-53
// * 1 Case-Fifty-Three
// * 41 Case Fifty-Three
// * 1 Case Fifty Three
// * 13 Case fifty-three
// * 119 case fifty-three
// * 4 case-fifty-three
// * 1 case fifty three
// We standardize on "Case Fifty-Three"; although it isn't the most common, it seems best to treat these as proper
// nouns.
function fixCaseNumbers(xml) {
  // Two-word case numbers, written either as words or as digits. Each rule's digit alternative matches its own
  // number. A following "’" is left alone.
  xml = xml.replace(/case[ -](?:fifty[ -]three|53)(?!’)/uig, "Case Fifty-Three");
  xml = xml.replace(/case[ -](?:thirty[ -]two|32)(?!’)/uig, "Case Thirty-Two");
  xml = xml.replace(/case[ -](?:sixty[ -]nine|69)(?!’)/uig, "Case Sixty-Nine");

  // Single-word case numbers. "in case one ..." is ordinary prose, and a trailing hyphen means the number is the
  // first half of a compound, so both are skipped.
  xml = xml.replace(
    /(?<!in )case[ -](zero|one|two|three|four|twelve|fifteen|seventy|ninety)(?!-)/uig,
    (_, caseNumber) => `Case ${caseNumber[0].toUpperCase()}${caseNumber.substring(1)}`
  );

  return xml;
}
|
||||
|
||||
// Returns true when an element's trimmed text content is empty or starts with one of the chapter-navigation
// labels ("Last Chapter", "Previous Chapter", "Next Chapter"). `el` only needs a string `textContent` property.
function isEmptyOrGarbage(el) {
  const trimmed = el.textContent.trim();
  if (trimmed === "") {
    return true;
  }

  const navigationLabels = ["Last Chapter", "Previous Chapter", "Next Chapter"];
  return navigationLabels.some(label => trimmed.startsWith(label));
}
|
||||
|
||||
/**
 * Backslash-escapes regular expression metacharacters so the string can be embedded in a RegExp and matched
 * literally.
 *
 * @param {string} str - Literal text.
 * @returns {string} The text with each metacharacter preceded by a backslash.
 */
function escapeRegExp(str) {
  const metacharacters = /[[\]/{}()*+?.\\^$|]/ug;
  return str.replace(metacharacters, matched => `\\${matched}`);
}
|
||||
|
||||
/**
 * Decodes a hex string in which the first byte is an XOR key and every following byte is one character of the
 * result XORed with that key.
 *
 * @param {string} hash - Hex-encoded input (key byte followed by the obfuscated bytes).
 * @returns {string} The decoded text.
 */
function decodeCloudFlareEmail(hash) {
  const key = parseInt(hash.substring(0, 2), 16);

  const decodedChars = [];
  let offset = 2;
  while (offset < hash.length) {
    const obfuscatedByte = parseInt(hash.substring(offset, offset + 2), 16);
    decodedChars.push(String.fromCharCode(obfuscatedByte ^ key));
    offset += 2;
  }

  return decodedChars.join("");
}
|
||||
418
lib/convert.js
418
lib/convert.js
|
|
@ -1,386 +1,48 @@
|
|||
"use strict";
|
||||
const path = require("path");
|
||||
const fs = require("mz/fs");
|
||||
const throat = require("throat");
|
||||
const serializeToXML = require("xmlserializer").serializeToString;
|
||||
const { JSDOM } = require("jsdom");
|
||||
const substitutions = require("./substitutions.json");
|
||||
const fs = require("fs").promises;
|
||||
const { performance } = require("perf_hooks");
|
||||
const workerpool = require("workerpool");
|
||||
const cliProgress = require("cli-progress");
|
||||
|
||||
module.exports = async (cachePath, manifestPath, contentPath) => {
|
||||
module.exports = async (cachePath, manifestPath, contentPath, book, concurrentJobs) => {
|
||||
const manifestContents = await fs.readFile(manifestPath, { encoding: "utf-8" });
|
||||
const chapters = JSON.parse(manifestContents);
|
||||
console.log("All chapters downloaded; beginning conversion to EPUB chapters");
|
||||
|
||||
const mapper = throat(10, chapter => convertChapter(chapter, cachePath, contentPath));
|
||||
await Promise.all(chapters.map(mapper));
|
||||
console.log("Converting raw downloaded HTML to EPUB chapters");
|
||||
const progress = new cliProgress.SingleBar({
|
||||
stopOnComplete: true,
|
||||
clearOnComplete: true,
|
||||
format: " {bar} {percentage}% | {time} | {value}/{total}"
|
||||
}, cliProgress.Presets.shades_classic);
|
||||
|
||||
console.log("All chapters converted");
|
||||
const start = performance.now();
|
||||
progress.start(chapters.length, 0, { time: " " });
|
||||
|
||||
const poolOptions = {};
|
||||
if (concurrentJobs !== undefined) {
|
||||
poolOptions.maxWorkers = concurrentJobs;
|
||||
}
|
||||
const pool = workerpool.pool(path.resolve(__dirname, "convert-worker.js"), poolOptions);
|
||||
|
||||
const warnings = [];
|
||||
await Promise.all(chapters.map(async chapter => {
|
||||
const inputPath = path.resolve(cachePath, chapter.filename);
|
||||
|
||||
const destFileName = `${path.basename(chapter.filename, ".html")}.xhtml`;
|
||||
const outputPath = path.resolve(contentPath, destFileName);
|
||||
|
||||
warnings.push(...await pool.exec("convertChapter", [chapter, book, inputPath, outputPath]));
|
||||
|
||||
const seconds = String(Math.round((performance.now() - start) / 1000)).padStart(3);
|
||||
progress.increment({ time: `${seconds} s` });
|
||||
}));
|
||||
|
||||
pool.terminate();
|
||||
|
||||
for (const warning of warnings) {
|
||||
console.warn(warning);
|
||||
}
|
||||
|
||||
console.log(`All chapters converted in ${Math.round((performance.now() - start) / 100) / 10} seconds`);
|
||||
};
|
||||
|
||||
/**
 * Converts one cached raw chapter HTML file into an XHTML chapter file.
 *
 * Reads `chapter.filename` from `cachePath`, builds the chapter document string, and writes it into `contentPath`
 * under the same base name with an ".xhtml" extension.
 *
 * @param {{filename: string}} chapter - Manifest entry for the chapter (also passed on to getChapterString).
 * @param {string} cachePath - Directory holding the downloaded raw HTML files.
 * @param {string} contentPath - Directory that receives the converted file.
 * @returns {Promise<void>} Resolves once the converted file has been written.
 */
async function convertChapter(chapter, cachePath, contentPath) {
  const { filename } = chapter;
  const filePath = path.resolve(cachePath, filename);

  const contents = await fs.readFile(filePath, { encoding: "utf-8" });

  const rawChapterJSDOM = new JSDOM(contents);
  let output;
  try {
    output = getChapterString(chapter, rawChapterJSDOM.window.document);
  } finally {
    // TODO: this should probably not be necessary... jsdom bug I guess!?
    // Done in `finally` so the window is also released when conversion throws.
    rawChapterJSDOM.window.close();
  }

  const destFileName = `${path.basename(filename, ".html")}.xhtml`;
  const destFilePath = path.resolve(contentPath, destFileName);

  await fs.writeFile(destFilePath, output);
  console.log(`- Finished converting ${filename}`);
}
|
||||
|
||||
/**
 * Builds the complete XHTML document string for one chapter.
 *
 * @param {{title: string}} chapter - Manifest entry; `title` becomes the document <title>.
 * @param {Document} rawChapterDoc - Parsed raw chapter page; its ".entry-content" element supplies the body.
 * @returns {string} The XHTML document, including XML declaration and doctype.
 */
function getChapterString(chapter, rawChapterDoc) {
  const body = getBodyXML(chapter, rawChapterDoc.querySelector(".entry-content"));

  // The title is plain text, so characters that are markup-significant must be escaped before being placed
  // inside <title>; otherwise a title containing "&" or "<" produces a malformed document.
  const escapedTitle = String(chapter.title)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");

  return `<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8" />
<title>${escapedTitle}</title>
</head>
${body}
</html>`;
}
|
||||
|
||||
/**
 * Cleans up a chapter's content element and serializes it as an XHTML <body> string.
 *
 * The work happens in two phases: DOM-level cleanup (dropping navigation paragraphs, trailing non-content
 * elements, junk attributes, and pointless <span>s), followed by a long series of text-level regular expression
 * fixes on the serialized XML, and finally the per-chapter one-off substitutions from substitutions.json.
 *
 * Note that `contentEl` is mutated: its children are modified and then moved into a synthesized <body>.
 *
 * @param {{title: string, url: string}} chapter - Manifest entry; `title` becomes the <h1>, `url` selects the
 *   one-off substitutions.
 * @param {Element} contentEl - The chapter's main content element.
 * @returns {string} The serialized <body> element.
 */
function getBodyXML(chapter, contentEl) {
  // Remove initial Next Chapter and Previous Chapter <p>
  contentEl.removeChild(contentEl.firstElementChild);

  // Remove everything after the last <p> (e.g. analytics <div>s)
  const lastP = contentEl.querySelector("p:last-of-type");
  while (contentEl.lastElementChild !== lastP) {
    contentEl.removeChild(contentEl.lastElementChild);
  }

  // Remove empty <p>s or Last Chapter/Next Chapter <p>s
  while (isEmptyOrGarbage(contentEl.lastElementChild)) {
    contentEl.removeChild(contentEl.lastElementChild);
  }

  // Remove redundant attributes and style
  Array.prototype.forEach.call(contentEl.children, child => {
    if (child.getAttribute("dir") === "ltr") {
      child.removeAttribute("dir");
    }

    // Only ever appears with align="LEFT" (useless) or align="CENTER" overridden by style="text-align: left;" (also
    // useless)
    child.removeAttribute("align");

    if (child.getAttribute("style") === "text-align:left;") {
      child.removeAttribute("style");
    }
    if (child.getAttribute("style") === "text-align:left;padding-left:30px;") {
      child.setAttribute("style", "padding-left:30px;");
    }
  });

  // Remove empty <em>s and <i>s
  // Remove style attributes from them, as they're always messed up.
  const ems = contentEl.querySelectorAll("em, i");
  Array.prototype.forEach.call(ems, em => {
    if (em.textContent.trim() === "") {
      const replacement = contentEl.ownerDocument.createTextNode(" ");
      em.parentNode.replaceChild(replacement, em);
    } else {
      em.removeAttribute("style");
    }
  });

  // In https://parahumans.wordpress.com/2013/01/05/monarch-16-13/ there are some <address>s that should be <p>s O_o
  const addresses = contentEl.querySelectorAll("address");
  Array.prototype.forEach.call(addresses, address => {
    const p = contentEl.ownerDocument.createElement("p");
    p.innerHTML = address.innerHTML;
    address.parentNode.replaceChild(p, address);
  });

  // Every <span> except underline ones is pointless at best and frequently messed up. (Weird font size, line spacing,
  // etc.)
  const spans = contentEl.querySelectorAll("span");
  Array.prototype.forEach.call(spans, span => {
    if (span.getAttribute("style") === "text-decoration:underline;") {
      return;
    }

    if (span.textContent.trim() === "") {
      span.parentNode.removeChild(span);
    } else {
      // Unwrap: keep the span's children, drop the span itself.
      const docFrag = contentEl.ownerDocument.createDocumentFragment();
      while (span.firstChild) {
        docFrag.appendChild(span.firstChild);
      }
      span.parentNode.replaceChild(docFrag, span);
    }
  });


  // Synthesize a <body> tag to serialize
  const bodyEl = contentEl.ownerDocument.createElement("body");
  const h1El = contentEl.ownerDocument.createElement("h1");
  h1El.textContent = chapter.title;

  bodyEl.appendChild(h1El);
  while (contentEl.firstChild) {
    bodyEl.appendChild(contentEl.firstChild);
  }

  let xml = serializeToXML(bodyEl);

  // Fix recurring strange pattern of extra <br> in <p>...<em>...<br>\n</em></p>
  xml = xml.replace(/<br\/>\s*<\/em><\/p>/g, "</em></p>");

  // There are way too many nonbreaking spaces where they don't belong.
  // If they show up three in a row, then let them live. Otherwise, they die.
  xml = xml.replace(/([^\xA0])\xA0\xA0?([^\xA0])/g, "$1 $2");

  function fixEms() {
    // Fix recurring broken-up or erroneous <em>s
    xml = xml.replace(/<\/em>‘s/g, "’s</em>");
    xml = xml.replace(/<em><\/em>/g, "");
    xml = xml.replace(/<\/em><em>/g, "");
    xml = xml.replace(/<em>(\s?\s?[^A-Za-z]\s?\s?)<\/em>/g, "$1");
    xml = xml.replace(/<\/em>(\s?\s?[^A-Za-z]\s?\s?)<em>/g, "$1");
    xml = xml.replace(/“<em>([^>]+)<\/em>(!|\?|\.)”/g, "“<em>$1$2</em>”");
    xml = xml.replace(/<p><em>([^>]+)<\/em>(!|\?|\.)<\/p>/g, "<p><em>$1$2</em></p>");
    xml = xml.replace(/(!|\?|\.)\s{2}<\/em><\/p>/g, "$1</em></p>");
    xml = xml.replace(/<em>([a-z]+)\?<\/em>/g, "<em>$1</em>?");
    xml = xml.replace(/<em>([^>]+?)( +)<\/em>/g, "<em>$1</em>$2");
    xml = xml.replace(/<em> ([a-zA-Z]+)<\/em>/g, " <em>$1</em>");
    xml = xml.replace(/<em>‘\s*([^<]+)\s*’<\/em>/g, "‘<em>$1</em>’");
    xml = xml.replace(/<em>‘\s*([^<]+)\s*<\/em>\s*’/g, "‘<em>$1</em>’");
    xml = xml.replace(/‘\s*<em>\s*([^<]+)\s*’<\/em>/g, "‘<em>$1</em>’");
    xml = xml.replace(/<em>“\s*([^<]+)\s*”<\/em>/g, "“<em>$1</em>”");
    xml = xml.replace(/<em>“\s*([^<]+)\s*<\/em>\s*”/g, "“<em>$1</em>”");
    xml = xml.replace(/“\s*<em>\s*([^<]+)\s*”<\/em>/g, "“<em>$1</em>”");
    xml = xml.replace(/([^\n>])<em> ?/g, "$1 <em>");
    xml = xml.replace(/ ?<\/em>/g, "</em> ");
    xml = xml.replace(/<p([^>]+)> <em>/g, "<p$1><em>");
    xml = xml.replace(/<\/em> <\/p>/g, "</em></p>");
    xml = xml.replace(/<em>([a-z]+),<\/em>/g, "<em>$1</em>,");
  }

  function fixQuotesAndApostrophes() {
    // Fix recurring poor quotes and apostrophes
    xml = xml.replace(/<p>”/g, "<p>“");
    xml = xml.replace(/“\s*<\/p>/g, "”</p>");
    xml = xml.replace(/“\s*<\/em><\/p>/g, "</em>”</p>");
    xml = xml.replace(/‘\s*<\/p>/g, "’</p>");
    xml = xml.replace(/‘\s*<\/em><\/p>/g, "’</em></p>");
    xml = xml.replace(/,” <\/em>/g, "</em>,” ");
    xml = xml.replace(/′/g, "’");
    xml = xml.replace(/″/g, "”");
    xml = xml.replace(/([A-Za-z])‘s(\s?)/g, "$1’s$2");
    xml = xml.replace(/I‘m/g, "I’m");
    xml = xml.replace(/<p>“\s+/g, "<p>“");
    xml = xml.replace(/'/g, "’");
    xml = xml.replace(/’([A-Za-z]+)’/g, "‘$1’");
    xml = xml.replace(/‘Sup/g, "’Sup");
  }

  // These interact with each other, so do them a few times.
  xml = xml.replace(/,” <\/em>/g, "</em>,” ");
  fixEms();
  fixQuotesAndApostrophes();
  fixEms();
  fixQuotesAndApostrophes();
  fixEms();

  // Fix possessive of names ending in "s"
  // Note: if the "s" is unvoiced, as in Marquis, then it doesn't get the second "s".
  xml = xml.replace(/([^‘])Judas’([^s])/g, "$1Judas’s$2");
  xml = xml.replace(/([^‘])Brutus’([^s])/g, "$1Brutus’s$2");
  xml = xml.replace(/([^‘])Jess’([^s])/g, "$1Jess’s$2");
  xml = xml.replace(/([^‘])Aegis’([^s])/g, "$1Aegis’s$2");
  xml = xml.replace(/([^‘])Dauntless’([^s])/g, "$1Dauntless’s$2");
  xml = xml.replace(/([^‘])Circus’([^s])/g, "$1Circus’s$2");
  xml = xml.replace(/([^‘])Sirius’([^s])/g, "$1Sirius’s$2");
  xml = xml.replace(/([^‘])Brooks’([^s])/g, "$1Brooks’s$2");
  xml = xml.replace(/([^‘])Genesis’([^s])/g, "$1Genesis’s$2");
  xml = xml.replace(/([^‘])Atlas’([^s])/g, "$1Atlas’s$2");
  xml = xml.replace(/([^‘])Lucas’([^s])/g, "$1Lucas’s$2");
  xml = xml.replace(/([^‘])Gwerrus’([^s])/g, "$1Gwerrus’s$2");
  xml = xml.replace(/([^‘])Chris’([^s])/g, "$1Chris’s$2");
  xml = xml.replace(/([^‘])Eligos’([^s])/g, "$1Eligos’s$2");
  xml = xml.replace(/([^‘])Animos’([^s])/g, "$1Animos’s$2");
  xml = xml.replace(/([^‘])Mags’([^s])/g, "$1Mags’s$2");
  xml = xml.replace(/([^‘])Huntress’([^s])/g, "$1Huntress’s$2");
  xml = xml.replace(/([^‘])Hephaestus’([^s])/g, "$1Hephaestus’s$2");

  // Fixes dashes
  xml = xml.replace(/ – /g, "—");
  xml = xml.replace(/“-/g, "“—");
  xml = xml.replace(/-[,.]?”/g, "—”");
  xml = xml.replace(/-(!|\?)”/g, "—$1”");
  xml = xml.replace(/-[,.]?<\/em>”/g, "—</em>”");
  xml = xml.replace(/-“/g, "—”");
  xml = xml.replace(/<p>-/g, "<p>—");
  xml = xml.replace(/-<\/p>/g, "—</p>");
  xml = xml.replace(/-<\/em><\/p>/g, "—</em></p>");
  xml = xml.replace(/\s?\s?–\s?\s?/g, "—");
  xml = xml.replace(/-\s\s?/g, "—");
  xml = xml.replace(/\s?\s-/g, "—");
  xml = xml.replace(/\s+—”/g, "—”");
  xml = xml.replace(/I-I/g, "I—I");
  xml = xml.replace(/I-uh/g, "I—uh");

  // Use <hr> for separators
  xml = xml.replace(/<p>■<\/p>/g, "<hr/>");
  xml = xml.replace(/<p style="text-align:center;">■<\/p>/g, "<hr/>");

  // Fix recurring miscapitalization with questions
  xml = xml.replace(/\?”\s\s?She asked/g, "?” she asked");
  xml = xml.replace(/\?”\s\s?He asked/g, "?” he asked");

  // Fix bad periods and spacing/markup surrounding them
  xml = xml.replace(/\.\.<\/p>/g, ".</p>");
  xml = xml.replace(/\.\.”<\/p>/g, ".”</p>");
  xml = xml.replace(/ \. /g, ". ");
  xml = xml.replace(/ \.<\/p>/g, ".</p>");
  xml = xml.replace(/\.<em>\.\./g, "<em>…");

  // Fix extra spaces
  xml = xml.replace(/ ? <\/p>/g, "</p>");
  xml = xml.replace(/([a-z]) ,/g, "$1,");

  // The author often fails to terminate a sentence, instead using a comma after a dialogue tag. For example,
  // > “I didn’t get much done,” Greg said, “I got distracted by...
  // This should instead be
  // > “I didn’t get much done,” Greg said. “I got distracted by...
  //
  // Our heuristic is to try to automatically fix this if the dialogue tag is two words (X said/admitted/sighed/etc.).
  //
  // This sometimes overcorrects, as in the following example:
  // > “Basically,” Alec said, “For your powers to manifest, ...
  // Here instead we should lowercase the "f". We handle that via one-offs in substitutions.json.
  //
  // This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of
  // times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the
  // capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized.
  xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/g, ",” $1. “$2");

  // Replace single-word <i>s with <em>s. Other <i>s are probably erroneous too, but these are known-bad.
  xml = xml.replace(/<i>([A-Za-z]+)<\/i>/g, "<em>$1</em>");

  // This occurs enough times it's better to do here than in one-off fixes. We correct the single instance where
  // it's incorrect to capitalize in the one-off fixes.
  xml = xml.replace(/the clairvoyant/g, "the Clairvoyant");

  // This is sometimes missing its capitalization.
  xml = xml.replace(/the birdcage/g, "the Birdcage");

  // This is usually spelled "TV" but sometimes the other ways. Normalize.
  xml = xml.replace(/tv/g, "TV");
  xml = xml.replace(/T\.V\./g, "TV");

  // There's no reason why these should be capitalized. (Note that they never appear at the beginning of any sentences.)
  xml = xml.replace(/Halberd/g, "halberd");
  xml = xml.replace(/Loft/g, "loft");

  // Especially early in the story, PRT designations are capitalized; they should not be. This fixes the cases where we
  // can be reasonably sure they don't start a sentence, although more specific instances are done in
  // substitutions.json, and some need to be back-corrected.
  //
  // Note: "Master" is specifically omitted because it fails poorly on Interlude 4. Other instances need to be
  // corrected via substitutions.json.
  xml = xml.replace(
    /([a-zA-Z,] |\/)(Mover|Shaker|Brute|Breaker|Tinker|Blaster|Thinker|Striker|Changer|Trump|Stranger|Shifter|Shaper)/g,
    (_, prefix, designation) => prefix + designation.toLowerCase()
  );
  xml = xml.replace(
    /(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)-(\d+)/gi,
    "$1 $2"
  );
  // Join paired designations with an en dash. The hyphen in the separator class is escaped so that the class means
  // exactly "space, hyphen, or slash"; unescaped, " -/" is a range covering every character from space through
  // slash (including punctuation such as "," and ".").
  xml = xml.replace(
    // eslint-disable-next-line max-len
    /(mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)[ \-/](mover|shaker|brute|breaker|tinker|blaster|thinker|master|striker|changer|trump|stranger|shifter|shaper)/gi,
    "$1–$2"
  );

  // This is consistently missing accents
  xml = xml.replace(/Yangban/g, "Yàngbǎn");

  // Place names need to always be capitalized
  xml = xml.replace(/North end/g, "North End");
  xml = xml.replace(/Stonemast avenue/g, "Stonemast Avenue");
  xml = xml.replace(/Shale avenue/g, "Shale Avenue");
  xml = xml.replace(/Lord street/g, "Lord Street");
  xml = xml.replace(/Slater street/g, "Slater Street");

  // These are usually not italicized, but sometimes are. Other foreign-language names (like Yàngbǎn) are not
  // italicized, so we go in the direction of removing the italics.
  xml = xml.replace(/<em>Garama<\/em>/g, "Garama");
  xml = xml.replace(/<em>Thanda<\/em>/g, "Thanda");
  xml = xml.replace(/<em>Sifara([^<]*)<\/em>/g, "Sifara$1");
  xml = xml.replace(/<em>Moord Nag([^<]*)<\/em>/g, "Moord Nag$1");
  xml = xml.replace(/<em>Califa de Perro([^<]*)<\/em>/g, "Califa de Perro$1");
  xml = xml.replace(/<em>Turanta([^<]*)<\/em>/g, "Turanta$1");

  // "okay" is preferred to "ok". This sometimes gets changed back via substitutions.json when people are writing notes
  // and thus probably the intention was to be less formal. Also it seems per https://en.wikipedia.org/wiki/A-ok the
  // "A" in "A-okay" should be capitalized.
  xml = xml.replace(/Ok([,. ])/g, "Okay$1");
  xml = xml.replace(/([^a-zA-Z])ok([^a])/g, "$1okay$2");
  xml = xml.replace(/a-okay/g, "A-okay");

  // Signal(l)ing/signal(l)ed are spelled both ways. Both are acceptable in English. Let's standardize on single-L.
  xml = xml.replace(/(S|s)ignall/g, "$1ignal");

  // Clich(e|é) is spelled both ways. Let's standardize on including the accent.
  xml = xml.replace(/cliche/g, "cliché");

  // "gray" is the majority spelling, except for "greyhound"
  xml = xml.replace(/(G|g)rey(?!hound)/g, "$1ray");

  // These are consistently missing hyphens.
  xml = xml.replace(/self destruct/g, "self-destruct");
  xml = xml.replace(/life threatening/g, "life-threatening");
  xml = xml.replace(/hard headed/g, "hard-headed");
  xml = xml.replace(/shoulder mounted/g, "shoulder-mounted");
  xml = xml.replace(/golden skinned/g, "golden-skinned");
  xml = xml.replace(/creepy crawl/g, "creepy-crawl");
  xml = xml.replace(/well armed/g, "well-armed");

  // One-off fixes
  (substitutions[chapter.url] || []).forEach(substitution => {
    if (substitution.before) {
      // Literal substitution: warn when the text is missing or ambiguous, then replace the first occurrence.
      const indexOf = xml.indexOf(substitution.before);
      if (indexOf === -1) {
        console.warn(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` +
                     `updated at the source, in which case, you should edit substitutions.json.`);
      }
      if (indexOf !== xml.lastIndexOf(substitution.before)) {
        console.warn(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` +
                     `Update substitutions.json for a more precise substitution.`);
      }

      xml = xml.replace(new RegExp(escapeRegExp(substitution.before)), substitution.after);
    } else if (substitution.regExp) {
      xml = xml.replace(new RegExp(substitution.regExp, "g"), substitution.replacement);
    } else {
      console.warn(`Invalid substitution specified for ${chapter.url}`);
    }
  });

  // Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a <html>
  xml = xml.replace(/<body xmlns="http:\/\/www.w3.org\/1999\/xhtml">/, "<body>");

  return xml;
}
|
||||
|
||||
/**
 * Reports whether an element has no real content: blank after trimming, or text that starts with a
 * "Last Chapter" / "Next Chapter" navigation label.
 *
 * @param {{textContent: string}} el - Element (or element-like object) to inspect.
 * @returns {boolean} True when the element is empty or navigation residue.
 */
function isEmptyOrGarbage(el) {
  const trimmed = el.textContent.trim();
  if (trimmed === "") {
    return true;
  }

  return ["Last Chapter", "Next Chapter"].some(label => trimmed.startsWith(label));
}
|
||||
|
||||
/**
 * Backslash-escapes regular expression metacharacters (including "-") so the string matches literally when
 * compiled into a RegExp.
 *
 * @param {string} str - Literal text.
 * @returns {string} The text with each metacharacter preceded by a backslash.
 */
function escapeRegExp(str) {
  const metacharacters = /[-[\]/{}()*+?.\\^$|]/g;
  return str.replace(metacharacters, matched => `\\${matched}`);
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
"use strict";
|
||||
const path = require("path");
|
||||
const fs = require("mz/fs");
|
||||
const mkdirp = require("mkdirp-then");
|
||||
const request = require("requisition");
|
||||
const fs = require("fs").promises;
|
||||
const fetch = require("minipass-fetch");
|
||||
const { JSDOM } = require("jsdom");
|
||||
|
||||
const FILENAME_PREFIX = "chapter";
|
||||
|
|
@ -35,39 +34,26 @@ async function downloadAllChapters(manifest, startChapterURL, cachePath, manifes
|
|||
manifest = [];
|
||||
}
|
||||
|
||||
await mkdirp(cachePath);
|
||||
await fs.mkdir(cachePath, { recursive: true });
|
||||
|
||||
while (currentChapter !== null) {
|
||||
const filename = `${FILENAME_PREFIX}${chapterIndex.toString().padStart(3, "0")}.html`;
|
||||
|
||||
console.log(`Downloading ${currentChapter}`);
|
||||
process.stdout.write(`Downloading ${currentChapter}... `);
|
||||
|
||||
const response = await downloadChapter(currentChapter);
|
||||
const contents = await response.text();
|
||||
console.log("- Response body received");
|
||||
const rawChapterJSDOM = new JSDOM(contents, { url: currentChapter });
|
||||
console.log("- Response body parsed into DOM");
|
||||
const { contents, dom, url } = await downloadChapter(currentChapter);
|
||||
const title = getChapterTitle(dom.window.document);
|
||||
currentChapter = getNextChapterURL(dom.window.document);
|
||||
|
||||
const chapterURLToSave = currentChapter;
|
||||
const chapterTitle = getChapterTitle(rawChapterJSDOM.window.document);
|
||||
currentChapter = getNextChapterURL(rawChapterJSDOM.window.document);
|
||||
|
||||
// TODO: this should probably not be necessary... jsdom bug I guess!?
|
||||
rawChapterJSDOM.window.close();
|
||||
|
||||
manifest.push({
|
||||
url: chapterURLToSave,
|
||||
title: chapterTitle,
|
||||
filename
|
||||
});
|
||||
dom.window.close();
|
||||
|
||||
manifest.push({ url, title, filename });
|
||||
await fs.writeFile(path.resolve(cachePath, filename), contents);
|
||||
console.log("- Response text saved to cache file");
|
||||
|
||||
// Incrementally update the manifest after every successful download, instead of waiting until the end.
|
||||
const newManifestContents = JSON.stringify(manifest, undefined, 2);
|
||||
await fs.writeFile(manifestPath, newManifestContents);
|
||||
console.log("- Manifest updated");
|
||||
process.stdout.write("done\n");
|
||||
|
||||
++chapterIndex;
|
||||
}
|
||||
|
|
@ -79,18 +65,29 @@ function getNextChapterURL(rawChapterDoc) {
|
|||
// - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
|
||||
// So instead search for the first <a> within the main content area starting with "Next", trimmed.
|
||||
|
||||
let result = null;
|
||||
const aEls = rawChapterDoc.querySelectorAll(".entry-content a");
|
||||
for (let i = 0; i < aEls.length; ++i) {
|
||||
if (aEls[i].textContent.trim().startsWith("Next")) {
|
||||
return aEls[i].href;
|
||||
result = aEls[i].href;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
// Except, this doesn't always work, because the "Next Chapter" link in
|
||||
// https://www.parahumans.net/2020/04/28/last-20-e6/ is just broken for some reason. We hard-code that.
|
||||
if (result === "https://www.parahumans.net/?p=3365&preview=true") {
|
||||
return "https://www.parahumans.net/2020/05/02/last-20-end/";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Extracts the chapter title from a downloaded chapter page.
 *
 * @param {Document} rawChapterDoc - Parsed chapter page containing an "h1.entry-title" element.
 * @returns {string} The title text with the first " – " separator replaced by a single space.
 */
function getChapterTitle(rawChapterDoc) {
  // Remove " – " because it's present in Ward but not in Worm, which is inconsistent. (And leaving it in causes slight
  // issues down the line where we remove spaces around em dashes during conversion.) In the future it might be nice to
  // have proper chapter titles, e.g. sections per arc with title pages and then just "1" or similar for the chapter.
  // Until then this is reasonable and uniform.
  return rawChapterDoc.querySelector("h1.entry-title").textContent.replace(/ – /u, " ");
}
|
||||
|
||||
function retry(times, fn) {
|
||||
|
|
@ -103,9 +100,33 @@ function retry(times, fn) {
|
|||
});
|
||||
}
|
||||
|
||||
function downloadChapter(url) {
|
||||
/**
 * Downloads a chapter page, following any <meta http-equiv="refresh"> redirects until a page without one is
 * reached.
 *
 * @param {string} startingURL - URL at which to begin.
 * @returns {Promise<{url: string, contents: string, dom: JSDOM}>} The final URL, its raw response text, and the
 *   parsed DOM. The caller is responsible for closing the returned DOM's window.
 * @throws {Error} If a refresh meta tag is present but its content is not of the form "<digits>;url=<target>".
 */
async function downloadChapter(startingURL) {
  let urlToFollow = startingURL;

  let url, contents, dom;
  while (urlToFollow !== null) {
    const response = await downloadWithRetry(urlToFollow);

    url = urlToFollow;
    contents = await response.text();
    dom = new JSDOM(contents, { url });

    const refreshMeta = dom.window.document.querySelector("meta[http-equiv=refresh]");
    if (refreshMeta) {
      const redirectMatch = /\d+;url=(.*)/ui.exec(refreshMeta.content);

      // This intermediate page is never returned, so release it before either following the redirect or failing.
      dom.window.close();

      if (redirectMatch === null) {
        // Fail with a descriptive message rather than an opaque destructuring TypeError.
        throw new Error(`Could not parse the refresh redirect "${refreshMeta.content}" found at ${url}`);
      }

      [, urlToFollow] = redirectMatch;
      process.stdout.write(`\n Redirected to ${urlToFollow}... `);
    } else {
      urlToFollow = null;
    }
  }

  return { url, contents, dom };
}
|
||||
|
||||
function downloadWithRetry(url) {
|
||||
return retry(3, async () => {
|
||||
const response = await request(url).redirects(10);
|
||||
const response = await fetch(url);
|
||||
if (response.status !== 200) {
|
||||
throw new Error(`Response status for ${url} was ${response.status}`);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,36 +1,29 @@
|
|||
"use strict";
|
||||
const fs = require("mz/fs");
|
||||
const fs = require("fs").promises;
|
||||
const path = require("path");
|
||||
const cpr = require("thenify")(require("cpr"));
|
||||
const cpr = require("util").promisify(require("cpr"));
|
||||
|
||||
const BOOK_TITLE = "Worm";
|
||||
const BOOK_AUTHOR = "wildbow";
|
||||
const BOOK_PUBLISHER = "Domenic Denicola";
|
||||
const BOOK_ID = "urn:uuid:e7f3532d-8db6-4888-be80-1976166b7059";
|
||||
|
||||
// First paragraph of https://parahumans.wordpress.com/about/
|
||||
const BOOK_DESCRIPTION = `
|
||||
An introverted teenage girl with an unconventional superpower, Taylor goes out in costume to find escape from a deeply
|
||||
unhappy and frustrated civilian life. Her first attempt at taking down a supervillain sees her mistaken for one,
|
||||
thrusting her into the midst of the local ‘cape’ scene’s politics, unwritten rules, and ambiguous morals. As she risks
|
||||
life and limb, Taylor faces the dilemma of having to do the wrong things for the right reasons.`;
|
||||
const BOOK_AUTHOR = "Wildbow";
|
||||
|
||||
const NCX_FILENAME = "toc.ncx";
|
||||
|
||||
const COVER_IMG_FILENAME = "cover.png";
|
||||
const COVER_XHTML_FILENAME = "cover.xhtml";
|
||||
const COVER_MIMETYPE = "image/png";
|
||||
|
||||
module.exports = async (scaffoldingPath, bookPath, contentPath, chaptersPath, manifestPath) => {
|
||||
module.exports = async (scaffoldingPath, coverPath, bookPath, contentPath, chaptersPath, manifestPath, bookInfo) => {
|
||||
await Promise.all([
|
||||
cpr(scaffoldingPath, bookPath, { overwrite: true, confirm: true, filter: noThumbs }),
|
||||
getChapters(contentPath, chaptersPath, manifestPath).then(chapters => {
|
||||
cpr(coverPath, path.resolve(bookPath, "OEBPS"), { overwrite: true, confirm: true, filter: noThumbs }),
|
||||
Promise.all([
|
||||
getChapters(contentPath, chaptersPath, manifestPath),
|
||||
getCoverFiles(coverPath)
|
||||
]).then(([chapters, coverFiles]) => {
|
||||
return Promise.all([
|
||||
writeOPF(chapters, contentPath),
|
||||
writeNcx(chapters, contentPath)
|
||||
writeOPF(chapters, contentPath, coverFiles, bookInfo),
|
||||
writeNcx(chapters, contentPath, bookInfo)
|
||||
]);
|
||||
})
|
||||
]);
|
||||
|
||||
console.log(`EPUB contents assembled into ${scaffoldingPath}`);
|
||||
};
|
||||
|
||||
function noThumbs(filePath) {
|
||||
|
|
@ -38,7 +31,7 @@ function noThumbs(filePath) {
|
|||
return path.basename(filePath) !== "Thumbs.db";
|
||||
}
|
||||
|
||||
function writeOPF(chapters, contentPath) {
|
||||
function writeOPF(chapters, contentPath, coverFiles, bookInfo) {
|
||||
const manifestChapters = chapters.map(c => {
|
||||
return `<item id="${c.id}" href="${c.href}" media-type="application/xhtml+xml"/>`;
|
||||
}).join("\n");
|
||||
|
|
@ -51,19 +44,19 @@ function writeOPF(chapters, contentPath) {
|
|||
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
|
||||
|
||||
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
|
||||
<dc:title>${BOOK_TITLE}</dc:title>
|
||||
<dc:title>${bookInfo.title}</dc:title>
|
||||
<dc:language>en</dc:language>
|
||||
<dc:identifier id="BookId" opf:scheme="UUID">${BOOK_ID}</dc:identifier>
|
||||
<dc:identifier id="BookId" opf:scheme="UUID">urn:uuid:${bookInfo.id}</dc:identifier>
|
||||
<dc:creator opf:file-as="${BOOK_AUTHOR}" opf:role="aut">${BOOK_AUTHOR}</dc:creator>
|
||||
<dc:publisher>${BOOK_PUBLISHER}</dc:publisher>
|
||||
<dc:description>${BOOK_DESCRIPTION}</dc:description>
|
||||
<dc:description>${bookInfo.description}</dc:description>
|
||||
<meta name="cover" content="cover-image"/>
|
||||
</metadata>
|
||||
|
||||
<manifest>
|
||||
<item id="ncx" href="${NCX_FILENAME}" media-type="application/x-dtbncx+xml"/>
|
||||
<item id="cover" href="${COVER_XHTML_FILENAME}" media-type="application/xhtml+xml"/>
|
||||
<item id="cover-image" href="${COVER_IMG_FILENAME}" media-type="${COVER_MIMETYPE}"/>
|
||||
<item id="cover" href="${coverFiles.xhtml}" media-type="application/xhtml+xml"/>
|
||||
<item id="cover-image" href="${coverFiles.image}" media-type="${coverFiles.imageMimeType}"/>
|
||||
${manifestChapters}
|
||||
</manifest>
|
||||
|
||||
|
|
@ -73,14 +66,14 @@ ${spineChapters}
|
|||
</spine>
|
||||
|
||||
<guide>
|
||||
<reference type="cover" title="Cover" href="${COVER_XHTML_FILENAME}"/>
|
||||
<reference type="cover" title="Cover" href="${coverFiles.xhtml}"/>
|
||||
</guide>
|
||||
</package>`;
|
||||
|
||||
return fs.writeFile(path.resolve(contentPath, "content.opf"), contents);
|
||||
}
|
||||
|
||||
function writeNcx(chapters, contentPath) {
|
||||
function writeNcx(chapters, contentPath, bookInfo) {
|
||||
const navPoints = chapters.map((c, i) => {
|
||||
return `<navPoint class="chapter" id="${c.id}" playOrder="${i + 1}">
|
||||
<navLabel><text>${c.title}</text></navLabel>
|
||||
|
|
@ -92,14 +85,14 @@ function writeNcx(chapters, contentPath) {
|
|||
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
|
||||
<ncx version="2005-1" xml:lang="en" xmlns="http://www.daisy.org/z3986/2005/ncx/">
|
||||
<head>
|
||||
<meta name="dtb:uid" content="${BOOK_ID}"/>
|
||||
<meta name="dtb:uid" content="urn:uuid:${bookInfo.id}"/>
|
||||
<meta name="dtb:depth" content="1"/>
|
||||
<meta name="dtb:totalPageCount" content="0"/>
|
||||
<meta name="dtb:maxPageNumber" content="0"/>
|
||||
</head>
|
||||
|
||||
<docTitle>
|
||||
<text>${BOOK_TITLE}</text>
|
||||
<text>${bookInfo.title}</text>
|
||||
</docTitle>
|
||||
|
||||
<docAuthor>
|
||||
|
|
@ -133,3 +126,15 @@ async function getChapters(contentPath, chaptersPath, manifestPath) {
|
|||
};
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Locates the single cover image inside a cover directory and describes the cover files for the OPF manifest.
 *
 * @param {string} coverPath - Directory expected to contain exactly one ".png" or ".jpg" file.
 * @returns {Promise<{xhtml: string, imageMimeType: string, image: string}>} The cover page filename, the image's
 *   MIME type, and the image filename.
 * @throws {Error} If the directory does not contain exactly one matching image.
 */
async function getCoverFiles(coverPath) {
  const imageExtensions = [".png", ".jpg"];

  const entries = await fs.readdir(coverPath);
  const candidates = entries.filter(entry => imageExtensions.includes(path.extname(entry)));
  if (candidates.length !== 1) {
    throw new Error(`Expected one cover image in ${coverPath}; found ${candidates.length}`);
  }

  const [image] = candidates;
  let imageMimeType = "image/jpeg";
  if (path.extname(image) === ".png") {
    imageMimeType = "image/png";
  }

  return { xhtml: "cover.xhtml", imageMimeType, image };
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -2,86 +2,112 @@
|
|||
"use strict";
|
||||
/* eslint-disable no-process-exit */
|
||||
const path = require("path");
|
||||
const mkdirp = require("mkdirp-then");
|
||||
const rimraf = require("rimraf-then");
|
||||
const fs = require("fs").promises;
|
||||
const yargs = require("yargs");
|
||||
|
||||
const packageJson = require("../package.json");
|
||||
const books = require("./books.js");
|
||||
const download = require("./download.js");
|
||||
const convert = require("./convert.js");
|
||||
const scaffold = require("./scaffold.js");
|
||||
const zip = require("./zip.js");
|
||||
|
||||
const argv = yargs
|
||||
const OUTPUT_DEFAULT = "(Book name).epub";
|
||||
|
||||
const { argv } = yargs
|
||||
.usage(`${packageJson.description}\n\n${packageJson.name} [<command1> [<command2> [<command3> ...]]]\n\n` +
|
||||
"Each command will fail if the previously-listed one has not yet been run (with matching options).")
|
||||
.command("download", "download all chapters by crawling parahumans.wordpress.com")
|
||||
.command("convert", "convert the raw chapter HTML files into cleaned-up ebook chapters")
|
||||
.command("scaffold", "assemble the table of contents, etc. to complete the EPUB")
|
||||
.command("zip", "zip up the EPUB files into a .epub output")
|
||||
.option("s", {
|
||||
alias: "start-url",
|
||||
default: "https://parahumans.wordpress.com/2011/06/11/1-1/",
|
||||
describe: "the URL from which to start crawling, for the download command",
|
||||
"Each command will fail if the previously-listed one has not yet been run (with matching options).\n\n" +
|
||||
"Running with no commands is equivalent to running download convert scaffold zip.")
|
||||
.command("download", "download all chapters into the cache")
|
||||
.command("convert", "convert the raw HTML into cleaned-up ebook chapters")
|
||||
.command("scaffold", "assemble the table of contents, etc.")
|
||||
.command("zip", "zip up the created files into a .epub output")
|
||||
.option("b", {
|
||||
alias: "book",
|
||||
default: Object.keys(books)[0],
|
||||
describe: "the book to operate on",
|
||||
choices: Object.keys(books),
|
||||
requiresArg: true,
|
||||
global: true
|
||||
})
|
||||
.option("c", {
|
||||
alias: "cache-directory",
|
||||
alias: "cache",
|
||||
default: "cache",
|
||||
describe: "cache directory, for the download and convert commands",
|
||||
describe: "cache directory for downloaded raw chapters",
|
||||
requiresArg: true,
|
||||
global: true
|
||||
})
|
||||
.option("b", {
|
||||
alias: "book-directory",
|
||||
default: "book",
|
||||
describe: "directory in which to assemble the EPUB files before zipping, for the convert, scaffold, and zip " +
|
||||
"commands",
|
||||
.option("s", {
|
||||
alias: "staging",
|
||||
default: "staging",
|
||||
describe: "directory in which to assemble the EPUB files",
|
||||
requiresArg: true,
|
||||
global: true
|
||||
})
|
||||
.option("o", {
|
||||
alias: "out",
|
||||
default: "Worm.epub",
|
||||
describe: "output file destination, for the zip command",
|
||||
default: OUTPUT_DEFAULT,
|
||||
describe: "output file destination",
|
||||
requiresArg: true,
|
||||
global: true
|
||||
})
|
||||
.option("j", {
|
||||
alias: "jobs",
|
||||
default: undefined,
|
||||
defaultDescription: "# of CPU cores - 1",
|
||||
describe: "number of concurrent read/write conversion jobs",
|
||||
requiresArg: true,
|
||||
global: true
|
||||
})
|
||||
.demandCommand(1) // TODO remove and allow all
|
||||
.recommendCommands()
|
||||
.help()
|
||||
.version()
|
||||
.argv;
|
||||
.version();
|
||||
|
||||
const cachePath = path.resolve(argv.cacheDirectory);
|
||||
const outputFilename = argv.out === OUTPUT_DEFAULT ? `${books[argv.book].title}.epub` : argv.out;
|
||||
|
||||
const cachePath = path.resolve(argv.cache, argv.book);
|
||||
const manifestPath = path.resolve(cachePath, "manifest.json");
|
||||
|
||||
const scaffoldingPath = path.resolve(__dirname, "../scaffolding");
|
||||
const bookPath = path.resolve(argv.bookDirectory);
|
||||
const contentPath = path.resolve(bookPath, "OEBPS");
|
||||
const coverPath = path.resolve(__dirname, "../covers", argv.book);
|
||||
const stagingPath = path.resolve(argv.staging, argv.book);
|
||||
const contentPath = path.resolve(stagingPath, "OEBPS");
|
||||
const chaptersPath = path.resolve(contentPath, "chapters");
|
||||
|
||||
const commands = [];
|
||||
|
||||
if (argv._.length === 0) {
|
||||
argv._ = ["download", "convert", "scaffold", "zip"];
|
||||
}
|
||||
|
||||
if (argv._.includes("download")) {
|
||||
commands.push(() => download(argv.startUrl, cachePath, manifestPath));
|
||||
const { startURL } = books[argv.book];
|
||||
commands.push(() => download(startURL, cachePath, manifestPath));
|
||||
}
|
||||
|
||||
if (argv._.includes("convert")) {
|
||||
commands.push(() => {
|
||||
return rimraf(chaptersPath)
|
||||
.then(() => mkdirp(chaptersPath))
|
||||
.then(() => convert(cachePath, manifestPath, chaptersPath));
|
||||
return fs.rm(chaptersPath, { force: true, recursive: true, maxRetries: 3 })
|
||||
.then(() => fs.mkdir(chaptersPath, { recursive: true }))
|
||||
.then(() => convert(cachePath, manifestPath, chaptersPath, argv.book, argv.jobs));
|
||||
});
|
||||
}
|
||||
|
||||
if (argv._.includes("scaffold")) {
|
||||
commands.push(() => scaffold(scaffoldingPath, bookPath, contentPath, chaptersPath, manifestPath));
|
||||
const bookInfo = books[argv.book];
|
||||
commands.push(() => scaffold(
|
||||
scaffoldingPath,
|
||||
coverPath,
|
||||
stagingPath,
|
||||
contentPath,
|
||||
chaptersPath,
|
||||
manifestPath,
|
||||
bookInfo
|
||||
));
|
||||
}
|
||||
|
||||
if (argv._.includes("zip")) {
|
||||
commands.push(() => zip(bookPath, contentPath, path.resolve(argv.out)));
|
||||
commands.push(() => zip(stagingPath, contentPath, path.resolve(outputFilename)));
|
||||
}
|
||||
|
||||
(async () => {
|
||||
|
|
@ -89,8 +115,6 @@ if (argv._.includes("zip")) {
|
|||
for (const command of commands) {
|
||||
await command();
|
||||
}
|
||||
|
||||
console.log("All done!");
|
||||
} catch (e) {
|
||||
console.error(e.stack);
|
||||
process.exit(1);
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ module.exports = (bookPath, contentPath, outPath) => {
|
|||
archive.pipe(destStream);
|
||||
|
||||
// Order matters; mimetype must be first for a valid EPUB
|
||||
archive.file(path.resolve(bookPath, "mimetype"), { name: "mimetype" });
|
||||
archive.file(path.resolve(bookPath, "mimetype"), { name: "mimetype", store: true });
|
||||
archive.directory(contentPath, "OEBPS", { name: "OEBPS" });
|
||||
archive.directory(path.resolve(bookPath, "META-INF"), "META-INF", { name: "META-INF" });
|
||||
|
||||
|
|
|
|||
4159
npm-shrinkwrap.json
generated
4159
npm-shrinkwrap.json
generated
File diff suppressed because it is too large
Load diff
27
package.json
27
package.json
|
|
@ -1,18 +1,20 @@
|
|||
{
|
||||
"name": "worm-scraper",
|
||||
"description": "Scrapes the web serial Worm into an eBook format",
|
||||
"description": "Scrapes the web serial Worm and its sequel Ward into an eBook format",
|
||||
"keywords": [
|
||||
"ebook",
|
||||
"worm",
|
||||
"ward",
|
||||
"parahuman",
|
||||
"scraper"
|
||||
],
|
||||
"version": "2.3.0",
|
||||
"version": "5.1.0",
|
||||
"author": "Domenic Denicola <d@domenic.me> (https://domenic.me/)",
|
||||
"license": "WTFPL",
|
||||
"repository": "domenic/worm-scraper",
|
||||
"bin": "lib/worm-scraper.js",
|
||||
"files": [
|
||||
"covers/",
|
||||
"lib/",
|
||||
"scaffolding/",
|
||||
"npm-shrinkwrap.json"
|
||||
|
|
@ -21,18 +23,19 @@
|
|||
"lint": "eslint lib"
|
||||
},
|
||||
"dependencies": {
|
||||
"archiver": "^3.1.1",
|
||||
"archiver": "^5.3.1",
|
||||
"cli-progress": "^3.11.1",
|
||||
"cpr": "^3.0.1",
|
||||
"jsdom": "^15.1.1",
|
||||
"mkdirp-then": "^1.0.1",
|
||||
"requisition": "^1.5.0",
|
||||
"rimraf-then": "^1.0.0",
|
||||
"thenify": "^3.3.0",
|
||||
"throat": "^5.0.0",
|
||||
"xmlserializer": "^0.6.1",
|
||||
"yargs": "^13.3.0"
|
||||
"jsdom": "^19.0.0",
|
||||
"minipass-fetch": "^2.1.0",
|
||||
"workerpool": "^6.2.1",
|
||||
"yargs": "^17.5.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"eslint": "6.1.0"
|
||||
"@domenic/eslint-config": "^2.0.0",
|
||||
"eslint": "^8.16.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=16.13.2"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Binary file not shown.
|
Before Width: | Height: | Size: 332 KiB |
Loading…
Add table
Add a link
Reference in a new issue