"use strict";
const workerpool = require("workerpool");
const fs = require("fs");
const { JSDOM } = require("jsdom");
const substitutions = require("./substitutions.json");

workerpool.worker({ convertChapter });

/**
 * Converts one chapter file and writes the converted result to disk.
 *
 * Reads `inputPath` as UTF-8, parses it with JSDOM, hands the parsed document to `getChapterString`, writes the
 * returned `output` to `outputPath`, and returns the accompanying `warnings`.
 *
 * @param chapter - Chapter descriptor; forwarded unchanged to `getChapterString`.
 * @param book - Book descriptor; forwarded unchanged to `getChapterString`.
 * @param inputPath - Path of the file to read.
 * @param outputPath - Path of the file to write.
 * @returns The `warnings` value produced by `getChapterString`.
 */
function convertChapter(chapter, book, inputPath, outputPath) {
  const contents = fs.readFileSync(inputPath, { encoding: "utf-8" });
  const rawChapterJSDOM = new JSDOM(contents);

  let output;
  let warnings;
  try {
    ({ output, warnings } = getChapterString(chapter, book, rawChapterJSDOM.window.document));
  } finally {
    // TODO: this should probably not be necessary... jsdom bug I guess!?
    // Closing in `finally` means the window is also released when the conversion throws.
    rawChapterJSDOM.window.close();
  }

  fs.writeFileSync(outputPath, output);
  return warnings;
}

// NOTE(review): the comment line below is the opening of `getChapterString`, kept verbatim. In the collapsed
// one-line form of this file it already sat inside the `// TODO` line comment, and the rest of the definition is not
// intact in the visible source, so it is preserved as-is rather than reconstructed.
// function getChapterString(chapter, book, rawChapterDoc) { const { xml, warnings } = getBodyXML(chapter, book, rawChapterDoc.querySelector(".entry-content")); const output = `
contentEl.firstElementChild.remove(); // Remove everything after the last
// <p> (e.g. analytics <div>s or Last Chapter/Next Chapter <p>s)
while (isEmptyOrGarbage(contentEl.lastElementChild)) {
contentEl.lastElementChild.remove();
}
// Remove redundant attributes and style
for (const child of contentEl.children) {
if (child.getAttribute("dir") === "ltr") {
child.removeAttribute("dir");
}
// Only ever appears with align="LEFT" (useless) or align="CENTER" overridden by style="text-align: left;" (also
// useless)
child.removeAttribute("align");
const style = child.getAttribute("style");
if (style === "text-align:left;" || style === "text-align: left;") {
child.removeAttribute("style");
}
// Worm uses 30px; Ward mostly uses 40px but sometimes uses 30px/60px. Let's standardize on 30px.
if (style === "text-align:left;padding-left:30px;" ||
style === "text-align: left;padding-left: 40px;" ||
style === "padding-left: 40px;") {
child.setAttribute("style", "padding-left: 30px;");
}
}
// Remove empty <em>s and <i>s
// Remove style attributes from them, as they're always messed up.
for (const em of contentEl.querySelectorAll("em, i")) {
if (em.textContent.trim() === "") {
em.replaceWith(contentEl.ownerDocument.createTextNode(" "));
} else {
em.removeAttribute("style");
}
}
// In https://parahumans.wordpress.com/2013/01/05/monarch-16-13/ there are some <address>s that should be <p>s O_o
for (const address of contentEl.querySelectorAll("address")) {
const p = contentEl.ownerDocument.createElement("p");
p.innerHTML = address.innerHTML;
address.replaceWith(p);
}
// Every <span> except underline ones is pointless at best and frequently messed up. (Weird font size, line spacing,
// etc.)
for (const span of contentEl.querySelectorAll("span")) {
if (span.getAttribute("style") === "text-decoration:underline;") {
continue;
}
if (span.textContent.trim() === "") {
span.remove();
} else {
const docFrag = contentEl.ownerDocument.createDocumentFragment();
while (span.firstChild) {
docFrag.appendChild(span.firstChild);
}
span.replaceWith(docFrag);
}
}
// In Ward, CloudFlare email protection obfuscates the email addresses:
// https://usamaejaz.com/cloudflare-email-decoding/
for (const emailEl of contentEl.querySelectorAll("[data-cfemail]")) {
const decoded = decodeCloudFlareEmail(emailEl.dataset.cfemail);
emailEl.replaceWith(contentEl.ownerDocument.createTextNode(decoded));
}
// Synthesize a <body> tag to serialize
const bodyEl = contentEl.ownerDocument.createElement("body");
const h1El = contentEl.ownerDocument.createElement("h1");
h1El.textContent = chapter.title;
bodyEl.appendChild(h1El);
while (contentEl.firstChild) {
bodyEl.appendChild(contentEl.firstChild);
}
const xmlSerializer = new contentEl.ownerDocument.defaultView.XMLSerializer();
let xml = xmlSerializer.serializeToString(bodyEl);
// Fix recurring strange pattern of extra ......
in
\n
\s*<\/em><\/p>/g, "
([^>]+)<\/em>(!|\?|\.)<\/p>/g, " $1$2
]+)> /g, " ");
xml = xml.replace(/<\/em> <\/p>/g, " ”/g, " “");
xml = xml.replace(/“\s*<\/p>/g, "”
“\s+/g, "
“"); xml = xml.replace(/'/g, "’"); xml = xml.replace(/’([A-Za-z]+)’/g, "‘$1’"); xml = xml.replace(/([a-z])”<\/p>/g, "$1.”
"); fixEms(); xml = xml.replace(/‘([^<]+)<\/em>‘/g, "‘$1’"); xml = xml.replace(/([a-z]+)!<\/em>/g, "$1!"); xml = xml.replace(/(?([\w ’]+)([!.?])”<\/em>/g, "$1$2”"); xml = xml.replace(/([\w ’]+[!.?])”<\/em>/g, "$1”"); xml = xml.replace(/I”(m|ll)/g, "I’$1"); xml = xml.replace(/””<\/p>/g, "”"); xml = xml.replace(/^([^“]+?) ?”(?![ —<])/gm, "$1 “"); // Similar problems occur in Ward with and as do in Worm with s xml = xml.replace(//g, ""); xml = xml.replace(/(\s*-/g, "
—"); xml = xml.replace(/-<\/p>/g, "—
// NOTE(review): the next comment line preserves, verbatim, the tail of a definition that begins before this line and
// is not intact in the visible source. It does not parse on its own; restore it together with the rest of that
// definition.
// "); xml = xml.replace(/-<\/em><\/p>/g, "—"); xml = xml.replace(/\s?\s?–\s?\s?/g, "—"); xml = xml.replace(/-\s\s?/g, "—"); xml = xml.replace(/\s?\s-/g, "—"); xml = xml.replace(/\s+—”/g, "—”"); xml = xml.replace(/I-I/g, "I—I"); xml = xml.replace(/I-uh/g, "I—uh"); return xml; }

/**
 * Replaces the hyphen in a fixed list of joint names with an en dash.
 *
 * The rules are applied in the order listed, each against the result of the previous one, so overlapping names
 * such as "Tristan-Capricorn-Byron" end up with an en dash at every joint. Note that "East-West" is also
 * lowercased to "east–west".
 *
 * @param {string} xml - Text to process.
 * @returns {string} The text with the listed joint names rewritten; all other text is left as-is.
 */
function enDashJointNames(xml) {
  // Joint names should use en dashes
  const jointNameRules = [
    [/Dallon-Pelham/g, "Dallon–Pelham"],
    [/Bet-Gimel/g, "Bet–Gimel"],
    [/Tristan-Capricorn/g, "Tristan–Capricorn"],
    [/Capricorn-Byron/g, "Capricorn–Byron"],
    [/Tristan-Byron/g, "Tristan–Byron"],
    [/Gimel-Europe/g, "Gimel–Europe"],
    [/G-N/g, "G–N"],
    [/Imp-Damsel/g, "Imp–Damsel"],
    [/Damsel-Ashley/g, "Damsel–Ashley"],
    [/Antares-Anelace/g, "Antares–Anelace"],
    [/Challenger-Gallant/g, "Challenger–Gallant"],
    [/Undersider(s?)-(Breakthrough|Ambassador)/g, "Undersider$1–$2"],
    [/Norwalk-Fairfield/g, "Norwalk–Fairfield"],
    [/East-West/g, "east–west"],
    [/(Green|Yellow)-Black/g, "$1–Black"],
    [/Creutzfeldt-Jakob/g, "Creutzfeldt–Jakob"],
    [/Astaroth-Nidhug/g, "Astaroth–Nidhug"],
    [/Capulet-Montague/g, "Capulet–Montague"],
    [/Weaver-Clockblocker/g, "Weaver–Clockblocker"],
    [/Alexandria-Pretender/g, "Alexandria–Pretender"],
    [/Night Hag-Nyx/g, "Night Hag–Nyx"],
    [/Crawler-Breed/g, "Crawler–Breed"],
    [/Simurgh-Myrddin-plant/g, "Simurgh–Myrddin–plant"],
    [/Armsmaster-Defiant/g, "Armsmaster–Defiant"],
  ];

  return jointNameRules.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), xml);
}

// NOTE(review): the comment line below is the opening of `fixPossessives`, kept verbatim. In the collapsed one-line
// form of this file it already sat inside a line comment, and the rest of the definition is not intact in the
// visible source, so it is preserved as-is rather than reconstructed.
// function fixPossessives(xml) { // Fix possessive of names ending in "s" // Note: if the "s" is unvoiced, as in Marquis, then it doesn't get the second "s". xml = xml.replace( // eslint-disable-next-line max-len /(? 
would be more semantically appropriate, but loses the author's intent. This is // especially the case in Ward, which uses a variety of different scene breaks. xml = xml.replace(/]*)>■<\/p>/g, `
■
`); xml = xml.replace(/⊙<\/strong><\/p>/g, ` ⊙ ⊙<\/strong><\/em><\/p>/g,
` ⊙ ⊙⊙<\/strong><\/p>/g,
` ⊙ ⊙ *⊙ *⊙ *⊙ *⊙<\/strong><\/p>/g,
` ⊙ ⊙ ⊙ ⊙ ⊙