"use strict"; const workerpool = require("workerpool"); const fs = require("fs"); const { JSDOM } = require("jsdom"); const substitutions = require("./substitutions.json"); workerpool.worker({ convertChapter }); function convertChapter(chapter, book, inputPath, outputPath) { const contents = fs.readFileSync(inputPath, { encoding: "utf-8" }); const rawChapterJSDOM = new JSDOM(contents); const { output, warnings } = getChapterString(chapter, book, rawChapterJSDOM.window.document); // TODO: this should probably not be necessary... jsdom bug I guess!? rawChapterJSDOM.window.close(); fs.writeFileSync(outputPath, output); return warnings; } function getChapterString(chapter, book, rawChapterDoc) { const { xml, warnings } = getBodyXML(chapter, book, rawChapterDoc.querySelector(".entry-content")); const output = `
contentEl.firstElementChild.remove(); // Remove everything after the last
// <p> (e.g. analytics <div>s or Last Chapter/Next Chapter <p>s)
while (isEmptyOrGarbage(contentEl.lastElementChild)) {
contentEl.lastElementChild.remove();
}
// Remove redundant attributes and style
for (const child of contentEl.children) {
if (child.getAttribute("dir") === "ltr") {
child.removeAttribute("dir");
}
// Only ever appears with align="LEFT" (useless) or align="CENTER" overridden by style="text-align: left;" (also
// useless)
child.removeAttribute("align");
const style = child.getAttribute("style");
if (style === "text-align:left;" || style === "text-align: left;") {
child.removeAttribute("style");
}
// Worm uses 30px; Ward mostly uses 40px but sometimes uses 30px/60px. Let's standardize on 30px.
if (style === "text-align:left;padding-left:30px;" ||
style === "text-align: left;padding-left: 40px;" ||
style === "text-align: left; padding-left: 40px;" ||
style === "padding-left: 40px;") {
child.setAttribute("style", "padding-left: 30px;");
}
}
// Remove empty inline elements.
// Remove style attributes from inline elements, as they're always messed up.
for (const el of contentEl.querySelectorAll("em, i, strong, b")) {
const { textContent } = el;
if (textContent === "") {
el.remove();
} else if (textContent.trim() === "") {
if (el.childElementCount === 0) {
el.replaceWith(" ");
} else if (el.childElementCount === 1 && el.children[0].localName === "br") {
el.outerHTML = "
\n";
}
} else {
el.removeAttribute("style");
}
}
// In https://parahumans.wordpress.com/2013/01/05/monarch-16-13/ there are some <address>s O_o
for (const address of contentEl.querySelectorAll("address")) {
const p = contentEl.ownerDocument.createElement("p");
p.innerHTML = address.innerHTML;
address.replaceWith(p);
}
// Every <span> except underline ones is pointless at best and frequently messed up. (Weird font size, line spacing,
// etc.)
for (const span of contentEl.querySelectorAll("span")) {
if (span.getAttribute("style") === "text-decoration:underline;") {
continue;
}
if (span.textContent.trim() === "") {
span.remove();
} else {
const docFrag = contentEl.ownerDocument.createDocumentFragment();
while (span.firstChild) {
docFrag.appendChild(span.firstChild);
}
span.replaceWith(docFrag);
}
}
// In Ward, CloudFlare email protection obfuscates the email addresses:
// https://usamaejaz.com/cloudflare-email-decoding/
for (const emailEl of contentEl.querySelectorAll("[data-cfemail]")) {
const decoded = decodeCloudFlareEmail(emailEl.dataset.cfemail);
emailEl.replaceWith(contentEl.ownerDocument.createTextNode(decoded));
}
// Synthesize a <body> tag to serialize
const bodyEl = contentEl.ownerDocument.createElement("body");
const h1El = contentEl.ownerDocument.createElement("h1");
h1El.textContent = chapter.title;
bodyEl.appendChild(h1El);
while (contentEl.firstChild) {
bodyEl.appendChild(contentEl.firstChild);
}
const xmlSerializer = new contentEl.ownerDocument.defaultView.XMLSerializer();
let xml = xmlSerializer.serializeToString(bodyEl);
// Fix recurring strange pattern of extra ......
in
\n
\s*<\/em><\/p>/ug, "
([^>]+)<\/em>(!|\?|\.)<\/p>/ug, " $1$2
]+)> /ug, " ");
xml = xml.replace(/<\/em> <\/p>/ug, " ”/ug, " “");
xml = xml.replace(/“\s*<\/p>/ug, "”
“\s+/ug, "
“"); xml = xml.replace(/\s+”/ug, "”"); xml = xml.replace(/'/ug, "’"); xml = xml.replace(/’([A-Za-z]+)’/ug, "‘$1’"); xml = xml.replace(/([a-z])”<\/p>/ug, "$1.”
"); fixEms(); xml = xml.replace(/‘([^<]+)<\/em>‘/ug, "‘$1’"); xml = xml.replace(/([a-z]+)!<\/em>/ug, "$1!"); xml = xml.replace(/(?([\w ’]+)([!.?])”<\/em>/ug, "$1$2”"); xml = xml.replace(/([\w ’]+[!.?])”<\/em>/ug, "$1”"); xml = xml.replace(/I”(m|ll)/ug, "I’$1"); xml = xml.replace(/””<\/p>/ug, "”"); xml = xml.replace(/^([^“]+?) ?”(?![ —<])/ugm, "$1 “"); xml = xml.replace(/(?([A-Za-z]+),<\/em>(?!”| +[A-Za-z]+ thought)/u, "$1,"); xml = xml.replace(/‘([Kk])ay(?!’)/ug, "’$1ay"); xml = xml.replace(/(Why|What|Who|How|Where|When)<\/em>\?/ug, "$1?"); xml = xml.replace(/,<\/em>/ug, ","); xml = xml.replace(/,”<\/p>/ug, ".”"); xml = xml.replace(/(.*),<\/p>/ug, "
$1.
"); xml = xml.replace(/‘(\w+)‘(\w+)’/ug, "‘$1’$2’"); xml = xml.replace(/([a-z]+), ([a-z]+)<\/em>/ug, "$1, $2"); // Similar problems occur in Ward with and as do in Worm with s xml = xml.replace(//ug, ""); xml = xml.replace(/(\s*
\s*/ug, "
"); // This is another quote fix but it needs to happen after the line break deletion... so entangled, ugh. xml = xml.replace(/<\/em>\s*“\s*<\/p>/ug, "
”"); // Fix missing spaces after commas xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/ug, "$1, $2"); // Fix bad periods and spacing/markup surrounding them xml = xml.replace(/\.\.<\/p>/ug, "."); xml = xml.replace(/\.\.”<\/p>/ug, ".”"); xml = xml.replace(/ \. /ug, ". "); xml = xml.replace(/ \.<\/p>/ug, "."); xml = xml.replace(/\.\.\./ug, "…"); xml = xml.replace(/\.\. {2}/ug, ". "); xml = xml.replace(/\.\./ug, "…"); xml = xml.replace(/(?/ug, ""); xml = xml.replace(/([a-z]) ,/ug, "$1,"); // Use actual emojis instead of images xml = xml.replace( // eslint-disable-next-line max-len /-/ug, "
—"); xml = xml.replace(/-<\/p>/ug, "—
"); xml = xml.replace(/-]*)>■<\/p>/ug, `
■
`); xml = xml.replace( /⊙<\/strong><\/p>/ug,
` ⊙ ⊙<\/strong><\/em><\/p>/ug,
` ⊙ ⊙⊙<\/strong><\/p>/ug,
` ⊙ ⊙ *⊙ *⊙ *⊙ *⊙<\/strong><\/p>/ug,
` ⊙ ⊙ ⊙ ⊙ ⊙