worm-scraper/lib/download.js
2021-01-17 17:04:37 -05:00

128 lines
4 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"use strict";
const path = require("path");
const fs = require("fs").promises;
const request = require("requisition");
const { JSDOM } = require("jsdom");
const FILENAME_PREFIX = "chapter";
module.exports = async (startChapterURL, cachePath, manifestPath) => {
let manifestContents;
try {
manifestContents = await fs.readFile(manifestPath, { encoding: "utf-8" });
} catch (e) {
if (e.code === "ENOENT") {
return downloadAllChapters(null, startChapterURL, cachePath, manifestPath);
}
throw e;
}
const manifest = JSON.parse(manifestContents);
return downloadAllChapters(manifest, startChapterURL, cachePath, manifestPath);
};
// Downloads chapters one after another, following each page's "Next" link until a page has none.
//
// - manifest: array of { url, title, filename } entries from a previous run, or null when there is none. The array is
//   updated in place as chapters are downloaded.
// - startChapterURL: URL of the first chapter, used when there is nothing to resume from.
// - cachePath: directory (created if missing) that receives one HTML file per chapter.
// - manifestPath: file that is rewritten with the JSON manifest after every downloaded chapter.
//
// Rejects if a download, a DOM lookup, or a file write fails.
async function downloadAllChapters(manifest, startChapterURL, cachePath, manifestPath) {
  let currentChapter = startChapterURL;
  let chapterIndex = 0;

  if (manifest === null) {
    manifest = [];
  } else if (manifest.length > 0) {
    // Resume from the last recorded chapter. It is removed here and re-added below after being downloaded again,
    // possibly with an updated title. An empty manifest has nothing to resume from, so it starts from the beginning.
    const lastEntry = manifest.pop();
    currentChapter = lastEntry.url;
    chapterIndex = manifest.length;
  }

  await fs.mkdir(cachePath, { recursive: true });

  while (currentChapter !== null) {
    const filename = `${FILENAME_PREFIX}${chapterIndex.toString().padStart(3, "0")}.html`;

    process.stdout.write(`Downloading ${currentChapter}... `);

    const { contents, dom, url } = await downloadChapter(currentChapter);

    let title;
    try {
      title = getChapterTitle(dom.window.document);
      currentChapter = getNextChapterURL(dom.window.document);
    } finally {
      // Always release the jsdom window, even when the page lacks the expected elements and a lookup throws.
      dom.window.close();
    }

    manifest.push({ url, title, filename });
    await fs.writeFile(path.resolve(cachePath, filename), contents);

    // Incrementally update the manifest after every successful download, instead of waiting until the end.
    const newManifestContents = JSON.stringify(manifest, undefined, 2);
    await fs.writeFile(manifestPath, newManifestContents);
    process.stdout.write("done\n");

    ++chapterIndex;
  }
}
// Returns the href of the link to the following chapter, or null when the page has no such link.
function getNextChapterURL(rawChapterDoc) {
  // `a[title="Next Chapter"]` doesn't always work. Two different pathologies:
  // - https://parahumans.wordpress.com/2011/09/27/shell-4-2/
  // - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
  // So instead pick the first <a> inside the main content area whose trimmed text starts with "Next".
  const contentLinks = Array.from(rawChapterDoc.querySelectorAll(".entry-content a"));
  const nextLink = contentLinks.find(link => link.textContent.trim().startsWith("Next"));

  return nextLink === undefined ? null : nextLink.href;
}
// Returns the text of the page's `h1.entry-title` element with every non-breaking space (U+00A0) turned into a
// regular space. Throws a TypeError if the document has no such element.
function getChapterTitle(rawChapterDoc) {
  // Non-breaking spaces are normalized because they are present in Ward but not in Worm, which is inconsistent. (And
  // leaving them in causes slight issues down the line where we remove spaces around em dashes during conversion.) In
  // the future it might be nice to have proper chapter titles, e.g. sections per arc with title pages and then just
  // "1" or similar for the chapter. Until then this is reasonable and uniform.
  //
  // The character is written as an explicit escape so it cannot be confused with (or silently converted to) an
  // ordinary space, and the `g` flag makes sure every occurrence is normalized, not just the first.
  return rawChapterDoc.querySelector("h1.entry-title").textContent.replace(/\u00A0/g, " ");
}
// Calls `fn` (which must return a promise) and, if that promise rejects, tries again up to `times` more times. The
// result of the first successful attempt is passed through; once no retries remain, the last attempt's promise is
// returned as-is, so its rejection reaches the caller. Rejections from earlier attempts are discarded.
function retry(times, fn) {
  const attempt = fn();

  return times === 0 ? attempt : attempt.catch(() => retry(times - 1, fn));
}
// Extracts the target URL from the `content` value of a <meta http-equiv=refresh> element. Returns null when the
// value names no URL (e.g. a plain "30", which only asks for a reload of the same page).
function getMetaRefreshTarget(content) {
  // Whitespace around the ";" and "=" is tolerated, e.g. "0; url=https://example.com/".
  const match = /\d+\s*;\s*url\s*=\s*(.*)/i.exec(content);
  if (match === null) {
    return null;
  }

  const target = match[1].trim();
  return target === "" ? null : target;
}

// Downloads the page at `startingURL`, following <meta http-equiv=refresh> redirects until a page without one (or
// with one that names no URL) is reached.
//
// Resolves to { url, contents, dom }: the URL of the final page, its HTML text, and a JSDOM instance for it. The
// caller is responsible for calling `dom.window.close()`; the DOMs of intermediate redirect pages are closed here.
// Rejects if any download fails.
async function downloadChapter(startingURL) {
  let urlToFollow = startingURL;

  let url, contents, dom;
  while (urlToFollow !== null) {
    const response = await downloadWithRetry(urlToFollow);

    url = urlToFollow;
    contents = await response.text();
    dom = new JSDOM(contents, { url });

    const refreshMeta = dom.window.document.querySelector("meta[http-equiv=refresh]");
    // A refresh element whose content cannot be parsed into a target is treated as "no redirect" instead of
    // crashing on a failed regular-expression match.
    urlToFollow = refreshMeta ? getMetaRefreshTarget(refreshMeta.content) : null;

    if (urlToFollow !== null) {
      process.stdout.write(`\n  Redirected to ${urlToFollow}... `);
      dom.window.close();
    }
  }

  return { url, contents, dom };
}
// Requests `url` (following up to 10 HTTP redirects) and resolves to the response. Any status other than 200 counts
// as a failure; a failed attempt is retried up to 3 more times before the last error is propagated.
function downloadWithRetry(url) {
  const attemptDownload = async () => {
    const response = await request(url).redirects(10);

    if (response.status === 200) {
      return response;
    }

    throw new Error(`Response status for ${url} was ${response.status}`);
  };

  return retry(3, attemptDownload);
}