worm-scraper/lib/download.js
2020-07-01 17:23:05 -04:00

117 lines
3.9 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"use strict";
const path = require("path");
const fs = require("fs").promises;
const request = require("requisition");
const { JSDOM } = require("jsdom");
// Prefix of every cached chapter file name; a zero-padded chapter index and ".html" are appended to it.
const FILENAME_PREFIX = "chapter";
module.exports = async (startChapterURL, cachePath, manifestPath) => {
let manifestContents;
try {
manifestContents = await fs.readFile(manifestPath, { encoding: "utf-8" });
} catch (e) {
if (e.code === "ENOENT") {
return downloadAllChapters(null, startChapterURL, cachePath, manifestPath);
}
throw e;
}
const manifest = JSON.parse(manifestContents);
return downloadAllChapters(manifest, startChapterURL, cachePath, manifestPath);
};
/**
 * Downloads chapters one after another by following each page's "next" link, saving every page into
 * `cachePath` and rewriting the manifest at `manifestPath` after each successful download.
 *
 * When a non-empty manifest is supplied, its last entry is removed and downloaded again (it is re-added with
 * a possibly updated title), and downloading continues from there. A `null` or empty manifest starts from
 * `startChapterURL`.
 *
 * @param {?Array<{url: string, title: string, filename: string}>} manifest - Previously saved manifest, or
 *   `null` when none exists. The array is modified in place.
 * @param {string} startChapterURL - URL of the first chapter, used when there is nothing to resume from.
 * @param {string} cachePath - Directory that receives the chapter HTML files; created if missing.
 * @param {string} manifestPath - Path of the JSON manifest file to (re)write.
 * @returns {Promise<void>} Resolves once a chapter without a "next" link has been saved.
 * @throws Propagates download, DOM-extraction, and file-system errors.
 */
async function downloadAllChapters(manifest, startChapterURL, cachePath, manifestPath) {
  let currentChapter = startChapterURL;
  let chapterIndex = 0;

  if (manifest === null) {
    manifest = [];
  } else if (manifest.length > 0) {
    // We're going to re-add the last entry to the manifest later, possibly with an updated title.
    const lastEntry = manifest.pop();
    currentChapter = lastEntry.url;
    chapterIndex = manifest.length;
  }
  // An existing but empty manifest falls through: there is no last entry to resume from, so start over.

  await fs.mkdir(cachePath, { recursive: true });

  while (currentChapter !== null) {
    const filename = `${FILENAME_PREFIX}${chapterIndex.toString().padStart(3, "0")}.html`;

    console.log(`Downloading ${currentChapter}`);
    const response = await downloadChapter(currentChapter);
    const contents = await response.text();
    console.log("- Response body received");

    const rawChapterJSDOM = new JSDOM(contents, { url: currentChapter });
    console.log("- Response body parsed into DOM");

    const chapterURLToSave = currentChapter;
    let chapterTitle;
    try {
      chapterTitle = getChapterTitle(rawChapterJSDOM.window.document);
      currentChapter = getNextChapterURL(rawChapterJSDOM.window.document);
    } finally {
      // Close the window even when extraction throws, so a malformed page does not leave it open.
      // TODO: this should probably not be necessary... jsdom bug I guess!?
      rawChapterJSDOM.window.close();
    }

    manifest.push({
      url: chapterURLToSave,
      title: chapterTitle,
      filename
    });

    await fs.writeFile(path.resolve(cachePath, filename), contents);
    console.log("- Response text saved to cache file");

    // Incrementally update the manifest after every successful download, instead of waiting until the end.
    const newManifestContents = JSON.stringify(manifest, undefined, 2);
    await fs.writeFile(manifestPath, newManifestContents);
    console.log("- Manifest updated");

    ++chapterIndex;
  }
}
/**
 * Finds the URL of the chapter that follows the given one.
 *
 * @param {Document} rawChapterDoc - Parsed chapter page.
 * @returns {?string} The `href` of the first link in the main content area whose trimmed text starts with
 *   "Next", or `null` when there is no such link.
 */
function getNextChapterURL(rawChapterDoc) {
  // Selecting `a[title="Next Chapter"]` is not reliable. Two pages that break it in different ways:
  // - https://parahumans.wordpress.com/2011/09/27/shell-4-2/
  // - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
  // Matching on the link text inside the content area works for both.
  const contentLinks = Array.from(rawChapterDoc.querySelectorAll(".entry-content a"));
  const nextLink = contentLinks.find(link => link.textContent.trim().startsWith("Next"));

  if (nextLink === undefined) {
    return null;
  }
  return nextLink.href;
}
/**
 * Extracts the chapter title from a parsed chapter page.
 *
 * Reads the text content of the first `h1.entry-title` element and replaces the first match of the separator
 * pattern below with a single plain space. The result of `querySelector` is dereferenced unconditionally, so
 * a page without that element makes this throw.
 *
 * @param {Document} rawChapterDoc - Parsed chapter page.
 * @returns {string} The title text after the replacement.
 */
function getChapterTitle(rawChapterDoc) {
// The separator is replaced because it's present in Ward but not in Worm, which is inconsistent. (And leaving it
// in causes slight issues down the line where we remove spaces around em dashes during conversion.) In the future
// it might be nice to have proper chapter titles, e.g. sections per arc with title pages and then just "1" or
// similar for the chapter. Until then this is reasonable and uniform.
//
// NOTE(review): this file is flagged as containing ambiguous Unicode characters, and the character(s) inside the
// regex literal cannot be told apart from an ordinary space when rendered. Confirm the exact code points and
// consider writing them as \u escapes. The pattern has no `g` flag, so only the first match is replaced.
return rawChapterDoc.querySelector("h1.entry-title").textContent.replace(/ /, " ");
}
/**
 * Calls `fn`, and calls it again each time the returned promise rejects, up to `times` additional attempts.
 *
 * @param {number} times - Maximum number of retries after the first attempt. Values that are not greater
 *   than zero (including negative numbers and NaN) mean "no retries".
 * @param {function(): Promise<*>} fn - Operation to attempt; must return a promise.
 * @returns {Promise<*>} Result of the first successful attempt, or the rejection of the final attempt.
 */
function retry(times, fn) {
  // `!(times > 0)` instead of `times === 0`: a negative, fractional or NaN count would otherwise never hit
  // the base case, and a persistently failing `fn` would be retried without bound.
  if (!(times > 0)) {
    return fn();
  }

  // Intermediate failures are intentionally dropped; only the last attempt's rejection reaches the caller.
  return fn().catch(() => {
    return retry(times - 1, fn);
  });
}
/**
 * Fetches a chapter page, following up to 10 redirects, and retries up to 3 times on failure.
 *
 * @param {string} url - Chapter URL to fetch.
 * @returns {Promise<Object>} The response object of the first attempt that came back with status 200.
 * @throws {Error} When the final attempt fails or returns a status other than 200.
 */
function downloadChapter(url) {
  const fetchOnce = async () => {
    const response = await request(url).redirects(10);

    if (response.status === 200) {
      return response;
    }

    // Any other status counts as a failed attempt so that retry() tries again.
    throw new Error(`Response status for ${url} was ${response.status}`);
  };

  return retry(3, fetchOnce);
}