diff --git a/lib/download.js b/lib/download.js
new file mode 100644
index 0000000..259d630
--- /dev/null
+++ b/lib/download.js
@@ -0,0 +1,121 @@
+"use strict";
+const path = require("path");
+const fs = require("mz/fs");
+const mkdirp = require("mkdirp-then");
+const request = require("requisition");
+const jsdom = require("jsdom");
+
+// Cached chapter pages are saved as `${FILENAME_PREFIX}<number>.html`.
+const FILENAME_PREFIX = "chapter";
+// Appended to a cached page's filename to name the sibling file that records the URL the page came from.
+const URL_SUFFIX = "-url.txt";
+
+/**
+ * Downloads chapters into `cachePath`, resuming from the highest-numbered cached chapter when one exists.
+ *
+ * The resume point is the `*-url.txt` marker with the largest chapter number: the URL stored in it is
+ * fetched again under the same chapter number, and every later chapter follows from there.
+ *
+ * @param {string} startChapterUrl - URL of the first chapter, used when nothing usable is cached.
+ * @param {string} cachePath - Directory holding the cached chapter pages and their URL markers.
+ * @returns {Promise} Settles once the download loop has finished; rejects on any error other than a
+ *   missing cache directory.
+ */
+module.exports = function (startChapterUrl, cachePath) {
+  return fs.readdir(cachePath).then(
+    function (filenames) {
+      const urlFilesHighestFirst = filenames
+        .filter(function (f) {
+          return f.endsWith(URL_SUFFIX);
+        })
+        .map(function (f) {
+          return [getChapterNumber(f), f];
+        })
+        .sort(function (a, b) {
+          return b[0] - a[0];
+        });
+
+      // The directory can exist without any URL markers (created by hand, or a run that died before its
+      // first write). Indexing into the empty list used to throw a TypeError; start from scratch instead.
+      if (urlFilesHighestFirst.length === 0) {
+        return downloadAllChapters(startChapterUrl, 1, cachePath);
+      }
+
+      const lastChapterNumber = urlFilesHighestFirst[0][0];
+      const lastChapterUrlFile = urlFilesHighestFirst[0][1];
+
+      return fs.readFile(path.resolve(cachePath, lastChapterUrlFile), { encoding: "utf-8" }).then(function (url) {
+        return downloadAllChapters(url, lastChapterNumber, cachePath);
+      });
+    },
+    function (e) {
+      // No cache directory yet means nothing has been downloaded; anything else is a real failure.
+      if (e.code === "ENOENT") {
+        return downloadAllChapters(startChapterUrl, 1, cachePath);
+      }
+      throw e;
+    }
+  );
+};
+
+/**
+ * Downloads chapters one after another, following each page's "Next" link until a page has none.
+ *
+ * Each page is written into `cachePath` as `chapter<N>.html`, next to a `chapter<N>.html-url.txt` file
+ * holding the URL the page was fetched from.
+ *
+ * @param {string} startChapterUrl - URL of the first chapter to fetch.
+ * @param {number} startChapterNumber - Number used in the first chapter's filename; incremented per chapter.
+ * @param {string} cachePath - Directory to write into; created if it does not exist.
+ * @returns {Promise} Fulfills after the final chapter's files have been written.
+ */
+function downloadAllChapters(startChapterUrl, startChapterNumber, cachePath) {
+  let currentChapter = startChapterUrl;
+  let chapterCounter = startChapterNumber;
+
+  function loop() {
+    const filename = `${FILENAME_PREFIX}${chapterCounter}.html`;
+
+    console.log(`Downloading ${currentChapter}`);
+
+    // Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/
+    const escapedUrl = encodeURI(currentChapter);
+
+    return retry(3, function () { return request(escapedUrl).redirects(10); })
+      .then(function (response) {
+        console.log("- Response received");
+        return response.text();
+      })
+      .then(function (contents) {
+        console.log("- Response body received");
+        const rawChapterDoc = jsdom.jsdom(contents, { url: currentChapter });
+        console.log("- Response body parsed into DOM");
+
+        // Remember this page's URL before `currentChapter` is advanced to the next one.
+        const chapterUrlToSave = currentChapter;
+        currentChapter = getNextChapterUrl(rawChapterDoc);
+        console.log("- Got next chapter URL");
+
+        // TODO: this should probably not be necessary... jsdom bug I guess!?
+        rawChapterDoc.defaultView.close();
+
+        // Returning a bare array of promises from a `then` callback does not wait for them. Wrap the
+        // writes in Promise.all so the next step only runs once both files are on disk and any write
+        // error rejects this chain.
+        return Promise.all([
+          fs.writeFile(path.resolve(cachePath, filename), contents),
+          fs.writeFile(path.resolve(cachePath, urlFilename(filename)), chapterUrlToSave)
+        ]);
+      })
+      .then(function () {
+        console.log("- Response text saved to cache file");
+        // A page without a "Next" link is the last chapter.
+        if (currentChapter === null) {
+          return;
+        }
+
+        ++chapterCounter;
+        return loop();
+      });
+  }
+
+  return mkdirp(cachePath).then(loop);
+}
+
+/**
+ * Finds the URL of the chapter that follows the given one.
+ *
+ * a[title="Next Chapter"] doesn't always work. Two different pathologies:
+ * - https://parahumans.wordpress.com/2011/09/27/shell-4-2/
+ * - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
+ * So instead take the first link inside the main content area whose trimmed text starts with "Next".
+ *
+ * @param {Document} rawChapterDoc - Parsed chapter page.
+ * @returns {?string} The matching link's `href`, or `null` when no such link exists.
+ */
+function getNextChapterUrl(rawChapterDoc) {
+  const contentLinks = rawChapterDoc.querySelectorAll(".entry-content a");
+  const nextLink = Array.prototype.find.call(contentLinks, function (link) {
+    return link.textContent.trim().startsWith("Next");
+  });
+
+  return nextLink === undefined ? null : nextLink.href;
+}
+
+/**
+ * Calls `fn`, and calls it again each time the promise it returns rejects, up to `times` further attempts.
+ *
+ * @param {number} times - How many additional attempts remain after the current one.
+ * @param {function(): Promise} fn - Operation to attempt; invoked with no arguments.
+ * @returns {Promise} The first successful result, or the rejection from the final attempt.
+ */
+function retry(times, fn) {
+  const attempt = fn();
+
+  // Out of retries: hand back this attempt as-is so its failure reaches the caller.
+  if (times === 0) {
+    return attempt;
+  }
+
+  return attempt.catch(function () {
+    return retry(times - 1, fn);
+  });
+}
+
+/**
+ * Builds the name of the file that stores the source URL for a cached chapter page.
+ *
+ * @param {string} filename - Name of the cached chapter page file.
+ * @returns {string} `filename` with the URL-file suffix appended.
+ */
+function urlFilename(filename) {
+  return "".concat(filename, URL_SUFFIX);
+}
+
+/**
+ * Extracts the chapter number from a cache filename: the text between the filename prefix and the first
+ * ".html" occurrence, converted with `Number`.
+ *
+ * @param {string} filename - Cache filename such as a chapter page or its URL marker.
+ * @returns {number} The parsed chapter number (`NaN` if that text is not numeric).
+ */
+function getChapterNumber(filename) {
+  const numberStart = FILENAME_PREFIX.length;
+  const numberEnd = filename.indexOf(".html");
+  const numberText = filename.substring(numberStart, numberEnd);
+
+  return Number(numberText);
+}
diff --git a/lib/worm-scraper.js b/lib/worm-scraper.js
index dba01a3..2f285e6 100644
--- a/lib/worm-scraper.js
+++ b/lib/worm-scraper.js
@@ -3,24 +3,26 @@ const path = require("path");
const fs = require("mz/fs");
const mkdirp = require("mkdirp-then");
const rimraf = require("rimraf-then");
-const request = require("requisition");
const jsdom = require("jsdom");
+const download = require("./download.js");
+
require("./track-rejections.js");
-const START_CHAPTER = "https://parahumans.wordpress.com/2011/06/11/1-1/";
+const START_CHAPTER_URL = "https://parahumans.wordpress.com/2011/06/11/1-1/";
const cachePath = path.resolve("cache");
const outPath = path.resolve("out");
const contentPath = path.resolve(outPath, "OEBPS");
-const rawChapterDocsCache = new Map();
-
rimraf(outPath)
.then(function () {
return mkdirp(contentPath);
})
- .then(getAllRawChapterDocs)
+ .then(function () {
+ return download(START_CHAPTER_URL, cachePath);
+ })
+ .then(readAllRawChapterDocs)
.then(function (rawChapterDocs) {
console.log("Extracting content into EPUB chapter files");
return Promise.all(rawChapterDocs.map(function (rawChapterDoc, i) {
@@ -33,72 +35,18 @@ rimraf(outPath)
console.log("All done!");
});
-function getAllRawChapterDocs() {
- return fs.readdir(cachePath).then(
- function (filenames) {
- return filenames.map(getRawChapterDoc);
- },
- function (e) {
- if (e.code === "ENOENT") {
- return downloadAllChapters();
- } else {
- throw e;
- }
- }
- );
-}
-
-function downloadAllChapters() {
- let currentChapter = START_CHAPTER;
- let chapterCounter = 1;
-
- return mkdirp(cachePath).then(loop).then(function () {
- return toArray(rawChapterDocsCache.values());
- });
-
- function loop() {
- const filename = `chapter${chapterCounter}.html`;
-
- console.log(`Downloading ${currentChapter}`);
-
- // Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/
- const escapedUrl = encodeURI(currentChapter);
-
- return request(escapedUrl).redirects(10).then(function (response) {
- return response.text();
- })
- .then(function (contents) {
- const rawChapterDoc = setRawChapterDoc(filename, contents);
- currentChapter = getNextChapterUrl(rawChapterDoc);
-
- return fs.writeFile(path.resolve(cachePath, filename), jsdom.serializeDocument(rawChapterDoc));
- })
- .then(function () {
- if (currentChapter === null) {
- return;
- }
-
- ++chapterCounter;
- return loop();
- });
- }
-}
-
-function getRawChapterDoc(filename) {
- const doc = rawChapterDocsCache.get(filename);
- if (doc !== undefined) {
- return Promise.resolve(doc);
- }
-
- return fs.readFile(path.resolve(cachePath, filename), { encoding: "utf-8" }).then(function (contents) {
- return setRawChapterDoc(filename, contents);
+/**
+ * Reads every cached chapter page and parses it, in ascending chapter-number order.
+ *
+ * @returns {Promise<Array>} Parsed documents, one per `.html` file found in the cache directory.
+ */
+function readAllRawChapterDocs() {
+  // Pulls N out of a name ending in "N.html"; names without a number sort first.
+  function chapterNumberOf(filename) {
+    const match = /(\d+)\.html$/.exec(filename);
+    return match === null ? 0 : Number(match[1]);
+  }
+
+  return fs.readdir(cachePath).then(function (filenames) {
+    // readdir makes no ordering promise, and a plain name sort would put chapter10.html ahead of
+    // chapter2.html, so order the files by their numeric chapter number before reading them.
+    const htmlFiles = filenames
+      .filter(function (f) { return f.endsWith(".html"); })
+      .sort(function (a, b) { return chapterNumberOf(a) - chapterNumberOf(b); });
+    return Promise.all(htmlFiles.map(readRawChapterDoc));
});
}
-function setRawChapterDoc(filename, contents) {
- const doc = jsdom.jsdom(contents);
- rawChapterDocsCache.set(filename, doc);
- return doc;
+/**
+ * Reads one cached chapter page from the cache directory and parses it with jsdom.
+ *
+ * @param {string} filename - Name of the file inside the cache directory.
+ * @returns {Promise} Fulfills with the parsed document.
+ */
+function readRawChapterDoc(filename) {
+  const readOptions = { encoding: "utf-8" };
+  const pendingContents = fs.readFile(path.resolve(cachePath, filename), readOptions);
+
+  return pendingContents.then(function (html) {
+    return jsdom.jsdom(html);
+  });
}
function getChapterString(rawChapterDoc) {
@@ -134,27 +82,3 @@ function cleanContentEl(el) {
return el;
}
-
-function getNextChapterUrl(rawChapterDoc) {
- // a[title="Next Chapter"] doesn't always work. Two different pathologies:
- // - https://parahumans.wordpress.com/2011/09/27/shell-4-2/
- // - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
- // So instead search for the first within the main content area starting with "Next", trimmed.
-
- const aEls = rawChapterDoc.querySelectorAll(".entry-content a");
- for (let i = 0; i < aEls.length; ++i) {
- if (aEls[i].textContent.trim().startsWith("Next")) {
- return aEls[i].href;
- }
- }
-
- return null;
-}
-
-function toArray(iterable) {
- const array = [];
- for (const x of iterable) {
- array.push(x);
- }
- return array;
-}