From 127b6612c90bb6227ce97d48f68dba577ee930cf Mon Sep 17 00:00:00 2001
From: Domenic Denicola
Date: Wed, 6 May 2015 22:00:44 +0200
Subject: [PATCH] Re-architect

Separate out download code from the rest. Re-do the download code to
store URLs alongside HTML, and add support for resuming from the
last-seen URL.
---
 lib/download.js     | 131 ++++++++++++++++++++++++++++++++++++++++++++
 lib/worm-scraper.js | 108 ++++++---------------------------------
 2 files changed, 147 insertions(+), 92 deletions(-)
 create mode 100644 lib/download.js

diff --git a/lib/download.js b/lib/download.js
new file mode 100644
index 0000000..259d630
--- /dev/null
+++ b/lib/download.js
@@ -0,0 +1,131 @@
+"use strict";
+const path = require("path");
+const fs = require("mz/fs");
+const mkdirp = require("mkdirp-then");
+const request = require("requisition");
+const jsdom = require("jsdom");
+
+const FILENAME_PREFIX = "chapter";
+const URL_SUFFIX = "-url.txt";
+
+// Downloads every chapter into cachePath, resuming from the highest-numbered cached chapter URL if there is one.
+module.exports = function (startChapterUrl, cachePath) {
+  return fs.readdir(cachePath).then(
+    function (filenames) {
+      const lastChapter = filenames.filter(function (f) {
+        return f.endsWith(URL_SUFFIX);
+      })
+      .map(function (f) {
+        return [getChapterNumber(f), f];
+      })
+      .sort(function (a, b) {
+        return b[0] - a[0];
+      })
+      [0];
+
+      // The cache directory exists but holds no saved chapter URLs (e.g. it is empty), so there is nothing to resume.
+      if (lastChapter === undefined) {
+        return downloadAllChapters(startChapterUrl, 1, cachePath);
+      }
+
+      const lastChapterNumber = lastChapter[0];
+      const lastChapterUrlFile = lastChapter[1];
+
+      return fs.readFile(path.resolve(cachePath, lastChapterUrlFile), { encoding: "utf-8" }).then(function (url) {
+        return downloadAllChapters(url, lastChapterNumber, cachePath);
+      });
+    },
+    function (e) {
+      if (e.code === "ENOENT") {
+        return downloadAllChapters(startChapterUrl, 1, cachePath);
+      } else {
+        throw e;
+      }
+    }
+  );
+};
+
+// Follows "Next" links from startChapterUrl, saving each page's HTML and its URL, until no next link is found.
+function downloadAllChapters(startChapterUrl, startChapterNumber, cachePath) {
+  let currentChapter = startChapterUrl;
+  let chapterCounter = startChapterNumber;
+
+  return mkdirp(cachePath).then(loop);
+
+  function loop() {
+    const filename = `${FILENAME_PREFIX}${chapterCounter}.html`;
+
+    console.log(`Downloading ${currentChapter}`);
+
+    // Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/
+    const escapedUrl = encodeURI(currentChapter);
+
+    return retry(3, function () { return request(escapedUrl).redirects(10); }).then(function (response) {
+      console.log("- Response received");
+      return response.text();
+    })
+    .then(function (contents) {
+      console.log("- Response body received");
+      const rawChapterDoc = jsdom.jsdom(contents, { url: currentChapter });
+      console.log("- Response body parsed into DOM");
+
+      const chapterUrlToSave = currentChapter;
+      currentChapter = getNextChapterUrl(rawChapterDoc);
+      console.log("- Got next chapter URL");
+
+      // TODO: this should probably not be necessary... jsdom bug I guess!?
+      rawChapterDoc.defaultView.close();
+
+      // Wait for both writes; returning a bare array would let the chain continue before they finish.
+      return Promise.all([
+        fs.writeFile(path.resolve(cachePath, filename), contents),
+        fs.writeFile(path.resolve(cachePath, urlFilename(filename)), chapterUrlToSave)
+      ]);
+    })
+    .then(function () {
+      console.log("- Response text saved to cache file");
+      if (currentChapter === null) {
+        return;
+      }
+
+      ++chapterCounter;
+      return loop();
+    });
+  }
+}
+
+function getNextChapterUrl(rawChapterDoc) {
+  // a[title="Next Chapter"] doesn't always work. Two different pathologies:
+  // - https://parahumans.wordpress.com/2011/09/27/shell-4-2/
+  // - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
+  // So instead search for the first <a> within the main content area starting with "Next", trimmed.
+
+  const aEls = rawChapterDoc.querySelectorAll(".entry-content a");
+  for (let i = 0; i < aEls.length; ++i) {
+    if (aEls[i].textContent.trim().startsWith("Next")) {
+      return aEls[i].href;
+    }
+  }
+
+  return null;
+}
+
+// Calls fn, and on rejection calls it again, up to `times` additional attempts; the last rejection propagates.
+function retry(times, fn) {
+  if (times === 0) {
+    return fn();
+  }
+
+  return fn().catch(function () {
+    return retry(times - 1, fn);
+  });
+}
+
+function urlFilename(filename) {
+  return `${filename}${URL_SUFFIX}`;
+}
+
+// Extracts N from a cache filename of the form "chapterN.html" or "chapterN.html-url.txt".
+function getChapterNumber(filename) {
+  return Number(filename.substring(FILENAME_PREFIX.length, filename.indexOf(".html")));
+}
diff --git a/lib/worm-scraper.js b/lib/worm-scraper.js
index dba01a3..2f285e6 100644
--- a/lib/worm-scraper.js
+++ b/lib/worm-scraper.js
@@ -3,24 +3,26 @@ const path = require("path");
 const fs = require("mz/fs");
 const mkdirp = require("mkdirp-then");
 const rimraf = require("rimraf-then");
-const request = require("requisition");
 const jsdom = require("jsdom");
 
+const download = require("./download.js");
+
 require("./track-rejections.js");
 
-const START_CHAPTER = "https://parahumans.wordpress.com/2011/06/11/1-1/";
+const START_CHAPTER_URL = "https://parahumans.wordpress.com/2011/06/11/1-1/";
 
 const cachePath = path.resolve("cache");
 const outPath = path.resolve("out");
 const contentPath = path.resolve(outPath, "OEBPS");
 
-const rawChapterDocsCache = new Map();
-
 rimraf(outPath)
   .then(function () {
     return mkdirp(contentPath);
   })
-  .then(getAllRawChapterDocs)
+  .then(function () {
+    return download(START_CHAPTER_URL, cachePath);
+  })
+  .then(readAllRawChapterDocs)
   .then(function (rawChapterDocs) {
     console.log("Extracting content into EPUB chapter files");
     return Promise.all(rawChapterDocs.map(function (rawChapterDoc, i) {
@@ -33,72 +35,18 @@ rimraf(outPath)
     console.log("All done!");
   });
 
-function getAllRawChapterDocs() {
-  return fs.readdir(cachePath).then(
-    function (filenames) {
-      return filenames.map(getRawChapterDoc);
-    },
-    function (e) {
-      if (e.code === "ENOENT") {
-        return downloadAllChapters();
-      } else {
-        throw e;
-      }
-    }
-  );
-}
-
-function downloadAllChapters() {
-  let currentChapter = START_CHAPTER;
-  let chapterCounter = 1;
-
-  return mkdirp(cachePath).then(loop).then(function () {
-    return toArray(rawChapterDocsCache.values());
-  });
-
-  function loop() {
-    const filename = `chapter${chapterCounter}.html`;
-
-    console.log(`Downloading ${currentChapter}`);
-
-    // Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/
-    const escapedUrl = encodeURI(currentChapter);
-
-    return request(escapedUrl).redirects(10).then(function (response) {
-      return response.text();
-    })
-    .then(function (contents) {
-      const rawChapterDoc = setRawChapterDoc(filename, contents);
-      currentChapter = getNextChapterUrl(rawChapterDoc);
-
-      return fs.writeFile(path.resolve(cachePath, filename), jsdom.serializeDocument(rawChapterDoc));
-    })
-    .then(function () {
-      if (currentChapter === null) {
-        return;
-      }
-
-      ++chapterCounter;
-      return loop();
-    });
-  }
-}
-
-function getRawChapterDoc(filename) {
-  const doc = rawChapterDocsCache.get(filename);
-  if (doc !== undefined) {
-    return Promise.resolve(doc);
-  }
-
-  return fs.readFile(path.resolve(cachePath, filename), { encoding: "utf-8" }).then(function (contents) {
-    return setRawChapterDoc(filename, contents);
+function readAllRawChapterDocs() {
+  return fs.readdir(cachePath).then(function (filenames) {
+    const htmlFiles = filenames.filter(function (f) { return f.endsWith(".html"); });
+    return Promise.all(htmlFiles.map(readRawChapterDoc));
   });
 }
 
-function setRawChapterDoc(filename, contents) {
-  const doc = jsdom.jsdom(contents);
-  rawChapterDocsCache.set(filename, doc);
-  return doc;
+function readRawChapterDoc(filename) {
+  const filePath = path.resolve(cachePath, filename);
+  return fs.readFile(filePath, { encoding: "utf-8" }).then(function (contents) {
+    return jsdom.jsdom(contents);
+  });
 }
 
 function getChapterString(rawChapterDoc) {
@@ -134,27 +82,3 @@ function cleanContentEl(el) {
 
   return el;
 }
-
-function getNextChapterUrl(rawChapterDoc) {
-  // a[title="Next Chapter"] doesn't always work. Two different pathologies:
-  // - https://parahumans.wordpress.com/2011/09/27/shell-4-2/
-  // - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
-  // So instead search for the first <a> within the main content area starting with "Next", trimmed.
-
-  const aEls = rawChapterDoc.querySelectorAll(".entry-content a");
-  for (let i = 0; i < aEls.length; ++i) {
-    if (aEls[i].textContent.trim().startsWith("Next")) {
-      return aEls[i].href;
-    }
-  }
-
-  return null;
-}
-
-function toArray(iterable) {
-  const array = [];
-  for (const x of iterable) {
-    array.push(x);
-  }
-  return array;
-}