diff --git a/lib/common.js b/lib/common.js new file mode 100644 index 0000000..e69a4e1 --- /dev/null +++ b/lib/common.js @@ -0,0 +1,5 @@ +"use strict"; + +exports.getChapterTitle = function (rawChapterDoc) { + return rawChapterDoc.querySelector("h1.entry-title").textContent; +}; diff --git a/lib/convert.js b/lib/convert.js index b021e13..7a47b53 100644 --- a/lib/convert.js +++ b/lib/convert.js @@ -2,8 +2,9 @@ const path = require("path"); const fs = require("mz/fs"); const throat = require("throat"); -const jsdom = require("./jsdom.js"); const serializeToXml = require("xmlserializer").serializeToString; +const jsdom = require("./jsdom.js"); +const getChapterTitle = require("./common.js").getChapterTitle; module.exports = function (cachePath, contentPath) { return getChapterFilePaths(cachePath) @@ -50,9 +51,8 @@ function convertChapter(filePath, contentPath) { } function getChapterString(rawChapterDoc) { - const headingEl = rawChapterDoc.querySelector("h1.entry-title"); - const title = headingEl.textContent; - const body = getBodyXml(headingEl, rawChapterDoc.querySelector(".entry-content")); + const title = getChapterTitle(rawChapterDoc); + const body = getBodyXml(title, rawChapterDoc.querySelector(".entry-content")); return ` @@ -65,7 +65,7 @@ ${body} `; } -function getBodyXml(headingEl, contentEl) { +function getBodyXml(title, contentEl) { // Remove initial Next Chapter and Previous Chapter

contentEl.removeChild(contentEl.firstElementChild); @@ -108,7 +108,7 @@ function getBodyXml(headingEl, contentEl) { // Synthesize a tag to serialize const bodyEl = contentEl.ownerDocument.createElement("body"); const h1El = contentEl.ownerDocument.createElement("h1"); - h1El.textContent = headingEl.textContent; + h1El.textContent = title; bodyEl.appendChild(h1El); while (contentEl.firstChild) { diff --git a/lib/download.js b/lib/download.js index 3664092..d0319e4 100644 --- a/lib/download.js +++ b/lib/download.js @@ -5,24 +5,20 @@ const mkdirp = require("mkdirp-then"); const request = require("requisition"); const zfill = require("zfill"); const jsdom = require("./jsdom.js"); +const getChapterTitle = require("./common.js").getChapterTitle; const FILENAME_PREFIX = "chapter"; -const URL_SUFFIX = "-url.txt"; -module.exports = function (startChapterUrl, cachePath) { - return fs.readdir(cachePath).then( - function (filenames) { - const sorted = filenames.filter(function (f) { return f.endsWith(URL_SUFFIX); }).sort(); - const lastChapterUrlFile = sorted[sorted.length - 1]; - const lastChapterNumber = getChapterNumber(lastChapterUrlFile); +module.exports = function (startChapterUrl, cachePath, manifestPath) { + return fs.readFile(manifestPath, { encoding: "utf-8" }).then( + function (manifestContents) { + const manifest = JSON.parse(manifestContents); - return fs.readFile(path.resolve(cachePath, lastChapterUrlFile), { encoding: "utf-8" }).then(function (url) { - return downloadAllChapters(url, lastChapterNumber, cachePath); - }); + return downloadAllChapters(manifest, startChapterUrl, cachePath, manifestPath); }, function (e) { if (e.code === "ENOENT") { - return downloadAllChapters(startChapterUrl, 1, cachePath); + return downloadAllChapters(null, startChapterUrl, cachePath, manifestPath); } else { throw e; } @@ -30,21 +26,27 @@ module.exports = function (startChapterUrl, cachePath) { ); }; -function downloadAllChapters(startChapterUrl, startChapterNumber, cachePath) { +function downloadAllChapters(manifest, startChapterUrl, cachePath, manifestPath) { let currentChapter = startChapterUrl; - let chapterCounter = startChapterNumber; + let chapterIndex = 0; + if (manifest !== null) { + currentChapter = manifest[manifest.length - 1].url; + chapterIndex = manifest.length - 1; + + // We're going to re-add it to the manifest later, possibly with an updated title. + manifest.pop(); + } else { + manifest = []; + } return mkdirp(cachePath).then(loop); function loop() { - const filename = `${FILENAME_PREFIX}${zfill(chapterCounter, 3)}.html`; + const filename = `${FILENAME_PREFIX}${zfill(chapterIndex, 3)}.html`; console.log(`Downloading ${currentChapter}`); - // Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/ - const escapedUrl = encodeURI(currentChapter); - - return retry(3, function () { return request(escapedUrl).redirects(10); }).then(function (response) { + return downloadChapter(currentChapter).then(function (response) { console.log("- Response received"); return response.text(); }) @@ -54,27 +56,42 @@ function downloadAllChapters(startChapterUrl, startChapterNumber, cachePath) { console.log("- Response body parsed into DOM"); const chapterUrlToSave = currentChapter; + const chapterTitle = getChapterTitle(rawChapterDoc); currentChapter = getNextChapterUrl(rawChapterDoc); - console.log("- Got next chapter URL"); // TODO: this should probably not be necessary... jsdom bug I guess!? rawChapterDoc.defaultView.close(); - return [ - fs.writeFile(path.resolve(cachePath, filename), contents), - fs.writeFile(path.resolve(cachePath, urlFilename(filename)), chapterUrlToSave) - ]; + manifest.push({ + url: chapterUrlToSave, + title: chapterTitle, + filename: filename + }); + + fs.writeFile(path.resolve(cachePath, filename), contents); }) .then(function () { console.log("- Response text saved to cache file"); + // Incrementally update the manifest after every successful download, instead of waiting until the end. + return writeManifest(); + }) + .then(function () { + console.log("- Manifest updated"); + if (currentChapter === null) { return; } - ++chapterCounter; + ++chapterIndex; + return loop(); }); } + + function writeManifest() { + const contents = JSON.stringify(manifest, undefined, 2); + return fs.writeFile(manifestPath, contents); + } } function getNextChapterUrl(rawChapterDoc) { @@ -103,10 +120,11 @@ function retry(times, fn) { }); } -function urlFilename(filename) { - return `${filename}${URL_SUFFIX}`; -} +function downloadChapter(url) { + // Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/ + const escapedUrl = encodeURI(url); -function getChapterNumber(filename) { - return Number(/0?0?(\d+)\./.exec(filename)[1]); + return retry(3, function () { + return request(escapedUrl).redirects(10); + }); } diff --git a/lib/worm-scraper.js b/lib/worm-scraper.js index ebed3b3..ae39f31 100644 --- a/lib/worm-scraper.js +++ b/lib/worm-scraper.js @@ -15,12 +15,13 @@ const cachePath = path.resolve("cache"); const outPath = path.resolve("out"); const contentPath = path.resolve(outPath, "OEBPS"); const chaptersPath = path.resolve(contentPath, "chapters"); +const manifestPath = path.resolve(cachePath, "manifest.json"); Promise.resolve() .then(function () { -// return download(START_CHAPTER_URL, cachePath); + return download(START_CHAPTER_URL, cachePath, manifestPath); }) - .then(function () { +/* .then(function () { return rimraf(chaptersPath); }) .then(function () { @@ -31,7 +32,7 @@ Promise.resolve() }) .then(function () { return extras(contentPath, chaptersPath); - }) + })*/ .then(function () { console.log("All done!"); });