While downloading, save a cache manifest alongside the cached chapter files

This allows us to keep track of the chapter title after the fact.
This commit is contained in:
Domenic Denicola 2015-05-11 20:49:09 -04:00
commit b68b88e17e
4 changed files with 62 additions and 38 deletions

5
lib/common.js Normal file
View file

@ -0,0 +1,5 @@
"use strict";
exports.getChapterTitle = function (rawChapterDoc) {
return rawChapterDoc.querySelector("h1.entry-title").textContent;
};

View file

@ -2,8 +2,9 @@
const path = require("path");
const fs = require("mz/fs");
const throat = require("throat");
const jsdom = require("./jsdom.js");
const serializeToXml = require("xmlserializer").serializeToString;
const jsdom = require("./jsdom.js");
const getChapterTitle = require("./common.js").getChapterTitle;
module.exports = function (cachePath, contentPath) {
return getChapterFilePaths(cachePath)
@ -50,9 +51,8 @@ function convertChapter(filePath, contentPath) {
}
function getChapterString(rawChapterDoc) {
const headingEl = rawChapterDoc.querySelector("h1.entry-title");
const title = headingEl.textContent;
const body = getBodyXml(headingEl, rawChapterDoc.querySelector(".entry-content"));
const title = getChapterTitle(rawChapterDoc);
const body = getBodyXml(title, rawChapterDoc.querySelector(".entry-content"));
return `<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
@ -65,7 +65,7 @@ ${body}
</html>`;
}
function getBodyXml(headingEl, contentEl) {
function getBodyXml(title, contentEl) {
// Remove initial Next Chapter and Previous Chapter <p>
contentEl.removeChild(contentEl.firstElementChild);
@ -108,7 +108,7 @@ function getBodyXml(headingEl, contentEl) {
// Synthesize a <body> tag to serialize
const bodyEl = contentEl.ownerDocument.createElement("body");
const h1El = contentEl.ownerDocument.createElement("h1");
h1El.textContent = headingEl.textContent;
h1El.textContent = title;
bodyEl.appendChild(h1El);
while (contentEl.firstChild) {

View file

@ -5,24 +5,20 @@ const mkdirp = require("mkdirp-then");
const request = require("requisition");
const zfill = require("zfill");
const jsdom = require("./jsdom.js");
const getChapterTitle = require("./common.js").getChapterTitle;
const FILENAME_PREFIX = "chapter";
const URL_SUFFIX = "-url.txt";
module.exports = function (startChapterUrl, cachePath) {
return fs.readdir(cachePath).then(
function (filenames) {
const sorted = filenames.filter(function (f) { return f.endsWith(URL_SUFFIX); }).sort();
const lastChapterUrlFile = sorted[sorted.length - 1];
const lastChapterNumber = getChapterNumber(lastChapterUrlFile);
module.exports = function (startChapterUrl, cachePath, manifestPath) {
return fs.readFile(manifestPath, { encoding: "utf-8" }).then(
function (manifestContents) {
const manifest = JSON.parse(manifestContents);
return fs.readFile(path.resolve(cachePath, lastChapterUrlFile), { encoding: "utf-8" }).then(function (url) {
return downloadAllChapters(url, lastChapterNumber, cachePath);
});
return downloadAllChapters(manifest, startChapterUrl, cachePath, manifestPath);
},
function (e) {
if (e.code === "ENOENT") {
return downloadAllChapters(startChapterUrl, 1, cachePath);
return downloadAllChapters(null, startChapterUrl, cachePath, manifestPath);
} else {
throw e;
}
@ -30,21 +26,27 @@ module.exports = function (startChapterUrl, cachePath) {
);
};
function downloadAllChapters(startChapterUrl, startChapterNumber, cachePath) {
function downloadAllChapters(manifest, startChapterUrl, cachePath, manifestPath) {
let currentChapter = startChapterUrl;
let chapterCounter = startChapterNumber;
let chapterIndex = 0;
if (manifest !== null) {
currentChapter = manifest[manifest.length - 1].url;
chapterIndex = manifest.length - 1;
// We're going to re-add it to the manifest later, possibly with an updated title.
manifest.pop();
} else {
manifest = [];
}
return mkdirp(cachePath).then(loop);
function loop() {
const filename = `${FILENAME_PREFIX}${zfill(chapterCounter, 3)}.html`;
const filename = `${FILENAME_PREFIX}${zfill(chapterIndex, 3)}.html`;
console.log(`Downloading ${currentChapter}`);
// Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/
const escapedUrl = encodeURI(currentChapter);
return retry(3, function () { return request(escapedUrl).redirects(10); }).then(function (response) {
return downloadChapter(currentChapter).then(function (response) {
console.log("- Response received");
return response.text();
})
@ -54,27 +56,42 @@ function downloadAllChapters(startChapterUrl, startChapterNumber, cachePath) {
console.log("- Response body parsed into DOM");
const chapterUrlToSave = currentChapter;
const chapterTitle = getChapterTitle(rawChapterDoc);
currentChapter = getNextChapterUrl(rawChapterDoc);
console.log("- Got next chapter URL");
// TODO: this should probably not be necessary... jsdom bug I guess!?
rawChapterDoc.defaultView.close();
return [
fs.writeFile(path.resolve(cachePath, filename), contents),
fs.writeFile(path.resolve(cachePath, urlFilename(filename)), chapterUrlToSave)
];
manifest.push({
url: chapterUrlToSave,
title: chapterTitle,
filename: filename
});
fs.writeFile(path.resolve(cachePath, filename), contents);
})
.then(function () {
console.log("- Response text saved to cache file");
// Incrementally update the manifest after every successful download, instead of waiting until the end.
return writeManifest();
})
.then(function () {
console.log("- Manifest updated");
if (currentChapter === null) {
return;
}
++chapterCounter;
++chapterIndex;
return loop();
});
}
function writeManifest() {
const contents = JSON.stringify(manifest, undefined, 2);
return fs.writeFile(manifestPath, contents);
}
}
function getNextChapterUrl(rawChapterDoc) {
@ -103,10 +120,11 @@ function retry(times, fn) {
});
}
function urlFilename(filename) {
return `${filename}${URL_SUFFIX}`;
}
function downloadChapter(url) {
// Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/
const escapedUrl = encodeURI(url);
function getChapterNumber(filename) {
return Number(/0?0?(\d+)\./.exec(filename)[1]);
return retry(3, function () {
return request(escapedUrl).redirects(10);
});
}

View file

@ -15,12 +15,13 @@ const cachePath = path.resolve("cache");
const outPath = path.resolve("out");
const contentPath = path.resolve(outPath, "OEBPS");
const chaptersPath = path.resolve(contentPath, "chapters");
const manifestPath = path.resolve(cachePath, "manifest.json");
Promise.resolve()
.then(function () {
// return download(START_CHAPTER_URL, cachePath);
return download(START_CHAPTER_URL, cachePath, manifestPath);
})
.then(function () {
/* .then(function () {
return rimraf(chaptersPath);
})
.then(function () {
@ -31,7 +32,7 @@ Promise.resolve()
})
.then(function () {
return extras(contentPath, chaptersPath);
})
})*/
.then(function () {
console.log("All done!");
});