While downloading, save a cache manifest alongside the cached chapters
This allows us to keep track of the chapter title after the fact.
This commit is contained in:
parent
ba1e7b956f
commit
b68b88e17e
4 changed files with 62 additions and 38 deletions
5
lib/common.js
Normal file
5
lib/common.js
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
"use strict";
|
||||
|
||||
exports.getChapterTitle = function (rawChapterDoc) {
|
||||
return rawChapterDoc.querySelector("h1.entry-title").textContent;
|
||||
};
|
||||
|
|
@ -2,8 +2,9 @@
|
|||
const path = require("path");
|
||||
const fs = require("mz/fs");
|
||||
const throat = require("throat");
|
||||
const jsdom = require("./jsdom.js");
|
||||
const serializeToXml = require("xmlserializer").serializeToString;
|
||||
const jsdom = require("./jsdom.js");
|
||||
const getChapterTitle = require("./common.js").getChapterTitle;
|
||||
|
||||
module.exports = function (cachePath, contentPath) {
|
||||
return getChapterFilePaths(cachePath)
|
||||
|
|
@ -50,9 +51,8 @@ function convertChapter(filePath, contentPath) {
|
|||
}
|
||||
|
||||
function getChapterString(rawChapterDoc) {
|
||||
const headingEl = rawChapterDoc.querySelector("h1.entry-title");
|
||||
const title = headingEl.textContent;
|
||||
const body = getBodyXml(headingEl, rawChapterDoc.querySelector(".entry-content"));
|
||||
const title = getChapterTitle(rawChapterDoc);
|
||||
const body = getBodyXml(title, rawChapterDoc.querySelector(".entry-content"));
|
||||
|
||||
return `<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
|
|
@ -65,7 +65,7 @@ ${body}
|
|||
</html>`;
|
||||
}
|
||||
|
||||
function getBodyXml(headingEl, contentEl) {
|
||||
function getBodyXml(title, contentEl) {
|
||||
// Remove initial Next Chapter and Previous Chapter <p>
|
||||
contentEl.removeChild(contentEl.firstElementChild);
|
||||
|
||||
|
|
@ -108,7 +108,7 @@ function getBodyXml(headingEl, contentEl) {
|
|||
// Synthesize a <body> tag to serialize
|
||||
const bodyEl = contentEl.ownerDocument.createElement("body");
|
||||
const h1El = contentEl.ownerDocument.createElement("h1");
|
||||
h1El.textContent = headingEl.textContent;
|
||||
h1El.textContent = title;
|
||||
|
||||
bodyEl.appendChild(h1El);
|
||||
while (contentEl.firstChild) {
|
||||
|
|
|
|||
|
|
@ -5,24 +5,20 @@ const mkdirp = require("mkdirp-then");
|
|||
const request = require("requisition");
|
||||
const zfill = require("zfill");
|
||||
const jsdom = require("./jsdom.js");
|
||||
const getChapterTitle = require("./common.js").getChapterTitle;
|
||||
|
||||
const FILENAME_PREFIX = "chapter";
|
||||
const URL_SUFFIX = "-url.txt";
|
||||
|
||||
module.exports = function (startChapterUrl, cachePath) {
|
||||
return fs.readdir(cachePath).then(
|
||||
function (filenames) {
|
||||
const sorted = filenames.filter(function (f) { return f.endsWith(URL_SUFFIX); }).sort();
|
||||
const lastChapterUrlFile = sorted[sorted.length - 1];
|
||||
const lastChapterNumber = getChapterNumber(lastChapterUrlFile);
|
||||
module.exports = function (startChapterUrl, cachePath, manifestPath) {
|
||||
return fs.readFile(manifestPath, { encoding: "utf-8" }).then(
|
||||
function (manifestContents) {
|
||||
const manifest = JSON.parse(manifestContents);
|
||||
|
||||
return fs.readFile(path.resolve(cachePath, lastChapterUrlFile), { encoding: "utf-8" }).then(function (url) {
|
||||
return downloadAllChapters(url, lastChapterNumber, cachePath);
|
||||
});
|
||||
return downloadAllChapters(manifest, startChapterUrl, cachePath, manifestPath);
|
||||
},
|
||||
function (e) {
|
||||
if (e.code === "ENOENT") {
|
||||
return downloadAllChapters(startChapterUrl, 1, cachePath);
|
||||
return downloadAllChapters(null, startChapterUrl, cachePath, manifestPath);
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
|
|
@ -30,21 +26,27 @@ module.exports = function (startChapterUrl, cachePath) {
|
|||
);
|
||||
};
|
||||
|
||||
function downloadAllChapters(startChapterUrl, startChapterNumber, cachePath) {
|
||||
function downloadAllChapters(manifest, startChapterUrl, cachePath, manifestPath) {
|
||||
let currentChapter = startChapterUrl;
|
||||
let chapterCounter = startChapterNumber;
|
||||
let chapterIndex = 0;
|
||||
if (manifest !== null) {
|
||||
currentChapter = manifest[manifest.length - 1].url;
|
||||
chapterIndex = manifest.length - 1;
|
||||
|
||||
// We're going to re-add it to the manifest later, possibly with an updated title.
|
||||
manifest.pop();
|
||||
} else {
|
||||
manifest = [];
|
||||
}
|
||||
|
||||
return mkdirp(cachePath).then(loop);
|
||||
|
||||
function loop() {
|
||||
const filename = `${FILENAME_PREFIX}${zfill(chapterCounter, 3)}.html`;
|
||||
const filename = `${FILENAME_PREFIX}${zfill(chapterIndex, 3)}.html`;
|
||||
|
||||
console.log(`Downloading ${currentChapter}`);
|
||||
|
||||
// Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/
|
||||
const escapedUrl = encodeURI(currentChapter);
|
||||
|
||||
return retry(3, function () { return request(escapedUrl).redirects(10); }).then(function (response) {
|
||||
return downloadChapter(currentChapter).then(function (response) {
|
||||
console.log("- Response received");
|
||||
return response.text();
|
||||
})
|
||||
|
|
@ -54,27 +56,42 @@ function downloadAllChapters(startChapterUrl, startChapterNumber, cachePath) {
|
|||
console.log("- Response body parsed into DOM");
|
||||
|
||||
const chapterUrlToSave = currentChapter;
|
||||
const chapterTitle = getChapterTitle(rawChapterDoc);
|
||||
currentChapter = getNextChapterUrl(rawChapterDoc);
|
||||
console.log("- Got next chapter URL");
|
||||
|
||||
// TODO: this should probably not be necessary... jsdom bug I guess!?
|
||||
rawChapterDoc.defaultView.close();
|
||||
|
||||
return [
|
||||
fs.writeFile(path.resolve(cachePath, filename), contents),
|
||||
fs.writeFile(path.resolve(cachePath, urlFilename(filename)), chapterUrlToSave)
|
||||
];
|
||||
manifest.push({
|
||||
url: chapterUrlToSave,
|
||||
title: chapterTitle,
|
||||
filename: filename
|
||||
});
|
||||
|
||||
fs.writeFile(path.resolve(cachePath, filename), contents);
|
||||
})
|
||||
.then(function () {
|
||||
console.log("- Response text saved to cache file");
|
||||
// Incrementally update the manifest after every successful download, instead of waiting until the end.
|
||||
return writeManifest();
|
||||
})
|
||||
.then(function () {
|
||||
console.log("- Manifest updated");
|
||||
|
||||
if (currentChapter === null) {
|
||||
return;
|
||||
}
|
||||
|
||||
++chapterCounter;
|
||||
++chapterIndex;
|
||||
|
||||
return loop();
|
||||
});
|
||||
}
|
||||
|
||||
function writeManifest() {
|
||||
const contents = JSON.stringify(manifest, undefined, 2);
|
||||
return fs.writeFile(manifestPath, contents);
|
||||
}
|
||||
}
|
||||
|
||||
function getNextChapterUrl(rawChapterDoc) {
|
||||
|
|
@ -103,10 +120,11 @@ function retry(times, fn) {
|
|||
});
|
||||
}
|
||||
|
||||
/**
 * Builds the name of the companion file that holds a cached chapter's URL.
 *
 * @param {string} filename - Name of the cached chapter file.
 * @returns {string} `filename` with `URL_SUFFIX` appended.
 */
function urlFilename(filename) {
  const nameWithSuffix = `${filename}${URL_SUFFIX}`;
  return nameWithSuffix;
}
|
||||
function downloadChapter(url) {
|
||||
// Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/
|
||||
const escapedUrl = encodeURI(url);
|
||||
|
||||
function getChapterNumber(filename) {
|
||||
return Number(/0?0?(\d+)\./.exec(filename)[1]);
|
||||
return retry(3, function () {
|
||||
return request(escapedUrl).redirects(10);
|
||||
});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,12 +15,13 @@ const cachePath = path.resolve("cache");
|
|||
const outPath = path.resolve("out");
|
||||
const contentPath = path.resolve(outPath, "OEBPS");
|
||||
const chaptersPath = path.resolve(contentPath, "chapters");
|
||||
const manifestPath = path.resolve(cachePath, "manifest.json");
|
||||
|
||||
Promise.resolve()
|
||||
.then(function () {
|
||||
// return download(START_CHAPTER_URL, cachePath);
|
||||
return download(START_CHAPTER_URL, cachePath, manifestPath);
|
||||
})
|
||||
.then(function () {
|
||||
/* .then(function () {
|
||||
return rimraf(chaptersPath);
|
||||
})
|
||||
.then(function () {
|
||||
|
|
@ -31,7 +32,7 @@ Promise.resolve()
|
|||
})
|
||||
.then(function () {
|
||||
return extras(contentPath, chaptersPath);
|
||||
})
|
||||
})*/
|
||||
.then(function () {
|
||||
console.log("All done!");
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue