Update dependencies and code style
This commit is contained in:
parent
b6cb0b5b6e
commit
0435c45b2e
7 changed files with 108 additions and 140 deletions
|
|
@ -4,7 +4,7 @@ Scrapes the web serial [_Worm_](https://parahumans.wordpress.com/) into an eBook
|
|||
|
||||
## How to use
|
||||
|
||||
First you'll need a modern version of [Node.js](https://nodejs.org/en/). Install whatever is current (not LTS).
|
||||
First you'll need a modern version of [Node.js](https://nodejs.org/en/). Install whatever is current (not LTS); at least v8.x is necessary.
|
||||
|
||||
Then, open a terminal ([Mac documentation](http://blog.teamtreehouse.com/introduction-to-the-mac-os-x-command-line), [Windows documentation](http://www.howtogeek.com/235101/10-ways-to-open-the-command-prompt-in-windows-10/)) and install the program by typing
|
||||
|
||||
|
|
|
|||
|
|
@ -3,40 +3,39 @@ const path = require("path");
|
|||
const fs = require("mz/fs");
|
||||
const throat = require("throat");
|
||||
const serializeToXML = require("xmlserializer").serializeToString;
|
||||
const jsdom = require("./jsdom.js");
|
||||
const { JSDOM } = require("jsdom");
|
||||
const substitutions = require("./substitutions.json");
|
||||
|
||||
module.exports = (cachePath, manifestPath, contentPath) => {
|
||||
return fs.readFile(manifestPath, { encoding: "utf-8" }).then(manifestContents => {
|
||||
const chapters = JSON.parse(manifestContents);
|
||||
console.log("All chapters downloaded; beginning conversion to EPUB chapters");
|
||||
module.exports = async (cachePath, manifestPath, contentPath) => {
|
||||
const manifestContents = await fs.readFile(manifestPath, { encoding: "utf-8" });
|
||||
const chapters = JSON.parse(manifestContents);
|
||||
console.log("All chapters downloaded; beginning conversion to EPUB chapters");
|
||||
|
||||
const mapper = throat(10, chapter => {
|
||||
return convertChapter(chapter, cachePath, contentPath);
|
||||
});
|
||||
return Promise.all(chapters.map(mapper));
|
||||
})
|
||||
.then(() => console.log("All chapters converted"));
|
||||
const mapper = throat(10, chapter => convertChapter(chapter, cachePath, contentPath));
|
||||
await Promise.all(chapters.map(mapper));
|
||||
|
||||
console.log("All chapters converted");
|
||||
};
|
||||
|
||||
function convertChapter(chapter, cachePath, contentPath) {
|
||||
async function convertChapter(chapter, cachePath, contentPath) {
|
||||
const filename = chapter.filename;
|
||||
const filePath = path.resolve(cachePath, filename);
|
||||
|
||||
console.log(`- Reading ${filename}`);
|
||||
return fs.readFile(filePath, { encoding: "utf-8" }).then(contents => {
|
||||
console.log(`- Read ${filename}`);
|
||||
const rawChapterDoc = jsdom(contents);
|
||||
const output = getChapterString(chapter, rawChapterDoc);
|
||||
const contents = await fs.readFile(filePath, { encoding: "utf-8" });
|
||||
console.log(`- Read ${filename}`);
|
||||
|
||||
// TODO: this should probably not be necessary... jsdom bug I guess!?
|
||||
rawChapterDoc.defaultView.close();
|
||||
const rawChapterJSDOM = new JSDOM(contents);
|
||||
const output = getChapterString(chapter, rawChapterJSDOM.window.document);
|
||||
|
||||
const destFileName = `${path.basename(filename, ".html")}.xhtml`;
|
||||
const destFilePath = path.resolve(contentPath, destFileName);
|
||||
return fs.writeFile(destFilePath, output);
|
||||
})
|
||||
.then(() => console.log(`- Finished converting ${filename}`));
|
||||
// TODO: this should probably not be necessary... jsdom bug I guess!?
|
||||
rawChapterJSDOM.window.close();
|
||||
|
||||
const destFileName = `${path.basename(filename, ".html")}.xhtml`;
|
||||
const destFilePath = path.resolve(contentPath, destFileName);
|
||||
|
||||
await fs.writeFile(destFilePath, output);
|
||||
console.log(`- Finished converting ${filename}`);
|
||||
}
|
||||
|
||||
function getChapterString(chapter, rawChapterDoc) {
|
||||
|
|
|
|||
113
lib/download.js
113
lib/download.js
|
|
@ -4,28 +4,27 @@ const fs = require("mz/fs");
|
|||
const mkdirp = require("mkdirp-then");
|
||||
const request = require("requisition");
|
||||
const zfill = require("zfill");
|
||||
const jsdom = require("./jsdom.js");
|
||||
const { JSDOM } = require("jsdom");
|
||||
|
||||
const FILENAME_PREFIX = "chapter";
|
||||
|
||||
module.exports = (startChapterUrl, cachePath, manifestPath) => {
|
||||
return fs.readFile(manifestPath, { encoding: "utf-8" }).then(
|
||||
manifestContents => {
|
||||
const manifest = JSON.parse(manifestContents);
|
||||
|
||||
return downloadAllChapters(manifest, startChapterUrl, cachePath, manifestPath);
|
||||
},
|
||||
e => {
|
||||
if (e.code === "ENOENT") {
|
||||
return downloadAllChapters(null, startChapterUrl, cachePath, manifestPath);
|
||||
}
|
||||
throw e;
|
||||
module.exports = async (startChapterURL, cachePath, manifestPath) => {
|
||||
let manifestContents;
|
||||
try {
|
||||
manifestContents = await fs.readFile(manifestPath, { encoding: "utf-8" });
|
||||
} catch (e) {
|
||||
if (e.code === "ENOENT") {
|
||||
return downloadAllChapters(null, startChapterURL, cachePath, manifestPath);
|
||||
}
|
||||
);
|
||||
throw e;
|
||||
}
|
||||
|
||||
const manifest = JSON.parse(manifestContents);
|
||||
return downloadAllChapters(manifest, startChapterURL, cachePath, manifestPath);
|
||||
};
|
||||
|
||||
function downloadAllChapters(manifest, startChapterUrl, cachePath, manifestPath) {
|
||||
let currentChapter = startChapterUrl;
|
||||
async function downloadAllChapters(manifest, startChapterURL, cachePath, manifestPath) {
|
||||
let currentChapter = startChapterURL;
|
||||
let chapterIndex = 0;
|
||||
if (manifest !== null) {
|
||||
currentChapter = manifest[manifest.length - 1].url;
|
||||
|
|
@ -37,62 +36,45 @@ function downloadAllChapters(manifest, startChapterUrl, cachePath, manifestPath)
|
|||
manifest = [];
|
||||
}
|
||||
|
||||
return mkdirp(cachePath).then(loop);
|
||||
await mkdirp(cachePath);
|
||||
|
||||
function loop() {
|
||||
while (currentChapter !== null) {
|
||||
const filename = `${FILENAME_PREFIX}${zfill(chapterIndex, 3)}.html`;
|
||||
|
||||
console.log(`Downloading ${currentChapter}`);
|
||||
|
||||
return downloadChapter(currentChapter).then(response => {
|
||||
console.log("- Response received");
|
||||
return response.text();
|
||||
})
|
||||
.then(contents => {
|
||||
console.log("- Response body received");
|
||||
const rawChapterDoc = jsdom(contents, { url: currentChapter });
|
||||
console.log("- Response body parsed into DOM");
|
||||
const response = await downloadChapter(currentChapter);
|
||||
const contents = await response.text();
|
||||
console.log("- Response body received");
|
||||
const rawChapterJSDOM = new JSDOM(contents, { url: currentChapter });
|
||||
console.log("- Response body parsed into DOM");
|
||||
|
||||
const chapterUrlToSave = currentChapter;
|
||||
const chapterTitle = getChapterTitle(rawChapterDoc);
|
||||
currentChapter = getNextChapterUrl(rawChapterDoc);
|
||||
const chapterURLToSave = currentChapter;
|
||||
const chapterTitle = getChapterTitle(rawChapterJSDOM.window.document);
|
||||
currentChapter = getNextChapterURL(rawChapterJSDOM.window.document);
|
||||
|
||||
// TODO: this should probably not be necessary... jsdom bug I guess!?
|
||||
rawChapterDoc.defaultView.close();
|
||||
// TODO: this should probably not be necessary... jsdom bug I guess!?
|
||||
rawChapterJSDOM.window.close();
|
||||
|
||||
manifest.push({
|
||||
url: chapterUrlToSave,
|
||||
title: chapterTitle,
|
||||
filename
|
||||
});
|
||||
|
||||
fs.writeFile(path.resolve(cachePath, filename), contents);
|
||||
})
|
||||
.then(() => {
|
||||
console.log("- Response text saved to cache file");
|
||||
// Incrementally update the manifest after every successful download, instead of waiting until the end.
|
||||
return writeManifest();
|
||||
})
|
||||
.then(() => {
|
||||
console.log("- Manifest updated");
|
||||
|
||||
if (currentChapter === null) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
++chapterIndex;
|
||||
|
||||
return loop();
|
||||
manifest.push({
|
||||
url: chapterURLToSave,
|
||||
title: chapterTitle,
|
||||
filename
|
||||
});
|
||||
}
|
||||
|
||||
function writeManifest() {
|
||||
const contents = JSON.stringify(manifest, undefined, 2);
|
||||
return fs.writeFile(manifestPath, contents);
|
||||
await fs.writeFile(path.resolve(cachePath, filename), contents);
|
||||
console.log("- Response text saved to cache file");
|
||||
|
||||
// Incrementally update the manifest after every successful download, instead of waiting until the end.
|
||||
const newManifestContents = JSON.stringify(manifest, undefined, 2);
|
||||
await fs.writeFile(manifestPath, newManifestContents);
|
||||
console.log("- Manifest updated");
|
||||
|
||||
++chapterIndex;
|
||||
}
|
||||
}
|
||||
|
||||
function getNextChapterUrl(rawChapterDoc) {
|
||||
function getNextChapterURL(rawChapterDoc) {
|
||||
// `a[title="Next Chapter"]` doesn't always work. Two different pathologies:
|
||||
// - https://parahumans.wordpress.com/2011/09/27/shell-4-2/
|
||||
// - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
|
||||
|
|
@ -123,12 +105,11 @@ function retry(times, fn) {
|
|||
}
|
||||
|
||||
function downloadChapter(url) {
|
||||
return retry(3, () => {
|
||||
return request(url).redirects(10).then(response => {
|
||||
if (response.status !== 200) {
|
||||
throw new Error(`Response status for ${url} was ${response.status}`);
|
||||
}
|
||||
return response;
|
||||
});
|
||||
return retry(3, async () => {
|
||||
const response = await request(url).redirects(10);
|
||||
if (response.status !== 200) {
|
||||
throw new Error(`Response status for ${url} was ${response.status}`);
|
||||
}
|
||||
return response;
|
||||
});
|
||||
}
|
||||
|
|
|
|||
14
lib/jsdom.js
14
lib/jsdom.js
|
|
@ -1,14 +0,0 @@
|
|||
"use strict";
|
||||
const jsdom = require("jsdom");
|
||||
|
||||
// No need to fetch or execute JavaScript
|
||||
module.exports = (contents, options) => {
|
||||
options = Object.assign({}, options, {
|
||||
features: {
|
||||
FetchExternalResources: false,
|
||||
ProcessExternalResources: false
|
||||
}
|
||||
});
|
||||
|
||||
return jsdom.jsdom(contents, options);
|
||||
};
|
||||
|
|
@ -21,17 +21,16 @@ const COVER_IMG_FILENAME = "cover.png";
|
|||
const COVER_XHTML_FILENAME = "cover.xhtml";
|
||||
const COVER_MIMETYPE = "image/png";
|
||||
|
||||
module.exports = (scaffoldingPath, bookPath, contentPath, chaptersPath, manifestPath) => {
|
||||
return Promise.all([
|
||||
module.exports = async (scaffoldingPath, bookPath, contentPath, chaptersPath, manifestPath) => {
|
||||
await Promise.all([
|
||||
cpr(scaffoldingPath, bookPath, { overwrite: true, confirm: true, filter: noThumbs }),
|
||||
getChapters(contentPath, chaptersPath, manifestPath).then(chapters => {
|
||||
return Promise.all([
|
||||
writeOpf(chapters, contentPath),
|
||||
writeOPF(chapters, contentPath),
|
||||
writeNcx(chapters, contentPath)
|
||||
]);
|
||||
})
|
||||
])
|
||||
.then(() => undefined);
|
||||
]);
|
||||
};
|
||||
|
||||
function noThumbs(filePath) {
|
||||
|
|
@ -39,7 +38,7 @@ function noThumbs(filePath) {
|
|||
return path.basename(filePath) !== "Thumbs.db";
|
||||
}
|
||||
|
||||
function writeOpf(chapters, contentPath) {
|
||||
function writeOPF(chapters, contentPath) {
|
||||
const manifestChapters = chapters.map(c => {
|
||||
return `<item id="${c.id}" href="${c.href}" media-type="application/xhtml+xml"/>`;
|
||||
}).join("\n");
|
||||
|
|
@ -115,23 +114,22 @@ ${navPoints}
|
|||
return fs.writeFile(path.resolve(contentPath, NCX_FILENAME), contents);
|
||||
}
|
||||
|
||||
function getChapters(contentPath, chaptersPath, manifestPath) {
|
||||
async function getChapters(contentPath, chaptersPath, manifestPath) {
|
||||
const hrefPrefix = `${path.relative(contentPath, chaptersPath)}/`;
|
||||
|
||||
return fs.readFile(manifestPath, { encoding: "utf-8" }).then(manifestContents => {
|
||||
const manifestChapters = JSON.parse(manifestContents);
|
||||
const manifestContents = await fs.readFile(manifestPath, { encoding: "utf-8" });
|
||||
const manifestChapters = JSON.parse(manifestContents);
|
||||
|
||||
return fs.readdir(chaptersPath).then(filenames => {
|
||||
return filenames
|
||||
.filter(f => path.extname(f) === ".xhtml")
|
||||
.sort()
|
||||
.map((f, i) => {
|
||||
return {
|
||||
id: path.basename(f),
|
||||
title: manifestChapters[i].title,
|
||||
href: `${hrefPrefix}${f}`
|
||||
};
|
||||
});
|
||||
const filenames = await fs.readdir(chaptersPath);
|
||||
|
||||
return filenames
|
||||
.filter(f => path.extname(f) === ".xhtml")
|
||||
.sort()
|
||||
.map((f, i) => {
|
||||
return {
|
||||
id: path.basename(f),
|
||||
title: manifestChapters[i].title,
|
||||
href: `${hrefPrefix}${f}`
|
||||
};
|
||||
});
|
||||
});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -84,11 +84,15 @@ if (argv._.includes("zip")) {
|
|||
commands.push(() => zip(bookPath, contentPath, path.resolve(argv.out)));
|
||||
}
|
||||
|
||||
commands.reduce((previous, command) => {
|
||||
return previous.then(command);
|
||||
}, Promise.resolve())
|
||||
.then(() => console.log("All done!"))
|
||||
.catch(e => {
|
||||
console.error(e.stack);
|
||||
process.exit(1);
|
||||
});
|
||||
(async () => {
|
||||
try {
|
||||
for (const command of commands) {
|
||||
await command();
|
||||
}
|
||||
|
||||
console.log("All done!");
|
||||
} catch (e) {
|
||||
console.error(e.stack);
|
||||
process.exit(1);
|
||||
}
|
||||
})();
|
||||
|
|
|
|||
14
package.json
14
package.json
|
|
@ -20,19 +20,19 @@
|
|||
"lint": "eslint lib"
|
||||
},
|
||||
"dependencies": {
|
||||
"archiver": "^1.3.0",
|
||||
"cpr": "^2.0.2",
|
||||
"jsdom": "^9.9.1",
|
||||
"archiver": "^2.0.0",
|
||||
"cpr": "^2.2.0",
|
||||
"jsdom": "^11.1.0",
|
||||
"mkdirp-then": "^1.0.1",
|
||||
"requisition": "^1.5.0",
|
||||
"rimraf-then": "^1.0.0",
|
||||
"thenify": "^3.1.0",
|
||||
"throat": "^3.0.0",
|
||||
"thenify": "^3.3.0",
|
||||
"throat": "^4.1.0",
|
||||
"xmlserializer": "^0.6.0",
|
||||
"yargs": "^6.6.0",
|
||||
"yargs": "^8.0.2",
|
||||
"zfill": "0.0.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"eslint": "3.12.2"
|
||||
"eslint": "4.4.1"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue