diff --git a/.gitignore b/.gitignore
index 0c0aa04..ba721ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 /node_modules/
 /npm-debug.log
+
+cache/
+out/
diff --git a/lib/track-rejections.js b/lib/track-rejections.js
new file mode 100644
--- /dev/null
+++ b/lib/track-rejections.js
@@ -0,0 +1,23 @@
+"use strict";
+
+// Logs every unhandled promise rejection under a sequential ID, and logs that ID again if a
+// handler is attached to the same promise later.
+
+let currentId = 0;
+// Promise -> ID. Held weakly so this table never keeps a rejected promise alive.
+const tracker = new WeakMap();
+
+process.on("unhandledRejection", function (reason, promise) {
+  tracker.set(promise, ++currentId);
+  console.error(`Unhandled rejection (${currentId}): `);
+  // A promise can be rejected with any value (undefined, a string, ...), so only read .stack when
+  // there is one; otherwise this logger would itself throw or print "undefined".
+  console.error(reason && reason.stack ? reason.stack : reason);
+});
+
+process.on("rejectionHandled", function (promise) {
+  const id = tracker.get(promise);
+  tracker.delete(promise);
+
+  console.error(`Rejection handled (${id})`);
+});
diff --git a/lib/worm-scraper.js b/lib/worm-scraper.js
--- a/lib/worm-scraper.js
+++ b/lib/worm-scraper.js
@@ -1,5 +1,198 @@
 "use strict";
+const path = require("path");
+const fs = require("mz/fs");
+const mkdirp = require("mkdirp-then");
+const request = require("requisition");
+const jsdom = require("jsdom");
 
-module.exports = function () {
+require("./track-rejections.js");
 
-};
+const START_CHAPTER = "http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/";
+
+// Cache files are named chapter<N>.html; the capture group is N.
+const CHAPTER_FILENAME = /^chapter(\d+)\.html$/;
+
+const cachePath = path.resolve("cache");
+const outPath = path.resolve("out");
+const contentPath = path.resolve(outPath, "OEBPS");
+
+// Pipeline: find (or download) the cached chapter pages, make sure the output directory exists,
+// parse every cached page, then write one XHTML file per chapter. A failure anywhere rejects the
+// chain and is reported by track-rejections.js.
+let chapters;
+getChapters()
+  .then(function (theChapters) {
+    chapters = theChapters;
+    return mkdirp(contentPath);
+  })
+  .then(function () {
+    return Promise.all(chapters.map(getRawChapterDoc));
+  })
+  .then(function (rawChapterDocs) {
+    console.log("Extracting content into EPUB chapter files");
+    return Promise.all(rawChapterDocs.map(function (rawChapterDoc, i) {
+      const output = getChapterString(rawChapterDoc);
+      const destFilename = path.resolve(contentPath, `chapter${i + 1}.xhtml`);
+      return fs.writeFile(destFilename, output);
+    }));
+  })
+  .then(function () {
+    console.log("All done!");
+  });
+
+/**
+ * Resolves to the cache filenames of every chapter, in reading order. Chapter files already in the
+ * cache directory are reused as-is (a partial cache is not detected); when there are none, or there
+ * is no cache directory, every chapter is downloaded first.
+ *
+ * @returns {Promise<string[]>} filenames relative to cachePath
+ */
+function getChapters() {
+  return fs.readdir(cachePath).then(
+    function (entries) {
+      // Keep only chapter files, ordered by chapter number: the position in this list becomes the
+      // output chapter number, and a by-name ordering would put chapter10 ahead of chapter2.
+      const cached = entries
+        .filter(function (entry) {
+          return CHAPTER_FILENAME.test(entry);
+        })
+        .sort(function (a, b) {
+          return getChapterNumber(a) - getChapterNumber(b);
+        });
+
+      return cached.length > 0 ? cached : downloadChapters();
+    },
+    function (e) {
+      if (e.code === "ENOENT") {
+        return downloadChapters();
+      }
+      throw e;
+    }
+  );
+}
+
+// Extracts N from a chapter<N>.html cache filename.
+function getChapterNumber(filename) {
+  return Number(CHAPTER_FILENAME.exec(filename)[1]);
+}
+
+/**
+ * Downloads START_CHAPTER, then keeps following each page's "Next Chapter" link until a page has
+ * none, saving the Nth page as chapterN.html in the cache directory.
+ *
+ * @returns {Promise<string[]>} the saved filenames, in download order
+ */
+function downloadChapters() {
+  let currentChapter = START_CHAPTER;
+  let chapterCounter = 1;
+  const filenames = [];
+
+  // Create the cache directory up front rather than relying on saveTo() to do it.
+  return mkdirp(cachePath).then(loop).then(function () {
+    return filenames;
+  });
+
+  function loop() {
+    const filename = `chapter${chapterCounter}.html`;
+
+    console.log(`Downloading ${currentChapter}`);
+
+    return request(currentChapter).redirects(10).then(function (response) {
+      return response.saveTo(path.resolve(cachePath, filename));
+    })
+    .then(function () {
+      filenames.push(filename);
+
+      // This is inefficient in a number of ways; whatever.
+      return getRawChapterDoc(filename);
+    })
+    .then(function (rawChapterDoc) {
+      currentChapter = getNextChapterUrl(rawChapterDoc);
+
+      if (currentChapter === null) {
+        return;
+      }
+
+      ++chapterCounter;
+      return loop();
+    });
+  }
+}
+
+/**
+ * Reads a cached chapter page and parses it with jsdom.
+ *
+ * @param {string} filename - name of a file inside the cache directory
+ * @returns {Promise} resolves to whatever jsdom.jsdom() returns for the file's contents
+ */
+function getRawChapterDoc(filename) {
+  return fs.readFile(path.resolve(cachePath, filename), { encoding: "utf-8" }).then(function (contents) {
+    return jsdom.jsdom(contents);
+  });
+}
+
+/**
+ * Builds the XHTML source of one chapter file from a parsed chapter page.
+ *
+ * @param rawChapterDoc - parsed page; must contain h1.entry-title and .entry-content
+ * @returns {string} the chapter document
+ */
+function getChapterString(rawChapterDoc) {
+  // textContent is plain text, so it has to be escaped before being placed in markup.
+  const title = escapeXml(rawChapterDoc.querySelector("h1.entry-title").textContent);
+  const body = cleanContentEl(rawChapterDoc.querySelector(".entry-content")).innerHTML;
+
+  // NOTE(review): in the copy of this patch under review the tags of this template were stripped;
+  // only the placeholders and the line layout survived. The markup below is a reconstruction --
+  // confirm it against the original commit.
+  return `<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8" />
+    <title>${title}</title>
+  </head>
+  <body>
+    <h1>${title}</h1>
+
+    ${body}
+  </body>
+</html>`;
+}
+
+// Escapes the characters that are markup-significant in XML text content.
+function escapeXml(text) {
+  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
+}
+
+/**
+ * Strips chapter navigation and redundant attributes from a chapter's content element, in place.
+ *
+ * @param el - the .entry-content element
+ * @returns the same element
+ */
+function cleanContentEl(el) {
+  // Remove the Next Chapter and Previous Chapter elements (first and last element children)
+  el.removeChild(el.firstElementChild);
+  el.removeChild(el.lastElementChild);
+
+  // Remove redundant dir="ltr"
+  Array.prototype.forEach.call(el.children, function (child) {
+    if (child.getAttribute("dir") === "ltr") {
+      child.removeAttribute("dir");
+    }
+  });
+
+  return el;
+}
+
+/**
+ * Finds the link to the following chapter.
+ *
+ * @param rawChapterDoc - parsed chapter page
+ * @returns {?string} the href of the a[title="Next Chapter"] link, or null when the page has none
+ */
+function getNextChapterUrl(rawChapterDoc) {
+  const nextEl = rawChapterDoc.querySelector("a[title=\"Next Chapter\"]");
+  return nextEl && nextEl.href;
+}
diff --git a/package.json b/package.json
--- a/package.json
+++ b/package.json
@@ -19,7 +19,10 @@
     "lint": "eslint lib && jscs lib"
   },
   "dependencies": {
-    "jsdom": "^5.3.0"
+    "jsdom": "^5.3.0",
+    "mkdirp-then": "^1.0.1",
+    "mz": "^2.0.0",
+    "requisition": "^1.5.0"
   },
   "devDependencies": {
     "eslint": "0.20.0",