diff --git a/lib/convert.js b/lib/convert.js new file mode 100644 index 0000000..dee31e4 --- /dev/null +++ b/lib/convert.js @@ -0,0 +1,83 @@ +"use strict"; +const path = require("path"); +const fs = require("mz/fs"); +const throat = require("throat"); +const jsdom = require("./jsdom.js"); + +module.exports = function (cachePath, contentPath) { + return getChapterFilePaths(cachePath) + .then(function (chapterFilePaths) { + console.log("All chapters downloaded; beginning conversion to EPUB chapters"); + + const mapper = throat(10, function (filePath) { + return convertChapter(filePath, contentPath); + }); + return Promise.all(chapterFilePaths.map(mapper)); + }) + .then(function () { + console.log("All chapters converted"); + }); +}; + + +function getChapterFilePaths(cachePath) { + return fs.readdir(cachePath).then(function (filenames) { + return filenames.filter(function (f) { return f.endsWith(".html"); }) + .map(function (f) { return path.resolve(cachePath, f); }); + }); +} + +function convertChapter(filePath, contentPath) { + const filename = path.basename(filePath); + + console.log(`- Reading ${filename}`); + return fs.readFile(filePath, { encoding: "utf-8" }).then(function (contents) { + console.log(`- Read ${filename}`); + const rawChapterDoc = jsdom(contents); + const output = getChapterString(rawChapterDoc); + + // TODO: this should probably not be necessary... jsdom bug I guess!? + rawChapterDoc.defaultView.close(); + + const destFileName = `${path.basename(filename, ".html")}.xhtml`; + const destFilePath = path.resolve(contentPath, destFileName); + return fs.writeFile(destFilePath, output); + }) + .then(function () { + console.log(`- Finished converting ${filename}`); + }); +} + +function getChapterString(rawChapterDoc) { + const title = rawChapterDoc.querySelector("h1.entry-title").textContent; + const body = cleanContentEl(rawChapterDoc.querySelector(".entry-content")).innerHTML; + + return ` + + + + + ${title} + + +

${title}

+ + ${body} + +`; +} + +function cleanContentEl(el) { + // Remove Next Chapter and Previous Chapter

s + el.removeChild(el.firstElementChild); + el.removeChild(el.lastElementChild); + + // Remove redundant dir="ltr" + Array.prototype.forEach.call(el.children, function (child) { + if (child.getAttribute("dir") === "ltr") { + child.removeAttribute("dir"); + } + }); + + return el; +} diff --git a/lib/worm-scraper.js b/lib/worm-scraper.js index 44b904a..276d66d 100644 --- a/lib/worm-scraper.js +++ b/lib/worm-scraper.js @@ -1,11 +1,10 @@ "use strict"; const path = require("path"); -const fs = require("mz/fs"); const mkdirp = require("mkdirp-then"); const rimraf = require("rimraf-then"); -const jsdom = require("./jsdom.js"); const download = require("./download.js"); +const convert = require("./convert.js"); require("./track-rejections.js"); @@ -22,73 +21,9 @@ rimraf(outPath) .then(function () { return download(START_CHAPTER_URL, cachePath); }) - .then(getChapterFilePaths) - .then(function (chapterFilePaths) { - console.log("All chapters downloaded; beginning conversion to EPUB chapters"); - return Promise.all(chapterFilePaths.map(convertChapter)); + .then(function () { + return convert(cachePath, contentPath); }) .then(function () { console.log("All done!"); }); - -function getChapterFilePaths() { - return fs.readdir(cachePath).then(function (filenames) { - return filenames.filter(function (f) { return f.endsWith(".html"); }) - .map(function (f) { return path.resolve(cachePath, f); }); - }); -} - -function convertChapter(filePath) { - const filename = path.basename(filePath); - - console.log(`- Reading ${filename}`); - return fs.readFile(filePath, { encoding: "utf-8" }).then(function (contents) { - console.log(`- Read ${filename}`); - const rawChapterDoc = jsdom(contents); - const output = getChapterString(rawChapterDoc); - - // TODO: this should probably not be necessary... jsdom bug I guess!? - rawChapterDoc.defaultView.close(); - - const destFileName = `${path.basename(filename, ".html")}.xhtml`; - const destFilePath = path.resolve(contentPath, destFileName); - return fs.writeFile(destFilePath, output); - }) - .then(function () { - console.log(`- Finished converting ${filename}`); - }); -} - -function getChapterString(rawChapterDoc) { - const title = rawChapterDoc.querySelector("h1.entry-title").textContent; - const body = cleanContentEl(rawChapterDoc.querySelector(".entry-content")).innerHTML; - - return ` - - - - - ${title} - - -

${title}

- - ${body} - -`; -} - -function cleanContentEl(el) { - // Remove Next Chapter and Previous Chapter

s - el.removeChild(el.firstElementChild); - el.removeChild(el.lastElementChild); - - // Remove redundant dir="ltr" - Array.prototype.forEach.call(el.children, function (child) { - if (child.getAttribute("dir") === "ltr") { - child.removeAttribute("dir"); - } - }); - - return el; -} diff --git a/package.json b/package.json index 903f4e3..e4004b6 100644 --- a/package.json +++ b/package.json @@ -23,6 +23,7 @@ "mkdirp-then": "^1.0.1", "requisition": "^1.5.0", "rimraf-then": "^1.0.0", + "throat": "^2.0.2", "xtend": "^4.0.0", "zfill": "0.0.2" },