Re-architect

Separate out download code from the rest. Re-do the download code to store URLs alongside HTML, and add support for resuming from the last-seen URL.
2015-05-06 22:00:44 +02:00 · 2015-05-06 22:00:44 +02:00 · 127b6612c9
commit 127b6612c9
parent 972fcaa294
2 changed files with 137 additions and 92 deletions
--- a/lib/worm-scraper.js
+++ b/lib/worm-scraper.js
@@ -3,24 +3,26 @@ const path = require("path");
 const fs = require("mz/fs");
 const mkdirp = require("mkdirp-then");
 const rimraf = require("rimraf-then");
-const request = require("requisition");
 const jsdom = require("jsdom");

+const download = require("./download.js");
+
 require("./track-rejections.js");

-const START_CHAPTER = "https://parahumans.wordpress.com/2011/06/11/1-1/";
+const START_CHAPTER_URL = "https://parahumans.wordpress.com/2011/06/11/1-1/";

 const cachePath = path.resolve("cache");
 const outPath = path.resolve("out");
 const contentPath = path.resolve(outPath, "OEBPS");

-const rawChapterDocsCache = new Map();
-
 rimraf(outPath)
  .then(function () {
    return mkdirp(contentPath);
  })
-  .then(getAllRawChapterDocs)
+  .then(function () {
+    return download(START_CHAPTER_URL, cachePath);
+  })
+  .then(readAllRawChapterDocs)
  .then(function (rawChapterDocs) {
    console.log("Extracting content into EPUB chapter files");
    return Promise.all(rawChapterDocs.map(function (rawChapterDoc, i) {
@@ -33,72 +35,18 @@ rimraf(outPath)
    console.log("All done!");
  });

-function getAllRawChapterDocs() {
-  return fs.readdir(cachePath).then(
-    function (filenames) {
-      return filenames.map(getRawChapterDoc);
-    },
-    function (e) {
-      if (e.code === "ENOENT") {
-        return downloadAllChapters();
-      } else {
-        throw e;
-      }
-    }
-  );
-}
-
-function downloadAllChapters() {
-  let currentChapter = START_CHAPTER;
-  let chapterCounter = 1;
-
-  return mkdirp(cachePath).then(loop).then(function () {
-    return toArray(rawChapterDocsCache.values());
-  });
-
-  function loop() {
-    const filename = `chapter${chapterCounter}.html`;
-
-    console.log(`Downloading ${currentChapter}`);
-
-    // Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/
-    const escapedUrl = encodeURI(currentChapter);
-
-    return request(escapedUrl).redirects(10).then(function (response) {
-      return response.text();
-    })
-    .then(function (contents) {
-      const rawChapterDoc = setRawChapterDoc(filename, contents);
-      currentChapter = getNextChapterUrl(rawChapterDoc);
-
-      return fs.writeFile(path.resolve(cachePath, filename), jsdom.serializeDocument(rawChapterDoc));
-    })
-    .then(function () {
-      if (currentChapter === null) {
-        return;
-      }
-
-      ++chapterCounter;
-      return loop();
-    });
-  }
-}
-
-function getRawChapterDoc(filename) {
-  const doc = rawChapterDocsCache.get(filename);
-  if (doc !== undefined) {
-    return Promise.resolve(doc);
-  }
-
-  return fs.readFile(path.resolve(cachePath, filename), { encoding: "utf-8" }).then(function (contents) {
-    return setRawChapterDoc(filename, contents);
+function readAllRawChapterDocs() {
+  return fs.readdir(cachePath).then(function (filenames) {
+    const htmlFiles = filenames.filter(function (f) { return f.endsWith(".html"); });
+    return Promise.all(htmlFiles.map(readRawChapterDoc));
  });
 }

-function setRawChapterDoc(filename, contents) {
-  const doc = jsdom.jsdom(contents);
-  rawChapterDocsCache.set(filename, doc);
-  return doc;
+function readRawChapterDoc(filename) {
+  const filePath = path.resolve(cachePath, filename);
+  return fs.readFile(filePath, { encoding: "utf-8" }).then(function (contents) {
+    return jsdom.jsdom(contents);
+  });
 }

 function getChapterString(rawChapterDoc) {
@@ -134,27 +82,3 @@ function cleanContentEl(el) {

  return el;
 }
-
-function getNextChapterUrl(rawChapterDoc) {
-  // a[title="Next Chapter"] doesn't always work. Two different pathologies:
-  // - https://parahumans.wordpress.com/2011/09/27/shell-4-2/
-  // - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
-  // So instead search for the first <a> within the main content area starting with "Next", trimmed.
-
-  const aEls = rawChapterDoc.querySelectorAll(".entry-content a");
-  for (let i = 0; i < aEls.length; ++i) {
-    if (aEls[i].textContent.trim().startsWith("Next")) {
-      return aEls[i].href;
-    }
-  }
-
-  return null;
-}
-
-function toArray(iterable) {
-  const array = [];
-  for (const x of iterable) {
-    array.push(x);
-  }
-  return array;
-}