Re-architect

Separate out download code from the rest. Re-do the download code to store URLs alongside HTML, and add support for resuming from the last-seen URL.
2015-05-06 22:00:44 +02:00 · 2015-05-06 22:00:44 +02:00 · 127b6612c9
commit 127b6612c9
parent 972fcaa294
2 changed files with 137 additions and 92 deletions
--- a/lib/download.js
+++ b/lib/download.js
@@ -0,0 +1,121 @@
+"use strict";
+const path = require("path");
+const fs = require("mz/fs");
+const mkdirp = require("mkdirp-then");
+const request = require("requisition");
+const jsdom = require("jsdom");
+
+const FILENAME_PREFIX = "chapter";
+const URL_SUFFIX = "-url.txt";
+
+module.exports = function (startChapterUrl, cachePath) {
+  return fs.readdir(cachePath).then(
+    function (filenames) {
+      const lastChapter = filenames.filter(function (f) {
+        return f.endsWith(URL_SUFFIX);
+      })
+      .map(function (f) {
+        return [getChapterNumber(f), f];
+      })
+      .sort(function (a, b) {
+        return b[0] - a[0];
+      })
+      [0];
+
+      const lastChapterNumber = lastChapter[0];
+      const lastChapterUrlFile = lastChapter[1];
+
+      return fs.readFile(path.resolve(cachePath, lastChapterUrlFile), { encoding: "utf-8" }).then(function (url) {
+        return downloadAllChapters(url, lastChapterNumber, cachePath);
+      });
+    },
+    function (e) {
+      if (e.code === "ENOENT") {
+        return downloadAllChapters(startChapterUrl, 1, cachePath);
+      } else {
+        throw e;
+      }
+    }
+  );
+};
+
+function downloadAllChapters(startChapterUrl, startChapterNumber, cachePath) {
+  let currentChapter = startChapterUrl;
+  let chapterCounter = startChapterNumber;
+
+  return mkdirp(cachePath).then(loop);
+
+  function loop() {
+    const filename = `${FILENAME_PREFIX}${chapterCounter}.html`;
+
+    console.log(`Downloading ${currentChapter}`);
+
+    // Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/
+    const escapedUrl = encodeURI(currentChapter);
+
+    return retry(3, function () { return request(escapedUrl).redirects(10); }).then(function (response) {
+      console.log("- Response received");
+      return response.text();
+    })
+    .then(function (contents) {
+      console.log("- Response body received");
+      const rawChapterDoc = jsdom.jsdom(contents, { url: currentChapter });
+      console.log("- Response body parsed into DOM");
+
+      const chapterUrlToSave = currentChapter;
+      currentChapter = getNextChapterUrl(rawChapterDoc);
+      console.log("- Got next chapter URL");
+
+      // TODO: this should probably not be necessary... jsdom bug I guess!?
+      rawChapterDoc.defaultView.close();
+
+      return [
+        fs.writeFile(path.resolve(cachePath, filename), contents),
+        fs.writeFile(path.resolve(cachePath, urlFilename(filename)), chapterUrlToSave)
+      ];
+    })
+    .then(function () {
+      console.log("- Response text saved to cache file");
+      if (currentChapter === null) {
+        return;
+      }
+
+      ++chapterCounter;
+      return loop();
+    });
+  }
+}
+
+function getNextChapterUrl(rawChapterDoc) {
+  // a[title="Next Chapter"] doesn't always work. Two different pathologies:
+  // - https://parahumans.wordpress.com/2011/09/27/shell-4-2/
+  // - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
+  // So instead search for the first <a> within the main content area starting with "Next", trimmed.
+
+  const aEls = rawChapterDoc.querySelectorAll(".entry-content a");
+  for (let i = 0; i < aEls.length; ++i) {
+    if (aEls[i].textContent.trim().startsWith("Next")) {
+      return aEls[i].href;
+    }
+  }
+
+  return null;
+}
+
+function retry(times, fn) {
+  if (times === 0) {
+    return fn();
+  }
+
+  return fn().catch(function () {
+    return retry(times - 1, fn);
+  });
+}
+
+function urlFilename(filename) {
+  return `${filename}${URL_SUFFIX}`;
+}
+
+function getChapterNumber(filename) {
+  return Number(filename.substring(FILENAME_PREFIX.length, filename.indexOf(".html")));
+}
--- a/lib/worm-scraper.js
+++ b/lib/worm-scraper.js
@@ -3,24 +3,26 @@ const path = require("path");
 const fs = require("mz/fs");
 const mkdirp = require("mkdirp-then");
 const rimraf = require("rimraf-then");
-const request = require("requisition");
 const jsdom = require("jsdom");

+const download = require("./download.js");
+
 require("./track-rejections.js");

-const START_CHAPTER = "https://parahumans.wordpress.com/2011/06/11/1-1/";
+const START_CHAPTER_URL = "https://parahumans.wordpress.com/2011/06/11/1-1/";

 const cachePath = path.resolve("cache");
 const outPath = path.resolve("out");
 const contentPath = path.resolve(outPath, "OEBPS");

-const rawChapterDocsCache = new Map();
-
 rimraf(outPath)
  .then(function () {
    return mkdirp(contentPath);
  })
-  .then(getAllRawChapterDocs)
+  .then(function () {
+    return download(START_CHAPTER_URL, cachePath);
+  })
+  .then(readAllRawChapterDocs)
  .then(function (rawChapterDocs) {
    console.log("Extracting content into EPUB chapter files");
    return Promise.all(rawChapterDocs.map(function (rawChapterDoc, i) {
@@ -33,72 +35,18 @@ rimraf(outPath)
    console.log("All done!");
  });

-function getAllRawChapterDocs() {
-  return fs.readdir(cachePath).then(
-    function (filenames) {
-      return filenames.map(getRawChapterDoc);
-    },
-    function (e) {
-      if (e.code === "ENOENT") {
-        return downloadAllChapters();
-      } else {
-        throw e;
-      }
-    }
-  );
-}
-
-function downloadAllChapters() {
-  let currentChapter = START_CHAPTER;
-  let chapterCounter = 1;
-
-  return mkdirp(cachePath).then(loop).then(function () {
-    return toArray(rawChapterDocsCache.values());
-  });
-
-  function loop() {
-    const filename = `chapter${chapterCounter}.html`;
-
-    console.log(`Downloading ${currentChapter}`);
-
-    // Necessary for https://parahumans.wordpress.com/2011/10/11/interlude-3½-bonus/
-    const escapedUrl = encodeURI(currentChapter);
-
-    return request(escapedUrl).redirects(10).then(function (response) {
-      return response.text();
-    })
-    .then(function (contents) {
-      const rawChapterDoc = setRawChapterDoc(filename, contents);
-      currentChapter = getNextChapterUrl(rawChapterDoc);
-
-      return fs.writeFile(path.resolve(cachePath, filename), jsdom.serializeDocument(rawChapterDoc));
-    })
-    .then(function () {
-      if (currentChapter === null) {
-        return;
-      }
-
-      ++chapterCounter;
-      return loop();
-    });
-  }
-}
-
-function getRawChapterDoc(filename) {
-  const doc = rawChapterDocsCache.get(filename);
-  if (doc !== undefined) {
-    return Promise.resolve(doc);
-  }
-
-  return fs.readFile(path.resolve(cachePath, filename), { encoding: "utf-8" }).then(function (contents) {
-    return setRawChapterDoc(filename, contents);
+function readAllRawChapterDocs() {
+  return fs.readdir(cachePath).then(function (filenames) {
+    const htmlFiles = filenames.filter(function (f) { return f.endsWith(".html"); });
+    return Promise.all(htmlFiles.map(readRawChapterDoc));
  });
 }

-function setRawChapterDoc(filename, contents) {
-  const doc = jsdom.jsdom(contents);
-  rawChapterDocsCache.set(filename, doc);
-  return doc;
+function readRawChapterDoc(filename) {
+  const filePath = path.resolve(cachePath, filename);
+  return fs.readFile(filePath, { encoding: "utf-8" }).then(function (contents) {
+    return jsdom.jsdom(contents);
+  });
 }

 function getChapterString(rawChapterDoc) {
@@ -134,27 +82,3 @@ function cleanContentEl(el) {

  return el;
 }
-
-function getNextChapterUrl(rawChapterDoc) {
-  // a[title="Next Chapter"] doesn't always work. Two different pathologies:
-  // - https://parahumans.wordpress.com/2011/09/27/shell-4-2/
-  // - https://parahumans.wordpress.com/2012/04/21/sentinel-9-6/
-  // So instead search for the first <a> within the main content area starting with "Next", trimmed.
-
-  const aEls = rawChapterDoc.querySelectorAll(".entry-content a");
-  for (let i = 0; i < aEls.length; ++i) {
-    if (aEls[i].textContent.trim().startsWith("Next")) {
-      return aEls[i].href;
-    }
-  }
-
-  return null;
-}
-
-function toArray(iterable) {
-  const array = [];
-  for (const x of iterable) {
-    array.push(x);
-  }
-  return array;
-}