Better cleanup in the convert step
This commit is contained in:
parent
ace6a55be8
commit
247f713e13
3 changed files with 106 additions and 10 deletions
|
|
@ -4,6 +4,7 @@ const fs = require("mz/fs");
|
|||
const throat = require("throat");
|
||||
const serializeToXml = require("xmlserializer").serializeToString;
|
||||
const jsdom = require("./jsdom.js");
|
||||
const substitutions = require("./substitutions.json");
|
||||
|
||||
module.exports = function (cachePath, manifestPath, contentPath) {
|
||||
return fs.readFile(manifestPath, { encoding: "utf-8" }).then(function (manifestContents) {
|
||||
|
|
@ -28,7 +29,7 @@ function convertChapter(chapter, cachePath, contentPath) {
|
|||
return fs.readFile(filePath, { encoding: "utf-8" }).then(function (contents) {
|
||||
console.log(`- Read ${filename}`);
|
||||
const rawChapterDoc = jsdom(contents);
|
||||
const output = getChapterString(chapter.title, rawChapterDoc);
|
||||
const output = getChapterString(chapter, rawChapterDoc);
|
||||
|
||||
// TODO: this should probably not be necessary... jsdom bug I guess!?
|
||||
rawChapterDoc.defaultView.close();
|
||||
|
|
@ -42,21 +43,21 @@ function convertChapter(chapter, cachePath, contentPath) {
|
|||
});
|
||||
}
|
||||
|
||||
function getChapterString(title, rawChapterDoc) {
|
||||
const body = getBodyXml(title, rawChapterDoc.querySelector(".entry-content"));
|
||||
function getChapterString(chapter, rawChapterDoc) {
|
||||
const body = getBodyXml(chapter, rawChapterDoc.querySelector(".entry-content"));
|
||||
|
||||
return `<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8" />
|
||||
<title>${title}</title>
|
||||
<title>${chapter.title}</title>
|
||||
</head>
|
||||
${body}
|
||||
</html>`;
|
||||
}
|
||||
|
||||
function getBodyXml(title, contentEl) {
|
||||
function getBodyXml(chapter, contentEl) {
|
||||
// Remove initial Next Chapter and Previous Chapter <p>
|
||||
contentEl.removeChild(contentEl.firstElementChild);
|
||||
|
||||
|
|
@ -99,7 +100,7 @@ function getBodyXml(title, contentEl) {
|
|||
// Synthesize a <body> tag to serialize
|
||||
const bodyEl = contentEl.ownerDocument.createElement("body");
|
||||
const h1El = contentEl.ownerDocument.createElement("h1");
|
||||
h1El.textContent = title;
|
||||
h1El.textContent = chapter.title;
|
||||
|
||||
bodyEl.appendChild(h1El);
|
||||
while (contentEl.firstChild) {
|
||||
|
|
@ -111,8 +112,31 @@ function getBodyXml(title, contentEl) {
|
|||
// Fix recurring strange pattern of extra <br> in <p>...<em>...<br>\n</em></p>
|
||||
xml = xml.replace(/<br\/>\s*<\/em><\/p>/g, '</em></p>');
|
||||
|
||||
// Fix recurring poor closing quotes
|
||||
xml = xml.replace(/“<\/p>/g, "”</p>");
|
||||
|
||||
// Some fixes for dashes; not comprehensive
|
||||
xml = xml.replace(/“-/g, "“—");
|
||||
xml = xml.replace(/-”/g, "—”");
|
||||
|
||||
// There are way too many nonbreaking spaces where they don't belong.
|
||||
// If they show up three in a row, then let them live. Otherwise, they die.
|
||||
xml = xml.replace(/([^\xA0])\xA0\xA0?([^\xA0])/g, "$1 $2");
|
||||
|
||||
// One-off fixes
|
||||
xml = xml.replace(/truck reached<br\/>\nthe other Nine/, 'truck reached the other Nine');
|
||||
(substitutions[chapter.url] || []).forEach(function (substitution) {
|
||||
const indexOf = xml.indexOf(substitution.before);
|
||||
if (indexOf === -1) {
|
||||
throw new Error(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` +
|
||||
`updated at the source, in which case, you should edit substitutions.json.`);
|
||||
}
|
||||
if (indexOf !== xml.lastIndexOf(substitution.before)) {
|
||||
throw new Error(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` +
|
||||
`Update substitutions.json for a more precise substitution.`);
|
||||
}
|
||||
|
||||
xml = xml.replace(new RegExp(escapeRegExp(substitution.before)), substitution.after);
|
||||
});
|
||||
|
||||
// Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a <html>
|
||||
xml = xml.replace(/<body xmlns="http:\/\/www.w3.org\/1999\/xhtml">/, '<body>');
|
||||
|
|
@ -124,3 +148,7 @@ function isEmptyOrGarbage(el) {
|
|||
const text = el.textContent.trim();
|
||||
return text === "" || text.startsWith("Last Chapter") || text.startsWith("Next Chapter");
|
||||
}
|
||||
|
||||
/**
 * Backslash-escapes every regular-expression metacharacter in a string so the
 * result can be handed to `new RegExp(...)` and match the input literally.
 *
 * Escaped characters: - [ ] / { } ( ) * + ? . \ ^ $ |
 *
 * @param {string} str - Literal text to be matched verbatim.
 * @returns {string} Copy of `str` with each metacharacter preceded by a backslash.
 */
function escapeRegExp(str) {
  const metacharacters = /[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g;

  // A replacer function is used so the matched character is emitted verbatim
  // after the backslash, with no replacement-pattern interpretation involved.
  return str.replace(metacharacters, function (matched) {
    return "\\" + matched;
  });
}
|
||||
|
|
|
|||
68
lib/substitutions.json
Normal file
68
lib/substitutions.json
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
{
|
||||
"https://parahumans.wordpress.com/2012/09/11/prey-14-3/": [
|
||||
{
|
||||
"before": "truck reached<br/>\nthe other Nine",
|
||||
"after": "truck reached the other Nine"
|
||||
}
|
||||
],
|
||||
"https://parahumans.wordpress.com/2012/10/18/interlude-15-donation-bonus/": [
|
||||
{
|
||||
"before": "volunteered, <em>asked<br/>\n</em> to",
|
||||
"after": "volunteered, <em>asked</em> to"
|
||||
}
|
||||
],
|
||||
"https://parahumans.wordpress.com/2012/10/27/colony-15-4/": [
|
||||
{
|
||||
"before": "Victfor",
|
||||
"after": "Victor"
|
||||
}
|
||||
],
|
||||
"https://parahumans.wordpress.com/2012/10/30/colony-15-5/": [
|
||||
{
|
||||
"before": "wasn’t unfamiliar",
|
||||
"after": "was unfamiliar"
|
||||
}
|
||||
],
|
||||
"https://parahumans.wordpress.com/2012/11/06/colony-15-7/": [
|
||||
{
|
||||
"before": "this sort of resistance.",
|
||||
"after": "this sort of resistance?"
|
||||
}
|
||||
],
|
||||
"https://parahumans.wordpress.com/2012/11/27/monarch-16-2/": [
|
||||
{
|
||||
"before": "waseven <i>less</i>",
|
||||
"after": "was even <em>less</em>"
|
||||
}
|
||||
],
|
||||
"https://parahumans.wordpress.com/2012/12/04/monarch-16-4/": [
|
||||
{
|
||||
"before": "well, with Bitch’s civilian",
|
||||
"after": "well, while Bitch’s civilian"
|
||||
}
|
||||
],
|
||||
"https://parahumans.wordpress.com/2012/12/08/monarch-16-5/": [
|
||||
{
|
||||
"before": "and It was powerful enough",
|
||||
"after": "and it was powerful enough"
|
||||
}
|
||||
],
|
||||
"https://parahumans.wordpress.com/2012/12/11/monarch-16-6/": [
|
||||
{
|
||||
"before": "Lost in thought”",
|
||||
"after": "Lost in thought.”"
|
||||
}
|
||||
],
|
||||
"https://parahumans.wordpress.com/2012/12/29/monarch-16-11/": [
|
||||
{
|
||||
"before": "attemtps",
|
||||
"after": "attempts"
|
||||
}
|
||||
],
|
||||
"https://parahumans.wordpress.com/2013/01/01/monarch-16-12/": [
|
||||
{
|
||||
"before": "<em>Did they make her </em><em>smell like me?</em><span style=\"line-height:15px;\"> They had to have, to keep the dogs from barking distress. But how? Had Calvert had his men raid my stuff? Had he used my dirty laundry?</span>",
|
||||
"after": "<em>Did they make her smell like me?</em> They had to have, to keep the dogs from barking distress. But how? Had Calvert had his men raid my stuff? Had he used my dirty laundry?"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -27,9 +27,9 @@ Promise.resolve()
|
|||
// .then(function () {
|
||||
// return mkdirp(chaptersPath);
|
||||
// })
|
||||
// .then(function () {
|
||||
// return convert(cachePath, manifestPath, chaptersPath);
|
||||
// })
|
||||
.then(function () {
|
||||
return convert(cachePath, manifestPath, chaptersPath);
|
||||
})
|
||||
.then(function () {
|
||||
return extras(contentPath, chaptersPath, manifestPath);
|
||||
})
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue