From ba387d3555c7a9ec3ace14bacf7e4667462b1268 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Fri, 1 Jan 2021 16:24:33 -0500 Subject: [PATCH] Improve deletion of empty-ish elements The previous heuristic of replacing them with a space character caused spaces to be inserted in the middle of words. Also, various cases were missed. This should help. --- lib/convert-worker.js | 26 +++++++++++++++++++------- lib/substitutions.json | 41 +++++++++-------------------------------- 2 files changed, 28 insertions(+), 39 deletions(-) diff --git a/lib/convert-worker.js b/lib/convert-worker.js index f46312a..19a0cd4 100644 --- a/lib/convert-worker.js +++ b/lib/convert-worker.js @@ -77,13 +77,21 @@ function getBodyXML(chapter, book, contentEl) { } } - // Remove empty s and s - // Remove style attributes from them, as they're always messed up. - for (const em of contentEl.querySelectorAll("em, i")) { - if (em.textContent.trim() === "") { - em.replaceWith(contentEl.ownerDocument.createTextNode(" ")); + // Remove empty inline elements. + // Remove style attributes from inline elements, as they're always messed up. + for (const el of contentEl.querySelectorAll("em, i, strong, b")) { + const { textContent } = el; + + if (textContent === "") { + el.remove(); + } else if (textContent.trim() === "") { + if (el.childElementCount === 0) { + el.replaceWith(" "); + } else if (el.childElementCount === 1 && el.children[0].localName === "br") { + el.outerHTML = "
\n"; + } } else { - em.removeAttribute("style"); + el.removeAttribute("style"); } } @@ -221,9 +229,13 @@ function getBodyXML(chapter, book, contentEl) { xml = xml.replace(/(\s*)<\/strong>/g, "$1"); xml = xml.replace(/>(.*)<\/strong>:$1:<"); - // No need for line breaks before paragraph ends + // No need for line breaks before paragraph ends or after paragraph starts // These often occur with the
s inside /// fixed above. xml = xml.replace(/
\s*<\/p>/g, "

"); + xml = xml.replace(/


\s*/g, "

"); + + // This is another quote fix but it needs to happen after the line break deletion... so entangled, ugh. + xml = xml.replace(/<\/em>\s*“\s*<\/p>/g, "

"); // Fix missing spaces after commas xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/g, "$1, $2"); diff --git a/lib/substitutions.json b/lib/substitutions.json index 38aa1c0..30f027e 100644 --- a/lib/substitutions.json +++ b/lib/substitutions.json @@ -1713,12 +1713,6 @@ "after": "Dinah being kidnapped, and leaving" } ], - "https://parahumans.wordpress.com/2012/05/22/infestation-11-2/": [ - { - "before": "attentio n", - "after": "attention" - } - ], "https://parahumans.wordpress.com/2012/05/26/infestation-11-3/": [ { "before": "intimidating: A sea", @@ -1943,7 +1937,7 @@ "after": "

Crazed, kooky, cracked, crazy,
\nMental, dotty, whacked, loopy…

" }, { - "before": "

Crazed, kooky, cracked, crazy,
\nNutty, screwy, mentally diseased…
\n
She ", + "before": "

Crazed, kooky, cracked, crazy,
\nNutty, screwy, mentally diseased…
\n
\n
She ", "after": "

Crazed, kooky, cracked, crazy,
\nNutty, screwy, mentally diseased…

\n

She " }, { @@ -2765,11 +2759,6 @@ { "before": "stranger class", "after": "stranger-class" - }, - { - "before": "real ly", - "after": "really", - "comment": "There's an empty em element in the middle of this word that gets converted to a space" } ], "https://parahumans.wordpress.com/2013/02/05/monarch-18-6/": [ @@ -3284,10 +3273,6 @@ { "before": "in,” I said. “Could", "after": "in,” I said, “could" - }, - { - "before": " “ We’re", - "after": "“We’re" } ], "https://parahumans.wordpress.com/2013/05/16/cell-22-6/": [ @@ -3317,10 +3302,6 @@ "before": "it,” he said. “She", "after": "it,” he said, “she" }, - { - "before": "“ I don’t", - "after": "“I don’t" - }, { "before": "confirmed okay", "after": "confirmed ok", @@ -3579,6 +3560,10 @@ { "before": "avoided-", "after": "avoided" + }, + { + "before": "Lord Walston", + "after": "Lord Walston" } ], "https://parahumans.wordpress.com/2013/07/18/sting-26-1/": [ @@ -3649,8 +3634,8 @@ ], "https://parahumans.wordpress.com/2013/07/30/sting-26-5/": [ { - "before": " Wait…", - "after": " Wait…" + "before": " Wait…“

", + "after": " Wait…”

" }, { "before": "This,” Imp said. “Is", @@ -3758,10 +3743,6 @@ { "before": "shift position

", "after": "shift position.

" - }, - { - "before": "“ Convenient.", - "after": "“Convenient." } ], "https://parahumans.wordpress.com/2013/08/20/extinction-27-3/": [ @@ -3770,8 +3751,8 @@ "after": "guess,” Sophia said, “you" }, { - "before": "said “ But", - "after": "said. “But" + "before": "“Maybe,” I said “But", + "after": "“Maybe,” I said. “But" } ], "https://parahumans.wordpress.com/2013/08/24/extinction-27-5/": [ @@ -3954,10 +3935,6 @@ { "before": "Once the bead was in place, every bullet hit.", "after": "Once the bead was in place, every bullet hit." - }, - { - "before": "I ‘m", - "after": "I’m" } ], "https://parahumans.wordpress.com/2013/09/24/venom-29-3/": [