Improve deletion of empty-ish elements

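The previous heuristic replaced each empty-ish element with a space character, which caused spaces to be inserted in the middle of words. It also missed various cases, such as <strong>/<b> elements and elements containing only a <br>. Now, truly empty elements are removed outright, whitespace-only elements with no child elements become a single space, and whitespace-only elements wrapping a lone <br> are replaced by that <br>.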
This commit is contained in:
Domenic Denicola 2021-01-01 16:24:33 -05:00
commit ba387d3555
2 changed files with 28 additions and 39 deletions

View file

@ -77,13 +77,21 @@ function getBodyXML(chapter, book, contentEl) {
}
}
// Remove empty <em>s and <i>s
// Remove style attributes from them, as they're always messed up.
for (const em of contentEl.querySelectorAll("em, i")) {
if (em.textContent.trim() === "") {
em.replaceWith(contentEl.ownerDocument.createTextNode(" "));
// Remove empty inline elements.
// Remove style attributes from inline elements, as they're always messed up.
for (const el of contentEl.querySelectorAll("em, i, strong, b")) {
const { textContent } = el;
if (textContent === "") {
el.remove();
} else if (textContent.trim() === "") {
if (el.childElementCount === 0) {
el.replaceWith(" ");
} else if (el.childElementCount === 1 && el.children[0].localName === "br") {
el.outerHTML = "<br />\n";
}
} else {
em.removeAttribute("style");
el.removeAttribute("style");
}
}
@ -221,9 +229,13 @@ function getBodyXML(chapter, book, contentEl) {
xml = xml.replace(/(\s*)<\/strong>/g, "</strong>$1");
xml = xml.replace(/><strong>(.*)<\/strong>:</g, "><strong>$1:</strong><");
// No need for line breaks before paragraph ends
// No need for line breaks before paragraph ends or after paragraph starts
// These often occur with the <br>s inside <b>/<strong>/<em>/<i> fixed above.
xml = xml.replace(/<br \/>\s*<\/p>/g, "</p>");
xml = xml.replace(/<p><br \/>\s*/g, "<p>");
// This is another quote fix but it needs to happen after the line break deletion... so entangled, ugh.
xml = xml.replace(/<\/em>\s*“\s*<\/p>/g, "</em>”</p>");
// Fix missing spaces after commas
xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/g, "$1, $2");

View file

@ -1713,12 +1713,6 @@
"after": "Dinah being kidnapped, and leaving"
}
],
"https://parahumans.wordpress.com/2012/05/22/infestation-11-2/": [
{
"before": "attentio n",
"after": "attention"
}
],
"https://parahumans.wordpress.com/2012/05/26/infestation-11-3/": [
{
"before": "intimidating: A sea",
@ -1943,7 +1937,7 @@
"after": "<p><i>Crazed, kooky, cracked, crazy,<br />\nMental, dotty, whacked, loopy…</i></p>"
},
{
"before": "<p><em>Crazed, kooky, cracked, crazy,<br />\n<em>Nutty, screwy, mentally diseased…</em><br />\n</em> She ",
"before": "<p><em>Crazed, kooky, cracked, crazy,<br />\n<em>Nutty, screwy, mentally diseased…</em><br />\n<br />\n</em>She ",
"after": "<p><i>Crazed, kooky, cracked, crazy,<br />\nNutty, screwy, mentally diseased…</i></p>\n<p>She "
},
{
@ -2765,11 +2759,6 @@
{
"before": "stranger class",
"after": "stranger-class"
},
{
"before": "real ly",
"after": "really",
"comment": "There's an empty em element in the middle of this word that gets converted to a space"
}
],
"https://parahumans.wordpress.com/2013/02/05/monarch-18-6/": [
@ -3284,10 +3273,6 @@
{
"before": "in,” I said. “Could",
"after": "in,” I said, “could"
},
{
"before": " “ <em>Were",
"after": "“<em>Were"
}
],
"https://parahumans.wordpress.com/2013/05/16/cell-22-6/": [
@ -3317,10 +3302,6 @@
"before": "it,” he said. “She",
"after": "it,” he said, “she"
},
{
"before": "“ I dont",
"after": "“I dont"
},
{
"before": "confirmed okay",
"after": "confirmed ok",
@ -3579,6 +3560,10 @@
{
"before": "avoided-",
"after": "avoided"
},
{
"before": "<span style=\"text-decoration:underline;\"><strong>Lord</strong></span><span style=\"text-decoration:underline;\"> <strong>Walston</strong></span>",
"after": "<span style=\"text-decoration:underline;\"><strong>Lord Walston</strong></span>"
}
],
"https://parahumans.wordpress.com/2013/07/18/sting-26-1/": [
@ -3649,8 +3634,8 @@
],
"https://parahumans.wordpress.com/2013/07/30/sting-26-5/": [
{
"before": "</em> Wait…",
"after": " Wait…</em>"
"before": "</em> Wait…“</p>",
"after": " Wait…</em>”</p>"
},
{
"before": "This,” Imp said. “Is",
@ -3758,10 +3743,6 @@
{
"before": "shift position</p>",
"after": "shift position.</p>"
},
{
"before": "“ Convenient.",
"after": "“Convenient."
}
],
"https://parahumans.wordpress.com/2013/08/20/extinction-27-3/": [
@ -3770,8 +3751,8 @@
"after": "guess,” Sophia said, “you"
},
{
"before": "said “ But",
"after": "said. “But"
"before": "“Maybe,” I said “But",
"after": "“Maybe,” I said. “But"
}
],
"https://parahumans.wordpress.com/2013/08/24/extinction-27-5/": [
@ -3954,10 +3935,6 @@
{
"before": "<em>Once the bead was in place, every</em> bullet hit.",
"after": "Once the bead was in place, every bullet hit."
},
{
"before": "I m",
"after": "Im"
}
],
"https://parahumans.wordpress.com/2013/09/24/venom-29-3/": [