Improve deletion of empty-ish elements

The previous heuristic of replacing them with a space character caused spaces to be inserted in the middle of words. Also, various cases were missed. This should help.
This commit is contained in:
Domenic Denicola 2021-01-01 16:24:33 -05:00
commit ba387d3555
2 changed files with 28 additions and 39 deletions

View file

@@ -77,13 +77,21 @@ function getBodyXML(chapter, book, contentEl) {
}
}
// Remove empty <em>s and <i>s
// Remove style attributes from them, as they're always messed up.
for (const em of contentEl.querySelectorAll("em, i")) {
if (em.textContent.trim() === "") {
em.replaceWith(contentEl.ownerDocument.createTextNode(" "));
// Remove empty inline elements.
// Remove style attributes from inline elements, as they're always messed up.
for (const el of contentEl.querySelectorAll("em, i, strong, b")) {
const { textContent } = el;
if (textContent === "") {
el.remove();
} else if (textContent.trim() === "") {
if (el.childElementCount === 0) {
el.replaceWith(" ");
} else if (el.childElementCount === 1 && el.children[0].localName === "br") {
el.outerHTML = "<br />\n";
}
} else {
em.removeAttribute("style");
el.removeAttribute("style");
}
}
@@ -221,9 +229,13 @@ function getBodyXML(chapter, book, contentEl) {
xml = xml.replace(/(\s*)<\/strong>/g, "</strong>$1");
xml = xml.replace(/><strong>(.*)<\/strong>:</g, "><strong>$1:</strong><");
// No need for line breaks before paragraph ends
// No need for line breaks before paragraph ends or after paragraph starts
// These often occur with the <br>s inside <b>/<strong>/<em>/<i> fixed above.
xml = xml.replace(/<br \/>\s*<\/p>/g, "</p>");
xml = xml.replace(/<p><br \/>\s*/g, "<p>");
// This is another quote fix but it needs to happen after the line break deletion... so entangled, ugh.
xml = xml.replace(/<\/em>\s*“\s*<\/p>/g, "</em>”</p>");
// Fix missing spaces after commas
xml = xml.replace(/([a-zA-Z]+),([a-zA-Z]+)/g, "$1, $2");