Remove more non-breaking spaces

Also normalizes after-sentence spaces to two (normal) spaces, but that's not visible to readers.
This commit is contained in:
Domenic Denicola 2020-12-19 15:55:38 -05:00
commit 442d245e2d
2 changed files with 19 additions and 7 deletions

View file

@@ -140,11 +140,14 @@ function getBodyXML(chapter, book, contentEl) {
xml = xml.replace(/<i>([^ ]+)<\/i>/g, "<em>$1</em>");
xml = xml.replace(/<i>([^ ]+)( +)<\/i>/g, "<em>$1</em>$2");
// There are way too many nonbreaking spaces where they don't belong.
// If they show up three in a row, then let them live. Otherwise, they die.
// Also remove any run of them after a period.
xml = xml.replace(/([^\xA0])\xA0\xA0?([^\xA0])/g, "$1 $2");
xml = xml.replace(/\.\x20*\xA0[\xA0\x20]*/, ". ");
// There are way too many nonbreaking spaces where they don't belong. If they show up three in a row, then let them
// live; they're maybe being used for alignment or something. Otherwise, they die.
//
// Also, normalize spaces after a period/quote mark to two (normal) spaces. The second one is invisible when
// rendered, but it helps future heuristics detect end of sentences.
xml = xml.replace(/\xA0{1,2}(?!\x20\xA0)/g, " ");
xml = xml.replace(/([.”])\x20*\xA0[\xA0\x20]*/g, "$1  ");
xml = xml.replace(/([.”])\x20{3,}/g, "$1  ");
function fixEms() {
// Fix recurring broken-up or erroneous <em>s