Remove more non-breaking spaces

Also normalizes the spacing after the end of a sentence to two (normal) spaces, although that change is not visible to readers.
This commit is contained in:
Domenic Denicola 2020-12-19 15:55:38 -05:00
commit 442d245e2d
2 changed files with 19 additions and 7 deletions

View file

@ -140,11 +140,14 @@ function getBodyXML(chapter, book, contentEl) {
xml = xml.replace(/<i>([^ ]+)<\/i>/g, "<em>$1</em>");
xml = xml.replace(/<i>([^ ]+)( +)<\/i>/g, "<em>$1</em>$2");
// There are way too many nonbreaking spaces where they don't belong.
// If they show up three in a row, then let them live. Otherwise, they die.
// Also remove any run of them after a period.
xml = xml.replace(/([^\xA0])\xA0\xA0?([^\xA0])/g, "$1 $2");
xml = xml.replace(/\.\x20*\xA0[\xA0\x20]*/, ". ");
// There are way too many nonbreaking spaces where they don't belong. If they show up three in a row, then let them
// live; they may be there for alignment or a similar purpose. Otherwise, they die.
//
// Also, normalize spaces after a period/quote mark to two (normal) spaces. The second one is invisible when
// rendered, but it helps future heuristics detect end of sentences.
xml = xml.replace(/\xA0{1,2}(?!\x20\xA0)/g, " ");
xml = xml.replace(/([.”])\x20*\xA0[\xA0\x20]*/g, "$1 ");
xml = xml.replace(/([.”])\x20{3,}/g, "$1 ");
function fixEms() {
// Fix recurring broken-up or erroneous <em>s

View file

@ -1200,6 +1200,11 @@
{
"before": "of her head, “And my",
"after": "of her head. “And my"
},
{
"before": "KOOROW BULLIT<br />\nMILK         STUMPY<br />\nBROOTUS JOODUS<br />\nAXIL GINGIR",
"after": "KOOROW\u00A0\u00A0\u00A0BULLIT<br />\nMILK\u00A0\u00A0\u00A0\u00A0\u00A0\u00A0\u00A0\u00A0\u00A0\u00A0\u00A0\u00A0\u00A0\u00A0STUMPY<br />\nBROOTUS\u00A0\u00A0JOODUS<br />\nAXIL\u00A0\u00A0\u00A0\u00A0\u00A0GINGIR",
"_comment": "This section plays poorly with our space-normalizing heuristic."
}
],
"https://parahumans.wordpress.com/2012/03/31/interlude-8/": [
@ -2925,6 +2930,10 @@
{
"before": "and be brought it",
"after": "and he brought it"
},
{
"before": "propellers  One caught her",
"after": "propellers. One caught her"
}
],
"https://parahumans.wordpress.com/2013/03/02/scourge-19-4/": [
@ -6569,7 +6578,7 @@
"after": "morning breath—more than morning breath—but there"
},
{
"before": "<p><strong><a href=\"https://www.parahumans.net/2019/03/26/heavens-12-none/\">Previous Chapter</a>                                              <a href=\"https://www.parahumans.net/2019/04/02/black-13-1/\">Next Chapter</a></strong></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>",
"before": "<p><strong><a href=\"https://www.parahumans.net/2019/03/26/heavens-12-none/\">Previous Chapter</a>                                                                               <a href=\"https://www.parahumans.net/2019/04/02/black-13-1/\">Next Chapter</a></strong></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>\n<p></p>",
"after": "<div style=\"page-break-after: always;\">&#160;</div>\n<div style=\"page-break-after: always;\">&#160;</div>",
"_comment": "This is the best way I can think of to emulate the end of chapter 'fake out' in an ebook format"
},
@ -6596,7 +6605,7 @@
],
"https://www.parahumans.net/2019/04/02/black-13-1/": [
{
"before": "<p style=\"text-align: center;\">⊙</p>\n<p><strong><a href=\"https://www.parahumans.net/2019/03/29/heavens-12-x/\">Previous Chapter</a>                                              <a href=\"https://www.parahumans.net/2019/04/05/black-13-2/\">Next Chapter</a></strong></p>\n",
"before": "<p style=\"text-align: center;\">⊙</p>\n<p><strong><a href=\"https://www.parahumans.net/2019/03/29/heavens-12-x/\">Previous Chapter</a>                                                                               <a href=\"https://www.parahumans.net/2019/04/05/black-13-2/\">Next Chapter</a></strong></p>\n",
"after": "",
"_comment": "Our usual heuristics of removing the first paragraph to remove the previous/next chapter links are broken here because of the 'go back and look at the fake out' comment at the top"
},