diff --git a/glymur/jp2box.py b/glymur/jp2box.py index 971818f..578ef36 100644 --- a/glymur/jp2box.py +++ b/glymur/jp2box.py @@ -2656,10 +2656,20 @@ class XMLBox(Jp2kBox): warnings.warn(msg, UserWarning) # Strip out any trailing nulls, as they can foul up XML parsing. + # Remove any byte order markers. text = text.rstrip(chr(0)) + if u'\ufeff' in text: + msg = 'An illegal BOM (byte order marker) was detected and ' + msg += 'removed from the XML contents in the box starting at byte ' + msg += 'offset {0}'.format(offset) + warnings.warn(msg) + text = text.replace(u'\ufeff', '') + # Remove any encoding declaration. + if text.startswith('<?xml version="1.0" encoding="utf-8"?>'): + text = text[38:] try: - elt = ET.fromstring(text.encode('utf-8')) + elt = ET.fromstring(text) xml = ET.ElementTree(elt) except ET.ParseError as err: msg = 'A problem was encountered while parsing an XML box:' diff --git a/glymur/test/test_jp2box_xml.py b/glymur/test/test_jp2box_xml.py index 8650895..ff589f9 100644 --- a/glymur/test/test_jp2box_xml.py +++ b/glymur/test/test_jp2box_xml.py @@ -94,24 +94,6 @@ class TestXML(unittest.TestCase): def tearDown(self): os.unlink(self.xmlfile) - @unittest.skipIf(OPJ_DATA_ROOT is None, - "OPJ_DATA_ROOT environment variable not set") - def test_invalid_utf8(self): - """Bad byte sequence that cannot be parsed.""" - filename = opj_data_file(os.path.join('input', - 'nonregression', - '26ccf3651020967f7778238ef5af08af.SIGFPE.d25.527.jp2')) - if sys.hexversion < 0x03000000: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - jp2 = Jp2k(filename) - else: - with self.assertWarns(UserWarning): - jp2 = Jp2k(filename) - - self.assertIsNone(jp2.box[3].box[1].box[1].xml) - - def test_negative_file_and_xml(self): """The XML should come from only one source.""" xml_object = ET.parse(self.xmlfile) @@ -305,3 +287,38 @@ class TestBadButRecoverableXmlFile(unittest.TestCase): b'this is a test') +class TestXML_OpjDataRoot(unittest.TestCase): + """Test suite for XML boxes, requires 
OPJ_DATA_ROOT.""" + + def test_bom(self): + """Byte order markers are illegal in UTF-8. Issue 185""" + filename = opj_data_file(os.path.join('input', + 'nonregression', + 'issue171.jp2')) + if sys.hexversion < 0x03000000: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + jp2 = Jp2k(filename) + else: + with self.assertWarns(UserWarning): + jp2 = Jp2k(filename) + self.assertIsNotNone(jp2.box[3].xml) + + + def test_invalid_utf8(self): + """Bad byte sequence that cannot be parsed.""" + filename = opj_data_file(os.path.join('input', + 'nonregression', + '26ccf3651020967f7778238ef5af08af.SIGFPE.d25.527.jp2')) + if sys.hexversion < 0x03000000: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + jp2 = Jp2k(filename) + else: + with self.assertWarns(UserWarning): + jp2 = Jp2k(filename) + + self.assertIsNone(jp2.box[3].box[1].box[1].xml) + + +