From eea893682228299b9d90c69f52eb9075612d8708 Mon Sep 17 00:00:00 2001
From: Denis Kasak <dkasak@termina.org.uk>
Date: Sat, 17 Mar 2018 17:52:17 +0100
Subject: [PATCH 1/3] Unescape all HTML entities prior to parsing.

---
 matrix/colors.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/matrix/colors.py b/matrix/colors.py
index 0c080ff..247c8f0 100644
--- a/matrix/colors.py
+++ b/matrix/colors.py
@@ -32,6 +32,8 @@ try:
 except ImportError:
     from html.parser import HTMLParser
 
+from html.entities import name2codepoint
+
 FormattedString = namedtuple('FormattedString', ['text', 'attributes'])
 
 quote_wrapper = textwrap.TextWrapper(
@@ -314,6 +316,10 @@ class MatrixHtmlParser(HTMLParser):
         self.substrings = []  # type: List[FormattedString]
         self.attributes = DEFAULT_ATRIBUTES.copy()
 
+    def feed(self, text):
+        text = self.unescape(text)
+        return HTMLParser.feed(self, text)
+
     def _toggle_attribute(self, attribute):
         if self.text:
             self.substrings.append(

From 0a868b80bbf5cb5dc74adccf68f17b305ada6c0b Mon Sep 17 00:00:00 2001
From: Denis Kasak <dkasak@termina.org.uk>
Date: Mon, 19 Mar 2018 09:37:52 +0100
Subject: [PATCH 2/3] Add HTML character reference parsing test.

---
 tests/http_parser_test.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 tests/http_parser_test.py

diff --git a/tests/http_parser_test.py b/tests/http_parser_test.py
new file mode 100644
index 0000000..e58903a
--- /dev/null
+++ b/tests/http_parser_test.py
@@ -0,0 +1,33 @@
+import html.entities
+
+from hypothesis import given
+from hypothesis.strategies import sampled_from
+
+from matrix.colors import MatrixHtmlParser
+
+try:
+    # python 3
+    html_entities = [(name, char, ord(char))
+                     for name, char in html.entities.html5.items()
+                     if not name.endswith(';')]
+except AttributeError:
+    # python 2
+    html_entities = [(name, unichr(codepoint), codepoint)
+                     for name, codepoint
+                     in html.entities.name2codepoint.items()]
+
+
+@given(sampled_from(html_entities))
+def test_html_named_entity_parsing(entitydef):
+    name = entitydef[0]
+    character = entitydef[1]
+    parser = MatrixHtmlParser()
+    assert parser.unescape('&{};'.format(name)) == character
+
+
+@given(sampled_from(html_entities))
+def test_html_numeric_reference_parsing(entitydef):
+    character = entitydef[1]
+    num = entitydef[2]
+    parser = MatrixHtmlParser()
+    assert parser.unescape('&#{};'.format(num)) == character

From 4be54d032ef93d2be4612fca5026ca6251d3040f Mon Sep 17 00:00:00 2001
From: Denis Kasak <dkasak@termina.org.uk>
Date: Mon, 19 Mar 2018 11:00:05 +0100
Subject: [PATCH 3/3] Implement MatrixHtmlParser.unescape shim.

This is for Python 2/3 compatibility, since Python 3 deprecates the
instance method and Python 2 doesn't have html.unescape.

---
 matrix/colors.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/matrix/colors.py b/matrix/colors.py
index 247c8f0..fc6d3bb 100644
--- a/matrix/colors.py
+++ b/matrix/colors.py
@@ -32,6 +32,7 @@ try:
 except ImportError:
     from html.parser import HTMLParser
 
+import html
 from html.entities import name2codepoint
 
 FormattedString = namedtuple('FormattedString', ['text', 'attributes'])
@@ -316,6 +317,17 @@ class MatrixHtmlParser(HTMLParser):
         self.substrings = []  # type: List[FormattedString]
         self.attributes = DEFAULT_ATRIBUTES.copy()
 
+    def unescape(self, text):
+        """Shim to unescape HTML in both Python 2 and 3.
+
+        The instance method was deprecated in Python 3 and html.unescape
+        doesn't exist in Python 2 so this is needed.
+        """
+        try:
+            return html.unescape(text)
+        except AttributeError:
+            return HTMLParser.unescape(self, text)
+
     def feed(self, text):
         text = self.unescape(text)
         return HTMLParser.feed(self, text)