diff --git a/matrix/colors.py b/matrix/colors.py
index 0c080ff..fc6d3bb 100644
--- a/matrix/colors.py
+++ b/matrix/colors.py
@@ -32,6 +32,9 @@ try:
 except ImportError:
     from html.parser import HTMLParser
 
+import html
+from html.entities import name2codepoint
+
 FormattedString = namedtuple('FormattedString', ['text', 'attributes'])
 
 quote_wrapper = textwrap.TextWrapper(
@@ -314,6 +317,30 @@ class MatrixHtmlParser(HTMLParser):
         self.substrings = []  # type: List[FormattedString]
         self.attributes = DEFAULT_ATRIBUTES.copy()
 
+    def unescape(self, text):
+        """Shim to unescape HTML in both Python 2 and 3.
+
+        The instance method was deprecated in Python 3 and html.unescape
+        doesn't exist in Python 2 so this is needed.
+        """
+        try:
+            return html.unescape(text)
+        except AttributeError:
+            return HTMLParser.unescape(self, text)
+
+    def handle_charref(self, name):
+        """Forward a numeric reference (``&#NN;``) as decoded text."""
+        self.handle_entityref("#" + name)
+
+    def handle_entityref(self, name):
+        """Forward a named reference (``&name;``) as decoded text.
+
+        References are decoded one at a time, after tokenizing, so that
+        escaped markup such as ``&lt;b&gt;`` stays literal text instead
+        of being re-parsed as a tag.
+        """
+        self.handle_data(self.unescape("&{};".format(name)))
+
     def _toggle_attribute(self, attribute):
         if self.text:
             self.substrings.append(
diff --git a/tests/http_parser_test.py b/tests/http_parser_test.py
new file mode 100644
index 0000000..e58903a
--- /dev/null
+++ b/tests/http_parser_test.py
@@ -0,0 +1,35 @@
+import html.entities
+
+from hypothesis import given
+from hypothesis.strategies import sampled_from
+
+from matrix.colors import MatrixHtmlParser
+
+try:
+    # python 3
+    html_entities = [(name, char, ord(char))
+                     for name, char in html.entities.html5.items()
+                     if not name.endswith(';')]
+except AttributeError:
+    # python 2
+    html_entities = [(name, unichr(codepoint), codepoint)
+                     for name, codepoint
+                     in html.entities.name2codepoint.items()]
+
+
+@given(sampled_from(html_entities))
+def test_html_named_entity_parsing(entitydef):
+    """A named reference (``&name;``) unescapes to its character."""
+    name = entitydef[0]
+    character = entitydef[1]
+    parser = MatrixHtmlParser()
+    assert parser.unescape('&{};'.format(name)) == character
+
+
+@given(sampled_from(html_entities))
+def test_html_numeric_reference_parsing(entitydef):
+    """A numeric reference (``&#NN;``) unescapes to its character."""
+    character = entitydef[1]
+    num = entitydef[2]
+    parser = MatrixHtmlParser()
+    assert parser.unescape('&#{};'.format(num)) == character