Merge pull request #7 from dkasak/unescape-html-entities

Unescape HTML entities prior to parsing HTML.
2018-03-19 15:21:10 +01:00 · 2018-03-19 15:21:10 +01:00 · 808f652d5f
commit 808f652d5f
parent c967731c0f 4be54d032e
2 changed files with 51 additions and 0 deletions
--- a/matrix/colors.py
+++ b/matrix/colors.py
@@ -32,6 +32,9 @@ try:
 except ImportError:
    from html.parser import HTMLParser

+import html
+from html.entities import name2codepoint
+
 FormattedString = namedtuple('FormattedString', ['text', 'attributes'])

 quote_wrapper = textwrap.TextWrapper(
@@ -314,6 +317,21 @@ class MatrixHtmlParser(HTMLParser):
        self.substrings = []  # type: List[FormattedString]
        self.attributes = DEFAULT_ATRIBUTES.copy()

+    def unescape(self, text):
+        """Shim to unescape HTML in both Python 2 and 3.
+
+        The instance method was deprecated in Python 3 and html.unescape
+        doesn't exist in Python 2 so this is needed.
+        """
+        try:
+            return html.unescape(text)
+        except AttributeError:
+            return HTMLParser.unescape(self, text)
+
+    def feed(self, text):
+        text = self.unescape(text)
+        return HTMLParser.feed(self, text)
+
    def _toggle_attribute(self, attribute):
        if self.text:
            self.substrings.append(
--- a/tests/http_parser_test.py
+++ b/tests/http_parser_test.py
@@ -0,0 +1,33 @@
+import html.entities
+
+from hypothesis import given
+from hypothesis.strategies import sampled_from
+
+from matrix.colors import MatrixHtmlParser
+
+try:
+    # python 3
+    html_entities = [(name, char, ord(char))
+                     for name, char in html.entities.html5.items()
+                     if not name.endswith(';')]
+except AttributeError:
+    # python 2
+    html_entities = [(name, unichr(codepoint), codepoint)
+                     for name, codepoint
+                     in html.entities.name2codepoint.items()]
+
+
+@given(sampled_from(html_entities))
+def test_html_named_entity_parsing(entitydef):
+    name = entitydef[0]
+    character = entitydef[1]
+    parser = MatrixHtmlParser()
+    assert parser.unescape('&{};'.format(name)) == character
+
+
+@given(sampled_from(html_entities))
+def test_html_numeric_reference_parsing(entitydef):
+    character = entitydef[1]
+    num = entitydef[2]
+    parser = MatrixHtmlParser()
+    assert parser.unescape('&#{};'.format(num)) == character