Merge pull request #7 from dkasak/unescape-html-entities

Unescape HTML entities prior to parsing HTML.
2018-03-19 15:21:10 +01:00 · 2018-03-19 15:21:10 +01:00 · 808f652d5f
commit 808f652d5f
parent c967731c0f 4be54d032e
2 changed files with 51 additions and 0 deletions
--- a/matrix/colors.py
+++ b/matrix/colors.py
@ -32,6 +32,9 @@ try:
 except ImportError:
    from html.parser import HTMLParser
 import html
 from html.entities import name2codepoint
 FormattedString = namedtuple('FormattedString', ['text', 'attributes'])
 quote_wrapper = textwrap.TextWrapper(
@ -314,6 +317,21 @@ class MatrixHtmlParser(HTMLParser):
        self.substrings = []  # type: List[FormattedString]
        self.attributes = DEFAULT_ATRIBUTES.copy()
    def unescape(self, text):
        """Decode HTML entities in ``text`` on both Python 2 and Python 3.

        ``html.unescape`` doesn't exist in Python 2, while the
        ``HTMLParser.unescape`` instance method was deprecated in Python 3,
        so the module-level function is used whenever it is available and
        the instance method only serves as the fallback.
        """
        module_unescape = getattr(html, "unescape", None)
        if module_unescape is None:
            # No html.unescape on this interpreter: use the parser's method.
            return HTMLParser.unescape(self, text)
        return module_unescape(text)
    def feed(self, text):
        """Unescape HTML entities in ``text`` and hand the result to the parser.

        Returns whatever ``HTMLParser.feed`` returns.
        """
        # NOTE(review): entities are decoded *before* parsing, so escaped
        # markup such as ``&lt;b&gt;`` reaches the parser as ``<b>`` --
        # confirm this is the intended behaviour.
        decoded = self.unescape(text)
        return HTMLParser.feed(self, decoded)
    def _toggle_attribute(self, attribute):
        if self.text:
            self.substrings.append(
--- a/tests/http_parser_test.py
+++ b/tests/http_parser_test.py
@ -0,0 +1,33 @@
 import html.entities
 from hypothesis import given
 from hypothesis.strategies import sampled_from
 from matrix.colors import MatrixHtmlParser
 # Build (name, character, codepoint) triples used to parametrize the tests.
 try:
     # python 3: html.entities.html5 maps entity names to their characters.
     # Only the names that do not already carry a trailing ';' are kept.
     html_entities = [
         (entity, glyph, ord(glyph))
         for entity, glyph in html.entities.html5.items()
         if not entity.endswith(';')
     ]
 except AttributeError:
     # python 2: no html5 table, so derive the character from the codepoint.
     html_entities = [
         (entity, unichr(number), number)
         for entity, number in html.entities.name2codepoint.items()
     ]
@given(sampled_from(html_entities))
def test_html_named_entity_parsing(entitydef):
    """A named reference ``&name;`` must unescape to the entity's character."""
    # entitydef is one (name, character, codepoint) triple from html_entities.
    name, character = entitydef[:2]
    reference = '&{};'.format(name)
    assert MatrixHtmlParser().unescape(reference) == character
@given(sampled_from(html_entities))
def test_html_numeric_reference_parsing(entitydef):
    """A numeric reference ``&#N;`` must unescape to the entity's character."""
    # entitydef is one (name, character, codepoint) triple from html_entities.
    character, num = entitydef[1:3]
    reference = '&#{};'.format(num)
    assert MatrixHtmlParser().unescape(reference) == character