Merge pull request #7 from dkasak/unescape-html-entities

Unescape HTML entities prior to parsing HTML.
This commit is contained in:
poljar 2018-03-19 15:21:10 +01:00 committed by GitHub
commit 808f652d5f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 51 additions and 0 deletions

View file

@ -32,6 +32,9 @@ try:
except ImportError:
from html.parser import HTMLParser
import html
from html.entities import name2codepoint
FormattedString = namedtuple('FormattedString', ['text', 'attributes'])
quote_wrapper = textwrap.TextWrapper(
@ -314,6 +317,21 @@ class MatrixHtmlParser(HTMLParser):
self.substrings = [] # type: List[FormattedString]
self.attributes = DEFAULT_ATRIBUTES.copy()
def unescape(self, text):
    """Shim to unescape HTML in both Python 2 and 3.

    The instance method was deprecated in Python 3 and html.unescape
    doesn't exist in Python 2, so the module-level function is tried
    first and the parser's own method is used as the fallback.

    Args:
        text: String that may contain HTML character references.

    Returns:
        The text with character references replaced by the characters
        they stand for.
    """
    try:
        # Guard only the lookup, so an error raised while actually
        # unescaping is not mistaken for "function unavailable".
        unescape_text = html.unescape
    except (NameError, AttributeError):
        # NameError: the `html` module appears to be imported only on the
        # Python 3 import path, leaving the name unbound on Python 2.
        # AttributeError: an `html` module that lacks `unescape`.
        return HTMLParser.unescape(self, text)

    return unescape_text(text)
def feed(self, text):
    """Feed *text* to the underlying HTMLParser after unescaping it.

    Character references in *text* are resolved through ``self.unescape``
    first; the resulting string is what the base parser actually sees.

    Returns:
        Whatever ``HTMLParser.feed`` returns for the unescaped text.
    """
    return HTMLParser.feed(self, self.unescape(text))
def _toggle_attribute(self, attribute):
if self.text:
self.substrings.append(

33
tests/http_parser_test.py Normal file
View file

@ -0,0 +1,33 @@
import html.entities
from hypothesis import given
from hypothesis.strategies import sampled_from
from matrix.colors import MatrixHtmlParser
# Table of (entity name, character, codepoint) triples that drives the
# property tests below.
html_entities = []
try:
    # Python 3: take the HTML5 table, skipping every key that ends with
    # ';' so only the semicolon-less names remain.
    for _name, _char in html.entities.html5.items():
        if _name.endswith(';'):
            continue
        html_entities.append((_name, _char, ord(_char)))
except AttributeError:
    # Python 2: no `html5` table, so build the triples from
    # name2codepoint instead.
    html_entities = [
        (_name, unichr(_codepoint), _codepoint)
        for _name, _codepoint in html.entities.name2codepoint.items()
    ]
@given(sampled_from(html_entities))
def test_html_named_entity_parsing(entitydef):
    """A named reference (``&name;``) unescapes to the entity's character."""
    entity_name, expected = entitydef[:2]
    reference = '&{};'.format(entity_name)
    assert MatrixHtmlParser().unescape(reference) == expected
@given(sampled_from(html_entities))
def test_html_numeric_reference_parsing(entitydef):
    """A decimal reference (``&#N;``) unescapes to the entity's character."""
    _, expected, codepoint = entitydef
    reference = '&#{};'.format(codepoint)
    assert MatrixHtmlParser().unescape(reference) == expected