Merge pull request #7 from dkasak/unescape-html-entities
Unescape HTML entities prior to parsing HTML.
This commit is contained in:
commit
808f652d5f
2 changed files with 51 additions and 0 deletions
|
@ -32,6 +32,9 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
import html
|
||||||
|
from html.entities import name2codepoint
|
||||||
|
|
||||||
FormattedString = namedtuple('FormattedString', ['text', 'attributes'])
|
FormattedString = namedtuple('FormattedString', ['text', 'attributes'])
|
||||||
|
|
||||||
quote_wrapper = textwrap.TextWrapper(
|
quote_wrapper = textwrap.TextWrapper(
|
||||||
|
@ -314,6 +317,21 @@ class MatrixHtmlParser(HTMLParser):
|
||||||
self.substrings = [] # type: List[FormattedString]
|
self.substrings = [] # type: List[FormattedString]
|
||||||
self.attributes = DEFAULT_ATRIBUTES.copy()
|
self.attributes = DEFAULT_ATRIBUTES.copy()
|
||||||
|
|
||||||
|
def unescape(self, text):
    """Shim to unescape HTML in both Python 2 and 3.

    The instance method was deprecated in Python 3 and html.unescape
    doesn't exist in Python 2 so this is needed.

    Args:
        text: String that may contain HTML character references.

    Returns:
        The result of ``html.unescape(text)`` when that function is
        available, otherwise the result of ``HTMLParser.unescape``.
    """
    # Only the lookup is guarded, so an error raised while unescaping is
    # not mistaken for "html.unescape is unavailable".
    try:
        unescape_function = html.unescape
    except (AttributeError, NameError):
        # AttributeError: an ``html`` module without ``unescape``.
        # NameError: ``html`` is unbound; the import appears to live in the
        # Python 3 branch of the module's import fallback -- TODO confirm.
        return HTMLParser.unescape(self, text)
    return unescape_function(text)
|
||||||
|
|
||||||
|
def feed(self, text):
    """Unescape HTML character references in ``text``, then parse it.

    The unescaped string is handed to ``HTMLParser.feed`` and that call's
    result is returned.
    """
    # NOTE(review): unescaping before parsing means escaped markup such as
    # "&lt;b&gt;" reaches the parser as a real tag -- confirm this is intended.
    return HTMLParser.feed(self, self.unescape(text))
|
||||||
|
|
||||||
def _toggle_attribute(self, attribute):
|
def _toggle_attribute(self, attribute):
|
||||||
if self.text:
|
if self.text:
|
||||||
self.substrings.append(
|
self.substrings.append(
|
||||||
|
|
33
tests/http_parser_test.py
Normal file
33
tests/http_parser_test.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import html.entities
|
||||||
|
|
||||||
|
from hypothesis import given
|
||||||
|
from hypothesis.strategies import sampled_from
|
||||||
|
|
||||||
|
from matrix.colors import MatrixHtmlParser
|
||||||
|
|
||||||
|
# Build (name, character, codepoint) triples used as test samples.
try:
    # Python 3: take the html5 table entries whose name has no trailing ';'.
    html_entities = [
        (entity, glyph, ord(glyph))
        for entity, glyph in html.entities.html5.items()
        if not entity.endswith(';')
    ]
except AttributeError:
    # Python 2: no html5 table, so derive the triples from name2codepoint.
    html_entities = [
        (entity, unichr(number), number)
        for entity, number in html.entities.name2codepoint.items()
    ]
|
||||||
|
|
||||||
|
|
||||||
|
@given(sampled_from(html_entities))
def test_html_named_entity_parsing(entitydef):
    """Check that a named reference ``&name;`` unescapes to its character."""
    entity_name, expected = entitydef[0], entitydef[1]
    reference = '&{};'.format(entity_name)
    assert MatrixHtmlParser().unescape(reference) == expected
|
||||||
|
|
||||||
|
|
||||||
|
@given(sampled_from(html_entities))
def test_html_numeric_reference_parsing(entitydef):
    """Check that a decimal reference ``&#N;`` unescapes to its character."""
    expected, codepoint = entitydef[1], entitydef[2]
    reference = '&#{};'.format(codepoint)
    assert MatrixHtmlParser().unescape(reference) == expected
|
Loading…
Add table
Reference in a new issue