Merge pull request #7 from dkasak/unescape-html-entities
Unescape HTML entities prior to parsing HTML.
This commit is contained in:
commit
808f652d5f
2 changed files with 51 additions and 0 deletions
|
|
@ -32,6 +32,9 @@ try:
|
|||
except ImportError:
|
||||
from html.parser import HTMLParser
|
||||
|
||||
import html
|
||||
from html.entities import name2codepoint
|
||||
|
||||
FormattedString = namedtuple('FormattedString', ['text', 'attributes'])
|
||||
|
||||
quote_wrapper = textwrap.TextWrapper(
|
||||
|
|
@ -314,6 +317,21 @@ class MatrixHtmlParser(HTMLParser):
|
|||
self.substrings = [] # type: List[FormattedString]
|
||||
self.attributes = DEFAULT_ATRIBUTES.copy()
|
||||
|
||||
def unescape(self, text):
    """Decode HTML entities in ``text`` on both Python 2 and Python 3.

    ``html.unescape`` is tried first. If that attempt raises
    ``AttributeError`` (the function is not available on Python 2), the
    ``HTMLParser.unescape`` instance method, deprecated in Python 3, is
    used instead.
    """
    try:
        decoded = html.unescape(text)
    except AttributeError:
        # No usable html.unescape: fall back to the parser's own method.
        decoded = HTMLParser.unescape(self, text)
    return decoded
|
||||
|
||||
def feed(self, text):
    """Unescape HTML entities in ``text`` and pass it on to the parser.

    The text is run through ``self.unescape`` before being handed to
    ``HTMLParser.feed``, whose return value is returned unchanged.

    NOTE(review): because unescaping happens before parsing, escaped
    markup such as ``&lt;b&gt;`` reaches the parser as a real tag --
    confirm this is the intended behaviour.
    """
    return HTMLParser.feed(self, self.unescape(text))
|
||||
|
||||
def _toggle_attribute(self, attribute):
|
||||
if self.text:
|
||||
self.substrings.append(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue