Better half-markdown (#202)

Better half-markdown.

This fixes some issues with the current semi-Markdown parser, to make life easier until a full Markdown parser is implemented.

Changes:
- A * that would normally start italics but isn't matched by a closing * is now left alone (kept as a literal character).
- A ` that would normally start inline code but isn't matched by a closing ` is now left alone (kept as a literal character).
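- Backslash escapes now work as expected: a backslash makes the following character literal (IRC-native formatting characters are the exception and cannot be escaped).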
- Support for **bold** and the alternative _italic_ style.
This commit is contained in:
Tom Smeding 2020-05-31 14:09:29 +02:00 committed by GitHub
parent 0ce5b65835
commit 170c5811a3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 132 additions and 79 deletions

View file

@ -88,66 +88,63 @@ class Formatted(object):
substrings = [] # type: List[FormattedString] substrings = [] # type: List[FormattedString]
attributes = DEFAULT_ATTRIBUTES.copy() attributes = DEFAULT_ATTRIBUTES.copy()
def last_match_index(regex, subject, offset_in_match):
matches = list(re.finditer(regex, subject))
return matches[-1].span()[0] + offset_in_match if matches else -1
wrappers = {
"**": {
"key": "bold",
"last_index": last_match_index(r"\S\*\*", line, 1),
},
"*": {
"key": "italic",
"last_index": last_match_index(r"\S\*($|[^*])", line, 1),
},
"_": {
"key": "italic",
"last_index": last_match_index(r"\S_", line, 1),
},
}
wrapper_init_chars = set(k[0] for k in wrappers.keys())
wrapper_max_len = max(len(k) for k in wrappers.keys())
irc_toggles = {
"\x02": "bold",
"\x1D": "italic",
"\x1F": "underline",
}
last_backtick = line.rfind("`")
i = 0 i = 0
while i < len(line): while i < len(line):
# Bold # Markdown escape
if line[i] == "\x02" and not attributes["code"]: # NOTE: IRC-native formatting characters are not escaped
if i + 1 < len(line) and line[i] == "\\" \
and line[i + 1] not in "\x02\x03\x0F\x1D\x1F":
text += line[i + 1]
i = i + 2
# IRC bold/italic/underline
elif line[i] in irc_toggles and not attributes["code"]:
if text: if text:
substrings.append(FormattedString(text, attributes.copy())) substrings.append(FormattedString(text, attributes.copy()))
text = "" text = ""
attributes["bold"] = not attributes["bold"] key = irc_toggles[line[i]]
attributes[key] = not attributes[key]
i = i + 1 i = i + 1
# Markdown inline code # IRC reset
elif line[i] == "`": elif line[i] == "\x0F" and not attributes["code"]:
if text:
# strip leading and trailing spaces and compress consecutive
# spaces in inline code blocks
if attributes["code"]:
text = text.strip()
text = re.sub(r"\s+", " ", text)
substrings.append(
FormattedString(text, attributes.copy())
)
text = ""
attributes["code"] = not attributes["code"]
i = i + 1
# Markdown emphasis
elif line[i] == "*" and not attributes["code"]:
if attributes["italic"] and not line[i - 1].isspace():
if text:
substrings.append(
FormattedString(text, attributes.copy())
)
text = ""
attributes["italic"] = not attributes["italic"]
i = i + 1
continue
elif attributes["italic"] and line[i - 1].isspace():
text = text + line[i]
i = i + 1
continue
elif i + 1 < len(line) and line[i + 1].isspace():
text = text + line[i]
i = i + 1
continue
elif i == len(line) - 1:
text = text + line[i]
i = i + 1
continue
if text: if text:
substrings.append(FormattedString(text, attributes.copy())) substrings.append(FormattedString(text, attributes.copy()))
text = "" text = ""
attributes["italic"] = not attributes["italic"] # Reset all the attributes
attributes = DEFAULT_ATTRIBUTES.copy()
i = i + 1 i = i + 1
# Color # IRC color
elif line[i] == "\x03" and not attributes["code"]: elif line[i] == "\x03" and not attributes["code"]:
if text: if text:
substrings.append(FormattedString(text, attributes.copy())) substrings.append(FormattedString(text, attributes.copy()))
@ -185,37 +182,73 @@ class Formatted(object):
attributes["bgcolor"] = color_line_to_weechat(color_string) attributes["bgcolor"] = color_line_to_weechat(color_string)
else: else:
attributes["bgcolor"] = None attributes["bgcolor"] = None
# Reset
elif line[i] == "\x0F" and not attributes["code"]: # Markdown inline code
elif line[i] == "`" and (attributes["code"] or last_backtick > i):
if text: if text:
substrings.append(FormattedString(text, attributes.copy())) # strip leading and trailing spaces and compress consecutive
# spaces in inline code blocks
if attributes["code"]:
text = text.strip()
text = re.sub(r"\s+", " ", text)
substrings.append(
FormattedString(text, attributes.copy())
)
text = "" text = ""
# Reset all the attributes attributes["code"] = not attributes["code"]
attributes = DEFAULT_ATTRIBUTES.copy()
i = i + 1 i = i + 1
# Italic # Markdown wrapper (emphasis/bold)
elif line[i] == "\x1D" and not attributes["code"]: elif line[i] in wrapper_init_chars and not attributes["code"]:
if text: for l in range(wrapper_max_len, 0, -1):
substrings.append(FormattedString(text, attributes.copy())) if i + l <= len(line) and line[i : i + l] in wrappers:
text = "" descriptor = wrappers[line[i : i + l]]
attributes["italic"] = not attributes["italic"]
i = i + 1
# Underline if attributes[descriptor["key"]]:
elif line[i] == "\x1F" and not attributes["code"]: # Can only turn off if preceded by non-whitespace
if text: if not line[i - 1].isspace():
substrings.append(FormattedString(text, attributes.copy())) if text:
text = "" substrings.append(
attributes["underline"] = not attributes["underline"] FormattedString(text, attributes.copy()))
i = i + 1 text = ""
attributes[descriptor["key"]] = False
i = i + l
else:
text = text + line[i : i + l]
i = i + l
# Must have a chance of closing this, and be followed
# by non-whitespace
elif descriptor["last_index"] >= i + l and \
not line[i + l].isspace():
if text:
substrings.append(
FormattedString(text, attributes.copy()))
text = ""
attributes[descriptor["key"]] = True
i = i + l
else:
text = text + line[i : i + l]
i = i + l
break
else:
# No wrapper matched here (NOTE: cannot happen if "*" and
# "_" are both in wrappers, but for completeness' sake)
text = text + line[i]
i = i + 1
# Normal text # Normal text
else: else:
text = text + line[i] text = text + line[i]
i = i + 1 i = i + 1
substrings.append(FormattedString(text, attributes)) if text:
substrings.append(FormattedString(text, attributes))
return cls(substrings) return cls(substrings)
@classmethod @classmethod

View file

@ -5,7 +5,7 @@ from __future__ import unicode_literals
import webcolors import webcolors
from collections import OrderedDict from collections import OrderedDict
from hypothesis import given from hypothesis import given
from hypothesis.strategies import sampled_from, text from hypothesis.strategies import sampled_from, text, characters
from matrix.colors import (G, Formatted, FormattedString, from matrix.colors import (G, Formatted, FormattedString,
color_html_to_weechat, color_weechat_to_html) color_html_to_weechat, color_weechat_to_html)
@ -58,15 +58,16 @@ def test_normalize_spaces_in_inline_code():
assert formatted.to_weechat() == valid_result assert formatted.to_weechat() == valid_result
# FIXME: this case doesn't and can't work yet (until a proper Markdown parser @given(
# is integrated) text(alphabet=characters(min_codepoint=32,
# @given(text().map(lambda s: '*' + s) blacklist_characters="*_"))
# def test_unpaired_prefix_asterisk_without_space_is_literal(text): .map(lambda s: '*' + s))
# """An unpaired asterisk at the beginning of the line, without a space def test_unpaired_prefix_asterisk_without_space_is_literal(text):
# after it, is considered literal. """An unpaired asterisk at the beginning of the line, without a space
# """ after it, is considered literal.
# formatted = Formatted.from_input_line(text) """
# assert text == formatted.to_weechat() formatted = Formatted.from_input_line(text)
assert text.strip() == formatted.to_weechat()
def test_input_line_color(): def test_input_line_color():
@ -79,7 +80,7 @@ def test_input_line_bold():
assert "\x1b[01mHello\x1b[021m" == formatted.to_weechat() assert "\x1b[01mHello\x1b[021m" == formatted.to_weechat()
assert "<strong>Hello</strong>" == formatted.to_html() assert "<strong>Hello</strong>" == formatted.to_html()
def test_input_line_bold(): def test_input_line_underline():
formatted = Formatted.from_input_line("\x1FHello") formatted = Formatted.from_input_line("\x1FHello")
assert "\x1b[04mHello\x1b[024m" == formatted.to_weechat() assert "\x1b[04mHello\x1b[024m" == formatted.to_weechat()
assert "<u>Hello</u>" == formatted.to_html() assert "<u>Hello</u>" == formatted.to_html()
@ -89,6 +90,25 @@ def test_input_line_markdown_emph():
assert "\x1b[03mHello\x1b[023m" == formatted.to_weechat() assert "\x1b[03mHello\x1b[023m" == formatted.to_weechat()
assert "<em>Hello</em>" == formatted.to_html() assert "<em>Hello</em>" == formatted.to_html()
def test_input_line_markdown_bold():
formatted = Formatted.from_input_line("**Hello**")
assert "\x1b[01mHello\x1b[021m" == formatted.to_weechat()
assert "<strong>Hello</strong>" == formatted.to_html()
def test_input_line_markdown_various():
inp = "**bold* bold *bital etc* bold **bold** * *italic*"
formatted = Formatted.from_input_line(inp)
assert "<strong>bold* bold </strong>" \
"<em><strong>bital etc</strong></em><strong> bold **bold</strong>" \
" * <em>italic</em>" \
== formatted.to_html()
def test_input_line_markdown_various2():
inp = "norm** `code **code *code` norm `norm"
formatted = Formatted.from_input_line(inp)
assert "norm** <code>code **code *code</code> norm `norm" \
== formatted.to_html()
def test_conversion(): def test_conversion():
formatted = Formatted.from_input_line("*Hello*") formatted = Formatted.from_input_line("*Hello*")
formatted2 = Formatted.from_html(formatted.to_html()) formatted2 = Formatted.from_html(formatted.to_html())