Better half-markdown (#202)

Better half-markdown.

This fixes some issues with the current semi-markdown-parser to make life easier until a full markdown parser is implemented.

Changes:
- A * that would normally start italics but isn't matched by a closing * is now left alone.
- A ` that would normally start a code block but isn't matched by a closing ` is now left alone.
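- Backslash escapes now work as expected: a backslash makes the following character literal (IRC-native formatting characters are not escaped).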
- Support for **bold** and the alternative _italic_ style.
This commit is contained in:
Tom Smeding 2020-05-31 14:09:29 +02:00 committed by GitHub
parent 0ce5b65835
commit 170c5811a3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 132 additions and 79 deletions

View file

@ -88,66 +88,63 @@ class Formatted(object):
substrings = [] # type: List[FormattedString]
attributes = DEFAULT_ATTRIBUTES.copy()
def last_match_index(regex, subject, offset_in_match):
matches = list(re.finditer(regex, subject))
return matches[-1].span()[0] + offset_in_match if matches else -1
wrappers = {
"**": {
"key": "bold",
"last_index": last_match_index(r"\S\*\*", line, 1),
},
"*": {
"key": "italic",
"last_index": last_match_index(r"\S\*($|[^*])", line, 1),
},
"_": {
"key": "italic",
"last_index": last_match_index(r"\S_", line, 1),
},
}
wrapper_init_chars = set(k[0] for k in wrappers.keys())
wrapper_max_len = max(len(k) for k in wrappers.keys())
irc_toggles = {
"\x02": "bold",
"\x1D": "italic",
"\x1F": "underline",
}
last_backtick = line.rfind("`")
i = 0
while i < len(line):
# Bold
if line[i] == "\x02" and not attributes["code"]:
# Markdown escape
# NOTE: IRC-native formatting characters are not escaped
if i + 1 < len(line) and line[i] == "\\" \
and line[i + 1] not in "\x02\x03\x0F\x1D\x1F":
text += line[i + 1]
i = i + 2
# IRC bold/italic/underline
elif line[i] in irc_toggles and not attributes["code"]:
if text:
substrings.append(FormattedString(text, attributes.copy()))
text = ""
attributes["bold"] = not attributes["bold"]
key = irc_toggles[line[i]]
attributes[key] = not attributes[key]
i = i + 1
# Markdown inline code
elif line[i] == "`":
if text:
# strip leading and trailing spaces and compress consecutive
# spaces in inline code blocks
if attributes["code"]:
text = text.strip()
text = re.sub(r"\s+", " ", text)
substrings.append(
FormattedString(text, attributes.copy())
)
text = ""
attributes["code"] = not attributes["code"]
i = i + 1
# Markdown emphasis
elif line[i] == "*" and not attributes["code"]:
if attributes["italic"] and not line[i - 1].isspace():
if text:
substrings.append(
FormattedString(text, attributes.copy())
)
text = ""
attributes["italic"] = not attributes["italic"]
i = i + 1
continue
elif attributes["italic"] and line[i - 1].isspace():
text = text + line[i]
i = i + 1
continue
elif i + 1 < len(line) and line[i + 1].isspace():
text = text + line[i]
i = i + 1
continue
elif i == len(line) - 1:
text = text + line[i]
i = i + 1
continue
# IRC reset
elif line[i] == "\x0F" and not attributes["code"]:
if text:
substrings.append(FormattedString(text, attributes.copy()))
text = ""
attributes["italic"] = not attributes["italic"]
# Reset all the attributes
attributes = DEFAULT_ATTRIBUTES.copy()
i = i + 1
# Color
# IRC color
elif line[i] == "\x03" and not attributes["code"]:
if text:
substrings.append(FormattedString(text, attributes.copy()))
@ -185,37 +182,73 @@ class Formatted(object):
attributes["bgcolor"] = color_line_to_weechat(color_string)
else:
attributes["bgcolor"] = None
# Reset
elif line[i] == "\x0F" and not attributes["code"]:
# Markdown inline code
elif line[i] == "`" and (attributes["code"] or last_backtick > i):
if text:
substrings.append(FormattedString(text, attributes.copy()))
# strip leading and trailing spaces and compress consecutive
# spaces in inline code blocks
if attributes["code"]:
text = text.strip()
text = re.sub(r"\s+", " ", text)
substrings.append(
FormattedString(text, attributes.copy())
)
text = ""
# Reset all the attributes
attributes = DEFAULT_ATTRIBUTES.copy()
attributes["code"] = not attributes["code"]
i = i + 1
# Italic
elif line[i] == "\x1D" and not attributes["code"]:
if text:
substrings.append(FormattedString(text, attributes.copy()))
text = ""
attributes["italic"] = not attributes["italic"]
i = i + 1
# Markdown wrapper (emphasis/bold)
elif line[i] in wrapper_init_chars and not attributes["code"]:
for l in range(wrapper_max_len, 0, -1):
if i + l <= len(line) and line[i : i + l] in wrappers:
descriptor = wrappers[line[i : i + l]]
# Underline
elif line[i] == "\x1F" and not attributes["code"]:
if text:
substrings.append(FormattedString(text, attributes.copy()))
text = ""
attributes["underline"] = not attributes["underline"]
i = i + 1
if attributes[descriptor["key"]]:
# Can only turn off if preceded by non-whitespace
if not line[i - 1].isspace():
if text:
substrings.append(
FormattedString(text, attributes.copy()))
text = ""
attributes[descriptor["key"]] = False
i = i + l
else:
text = text + line[i : i + l]
i = i + l
# Must have a chance of closing this, and be followed
# by non-whitespace
elif descriptor["last_index"] >= i + l and \
not line[i + l].isspace():
if text:
substrings.append(
FormattedString(text, attributes.copy()))
text = ""
attributes[descriptor["key"]] = True
i = i + l
else:
text = text + line[i : i + l]
i = i + l
break
else:
# No wrapper matched here (NOTE: cannot happen if "*" and
# "_" are both in wrappers, but for completeness' sake)
text = text + line[i]
i = i + 1
# Normal text
else:
text = text + line[i]
i = i + 1
substrings.append(FormattedString(text, attributes))
if text:
substrings.append(FormattedString(text, attributes))
return cls(substrings)
@classmethod

View file

@ -5,7 +5,7 @@ from __future__ import unicode_literals
import webcolors
from collections import OrderedDict
from hypothesis import given
from hypothesis.strategies import sampled_from, text
from hypothesis.strategies import sampled_from, text, characters
from matrix.colors import (G, Formatted, FormattedString,
color_html_to_weechat, color_weechat_to_html)
@ -58,15 +58,16 @@ def test_normalize_spaces_in_inline_code():
assert formatted.to_weechat() == valid_result
# FIXME: this case doesn't and can't work yet (until a proper Markdown parser
# is integrated)
# @given(text().map(lambda s: '*' + s)
# def test_unpaired_prefix_asterisk_without_space_is_literal(text):
# """An unpaired asterisk at the beginning of the line, without a space
# after it, is considered literal.
# """
# formatted = Formatted.from_input_line(text)
# assert text == formatted.to_weechat()
@given(
    text(alphabet=characters(min_codepoint=32, blacklist_characters="*_"))
    .map(lambda s: '*' + s))
def test_unpaired_prefix_asterisk_without_space_is_literal(text):
    """A leading asterisk with no possible closing marker stays literal.

    The generated line starts with ``*`` and contains no further ``*`` or
    ``_``, so nothing can close the emphasis. The rendered output is compared
    against the whitespace-stripped input line.
    """
    rendered = Formatted.from_input_line(text).to_weechat()
    assert text.strip() == rendered
def test_input_line_color():
@ -79,7 +80,7 @@ def test_input_line_bold():
assert "\x1b[01mHello\x1b[021m" == formatted.to_weechat()
assert "<strong>Hello</strong>" == formatted.to_html()
def test_input_line_bold():
def test_input_line_underline():
formatted = Formatted.from_input_line("\x1FHello")
assert "\x1b[04mHello\x1b[024m" == formatted.to_weechat()
assert "<u>Hello</u>" == formatted.to_html()
@ -89,6 +90,25 @@ def test_input_line_markdown_emph():
assert "\x1b[03mHello\x1b[023m" == formatted.to_weechat()
assert "<em>Hello</em>" == formatted.to_html()
def test_input_line_markdown_bold():
    """``**Hello**`` renders as bold in both the WeeChat and HTML output."""
    rendered = Formatted.from_input_line("**Hello**")

    expected_weechat = "\x1b[01mHello\x1b[021m"
    expected_html = "<strong>Hello</strong>"

    assert expected_weechat == rendered.to_weechat()
    assert expected_html == rendered.to_html()
def test_input_line_markdown_various():
    """Check the HTML for a line mixing paired and unpaired ``*``/``**``."""
    source_line = "**bold* bold *bital etc* bold **bold** * *italic*"
    expected_html = (
        "<strong>bold* bold </strong>"
        "<em><strong>bital etc</strong></em><strong> bold **bold</strong>"
        " * <em>italic</em>"
    )

    rendered = Formatted.from_input_line(source_line)
    assert expected_html == rendered.to_html()
def test_input_line_markdown_various2():
    """Check the HTML for a line with inline code and an unpaired backtick.

    The expected output keeps ``**`` and ``*`` literal inside the code span
    and leaves the trailing unmatched backtick as plain text.
    """
    source_line = "norm** `code **code *code` norm `norm"
    expected_html = "norm** <code>code **code *code</code> norm `norm"

    rendered = Formatted.from_input_line(source_line)
    assert expected_html == rendered.to_html()
def test_conversion():
formatted = Formatted.from_input_line("*Hello*")
formatted2 = Formatted.from_html(formatted.to_html())