From 170c5811a3d78d1861a1f57141e81cf9918c39d7 Mon Sep 17 00:00:00 2001 From: Tom Smeding Date: Sun, 31 May 2020 14:09:29 +0200 Subject: [PATCH] Better half-markdown (#202) Better half-markdown. This fixes some issues with the current semi-markdown-parser to make life easier until a full markdown parser is implemented. Changes: - A * that would normally start italics but isn't matched by a closing *, is now left alone. - A ` that would normally start a code block but isn't matched by a closing `, is now left alone. - Backslashes should work as expected. - Support for **bold** and the alternative _italic_ style. --- matrix/colors.py | 169 ++++++++++++++++++++++++++------------------ tests/color_test.py | 42 ++++++++--- 2 files changed, 132 insertions(+), 79 deletions(-) diff --git a/matrix/colors.py b/matrix/colors.py index a0d02ad..86f7faf 100644 --- a/matrix/colors.py +++ b/matrix/colors.py @@ -88,66 +88,63 @@ class Formatted(object): substrings = [] # type: List[FormattedString] attributes = DEFAULT_ATTRIBUTES.copy() + def last_match_index(regex, subject, offset_in_match): + matches = list(re.finditer(regex, subject)) + return matches[-1].span()[0] + offset_in_match if matches else -1 + + wrappers = { + "**": { + "key": "bold", + "last_index": last_match_index(r"\S\*\*", line, 1), + }, + "*": { + "key": "italic", + "last_index": last_match_index(r"\S\*($|[^*])", line, 1), + }, + "_": { + "key": "italic", + "last_index": last_match_index(r"\S_", line, 1), + }, + } + wrapper_init_chars = set(k[0] for k in wrappers.keys()) + wrapper_max_len = max(len(k) for k in wrappers.keys()) + + irc_toggles = { + "\x02": "bold", + "\x1D": "italic", + "\x1F": "underline", + } + + last_backtick = line.rfind("`") + i = 0 while i < len(line): - # Bold - if line[i] == "\x02" and not attributes["code"]: + # Markdown escape + # NOTE: IRC-native formatting characters are not escaped + if i + 1 < len(line) and line[i] == "\\" \ + and line[i + 1] not in "\x02\x03\x0F\x1D\x1F": + text += line[i + 1] + i = i + 2 + + # IRC bold/italic/underline + elif line[i] in irc_toggles and not attributes["code"]: if text: substrings.append(FormattedString(text, attributes.copy())) text = "" - attributes["bold"] = not attributes["bold"] + key = irc_toggles[line[i]] + attributes[key] = not attributes[key] i = i + 1 - # Markdown inline code - elif line[i] == "`": - if text: - # strip leading and trailing spaces and compress consecutive - # spaces in inline code blocks - if attributes["code"]: - text = text.strip() - text = re.sub(r"\s+", " ", text) - - substrings.append( - FormattedString(text, attributes.copy()) - ) - text = "" - attributes["code"] = not attributes["code"] - i = i + 1 - - # Markdown emphasis - elif line[i] == "*" and not attributes["code"]: - if attributes["italic"] and not line[i - 1].isspace(): - if text: - substrings.append( - FormattedString(text, attributes.copy()) - ) - text = "" - attributes["italic"] = not attributes["italic"] - i = i + 1 - continue - - elif attributes["italic"] and line[i - 1].isspace(): - text = text + line[i] - i = i + 1 - continue - - elif i + 1 < len(line) and line[i + 1].isspace(): - text = text + line[i] - i = i + 1 - continue - - elif i == len(line) - 1: - text = text + line[i] - i = i + 1 - continue - + # IRC reset + elif line[i] == "\x0F" and not attributes["code"]: if text: substrings.append(FormattedString(text, attributes.copy())) text = "" - attributes["italic"] = not attributes["italic"] + # Reset all the attributes + attributes = DEFAULT_ATTRIBUTES.copy() i = i + 1 - # Color + # IRC color elif line[i] == "\x03" and not attributes["code"]: if text: substrings.append(FormattedString(text, attributes.copy())) @@ -185,37 +182,73 @@ class Formatted(object): attributes["bgcolor"] = color_line_to_weechat(color_string) else: attributes["bgcolor"] = None - # Reset - elif line[i] == "\x0F" and not attributes["code"]: + + # Markdown inline code + elif line[i] == "`" and (attributes["code"] or last_backtick > i): if text: - substrings.append(FormattedString(text, attributes.copy())) + # strip leading and trailing spaces and compress consecutive + # spaces in inline code blocks + if attributes["code"]: + text = text.strip() + text = re.sub(r"\s+", " ", text) + + substrings.append( + FormattedString(text, attributes.copy()) + ) text = "" - # Reset all the attributes - attributes = DEFAULT_ATTRIBUTES.copy() + attributes["code"] = not attributes["code"] i = i + 1 - # Italic - elif line[i] == "\x1D" and not attributes["code"]: - if text: - substrings.append(FormattedString(text, attributes.copy())) - text = "" - attributes["italic"] = not attributes["italic"] - i = i + 1 + # Markdown wrapper (emphasis/bold) + elif line[i] in wrapper_init_chars and not attributes["code"]: + for l in range(wrapper_max_len, 0, -1): + if i + l <= len(line) and line[i : i + l] in wrappers: + descriptor = wrappers[line[i : i + l]] - # Underline - elif line[i] == "\x1F" and not attributes["code"]: - if text: - substrings.append(FormattedString(text, attributes.copy())) - text = "" - attributes["underline"] = not attributes["underline"] - i = i + 1 + if attributes[descriptor["key"]]: + # Can only turn off if preceded by non-whitespace + if not line[i - 1].isspace(): + if text: + substrings.append( + FormattedString(text, attributes.copy())) + text = "" + attributes[descriptor["key"]] = False + i = i + l + else: + text = text + line[i : i + l] + i = i + l + + # Must have a chance of closing this, and be followed + # by non-whitespace + elif descriptor["last_index"] >= i + l and \ + not line[i + l].isspace(): + if text: + substrings.append( + FormattedString(text, attributes.copy())) + text = "" + attributes[descriptor["key"]] = True + i = i + l + + else: + text = text + line[i : i + l] + i = i + l + + break + + else: + # No wrapper matched here (NOTE: cannot happen if "*" and + # "_" are both in wrappers, but for completeness' sake) + text = text + line[i] + i = i + 1 # Normal text else: text = text + line[i] i = i + 1 - substrings.append(FormattedString(text, attributes)) + if text: + substrings.append(FormattedString(text, attributes)) + return cls(substrings) @classmethod diff --git a/tests/color_test.py b/tests/color_test.py index b51b196..9ede04b 100644 --- a/tests/color_test.py +++ b/tests/color_test.py @@ -5,7 +5,7 @@ from __future__ import unicode_literals import webcolors from collections import OrderedDict from hypothesis import given -from hypothesis.strategies import sampled_from, text +from hypothesis.strategies import sampled_from, text, characters from matrix.colors import (G, Formatted, FormattedString, color_html_to_weechat, color_weechat_to_html) @@ -58,15 +58,16 @@ def test_normalize_spaces_in_inline_code(): assert formatted.to_weechat() == valid_result -# FIXME: this case doesn't and can't work yet (until a proper Markdown parser -# is integrated) -# @given(text().map(lambda s: '*' + s) -# def test_unpaired_prefix_asterisk_without_space_is_literal(text): -# """An unpaired asterisk at the beginning of the line, without a space -# after it, is considered literal. -# """ -# formatted = Formatted.from_input_line(text) -# assert text == formatted.to_weechat() +@given( + text(alphabet=characters(min_codepoint=32, + blacklist_characters="*_")) + .map(lambda s: '*' + s)) +def test_unpaired_prefix_asterisk_without_space_is_literal(text): + """An unpaired asterisk at the beginning of the line, without a space + after it, is considered literal. + """ + formatted = Formatted.from_input_line(text) + assert text.strip() == formatted.to_weechat() def test_input_line_color(): @@ -79,7 +80,7 @@ def test_input_line_bold(): assert "\x1b[01mHello\x1b[021m" == formatted.to_weechat() assert "Hello" == formatted.to_html() -def test_input_line_bold(): +def test_input_line_underline(): formatted = Formatted.from_input_line("\x1FHello") assert "\x1b[04mHello\x1b[024m" == formatted.to_weechat() assert "Hello" == formatted.to_html() @@ -89,6 +90,25 @@ def test_input_line_markdown_emph(): assert "\x1b[03mHello\x1b[023m" == formatted.to_weechat() assert "Hello" == formatted.to_html() +def test_input_line_markdown_bold(): + formatted = Formatted.from_input_line("**Hello**") + assert "\x1b[01mHello\x1b[021m" == formatted.to_weechat() + assert "Hello" == formatted.to_html() + +def test_input_line_markdown_various(): + inp = "**bold* bold *bital etc* bold **bold** * *italic*" + formatted = Formatted.from_input_line(inp) + assert "bold* bold " \ + "bital etc bold **bold" \ + " * italic" \ + == formatted.to_html() + +def test_input_line_markdown_various2(): + inp = "norm** `code **code *code` norm `norm" + formatted = Formatted.from_input_line(inp) + assert "norm** code **code *code norm `norm" \ + == formatted.to_html() + def test_conversion(): formatted = Formatted.from_input_line("*Hello*") formatted2 = Formatted.from_html(formatted.to_html())