Better half-markdown (#202)

Better half-markdown.

This fixes some issues with the current semi-Markdown parser, to make life easier until a full Markdown parser is implemented.

Changes:
- A * that would normally start italics but isn't matched by a closing * is now left alone.
- A ` that would normally start an inline code block but isn't matched by a closing ` is now left alone.
- Backslash escapes should work as expected.
- Support for **bold** and the alternative _italic_ style.
2020-05-31 14:09:29 +02:00 · 2020-05-31 14:09:29 +02:00 · 170c5811a3
commit 170c5811a3
parent 0ce5b65835
2 changed files with 132 additions and 79 deletions
--- a/matrix/colors.py
+++ b/matrix/colors.py
@@ -88,66 +88,63 @@ class Formatted(object):
        substrings = []  # type: List[FormattedString]
        attributes = DEFAULT_ATTRIBUTES.copy()

+        def last_match_index(regex, subject, offset_in_match):
+            matches = list(re.finditer(regex, subject))
+            return matches[-1].span()[0] + offset_in_match if matches else -1
+
+        wrappers = {
+            "**": {
+                "key": "bold",
+                "last_index": last_match_index(r"\S\*\*", line, 1),
+            },
+            "*": {
+                "key": "italic",
+                "last_index": last_match_index(r"\S\*($|[^*])", line, 1),
+            },
+            "_": {
+                "key": "italic",
+                "last_index": last_match_index(r"\S_", line, 1),
+            },
+        }
+        wrapper_init_chars = set(k[0] for k in wrappers.keys())
+        wrapper_max_len = max(len(k) for k in wrappers.keys())
+
+        irc_toggles = {
+            "\x02": "bold",
+            "\x1D": "italic",
+            "\x1F": "underline",
+        }
+
+        last_backtick = line.rfind("`")
+
        i = 0
        while i < len(line):
-            # Bold
-            if line[i] == "\x02" and not attributes["code"]:
+            # Markdown escape
+            # NOTE: IRC-native formatting characters are not escaped
+            if i + 1 < len(line) and line[i] == "\\" \
+                    and line[i + 1] not in "\x02\x03\x0F\x1D\x1F":
+                text += line[i + 1]
+                i = i + 2
+
+            # IRC bold/italic/underline
+            elif line[i] in irc_toggles and not attributes["code"]:
                if text:
                    substrings.append(FormattedString(text, attributes.copy()))
                text = ""
-                attributes["bold"] = not attributes["bold"]
+                key = irc_toggles[line[i]]
+                attributes[key] = not attributes[key]
                i = i + 1

-            # Markdown inline code
-            elif line[i] == "`":
-                if text:
-                    # strip leading and trailing spaces and compress consecutive
-                    # spaces in inline code blocks
-                    if attributes["code"]:
-                        text = text.strip()
-                        text = re.sub(r"\s+", " ", text)
-
-                    substrings.append(
-                        FormattedString(text, attributes.copy())
-                    )
-                text = ""
-                attributes["code"] = not attributes["code"]
-                i = i + 1
-
-            # Markdown emphasis
-            elif line[i] == "*" and not attributes["code"]:
-                if attributes["italic"] and not line[i - 1].isspace():
-                    if text:
-                        substrings.append(
-                            FormattedString(text, attributes.copy())
-                        )
-                    text = ""
-                    attributes["italic"] = not attributes["italic"]
-                    i = i + 1
-                    continue
-
-                elif attributes["italic"] and line[i - 1].isspace():
-                    text = text + line[i]
-                    i = i + 1
-                    continue
-
-                elif i + 1 < len(line) and line[i + 1].isspace():
-                    text = text + line[i]
-                    i = i + 1
-                    continue
-
-                elif i == len(line) - 1:
-                    text = text + line[i]
-                    i = i + 1
-                    continue
-
+            # IRC reset
+            elif line[i] == "\x0F" and not attributes["code"]:
                if text:
                    substrings.append(FormattedString(text, attributes.copy()))
                text = ""
-                attributes["italic"] = not attributes["italic"]
+                # Reset all the attributes
+                attributes = DEFAULT_ATTRIBUTES.copy()
                i = i + 1

-            # Color
+            # IRC color
            elif line[i] == "\x03" and not attributes["code"]:
                if text:
                    substrings.append(FormattedString(text, attributes.copy()))
@@ -185,37 +182,73 @@ class Formatted(object):
                    attributes["bgcolor"] = color_line_to_weechat(color_string)
                else:
                    attributes["bgcolor"] = None
-            # Reset
-            elif line[i] == "\x0F" and not attributes["code"]:
+
+            # Markdown inline code
+            elif line[i] == "`" and (attributes["code"] or last_backtick > i):
                if text:
-                    substrings.append(FormattedString(text, attributes.copy()))
+                    # strip leading and trailing spaces and compress consecutive
+                    # spaces in inline code blocks
+                    if attributes["code"]:
+                        text = text.strip()
+                        text = re.sub(r"\s+", " ", text)
+
+                    substrings.append(
+                        FormattedString(text, attributes.copy())
+                    )
                text = ""
-                # Reset all the attributes
-                attributes = DEFAULT_ATTRIBUTES.copy()
+                attributes["code"] = not attributes["code"]
                i = i + 1

-            # Italic
-            elif line[i] == "\x1D" and not attributes["code"]:
-                if text:
-                    substrings.append(FormattedString(text, attributes.copy()))
-                text = ""
-                attributes["italic"] = not attributes["italic"]
-                i = i + 1
+            # Markdown wrapper (emphasis/bold)
+            elif line[i] in wrapper_init_chars and not attributes["code"]:
+                for l in range(wrapper_max_len, 0, -1):
+                    if i + l <= len(line) and line[i : i + l] in wrappers:
+                        descriptor = wrappers[line[i : i + l]]

-            # Underline
-            elif line[i] == "\x1F" and not attributes["code"]:
-                if text:
-                    substrings.append(FormattedString(text, attributes.copy()))
-                text = ""
-                attributes["underline"] = not attributes["underline"]
-                i = i + 1
+                        if attributes[descriptor["key"]]:
+                            # Can only turn off if preceded by non-whitespace
+                            if not line[i - 1].isspace():
+                                if text:
+                                    substrings.append(
+                                        FormattedString(text, attributes.copy()))
+                                text = ""
+                                attributes[descriptor["key"]] = False
+                                i = i + l
+                            else:
+                                text = text + line[i : i + l]
+                                i = i + l
+
+                        # Must have a chance of closing this, and be followed
+                        # by non-whitespace
+                        elif descriptor["last_index"] >= i + l and \
+                                not line[i + l].isspace():
+                            if text:
+                                substrings.append(
+                                    FormattedString(text, attributes.copy()))
+                            text = ""
+                            attributes[descriptor["key"]] = True
+                            i = i + l
+
+                        else:
+                            text = text + line[i : i + l]
+                            i = i + l
+
+                        break
+
+                else:
+                    # No wrapper matched here (NOTE: cannot happen if "*" and
+                    # "_" are both in wrappers, but for completeness' sake)
+                    text = text + line[i]
+                    i = i + 1

            # Normal text
            else:
                text = text + line[i]
                i = i + 1

-        substrings.append(FormattedString(text, attributes))
+        if text:
+            substrings.append(FormattedString(text, attributes))
+
        return cls(substrings)

    @classmethod
--- a/tests/color_test.py
+++ b/tests/color_test.py
@@ -5,7 +5,7 @@ from __future__ import unicode_literals
 import webcolors
 from collections import OrderedDict
 from hypothesis import given
-from hypothesis.strategies import sampled_from, text
+from hypothesis.strategies import sampled_from, text, characters

 from matrix.colors import (G, Formatted, FormattedString,
                           color_html_to_weechat, color_weechat_to_html)
@@ -58,15 +58,16 @@ def test_normalize_spaces_in_inline_code():
    assert formatted.to_weechat() == valid_result


-# FIXME: this case doesn't and can't work yet (until a proper Markdown parser
-# is integrated)
-# @given(text().map(lambda s: '*' + s)
-# def test_unpaired_prefix_asterisk_without_space_is_literal(text):
-#     """An unpaired asterisk at the beginning of the line, without a space
-#     after it, is considered literal.
-#     """
-#     formatted = Formatted.from_input_line(text)
-#     assert text == formatted.to_weechat()
+@given(
+    text(alphabet=characters(min_codepoint=32,
+                             blacklist_characters="*_"))
+    .map(lambda s: '*' + s))
+def test_unpaired_prefix_asterisk_without_space_is_literal(text):
+   """An unpaired asterisk at the beginning of the line, without a space
+   after it, is considered literal.
+   """
+   formatted = Formatted.from_input_line(text)
+   assert text.strip() == formatted.to_weechat()


 def test_input_line_color():
@@ -79,7 +80,7 @@ def test_input_line_bold():
    assert "\x1b[01mHello\x1b[021m" == formatted.to_weechat()
    assert "<strong>Hello</strong>" == formatted.to_html()

-def test_input_line_bold():
+def test_input_line_underline():
    formatted = Formatted.from_input_line("\x1FHello")
    assert "\x1b[04mHello\x1b[024m" == formatted.to_weechat()
    assert "<u>Hello</u>" == formatted.to_html()
@@ -89,6 +90,25 @@ def test_input_line_markdown_emph():
    assert "\x1b[03mHello\x1b[023m" == formatted.to_weechat()
    assert "<em>Hello</em>" == formatted.to_html()

+def test_input_line_markdown_bold():
+    formatted = Formatted.from_input_line("**Hello**")
+    assert "\x1b[01mHello\x1b[021m" == formatted.to_weechat()
+    assert "<strong>Hello</strong>" == formatted.to_html()
+
+def test_input_line_markdown_various():
+    inp = "**bold* bold *bital etc* bold **bold** * *italic*"
+    formatted = Formatted.from_input_line(inp)
+    assert "<strong>bold* bold </strong>" \
+           "<em><strong>bital etc</strong></em><strong> bold **bold</strong>" \
+           " * <em>italic</em>" \
+           == formatted.to_html()
+
+def test_input_line_markdown_various2():
+    inp = "norm** `code **code *code` norm `norm"
+    formatted = Formatted.from_input_line(inp)
+    assert "norm** <code>code **code *code</code> norm `norm" \
+           == formatted.to_html()
+
 def test_conversion():
    formatted = Formatted.from_input_line("*Hello*")
    formatted2 = Formatted.from_html(formatted.to_html())