From a5e6434c31df1ed9e3e406d64ecdc9c70dcb1a86 Mon Sep 17 00:00:00 2001
From: Tom Smeding <tomsmeding@users.noreply.github.com>
Date: Wed, 3 Jun 2020 13:06:16 +0200
Subject: [PATCH] Fix handling of backslashes in relation to code blocks (#203)

Backslashes are now correctly preserved inside code blocks while still allowing the user to escape a backtick. The handling of backticks and bold/italic wrappers was unified so that they share the same escaping code.

Outside inline code, a backslash escapes only the Markdown wrapper characters (*, _, `) and the backslash itself; inside inline code it escapes only the backtick. A backslash before any other character is treated as a literal backslash.
---
 matrix/colors.py    | 81 ++++++++++++++++++++++++++++-----------------
 tests/color_test.py | 10 ++++++
 2 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/matrix/colors.py b/matrix/colors.py
index a1222f4..14f047a 100644
--- a/matrix/colors.py
+++ b/matrix/colors.py
@@ -88,23 +88,44 @@ class Formatted(object):
         substrings = []  # type: List[FormattedString]
         attributes = DEFAULT_ATTRIBUTES.copy()
 
-        def last_match_index(regex, subject, offset_in_match):
-            matches = list(re.finditer(regex, subject))
+        # Escaped things are not markdown delimiters, so substitute them away
+        # when (quickly) looking for the last delimiters in the line. Note that
+        # the replacement needs to be the same length as the original for the
+        # indices to be correct.
+        escaped_masked = re.sub(r"\\[\\*_`]", "aa", line)
+
+        def last_match_index(regex, offset_in_match):
+            matches = list(re.finditer(regex, escaped_masked))
             return matches[-1].span()[0] + offset_in_match if matches else -1
 
+        # 'needs_word': whether the wrapper must surround words, for example
+        #   '*italic*' and not '* not-italic *'.
+        # 'validate': whether it can occur within the current attributes
         wrappers = {
             "**": {
                 "key": "bold",
-                "last_index": last_match_index(r"\S\*\*", line, 1),
+                "last_index": last_match_index(r"\S\*\*", 1),
+                "needs_word": True,
+                "validate": lambda attrs: not attrs["code"],
             },
             "*": {
                 "key": "italic",
-                "last_index": last_match_index(r"\S\*($|[^*])", line, 1),
+                "last_index": last_match_index(r"\S\*($|[^*])", 1),
+                "needs_word": True,
+                "validate": lambda attrs: not attrs["code"],
             },
             "_": {
                 "key": "italic",
-                "last_index": last_match_index(r"\S_", line, 1),
+                "last_index": last_match_index(r"\S_", 1),
+                "needs_word": True,
+                "validate": lambda attrs: not attrs["code"],
             },
+            "`": {
+                "key": "code",
+                "last_index": last_match_index(r"`", 0),
+                "needs_word": False,
+                "validate": lambda attrs: True,
+            }
         }
         wrapper_init_chars = set(k[0] for k in wrappers.keys())
         wrapper_max_len = max(len(k) for k in wrappers.keys())
@@ -115,14 +136,17 @@ class Formatted(object):
             "\x1F": "underline",
         }
 
-        last_backtick = line.rfind("`")
+        # Characters that consume a prefixed backslash
+        escapable_chars = wrapper_init_chars.copy()
+        escapable_chars.add("\\")
 
         i = 0
         while i < len(line):
             # Markdown escape
-            # NOTE: IRC-native formatting characters are not escaped
             if i + 1 < len(line) and line[i] == "\\" \
-                    and line[i + 1] not in "\x02\x03\x0F\x1D\x1F":
+                    and (line[i + 1] in escapable_chars
+                            if not attributes["code"]
+                            else line[i + 1] == "`"):
                 text += line[i + 1]
                 i = i + 2
 
@@ -183,32 +207,26 @@ class Formatted(object):
                 else:
                     attributes["bgcolor"] = None
 
-            # Markdown inline code
-            elif line[i] == "`" and (attributes["code"] or last_backtick > i):
-                if text:
-                    # strip leading and trailing spaces and compress consecutive
-                    # spaces in inline code blocks
-                    if attributes["code"]:
-                        text = text.strip()
-                        text = re.sub(r"\s+", " ", text)
-
-                    substrings.append(
-                        FormattedString(text, attributes.copy())
-                    )
-                text = ""
-                attributes["code"] = not attributes["code"]
-                i = i + 1
-
-            # Markdown wrapper (emphasis/bold)
-            elif line[i] in wrapper_init_chars and not attributes["code"]:
+            # Markdown wrapper (emphasis/bold/code)
+            elif line[i] in wrapper_init_chars:
                 for l in range(wrapper_max_len, 0, -1):
                     if i + l <= len(line) and line[i : i + l] in wrappers:
                         descriptor = wrappers[line[i : i + l]]
 
+                        if not descriptor["validate"](attributes):
+                            continue
+
                         if attributes[descriptor["key"]]:
-                            # Can only turn off if preceded by non-whitespace
-                            if not line[i - 1].isspace():
+                            # needs_word wrappers can only be turned off if
+                            # preceded by non-whitespace
+                            if (i >= 1 and not line[i - 1].isspace()) \
+                                    or not descriptor["needs_word"]:
                                 if text:
+                                    # strip leading and trailing spaces and
+                                    # compress consecutive spaces in inline
+                                    # code blocks
+                                    if descriptor["key"] == "code":
+                                        text = re.sub(r"\s+", " ", text.strip())
                                     substrings.append(
                                         FormattedString(text, attributes.copy()))
                                 text = ""
@@ -218,10 +236,11 @@ class Formatted(object):
                                 text = text + line[i : i + l]
                                 i = i + l
 
-                        # Must have a chance of closing this, and be followed
-                        # by non-whitespace
+                        # Must have a chance of closing this, and needs_word
+                        # wrappers must be followed by non-whitespace
                         elif descriptor["last_index"] >= i + l and \
-                                not line[i + l].isspace():
+                                (not line[i + l].isspace() or \
+                                    not descriptor["needs_word"]):
                             if text:
                                 substrings.append(
                                     FormattedString(text, attributes.copy()))
diff --git a/tests/color_test.py b/tests/color_test.py
index 9ede04b..60785c7 100644
--- a/tests/color_test.py
+++ b/tests/color_test.py
@@ -109,6 +109,16 @@ def test_input_line_markdown_various2():
     assert "norm** <code>code **code *code</code> norm `norm" \
            == formatted.to_html()
 
+def test_input_line_backslash():
+    def convert(s): return Formatted.from_input_line(s).to_html()
+    assert "pre <em>italic* ital</em> norm" == convert("pre *italic\\* ital* norm")
+    assert "*norm* norm" == convert("\\*norm* norm")
+    assert "<em>*ital</em>" == convert("*\\*ital*")
+    assert "<code>C:\\path</code>" == convert("`C:\\path`")
+    assert "<code>with`tick</code>" == convert("`with\\`tick`")
+    assert "`un`matched" == convert("`un\\`matched")
+    assert "<strong>bold </strong><em><strong>*bital</strong></em> norm" == convert("**bold *\\*bital*** norm")
+
 def test_conversion():
     formatted = Formatted.from_input_line("*Hello*")
     formatted2 = Formatted.from_html(formatted.to_html())