From a5e6434c31df1ed9e3e406d64ecdc9c70dcb1a86 Mon Sep 17 00:00:00 2001 From: Tom Smeding Date: Wed, 3 Jun 2020 13:06:16 +0200 Subject: [PATCH] Fix handling of backslashes in relation to code blocks (#203) Backslashes are now correctly preserved inside code blocks while still allowing the user to escape a backtick. The handling of backticks and bold/italic wrappers was unified so that they share the same escaping code. Outside code blocks, backslashes only escape Markdown wrapper characters (*, _, `) and the backslash itself; inside a code block they only escape a backtick. If they are encountered before any other character, they are considered literal. --- matrix/colors.py | 81 ++++++++++++++++++++++++++++----------------- tests/color_test.py | 10 ++++++ 2 files changed, 60 insertions(+), 31 deletions(-) diff --git a/matrix/colors.py b/matrix/colors.py index a1222f4..14f047a 100644 --- a/matrix/colors.py +++ b/matrix/colors.py @@ -88,23 +88,44 @@ class Formatted(object): substrings = [] # type: List[FormattedString] attributes = DEFAULT_ATTRIBUTES.copy() - def last_match_index(regex, subject, offset_in_match): - matches = list(re.finditer(regex, subject)) + # Escaped things are not markdown delimiters, so substitute them away + # when (quickly) looking for the last delimiters in the line. Note that + # the replacement needs to be the same length as the original for the + # indices to be correct. + escaped_masked = re.sub(r"\\[\\*_`]", "aa", line) + + def last_match_index(regex, offset_in_match): + matches = list(re.finditer(regex, escaped_masked)) return matches[-1].span()[0] + offset_in_match if matches else -1 + # 'needs_word': whether the wrapper must surround words, for example + # '*italic*' and not '* not-italic *'. 
+ # 'validate': whether it can occur within the current attributes wrappers = { "**": { "key": "bold", - "last_index": last_match_index(r"\S\*\*", line, 1), + "last_index": last_match_index(r"\S\*\*", 1), + "needs_word": True, + "validate": lambda attrs: not attrs["code"], }, "*": { "key": "italic", - "last_index": last_match_index(r"\S\*($|[^*])", line, 1), + "last_index": last_match_index(r"\S\*($|[^*])", 1), + "needs_word": True, + "validate": lambda attrs: not attrs["code"], }, "_": { "key": "italic", - "last_index": last_match_index(r"\S_", line, 1), + "last_index": last_match_index(r"\S_", 1), + "needs_word": True, + "validate": lambda attrs: not attrs["code"], }, + "`": { + "key": "code", + "last_index": last_match_index(r"`", 0), + "needs_word": False, + "validate": lambda attrs: True, + } } wrapper_init_chars = set(k[0] for k in wrappers.keys()) wrapper_max_len = max(len(k) for k in wrappers.keys()) @@ -115,14 +136,17 @@ class Formatted(object): "\x1F": "underline", } - last_backtick = line.rfind("`") + # Characters that consume a prefixed backslash + escapable_chars = wrapper_init_chars.copy() + escapable_chars.add("\\") i = 0 while i < len(line): # Markdown escape - # NOTE: IRC-native formatting characters are not escaped if i + 1 < len(line) and line[i] == "\\" \ - and line[i + 1] not in "\x02\x03\x0F\x1D\x1F": + and (line[i + 1] in escapable_chars + if not attributes["code"] + else line[i + 1] == "`"): text += line[i + 1] i = i + 2 @@ -183,32 +207,26 @@ class Formatted(object): else: attributes["bgcolor"] = None - # Markdown inline code - elif line[i] == "`" and (attributes["code"] or last_backtick > i): - if text: - # strip leading and trailing spaces and compress consecutive - # spaces in inline code blocks - if attributes["code"]: - text = text.strip() - text = re.sub(r"\s+", " ", text) - - substrings.append( - FormattedString(text, attributes.copy()) - ) - text = "" - attributes["code"] = not attributes["code"] - i = i + 1 - - # Markdown wrapper 
(emphasis/bold) - elif line[i] in wrapper_init_chars and not attributes["code"]: + # Markdown wrapper (emphasis/bold/code) + elif line[i] in wrapper_init_chars: for l in range(wrapper_max_len, 0, -1): if i + l <= len(line) and line[i : i + l] in wrappers: descriptor = wrappers[line[i : i + l]] + if not descriptor["validate"](attributes): + continue + if attributes[descriptor["key"]]: - # Can only turn off if preceded by non-whitespace - if not line[i - 1].isspace(): + # needs_word wrappers can only be turned off if + # preceded by non-whitespace + if (i >= 1 and not line[i - 1].isspace()) \ + or not descriptor["needs_word"]: if text: + # strip leading and trailing spaces and + # compress consecutive spaces in inline + # code blocks + if descriptor["key"] == "code": + text = re.sub(r"\s+", " ", text.strip()) substrings.append( FormattedString(text, attributes.copy())) text = "" @@ -218,10 +236,11 @@ class Formatted(object): text = text + line[i : i + l] i = i + l - # Must have a chance of closing this, and be followed - # by non-whitespace + # Must have a chance of closing this, and needs_word + # wrappers must be followed by non-whitespace elif descriptor["last_index"] >= i + l and \ - not line[i + l].isspace(): + (not line[i + l].isspace() or \ + not descriptor["needs_word"]): if text: substrings.append( FormattedString(text, attributes.copy())) diff --git a/tests/color_test.py b/tests/color_test.py index 9ede04b..60785c7 100644 --- a/tests/color_test.py +++ b/tests/color_test.py @@ -109,6 +109,16 @@ def test_input_line_markdown_various2(): assert "norm** code **code *code norm `norm" \ == formatted.to_html() +def test_input_line_backslash(): + def convert(s): return Formatted.from_input_line(s).to_html() + assert "pre italic* ital norm" == convert("pre *italic\\* ital* norm") + assert "*norm* norm" == convert("\\*norm* norm") + assert "*ital" == convert("*\\*ital*") + assert "C:\\path" == convert("`C:\\path`") + assert "with`tick" == convert("`with\\`tick`") + 
assert "`un`matched" == convert("`un\\`matched") + assert "bold *bital norm" == convert("**bold *\\*bital*** norm") + def test_conversion(): formatted = Formatted.from_input_line("*Hello*") formatted2 = Formatted.from_html(formatted.to_html())