# --- tests/toolbox/test_remove_markdown.py ---
from toolbox.strings import MarkdownFormat
from toolbox.strings import remove_markdown

# Maps each raw input to ``(formats, expected)``: the flags handed to
# ``remove_markdown`` and the text it must return for that input.
test_dict = {
    "": (MarkdownFormat.ALL, ""),
    "~~test 1~~": (MarkdownFormat.STRIKETHROUGH, "test 1"),
    "||test 2||": (MarkdownFormat.SPOILER, "test 2"),
    "**test 3**": (MarkdownFormat.BOLD, "test 3"),
    "__test 4__": (MarkdownFormat.UNDERLINE, "test 4"),
    "_test 5_": (MarkdownFormat.ITALIC_UNDERSCORE, "test 5"),
    "*test 6*": (MarkdownFormat.ITALIC_ASTERISK, "test 6"),
    "> test 7": (MarkdownFormat.QUOTE, "test 7"),
    ">>> test 8": (MarkdownFormat.MULTI_QUOTE, "test 8"),
    "`test 9`": (MarkdownFormat.CODE_BLOCK, "test 9"),
    "```test 10```": (MarkdownFormat.MULTI_CODE_BLOCK, "test 10"),
    "**~~test 11~~**": (MarkdownFormat.BOLD | MarkdownFormat.STRIKETHROUGH, "test 11"),
    "*_test 12_*": (MarkdownFormat.ITALIC_ASTERISK | MarkdownFormat.ITALIC_UNDERSCORE, "test 12"),
    "~~test 13~~": (MarkdownFormat.ALL, "test 13"),
    "||test 14||": (MarkdownFormat.ALL, "test 14"),
    "**test 15**": (MarkdownFormat.ALL, "test 15"),
    "__test 16__": (MarkdownFormat.ALL, "test 16"),
    "_test 17_": (MarkdownFormat.ALL, "test 17"),
    "*test 18*": (MarkdownFormat.ALL, "test 18"),
    "> test 19": (MarkdownFormat.ALL, "test 19"),
    ">>> test 20": (MarkdownFormat.ALL, "test 20"),
    "`test 21`": (MarkdownFormat.ALL, "test 21"),
    "```test 22```": (MarkdownFormat.ALL, "test 22"),
    "**~~test 23~~**": (MarkdownFormat.ALL, "test 23"),
    "*_test 24_*": (MarkdownFormat.ALL, "test 24"),
    "**__test 25__**": (MarkdownFormat.ALL, "test 25"),
    "__test 26__ __test 26__": (MarkdownFormat.UNDERLINE, "test 26 test 26"),
    "**test 27** **test 27**": (MarkdownFormat.BOLD, "test 27 test 27"),
    "~~test 28~~ __test 28__": (MarkdownFormat.ALL, "test 28 test 28"),
    "||test 29|| ||test 29||": (MarkdownFormat.ALL, "test 29 test 29"),
    "```test 30``` ```test 30```": (MarkdownFormat.ALL, "test 30 test 30"),
    "`test 31` `test 31`": (MarkdownFormat.ALL, "test 31 test 31"),
    "`test 32` `test 32`": (MarkdownFormat.ITALIC_ASTERISK, "`test 32` `test 32`"),
    ">>> test 33 *test 33*": (MarkdownFormat.ALL, "test 33 test 33"),
    "`test 34 _test 34_`": (MarkdownFormat.ALL, "test 34 _test 34_"),
    "```test 35 _test 35_```": (MarkdownFormat.ALL, "test 35 _test 35_"),
    "```test 36 _test 37_``` _test 38_ ```test 39 *test 40*```": (
        MarkdownFormat.ALL,
        "test 36 _test 37_ test 38 test 39 *test 40*",
    ),
    "`test 41 **test 42**` **test 43** `__test 44__`": (MarkdownFormat.ALL, "test 41 **test 42** test 43 __test 44__"),
}


def test_remove_markdown():
    """Check every table entry with its flags passed explicitly."""
    for raw, (flags, expected) in test_dict.items():
        # The message pins down the failing entry; a bare assert inside a loop
        # would only report the two compared strings.
        assert remove_markdown(raw, flags) == expected, f"input={raw!r} formats={flags!r}"


def test_remove_markdown_default_formats():
    """Omitting ``formats`` must give the same result as ``MarkdownFormat.ALL``."""
    # The previous ``if not format`` branch never ran because no entry carries
    # a falsy flag, so the default argument went untested. Exercise it here on
    # every entry that is declared with ``ALL``.
    for raw, (flags, expected) in test_dict.items():
        if flags == MarkdownFormat.ALL:
            assert remove_markdown(raw) == expected, f"input={raw!r}"


# MIT License
#
# Copyright (c) 2022-present HyperGH
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. diff --git a/toolbox/strings.py b/toolbox/strings.py index 3f1783d..606bc92 100644 --- a/toolbox/strings.py +++ b/toolbox/strings.py @@ -1,8 +1,16 @@ import datetime import re import typing as t +from enum import IntFlag -__all__: t.Sequence[str] = ("format_dt", "utcnow", "is_url", "is_invite") +__all__: t.Sequence[str] = ( + "format_dt", + "utcnow", + "is_url", + "is_invite", + "remove_markdown", + "MarkdownFormat", +) VALID_TIMESTAMP_STYLES: t.Sequence[str] = ("t", "T", "d", "D", "f", "F", "R") @@ -13,6 +21,73 @@ INVITE_REGEX = re.compile(r"(?:https?://)?discord(?:app)?\.(?:com/invite|gg)/[a-zA-Z0-9]+/?") +class MarkdownFormat(IntFlag): + """An Enum to flag strings with the types of formatting that should be removed.""" + + NONE = 0 + """Refers to no formatting.""" + + STRIKETHROUGH = 1 << 0 + """Used to remove strikethroughs caused by 2 tildes.""" + + ITALIC_UNDERSCORE = 1 << 1 + """Used to remove italic caused by underscores.""" + + ITALIC_ASTERISK = 1 << 2 + """Used to remove italic caused by asterisks.""" + + BOLD = 1 << 3 + """Used to remove bold caused by 2 asterisks.""" + + UNDERLINE = 1 << 4 + """Used to remove underlining caused by 2 underscores.""" + + CODE_BLOCK = 1 << 5 + """Used to remove code blocks caused by backticks.""" + + MULTI_CODE_BLOCK = 1 << 6 + """Used to remove multiline code blocks caused by 3 backticks.""" + + QUOTE = 1 << 7 + """Used to remove quotes caused by a bigger than at the start of the line followed by a whitespace character.""" + + MULTI_QUOTE = 1 << 8 + """Used to remove multiline quotes caused by 3 bigger thans at the start of the line followed by a whitespace character.""" + + SPOILER = 1 << 9 + """Used to remove spoilers caused by 2 pipes.""" + + ALL = ( + 
STRIKETHROUGH + | ITALIC_UNDERSCORE + | ITALIC_ASTERISK + | BOLD + | UNDERLINE + | CODE_BLOCK + | MULTI_CODE_BLOCK + | QUOTE + | MULTI_QUOTE + | SPOILER + ) + """Used to remove all possible formatting.""" + + +FORMAT_DICT = { + # First value is the regex pattern of the affiliated enum flag, the match includes the formatting that causes it. + # Second value is the amount of characters that will be sliced off the match. + MarkdownFormat.MULTI_CODE_BLOCK: (re.compile(r"(`{3}[^`]+`{3})"), 3), + MarkdownFormat.CODE_BLOCK: (re.compile(r"(`[^`]+`)"), 1), + MarkdownFormat.MULTI_QUOTE: (re.compile(r"\s*>{3} ([\s\S]+)"), 0), + MarkdownFormat.QUOTE: (re.compile(r"\s*> ([\s\S]+)"), 0), + MarkdownFormat.BOLD: (re.compile(r"(\*{2}[^*]+\*{2})"), 2), + MarkdownFormat.UNDERLINE: (re.compile(r"(__[^_]+__)"), 2), + MarkdownFormat.STRIKETHROUGH: (re.compile(r"(~~[^~]+~~)"), 2), + MarkdownFormat.ITALIC_UNDERSCORE: (re.compile(r"(_[^_]+_)"), 1), + MarkdownFormat.ITALIC_ASTERISK: (re.compile(r"(\*[^*]+\*)"), 1), + MarkdownFormat.SPOILER: (re.compile(r"(\|{2}[^|]+\|{2})"), 2), +} + + def format_dt(time: datetime.datetime, style: t.Optional[str] = None) -> str: """ Convert a datetime into a Discord timestamp. @@ -102,6 +177,77 @@ def is_invite(string: str, *, fullmatch: bool = True) -> bool: return False +def remove_markdown(content: str, formats: MarkdownFormat = MarkdownFormat.ALL) -> str: + """ + Removes the markdown formatting from Discord messages. + + Parameters + ---------- + content : str + The `str` object, which needs their content cleaned from Discord's markdown formatting. + formats : MarkdownFormat + The `IntFlag` of the formatting that needs to be removed. + Default is `MarkdownFormat.ALL`. + Multiple can be supplied by using bitwise OR. + Matches for `MarkdownFormat.MULTI_CODE_BLOCK` and `MarkdownFormat.CODE_BLOCK` + don't remove other formatting found inside them. + + Returns + ------- + str + The cleaned string without markdown formatting. 
+ """ + code_block_matches = [] + for format, (regex, replace) in FORMAT_DICT.items(): + if formats & format: + if format & MarkdownFormat.MULTI_CODE_BLOCK or format & MarkdownFormat.CODE_BLOCK: + code_block_matches += re.findall(regex, content) + matches = re.findall(regex, content) + if not code_block_matches: + for match in matches: + if format & MarkdownFormat.MULTI_QUOTE or format & MarkdownFormat.QUOTE: + content = _remove_quote(content, format) + continue + content = content.replace(match, match[replace:-replace], 1) + else: + for match in matches: + if format & MarkdownFormat.MULTI_CODE_BLOCK or format & MarkdownFormat.CODE_BLOCK: + content = content.replace(match, match[replace:-replace], 1) + continue + else: + ignore = False + for code_block_match in code_block_matches: + if match in code_block_match: + ignore = True + if not ignore: + content = content.replace(match, match[replace:-replace], 1) + + return content + + +def _remove_quote(content: str, formats: MarkdownFormat) -> str: + """ + Helper function to remove quote formatting. + + Parameters + ---------- + content : str + The `str` object, which needs to be cleaned from quote formatting. + format : MarkdownFormat + The type of quote formatting that needs to be removed. + + Returns + ------- + str + The cleaned string without quote formatting. + """ + if formats == MarkdownFormat.MULTI_QUOTE and ">>> " in content: + content = content.replace(">>> ", "") + if formats == MarkdownFormat.QUOTE and "> " in content: + content = content.replace("> ", "") + return content + + # MIT License # # Copyright (c) 2022-present HyperGH