Skip to content

Commit

Permalink
Add function to remove all Discord formatting text from string #4 (#8)
Browse files Browse the repository at this point in the history
* Added regex and function with docstring

* changed parameter to str

* Created tests for strikethrough

* Added Strikethrough regex and function

* more tests

* renamed to test_remove_markdown

* added test_remove_block

* added remove_block function, split regex up

* added test_remove_multiblock

* added remove_multiblock function

* added test_remove_bold, renamed formats

* added remove_bold and enum, renamed formats

* added remove_underline

* added test_remove_underline

* added test_remove_italic (for underscores)

* added remove_italic (for _), changed re.sub method

* added test_remove_italic_asterisk

* added remove_italic_asterisk

* added test_remove_quote/multi_quote/spoiler

* added remove_quote/multi_quote/spoiler

* cleaned up tests

* moved methods to remove_markdown.py

* method pluralisation, changes to docstrings

* adjusted method names

* formatting

* add enums to parameter

* moved to strings.py

* add remove_markdown.py

* changed imports to toolbox.strings

* add .ALL enum into function

* add .ALL tests

* removed subfunctions, build general function

* removed subtest, added general tests

* deleted None test

* deleted None statement

* deleted prints in remove_markdown

* fixed typo

* adjusted for loop

* add license info

* cleaned up function

* changed dict values to tuples

* add re.compile to dict, yeet 1 line from loop

* add docstrings to enum

* changed dict to constant and moved to top

* cleaning up comments and docstrings

* _ is no more

* Update toolbox/strings.py

Co-authored-by: Hyper <[email protected]>

* Update toolbox/strings.py

Co-authored-by: Hyper <[email protected]>

* Update toolbox/strings.py

Co-authored-by: Hyper <[email protected]>

* Update toolbox/strings.py

Co-authored-by: Hyper <[email protected]>

* Update toolbox/strings.py

Co-authored-by: Hyper <[email protected]>

* Update toolbox/strings.py

Co-authored-by: Hyper <[email protected]>

* Update toolbox/strings.py

Co-authored-by: Hyper <[email protected]>

* Update toolbox/strings.py

Co-authored-by: Hyper <[email protected]>

* Update toolbox/strings.py

Co-authored-by: Hyper <[email protected]>

* Update toolbox/strings.py

Co-authored-by: Hyper <[email protected]>

* change patterns to greedy

* fix pattern

* move test cases into dict

* formatting

* add helper function to remove quotes

* more tests

* L mypy

* change remove_quote and fix docstrings

* tests for ignoring matches inside codeblocks

* formatting

* ignore formatting inside codeblocks, rewrite

* mark remove_quote as internal

Co-authored-by: Hyper <[email protected]>

* mark remove_quote as internal

Co-authored-by: Hyper <[email protected]>

* update docstring

---------

Co-authored-by: Hyper <[email protected]>
  • Loading branch information
Labretx and hypergonial authored Feb 12, 2023
1 parent 4970da4 commit 523f370
Show file tree
Hide file tree
Showing 2 changed files with 224 additions and 1 deletion.
77 changes: 77 additions & 0 deletions tests/toolbox/test_remove_markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from toolbox.strings import MarkdownFormat
from toolbox.strings import remove_markdown

# Test table for `remove_markdown`.
# Each key is the raw input string; each value is a tuple of
# (MarkdownFormat flags passed to `remove_markdown`, expected cleaned output).
test_dict = {
# Empty input stays empty.
"": (MarkdownFormat.ALL, ""),
# One flag at a time: only the matching delimiters are stripped.
"~~test 1~~": (MarkdownFormat.STRIKETHROUGH, "test 1"),
"||test 2||": (MarkdownFormat.SPOILER, "test 2"),
"**test 3**": (MarkdownFormat.BOLD, "test 3"),
"__test 4__": (MarkdownFormat.UNDERLINE, "test 4"),
"_test 5_": (MarkdownFormat.ITALIC_UNDERSCORE, "test 5"),
"*test 6*": (MarkdownFormat.ITALIC_ASTERISK, "test 6"),
"> test 7": (MarkdownFormat.QUOTE, "test 7"),
">>> test 8": (MarkdownFormat.MULTI_QUOTE, "test 8"),
"`test 9`": (MarkdownFormat.CODE_BLOCK, "test 9"),
"```test 10```": (MarkdownFormat.MULTI_CODE_BLOCK, "test 10"),
# Flags combined with bitwise OR on nested formatting.
"**~~test 11~~**": (MarkdownFormat.BOLD | MarkdownFormat.STRIKETHROUGH, "test 11"),
"*_test 12_*": (MarkdownFormat.ITALIC_ASTERISK | MarkdownFormat.ITALIC_UNDERSCORE, "test 12"),
# The same kinds of input with every format requested at once.
"~~test 13~~": (MarkdownFormat.ALL, "test 13"),
"||test 14||": (MarkdownFormat.ALL, "test 14"),
"**test 15**": (MarkdownFormat.ALL, "test 15"),
"__test 16__": (MarkdownFormat.ALL, "test 16"),
"_test 17_": (MarkdownFormat.ALL, "test 17"),
"*test 18*": (MarkdownFormat.ALL, "test 18"),
"> test 19": (MarkdownFormat.ALL, "test 19"),
">>> test 20": (MarkdownFormat.ALL, "test 20"),
"`test 21`": (MarkdownFormat.ALL, "test 21"),
"```test 22```": (MarkdownFormat.ALL, "test 22"),
"**~~test 23~~**": (MarkdownFormat.ALL, "test 23"),
"*_test 24_*": (MarkdownFormat.ALL, "test 24"),
"**__test 25__**": (MarkdownFormat.ALL, "test 25"),
# Several formatted spans in one string.
"__test 26__ __test 26__": (MarkdownFormat.UNDERLINE, "test 26 test 26"),
"**test 27** **test 27**": (MarkdownFormat.BOLD, "test 27 test 27"),
"~~test 28~~ __test 28__": (MarkdownFormat.ALL, "test 28 test 28"),
"||test 29|| ||test 29||": (MarkdownFormat.ALL, "test 29 test 29"),
"```test 30``` ```test 30```": (MarkdownFormat.ALL, "test 30 test 30"),
"`test 31` `test 31`": (MarkdownFormat.ALL, "test 31 test 31"),
# Formatting that was not requested must be left alone.
"`test 32` `test 32`": (MarkdownFormat.ITALIC_ASTERISK, "`test 32` `test 32`"),
# Quote marker and inline formatting in the same string.
">>> test 33 *test 33*": (MarkdownFormat.ALL, "test 33 test 33"),
# Formatting inside code blocks is kept; only the backticks are removed.
"`test 34 _test 34_`": (MarkdownFormat.ALL, "test 34 _test 34_"),
"```test 35 _test 35_```": (MarkdownFormat.ALL, "test 35 _test 35_"),
"```test 36 _test 37_``` _test 38_ ```test 39 *test 40*```": (
MarkdownFormat.ALL,
"test 36 _test 37_ test 38 test 39 *test 40*",
),
"`test 41 **test 42**` **test 43** `__test 44__`": (MarkdownFormat.ALL, "test 41 **test 42** test 43 __test 44__"),
}


def test_remove_markdown():
    """Run every case in `test_dict` through `remove_markdown` and compare with the expected output."""
    for raw, (flags, expected) in test_dict.items():
        # A falsy flag value exercises the default `formats` argument instead.
        cleaned = remove_markdown(raw, flags) if flags else remove_markdown(raw)
        assert cleaned == expected


# MIT License
#
# Copyright (c) 2022-present HyperGH
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
148 changes: 147 additions & 1 deletion toolbox/strings.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
import datetime
import re
import typing as t
from enum import IntFlag

__all__: t.Sequence[str] = ("format_dt", "utcnow", "is_url", "is_invite")
__all__: t.Sequence[str] = (
"format_dt",
"utcnow",
"is_url",
"is_invite",
"remove_markdown",
"MarkdownFormat",
)

# Single-letter timestamp style codes.
# NOTE(review): presumably the values `format_dt` accepts for `style` — usage is not visible here, confirm.
VALID_TIMESTAMP_STYLES: t.Sequence[str] = ("t", "T", "d", "D", "f", "F", "R")

Expand All @@ -13,6 +21,73 @@
# Matches a Discord invite link: an optional http(s) scheme, the host `discord` or `discordapp`,
# then `.com/invite/` or `.gg/`, an alphanumeric invite code and an optional trailing slash.
INVITE_REGEX = re.compile(r"(?:https?://)?discord(?:app)?\.(?:com/invite|gg)/[a-zA-Z0-9]+/?")


class MarkdownFormat(IntFlag):
    """Bit flags that select which kinds of Discord markdown should be removed from a string."""

    NONE = 0x000
    """No formatting at all."""

    STRIKETHROUGH = 0x001
    """Strikethrough text, written between pairs of tildes (`~~text~~`)."""

    ITALIC_UNDERSCORE = 0x002
    """Italic text written between single underscores (`_text_`)."""

    ITALIC_ASTERISK = 0x004
    """Italic text written between single asterisks (`*text*`)."""

    BOLD = 0x008
    """Bold text, written between pairs of asterisks (`**text**`)."""

    UNDERLINE = 0x010
    """Underlined text, written between pairs of underscores (`__text__`)."""

    CODE_BLOCK = 0x020
    """Inline code blocks, written between single backticks."""

    MULTI_CODE_BLOCK = 0x040
    """Multiline code blocks, written between triple backticks."""

    QUOTE = 0x080
    """Quotes, started by a greater-than sign at the start of a line followed by a whitespace character."""

    MULTI_QUOTE = 0x100
    """Multiline quotes, started by 3 greater-than signs at the start of a line followed by a whitespace character."""

    SPOILER = 0x200
    """Spoilers, written between pairs of pipes (`||text||`)."""

    ALL = (
        STRIKETHROUGH | ITALIC_UNDERSCORE | ITALIC_ASTERISK | BOLD | UNDERLINE
        | CODE_BLOCK | MULTI_CODE_BLOCK | QUOTE | MULTI_QUOTE | SPOILER
    )
    """Every kind of formatting listed above."""


# Lookup table used by `remove_markdown`, with one entry per single `MarkdownFormat` flag.
# Each value is a tuple of (compiled pattern, slice length):
#   - the pattern's capture group is what `re.findall` returns for that flag;
#   - the slice length is the number of delimiter characters cut from BOTH ends of a match.
# Insertion order matters: `remove_markdown` walks this dict in order, so the code block
# patterns run first and their matches are known before any other format is processed.
FORMAT_DICT = {
# Code blocks: the capture group spans the backticks as well as the enclosed text.
MarkdownFormat.MULTI_CODE_BLOCK: (re.compile(r"(`{3}[^`]+`{3})"), 3),
MarkdownFormat.CODE_BLOCK: (re.compile(r"(`[^`]+`)"), 1),
# Quotes: the capture group holds only the text AFTER the marker, and the slice length is 0.
# The marker itself is removed by `_remove_quote`.
MarkdownFormat.MULTI_QUOTE: (re.compile(r"\s*>{3} ([\s\S]+)"), 0),
MarkdownFormat.QUOTE: (re.compile(r"\s*> ([\s\S]+)"), 0),
# Paired delimiters: the capture group spans the delimiters as well as the enclosed text.
# The two-character forms are listed before the one-character italics that share a character.
MarkdownFormat.BOLD: (re.compile(r"(\*{2}[^*]+\*{2})"), 2),
MarkdownFormat.UNDERLINE: (re.compile(r"(__[^_]+__)"), 2),
MarkdownFormat.STRIKETHROUGH: (re.compile(r"(~~[^~]+~~)"), 2),
MarkdownFormat.ITALIC_UNDERSCORE: (re.compile(r"(_[^_]+_)"), 1),
MarkdownFormat.ITALIC_ASTERISK: (re.compile(r"(\*[^*]+\*)"), 1),
MarkdownFormat.SPOILER: (re.compile(r"(\|{2}[^|]+\|{2})"), 2),
}


def format_dt(time: datetime.datetime, style: t.Optional[str] = None) -> str:
"""
Convert a datetime into a Discord timestamp.
Expand Down Expand Up @@ -102,6 +177,77 @@ def is_invite(string: str, *, fullmatch: bool = True) -> bool:
return False


def remove_markdown(content: str, formats: MarkdownFormat = MarkdownFormat.ALL) -> str:
    """
    Remove Discord markdown formatting from a string.

    Parameters
    ----------
    content : str
        The text to strip Discord markdown formatting from.
    formats : MarkdownFormat
        The kinds of formatting to remove. Defaults to `MarkdownFormat.ALL`.
        Several kinds can be combined with bitwise OR.
        When `MarkdownFormat.MULTI_CODE_BLOCK` or `MarkdownFormat.CODE_BLOCK` is
        requested, the backticks are removed but other formatting found inside
        the matched code blocks is left untouched.

    Returns
    -------
    str
        The text with the requested formatting removed.
    """
    code_block_flags = MarkdownFormat.MULTI_CODE_BLOCK | MarkdownFormat.CODE_BLOCK
    quote_flags = MarkdownFormat.MULTI_QUOTE | MarkdownFormat.QUOTE

    # Code blocks (delimiters included) matched so far. FORMAT_DICT lists the code
    # block patterns first, so this is complete before any other format is handled.
    code_block_matches: t.List[str] = []

    for flag, (regex, strip_len) in FORMAT_DICT.items():
        if not formats & flag:
            continue

        matches = regex.findall(content)
        if not matches:
            continue

        if flag & quote_flags:
            # Quote patterns capture only the quoted text and have a slice length
            # of 0, so slicing the match (`match[0:-0]` is an empty string) would
            # delete the quoted text. Always strip the marker instead, whether or
            # not a code block was matched earlier.
            content = _remove_quote(content, flag)
            continue

        is_code_block = bool(flag & code_block_flags)
        if is_code_block:
            code_block_matches.extend(matches)

        for match in matches:
            # Protect formatting that lives inside a code block. This is a substring
            # check, so a span outside a code block whose text is identical to one
            # inside a code block is skipped as well.
            if not is_code_block and any(match in block for block in code_block_matches):
                continue
            content = content.replace(match, match[strip_len:-strip_len], 1)

    return content


def _remove_quote(content: str, formats: MarkdownFormat) -> str:
    """
    Helper function to remove quote formatting.

    Only markers at the start of a line (optionally preceded by spaces or tabs)
    are removed. A `> ` in the middle of a line, such as in `a -> b`, is not a
    quote marker and is kept.

    Parameters
    ----------
    content : str
        The text to remove quote markers from.
    formats : MarkdownFormat
        The type of quote formatting to remove. Must be exactly
        `MarkdownFormat.MULTI_QUOTE` or `MarkdownFormat.QUOTE`; for any other
        value the text is returned unchanged.

    Returns
    -------
    str
        The text without the requested quote markers.
    """
    if formats == MarkdownFormat.MULTI_QUOTE:
        pattern = r"^([ \t]*)>>> "
    elif formats == MarkdownFormat.QUOTE:
        pattern = r"^([ \t]*)> "
    else:
        return content

    # Put back the indentation captured before the marker; drop only the marker.
    return re.sub(pattern, r"\1", content, flags=re.MULTILINE)


# MIT License
#
# Copyright (c) 2022-present HyperGH
Expand Down

0 comments on commit 523f370

Please sign in to comment.