From 788aec665bf7de0333e56b05fa5787a1d9955578 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Thu, 10 Oct 2024 00:47:54 -0400
Subject: [PATCH] Add a new processor which removes markdown and special chars
 from TTS text

---
 CHANGELOG.md                                  |  4 +
 pyproject.toml                                |  1 +
 src/pipecat/processors/text/__init__.py       |  0
 .../processors/text/markdown_remover.py       | 76 +++++++++++++++++++
 4 files changed, 81 insertions(+)
 create mode 100644 src/pipecat/processors/text/__init__.py
 create mode 100644 src/pipecat/processors/text/markdown_remover.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9d232ae79..cc1622fc6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added `MarkdownRemovalProcessor`. This processor removes markdown formatting
+  from a TextFrame. It's intended to be used between the LLM and TTS in order
+  to remove markdown from the text the TTS speaks.
+
 - Added new `RTVIUserLLMTextProcessor`. This processor will send an RTVI
   `user-llm-text` message with the user content's that was sent to the LLM.
 
diff --git a/pyproject.toml b/pyproject.toml
index 9d515c8f9..5fd5d6790 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ classifiers = [
 ]
 dependencies = [
     "aiohttp~=3.10.3",
+    "Markdown~=3.7",
     "numpy~=1.26.4",
     "loguru~=0.7.2",
     "Pillow~=10.4.0",
diff --git a/src/pipecat/processors/text/__init__.py b/src/pipecat/processors/text/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/pipecat/processors/text/markdown_remover.py b/src/pipecat/processors/text/markdown_remover.py
new file mode 100644
index 000000000..3c08d5fb2
--- /dev/null
+++ b/src/pipecat/processors/text/markdown_remover.py
@@ -0,0 +1,76 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import re
+
+from markdown import Markdown
+
+from pipecat.frames.frames import Frame, TextFrame
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+
+class MarkdownRemovalProcessor(FrameProcessor):
+    """Remove Markdown formatting from the text of TextFrames.
+
+    Text is converted to HTML and the tags are stripped; leading/trailing
+    spaces are kept, and stray asterisks and table pipes are removed.
+    Note: "§" is used internally as a marker, so a literal "§" becomes " ".
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._md = Markdown()  # shared converter, reset before each use
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Push TextFrames with cleaned text; forward every other frame unchanged."""
+        await super().process_frame(frame, direction)
+        if isinstance(frame, TextFrame):
+            # Only the text is carried over into the replacement frame
+            frame = TextFrame(text=self._remove_markdown(frame.text))
+        await self.push_frame(frame, direction)  # keep the incoming direction
+
+    def _remove_markdown(self, markdown_string: str) -> str:
+        """Return markdown_string as plain text with Markdown syntax removed."""
+        # Replace newlines with spaces to handle cases with leading newlines
+        markdown_string = markdown_string.replace("\n", " ")
+
+        # Protect a leading "1. "-style prefix with a unique marker, §NUM§
+        markdown_string = re.sub(r"^(\d+\.)\s", r"§NUM§\1 ", markdown_string)
+
+        # Preserve leading/trailing spaces with a unique marker, §
+        # Critical for word-by-word streaming in bot-tts-text
+        preserved_markdown = re.sub(
+            r"^( +)|\s+$", lambda m: "§" * len(m.group(0)), markdown_string, flags=re.MULTILINE
+        )
+
+        # Convert markdown to HTML with the converter built in __init__;
+        # reset() clears state left behind by the previous conversion
+        html = self._md.reset().convert(preserved_markdown)
+
+        # Remove HTML tags
+        text = re.sub("<[^<]+?>", "", html)
+
+        # Replace HTML entities (&amp; last so "&amp;lt;" is not decoded twice)
+        text = text.replace("&nbsp;", " ")
+        text = text.replace("&lt;", "<")
+        text = text.replace("&gt;", ">")
+        text = text.replace("&amp;", "&")
+
+        # Remove leading/trailing asterisks
+        # Necessary for bot-tts-text, as they appear as literal asterisks
+        text = re.sub(r"^\*{1,2}|\*{1,2}$", "", text)
+
+        # Remove Markdown table formatting
+        text = re.sub(r"\|", "", text)
+        text = re.sub(r"^\s*[-:]+\s*$", "", text, flags=re.MULTILINE)
+
+        # Drop the §NUM§ marker; the number prefix it protected stays in place
+        text = text.replace("§NUM§", "")
+
+        # Restore leading and trailing spaces
+        text = re.sub("§", " ", text)
+
+        return text