From 788aec665bf7de0333e56b05fa5787a1d9955578 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 10 Oct 2024 00:47:54 -0400 Subject: [PATCH] Add a new processor which removes markdown and special chars from TTS text --- CHANGELOG.md | 4 + pyproject.toml | 1 + src/pipecat/processors/text/__init__.py | 0 .../processors/text/markdown_remover.py | 76 +++++++++++++++++++ 4 files changed, 81 insertions(+) create mode 100644 src/pipecat/processors/text/__init__.py create mode 100644 src/pipecat/processors/text/markdown_remover.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d232ae79..cc1622fc6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `MarkdownRemovalProcessor`. This processor removes markdown formatting + from a TextFrame. It's intended to be used between the LLM and TTS in order + to remove markdown from the text the TTS speaks. + - Added new `RTVIUserLLMTextProcessor`. This processor will send an RTVI `user-llm-text` message with the user content's that was sent to the LLM. 
diff --git a/pyproject.toml b/pyproject.toml
index 9d515c8f9..5fd5d6790 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ classifiers = [
 ]
 dependencies = [
     "aiohttp~=3.10.3",
+    "Markdown~=3.7",
     "numpy~=1.26.4",
     "loguru~=0.7.2",
     "Pillow~=10.4.0",
diff --git a/src/pipecat/processors/text/__init__.py b/src/pipecat/processors/text/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/pipecat/processors/text/markdown_remover.py b/src/pipecat/processors/text/markdown_remover.py
new file mode 100644
index 000000000..3c08d5fb2
--- /dev/null
+++ b/src/pipecat/processors/text/markdown_remover.py
@@ -0,0 +1,99 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import html
+import re
+
+from markdown import Markdown
+
+from pipecat.frames.frames import Frame, TextFrame
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+
+class MarkdownRemovalProcessor(FrameProcessor):
+    """Strip Markdown formatting from the text carried by TextFrames.
+
+    Each TextFrame's text is rendered to HTML with Python-Markdown, the HTML
+    tags are removed and the entities decoded, and a new TextFrame with the
+    plain text is pushed in the frame's original direction. All other frames
+    are forwarded unchanged.
+
+    Newlines are flattened to spaces, while leading and trailing spaces are
+    preserved, which matters when text is streamed word by word.
+    Leading/trailing asterisks and table formatting are removed as well.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # One converter for the processor's lifetime; _remove_markdown()
+        # resets it before every conversion.
+        self._md = Markdown()
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Clean TextFrames and forward every other frame unchanged.
+
+        Args:
+            frame: The frame to process.
+            direction: The direction the frame is travelling in; the outgoing
+                frame is pushed in the same direction.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, TextFrame):
+            cleaned_text = self._remove_markdown(frame.text)
+            await self.push_frame(TextFrame(text=cleaned_text), direction)
+        else:
+            await self.push_frame(frame, direction)
+
+    def _remove_markdown(self, markdown_string: str) -> str:
+        """Return markdown_string as plain text with Markdown syntax removed.
+
+        Args:
+            markdown_string: Text that may contain Markdown formatting.
+
+        Returns:
+            The plain text, with leading and trailing spaces kept.
+        """
+        # Replace newlines with spaces to handle cases with leading newlines
+        markdown_string = markdown_string.replace("\n", " ")
+
+        # Preserve a numbered list item at the start of the text with a
+        # unique marker, §NUM§
+        markdown_string = re.sub(r"^(\d+\.)\s", r"§NUM§\1 ", markdown_string)
+
+        # Preserve leading/trailing spaces with a unique marker, §
+        # Critical for word-by-word streaming in bot-tts-text
+        preserved_markdown = re.sub(
+            r"^( +)|\s+$", lambda m: "§" * len(m.group(0)), markdown_string, flags=re.MULTILINE
+        )
+
+        # Convert markdown to HTML; reset the shared converter first so no
+        # state from a previous conversion carries over
+        self._md.reset()
+        rendered_html = self._md.convert(preserved_markdown)
+
+        # Remove HTML tags
+        text = re.sub("<[^<]+?>", "", rendered_html)
+
+        # Decode HTML entities back to their characters, then turn
+        # non-breaking spaces into regular spaces
+        text = html.unescape(text).replace("\u00a0", " ")
+
+        # Remove leading/trailing asterisks
+        # Necessary for bot-tts-text, as they appear as literal asterisks
+        text = re.sub(r"^\*{1,2}|\*{1,2}$", "", text)
+
+        # Remove Markdown table formatting
+        text = text.replace("|", "")
+        text = re.sub(r"^\s*[-:]+\s*$", "", text, flags=re.MULTILINE)
+
+        # Restore numbered list items
+        text = text.replace("§NUM§", "")
+
+        # Restore leading and trailing spaces
+        text = text.replace("§", " ")
+
+        return text