-
Notifications
You must be signed in to change notification settings - Fork 399
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #565 from pipecat-ai/mb/add-markdown-remover
Add a new processor which removes markdown and special chars from TTS text
- Loading branch information
Showing
4 changed files
with
81 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
# | ||
# Copyright (c) 2024, Daily | ||
# | ||
# SPDX-License-Identifier: BSD-2-Clause | ||
# | ||
|
||
import re | ||
|
||
from markdown import Markdown | ||
|
||
from pipecat.frames.frames import Frame, TextFrame | ||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor | ||
|
||
|
||
class MarkdownRemovalProcessor(FrameProcessor):
    """Remove Markdown formatting from the text carried by TextFrames.

    Each TextFrame's text is rendered from Markdown to HTML, the HTML tags
    are stripped, and the remaining plain text is pushed on in a new
    TextFrame. Leading and trailing whitespace of the text is preserved
    (as spaces), which matters when text is streamed word by word. All other
    frames are forwarded unchanged.

    Limitation: the section sign ("§") is used internally as a placeholder,
    so a literal "§" in the incoming text comes out as a space.
    """

    def __init__(self, **kwargs):
        """Create the processor.

        Args:
            **kwargs: Forwarded unchanged to ``FrameProcessor.__init__``.
        """
        super().__init__(**kwargs)
        # One converter for the lifetime of the processor; it is reset before
        # each conversion in `_remove_markdown`.
        self._md = Markdown()

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Clean TextFrames and forward every frame in its original direction.

        Args:
            frame: The frame to process.
            direction: The direction the frame is travelling in the pipeline.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, TextFrame):
            cleaned_text = self._remove_markdown(frame.text)
            # Forward in the direction the frame arrived, like any other frame.
            await self.push_frame(TextFrame(text=cleaned_text), direction)
        else:
            await self.push_frame(frame, direction)

    def _remove_markdown(self, markdown_string: str) -> str:
        """Return ``markdown_string`` with Markdown formatting removed.

        Args:
            markdown_string: Text that may contain Markdown syntax.

        Returns:
            The plain text, with newlines turned into spaces and the original
            leading/trailing whitespace kept as spaces.
        """
        # Replace newlines with spaces to handle cases with leading newlines.
        markdown_string = markdown_string.replace("\n", " ")

        # Prefix a leading "1. "-style token with the marker §NUM§ so the
        # converter does not turn it into an HTML list and drop the number.
        markdown_string = re.sub(r"^(\d+\.)\s", r"§NUM§\1 ", markdown_string)

        # The converter strips surrounding whitespace, so stand in one "§"
        # per whitespace character and turn them back into spaces at the end.
        # Critical for word-by-word streaming in bot-tts-text.
        preserved_markdown = re.sub(
            r"^( +)|\s+$",
            lambda m: "§" * len(m.group(0)),
            markdown_string,
            flags=re.MULTILINE,
        )

        # Convert Markdown to HTML. reset() clears state left over from the
        # previous conversion so the shared instance can be reused safely.
        html = self._md.reset().convert(preserved_markdown)

        # Remove HTML tags.
        text = re.sub(r"<[^<]+?>", "", html)

        # Decode the HTML entities emitted by the converter. "&amp;" must be
        # handled last so that an escaped entity such as "&amp;lt;" ends up as
        # the literal text "&lt;" instead of being decoded twice.
        text = text.replace("&nbsp;", " ")
        text = text.replace("&lt;", "<")
        text = text.replace("&gt;", ">")
        text = text.replace("&amp;", "&")

        # Remove leading/trailing asterisks (an emphasis marker split across
        # streamed chunks is left by the converter as a literal asterisk).
        text = re.sub(r"^\*{1,2}|\*{1,2}$", "", text)

        # Remove Markdown table formatting: cell separators, then any line
        # consisting only of dashes/colons (the header separator row).
        text = re.sub(r"\|", "", text)
        text = re.sub(r"^\s*[-:]+\s*$", "", text, flags=re.MULTILINE)

        # Drop the numbered-list marker, leaving the original "N." prefix.
        text = text.replace("§NUM§", "")

        # Restore leading and trailing spaces.
        text = text.replace("§", " ")

        return text