Skip to content

Commit

Permalink
Merge pull request #565 from pipecat-ai/mb/add-markdown-remover
Browse files Browse the repository at this point in the history
Add a new processor which removes markdown and special chars from TTS text
  • Loading branch information
markbackman authored Oct 10, 2024
2 parents 3cada03 + 788aec6 commit e32e56d
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added `MarkdownRemovalProcessor`. This processor removes markdown formatting
from a TextFrame. It's intended to be used between the LLM and TTS in order
to remove markdown from the text the TTS speaks.

- Added new `RTVIUserLLMTextProcessor`. This processor will send an RTVI
`user-llm-text` message with the user content that was sent to the LLM.

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ classifiers = [
]
dependencies = [
"aiohttp~=3.10.3",
"Markdown~=3.7",
"numpy~=1.26.4",
"loguru~=0.7.2",
"Pillow~=10.4.0",
Expand Down
Empty file.
76 changes: 76 additions & 0 deletions src/pipecat/processors/text/markdown_remover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import re

from markdown import Markdown

from pipecat.frames.frames import Frame, TextFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class MarkdownRemovalProcessor(FrameProcessor):
    """Removes Markdown formatting from text in TextFrames.

    Converts Markdown to plain text while preserving the overall structure,
    including leading and trailing spaces. Handles special cases like
    asterisks and table formatting.

    Every ``TextFrame`` is replaced by a new ``TextFrame`` carrying the cleaned
    text; all other frames are forwarded untouched.
    """

    def __init__(self, **kwargs):
        """Create the processor.

        Args:
            **kwargs: Forwarded unchanged to ``FrameProcessor.__init__``.
        """
        super().__init__(**kwargs)
        # A single converter is kept for the lifetime of the processor and
        # reset before each conversion, instead of building one per frame.
        self._md = Markdown()

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Strip Markdown from text frames and forward everything else as-is.

        Args:
            frame: The frame to process.
            direction: The direction the frame is travelling; the outgoing
                frame is pushed in this same direction.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, TextFrame):
            cleaned_text = self._remove_markdown(frame.text)
            # Forward in the direction the frame arrived from, consistent with
            # how non-text frames are forwarded below.
            await self.push_frame(TextFrame(text=cleaned_text), direction)
        else:
            await self.push_frame(frame, direction)

    def _remove_markdown(self, markdown_string: str) -> str:
        """Return ``markdown_string`` with Markdown formatting removed.

        The text is rendered to HTML, the tags are stripped, and a few
        leftovers (entities, stray asterisks, table pipes and separator rows)
        are cleaned up. Leading and trailing whitespace of the input is kept
        as spaces in the result.

        Args:
            markdown_string: Text that may contain Markdown.

        Returns:
            The plain-text version of the input.
        """
        # Replace newlines with spaces to handle cases with leading newlines
        markdown_string = markdown_string.replace("\n", " ")

        # Preserve numbered list items with a unique marker, §NUM§, placed in
        # front of a leading "1. " style prefix. The marker is dropped again
        # after the conversion.
        markdown_string = re.sub(r"^(\d+\.)\s", r"§NUM§\1 ", markdown_string)

        # Preserve leading/trailing spaces with a unique marker, §
        # (one marker per whitespace character, so the count is retained).
        # Critical for word-by-word streaming in bot-tts-text
        preserved_markdown = re.sub(
            r"^( +)|\s+$",
            lambda m: "§" * len(m.group(0)),
            markdown_string,
            flags=re.MULTILINE,
        )

        # Convert markdown to HTML, clearing any state left over from the
        # previous conversion first.
        html = self._md.reset().convert(preserved_markdown)

        # Remove HTML tags
        text = re.sub(r"<[^<]+?>", "", html)

        # Replace HTML entities. "&amp;" is handled last so that an escaped
        # entity such as "&amp;lt;" becomes "&lt;" and is not decoded twice.
        text = text.replace("&nbsp;", " ")
        text = text.replace("&lt;", "<")
        text = text.replace("&gt;", ">")
        text = text.replace("&amp;", "&")

        # Remove leading/trailing asterisks
        # Necessary for bot-tts-text, as they appear as literal asterisks
        text = re.sub(r"^\*{1,2}|\*{1,2}$", "", text)

        # Remove Markdown table formatting: the column pipes, then any line
        # made up solely of "-" / ":" separator characters.
        text = re.sub(r"\|", "", text)
        text = re.sub(r"^\s*[-:]+\s*$", "", text, flags=re.MULTILINE)

        # Restore numbered list items
        text = text.replace("§NUM§", "")

        # Restore leading and trailing spaces
        text = re.sub("§", " ", text)

        return text

0 comments on commit e32e56d

Please sign in to comment.