Skip to content

Commit

Permalink
add full-width punctuations as end of the sentence
Browse files Browse the repository at this point in the history
  • Loading branch information
duyalei committed Sep 23, 2024
1 parent 9a4e749 commit 4533ed0
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 1 deletion.
3 changes: 2 additions & 1 deletion src/pipecat/utils/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
(?<!Mr|Ms|Dr) # Negative lookbehind: not preceded by Mr, Ms, Dr (combined bc. length is the same)
(?<!Mrs) # Negative lookbehind: not preceded by "Mrs"
(?<!Prof) # Negative lookbehind: not preceded by "Prof"
[\.\?\!:] # Match a period, question mark, exclamation point, or colon
[\.\?\!:;]| # Match a period, question mark, exclamation point, colon, or semicolon
[。?!:;] # the full-width version (mainly used in East Asian languages such as Chinese)
$ # End of string
"""
ENDOFSENTENCE_PATTERN = re.compile(ENDOFSENTENCE_PATTERN_STR, re.VERBOSE)
Expand Down
13 changes: 13 additions & 0 deletions tests/test_ai_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ async def test_endofsentence(self):
assert match_endofsentence("This is a sentence! ")
assert match_endofsentence("This is a sentence?")
assert match_endofsentence("This is a sentence:")
assert match_endofsentence("This is a sentence;")
assert not match_endofsentence("This is not a sentence")
assert not match_endofsentence("This is not a sentence,")
assert not match_endofsentence("This is not a sentence, ")
Expand All @@ -43,6 +44,18 @@ async def test_endofsentence(self):
assert not match_endofsentence("America, or the U.") # U.S.A.
assert not match_endofsentence("It still early, it's 3:00 a.") # 3:00 a.m.

async def test_endofsentence_zh(self):
    """Check end-of-sentence detection for text ending in full-width punctuation."""
    # Every sample ends with a full-width terminator: 。 ! ? ; :
    terminated_samples = ("你好。", "你好!", "吃了吗?", "安全第一;", "他说:")
    for sentence in terminated_samples:
        assert match_endofsentence(sentence)

    # A trailing full-width comma must not be treated as a sentence end.
    assert not match_endofsentence("你好,")


# Run the unittest runner when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 4533ed0

Please sign in to comment.