From cc305b36ce2704f79d0b607a8eb519a71326d418 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dominik=20Mach=C3=A1=C4=8Dek?=
Date: Wed, 27 Sep 2023 23:29:50 +0200
Subject: [PATCH] segmenters for all Whisper languages

---
 README.md         | 23 +++++++++++++++++------
 whisper_online.py | 28 ++++++++++++++++++++++++++--
 2 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index ffcaa97..9e60d0d 100644
--- a/README.md
+++ b/README.md
@@ -14,19 +14,30 @@ Demo video: https://player.vimeo.com/video/840442741
 
 ## Installation
 
-This code work with two kinds of backends. Both require
+1) ``pip install librosa`` -- audio processing library
 
-```
-pip install librosa
-pip install opus-fast-mosestokenizer
-```
+2) Whisper backend.
 
-The most recommended backend is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install with `pip install faster-whisper`.
+Two alternative backends are integrated. The most recommended one is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install with `pip install faster-whisper`.
 
 Alternative, less restrictive, but slower backend is [whisper-timestamped](https://github.com/linto-ai/whisper-timestamped): `pip install git+https://github.com/linto-ai/whisper-timestamped`
 
 The backend is loaded only when chosen. The unused one does not have to be installed.
 
+3) Sentence segmenter (aka sentence tokenizer)
+
+It splits punctuated text into sentences by full stops, avoiding the dots that are not full stops. The segmenters are language-specific.
+The unused ones do not have to be installed. We integrate the following segmenters, but suggestions for better alternatives are welcome.
+
+- `pip install opus-fast-mosestokenizer` for the languages with codes `as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh`
+
+- `pip install tokenize_uk` for Ukrainian -- `uk`
+
+- for other languages, we integrate a well-performing multilingual model of `wtpsplit`. It requires `pip install torch wtpsplit`, and its neural model `wtp-canine-s-12l-no-adapters`. It is downloaded to the default huggingface cache during the first use.
+
+- we did not find a segmenter for the languages `as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt` that are supported by Whisper but not by wtpsplit. The default fallback option for them is wtpsplit with unspecified language. Alternative suggestions are welcome.
+
+
 ## Usage
 
 ### Realtime simulation from audio file
diff --git a/whisper_online.py b/whisper_online.py
index 26fe6db..040133c 100644
--- a/whisper_online.py
+++ b/whisper_online.py
@@ -416,16 +416,40 @@ def to_flush(self, sents, sep=None, offset=0, ):
         e = offset + sents[-1][1]
         return (b,e,t)
 
+WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(",")
 
 def create_tokenizer(lan):
+    """returns an object that has a split function that works like the one of MosesTokenizer"""
+
+    assert lan in WHISPER_LANG_CODES, "language must be Whisper's supported lang code: " + " ".join(WHISPER_LANG_CODES)
+
     if lan == "uk":
         import tokenize_uk
         class UkrainianTokenizer:
             def split(self, text):
                 return tokenize_uk.tokenize_sents(text)
         return UkrainianTokenizer()
-    from mosestokenizer import MosesTokenizer
-    return MosesTokenizer(lan)
+
+    # supported by fast-mosestokenizer
+    if lan in "as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh".split():
+        from mosestokenizer import MosesTokenizer
+        return MosesTokenizer(lan)
+
+    # the following languages are in Whisper, but not in wtpsplit:
+    if lan in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split():
+        print(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.", file=sys.stderr)
+        lan = None
+
+    from wtpsplit import WtP
+    # downloads the model from huggingface on the first use
+    wtp = WtP("wtp-canine-s-12l-no-adapters")
+    class WtPtok:
+        def split(self, sent):
+            return wtp.split(sent, lang_code=lan)
+    return WtPtok()
+
+
 
 ## main:
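
For reference, a minimal usage sketch of the new helper (not part of the patch): it assumes `whisper_online.py` is importable as a module and that the segmenter for the chosen language code is installed as described in the README changes above. The sample text and the code `en` are illustrative; the sentence-level output relies on the docstring's statement that every returned object exposes a MosesTokenizer-like `split()`. Which segmenter actually serves the call is decided by the language lists inside `create_tokenizer`.

```python
# Illustrative sketch only -- not part of the patch.
# Assumes whisper_online.py is on the import path and the segmenter for the
# chosen language code is installed (see the README section above).
from whisper_online import create_tokenizer, WHISPER_LANG_CODES

lang = "en"                      # any code from WHISPER_LANG_CODES
assert lang in WHISPER_LANG_CODES

tokenizer = create_tokenizer(lang)
# The returned object exposes split(), which should yield one string per sentence.
sentences = tokenizer.split("This is the first sentence. And here is the second one.")
print(sentences)
```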