From 5f377f8c937f6dbcfd04b1c570e2268fc16df58c Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Wed, 8 Feb 2023 17:32:24 +0800 Subject: [PATCH 01/15] add .m4a support | add default config as 'zh' --- .gitignore | 3 +++ "app/01_\360\237\217\240_Home.py" | 2 +- app/config.py | 5 ++++- forward_port.sh | 2 ++ 4 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 forward_port.sh diff --git a/.gitignore b/.gitignore index 0f8643d..01bcc6a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,9 @@ data/ __pycache__/ *.py[cod] *$py.class +.ipynb_checkpoints/ +*/.ipynb_checkpoints/ +*/*/.ipynb_checkpoints/ # C extensions *.so diff --git "a/app/01_\360\237\217\240_Home.py" "b/app/01_\360\237\217\240_Home.py" index 5f82039..773e758 100644 --- "a/app/01_\360\237\217\240_Home.py" +++ "b/app/01_\360\237\217\240_Home.py" @@ -46,7 +46,7 @@ def get_formatted_date(date_str: str) -> str: youtube_url = st.text_input("Youtube video or playlist URL") elif source_type == "Upload": input_files = st.file_uploader( - "Add one or more files", type=["mp4", "avi", "mov", "mkv", "mp3", "wav"], accept_multiple_files=True + "Add one or more files", type=["mp4", "avi", "mov", "mkv", "mp3", "wav","m4a"], accept_multiple_files=True ) add_media = st.form_submit_button(label="Add Media!") diff --git a/app/config.py b/app/config.py index 1fb5fcb..840339a 100644 --- a/app/config.py +++ b/app/config.py @@ -24,11 +24,14 @@ "whisper_model": "base", "temperature": 0.0, "temperature_increment_on_fallback": 0.2, - "no_speech_threshold": 0.6, + "no_speech_threshold": 0.45, "logprob_threshold": -1.0, "compression_ratio_threshold": 2.4, "condition_on_previous_text": True, "verbose": False, + "language": 'zh', + "fp16": False, + "without_timestamps" : False } WHISPER_SETTINGS_FILE = DATA_DIR / ".whisper_settings.json" diff --git a/forward_port.sh b/forward_port.sh new file mode 100644 index 0000000..e8c12c0 --- /dev/null +++ b/forward_port.sh @@ -0,0 +1,2 @@ +ngrok config add-authtoken 2K2R6X6NeJLfxeNPpd2UrQ6JLnL_C3Ab37DESTVJDwyyGNB6 +ngrok http 8501 \ No newline at end of file From 60e4bf217ddc140b21b7cd89a7476264b2c9302c Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Wed, 8 Feb 2023 23:08:21 +0800 Subject: [PATCH 02/15] change config --- app/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/config.py b/app/config.py index 840339a..6ea9250 100644 --- a/app/config.py +++ b/app/config.py @@ -21,10 +21,10 @@ # -------------- # Default settings WHISPER_DEFAULT_SETTINGS = { - "whisper_model": "base", + "whisper_model": "medium", "temperature": 0.0, "temperature_increment_on_fallback": 0.2, - "no_speech_threshold": 0.45, + "no_speech_threshold": 0.4, "logprob_threshold": -1.0, "compression_ratio_threshold": 2.4, "condition_on_previous_text": True, From 9de99ee7f653113a6a2e744f5626df70f2e09154 Mon Sep 17 00:00:00 2001 From: Shu-Yu Huang Date: Thu, 9 Feb 2023 09:50:46 +0800 Subject: [PATCH 03/15] Delete forward_port.sh --- forward_port.sh | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 forward_port.sh diff --git a/forward_port.sh b/forward_port.sh deleted file mode 100644 index e8c12c0..0000000 --- a/forward_port.sh +++ /dev/null @@ -1,2 +0,0 @@ -ngrok config add-authtoken 2K2R6X6NeJLfxeNPpd2UrQ6JLnL_C3Ab37DESTVJDwyyGNB6 -ngrok http 8501 \ No newline at end of file From b82f5e0e6d6a73c789e84c415ab6f87491d875f6 Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Thu, 9 Feb 2023 10:10:31 +0800 Subject: [PATCH 04/15] add language setting --- "app/pages/02_\342\232\231\357\270\217_Settings.py" | 6 ++++++ 1 file changed, 6 insertions(+) diff --git "a/app/pages/02_\342\232\231\357\270\217_Settings.py" "b/app/pages/02_\342\232\231\357\270\217_Settings.py" index e152353..9a6546f 100644 --- "a/app/pages/02_\342\232\231\357\270\217_Settings.py" +++ "b/app/pages/02_\342\232\231\357\270\217_Settings.py" @@ -52,6 +52,11 @@ condition_on_previous_text = st.checkbox( "Condition on previous text", value=st.session_state.whisper_params["condition_on_previous_text"] ) + language_options = ['en', 'zh', 'de', 'es', 'ru', 'ko', 'fr', 'ja', 'pt', 'tr', 'pl', 'ca', 'nl', 'ar', 'sv', 'it', 'id', 'hi', 'fi', 'vi', 'he', 'uk', 'el', 'ms', 'cs', 'ro', 'da', 'hu', 'ta', 'no', 'th', 'ur', 'hr', 'bg', 'lt', 'la', 'mi', 'ml', 'cy', 'sk', 'te', 'fa', 'lv', 'bn', 'sr', 'az', 'sl', 'kn', 'et', 'mk', 'br', 'eu', 'is', 'hy', 'ne', 'mn', 'bs', 'kk', 'sq', 'sw', 'gl', 'mr', 'pa', 'si', 'km', 'sn', 'yo', 'so', 'af', 'oc', 'ka', 'be', 'tg', 'sd', 'gu', 'am', 'yi', 'lo', 'uz', 'fo', 'ht', 'ps', 'tk', 'nn', 'mt', 'sa', 'lb', 'my', 'bo', 'tl', 'mg', 'as', 'tt', 'haw', 'ln', 'ha', 'ba', 'jw', 'su'] + selected_language = language_options.index(st.session_state.whisper_params["language"]) + language = st.selectbox( + "Language", options=language_options, index=selected_language + ) verbose = st.checkbox("Verbose", value=st.session_state.whisper_params["verbose"]) save_settings = st.form_submit_button(label="💾 Save settings") @@ -68,6 +73,7 @@ "compression_ratio_threshold": compression_ratio_threshold, "condition_on_previous_text": condition_on_previous_text, "verbose": verbose, + "language": language } # Commit to session & disk st.session_state.whisper_params = updated_whisper_settings From 293816d15190d6020c3ac9b9403a566cca747a86 Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Thu, 9 Feb 2023 10:18:24 +0800 Subject: [PATCH 05/15] remove privacy thing --- .gitignore | 1 + forward_port.sh | 3 +++ 2 files changed, 4 insertions(+) create mode 100644 forward_port.sh diff --git a/.gitignore b/.gitignore index 01bcc6a..120318b 100644 --- a/.gitignore +++ b/.gitignore @@ -222,3 +222,4 @@ tags .idea/ .pytest_cache/ +*_my_* diff --git a/forward_port.sh b/forward_port.sh new file mode 100644 index 0000000..0b785a6 --- /dev/null +++ b/forward_port.sh @@ -0,0 +1,3 @@ +# please get your own token from ngrok website: https://dashboard.ngrok.com/get-started/your-authtoken +ngrok config add-authtoken YOUR_NGROK_TOKEN +ngrok http 8501 \ No newline at end of file From cd993ac097fc9328c1350c6d0a9a5c0b283a22cc Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Thu, 9 Feb 2023 16:13:56 +0800 Subject: [PATCH 06/15] added download buttom | change README.md --- CHANGELOG.md | 29 ---------------- README.md | 58 ++++++++++++++++++++++++++----- "app/01_\360\237\217\240_Home.py" | 7 ++++ app/config.py | 10 +++--- install_whisper.txt | 9 +++++ requirements.txt | 1 + 6 files changed, 72 insertions(+), 42 deletions(-) delete mode 100644 CHANGELOG.md create mode 100644 install_whisper.txt diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 9a290f7..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,29 +0,0 @@ - -## Changelog -All notable changes to this project will be documented in this file. - -### `v1.0.0a` (2023-02-07) -Since there was some apetite for this, I've rewritten this to make it a tad cleaner with a few additional features based on issues raised and personal preferences. -1. Ability to download entire YouTube playlists and upload multiple files at once -2. Ability browse, filter, and search through saved audio files (For now, this is done with a simple SQLite database & SQLAlchemy ORM) -3. Auto-export of transcriptions in multiple formats (was a feature request) -4. Simple substring based search for transcript segments. This is done with a simple `LIKE` query on the SQLite database. -5. Fully reworked UI with a cleaner layout and more intuitive navigation. -6. Ability to save whisper configurations and reuse to prevent having to re-enter the same parameters every time. -7. Removed the ability to crop audio after download to simplify the codebase. Also, temporarily removed summarization until GPT-3 integration is complete. -### `v0.0.1` (2022-10-17) -Initial release for demand testing ([PR #1](https://github.com/hayabhay/whisper-ui/pull/1)). - -Features: -- Ability to process media from youtube & local files -- Whisper transcription -- Basic huggingface integration for summarization - - -## Roadmap -[Planned] - -1. Live Transcription with Whisper - Will [streamlit-webrtc](https://github.com/whitphx/streamlit-webrtc) library. This enables live transcription of audio from a microphone and can be used to take voice notes. -3. CLIP embeddings transcribed text segments + Faiss index for semantic search -2. GPT-3 integration - One approach is to simply allow for an instruct prompt to be entered for a transcript and save results. Will await feedback before implementing. -4. ... diff --git a/README.md b/README.md index 05c1e10..7fd8f73 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,30 @@ -# Streamlit UI for OpenAI's Whisper +# Sutitle generation using OpenAI's Whisper This is a simple [Streamlit UI](https://streamlit.io/) for [OpenAI's Whisper speech-to-text model](https://openai.com/blog/whisper/). -It let's you download and transcribe media from YouTube videos, playlists, or local files. +It let's you download and transcribe media from YouTube videos, playlists, or local files with specific settings. You can then browse, filter, and search through your saved audio files. Feel free to raise an issue for bugs or feature requests or send a PR. +這是一個簡單的 [Streamlit UI](https://streamlit.io/) ,用於 [OpenAI的Whisper](https://openai.com/blog/whisper/) 語音轉文字模型。 +它允許您從YouTube視頻、播放列表或本地文件下載和轉錄媒體 (一個檔案限制為200MB)。 +然後,您可以瀏覽、過濾和搜索您保存的音頻文件。隨時歡迎提出錯誤或功能要求,或發送 PR。 + https://user-images.githubusercontent.com/6735526/216852681-53b6c3db-3e74-4c86-806f-6f6774a9003a.mp4 ## Setup -This was built & tested on Python 3.11 but should also work on Python 3.9+ as with the original [Whisper repo](https://github.com/openai/whisper)). +This was built & tested on Python 3.11 but should also work on Python 3.8+ as with the original [Whisper repo](https://github.com/openai/whisper)). You'll need to install `ffmpeg` on your system. Then, install the requirements with `pip`. ``` -sudo apt install ffmpeg +# Install pytorch if you don't have it +# sudo conda install pytorch pip install -r requirements.txt +pip install git+https://github.com/openai/whisper.git ``` + ## Usage -Once you're set up, you can run the app with: +1. Once you're set up, you can run the app with: ``` streamlit run app/01_🏠_Home.py @@ -25,9 +32,44 @@ streamlit run app/01_🏠_Home.py This will open a new tab in your browser with the app. You can then select a YouTube URL or local file & click "Run Whisper" to run the model on the selected media. +If the tab doesn't open, please use the URL: ```https://localhost:8501``` in your browser. + +2. If you are not satisfied with the output, click on '⚙️Settings' on the left, then you can fine-tune the inference of Whisper model. + +Important ⚙️Settings F.Y.I : +- Model: the model branch you want to use, defult: ```medium``` +- language: the language of transcription, default: ```zh``` (中文) +- No Speech Threshold: how strictly we are in excluding non-speech detection, default ```0.4``` (lower level are more strict) +- Condition on previous text: whether the model will be affected by last text, default: ```True``` + + +## Hosting + +If you want to host it on a server with dynamic IP, you can install ```ngrok``` for forwarding your IP out. +So you can access it anywhere via a random url like: ```https://b9f1-458-19-17-41.jp.ngrok.io``` + + +🔥You can try our demo [here](https://whispersubtitle.aiacademy.tw) + +Special thanks to [](https://en.aiacademy.tw/) for the server. + ## Changelog -All notable changes to this project alongside potential feature roadmap will be documented [in this file](CHANGELOG.md). +See [Commits](https://github.com/ShuYuHuang/whisper-subtitle/commits) for detailed changes. + +Version summary will be provided in [Release](https://github.com/ShuYuHuang/whisper-subtitle/releases). + +The changelog of the original vertion can be found [in this file](https://github.com/hayabhay/whisper-ui/blob/main/CHANGELOG.md). ## License -Whisper is licensed under [MIT](https://github.com/openai/whisper/blob/main/LICENSE) while Streamlit is licensed under [Apache 2.0](https://github.com/streamlit/streamlit/blob/develop/LICENSE). -Everything else is licensed under [MIT](https://github.com/hayabhay/whisper-ui/blob/main/LICENSE). +- Whisper: [MIT](https://github.com/openai/whisper/blob/main/LICENSE) +- Streamlit: [Apache 2.0](https://github.com/streamlit/streamlit/blob/develop/LICENSE). +- else: [MIT](https://github.com/hayabhay/whisper-ui/blob/main/LICENSE). + +## Reference +I forked the original version of the interfaces form https://github.com/hayabhay/whisper-ui. + +They actually did a great job for forming a manage systme of subtitles: search engine, transcript viewer, settings + +The original version aims to demonstrate the power of Whisper, especially for short films in youtube for local use. + +My goal is to provide a service for a bunch of clinets to make subtitles for long videos like meeting records, courses and movies. \ No newline at end of file diff --git "a/app/01_\360\237\217\240_Home.py" "b/app/01_\360\237\217\240_Home.py" index 773e758..db93267 100644 --- "a/app/01_\360\237\217\240_Home.py" +++ "b/app/01_\360\237\217\240_Home.py" @@ -168,6 +168,13 @@ def get_formatted_date(date_str: str) -> str: if st.button("🗑️ Delete", key=f"delete-{media['id']}"): media_manager.delete(media["id"]) st.experimental_rerun() + + filename=f'{Path(media["filepath"]).parent / "transcript"}.srt' + + with open(filename, "rb") as file: + if st.download_button("📦 Download Subtitle", file, file_name="transcript.srt"): + st.experimental_rerun() + with media_col: # Render the media diff --git a/app/config.py b/app/config.py index 6ea9250..1647911 100644 --- a/app/config.py +++ b/app/config.py @@ -55,22 +55,22 @@ def get_whisper_settings(): # Common page configurations # -------------------------- ABOUT = """ -### Whisper UI +### Whisper Subtitle -This is a simple wrapper around Whisper to save, browse & search through transcripts. +This is a simple wrapper around Whisper to save, browse & search through transcripts for movie subtitles. -Please report any bugs or issues on [Github](https://github.com/hayabhay/whisper-ui/). Thanks! +Please report any bugs or issues on [Github](https://github.com/ShuYuHuang/whisper-subtitle/). Thanks! """ def get_page_config(page_title_prefix="", layout="wide"): return { - "page_title": f"{page_title_prefix}Whisper UI", + "page_title": f"{page_title_prefix}Whisper Subtitle", "page_icon": "🤖", "layout": layout, "menu_items": { "Get Help": "https://twitter.com/hayabhay", - "Report a bug": "https://github.com/hayabhay/whisper-ui/issues", + "Report a bug": "https://github.com/ShuYuHuang/whisper-subtitle/issues", "About": ABOUT, }, } diff --git a/install_whisper.txt b/install_whisper.txt new file mode 100644 index 0000000..6cfe446 --- /dev/null +++ b/install_whisper.txt @@ -0,0 +1,9 @@ +# Install pytorch if you don't have it +# sudo conda install pytorch +pip install -r requirements.txt +pip install git+https://github.com/openai/whisper.git + +# Install ngrok +wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz +sudo tar xvzf ngrok-v3-stable-linux-amd64.tgz -C /usr/local/bin +rm -f ngrok-v3-stable-linux-amd64.tgz \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9aa4706..8da3355 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ # -------------------- # Verify your cuda version to make sure the right torch gets installed # You can pass --extra-index-url https://download.pytorch.org/whl/cu116 (or whatever your version is) +transformers openai-whisper==20230124 # https://github.com/openai/whisper # Backend # -------------------- From f50f1378d9ccd27db3f1ff3ee77f28f2ba095d2d Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Thu, 9 Feb 2023 17:03:49 +0800 Subject: [PATCH 07/15] add public url and permanent url usage --- README.md | 15 +++++++++++++ inspect_url.py | 1 + permanent_url.py | 38 ++++++++++++++++++++++++++++++++ forward_port.sh => public_url.sh | 3 ++- 4 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 inspect_url.py create mode 100644 permanent_url.py rename forward_port.sh => public_url.sh (83%) diff --git a/README.md b/README.md index 7fd8f73..f0bad43 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,21 @@ Important ⚙️Settings F.Y.I : If you want to host it on a server with dynamic IP, you can install ```ngrok``` for forwarding your IP out. So you can access it anywhere via a random url like: ```https://b9f1-458-19-17-41.jp.ngrok.io``` +1. Register for an account and get your own token from ngrok website: https://dashboard.ngrok.com/get-started/your-authtoken +2. Install NGROK +``` +wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz +sudo tar xvzf ngrok-v3-stable-linux-amd64.tgz -C /usr/local/bin +rm ngrok-v3-stable-linux-amd64.tgz +``` +4. Put your own ngrok token from [ngrok website](https://dashboard.ngrok.com/get-started/your-authtoken) to ```forward_port.sh``` +5. Expose your url to the public with ```bash forward_port.sh``` +6. Inspect the random url by ```python inspect_url.py ``` and use the url in your browser + +🚧 Long term hosting with a permanent url +1. Prepare a permanent url and an API for forwarding +2. Forwarded the random url to the given url by API each 10 min +3. Do it again everytimg random url changes or the server died. 🔥You can try our demo [here](https://whispersubtitle.aiacademy.tw) diff --git a/inspect_url.py b/inspect_url.py new file mode 100644 index 0000000..dd8cacd --- /dev/null +++ b/inspect_url.py @@ -0,0 +1 @@ +import requests; print(requests.get("http://localhost:4040/api/tunnels").json()['tunnels'][0]['public_url']) \ No newline at end of file diff --git a/permanent_url.py b/permanent_url.py new file mode 100644 index 0000000..09af151 --- /dev/null +++ b/permanent_url.py @@ -0,0 +1,38 @@ +import requests +import os +import time + +PERM_URL='https://your.url.com' +my_url=None +new_url='' + +# you can check if the service is alive by: +# curl http://localhost:4040/api/tunnels + +def get_ngrok_url(): + response=requests.get("http://localhost:4040/api/tunnels") + return response.json()['tunnels'][0]['public_url'] + +# forword the curent url to the permanet url +def renew_forward(url): + requests.post(f"{PERM_URL}/?new_url={url}") + +if __name__ == '__main__': + while 1: + # get the url if possible + try: + new_url = get_ngrok_url() + except: + # open the host if ngrok is not started + os.system("bash public_url.sh") + time.sleep(1) + new_url = get_ngrok_url() + + # if the url changed + if (not my_url) | (my_url != new_url): + # update url record + my_url = new_url + # forward the url to PERM_URL + renew_forward(my_url) + + time.sleep(600) diff --git a/forward_port.sh b/public_url.sh similarity index 83% rename from forward_port.sh rename to public_url.sh index 0b785a6..ea84485 100644 --- a/forward_port.sh +++ b/public_url.sh @@ -1,3 +1,4 @@ # please get your own token from ngrok website: https://dashboard.ngrok.com/get-started/your-authtoken + ngrok config add-authtoken YOUR_NGROK_TOKEN -ngrok http 8501 \ No newline at end of file +ngrok http 8501 > /dev/null & \ No newline at end of file From c0c813430c16ef72053f46815febd73684fc136a Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Fri, 10 Feb 2023 15:00:55 +0800 Subject: [PATCH 08/15] higher capability --- "app/01_\360\237\217\240_Home.py" | 12 ++++++++++-- app/config.py | 8 ++++---- app/core.py | 4 ++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git "a/app/01_\360\237\217\240_Home.py" "b/app/01_\360\237\217\240_Home.py" index db93267..d00f087 100644 --- "a/app/01_\360\237\217\240_Home.py" +++ "b/app/01_\360\237\217\240_Home.py" @@ -63,6 +63,7 @@ def get_formatted_date(date_str: str) -> str: source = input_files else: st.error("Please upload files") + # Lowercase the source type source_type = source_type.lower() @@ -236,7 +237,7 @@ def get_formatted_date(date_str: str) -> str: media = media_manager.get_detail(media_id=st.session_state.selected_media) # Render mini nav - back_col, del_col = st.sidebar.columns(2) + back_col, del_col, download_col = st.sidebar.columns(3) with back_col: # Add a button to show the list view if st.button("◀️   Back to list", key="back-to-list-main"): @@ -247,7 +248,14 @@ def get_formatted_date(date_str: str) -> str: media_manager.delete(media["id"]) st.session_state.list_mode = True st.experimental_rerun() - + + with download_col: + filename=f'{Path(media["filepath"]).parent / "transcript"}.srt' + + with open(filename, "rb") as file: + if st.download_button("📦 Download Subtitle", file, file_name="transcript.srt"): + st.experimental_rerun() + st.sidebar.write(f"""### {media["source_name"]}""") # Render the media. Use both audio & video for youtube diff --git a/app/config.py b/app/config.py index 1647911..22f91a6 100644 --- a/app/config.py +++ b/app/config.py @@ -55,7 +55,7 @@ def get_whisper_settings(): # Common page configurations # -------------------------- ABOUT = """ -### Whisper Subtitle +### 💬 Whisper Subtitle This is a simple wrapper around Whisper to save, browse & search through transcripts for movie subtitles. @@ -63,13 +63,13 @@ def get_whisper_settings(): """ -def get_page_config(page_title_prefix="", layout="wide"): +def get_page_config(page_title_prefix="💬", layout="wide"): return { "page_title": f"{page_title_prefix}Whisper Subtitle", - "page_icon": "🤖", + "page_icon": ":movie_camera:", "layout": layout, "menu_items": { - "Get Help": "https://twitter.com/hayabhay", + "Get Help": "https://github.com/ShuYuHuang", "Report a bug": "https://github.com/ShuYuHuang/whisper-subtitle/issues", "About": ABOUT, }, diff --git a/app/core.py b/app/core.py index cc7ac77..738ceac 100644 --- a/app/core.py +++ b/app/core.py @@ -16,7 +16,7 @@ # Whisper transcription functions # ---------------- -@lru_cache(maxsize=1) +@lru_cache(maxsize=3) def get_whisper_model(whisper_model: str): """Get a whisper model from the cache or download it if it doesn't exist""" model = whisper.load_model(whisper_model) @@ -40,7 +40,7 @@ def _transcribe(self, audio_path: str, whisper_model: str, **whisper_args): # Get whisper model # NOTE: If mulitple models are selected, this may keep all of them in memory depending on the cache size transcriber = get_whisper_model(whisper_model) - + # Set configs & transcribe if whisper_args["temperature_increment_on_fallback"] is not None: whisper_args["temperature"] = tuple( From 52604f3c7b9352d0e7979cf950ff9f593652b312 Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Wed, 15 Feb 2023 12:37:05 +0800 Subject: [PATCH 09/15] renew config, gitignore --- .gitignore | 3 +++ app/config.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 120318b..798a8f5 100644 --- a/.gitignore +++ b/.gitignore @@ -223,3 +223,6 @@ tags .pytest_cache/ *_my_* + +experiments +*/*_exp* diff --git a/app/config.py b/app/config.py index 22f91a6..3a0ff6c 100644 --- a/app/config.py +++ b/app/config.py @@ -30,7 +30,7 @@ "condition_on_previous_text": True, "verbose": False, "language": 'zh', - "fp16": False, + "fp16": True, "without_timestamps" : False } WHISPER_SETTINGS_FILE = DATA_DIR / ".whisper_settings.json" From 2205604f39721009904b49ae7c31a9d5ccd49ecf Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Tue, 21 Feb 2023 10:00:21 +0800 Subject: [PATCH 10/15] simple multi gpu for sub-models, add name for subtitle download --- "app/01_\360\237\217\240_Home.py" | 4 +- app/core.py | 70 ++++++++++++++++++++++++++----- 2 files changed, 62 insertions(+), 12 deletions(-) diff --git "a/app/01_\360\237\217\240_Home.py" "b/app/01_\360\237\217\240_Home.py" index d00f087..0558107 100644 --- "a/app/01_\360\237\217\240_Home.py" +++ "b/app/01_\360\237\217\240_Home.py" @@ -173,7 +173,7 @@ def get_formatted_date(date_str: str) -> str: filename=f'{Path(media["filepath"]).parent / "transcript"}.srt' with open(filename, "rb") as file: - if st.download_button("📦 Download Subtitle", file, file_name="transcript.srt"): + if st.download_button("📦 Download Subtitle", file, file_name=media["source_name"]+".srt"): st.experimental_rerun() @@ -253,7 +253,7 @@ def get_formatted_date(date_str: str) -> str: filename=f'{Path(media["filepath"]).parent / "transcript"}.srt' with open(filename, "rb") as file: - if st.download_button("📦 Download Subtitle", file, file_name="transcript.srt"): + if st.download_button("📦 Download Subtitle", file, file_name=media["source_name"]+".srt"): st.experimental_rerun() st.sidebar.write(f"""### {media["source_name"]}""") diff --git a/app/core.py b/app/core.py index 738ceac..dd16a8b 100644 --- a/app/core.py +++ b/app/core.py @@ -1,5 +1,6 @@ """Thin wrapper class to manage Media objects.""" import shutil +import time from datetime import datetime, timedelta from functools import lru_cache from pathlib import Path @@ -7,19 +8,53 @@ import ffmpeg import numpy as np + +import torch import whisper from config import MEDIA_DIR from db import ENGINE, Media, Segment, Transcript from pytube import Playlist, YouTube from sqlalchemy.orm import Session +N_GPUS = torch.cuda.device_count() + + +def ratio(x): + return (x[0])/x[1] +def check_gpu_ok(i): + return ratio(torch.cuda.mem_get_info(i)) > 0.5 +def return_gpu(): + for i in range(N_GPUS): + if check_gpu_ok(i): + return f'cuda:{i}' + break + elif i == N_GPUS-1: + print('!!!No space left!!!') + return f'cpu' + +N_MODELS = 10 +model_ok = [] +for i in range(N_MODELS): + # initial state flag + model_ok.append(True) + + # Whisper transcription functions # ---------------- -@lru_cache(maxsize=3) -def get_whisper_model(whisper_model: str): +@lru_cache(maxsize=10) +def get_whisper_model(whisper_model: str, model_id: int): """Get a whisper model from the cache or download it if it doesn't exist""" - model = whisper.load_model(whisper_model) + model_ok[model_id] = False + dev1 = return_gpu() + model = whisper.load_model(whisper_model, device="cpu") + + model.encoder.to(dev1) + dev2 = return_gpu() + model.decoder.to(dev2) + + model.decoder.register_forward_pre_hook(lambda _, inputs: tuple([inputs[0].to(dev2), inputs[1].to(dev2)] + list(inputs[2:]))) + model.decoder.register_forward_hook(lambda _, inputs, outputs: outputs.to(dev1)) return model @@ -39,8 +74,16 @@ def _transcribe(self, audio_path: str, whisper_model: str, **whisper_args): # Get whisper model # NOTE: If mulitple models are selected, this may keep all of them in memory depending on the cache size - transcriber = get_whisper_model(whisper_model) - + ok_flag = False + while not ok_flag: + for model_id in range(N_MODELS): + if model_ok[model_id]: + transcriber = get_whisper_model(whisper_model, model_id) + ok_flag = True + break + time.sleep(5) + print('All models are busy') + # Set configs & transcribe if whisper_args["temperature_increment_on_fallback"] is not None: whisper_args["temperature"] = tuple( @@ -50,11 +93,18 @@ def _transcribe(self, audio_path: str, whisper_model: str, **whisper_args): whisper_args["temperature"] = [whisper_args["temperature"]] del whisper_args["temperature_increment_on_fallback"] - - transcript = transcriber.transcribe( - audio_path, - **whisper_args, - ) + + ok_flag = False + while not ok_flag: + try: + transcript = transcriber.transcribe( + audio_path, + **whisper_args, + ) + model_ok[model_id] = True + ok_flag = True + except: + ok_flag = False return transcript From f786192d1fd4238c5a21eeddc2a6f87caa72d3d5 Mon Sep 17 00:00:00 2001 From: Shu-Yu Huang Date: Thu, 23 Feb 2023 10:41:19 +0800 Subject: [PATCH 11/15] Update README for Chinese tutorial | new target --- README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f0bad43..e92f136 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Feel free to raise an issue for bugs or feature requests or send a PR. 它允許您從YouTube視頻、播放列表或本地文件下載和轉錄媒體 (一個檔案限制為200MB)。 然後,您可以瀏覽、過濾和搜索您保存的音頻文件。隨時歡迎提出錯誤或功能要求,或發送 PR。 -https://user-images.githubusercontent.com/6735526/216852681-53b6c3db-3e74-4c86-806f-6f6774a9003a.mp4 +[![Whisper Subtitle](https://i.ytimg.com/vi/JVCONXj6lgo/maxresdefault.jpg)](https://youtu.be/JVCONXj6lgo "Whisper Subtitle") ## Setup This was built & tested on Python 3.11 but should also work on Python 3.8+ as with the original [Whisper repo](https://github.com/openai/whisper)). @@ -59,10 +59,8 @@ rm ngrok-v3-stable-linux-amd64.tgz 5. Expose your url to the public with ```bash forward_port.sh``` 6. Inspect the random url by ```python inspect_url.py ``` and use the url in your browser -🚧 Long term hosting with a permanent url -1. Prepare a permanent url and an API for forwarding -2. Forwarded the random url to the given url by API each 10 min -3. Do it again everytimg random url changes or the server died. +🚧 Under Construction: +1. Import redis for task queue 🔥You can try our demo [here](https://whispersubtitle.aiacademy.tw) @@ -87,4 +85,4 @@ They actually did a great job for forming a manage systme of subtitles: search e The original version aims to demonstrate the power of Whisper, especially for short films in youtube for local use. -My goal is to provide a service for a bunch of clinets to make subtitles for long videos like meeting records, courses and movies. \ No newline at end of file +My goal is to provide a service for a bunch of clinets to make subtitles for long videos like meeting records, courses and movies. From 4107690ede57bb2eb9f870b3121ed7a57192fc34 Mon Sep 17 00:00:00 2001 From: Shu-Yu Huang Date: Thu, 23 Feb 2023 11:26:43 +0800 Subject: [PATCH 12/15] fix README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e92f136..2916228 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,9 @@ Feel free to raise an issue for bugs or feature requests or send a PR. 它允許您從YouTube視頻、播放列表或本地文件下載和轉錄媒體 (一個檔案限制為200MB)。 然後,您可以瀏覽、過濾和搜索您保存的音頻文件。隨時歡迎提出錯誤或功能要求,或發送 PR。 -[![Whisper Subtitle](https://i.ytimg.com/vi/JVCONXj6lgo/maxresdefault.jpg)](https://youtu.be/JVCONXj6lgo "Whisper Subtitle") +## Watch demo @ Youtube: +[](https://youtu.be/nJi1swi8y4I "Whisper Subtitle") + ## Setup This was built & tested on Python 3.11 but should also work on Python 3.8+ as with the original [Whisper repo](https://github.com/openai/whisper)). From fbaca67568e37a1d63bf8f696baad1cc30f8836d Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Tue, 21 Mar 2023 16:48:56 +0800 Subject: [PATCH 13/15] fix bugs for youtube downloading --- app/core.py | 5 +++-- install_whisper.txt | 9 +++++---- requirements.txt | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/app/core.py b/app/core.py index dd16a8b..177642b 100644 --- a/app/core.py +++ b/app/core.py @@ -81,8 +81,9 @@ def _transcribe(self, audio_path: str, whisper_model: str, **whisper_args): transcriber = get_whisper_model(whisper_model, model_id) ok_flag = True break - time.sleep(5) - print('All models are busy') + if not ok_flag: + time.sleep(5) + print('All models are busy') # Set configs & transcribe if whisper_args["temperature_increment_on_fallback"] is not None: diff --git a/install_whisper.txt b/install_whisper.txt index 6cfe446..bfe5016 100644 --- a/install_whisper.txt +++ b/install_whisper.txt @@ -1,9 +1,10 @@ # Install pytorch if you don't have it # sudo conda install pytorch pip install -r requirements.txt -pip install git+https://github.com/openai/whisper.git +# install whisper for newer version +# pip install git+https://github.com/openai/whisper.git # Install ngrok -wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz -sudo tar xvzf ngrok-v3-stable-linux-amd64.tgz -C /usr/local/bin -rm -f ngrok-v3-stable-linux-amd64.tgz \ No newline at end of file +#wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz +#sudo tar xvzf ngrok-v3-stable-linux-amd64.tgz -C /usr/local/bin +#rm -f ngrok-v3-stable-linux-amd64.tgz \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8da3355..146f6be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,9 +5,9 @@ # You can pass --extra-index-url https://download.pytorch.org/whl/cu116 (or whatever your version is) transformers openai-whisper==20230124 # https://github.com/openai/whisper +git+https://github.com/duvu/pytube.git # Backend # -------------------- -pytube==12.1.2 # https://github.com/pytube/pytube SQLAlchemy==2.0.0 # https://github.com/sqlalchemy/sqlalchemy # Frontend From f7854c5ffa2df845f6016936800c3e653df0bad1 Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Wed, 19 Jul 2023 18:10:45 +0800 Subject: [PATCH 14/15] fix bug for url and requirements for youtube --- permanent_url.py | 2 +- requirements.txt | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/permanent_url.py b/permanent_url.py index 09af151..9fe42e8 100644 --- a/permanent_url.py +++ b/permanent_url.py @@ -6,7 +6,7 @@ my_url=None new_url='' -# you can check if the service is alive by: +# you can check if the service is alive by: # curl http://localhost:4040/api/tunnels def get_ngrok_url(): diff --git a/requirements.txt b/requirements.txt index 146f6be..bca5080 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,8 @@ # Verify your cuda version to make sure the right torch gets installed # You can pass --extra-index-url https://download.pytorch.org/whl/cu116 (or whatever your version is) transformers -openai-whisper==20230124 # https://github.com/openai/whisper +# openai-whisper==20230124 # https://github.com/openai/whisper +git+https://github.com/openai/whisper.git git+https://github.com/duvu/pytube.git # Backend # -------------------- From 85c844ee27c63d11e7d25c55cff48c71fc2a6498 Mon Sep 17 00:00:00 2001 From: ShuYuHuang Date: Wed, 19 Jul 2023 18:11:25 +0800 Subject: [PATCH 15/15] fix syntex for whisper update --- app/core.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/app/core.py b/app/core.py index 177642b..a42ef2e 100644 --- a/app/core.py +++ b/app/core.py @@ -116,8 +116,14 @@ def _transcribe_and_save(self, media_obj: Media, whisper_model: str, **whisper_a # Write transcripts into the same directory as the audio file audio_dir = Path(media_obj.filepath).parent - writer = whisper.utils.get_writer("all", audio_dir) - writer(transcript, "transcript") + writer = whisper.utils.get_writer("srt", audio_dir) + writer(transcript, + audio_path="transcript", + options={ + "max_line_width": None, + "max_line_count": None, + "highlight_words": None + }) # Add transcript to the database self.session.add(