diff --git a/.slugignore b/.slugignore new file mode 100644 index 0000000..356228b --- /dev/null +++ b/.slugignore @@ -0,0 +1,5 @@ +test/* +source/punct_model_part1.pcl +source/punct_model_part2.pcl +source/punct_model_part3.pcl +source/punct_model_full.pcl diff --git a/Procfile b/Procfile index 5363d79..3f336ad 100644 --- a/Procfile +++ b/Procfile @@ -1 +1 @@ -web: sh setup.sh && streamlit run source/scrivener_user_interface.py \ No newline at end of file +web: sh setup.sh && python -c "import nltk; nltk.download('punkt')" && streamlit run source/scrivener_user_interface.py \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index be4b283..43c5d24 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ pytube==11.0.1 SpeechRecognition==3.8.1 streamlit==0.89.0 sumy -punctuator +punctuator==0.9.6 +wget diff --git a/source/main/punctuation.py b/source/main/punctuation.py index 185f546..331933e 100644 --- a/source/main/punctuation.py +++ b/source/main/punctuation.py @@ -1,5 +1,11 @@ +import os +try: + # HACK: Heroku ships the file below and it conflicts with the punctuator package; remove it + # before the import, as suggested in https://github.com/chrisspen/punctuator2/issues/3 + os.remove('.heroku/python/bin/punctuator.py') +except OSError: + print("punctuator.py not found in: " + os.getcwd()) from punctuator import Punctuator -import os.path class Punctuation: diff --git a/source/scrivener_user_interface.py b/source/scrivener_user_interface.py index 96768d9..5c0526d 100644 --- a/source/scrivener_user_interface.py +++ b/source/scrivener_user_interface.py @@ -9,6 +9,7 @@ import streamlit as st import re import os +import wget from main.transcribe import TranscribeVideo from main.transcribe_yt import TranscribeYtVideo import secrets @@ -62,7 +63,29 @@ if not os.path.exists('source/punct_model_full.pcl'): print("Creating punct_model_full.pcl file for ML model...") + # Paths to the model file parts that need to be combined. + # Storing these 
models in GitHub causes the Heroku deployment to exceed the 500 MB (it is 618 MB) + # slug/payload limit, so the parts are downloaded from GitHub at runtime instead. + if not os.path.exists('source/punct_model_part1.pcl'): + print("Downloading punct_model_part1.pcl file for ML model...") + url1 = 'https://github.com/SN-18/scrivener/raw/developer/source/punct_model_part1.pcl' + filename = wget.download(url1, out='source/punct_model_part1.pcl') + print("\nDownloaded file: " + filename) + + if not os.path.exists('source/punct_model_part2.pcl'): + print("Downloading punct_model_part2.pcl file for ML model...") + url2 = 'https://github.com/SN-18/scrivener/raw/developer/source/punct_model_part2.pcl' + filename = wget.download(url2, out='source/punct_model_part2.pcl') + print("\nDownloaded file: " + filename) + + if not os.path.exists('source/punct_model_part3.pcl'): + print("Downloading punct_model_part3.pcl file for ML model...") + url3 = 'https://github.com/SN-18/scrivener/raw/developer/source/punct_model_part3.pcl' + filename = wget.download(url3, out='source/punct_model_part3.pcl') + print("\nDownloaded file: " + filename) + + first_file = os.path.abspath('source/punct_model_part1.pcl') second_file = os.path.abspath('source/punct_model_part2.pcl') third_file = os.path.abspath('source/punct_model_part3.pcl')