Merge pull request #68 from erew123/dev
Update to Main
erew123 authored Jan 13, 2024
2 parents e43c58a + 5c0f08d commit 180adb4
Showing 10 changed files with 3,354 additions and 95 deletions.
287 changes: 216 additions & 71 deletions README.md

Large diffs are not rendered by default.

503 changes: 503 additions & 0 deletions atsetup.bat

Large diffs are not rendered by default.

440 changes: 440 additions & 0 deletions atsetup.sh

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions diagnostics.py
@@ -215,6 +215,13 @@ def log_system_info():

# Print colored output
print(f"{package_name.ljust(max_package_length)} Required: {color_required}{operator} {required_version.ljust(12)}\033[0m Installed: {color_installed}{installed_version}\033[0m")

print("\nOn Nvidia Graphics cards machines, if your \033[92mInstalled\033[0m version of \033[92mTorch\033[0m and \033[92mTorchaudio\033[0m does")
print("not have \033[92m+cu118\033[0m (Cuda 11.8) or \033[92m+cu121\033[0m (Cuda 12.1) listed after them, you do not have CUDA")
print("installed for Torch or Torchaudio in this Python environment. This will cause you problems")
print("with \033[94mAllTalk\033[0m and \033[94mFinetuning.\033[0m You may have to 'pip install' a new version of torch and")
print("torchaudio, using '\033[94m--upgrade --force-reinstall\033[0m' with the correct version of PyTorch for\033[0m")
print("your Python environment.\033[0m")
print("\033[94m\nRequirements file specifier meanings:\033[0m")
explanation = textwrap.dedent("""
== Exact version != Any version except < Less than
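The new diagnostics text above describes checking whether the installed Torch and Torchaudio builds carry a CUDA tag. A minimal sketch of that check, assuming only that torch and torchaudio are installed in the active Python environment (the reinstall command in the closing comment is illustrative, with cu118 as an example index):

import torch
import torchaudio

def has_cuda_build(version: str) -> bool:
    # CUDA-enabled wheels carry a local version tag such as "2.1.2+cu118" or "2.1.2+cu121".
    return "+cu" in version

print(f"torch      {torch.__version__}   CUDA build: {has_cuda_build(torch.__version__)}")
print(f"torchaudio {torchaudio.__version__}   CUDA build: {has_cuda_build(torchaudio.__version__)}")
print(f"torch.cuda.is_available(): {torch.cuda.is_available()}")

# If both report False on an Nvidia machine, a reinstall along these lines is the usual fix:
#   pip install torch torchaudio --upgrade --force-reinstall --index-url https://download.pytorch.org/whl/cu118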
4 changes: 3 additions & 1 deletion finetune.py
@@ -917,7 +917,9 @@ def refresh():
#### &nbsp;&nbsp;&nbsp;&nbsp; - Place your audio files in <span style="color: #3366ff;">{str(audio_folder)}</span>
#### &nbsp;&nbsp;&nbsp;&nbsp; - Your audio samples can be in the format <span style="color: #3366ff;">mp3, wav,</span> or <span style="color: #3366ff;">flac.</span>
#### &nbsp;&nbsp;&nbsp;&nbsp; - You will need a minimum of <span style="color: #3366ff;">2 minutes</span> of audio in either one or multiple audio files. Very small sample files cause errors, so I would suggest samples of 30 seconds or longer.
-#### &nbsp;&nbsp;&nbsp;&nbsp; - When you have completed Steps 1, 2, and 3, you are welcome to delete your samples from "put-voice-samples-in-here".<br>
+#### &nbsp;&nbsp;&nbsp;&nbsp; - When you have completed Steps 1, 2, and 3, you are welcome to delete your samples from "put-voice-samples-in-here".
+#### &nbsp;&nbsp;&nbsp;&nbsp; - FYI: anecdotal evidence suggests that the Whisper 2 model may yield superior results for audio splitting and dataset creation.
+#### &nbsp;&nbsp;&nbsp;&nbsp; - If this step is failing, it is worth running the diagnostics via atsetup and confirming that cu118 or cu121 is listed against your torch and torchaudio.<br>
### 🟨 <u>What this step is doing</u>
#### &nbsp;&nbsp;&nbsp;&nbsp; - In Step 1 we strip your audio file(s) into smaller files, using Whisper to find spoken words/sentences, and compile that into Excel sheets of training data, ready for finetuning the model in Step 2.
#### &nbsp;&nbsp;&nbsp;&nbsp; - Whilst you can choose multiple Whisper models, it's best to use just one model, as each is about 3GB in size and will download to your local huggingface cache on first-time use. If, when you have completed training, you wish to delete this 3GB model from your system, you are welcome to do so.
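As background to Step 1 described above, here is a rough sketch of Whisper-based segmentation using the openai-whisper package; AllTalk's own dataset pipeline may differ in detail, and the model name and file path here are illustrative:

import whisper

# "large-v2" corresponds to the "Whisper 2" model mentioned above (~3GB download on first use).
model = whisper.load_model("large-v2")
result = model.transcribe("put-voice-samples-in-here/sample1.wav")

for seg in result["segments"]:
    # Each segment carries start/end times in seconds plus the recognised text,
    # which is roughly what ends up in the generated training metadata.
    print(f'{seg["start"]:.2f} -> {seg["end"]:.2f}: {seg["text"].strip()}')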
11 changes: 7 additions & 4 deletions script.py
@@ -237,10 +237,12 @@ def signal_handler(sig, frame):
# Check if the subprocess has started successfully
if process.poll() is None:
print(f"[{params['branding']}Startup] TTS Subprocess starting")
print(f"[{params['branding']}Startup]")
print(
f"[{params['branding']}Startup] Readme available here:",
f"http://{params['ip_address']}:{params['port_number']}",
f"[{params['branding']}Startup] \033[94mSettings & Documentation:\033[00m",
f"\033[92mhttp://{params['ip_address']}:{params['port_number']}\033[00m",
)
print(f"[{params['branding']}Startup]")
else:
print(
f"[{params['branding']}Startup] \033[91mWarning\033[0m TTS Subprocess Webserver failing to start process"
@@ -862,7 +864,8 @@ def ui():
voice.change(lambda x: params.update({"voice": x}), voice, None)
language.change(lambda x: params.update({"language": x}), language, None)

-# TTS Settings (Not yet parsed to api/implemented)
+
+# TS Settings (Not yet parsed to api/implemented)
# local_temperature_gr.change(lambda x: params.update({"local_temperature": x}), local_temperature_gr, None)
# local_repetition_penalty_gr.change(lambda x: params.update({"local_repetition_penalty": x}), local_repetition_penalty_gr, None)

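The wiring above follows Gradio's event pattern: each control's change event pushes its new value into a shared params dict. A hedged, self-contained sketch of that pattern (component names and defaults here are illustrative, not AllTalk's actual settings):

import gradio as gr

params = {"voice": "female_01.wav", "language": "English"}

with gr.Blocks() as demo:
    voice = gr.Textbox(value=params["voice"], label="Voice")
    language = gr.Dropdown(["English", "French"], value=params["language"], label="Language")
    # fn receives the component's new value; outputs is None because nothing is rendered back.
    voice.change(lambda x: params.update({"voice": x}), voice, None)
    language.change(lambda x: params.update({"language": x}), language, None)

# demo.launch()  # uncomment to serve the UI locally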
@@ -885,4 +888,4 @@ def ui():
try:
time.sleep(1) # Add a small delay to avoid high CPU usage
except KeyboardInterrupt:
-break # Allow graceful exit on Ctrl+C
+break # Allow graceful exit on Ctrl+C
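Taken together, the script.py changes above sit inside a common pattern: launch the TTS engine as a subprocess, confirm it started with poll(), idle with a small sleep, and exit cleanly on Ctrl+C. A hedged sketch of that pattern (the child command is a hypothetical entry point, not AllTalk's actual one):

import subprocess
import sys
import time

process = subprocess.Popen([sys.executable, "tts_server.py"])  # hypothetical entry point
time.sleep(2)  # give the child a moment to fail fast if it is going to

if process.poll() is None:  # None means the process is still running
    print("[Startup] TTS subprocess running")
else:
    print(f"[Startup] Warning: TTS subprocess exited with code {process.returncode}")

try:
    while process.poll() is None:
        time.sleep(1)  # small delay to avoid high CPU usage
except KeyboardInterrupt:
    process.terminate()  # graceful exit on Ctrl+C
    process.wait()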
