Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: use whisper as ML model #26

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 36 additions & 104 deletions chapterize_ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import subprocess
import argparse
import sys
from datetime import timedelta
from typing import Optional, TypeVar
from pathlib import Path
from shutil import (
Expand All @@ -25,7 +26,7 @@
TextColumn,
MofNCompleteColumn
)
from vosk import Model, KaldiRecognizer, SetLogLevel
from faster_whisper import WhisperModel, BatchedInferencePipeline

# Local imports
from model.models import (
Expand Down Expand Up @@ -99,64 +100,6 @@ def verify_language(language: str) -> str:
return code


def verify_download(language: str, model_type: str) -> str:
    """Resolve the downloadable model name for a language and model size.

    The language is first passed through ``verify_language``. The model list
    matching the requested size is then searched for the first entry that
    contains the resulting language code. When nothing matches, the list for
    the opposite size is consulted only to decide which message is printed
    before the script exits (status 3 if the other size exists, 33 otherwise).

    :param language: Language of the model to download.
    :param model_type: Type of model (small or large).
    :return: String name of the model file to download if supported.
    """

    lang_code = verify_language(language)
    other = 'small' if model_type == 'large' else 'large'

    # First entry of the requested size that mentions the language code
    name = ''
    if model_type == 'small':
        name = next((entry for entry in models_small if lang_code in entry), '')
    elif model_type == 'large':
        name = next((entry for entry in models_large if lang_code in entry), '')

    if not name:
        # Nothing in the requested size; see whether the other size has it
        if model_type == 'small':
            found = any(lang_code in entry for entry in models_large)
        elif model_type == 'large':
            found = any(lang_code in entry for entry in models_small)
        else:
            found = False

        if found:
            con.print(
                f"[bold yellow]WARNING:[/] The selected model cannot be downloaded for '{language}' "
                f"in the specified size '{model_type}'. However, a '{other}' model was found. "
                f"You can re-run the script and choose {other}, or attempt to "
                f"download a different model manually from {vosk_link}."
            )
            sys.exit(3)
        else:
            con.print(
                f"[bold red]ERROR:[/] The selected model cannot be downloaded for '{language}' "
                f"in size {model_type}. You can try and download a different model manually "
                f"from {vosk_link}."
            )
            sys.exit(33)

    return name


def parse_config() -> dict:
"""Parses the toml config file.

Expand Down Expand Up @@ -208,9 +151,6 @@ def parse_args():
type=str, choices=['small', 'large'],
help='Model type to use if multiple models are available. Default is small.')
parser.add_argument('--list_languages', '-ll', action='store_true', help='List supported languages and exit')
parser.add_argument('--download_model', '-dm', choices=['small', 'large'], dest='download',
nargs='?', default=argparse.SUPPRESS,
help='Download the model archive specified in the --language parameter')
parser.add_argument('--cover_art', '-ca', dest='cover_art', nargs='?', default=None,
metavar='COVER_ART_PATH', type=path_exists, help='Path to cover art file. Optional')
parser.add_argument('--author', '-a', dest='author', nargs='?', default=None,
Expand Down Expand Up @@ -247,17 +187,6 @@ def parse_args():
print("\n")
sys.exit(0)

if 'download' in args:
if args.lang == 'en-us':
con.print(
"[bold yellow]WARNING:[/] [bold green]--download_model[/] was used, but a language was not set. "
"the default value [cyan]'en-us'[/] will be used. If you want a different language, use the "
"[bold blue]--language[/] option to specify one."
)

download = 'small' if args.download not in ['small', 'large'] else args.download
model_name = verify_download(args.lang, download)


# Set ID3 metadata fields based on passed args
meta_fields = {'cover_art': args.cover_art if args.cover_art else None,
Expand Down Expand Up @@ -608,7 +537,7 @@ def convert_time(time: str) -> str:
else:
parts[-1] = str(int(last) - 1)
except Exception as e:
con.print(f"[bold red]CRITICAL:[/] Could not covert end chapter marker for {time}: [red]{e}[/red]")
con.print(f"[bold red]CRITICAL:[/] Could not convert end chapter marker for {time}: [red]{e}[/red]")
sys.exit(6)

return f"{':'.join(parts)}.{milliseconds}"
Expand Down Expand Up @@ -700,6 +629,16 @@ def split_file(audiobook_path: PathLike,
progress.update(task, advance=1)


def format_timestamp_from_float(seconds: float) -> str:
    """Format an offset in seconds as an SRT-style timestamp.

    The result has the form ``HH:MM:SS,mmm``. The hours field is not capped
    at 23, so offsets of 24 hours or more keep counting up (e.g. ``25:00:00,500``)
    instead of wrapping back to zero.

    :param seconds: Offset from the start of the audio, in seconds.
    :return: Timestamp string with zero-padded hours, minutes, seconds and
        milliseconds.
    """
    td = timedelta(seconds=seconds)
    # timedelta normalises into days + seconds (< 86400) + microseconds, so
    # td.seconds alone drops whole days. Fold them back in before splitting.
    whole_seconds = td.days * 86400 + td.seconds
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    # Truncate (not round) microseconds to milliseconds
    milliseconds = td.microseconds // 1000
    return '{:02}:{:02}:{:02},{:03}'.format(hours, minutes, secs, milliseconds)


def generate_timecodes(audiobook_path: PathLike, language: str, model_type: str) -> Path:
"""Generate chapter timecodes using vosk Machine Learning API.

Expand Down Expand Up @@ -750,24 +689,25 @@ def generate_timecodes(audiobook_path: PathLike, language: str, model_type: str)
)
model_path = None

SetLogLevel(-1)
model = Model(lang=language, model_path=str(model_path))
rec = KaldiRecognizer(model, sample_rate)
rec.SetWords(True)

try:
# Convert the file to wav (if needed), and stream output to file
with subprocess.Popen([str(ffmpeg), "-loglevel", "quiet", "-i",
audiobook_path,
"-ar", str(sample_rate), "-ac", "1", "-f", "s16le", "-"],
stdout=subprocess.PIPE).stdout as stream:
with open(out_file, 'w+') as fp:
fp.writelines(rec.SrtResult(stream))

con.print("[bold green]SUCCESS![/] Timecode file created\n")
except Exception as e:
con.print(f"[bold red]ERROR:[/] Failed to generate timecode file with vosk: [red]{e}[/red]\n")
sys.exit(7)
model_size = "tiny.en"
model = WhisperModel(model_size, compute_type="float32")
batched_model = BatchedInferencePipeline(model=model)

# set word_timestamps for ms precision of segments
segments, _ = batched_model.transcribe(audiobook_path, word_timestamps=True, batch_size=16)

with open(out_file, 'w+') as fp:
count = 0
for segment in segments:
for word in segment.words:
count += 1
start_timestamp = format_timestamp_from_float(word.start)
end_timestamp =format_timestamp_from_float(word.end)
fp.write(f"{str(count+1)}\n")
fp.write(f"{start_timestamp} --> {end_timestamp}\n")
fp.write(f"{word.word.strip()}\n")
fp.write("\n")
con.print("[bold green]SUCCESS![/] Timecode file created\n")

return Path(out_file)

Expand Down Expand Up @@ -811,16 +751,16 @@ def parse_timecodes(srt_content: list, language: str = 'en-us') -> list[dict]:
start = start_regexp.group(0).replace(',', '.')

# Prologue
if markers[0] in srt_content[i+1]:
if markers[0] in srt_content[i+1] or markers[1] in srt_content[i+1]:
chapter_type = markers[0].title()
# Chapter X
elif markers[1] in srt_content[i+1]:
elif markers[2] in srt_content[i+1]:
# Add leading zero for better sorting if < 10
chapter_count = f'0{counter}' if counter < 10 else f'{counter}'
chapter_type = f'{markers[1].title()} {chapter_count}'
chapter_type = f'{markers[2].title()} {chapter_count}'
counter += 1
# Epilogue
elif markers[2] in srt_content[i+1]:
elif markers[3] in srt_content[i+1]:
chapter_type = markers[2].title()
else:
chapter_type = ''
Expand Down Expand Up @@ -1007,14 +947,6 @@ def main():
con.print("[bold yellow]WARNING:[/] Cover art path does not exist")
cover_art = None

# Download model if option selected
if model_name and lang:
con.rule(f"[cyan]Downloading '{lang} ({model_type})' Model[/cyan]")
print("\n")
con.print("[magenta]Preparing download...[/magenta]")
print("\n")
download_model(model_name)

# Generate timecodes from mp3 file
con.rule("[cyan]Generating Timecodes[/cyan]")
print("\n")
Expand Down
2 changes: 1 addition & 1 deletion model/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
'''

# Signal phrases for chapter markers
_markers_english = ('prologue', 'chapter', 'epilogue')
_markers_english = ('Prolog', 'Prologue', 'Chapter', 'Epilogue')
_markers_german = ('prolog', 'kapitel', 'epilog')

# Some false positive phrases/words that trigger a chapter marker...will need building over time
Expand Down