-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
102 lines (81 loc) · 3.31 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import tkinter as tk
from tkinter import ttk, filedialog
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
from transformers import pipeline
import json
import subprocess
import os
import threading
# Frame rate the audio is resampled to and the recognizer is configured with.
FRAME_RATE = 16_000
# Number of channels the audio is converted to before recognition (1 = mono).
CHANNEL = 1
def voice_recognition(filename):
    """Transcribe an MP3 file with Vosk, then restore casing and punctuation.

    The audio is converted to ``CHANNEL`` channel(s) at ``FRAME_RATE``, fed to
    the recognizer in fixed-size slices, and the combined raw transcript is
    piped through the recasepunc script on standard input.

    Args:
        filename: Path of the MP3 file to transcribe.

    Returns:
        The text written to standard output by the recasepunc script.

    Raises:
        subprocess.CalledProcessError: If the recasepunc process exits with a
            non-zero status.
    """
    model = Model(model_name="vosk-model-en-us-0.22")
    recognizer = KaldiRecognizer(model, FRAME_RATE)
    recognizer.SetWords(True)

    audio = AudioSegment.from_mp3(filename)
    audio = audio.set_channels(CHANNEL)
    audio = audio.set_frame_rate(FRAME_RATE)

    step = 45000  # slice length; pydub measures len() and slices in milliseconds
    total = len(audio)
    pieces = []
    for start in range(0, total, step):
        print(f"Progress: {start/total}")
        segment = audio[start:start + step]
        recognizer.AcceptWaveform(segment.raw_data)
        text = json.loads(recognizer.Result())["text"]
        if text:
            pieces.append(text)

    # Join with a space: concatenating slices directly fused the last word of
    # one slice with the first word of the next.
    transcript = " ".join(pieces)

    # NOTE(review): the interpreter path is machine-specific and the script /
    # checkpoint paths are relative to the current working directory.
    # An argument list (no shell) runs the same command without shell parsing.
    cased = subprocess.check_output(
        [
            "/Users/gaurav.goyal/Desktop/Codes/projectvenv/bin/python",
            "vosk-recasepunc-en-0.22/recasepunc.py",
            "predict",
            "vosk-recasepunc-en-0.22/checkpoint",
        ],
        text=True,
        input=transcript,
    )
    return cased
def summarize_transcript(transcript):
    """Summarize a transcript in chunks of at most 850 words.

    Args:
        transcript: The text to summarize.

    Returns:
        The per-chunk summaries joined by blank lines, or an empty string when
        the transcript contains no words.
    """
    # split() without arguments collapses runs of whitespace (including
    # newlines), so empty strings are never counted as words.
    words = transcript.split()
    if not words:
        # Nothing to summarize; skip loading the model entirely.
        return ""

    summary_maker = pipeline("summarization")
    chunk_size = 850
    chunks = [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
    # truncation=True keeps a chunk that tokenizes to more than the model's
    # maximum input length from failing the whole run.
    results = summary_maker(chunks, truncation=True)
    return "\n\n".join(item["summary_text"] for item in results)
def process_file(file_path):
    """Transcribe and summarize an audio file, showing the result in the UI.

    A small "Processing..." dialog is displayed while the work runs. On
    success the heading and output text are replaced with the summary. On
    failure an error is shown in the heading and the exception is re-raised.
    The dialog is closed in both cases.

    Args:
        file_path: Path of the audio file to process.
    """
    processing_dialog = tk.Toplevel(root)
    processing_dialog.title("Processing...")
    processing_dialog.geometry("200x100")
    processing_label = ttk.Label(
        processing_dialog,
        text="Processing... \n this might take a few min...",
        font=("Helvetica", 14),
    )
    processing_label.pack(pady=20)
    # Force the dialog to be drawn before the long-running work starts.
    processing_dialog.update()

    try:
        transcript = voice_recognition(file_path)
        summary = summarize_transcript(transcript)
    except Exception:
        # Tell the user something went wrong, then let the error propagate so
        # the traceback is still reported.
        heading_label.config(
            text="Sorry, the audio file could not be processed",
            foreground="red",
        )
        raise
    finally:
        # Always close the dialog; previously a failure left it open forever.
        processing_dialog.destroy()

    heading_label.config(
        text="Hello user, here is your summary for given audio file",
        foreground="black",
    )
    output_text.delete("1.0", tk.END)
    output_text.insert(tk.END, summary)
def browse_file():
    """Ask the user for an MP3 file and process it on a background thread."""
    chosen_path = filedialog.askopenfilename(filetypes=[("MP3 Files", "*.mp3")])
    if not chosen_path:
        # Dialog was cancelled; nothing to do.
        return
    threading.Thread(target=process_file, args=(chosen_path,)).start()
# --- Main window -----------------------------------------------------------
# Create exactly one Tk root; a second tk.Tk() call opens a stray extra window.
root = tk.Tk()
root.title("Voice Recognition and Summarization")

# Center a fixed-size window on the screen.
window_width = 600
window_height = 400
screen_width = root.winfo_screenwidth()
screen_height = root.winfo_screenheight()
x_coordinate = (screen_width - window_width) // 2
y_coordinate = (screen_height - window_height) // 2
root.geometry(f"{window_width}x{window_height}+{x_coordinate}+{y_coordinate}")

# --- ttk styling -----------------------------------------------------------
style = ttk.Style()
style.theme_use("clam")
style.configure("TButton", font=("Helvetica", 12), padding=5)
style.configure("TLabel", font=("Helvetica", 16, "bold"))

# --- Widgets ---------------------------------------------------------------
browse_button = ttk.Button(root, text="Browse MP3 File", command=browse_file)
browse_button.pack(pady=10)

# The heading starts empty and is filled in once a file has been processed.
heading_label = ttk.Label(root, text="", foreground="blue", padding=10)
heading_label.pack()

# tk.Text is a classic Tk widget and ignores ttk styles, so the font is set
# directly on the widget.
output_text = tk.Text(root, height=10, wrap=tk.WORD, font=("Helvetica", 12))
output_text.pack(padx=10, pady=5, fill=tk.BOTH, expand=True)

root.mainloop()