-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathVideoSpeechProcessor_refactored.py
120 lines (97 loc) · 5.1 KB
/
VideoSpeechProcessor_refactored.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from video_gemini import vision_model
from tts.text_to_speech import generate_speech
from pydub import AudioSegment # Ensure pydub is installed for audio handling.
from moviepy.editor import VideoFileClip, AudioFileClip # Ensure moviepy is installed for video and audio handling.
import os
class VideoSpeechProcessor:
    """Generate spoken narration for a video and mux it back onto the video.

    The pipeline runs ``vision_model`` over the video to obtain text
    responses, synthesizes one audio file per response with
    ``generate_speech``, and offers static helpers to concatenate those audio
    files and overlay the result on a video.
    """

    def __init__(self, video_file_path, target_frame_rate, prompt_path, project_uuid, voice_uuid):
        """Store the parameters used by the processing steps.

        Args:
            video_file_path: Path of the video to process.
            target_frame_rate: Frame rate forwarded to ``vision_model``.
            prompt_path: Prompt file path forwarded to ``vision_model``.
            project_uuid: Project identifier forwarded to ``generate_speech``.
            voice_uuid: Voice identifier forwarded to ``generate_speech``.
        """
        self.video_file_path = video_file_path
        self.target_frame_rate = target_frame_rate
        self.prompt_path = prompt_path
        self.project_uuid = project_uuid
        self.voice_uuid = voice_uuid
        # Running character count of all text sent to TTS. It is never reset,
        # so it accumulates across repeated calls on the same instance.
        self.total_text_length = 0

    def get_video_duration(self):
        """Get the duration of the video in seconds."""
        with VideoFileClip(self.video_file_path) as video:
            return video.duration

    def process_video(self):
        """Process video to obtain text responses for TTS.

        Returns:
            Whatever ``vision_model`` returns for this video, frame rate and
            prompt (presumably an iterable of strings -- confirm against
            ``video_gemini``).
        """
        return vision_model(self.video_file_path, self.target_frame_rate, self.prompt_path)

    def generate_speech_for_responses(self, responses):
        """Synthesize one audio file per response and collect the paths.

        Each response's length is added to ``self.total_text_length``.
        Responses for which ``generate_speech`` returns a falsy path are
        skipped with a printed warning.

        Args:
            responses: Iterable of response texts; numbered from 1 in order.

        Returns:
            List of the audio paths returned by ``generate_speech``.
        """
        audio_files = []
        for sequence_number, response_text in enumerate(responses, start=1):
            self.total_text_length += len(response_text)
            title = f"AudioResponse_{sequence_number}"
            audio_path = generate_speech(
                response_text, self.project_uuid, self.voice_uuid, title, sequence_number
            )
            if audio_path:  # Check if a path was returned and it's not None
                audio_files.append(audio_path)
            else:
                print(f"Warning: No audio file generated for {title}.")
        return audio_files

    def calculate_total_audio_duration(self, audio_files):
        """Calculate the total duration of all audio files, in seconds.

        Files that cannot be loaded are reported with a printed message and
        contribute nothing to the total (deliberate best-effort behavior).

        Args:
            audio_files: Iterable of audio file paths.

        Returns:
            Sum of the loadable files' durations in seconds.
        """
        total_duration = 0
        for audio_file in audio_files:
            try:
                audio = AudioSegment.from_file(audio_file)
                total_duration += len(audio)
            except Exception as e:
                print(f"Error processing file {audio_file}: {e}")
        return total_duration / 1000  # Convert milliseconds to seconds

    def process_video_and_generate_speech(self):
        """Main method to process video and generate speech.

        Returns:
            Tuple ``(audio_files, total_text_length, total_audio_duration,
            video_duration)``.
        """
        video_duration = self.get_video_duration()
        responses = self.process_video()
        audio_files = self.generate_speech_for_responses(responses)
        total_audio_duration = self.calculate_total_audio_duration(audio_files)
        return audio_files, self.total_text_length, total_audio_duration, video_duration

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain *path*, if it names one."""
        directory = os.path.dirname(path)
        # dirname is '' for a bare file name, and os.makedirs('') raises
        # FileNotFoundError, so only create when there is a directory part.
        if directory:
            os.makedirs(directory, exist_ok=True)

    @staticmethod
    def concatenate_audio_files(audio_files, output_path):
        """Join the audio files in order and export the result as WAV.

        Args:
            audio_files: Iterable of audio file paths, in playback order.
            output_path: Destination file; its parent directory is created
                when missing.
        """
        VideoSpeechProcessor._ensure_parent_dir(output_path)
        combined = AudioSegment.empty()
        for audio_file in audio_files:
            combined += AudioSegment.from_file(audio_file)
        combined.export(output_path, format="wav")

    @staticmethod
    def overlay_audio(video_path, audio_path, output_path):
        """Write a copy of the video whose audio track is the given audio file.

        Args:
            video_path: Existing video file to read.
            audio_path: Existing audio file to use as the new audio track.
            output_path: Destination video file; its parent directory is
                created when missing.

        Raises:
            ValueError: If ``video_path`` or ``audio_path`` is empty or does
                not exist.
        """
        # Validate input paths
        if not video_path or not os.path.exists(video_path):
            raise ValueError(f"Video path is invalid or does not exist: {video_path}")
        if not audio_path or not os.path.exists(audio_path):
            raise ValueError(f"Audio path is invalid or does not exist: {audio_path}")

        VideoSpeechProcessor._ensure_parent_dir(output_path)

        # Context managers close both clips (and their underlying readers)
        # even when encoding fails part-way through.
        with VideoFileClip(video_path) as video_clip, AudioFileClip(audio_path) as audio_clip:
            final_clip = video_clip.set_audio(audio_clip)
            final_clip.write_videofile(output_path, codec='libx264', audio_codec='aac')
if __name__ == "__main__":
    # Demo: narrate a sample video, merge the narration clips into a single
    # track, then write a copy of the video carrying that track.
    source_video = 'public/wrestling.mp4'
    merged_audio = 'path/to/output/audio.wav'
    narrated_video = 'path/to/output/video.mp4'

    processor = VideoSpeechProcessor(
        video_file_path=source_video,
        target_frame_rate=30,
        prompt_path='prompts/narrations/tik5.md',
        project_uuid='0448305f',
        voice_uuid='d3e61caf',
    )

    # Run the vision model and synthesize one audio clip per response.
    outcome = processor.process_video_and_generate_speech()
    clips, text_chars, audio_seconds, video_seconds = outcome

    # Stitch the clips together, then lay the result over the source video.
    VideoSpeechProcessor.concatenate_audio_files(clips, merged_audio)
    VideoSpeechProcessor.overlay_audio(source_video, merged_audio, narrated_video)

    print(f"Total text length processed: {text_chars} characters")
    print(f"Total audio duration: {audio_seconds} seconds")
    print(f"Original video duration: {video_seconds} seconds")
    print(f"Output video saved to: {narrated_video}")