diff --git a/colab/AdClip Gemini Prototype.ipynb b/colab/AdClip Gemini Prototype.ipynb new file mode 100644 index 0000000..d0b0a8a --- /dev/null +++ b/colab/AdClip Gemini Prototype.ipynb @@ -0,0 +1,1463 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "gZQOxcdIv3w9" + }, + "source": [ + "## AdClip Gemini Prototype\n", + "\n", + "#### AdClip Gemini leverages Gemini to understand long-context videos or video ads, and trim them based on the most important segments. There are two options: automatic trimming for long-context videos (transcript only) or [YouTube ABCDs](https://www.thinkwithgoogle.com/intl/en-apac/future-of-marketing/creativity/youtube-video-ad-creative/) (Attention, Branding, Connection, Direct) for video ads (transcript and visual description).\n", + "\n", + "Contact: adclip-team@google.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u1pD7yVfgJsp" + }, + "source": [ + "## Install" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "collapsed": true, + "id": "BtlqUCvMlLDT" + }, + "outputs": [], + "source": [ + "#@title Install Modules\n", + "!pip install google-cloud-aiplatform --quiet\n", + "!pip install google-cloud-speech --quiet\n", + "!pip install firebase_functions~=0.1.0 --quiet\n", + "!pip install google-cloud-videointelligence --quiet\n", + "!pip install moviepy --quiet\n", + "!pip install pytube --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wB-cy_XYfS2O" + }, + "outputs": [], + "source": [ + "#@title Initialize the imports\n", + "from firebase_functions import https_fn\n", + "from firebase_admin import initialize_app, firestore\n", + "from google.cloud import speech, storage\n", + "from vertexai.preview.language_models import TextGenerationModel\n", + "from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Image, Content, Part #gemini\n", + "from google.cloud import videointelligence\n", + "\n", + "import moviepy.editor as moviepy\n", + "import re\n", + "import itertools\n", + "import functools\n", + "import copy\n", + "import math\n", + "import requests\n", + "from pytube import YouTube\n", + "from urllib.parse import urlparse, parse_qs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "srJ8VH0rghYH" + }, + "outputs": [], + "source": [ + "#@title GCS & FireStore\n", + "\n", + "# Video files are stored in GCS, json files (transcripts, video shots) are stored in FireStore\n", + "\n", + "def upload_blob(source_file_name: str,\n", + " destination_blob_name: str) -> None:\n", + " \"\"\"Upload file to bucket.\"\"\"\n", + " blob = bucket.blob(destination_blob_name)\n", + " blob.upload_from_filename(source_file_name)\n", + "\n", + " print(\n", + " 'File {} uploaded to {}.'.format(source_file_name, destination_blob_name)\n", + " )\n", + "\n", + "def download_blob(source_file_name: str,\n", + " destination_blob_name: str) -> None:\n", + " \"\"\"Download file from bucket.\"\"\"\n", + " blob = bucket.blob(source_file_name)\n", + " # Download the file to a destination\n", + " blob.download_to_filename(destination_blob_name)\n", + "\n", + " print(\n", + " 'File {} downloaded to {}.'.format(source_file_name, destination_blob_name)\n", + " )\n", + "\n", + "def does_file_exist(file_path: str) -> bool:\n", + " \"\"\"Validate if file already existing in the bucket.\n", + "\n", + " Args:\n", + " file_path: A file location.\n", + "\n", + " Returns:\n", + " True if file existed, 
otherwise, False\n", + " \"\"\"\n", + " blob = bucket.get_blob(file_path)\n", + " return blob is not None\n", + "\n", + "def upload_video_shots(file_name: str, video_shots: list) -> None:\n", + " \"\"\"Uploads video shots to firestore.\"\"\"\n", + " db = firestore.client()\n", + " doc_ref = db.collection('video_shots').document(file_name)\n", + " doc_ref.set({'data': video_shots})\n", + "\n", + "\n", + "def get_video_shots(file_name: str) -> list:\n", + " \"\"\"Gets video shots from firestore by file name.\"\"\"\n", + " db = firestore.client()\n", + " doc = db.collection('video_shots').document(file_name).get()\n", + " if not doc.exists:\n", + " return None\n", + " return doc.to_dict().get('data')\n", + "\n", + "def get_transcript(file_name: str) -> list:\n", + " \"\"\"Gets transcript from firestore by file name.\"\"\"\n", + " db = firestore.client()\n", + " doc = db.collection('transcripts').document(file_name).get()\n", + " if not doc.exists:\n", + " return None\n", + "\n", + " return doc.to_dict().get('original')\n", + "\n", + "def upload_transcript(file_name: str, transcript: list) -> None:\n", + " \"\"\"Uploads transcript to firestore.\"\"\"\n", + " db = firestore.client()\n", + " doc_ref = db.collection('transcripts').document(file_name)\n", + " doc_ref.set({'original': transcript})" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "id": "c56iad0LCoEz" }, "outputs": [], "source": [ "#@title Downloads Videos From YouTube\n", + "\n", + "from pytube.exceptions import VideoUnavailable\n", + "\n", + "def get_id(youtube_url):\n", + " \"\"\"Extract YouTube id from YouTube url\n", + " https://www.youtube.com/watch?v=EUYpKwgqi1M -> \"EUYpKwgqi1M\"\n", + " \"\"\"\n", + " u_pars = urlparse(youtube_url)\n", + " quer_v = parse_qs(u_pars.query).get('v')\n", + " if quer_v:\n", + " return quer_v[0]\n", + " pth = u_pars.path.split('/')\n", + " if pth:\n", + " return pth[-1]\n", + "\n", + "def video_accessible(youtube_url):\n", + " try:\n", + " yt = YouTube(youtube_url)\n", + " except VideoUnavailable:\n", + " #print(f'Video {youtube_url} is not accessible')\n", + " return False\n", + " else:\n", + " video_id = get_id(youtube_url)\n", + " video_title = yt.title\n", + " return {\"video_id\": video_id, \"video_title\": video_title}\n", + "\n", + "# Download YouTube video from url\n", + "def download_video_from_url(youtube_url, video_file):\n", + " \"\"\"Download YouTube video from url\n", + " Pytube: https://pytube.io/en/latest/api.html#pytube.Stream.download\n", + " \"\"\"\n", + " path = '/tmp/'\n", + " yt = YouTube(youtube_url)\n", + " video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(output_path=path, filename=video_file)\n", + " return video_path # return video_path string for other functions\n", + "\n", + "def download_video(youtube_url): # Main function for CF\n", + " if youtube_url is None:\n", + " return {\n", + " \"error\": \"Missing url, sample format: https://youtu.be/9wobcM-WPQk\"\n", + " }\n", + " if video_accessible(youtube_url) is False:\n", + " return {\n", + " \"error\": \"Video is not accessible\"\n", + " }\n", + " video_id = video_accessible(youtube_url)[\"video_id\"]\n", + " video_title = video_accessible(youtube_url)[\"video_title\"]\n", + " video_title = re.sub('\W+',' ', video_title) #remove all special characters\n", + " video_file = video_title + \"_\" + video_id + 
'.mp4'\n", + " video_path_gcs = 'videos/' + video_file #adclip.appspot.com/videos/mytitle_ZDDH2.mp4\n", + " if does_file_exist(video_path_gcs):\n", + " print(f\"Video files already exist in GCS: {video_path_gcs}\")\n", + " # return {\n", + " # \"message\": \"Video files already exist in GCS\"\n", + " # }\n", + " else:\n", + " video_path_tmp = download_video_from_url(youtube_url=youtube_url,video_file=video_file)\n", + " upload_blob(video_path_tmp, video_path_gcs) #tmp/video.mp4 --> adclip.appspot.com/videos/video.mp4\n", + " #print(\"Video files uploaded to GCS\")\n", + " return {\n", + " \"video_uri\": \"https://storage.mtls.cloud.google.com/adclip.appspot.com/\" + video_path_gcs,\n", + " \"full_path\": video_path_gcs\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jg3WZVVsfpCR" + }, + "outputs": [], + "source": [ + "#@title Detects Shots (Video Intelligence)\n", + "\n", + "def process_video(video_gcs_uri: str) -> list:\n", + " \"\"\"Processing the video to create video shots with timestamps and store the video shots in GCS.\n", + "\n", + " Args:\n", + " video_gcs_uri: A video gcs uri for processing.\n", + "\n", + " Returns:\n", + " A list of video shots metadata. For example:\n", + " [\n", + " {\n", + " 'start_time': 0.0,\n", + " 'end_time': 4.8\n", + " },\n", + " {\n", + " 'start_time': 5.2,\n", + " 'end_time': 5.6\n", + " }\n", + " ]\n", + " \"\"\"\n", + " video_client = videointelligence.VideoIntelligenceServiceClient()\n", + "\n", + " #TODO: b/306068003 - Add speech-to-text feature here.\n", + " features = [\n", + " videointelligence.Feature.SHOT_CHANGE_DETECTION,\n", + " # videointelligence.Feature.SPEECH_TRANSCRIPTION,\n", + " ]\n", + "\n", + " transcript_config = videointelligence.SpeechTranscriptionConfig(\n", + " language_code=\"en-US\"\n", + " )\n", + " video_context = videointelligence.VideoContext(\n", + " speech_transcription_config=transcript_config\n", + " )\n", + "\n", + " operation = video_client.annotate_video(\n", + " request={\n", + " \"features\": features,\n", + " \"input_uri\": video_gcs_uri,\n", + " }\n", + " )\n", + "\n", + " print(\"\\nProcessing video.\", operation)\n", + "\n", + " result = operation.result(timeout=300)\n", + "\n", + " print(\"\\n finished processing.\")\n", + "\n", + " video_shots = []\n", + " # first result is retrieved because a single video was processed\n", + " for i, shot in enumerate(result.annotation_results[0].shot_annotations):\n", + " start_time = (\n", + " shot.start_time_offset.seconds + shot.start_time_offset.microseconds / 1e6\n", + " )\n", + " end_time = (\n", + " shot.end_time_offset.seconds + shot.end_time_offset.microseconds / 1e6\n", + " )\n", + " video_shots.append(\n", + " {\n", + " \"start_time\": math.floor(start_time * 10) / 10.0,\n", + " \"end_time\": round(end_time, 1),\n", + " }\n", + " )\n", + " print(\"\\tShot {}: {} to {}\".format(i, start_time, end_time))\n", + "\n", + " return video_shots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uFPTPJ67gIxr" + }, + "outputs": [], + "source": [ + "#@title Transcribes Audio & Syncs with Shot\n", + "\n", + "def get_speech_recognition_config(language_code: str):\n", + " \"\"\"Get speech recognition config from the given language code and model.\n", + "\n", + " Check all supported language code and model here:\n", + " https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages\n", + "\n", + " Args:\n", + " language_code: A language code for transribing.\n", + " 
model: A video transcribe model.\n", + "\n", + " Return:\n", + " A speech recognition config.\n", + " \"\"\"\n", + " if language_code == 'en-US':\n", + " model = 'video'\n", + " elif language_code == 'zh-TW':\n", + " model = 'command_and_search'\n", + " else:\n", + " model = 'default'\n", + " return speech.RecognitionConfig(\n", + " enable_word_time_offsets=True,\n", + " audio_channel_count=2, # 2 is default for wav files\n", + " # Enable automatic punctuation\n", + " # enable_automatic_punctuation=True,\n", + " language_code=language_code,\n", + " model=model,\n", + " # Works for model=\"video\" or \"phone call\" (en-US only)\n", + " use_enhanced=True,\n", + " )\n", + "\n", + "def extract_audio(video_full_path, file_name, output_name=None) -> str:\n", + " \"\"\"Extract audio from the video by the given video path.\n", + "\n", + " Args:\n", + " video_full_path: A full video path that store in GCS.\n", + " file_name: A file name for temp use.\n", + " output_name: A custom output name.\n", + "\n", + " Returns:\n", + " A path to video audio file.\n", + " \"\"\"\n", + " file_name_without_extension = file_name.rsplit('.', 1)[0]\n", + " if output_name is None:\n", + " audio_output_file = file_name_without_extension + '.wav'\n", + " else:\n", + " audio_output_file = output_name + '.wav'\n", + " gcs_file_path = AUDIO_FOLDER + audio_output_file\n", + "\n", + " if does_file_exist(gcs_file_path):\n", + " print('File {} exists'.format(gcs_file_path))\n", + " return GS_PATH + gcs_file_path\n", + " tmp_file_path = TEMP_FOLDER + file_name\n", + "\n", + " # use video file_path\n", + " blob = bucket.blob(video_full_path)\n", + " blob.download_to_filename(tmp_file_path)\n", + " clip = moviepy.VideoFileClip(tmp_file_path)\n", + " audio_output_path = TEMP_FOLDER + audio_output_file\n", + " clip.audio.write_audiofile(audio_output_path)\n", + "\n", + " upload_blob(audio_output_path, gcs_file_path)\n", + "\n", + " return GS_PATH + gcs_file_path\n", + "\n", + "\n", + "def build_transcript(response) -> list:\n", + " \"\"\"Build video transcript response with transcript metadata.\n", + "\n", + " Args:\n", + " response: A transcript response from speech API.\n", + "\n", + " Returns:\n", + " A list of new video transcript strucutre and metadata.\n", + " For example,\n", + " [\n", + " {\n", + " \"text\": \"some sentence\"\n", + " \"startTime\": 0,\n", + " \"endTime\": 2.8,\n", + " \"duration\": 2.8\n", + " \"words\": [\n", + " {\n", + " \"text\": \"some\"\n", + " \"startTime\": 0,\n", + " \"endTime\": 1.2,\n", + " \"duration\": 1.2\n", + " },\n", + " {\n", + " \"text\": \"sentence\"\n", + " \"startTime\": 1.2,\n", + " \"endTime\": 2.8,\n", + " \"duration\": 1.6\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + " \"\"\"\n", + " transcript_builder = []\n", + " last_end_time = 0\n", + " # Each result is for a consecutive portion of the audio. 
Iterate through\n", + " # them to get the transcripts for the entire audio file.\n", + " for result in response.results:\n", + " # The first alternative is the most likely one for this portion.\n", + " for alternative in result.alternatives:\n", + "\n", + " if len(alternative.words) > 0:\n", + " transcript_item = {\n", + " 'text': alternative.transcript,\n", + " 'startTime': alternative.words[0].start_time.total_seconds(),\n", + " 'endTime': alternative.words[-1].end_time.total_seconds(),\n", + " 'duration': (alternative.words[-1].end_time.total_seconds()\n", + " - alternative.words[0].start_time.total_seconds())\n", + " }\n", + "\n", + " transcript_item['words'] = []\n", + " for word in alternative.words:\n", + " transcript_item['words'].append({\n", + " 'text': word.word,\n", + " 'startTime': word.start_time.total_seconds(),\n", + " 'endTime': word.end_time.total_seconds(),\n", + " 'duration': (word.end_time.total_seconds()\n", + " - word.start_time.total_seconds()),\n", + " 'gap': word.end_time.total_seconds() - last_end_time})\n", + " last_end_time = word.end_time.total_seconds()\n", + " transcript_builder.append(transcript_item)\n", + " return transcript_builder\n", + "\n", + "\n", + "def generate_transcript_item(\n", + " words: list, start_time: float = None, end_time: float = None) -> dict:\n", + " \"\"\"Generates transcript item.\"\"\"\n", + " start_time = words[0]['startTime'] if start_time is None else start_time\n", + " end_time = words[-1]['endTime'] if end_time is None else end_time\n", + " return {\n", + " 'text': ' '.join(list(map(lambda word: word['text'], words))),\n", + " 'startTime': start_time,\n", + " 'endTime': end_time,\n", + " 'duration': end_time - start_time,\n", + " 'words': words\n", + " }\n", + "\n", + "\n", + "def refine_by_gaps(transcript: list) -> list:\n", + " \"\"\"Refines the transcript by the gap time.\"\"\"\n", + " new_transcript = []\n", + "\n", + " for line in transcript:\n", + " gaps = list(map(lambda clip: clip['gap'], line['words']))\n", + " gaps.pop(0) #remove first gap\n", + "\n", + " if len(gaps) == 0:\n", + " continue\n", + " average = sum(gaps) / len(gaps)\n", + " words = []\n", + " for index, word in enumerate(line['words']):\n", + " if index > 1 and word['gap'] > average * GAP_MULTIPLIER:\n", + " new_transcript.append(generate_transcript_item(words))\n", + " words = []\n", + " words.append(word)\n", + " if len(words) > 0:\n", + " new_transcript.append(generate_transcript_item(words))\n", + " return new_transcript\n", + "\n", + "\n", + "\n", + "def merge_clips(transcript: list) -> list:\n", + " \"\"\"Merges clips under 5seconds.\"\"\"\n", + " if len(transcript) == 0:\n", + " return []\n", + "\n", + " def merge(transcript1, transcript2):\n", + " \"\"\"Merges transcript1 and transcript2.\"\"\"\n", + " start_time = transcript1['startTime']\n", + " end_time = max(transcript1['endTime'], transcript2['endTime'])\n", + " return {\n", + " 'text': f\"{transcript1['text']} {transcript2['text']}\",\n", + " 'startTime': start_time,\n", + " 'endTime': end_time,\n", + " 'duration': end_time - start_time,\n", + " 'words': transcript1['words'] + transcript2['words'],\n", + " }\n", + "\n", + " def is_overlapping(transcript1, transcript2):\n", + " \"\"\"Validate overlapping transcript time.\"\"\"\n", + " t2_start_time = transcript2['words'][0]['startTime']\n", + " t2_prev_start_time = transcript2['words'][-1]['startTime']\n", + " t1_start_time = transcript1['startTime']\n", + " t1_end_time = transcript1['endTime']\n", + " return t2_start_time >= t1_start_time 
and t2_prev_start_time <= t1_end_time\n", + "\n", + " output = []\n", + " index = 0\n", + " clip = transcript[index]\n", + "\n", + " for index in range(len(transcript)):\n", + " if index < len(transcript) - 1:\n", + " next = transcript[index + 1]\n", + " if (next['endTime'] - clip['startTime'] <= MIN_CLIP_DURATION or\n", + " is_overlapping(clip, next)):\n", + " clip = merge(clip, next)\n", + " else:\n", + " output.append(clip)\n", + " clip = transcript[index + 1]\n", + " else:\n", + " output.append(clip)\n", + " return output\n", + "\n", + "\n", + "def refine_by_video_shots(\n", + " file_name: str, video_gcs_uri: str, transcript: list) -> list:\n", + " \"\"\"Refines transcript with video shots data.\"\"\"\n", + "\n", + " new_transcript = []\n", + " video_shots = get_video_shots(file_name)\n", + "\n", + " if video_shots is None:\n", + " video_shots = process_video(video_gcs_uri)\n", + " upload_video_shots(file_name, video_shots)\n", + "\n", + " video_shots_index = 0\n", + " list_of_words = list(map(lambda line: line['words'], transcript))\n", + " transcript_words = list(itertools.chain.from_iterable(list_of_words))\n", + " print('\\\\\\\\\\ Transcript_words ////')\n", + " print(transcript_words)\n", + " words = []\n", + "\n", + " for index, word in enumerate(transcript_words):\n", + " words.append(word)\n", + " while video_shots[video_shots_index]['end_time'] <= words[0]['startTime']:\n", + " video_shots_index = video_shots_index + 1\n", + " video_shot = video_shots[video_shots_index]\n", + " if word['endTime'] > video_shot['end_time']:\n", + " start_time = min(words[0]['startTime'], video_shot['start_time'])\n", + " if index < len(transcript_words) - 1:\n", + " end_time = max(\n", + " word['endTime'],\n", + " min(\n", + " video_shot['end_time'], transcript_words[index + 1]['startTime']\n", + " ),\n", + " )\n", + " else:\n", + " end_time = max(word['endTime'], video_shot['end_time'])\n", + " video_shots_index = video_shots_index + 1\n", + " new_transcript.append(\n", + " generate_transcript_item(words, start_time, end_time)\n", + " )\n", + " words = []\n", + " if len(words) > 0:\n", + " start_time = min(\n", + " words[0]['startTime'], video_shots[video_shots_index]['start_time']\n", + " )\n", + " if len(new_transcript) > 0:\n", + " previous_last_word = new_transcript[-1]['words'][-1]\n", + " start_time = max(start_time, previous_last_word['endTime'])\n", + "\n", + " end_time = max(word['endTime'], video_shots[video_shots_index]['end_time'])\n", + " video_shots_index = video_shots_index + 1\n", + " new_transcript.append(generate_transcript_item(words, start_time, end_time))\n", + "\n", + " return new_transcript\n", + "\n", + "# @https_fn.on_call(\n", + "# timeout_sec=600,\n", + "# memory=options.MemoryOption.GB_4,\n", + "# cpu=2,\n", + "# region='asia-southeast1',\n", + "# )\n", + "\n", + "# video_full_path = request.data['full_path'] or VIDEO_FULL_PATH\n", + "# file_name = request.data['file_name'] or FILE_NAME\n", + "# language_code = request.data['language_code'] or LANGUAGE_CODE\n", + "\n", + "\n", + "def transcribe_video() -> any:\n", + " \"\"\"Transcribe video audio and store the transcript in GCS.\n", + "\n", + " Args:\n", + " request: A request payload from API call.\n", + "\n", + " Returns:\n", + " An object that contain video transcript with the timestamp data.\n", + " \"\"\"\n", + "\n", + " video_full_path = VIDEO_FULL_PATH\n", + " file_name = FILE_NAME\n", + " language_code = LANGUAGE_CODE\n", + "\n", + " if video_full_path is None:\n", + " return {\n", + " 'error': (\n", + " 
'Missing video uri, sample format:'\n", + " ' https://googleapis.com/example.wav'\n", + " )\n", + " }\n", + "\n", + " transcript_in_firestore = get_transcript(file_name)\n", + " if transcript_in_firestore is not None:\n", + " return {\n", + " 'transcript': merge_clips(\n", + " refine_by_video_shots(\n", + " file_name,\n", + " GS_PATH + video_full_path,\n", + " transcript_in_firestore)),\n", + " 'original': transcript_in_firestore,\n", + " 'v1': refine_by_gaps(transcript_in_firestore),\n", + " }\n", + "\n", + " audio_gcs_uri = extract_audio(video_full_path, file_name)\n", + " print(f'Extracted audio is stored at {audio_gcs_uri}')\n", + "\n", + " audio = speech.RecognitionAudio(uri=audio_gcs_uri)\n", + " client = speech.SpeechClient()\n", + "\n", + " config = get_speech_recognition_config(language_code)\n", + "\n", + " operation = client.long_running_recognize(config=config, audio=audio)\n", + "\n", + " print(\"Waiting for operation to complete...\")\n", + " response = operation.result(timeout=900)\n", + "\n", + " transcript = build_transcript(response)\n", + " upload_transcript(file_name, transcript)\n", + "\n", + " return {\n", + " 'transcript': merge_clips(\n", + " refine_by_video_shots(\n", + " file_name,\n", + " GS_PATH + video_full_path,\n", + " transcript)),\n", + " 'original': transcript,\n", + " 'v1': refine_by_gaps(transcript)\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xjPmIUMkz_fQ" + }, + "outputs": [], + "source": [ + "#@title Syncs with Shot (After Summarization)\n", + "\n", + "def match_with_video_shots(video_shots: list,\n", + " transcript: list,\n", + " words: list) -> list:\n", + " \"\"\"Adjusts the startTime and endTime of each line in the transcript.\n", + "\n", + " This implementation helps with \"jumpy\" transition in the final output video.\n", + "\n", + " Args:\n", + " video_shots: The list containing video shots in format of\n", + " [{end_time, start_time}, {end_time, start_time},]\n", + " transcript: The full transcript transcribed by Speech to Text AI.\n", + " words: A list containing the startTime and eachTime of each word in the full\n", + " transcript.\n", + "\n", + " Returns:\n", + " The transcript with the adjusted startTime and endTime.\n", + " \"\"\"\n", + " shot_index = 0\n", + " word_index = 0\n", + " for index, line in enumerate(transcript):\n", + " while video_shots[shot_index]['endTime'] <= line['startTime']:\n", + " shot_index += 1\n", + " video_shot = video_shots[shot_index]\n", + "\n", + " start_time = min(line['startTime'], video_shot['startTime'])\n", + " while (\n", + " word_index + 1 < len(words) - 1\n", + " and words[word_index + 1]['endTime'] < line['startTime']\n", + " ):\n", + " word_index += 1\n", + " previous_word = words[word_index]\n", + " if previous_word['startTime'] != line['startTime']:\n", + " start_time = max(previous_word['endTime'], start_time)\n", + "\n", + " transcript[index]['startTime'] = start_time\n", + "\n", + " while video_shots[shot_index]['endTime'] < line['endTime']:\n", + " shot_index += 1\n", + " video_shot = video_shots[shot_index]\n", + "\n", + " end_time = max(line['endTime'], video_shot['endTime'])\n", + "\n", + " while (\n", + " word_index < len(words) - 1\n", + " and words[word_index]['startTime'] < line['endTime']\n", + " ):\n", + " word_index += 1\n", + " next_word = words[word_index]\n", + " if next_word['endTime'] != line['endTime']:\n", + " end_time = min(end_time, next_word['startTime'])\n", + "\n", + " if index == len(transcript) - 1:\n", + " end_time 
= video_shots[-1]['endTime']\n", + " else:\n", + " #manually add 0.3s to end_time for better transitions\n", + " end_time = round(end_time + 0.3, 2)\n", + "\n", + " transcript[index]['endTime'] = end_time\n", + " transcript[index]['duration'] = end_time - start_time\n", + " return transcript\n", + "\n", + "\n", + "def extract_words_from_str(summary: str) -> list:\n", + " \"\"\"Extracts the words from the given summary splitting by space.\n", + "\n", + " Args:\n", + " summary: A summary of the transcript.\n", + "\n", + " Return:\n", + " A list of words from the given summary.\n", + " \"\"\"\n", + " # Remove the trailing \"transcript:\" from the summarized transcript from LLM\n", + " if summary.lstrip().lower().startswith('transcript:'):\n", + " summary = summary.lower().replace('transcript:', '', 1)\n", + "\n", + " summary = re.sub('[,.?!]', '', summary).lower()\n", + " summary = summary.replace('\\n', ' ')\n", + "\n", + " words = summary.split(' ')\n", + " words = list(filter(lambda word: len(word) > 0, words))\n", + " print(f'words: {words}')\n", + " return words\n", + "\n", + "\n", + "\n", + "def get_clips_from_transcript(\n", + " #self,\n", + " transcript_words: list,\n", + " shortened_text: str,\n", + " input_transcript: list) -> list:\n", + " \"\"\"Identifies the clip from the summarized transcript. This function minimizes the hallucination when LLM\n", + " doesn't respect the original sentences from the full transcripts by adding new words or only returning parts\n", + " of the original sentences in its response.\n", + "\n", + " Example:\n", + " - Original sentence: \"MacBook Air for the first time ever in 15 inches we've been dreaming about making this for years we\"\n", + " - Response from LLM: \"MacBook Air for the first time ever in 15 inches...\"\n", + "\n", + " Args:\n", + " transcript: The original full transcripts\n", + " summary: The \"summarized\" transcript from LLM\n", + "\n", + " Returns:\n", + " A list containing the adjusted text, start_time, end_time, duration.\n", + " \"\"\"\n", + " print(\"----get_clips_from_transcript-----'\")\n", + " print(transcript_words)\n", + " transcript_ptr = 0\n", + " output = []\n", + "\n", + " summary_words = extract_words_from_str(shortened_text)\n", + "\n", + " word_ptr = 0\n", + "\n", + " def does_word_match_transcript(transcript_idx: int, word_idx: int):\n", + " if (transcript_idx >= len(transcript_words) or\n", + " word_idx >= len(summary_words)):\n", + " return False\n", + "\n", + " transcript_word_text = transcript_words[transcript_idx].get('text')\n", + " transcript_word_text = re.sub('[,.?!]', '', transcript_word_text)\n", + " return (transcript_word_text.lower() ==\n", + " summary_words[word_idx].lower())\n", + "\n", + " while transcript_ptr < len(transcript_words):\n", + " transcript_builder = []\n", + "\n", + " # loop until the summary word match with transcript\n", + " # or until the transcript has True shouldKeep flag\n", + " while (transcript_ptr < len(transcript_words)\n", + " and not does_word_match_transcript(transcript_ptr, word_ptr)\n", + " and transcript_words[transcript_ptr].get('shouldKeep') != True):\n", + " transcript_ptr = transcript_ptr + 1\n", + "\n", + " # append all matched transcript summary\n", + " # or transcript that has True shouldKeep flag\n", + " while (transcript_ptr < len(transcript_words) and\n", + " (does_word_match_transcript(transcript_ptr, word_ptr)\n", + " or does_word_match_transcript(transcript_ptr + 1, word_ptr + 1)\n", + " or does_word_match_transcript(transcript_ptr + 2, word_ptr + 1)\n", 
+ " or transcript_words[transcript_ptr].get('shouldKeep') == True)):\n", + " transcript_builder.append(transcript_words[transcript_ptr])\n", + "\n", + " if does_word_match_transcript(transcript_ptr, word_ptr):\n", + " word_ptr += 1\n", + "\n", + " elif transcript_words[transcript_ptr].get('shouldKeep') != True:\n", + " transcript_builder.append(transcript_words[transcript_ptr+1])\n", + "\n", + " if not does_word_match_transcript(transcript_ptr + 1, word_ptr + 1):\n", + " transcript_builder.append(transcript_words[transcript_ptr+2])\n", + " transcript_ptr += 1\n", + "\n", + " transcript_ptr += 1\n", + " word_ptr += 2\n", + "\n", + " transcript_ptr += 1\n", + "\n", + " if len(transcript_builder) == 0:\n", + " continue\n", + " if len(transcript_builder) == 1:\n", + " word_ptr -= 1\n", + " continue\n", + "\n", + " new_text = list(map(lambda item: item.get('text'), transcript_builder))\n", + " output.append({\n", + " 'text': ' '.join(new_text),\n", + " 'startTime': transcript_builder[0].get('startTime'),\n", + " 'endTime': transcript_builder[-1].get('endTime'),\n", + " 'duration': (transcript_builder[-1].get('endTime') -\n", + " transcript_builder[0].get('startTime')),\n", + " 'words': transcript_builder\n", + " })\n", + " return output" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Q88CnLNeU1m-" }, "outputs": [], "source": [ "#@title Gemini Functions\n", + "\n", + "from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, SafetySetting, Image, Content, Part, FinishReason, HarmCategory, HarmBlockThreshold\n", + "\n", + "def upload_segments_to_gemini_bucket(segments: list,\n", + " source_file_name: str,\n", + " destination_blob_name: str) -> list:\n", + " \"\"\"Splits the video into segments based on the output of transcribe_video (for videos with\n", + " voiceovers) or of the shot detection API, then uploads those segments to GCS.\n", + " Returns: A list of URIs for the segmented videos.\"\"\"\n", + " local_video_path = f\"/content/{VIDEO_FULL_PATH.replace('videos', '')}\" # Process video files in Colab's content folder for quicker processing\n", + " download_blob(source_file_name=f\"{VIDEO_FULL_PATH}\", #/videos/video.mp4\n", + " destination_blob_name=local_video_path) #download to /content/ in Colab --- change later when deploying to CF, use /tmp/\n", + " original_clip = moviepy.VideoFileClip(local_video_path)\n", + " segments_uri = []\n", + " for counter, segment in enumerate(segments):\n", + " new_clip = original_clip.subclip(segment['startTime'], segment['endTime'])\n", + " file_name_without_extension = FILE_NAME.replace('.mp4', '')\n", + " segment_file = f\"{file_name_without_extension}_{counter}.mp4\"\n", + " new_clip.write_videofile(f\"/content/{segment_file}\",audio_codec=\"aac\", logger=None)\n", + " upload_blob(source_file_name=segment_file,\n", + " destination_blob_name=f'gemini/{segment_file}') # Upload to adclip.appspot.com/gemini/ bucket (separate from video bucket)\n", + " segments_uri.append(f\"{GS_PATH}gemini/{segment_file}\")\n", + " return segments_uri\n", + "\n", + "def send_video_to_gemini(video_full_path, root_prompt=\"Provide a description of the visual elements of the video\", context=\"\"):\n", + " model = GenerativeModel(\"gemini-pro-vision\")\n", + " video = Part.from_uri(video_full_path, mime_type=\"video/mp4\")\n", + " prompt = f\"{root_prompt} '\\n' {context}\"\n", + " contents = [video, prompt]\n", + " generation_config = 
GenerationConfig(max_output_tokens=2048,\n", + " temperature=0.4,\n", + " top_p=1,\n", + " top_k=32)\n", + " responses = model.generate_content(contents,\n", + " stream = True,\n", + " generation_config=generation_config,)\n", + " response = \"\"\n", + " for chunk in responses:\n", + " response += chunk.text\n", + " return response\n", + "\n", + "def send_transcript_to_llm(text: str,\n", + " model: str = \"gemini-1.5-pro-001\",\n", + " temperature: float = 0.3,\n", + " max_output_tokens: int = 8192,\n", + " top_k: int = 32,\n", + " top_p: int = 1) -> str:\n", + " \"\"\"Sends a transcript to Vertex LLM.\n", + "\n", + " Args:\n", + " text: A prompt to generate the response from the model.\n", + " model: The Language Model to use.\n", + " temperature: A temperature indicates the degree of randomness in token selection.\n", + " max_output_tokens: The maximum number of tokens that can be generated in the response.\n", + " top_k: A value indicates how the model selects tokens for output.\n", + " top_p: A value indicates how the model selects tokens for output.\n", + "\n", + " Returns:\n", + " A string of the summarized transcript.\n", + " \"\"\"\n", + " model = GenerativeModel(model)\n", + " prompt = f\"{root_prompt} '\\n' {text} \"\n", + " contents = [prompt]\n", + " generation_config = GenerationConfig(\n", + " max_output_tokens=max_output_tokens,\n", + " temperature=temperature,\n", + " top_k=top_k,\n", + " top_p=top_p,)\n", + "\n", + " safety_config = [\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_HARASSMENT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_HATE_SPEECH,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " ]\n", + "\n", + " responses = model.generate_content(contents,\n", + " generation_config=generation_config,\n", + " safety_settings=safety_config,\n", + " stream=False,\n", + " )\n", + " print(responses.text)\n", + " return responses.text\n", + "\n", + "\n", + "def select_segments(num_of_lines=3, descriptions=\"\"):\n", + " model = GenerativeModel(\"gemini-1.5-pro-001\")\n", + " prompt = f\"\"\"\n", + " You are an expert video editor. 
Your task is to trim a video ad based on the following criteria\n", + " - Awareness: Strong message, scene/tease, or conclusion or Showcase the subject, product, animation tightly framed, or close up, or zoomed in\n", + " - Branding: Mention of brand name, logo, products, packages\n", + " - Connection: Visible face or human (body parts, humans, animations, cartoons are acceptable) or Product Interaction (for example: a user holding phone, eating product, using app)\n", + " or Clear messaging on benefits to consumer (show what products can do, what brands want viewers to do) or Competitive claims (awards, review, recommendation) or\n", + " Emotional response through text, speech, music (fear, laughter, sadness, disgust, surprise, delight, etc.)\n", + " - Direction: Call-to-action is detected through text or audio or Path to purchase showing how to buy (physical stores, app, website) is detected\n", + " or A search bar is visible on screen or Mention of limited quantities, price, special offer\n", + "\n", + " Below are the computer-generated transcript and visual description of each of the lines in the video\n", + "\n", + " Provide the top {num_of_lines} lines for each of the criteria above in JSON.\n", + "\n", + " {{'Awareness': {{'Line': Description,\n", + " 'Line': Description}},\n", + " 'Branding':}}\n", + "\n", + " {descriptions}\n", + " \"\"\"\n", + " generation_config = GenerationConfig(\n", + " max_output_tokens=2048,\n", + " temperature=0.2,\n", + " top_k=32,\n", + " top_p=1,)\n", + " safety_config = [\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_HARASSMENT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_HATE_SPEECH,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " ]\n", + " responses = model.generate_content(\n", + " prompt,\n", + " generation_config=generation_config,\n", + " safety_settings=safety_config,\n", + " stream=False,\n", + " )\n", + " return responses.text\n", + "\n", + "def get_descriptions_of_all_shots():\n", + " descriptions = \"\"\n", + " for counter, segment in enumerate(segments):\n", + " if has_voice_over:\n", + " descriptions += (f\"Line {counter} \\nTranscript: {segment['text']}\\nVisual Description: {segment['visual_description']}\\n\")\n", + " else:\n", + " descriptions += (f\"Line {counter} \\nVisual Description: {segment['visual_description']}\\n\")\n", + " #print(descriptions)\n", + " return descriptions\n", + "\n", + "\n", + "root_prompt =\"\"\"You are a senior copywriter for an advertising agency who excels at summarizing transcripts for video ads.\n", + "Shorten the transcript by keeping important lines and removing other lines. Make sure the output is less than 30% of the original input transcript.\n", + "Keep the format of the output the same as the input. Keep Line number. 
Do not capitalize sentences, add commas, or rewrite the output.\n", + "\n", + "input:\n", + "MacBook Air\n", + "for the first time ever in 15 inches\n", + "we\\'ve been dreaming about making this for years\n", + "we designed a big beautiful display\n", + "the kept it incredibly thin and super like it\\'s all possible because of a basilica\n", + "M2 is so efficient but we don\\'t need a fan which means you\\'ll have a MacBook Air that\\'s a thin as ever while running completely\n", + "silent and everything comes together inside a design packed so tight that there\\'s barely room for an ant\\'s\n", + "from the side it almost\n", + "disappears and the liquid Retina Display\n", + "just look at it\n", + "that\\'s twice the resolution of a comparable 15 inch PC laptop and 25 percent brighter a bigger display means more room and more room means more speakers\n", + "double the Boost double the Beast\n", + "they have double the base of the\n", + "13-inch MacBook Air and because the speakers are located behind the\n", + "keyboard the sound reflects off the display towards the viewer so it feels super\n", + "immersive\n", + "okay tell us more about the chip it\\'s a very efficient SOC with an 8 core\n", + "CPU tank or\n", + "GPU and a 16 Corner oh engine\n", + "running fifteen point eight trillion operations per second and comes with up to twenty four gigs of memory which\n", + "means killer battery life and crazy fast\n", + "of course it also works great with iPhone\n", + "and the one thing no one realizes well other laptops that are thin and wide can sometimes feel\n", + "flimsy\n", + "so we\\'re extremely intentional with the are structural design keeping it ultra light but making it as durable as possible\n", + "because you know things happen\n", + "this is the\n", + "an inch laptop but we\\'ve always wanted to create its\n", + "uncompromising\n", + "expansive the in light and Powerful we love making it and we think you\\'ll love using it\n", + "\n", + "output:\n", + "MacBook Air\n", + "for the first time ever in 15 inches\n", + "we\\'ve been dreaming about making this for years\n", + "we designed a big beautiful display\n", + "just look at it\n", + "they have double the base of the\n", + "immersive\n", + "GPU and a 16 Corner oh engine\n", + "running fifteen point eight trillion operations per second and comes with up to twenty four gigs of memory which\n", + "means killer battery life and crazy fast\n", + "uncompromising\n", + "expansive the in light and Powerful we love making it and we think you\\'ll love using it\n", + "\n", + "input:\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UP70fomJkRAc" + }, + "outputs": [], + "source": [ + "#@title Video Cutting Functions\n", + "\n", + "from moviepy.video.fx.all import crop\n", + "\n", + "def merge_segments(segments):\n", + " if len(segments) == 0:\n", + " return []\n", + "\n", + " output = [segments[0]]\n", + "\n", + " for index in range(1, len(segments)):\n", + " if segments[index]['startTime'] <= output[-1]['endTime']:\n", + " output[-1]['endTime'] = segments[index]['endTime']\n", + " else:\n", + " output.append(segments[index])\n", + " return output\n", + "\n", + "def clip_video(video_path, file_name, segments):\n", + " # loading original video\n", + " original_clip = moviepy.VideoFileClip(video_path)\n", + " new_clip = {}\n", + "\n", + " segments = merge_segments(segments)\n", + "\n", + " for segment in segments:\n", + " #make sure end_time does not exceed the video duration\n", + " end_time = 
min(segment['endTime'], original_clip.duration)\n", + " if new_clip:\n", + " new_clip = moviepy.concatenate_videoclips([new_clip, original_clip.subclip(segment['startTime'], end_time)])\n", + " else:\n", + " new_clip = original_clip.subclip(segment['startTime'], end_time)\n", + "\n", + " (w, h) = new_clip.size\n", + "\n", + " # 9/16 ratio\n", + "\n", + " crop_width = h * 9/16\n", + " crop_width = crop_width//2*2\n", + "\n", + " x1, x2 = (w - crop_width)//2, (w+crop_width)//2\n", + " y1, y2 = 0, h\n", + " cropped_clip = crop(new_clip, x1=x1, y1=y1, x2=x2, y2=y2)\n", + "\n", + " # 1/1 ratio\n", + "\n", + " crop_width = h\n", + " crop_width = crop_width//2*2\n", + "\n", + " x1, x2 = (w - crop_width)//2, (w+crop_width)//2\n", + " y1, y2 = 0, h\n", + " cropped_clip_square = crop(new_clip, x1=x1, y1=y1, x2=x2, y2=y2)\n", + "\n", + " video_output_path = f\"/content/output_vertical_{FILE_NAME}\" # 9:16 vertical\n", + " video_output_square = f\"/content/output_square_{FILE_NAME}\" # 1:1\n", + " video_output_path_original = f\"/content/output_horizontal_{FILE_NAME}\"\n", + "\n", + " # Write file without wm\n", + " cropped_clip.write_videofile(video_output_path,audio_codec=\"aac\") #put in audio_codec so the clip has sound\n", + " cropped_clip_square.write_videofile(video_output_square, audio_codec=\"aac\")\n", + " new_clip.write_videofile(video_output_path_original,audio_codec=\"aac\") #put in audio_codec so the clip has sound\n", + "\n", + " # upload to cloud storage\n", + " # upload_blob(video_output_path, 'output/' + 'vertical_tmp_' + file_name)\n", + " # upload_blob(video_output_square, 'output/' + 'square_tmp_' + file_name)\n", + " # upload_blob(video_output_path_original, 'output/' + 'landscape_tmp_' + file_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "webKdTrceHS8" + }, + "source": [ + "## GCP & Videos Processing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "alXRLxDilkbS" + }, + "outputs": [], + "source": [ + "#@title Configuration\n", + "\n", + "#@markdown ##### Note: There will be a pop-up window asking you to authenticate. Feel free to use your own project and enable all required APIs.\n", + "\n", + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "LOCATION = \"\" # @param {type:\"string\"}\n", + "GCLOUD_BUCKET_NAME = \"\" # @param {type:\"string\"}\n", + "GS_PATH = f'gs://{GCLOUD_BUCKET_NAME}/'\n", + "\n", + "\n", + "# From CF\n", + "TEMP_FOLDER = '/tmp/'\n", + "AUDIO_FOLDER = 'videos/audio/' # in Cloud Storage\n", + "\n", + "# Customized for transcribe\n", + "GAP_MULTIPLIER = 2.5\n", + "MIN_CLIP_DURATION = 5\n", + "\n", + "#@title Initialize GCP\n", + "\n", + "from google.colab import auth as google_auth\n", + "google_auth.authenticate_user(project_id=PROJECT_ID)\n", + "!gcloud config set project {PROJECT_ID}\n", + "!gcloud config get-value project\n", + "\n", + "storage_client = storage.Client()\n", + "bucket = storage_client.get_bucket(GCLOUD_BUCKET_NAME)\n", + "\n", + "#Initialize Front End; Need to initialize to use Firestore\n", + "initialize_app()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "eWyp5qd7FAWi" + }, + "outputs": [], + "source": [ + "#@title Download Videos from YouTube and Upload to GCS\n", + "\n", + "#@markdown ##### If the functionality to download from YouTube is not working, you can upload to the video to GCS bucket and put the VIDEO_FULL_PATH directly in the box. 
[Link to GCS bucket](https://pantheon.corp.google.com/storage/browser/adclip.appspot.com/videos?e=13802955&mods=dm_deploy_from_gcs&project=adclip&pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))) `gs://{GCLOUD_BUCKET_NAME}/{VIDEO_FULL_PATH}`\n", + "youtube_url = \"\" # @param {type:\"string\"}\n", + "VIDEO_FULL_PATH = \"\" # @param {type:\"string\"}\n", + "\n", + "#@markdown ##### Not all languages are supported.\n", + "has_voice_over = True # @param [\"False\", \"True\"] {type:\"raw\"}\n", + "LANGUAGE_CODE = \"en-US\" # @param [\"en-US\", \"es-ES\", \"id-ID\", \"fil-PH\", \"id-ID\", \"th-TH\", \"vi-VN\"]\n", + "\n", + "#@markdown ##### Select this option to analyze videos with visual elements. Note: Only do this for video less than 10 minutes please.\n", + "upload_to_gemini_bucket = False # @param [\"False\", \"True\"] {type:\"raw\"}\n", + "\n", + "##@markdown ##### Select the model for Speech-to-Text. Not all models are available (for example: \"video\" are only available for en-US. If in doubt, use \"default\". Check [this link](https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages) for details\"\n", + "#MODEL = \"default\" # @param [\"default\", \"command_and_search\", \"latest_short\", \"video\"]\n", + "\n", + "try:\n", + " VIDEO_FULL_PATH = download_video(youtube_url)[\"full_path\"] # will return `videos/myclip.mp4`\n", + "except:\n", + " VIDEO_FULL_PATH = VIDEO_FULL_PATH\n", + "\n", + "FILE_NAME = VIDEO_FULL_PATH.split(\"/\")[-1]\n", + "\n", + "if has_voice_over:\n", + " segments = transcribe_video()[\"transcript\"] # ['transcript', 'original', 'v1']\n", + "else:\n", + " shots = process_video(\n", + " video_gcs_uri=f\"gs://{GCLOUD_BUCKET_NAME}/{VIDEO_FULL_PATH}\") # returns videoshot only\n", + " segments = []\n", + " for shot in shots:\n", + " reformed_keys = {'start_time':'startTime', 'end_time':'endTime'} # due to TTS and Shot Detection have different convention for key\n", + " reformed_dict = dict((reformed_keys[key], value) for (key, value) in shot.items())\n", + " segments.append(reformed_dict)\n", + "\n", + "print(\"\\\\\\ Segments ////\")\n", + "print(segments)\n", + "print(f\"The video is segmented to {len(segments)} parts.\")\n", + "\n", + "if upload_to_gemini_bucket:\n", + " file_name_without_extension = FILE_NAME.replace('.mp4', '')\n", + " segment_file_0 = f\"{file_name_without_extension}_0.mp4\"\n", + "\n", + " if not does_file_exist(f\"gemini/{segment_file_0}\"): # check if the 1st file exists in bucket\n", + " segments_uri = upload_segments_to_gemini_bucket(segments,\n", + " source_file_name=VIDEO_FULL_PATH,\n", + " destination_blob_name=f'gemini/{FILE_NAME}')\n", + "\n", + " print(f\"Completed uploading {len(segments)} segments to {GCLOUD_BUCKET_NAME} buckets\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GnUwrHmHg8GR" + }, + "source": [ + "## Gemini Processing (Visual + Semantic)\n", + "\n", + "### Only run Option 1 or Option 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "collapsed": true, + "id": "t5oN3mppERvf" + }, + "outputs": [], + "source": [ + "# @title Option 1: Select Shots based on Transcript and Visual Description (ABCD framework)\n", + "\n", + "if send_video_to_gemini:\n", + " for counter, segment in enumerate(segments):\n", + " video_full_path = f\"gs://adclip.appspot.com/gemini/{file_name_without_extension}_{counter}.mp4\"\n", + " try:\n", + " response = send_video_to_gemini(video_full_path)\n", + " 
segment['visual_description'] = response.strip()\n", + " except:\n", + " segment['visual_description'] = \"\" # the response got blocked\n", + " descriptions = get_descriptions_of_all_shots()\n", + " print(descriptions)\n", + "\n", + " num_of_lines = int(len(segments)/2) # The higher num of lines, the longer the video will be\n", + " data = select_segments(num_of_lines=num_of_lines, descriptions=descriptions).strip(\"'\")\n", + " print(data)\n", + " print('The response is a string. Please enter the selected line numbers in the cell below.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Soaxbq_rzeew" }, "outputs": [], "source": [ "# WRITE THE SELECTED SEGMENTS FROM GEMINI IN THIS CELL\n", + "selected_segments = sorted([0,1,2,3])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4D2o3U6dW2bS" }, "outputs": [], "source": [ "# @title Option 2: Select Shots based on Transcript (no Visual Description) -- use for transcript-heavy videos.\n", + "\n", + "input_transcript = segments\n", + "full_text = \"\"\n", + "for counter, i in enumerate(input_transcript):\n", + " line = f\"{counter}: {i['text']}\"\n", + " full_text += line + '\\n'\n", + "\n", + "shortened_text = send_transcript_to_llm(full_text).strip(\" \").strip(\"'\")\n", + "print('----shortened_text-----')\n", + "print(shortened_text)\n", + "\n", + "selected_segments = []\n", + "\n", + "for line in shortened_text.split(\"\\n\"):\n", + " try:\n", + " counter = int(line.split(\": \")[0])\n", + " selected_segments.append(counter)\n", + " except:\n", + " continue\n", + "\n", + "print(selected_segments)" ] }, { "cell_type": "markdown", "metadata": { "id": "u7kI4uaSXjjx" }, "source": [ "## Video Processing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8iCzOqZF97uN" }, "outputs": [], "source": [ "# @title Generate Short-Form Videos\n", + "\n", + "processed_segments = []\n", + "for segment in selected_segments:\n", + " processed_segments.append(segments[segment])\n", + "print(f\"Number of segments from the original video: {len(segments)} | Number of segments selected by Gemini: {len(processed_segments)}\")\n", + "\n", + "import os\n", + "\n", + "local_video_path = f\"/content/{VIDEO_FULL_PATH.replace('videos/', '')}\"\n", + "if not os.path.exists(local_video_path):\n", + " download_blob(source_file_name=VIDEO_FULL_PATH,\n", + " destination_blob_name=local_video_path)\n", + "\n", + "print(f\"Start shortening videos, this process will take a while ...\")\n", + "clip_video(video_path=local_video_path,\n", + " file_name=FILE_NAME,\n", + " segments=processed_segments)\n", + "\n", + "try:\n", + " ################ Second sync with shots###############\n", + " print(\"Try processing with shots sync\")\n", + " input_transcript = processed_segments\n", + " list_of_words = list(map(lambda line: line['words'], input_transcript))\n", + " transcript_words = list(itertools.chain.from_iterable(list_of_words))\n", + " video_shots = get_video_shots(FILE_NAME)\n", + " ## process video shots to align key names ###\n", + " if video_shots is None:\n", + " # shots = process_video(\n", + " # video_gcs_uri=f\"gs://{GCLOUD_BUCKET_NAME}/{VIDEO_FULL_PATH}\") # returns videoshot only\n", + " video_shots = []\n", + " for shot in shots: # from step 1\n", + " reformed_keys = {'start_time':'startTime', 'end_time':'endTime'} # STT and shot detection use different key naming conventions\n", + " reformed_dict = dict((reformed_keys[key], 
value) for (key, value) in shot.items())\n", + " video_shots.append(reformed_dict)\n", + " shortened_text = ''\n", + " for i in segments:\n", + " shortened_text += i['text'] + '\\n'\n", + " segments_transformed = get_clips_from_transcript(transcript_words=transcript_words,\n", + " shortened_text=shortened_text,\n", + " input_transcript=input_transcript)\n", + " try:\n", + " segments_transformed = match_with_video_shots(video_shots, segments, transcript_words)\n", + " print(\"Matched with Shots successfully\")\n", + " except:\n", + " print(\"Retry matching with shots\")\n", + " shots = get_video_shots(FILE_NAME)\n", + " video_shots = []\n", + " for shot in shots:\n", + " reformed_keys = {'start_time':'startTime', 'end_time':'endTime'} # STT and shot detection use different key naming conventions\n", + " reformed_dict = dict((reformed_keys[key], value) for (key, value) in shot.items())\n", + " video_shots.append(reformed_dict)\n", + " segments_transformed = match_with_video_shots(video_shots, segments, transcript_words)\n", + " print(f\"Segments selected by Gemini and processed with video shots: {len(processed_segments)}\")\n", + " FILE_NAME = f\"{FILE_NAME.replace('.mp4','')}_sync.mp4\"\n", + " clip_video(video_path=local_video_path,\n", + " file_name=FILE_NAME,\n", + " segments=segments_transformed)\n", + "\n", + "except:\n", + " print(\"Cannot process with shot matching. This happens when the video has no voiceover or the transcript is too long to be matched with video shots.\")" ] }, { "cell_type": "markdown", "metadata": { "id": "fBaGK7NO0opG" }, "source": [ "## Copy files to Google Drive (if needed)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "0rqEshZQNgE4" }, "outputs": [], "source": [ "# from google.colab import drive\n", + "# drive.mount('/content/drive')\n", + "# !cp \"/content/output.mp4\" \"/content/drive/MyDrive/\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lPnlJm9_7LlN" }, "outputs": [], "source": [ "# Write data to JSON if needed\n", + "\n", + "import json\n", + "with open(\"segments.json\", 'w') as f:\n", + " json.dump(segments, f, indent=2)\n", + "\n", + "with open(\"segments_transformed.json\", 'w') as f:\n", + " json.dump(segments_transformed, f, indent=2)" ] } ], "metadata": { "colab": { "collapsed_sections": [ "u1pD7yVfgJsp", "GnUwrHmHg8GR" ], "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }