diff --git a/colab/AdClip Gemini Prototype.ipynb b/colab/AdClip Gemini Prototype.ipynb new file mode 100644 index 0000000..d0b0a8a --- /dev/null +++ b/colab/AdClip Gemini Prototype.ipynb @@ -0,0 +1,1463 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "gZQOxcdIv3w9" + }, + "source": [ + "## AdClip Gemini Prototype\n", + "\n", + "#### AdClip Gemini leverages Gemini to understand long-context videos or video ads, and trim them based on the most important segments. There are two options: automatic trimming for long-context videos (transcript only) or [YouTube ABCDs](https://www.thinkwithgoogle.com/intl/en-apac/future-of-marketing/creativity/youtube-video-ad-creative/) (Attention, Branding, Connection, Direct) for video ads (transcript and visual description).\n", + "\n", + "Contact: adclip-team@google.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u1pD7yVfgJsp" + }, + "source": [ + "## Install" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "collapsed": true, + "id": "BtlqUCvMlLDT" + }, + "outputs": [], + "source": [ + "#@title Install Modules\n", + "!pip install google-cloud-aiplatform --quiet\n", + "!pip install google-cloud-speech --quiet\n", + "!pip install firebase_functions~=0.1.0 --quiet\n", + "!pip install google-cloud-videointelligence --quiet\n", + "!pip install moviepy --quiet\n", + "!pip install pytube --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wB-cy_XYfS2O" + }, + "outputs": [], + "source": [ + "#@title Initialize the imports\n", + "from firebase_functions import https_fn\n", + "from firebase_admin import initialize_app, firestore\n", + "from google.cloud import speech, storage\n", + "from vertexai.preview.language_models import TextGenerationModel\n", + "from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Image, Content, Part #gemini\n", + "from google.cloud import videointelligence\n", + "\n", + "import moviepy.editor as moviepy\n", + "import re\n", + "import itertools\n", + "import functools\n", + "import copy\n", + "import math\n", + "import requests\n", + "from pytube import YouTube\n", + "from urllib.parse import urlparse, parse_qs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "srJ8VH0rghYH" + }, + "outputs": [], + "source": [ + "#@title GCS & FireStore\n", + "\n", + "# Video files are stored in GCS, json files (transcripts, video shots) are stored in FireStore\n", + "\n", + "def upload_blob(source_file_name: str,\n", + " destination_blob_name: str) -> None:\n", + " \"\"\"Upload file to bucket.\"\"\"\n", + " blob = bucket.blob(destination_blob_name)\n", + " blob.upload_from_filename(source_file_name)\n", + "\n", + " print(\n", + " 'File {} uploaded to {}.'.format(source_file_name, destination_blob_name)\n", + " )\n", + "\n", + "def download_blob(source_file_name: str,\n", + " destination_blob_name: str) -> None:\n", + " \"\"\"Download file from bucket.\"\"\"\n", + " blob = bucket.blob(source_file_name)\n", + " # Download the file to a destination\n", + " blob.download_to_filename(destination_blob_name)\n", + "\n", + " print(\n", + " 'File {} downloaded to {}.'.format(source_file_name, destination_blob_name)\n", + " )\n", + "\n", + "def does_file_exist(file_path: str) -> bool:\n", + " \"\"\"Validate if file already existing in the bucket.\n", + "\n", + " Args:\n", + " file_path: A file location.\n", + "\n", + " Returns:\n", + " True if file existed, 
otherwise, False\n", + " \"\"\"\n", + " blob = bucket.get_blob(file_path)\n", + " return blob is not None\n", + "\n", + "def upload_video_shots(file_name: str, video_shots: list) -> None:\n", + " \"\"\"Uploads video shots to firestore.\"\"\"\n", + " db = firestore.client()\n", + " doc_ref = db.collection('video_shots').document(file_name)\n", + " doc_ref.set({'data': video_shots})\n", + "\n", + "\n", + "def get_video_shots(file_name: str) -> list:\n", + " \"\"\"Gets video shots from firestore by file name.\"\"\"\n", + " db = firestore.client()\n", + " doc = db.collection('video_shots').document(file_name).get()\n", + " if not doc.exists:\n", + " return None\n", + " return doc.to_dict().get('data')\n", + "\n", + "def get_transcript(file_name: str) -> list:\n", + " \"\"\"Gets transcript from firestore by file name.\"\"\"\n", + " db = firestore.client()\n", + " doc = db.collection('transcripts').document(file_name).get()\n", + " if not doc.exists:\n", + " return None\n", + "\n", + " return doc.to_dict().get('original')\n", + "\n", + "def upload_transcript(file_name: str, transcript: list) -> None:\n", + " \"\"\"Uploads transcript to firestore.\"\"\"\n", + " db = firestore.client()\n", + " doc_ref = db.collection('transcripts').document(file_name)\n", + " doc_ref.set({'original': transcript})" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "id": "c56iad0LCoEz" }, "outputs": [], "source": [ "#@title Downloads Videos From YouTube\n", + "\n", + "from pytube.exceptions import VideoUnavailable\n", + "\n", + "def get_id(youtube_url):\n", + " \"\"\"Extract YouTube id from YouTube url\n", + " https://www.youtube.com/watch?v=EUYpKwgqi1M -> \"EUYpKwgqi1M\"\n", + " \"\"\"\n", + " u_pars = urlparse(youtube_url)\n", + " quer_v = parse_qs(u_pars.query).get('v')\n", + " if quer_v:\n", + " return quer_v[0]\n", + " pth = u_pars.path.split('/')\n", + " if pth:\n", + " return pth[-1]\n", + "\n", + "def video_accessible(youtube_url):\n", + " try:\n", + " yt = YouTube(youtube_url)\n", + " except VideoUnavailable:\n", + " #print(f'Video {youtube_url} is not accessible')\n", + " return False\n", + " else:\n", + " video_id = get_id(youtube_url)\n", + " video_title = yt.title\n", + " return {\"video_id\": video_id, \"video_title\": video_title}\n", + "\n", + "# Download YouTube video from url\n", + "def download_video_from_url(youtube_url, video_file):\n", + " \"\"\"Download YouTube video from url\n", + " Pytube: https://pytube.io/en/latest/api.html#pytube.Stream.download\n", + " \"\"\"\n", + " path = '/tmp/'\n", + " yt = YouTube(youtube_url)\n", + " video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(output_path=path, filename=video_file)\n", + " return video_path # return video_path string for other functions\n", + "\n", + "def download_video(youtube_url): # Main function for CF\n", + " if youtube_url is None:\n", + " return {\n", + " \"error\": \"Missing url, sample format: https://youtu.be/9wobcM-WPQk\"\n", + " }\n", + " if video_accessible(youtube_url) is False:\n", + " return {\n", + " \"error\": \"Video is not accessible\"\n", + " }\n", + " video_id = video_accessible(youtube_url)[\"video_id\"]\n", + " video_title = video_accessible(youtube_url)[\"video_title\"]\n", + " video_title = re.sub('\W+',' ', video_title) #remove all special characters\n", + " video_file = video_title + \"_\" + video_id + 
'.mp4'\n", + " video_path_gcs = 'videos/' + video_file #adclip.appspot.com/videos/mytitle_ZDDH2.mp4\n", + " if does_file_exist(video_path_gcs):\n", + " print(f\"Video files already exist in GCS: {video_path_gcs}\")\n", + " # return {\n", + " # \"message\": \"Video files already exist in GCS\"\n", + " # }\n", + " else:\n", + " video_path_tmp = download_video_from_url(youtube_url=youtube_url,video_file=video_file)\n", + " upload_blob(video_path_tmp, video_path_gcs) #tmp/video.mp4 --> adclip.appspot.com/videos/video.mp4\n", + " #print(\"Video files uploaded to GCS\")\n", + " return {\n", + " \"video_uri\": \"https://storage.mtls.cloud.google.com/adclip.appspot.com/\" + video_path_gcs,\n", + " \"full_path\": video_path_gcs\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jg3WZVVsfpCR" + }, + "outputs": [], + "source": [ + "#@title Detects Shots (Video Intelligence)\n", + "\n", + "def process_video(video_gcs_uri: str) -> list:\n", + " \"\"\"Processing the video to create video shots with timestamps and store the video shots in GCS.\n", + "\n", + " Args:\n", + " video_gcs_uri: A video gcs uri for processing.\n", + "\n", + " Returns:\n", + " A list of video shots metadata. For example:\n", + " [\n", + " {\n", + " 'start_time': 0.0,\n", + " 'end_time': 4.8\n", + " },\n", + " {\n", + " 'start_time': 5.2,\n", + " 'end_time': 5.6\n", + " }\n", + " ]\n", + " \"\"\"\n", + " video_client = videointelligence.VideoIntelligenceServiceClient()\n", + "\n", + " #TODO: b/306068003 - Add speech-to-text feature here.\n", + " features = [\n", + " videointelligence.Feature.SHOT_CHANGE_DETECTION,\n", + " # videointelligence.Feature.SPEECH_TRANSCRIPTION,\n", + " ]\n", + "\n", + " transcript_config = videointelligence.SpeechTranscriptionConfig(\n", + " language_code=\"en-US\"\n", + " )\n", + " video_context = videointelligence.VideoContext(\n", + " speech_transcription_config=transcript_config\n", + " )\n", + "\n", + " operation = video_client.annotate_video(\n", + " request={\n", + " \"features\": features,\n", + " \"input_uri\": video_gcs_uri,\n", + " }\n", + " )\n", + "\n", + " print(\"\\nProcessing video.\", operation)\n", + "\n", + " result = operation.result(timeout=300)\n", + "\n", + " print(\"\\n finished processing.\")\n", + "\n", + " video_shots = []\n", + " # first result is retrieved because a single video was processed\n", + " for i, shot in enumerate(result.annotation_results[0].shot_annotations):\n", + " start_time = (\n", + " shot.start_time_offset.seconds + shot.start_time_offset.microseconds / 1e6\n", + " )\n", + " end_time = (\n", + " shot.end_time_offset.seconds + shot.end_time_offset.microseconds / 1e6\n", + " )\n", + " video_shots.append(\n", + " {\n", + " \"start_time\": math.floor(start_time * 10) / 10.0,\n", + " \"end_time\": round(end_time, 1),\n", + " }\n", + " )\n", + " print(\"\\tShot {}: {} to {}\".format(i, start_time, end_time))\n", + "\n", + " return video_shots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uFPTPJ67gIxr" + }, + "outputs": [], + "source": [ + "#@title Transcribes Audio & Syncs with Shot\n", + "\n", + "def get_speech_recognition_config(language_code: str):\n", + " \"\"\"Get speech recognition config from the given language code and model.\n", + "\n", + " Check all supported language code and model here:\n", + " https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages\n", + "\n", + " Args:\n", + " language_code: A language code for transribing.\n", + " 
model: A video transcribe model.\n", + "\n", + " Return:\n", + " A speech recognition config.\n", + " \"\"\"\n", + " if language_code == 'en-US':\n", + " model = 'video'\n", + " elif language_code == 'zh-TW':\n", + " model = 'command_and_search'\n", + " else:\n", + " model = 'default'\n", + " return speech.RecognitionConfig(\n", + " enable_word_time_offsets=True,\n", + " audio_channel_count=2, # 2 is default for wav files\n", + " # Enable automatic punctuation\n", + " # enable_automatic_punctuation=True,\n", + " language_code=language_code,\n", + " model=model,\n", + " # Works for model=\"video\" or \"phone call\" (en-US only)\n", + " use_enhanced=True,\n", + " )\n", + "\n", + "def extract_audio(video_full_path, file_name, output_name=None) -> str:\n", + " \"\"\"Extract audio from the video by the given video path.\n", + "\n", + " Args:\n", + " video_full_path: A full video path that store in GCS.\n", + " file_name: A file name for temp use.\n", + " output_name: A custom output name.\n", + "\n", + " Returns:\n", + " A path to video audio file.\n", + " \"\"\"\n", + " file_name_without_extension = file_name.rsplit('.', 1)[0]\n", + " if output_name is None:\n", + " audio_output_file = file_name_without_extension + '.wav'\n", + " else:\n", + " audio_output_file = output_name + '.wav'\n", + " gcs_file_path = AUDIO_FOLDER + audio_output_file\n", + "\n", + " if does_file_exist(gcs_file_path):\n", + " print('File {} exists'.format(gcs_file_path))\n", + " return GS_PATH + gcs_file_path\n", + " tmp_file_path = TEMP_FOLDER + file_name\n", + "\n", + " # use video file_path\n", + " blob = bucket.blob(video_full_path)\n", + " blob.download_to_filename(tmp_file_path)\n", + " clip = moviepy.VideoFileClip(tmp_file_path)\n", + " audio_output_path = TEMP_FOLDER + audio_output_file\n", + " clip.audio.write_audiofile(audio_output_path)\n", + "\n", + " upload_blob(audio_output_path, gcs_file_path)\n", + "\n", + " return GS_PATH + gcs_file_path\n", + "\n", + "\n", + "def build_transcript(response) -> list:\n", + " \"\"\"Build video transcript response with transcript metadata.\n", + "\n", + " Args:\n", + " response: A transcript response from speech API.\n", + "\n", + " Returns:\n", + " A list of new video transcript strucutre and metadata.\n", + " For example,\n", + " [\n", + " {\n", + " \"text\": \"some sentence\"\n", + " \"startTime\": 0,\n", + " \"endTime\": 2.8,\n", + " \"duration\": 2.8\n", + " \"words\": [\n", + " {\n", + " \"text\": \"some\"\n", + " \"startTime\": 0,\n", + " \"endTime\": 1.2,\n", + " \"duration\": 1.2\n", + " },\n", + " {\n", + " \"text\": \"sentence\"\n", + " \"startTime\": 1.2,\n", + " \"endTime\": 2.8,\n", + " \"duration\": 1.6\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + " \"\"\"\n", + " transcript_builder = []\n", + " last_end_time = 0\n", + " # Each result is for a consecutive portion of the audio. 
Iterate through\n", + " # them to get the transcripts for the entire audio file.\n", + " for result in response.results:\n", + " # The first alternative is the most likely one for this portion.\n", + " for alternative in result.alternatives:\n", + "\n", + " if len(alternative.words) > 0:\n", + " transcript_item = {\n", + " 'text': alternative.transcript,\n", + " 'startTime': alternative.words[0].start_time.total_seconds(),\n", + " 'endTime': alternative.words[-1].end_time.total_seconds(),\n", + " 'duration': (alternative.words[-1].end_time.total_seconds()\n", + " - alternative.words[0].start_time.total_seconds())\n", + " }\n", + "\n", + " transcript_item['words'] = []\n", + " for word in alternative.words:\n", + " transcript_item['words'].append({\n", + " 'text': word.word,\n", + " 'startTime': word.start_time.total_seconds(),\n", + " 'endTime': word.end_time.total_seconds(),\n", + " 'duration': (word.end_time.total_seconds()\n", + " - word.start_time.total_seconds()),\n", + " 'gap': word.end_time.total_seconds() - last_end_time})\n", + " last_end_time = word.end_time.total_seconds()\n", + " transcript_builder.append(transcript_item)\n", + " return transcript_builder\n", + "\n", + "\n", + "def generate_transcript_item(\n", + " words: list, start_time: float = None, end_time: float = None) -> dict:\n", + " \"\"\"Generates transcript item.\"\"\"\n", + " start_time = words[0]['startTime'] if start_time is None else start_time\n", + " end_time = words[-1]['endTime'] if end_time is None else end_time\n", + " return {\n", + " 'text': ' '.join(list(map(lambda word: word['text'], words))),\n", + " 'startTime': start_time,\n", + " 'endTime': end_time,\n", + " 'duration': end_time - start_time,\n", + " 'words': words\n", + " }\n", + "\n", + "\n", + "def refine_by_gaps(transcript: list) -> list:\n", + " \"\"\"Refines the transcript by the gap time.\"\"\"\n", + " new_transcript = []\n", + "\n", + " for line in transcript:\n", + " gaps = list(map(lambda clip: clip['gap'], line['words']))\n", + " gaps.pop(0) #remove first gap\n", + "\n", + " if len(gaps) == 0:\n", + " continue\n", + " average = sum(gaps) / len(gaps)\n", + " words = []\n", + " for index, word in enumerate(line['words']):\n", + " if index > 1 and word['gap'] > average * GAP_MULTIPLIER:\n", + " new_transcript.append(generate_transcript_item(words))\n", + " words = []\n", + " words.append(word)\n", + " if len(words) > 0:\n", + " new_transcript.append(generate_transcript_item(words))\n", + " return new_transcript\n", + "\n", + "\n", + "\n", + "def merge_clips(transcript: list) -> list:\n", + " \"\"\"Merges clips under 5seconds.\"\"\"\n", + " if len(transcript) == 0:\n", + " return []\n", + "\n", + " def merge(transcript1, transcript2):\n", + " \"\"\"Merges transcript1 and transcript2.\"\"\"\n", + " start_time = transcript1['startTime']\n", + " end_time = max(transcript1['endTime'], transcript2['endTime'])\n", + " return {\n", + " 'text': f\"{transcript1['text']} {transcript2['text']}\",\n", + " 'startTime': start_time,\n", + " 'endTime': end_time,\n", + " 'duration': end_time - start_time,\n", + " 'words': transcript1['words'] + transcript2['words'],\n", + " }\n", + "\n", + " def is_overlapping(transcript1, transcript2):\n", + " \"\"\"Validate overlapping transcript time.\"\"\"\n", + " t2_start_time = transcript2['words'][0]['startTime']\n", + " t2_prev_start_time = transcript2['words'][-1]['startTime']\n", + " t1_start_time = transcript1['startTime']\n", + " t1_end_time = transcript1['endTime']\n", + " return t2_start_time >= t1_start_time 
and t2_prev_start_time <= t1_end_time\n", + "\n", + " output = []\n", + " index = 0\n", + " clip = transcript[index]\n", + "\n", + " for index in range(len(transcript)):\n", + " if index < len(transcript) - 1:\n", + " next = transcript[index + 1]\n", + " if (next['endTime'] - clip['startTime'] <= MIN_CLIP_DURATION or\n", + " is_overlapping(clip, next)):\n", + " clip = merge(clip, next)\n", + " else:\n", + " output.append(clip)\n", + " clip = transcript[index + 1]\n", + " else:\n", + " output.append(clip)\n", + " return output\n", + "\n", + "\n", + "def refine_by_video_shots(\n", + " file_name: str, video_gcs_uri: str, transcript: list) -> list:\n", + " \"\"\"Refines transcript with video shots data.\"\"\"\n", + "\n", + " new_transcript = []\n", + " video_shots = get_video_shots(file_name)\n", + "\n", + " if video_shots is None:\n", + " video_shots = process_video(video_gcs_uri)\n", + " upload_video_shots(file_name, video_shots)\n", + "\n", + " video_shots_index = 0\n", + " list_of_words = list(map(lambda line: line['words'], transcript))\n", + " transcript_words = list(itertools.chain.from_iterable(list_of_words))\n", + " print('\\\\\\\\\\ Transcript_words ////')\n", + " print(transcript_words)\n", + " words = []\n", + "\n", + " for index, word in enumerate(transcript_words):\n", + " words.append(word)\n", + " while video_shots[video_shots_index]['end_time'] <= words[0]['startTime']:\n", + " video_shots_index = video_shots_index + 1\n", + " video_shot = video_shots[video_shots_index]\n", + " if word['endTime'] > video_shot['end_time']:\n", + " start_time = min(words[0]['startTime'], video_shot['start_time'])\n", + " if index < len(transcript_words) - 1:\n", + " end_time = max(\n", + " word['endTime'],\n", + " min(\n", + " video_shot['end_time'], transcript_words[index + 1]['startTime']\n", + " ),\n", + " )\n", + " else:\n", + " end_time = max(word['endTime'], video_shot['end_time'])\n", + " video_shots_index = video_shots_index + 1\n", + " new_transcript.append(\n", + " generate_transcript_item(words, start_time, end_time)\n", + " )\n", + " words = []\n", + " if len(words) > 0:\n", + " start_time = min(\n", + " words[0]['startTime'], video_shots[video_shots_index]['start_time']\n", + " )\n", + " if len(new_transcript) > 0:\n", + " previous_last_word = new_transcript[-1]['words'][-1]\n", + " start_time = max(start_time, previous_last_word['endTime'])\n", + "\n", + " end_time = max(word['endTime'], video_shots[video_shots_index]['end_time'])\n", + " video_shots_index = video_shots_index + 1\n", + " new_transcript.append(generate_transcript_item(words, start_time, end_time))\n", + "\n", + " return new_transcript\n", + "\n", + "# @https_fn.on_call(\n", + "# timeout_sec=600,\n", + "# memory=options.MemoryOption.GB_4,\n", + "# cpu=2,\n", + "# region='asia-southeast1',\n", + "# )\n", + "\n", + "# video_full_path = request.data['full_path'] or VIDEO_FULL_PATH\n", + "# file_name = request.data['file_name'] or FILE_NAME\n", + "# language_code = request.data['language_code'] or LANGUAGE_CODE\n", + "\n", + "\n", + "def transcribe_video() -> any:\n", + " \"\"\"Transcribe video audio and store the transcript in GCS.\n", + "\n", + " Args:\n", + " request: A request payload from API call.\n", + "\n", + " Returns:\n", + " An object that contain video transcript with the timestamp data.\n", + " \"\"\"\n", + "\n", + " video_full_path = VIDEO_FULL_PATH\n", + " file_name = FILE_NAME\n", + " language_code = LANGUAGE_CODE\n", + "\n", + " if video_full_path is None:\n", + " return {\n", + " 'error': (\n", + " 
'Missing video uri, sample format:'\n", + " ' https://googleapis.com/example.wav'\n", + " )\n", + " }\n", + "\n", + " transcript_in_firestore = get_transcript(file_name)\n", + " if transcript_in_firestore is not None:\n", + " return {\n", + " 'transcript': merge_clips(\n", + " refine_by_video_shots(\n", + " file_name,\n", + " GS_PATH + video_full_path,\n", + " transcript_in_firestore)),\n", + " 'original': transcript_in_firestore,\n", + " 'v1': refine_by_gaps(transcript_in_firestore),\n", + " }\n", + "\n", + " audio_gcs_uri = extract_audio(video_full_path, file_name)\n", + " print(f'Extracted audio is stored at {audio_gcs_uri}')\n", + "\n", + " audio = speech.RecognitionAudio(uri=audio_gcs_uri)\n", + " client = speech.SpeechClient()\n", + "\n", + " config = get_speech_recognition_config(language_code)\n", + "\n", + " operation = client.long_running_recognize(config=config, audio=audio)\n", + "\n", + " print(\"Waiting for operation to complete...\")\n", + " response = operation.result(timeout=900)\n", + "\n", + " transcript = build_transcript(response)\n", + " upload_transcript(file_name, transcript)\n", + "\n", + " return {\n", + " 'transcript': merge_clips(\n", + " refine_by_video_shots(\n", + " file_name,\n", + " GS_PATH + video_full_path,\n", + " transcript)),\n", + " 'original': transcript,\n", + " 'v1': refine_by_gaps(transcript)\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xjPmIUMkz_fQ" + }, + "outputs": [], + "source": [ + "#@title Syncs with Shot (After Summarization)\n", + "\n", + "def match_with_video_shots(video_shots: list,\n", + " transcript: list,\n", + " words: list) -> list:\n", + " \"\"\"Adjusts the startTime and endTime of each line in the transcript.\n", + "\n", + " This implementation helps with \"jumpy\" transition in the final output video.\n", + "\n", + " Args:\n", + " video_shots: The list containing video shots in format of\n", + " [{end_time, start_time}, {end_time, start_time},]\n", + " transcript: The full transcript transcribed by Speech to Text AI.\n", + " words: A list containing the startTime and eachTime of each word in the full\n", + " transcript.\n", + "\n", + " Returns:\n", + " The transcript with the adjusted startTime and endTime.\n", + " \"\"\"\n", + " shot_index = 0\n", + " word_index = 0\n", + " for index, line in enumerate(transcript):\n", + " while video_shots[shot_index]['endTime'] <= line['startTime']:\n", + " shot_index += 1\n", + " video_shot = video_shots[shot_index]\n", + "\n", + " start_time = min(line['startTime'], video_shot['startTime'])\n", + " while (\n", + " word_index + 1 < len(words) - 1\n", + " and words[word_index + 1]['endTime'] < line['startTime']\n", + " ):\n", + " word_index += 1\n", + " previous_word = words[word_index]\n", + " if previous_word['startTime'] != line['startTime']:\n", + " start_time = max(previous_word['endTime'], start_time)\n", + "\n", + " transcript[index]['startTime'] = start_time\n", + "\n", + " while video_shots[shot_index]['endTime'] < line['endTime']:\n", + " shot_index += 1\n", + " video_shot = video_shots[shot_index]\n", + "\n", + " end_time = max(line['endTime'], video_shot['endTime'])\n", + "\n", + " while (\n", + " word_index < len(words) - 1\n", + " and words[word_index]['startTime'] < line['endTime']\n", + " ):\n", + " word_index += 1\n", + " next_word = words[word_index]\n", + " if next_word['endTime'] != line['endTime']:\n", + " end_time = min(end_time, next_word['startTime'])\n", + "\n", + " if index == len(transcript) - 1:\n", + " end_time 
= video_shots[-1]['endTime']\n", + " else:\n", + " #manually add 0.3s to end_time for better transitions\n", + " end_time = round(end_time + 0.3, 2)\n", + "\n", + " transcript[index]['endTime'] = end_time\n", + " transcript[index]['duration'] = end_time - start_time\n", + " return transcript\n", + "\n", + "\n", + "def extract_words_from_str(summary: str) -> list:\n", + " \"\"\"Extracts the words from the given summary splitting by space.\n", + "\n", + " Args:\n", + " summary: A summary of the transcript.\n", + "\n", + " Return:\n", + " A list of words from the given summary.\n", + " \"\"\"\n", + " # Remove the trailing \"transcript:\" from the summarized transcript from LLM\n", + " if summary.lstrip().lower().startswith('transcript:'):\n", + " summary = summary.lower().replace('transcript:', '', 1)\n", + "\n", + " summary = re.sub('[,.?!]', '', summary).lower()\n", + " summary = summary.replace('\\n', ' ')\n", + "\n", + " words = summary.split(' ')\n", + " words = list(filter(lambda word: len(word) > 0, words))\n", + " print(f'words: {words}')\n", + " return words\n", + "\n", + "\n", + "\n", + "def get_clips_from_transcript(\n", + " #self,\n", + " transcript_words: list,\n", + " shortened_text: str,\n", + " input_transcript: list) -> list:\n", + " \"\"\"Identifies the clip from the summarized transcript. This function minimizes the hallucination when LLM\n", + " doesn't respect the original sentences from the full transcripts by adding new words or only returning parts\n", + " of the original sentences in its response.\n", + "\n", + " Example:\n", + " - Original sentence: \"MacBook Air for the first time ever in 15 inches we've been dreaming about making this for years we\"\n", + " - Response from LLM: \"MacBook Air for the first time ever in 15 inches...\"\n", + "\n", + " Args:\n", + " transcript: The original full transcripts\n", + " summary: The \"summarized\" transcript from LLM\n", + "\n", + " Returns:\n", + " A list containing the adjusted text, start_time, end_time, duration.\n", + " \"\"\"\n", + " print(\"----get_clips_from_transcript-----'\")\n", + " print(transcript_words)\n", + " transcript_ptr = 0\n", + " output = []\n", + "\n", + " summary_words = extract_words_from_str(shortened_text)\n", + "\n", + " word_ptr = 0\n", + "\n", + " def does_word_match_transcript(transcript_idx: int, word_idx: int):\n", + " if (transcript_idx >= len(transcript_words) or\n", + " word_idx >= len(summary_words)):\n", + " return False\n", + "\n", + " transcript_word_text = transcript_words[transcript_idx].get('text')\n", + " transcript_word_text = re.sub('[,.?!]', '', transcript_word_text)\n", + " return (transcript_word_text.lower() ==\n", + " summary_words[word_idx].lower())\n", + "\n", + " while transcript_ptr < len(transcript_words):\n", + " transcript_builder = []\n", + "\n", + " # loop until the summary word match with transcript\n", + " # or until the transcript has True shouldKeep flag\n", + " while (transcript_ptr < len(transcript_words)\n", + " and not does_word_match_transcript(transcript_ptr, word_ptr)\n", + " and transcript_words[transcript_ptr].get('shouldKeep') != True):\n", + " transcript_ptr = transcript_ptr + 1\n", + "\n", + " # append all matched transcript summary\n", + " # or transcript that has True shouldKeep flag\n", + " while (transcript_ptr < len(transcript_words) and\n", + " (does_word_match_transcript(transcript_ptr, word_ptr)\n", + " or does_word_match_transcript(transcript_ptr + 1, word_ptr + 1)\n", + " or does_word_match_transcript(transcript_ptr + 2, word_ptr + 1)\n", 
+ " or transcript_words[transcript_ptr].get('shouldKeep') == True)):\n", + " transcript_builder.append(transcript_words[transcript_ptr])\n", + "\n", + " if does_word_match_transcript(transcript_ptr, word_ptr):\n", + " word_ptr += 1\n", + "\n", + " elif transcript_words[transcript_ptr].get('shouldKeep') != True:\n", + " transcript_builder.append(transcript_words[transcript_ptr+1])\n", + "\n", + " if not does_word_match_transcript(transcript_ptr + 1, word_ptr + 1):\n", + " transcript_builder.append(transcript_words[transcript_ptr+2])\n", + " transcript_ptr += 1\n", + "\n", + " transcript_ptr += 1\n", + " word_ptr += 2\n", + "\n", + " transcript_ptr += 1\n", + "\n", + " if len(transcript_builder) == 0:\n", + " continue\n", + " if len(transcript_builder) == 1:\n", + " word_ptr -= 1\n", + " continue\n", + "\n", + " new_text = list(map(lambda item: item.get('text'), transcript_builder))\n", + " output.append({\n", + " 'text': ' '.join(new_text),\n", + " 'startTime': transcript_builder[0].get('startTime'),\n", + " 'endTime': transcript_builder[-1].get('endTime'),\n", + " 'duration': (transcript_builder[-1].get('endTime') -\n", + " transcript_builder[0].get('startTime')),\n", + " 'words': transcript_builder\n", + " })\n", + " return output" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Q88CnLNeU1m-" }, "outputs": [], "source": [ "#@title Gemini Functions\n", + "\n", + "from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, SafetySetting, Image, Content, Part, FinishReason, HarmCategory, HarmBlockThreshold\n", + "\n", + "def upload_segments_to_gemini_bucket(segments: list,\n", + " source_file_name: str,\n", + " destination_blob_name: str) -> list:\n", + " \"\"\"Splits the video into segments based on the output of transcribe_video (for videos with\n", + " voiceovers) or of the shot detection API, then uploads those segments to GCS.\n", + " Returns: A list of URIs for the segmented videos.\"\"\"\n", + " local_video_path = f\"/content/{VIDEO_FULL_PATH.replace('videos', '')}\" # Process video files in Colab's content folder for quicker processing\n", + " download_blob(source_file_name=f\"{VIDEO_FULL_PATH}\", #/videos/video.mp4\n", + " destination_blob_name=local_video_path) #download to /content/ in Colab --- change later when deploying to CF, use /tmp/\n", + " original_clip = moviepy.VideoFileClip(local_video_path)\n", + " segments_uri = []\n", + " for counter, segment in enumerate(segments):\n", + " new_clip = original_clip.subclip(segment['startTime'], segment['endTime'])\n", + " file_name_without_extension = FILE_NAME.replace('.mp4', '')\n", + " segment_file = f\"{file_name_without_extension}_{counter}.mp4\"\n", + " new_clip.write_videofile(f\"/content/{segment_file}\",audio_codec=\"aac\", logger=None)\n", + " upload_blob(source_file_name=segment_file,\n", + " destination_blob_name=f'gemini/{segment_file}') # Upload to adclip.appspot.com/gemini/ bucket (separate from video bucket)\n", + " segments_uri.append(f\"{GS_PATH}gemini/{segment_file}\")\n", + " return segments_uri\n", + "\n", + "def send_video_to_gemini(video_full_path, root_prompt=\"Provide a description of the visual elements of the video\", context=\"\"):\n", + " model = GenerativeModel(\"gemini-pro-vision\")\n", + " video = Part.from_uri(video_full_path, mime_type=\"video/mp4\")\n", + " prompt = f\"{root_prompt} '\\n' {context}\"\n", + " contents = [video, prompt]\n", + " generation_config = 
GenerationConfig(max_output_tokens=2048,\n", + " temperature=0.4,\n", + " top_p=1,\n", + " top_k=32)\n", + " responses = model.generate_content(contents,\n", + " stream = True,\n", + " generation_config=generation_config,)\n", + " response = \"\"\n", + " for chunk in responses:\n", + " response += chunk.text\n", + " return response\n", + "\n", + "def send_transcript_to_llm(text: str,\n", + " model: str = \"gemini-1.5-pro-001\",\n", + " temperature: float = 0.3,\n", + " max_output_tokens: int = 8192,\n", + " top_k: int = 32,\n", + " top_p: int = 1) -> str:\n", + " \"\"\"Sends a transcript to Vertex LLM.\n", + "\n", + " Args:\n", + " text: A prompt to generate the response from the model.\n", + " model: The Language Model to use.\n", + " temperature: A temperature indicates the degree of randomness in token selection.\n", + " max_output_tokens: The maximum number of tokens that can be generated in the response.\n", + " top_k: A value indicates how the model selects tokens for output.\n", + " top_p: A value indicates how the model selects tokens for output.\n", + "\n", + " Returns:\n", + " A string of the summarized transcript.\n", + " \"\"\"\n", + " model = GenerativeModel(model)\n", + " prompt = f\"{root_prompt} '\\n' {text} \"\n", + " contents = [prompt]\n", + " generation_config = GenerationConfig(\n", + " max_output_tokens=max_output_tokens,\n", + " temperature=temperature,\n", + " top_k=top_k,\n", + " top_p=top_p,)\n", + "\n", + " safety_config = [\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_HARASSMENT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_HATE_SPEECH,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " ]\n", + "\n", + " responses = model.generate_content(contents,\n", + " generation_config=generation_config,\n", + " safety_settings=safety_config,\n", + " stream=False,\n", + " )\n", + " print(responses.text)\n", + " return responses.text\n", + "\n", + "\n", + "def select_segments(num_of_lines=3, descriptions=\"\"):\n", + " model = GenerativeModel(\"gemini-1.5-pro-001\")\n", + " prompt = f\"\"\"\n", + " You are an expert video editor. 
Your task is to trim a video ad based on the following criteria\n", + " - Awareness: Strong message, scene/tease, or conclusion or Showcase the subject, product, animation tightly framed, or close up, or zoomed in\n", + " - Branding: Mention of brand name, logo, products, packages\n", + " - Connection: Visible face or human (body parts, humans, animations, cartoons are acceptable) or Product Interaction (for example: a user holding phone, eating product, using app)\n", + " or Clear messaging on benefits to consumer (show what products can do, what brands want viewers to do) or Competitive claims (awards, review, recommendation) or\n", + " Emotional response through text, speech, music (fear, laughter, sadness, disgust, surprise, delight, etc.)\n", + " - Direction: Call-to-action is detected through text or audio or Path to purchase showing how to buy (physical stores, app, website) is detected\n", + " or A search bar is visible on screen or Mention of limited quantities, price, special offer\n", + "\n", + " Below are the computer-generated transcript and visual description of each of the lines in the video\n", + "\n", + " Provide the top {num_of_lines} lines for each of the criteria above in JSON.\n", + "\n", + " {{'Awareness': {{'Line': Description,\n", + " 'Line': Description}},\n", + " 'Branding':}}\n", + "\n", + " {descriptions}\n", + " \"\"\"\n", + " generation_config = GenerationConfig(\n", + " max_output_tokens=2048,\n", + " temperature=0.2,\n", + " top_k=32,\n", + " top_p=1,)\n", + " safety_config = [\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_HARASSMENT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_HATE_SPEECH,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " SafetySetting(\n", + " category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,\n", + " threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,\n", + " ),\n", + " ]\n", + " responses = model.generate_content(\n", + " prompt,\n", + " generation_config=generation_config,\n", + " safety_settings=safety_config,\n", + " stream=False,\n", + " )\n", + " return responses.text\n", + "\n", + "def get_descriptions_of_all_shots():\n", + " descriptions = \"\"\n", + " for counter, segment in enumerate(segments):\n", + " if has_voice_over:\n", + " descriptions += (f\"Line {counter} \\nTranscript: {segment['text']}\\nVisual Description: {segment['visual_description']}\\n\")\n", + " else:\n", + " descriptions += (f\"Line {counter} \\nVisual Description: {segment['visual_description']}\\n\")\n", + " #print(descriptions)\n", + " return descriptions\n", + "\n", + "\n", + "root_prompt =\"\"\"You are a senior copywriter for an advertising agency who excels at summarizing transcripts for video ads.\n", + "Shorten the transcript by keeping important lines and removing other lines. Make sure the output is less than 30% of the original input transcript.\n", + "Keep the format of the output the same as the input. Keep Line number. 
Do not capitalize sentences, add commas, or rewrite the output.\n", + "\n", + "input:\n", + "MacBook Air\n", + "for the first time ever in 15 inches\n", + "we\\'ve been dreaming about making this for years\n", + "we designed a big beautiful display\n", + "the kept it incredibly thin and super like it\\'s all possible because of a basilica\n", + "M2 is so efficient but we don\\'t need a fan which means you\\'ll have a MacBook Air that\\'s a thin as ever while running completely\n", + "silent and everything comes together inside a design packed so tight that there\\'s barely room for an ant\\'s\n", + "from the side it almost\n", + "disappears and the liquid Retina Display\n", + "just look at it\n", + "that\\'s twice the resolution of a comparable 15 inch PC laptop and 25 percent brighter a bigger display means more room and more room means more speakers\n", + "double the Boost double the Beast\n", + "they have double the base of the\n", + "13-inch MacBook Air and because the speakers are located behind the\n", + "keyboard the sound reflects off the display towards the viewer so it feels super\n", + "immersive\n", + "okay tell us more about the chip it\\'s a very efficient SOC with an 8 core\n", + "CPU tank or\n", + "GPU and a 16 Corner oh engine\n", + "running fifteen point eight trillion operations per second and comes with up to twenty four gigs of memory which\n", + "means killer battery life and crazy fast\n", + "of course it also works great with iPhone\n", + "and the one thing no one realizes well other laptops that are thin and wide can sometimes feel\n", + "flimsy\n", + "so we\\'re extremely intentional with the are structural design keeping it ultra light but making it as durable as possible\n", + "because you know things happen\n", + "this is the\n", + "an inch laptop but we\\'ve always wanted to create its\n", + "uncompromising\n", + "expansive the in light and Powerful we love making it and we think you\\'ll love using it\n", + "\n", + "output:\n", + "MacBook Air\n", + "for the first time ever in 15 inches\n", + "we\\'ve been dreaming about making this for years\n", + "we designed a big beautiful display\n", + "just look at it\n", + "they have double the base of the\n", + "immersive\n", + "GPU and a 16 Corner oh engine\n", + "running fifteen point eight trillion operations per second and comes with up to twenty four gigs of memory which\n", + "means killer battery life and crazy fast\n", + "uncompromising\n", + "expansive the in light and Powerful we love making it and we think you\\'ll love using it\n", + "\n", + "input:\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UP70fomJkRAc" + }, + "outputs": [], + "source": [ + "#@title Video Cutting Functions\n", + "\n", + "from moviepy.video.fx.all import crop\n", + "\n", + "def merge_segments(segments):\n", + " if len(segments) == 0:\n", + " return []\n", + "\n", + " output = [segments[0]]\n", + "\n", + " for index in range(1, len(segments)):\n", + " if segments[index]['startTime'] <= output[-1]['endTime']:\n", + " output[-1]['endTime'] = segments[index]['endTime']\n", + " else:\n", + " output.append(segments[index])\n", + " return output\n", + "\n", + "def clip_video(video_path, file_name, segments):\n", + " # loading original video\n", + " original_clip = moviepy.VideoFileClip(video_path)\n", + " new_clip = {}\n", + "\n", + " segments = merge_segments(segments)\n", + "\n", + " for segment in segments:\n", + " #make sure end_time does not exceed the video duration\n", + " end_time = 
min(segment['endTime'], original_clip.duration)\n", + " if new_clip:\n", + " new_clip = moviepy.concatenate_videoclips([new_clip, original_clip.subclip(segment['startTime'], end_time)])\n", + " else:\n", + " new_clip = original_clip.subclip(segment['startTime'], end_time)\n", + "\n", + " (w, h) = new_clip.size\n", + "\n", + " # 9/16 ratio\n", + "\n", + " crop_width = h * 9/16\n", + " crop_width = crop_width//2*2\n", + "\n", + " x1, x2 = (w - crop_width)//2, (w+crop_width)//2\n", + " y1, y2 = 0, h\n", + " cropped_clip = crop(new_clip, x1=x1, y1=y1, x2=x2, y2=y2)\n", + "\n", + " # 1/1 ratio\n", + "\n", + " crop_width = h\n", + " crop_width = crop_width//2*2\n", + "\n", + " x1, x2 = (w - crop_width)//2, (w+crop_width)//2\n", + " y1, y2 = 0, h\n", + " cropped_clip_square = crop(new_clip, x1=x1, y1=y1, x2=x2, y2=y2)\n", + "\n", + " video_output_path = f\"/content/output_vertical_{FILE_NAME}\" # 9:16 vertical\n", + " video_output_square = f\"/content/output_square_{FILE_NAME}\" # 1:1\n", + " video_output_path_original = f\"/content/output_horizontal_{FILE_NAME}\"\n", + "\n", + " # Write file without wm\n", + " cropped_clip.write_videofile(video_output_path,audio_codec=\"aac\") #put in audio_codec so the clip has sound\n", + " cropped_clip_square.write_videofile(video_output_square, audio_codec=\"aac\")\n", + " new_clip.write_videofile(video_output_path_original,audio_codec=\"aac\") #put in audio_codec so the clip has sound\n", + "\n", + " # upload to cloud storage\n", + " # upload_blob(video_output_path, 'output/' + 'vertical_tmp_' + file_name)\n", + " # upload_blob(video_output_square, 'output/' + 'square_tmp_' + file_name)\n", + " # upload_blob(video_output_path_original, 'output/' + 'landscape_tmp_' + file_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "webKdTrceHS8" + }, + "source": [ + "## GCP & Videos Processing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "alXRLxDilkbS" + }, + "outputs": [], + "source": [ + "#@title Configuration\n", + "\n", + "#@markdown ##### Note: There will be a pop-up window asking you to authenticate. Feel free to use your own project and enable all required APIs.\n", + "\n", + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "LOCATION = \"\" # @param {type:\"string\"}\n", + "GCLOUD_BUCKET_NAME = \"\" # @param {type:\"string\"}\n", + "GS_PATH = f'gs://{GCLOUD_BUCKET_NAME}/'\n", + "\n", + "\n", + "# From CF\n", + "TEMP_FOLDER = '/tmp/'\n", + "AUDIO_FOLDER = 'videos/audio/' # in Cloud Storage\n", + "\n", + "# Customized for transcribe\n", + "GAP_MULTIPLIER = 2.5\n", + "MIN_CLIP_DURATION = 5\n", + "\n", + "#@title Initialize GCP\n", + "\n", + "from google.colab import auth as google_auth\n", + "google_auth.authenticate_user(project_id=PROJECT_ID)\n", + "!gcloud config set project {PROJECT_ID}\n", + "!gcloud config get-value project\n", + "\n", + "storage_client = storage.Client()\n", + "bucket = storage_client.get_bucket(GCLOUD_BUCKET_NAME)\n", + "\n", + "#Initialize Front End; Need to initialize to use Firestore\n", + "initialize_app()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "eWyp5qd7FAWi" + }, + "outputs": [], + "source": [ + "#@title Download Videos from YouTube and Upload to GCS\n", + "\n", + "#@markdown ##### If the functionality to download from YouTube is not working, you can upload to the video to GCS bucket and put the VIDEO_FULL_PATH directly in the box. 
[Link to GCS bucket](https://pantheon.corp.google.com/storage/browser/adclip.appspot.com/videos?e=13802955&mods=dm_deploy_from_gcs&project=adclip&pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))) `gs://{GCLOUD_BUCKET_NAME}/{VIDEO_FULL_PATH}`\n", + "youtube_url = \"\" # @param {type:\"string\"}\n", + "VIDEO_FULL_PATH = \"\" # @param {type:\"string\"}\n", + "\n", + "#@markdown ##### Not all languages are supported.\n", + "has_voice_over = True # @param [\"False\", \"True\"] {type:\"raw\"}\n", + "LANGUAGE_CODE = \"en-US\" # @param [\"en-US\", \"es-ES\", \"id-ID\", \"fil-PH\", \"id-ID\", \"th-TH\", \"vi-VN\"]\n", + "\n", + "#@markdown ##### Select this option to analyze videos with visual elements. Note: Only do this for video less than 10 minutes please.\n", + "upload_to_gemini_bucket = False # @param [\"False\", \"True\"] {type:\"raw\"}\n", + "\n", + "##@markdown ##### Select the model for Speech-to-Text. Not all models are available (for example: \"video\" are only available for en-US. If in doubt, use \"default\". Check [this link](https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages) for details\"\n", + "#MODEL = \"default\" # @param [\"default\", \"command_and_search\", \"latest_short\", \"video\"]\n", + "\n", + "try:\n", + " VIDEO_FULL_PATH = download_video(youtube_url)[\"full_path\"] # will return `videos/myclip.mp4`\n", + "except:\n", + " VIDEO_FULL_PATH = VIDEO_FULL_PATH\n", + "\n", + "FILE_NAME = VIDEO_FULL_PATH.split(\"/\")[-1]\n", + "\n", + "if has_voice_over:\n", + " segments = transcribe_video()[\"transcript\"] # ['transcript', 'original', 'v1']\n", + "else:\n", + " shots = process_video(\n", + " video_gcs_uri=f\"gs://{GCLOUD_BUCKET_NAME}/{VIDEO_FULL_PATH}\") # returns videoshot only\n", + " segments = []\n", + " for shot in shots:\n", + " reformed_keys = {'start_time':'startTime', 'end_time':'endTime'} # due to TTS and Shot Detection have different convention for key\n", + " reformed_dict = dict((reformed_keys[key], value) for (key, value) in shot.items())\n", + " segments.append(reformed_dict)\n", + "\n", + "print(\"\\\\\\ Segments ////\")\n", + "print(segments)\n", + "print(f\"The video is segmented to {len(segments)} parts.\")\n", + "\n", + "if upload_to_gemini_bucket:\n", + " file_name_without_extension = FILE_NAME.replace('.mp4', '')\n", + " segment_file_0 = f\"{file_name_without_extension}_0.mp4\"\n", + "\n", + " if not does_file_exist(f\"gemini/{segment_file_0}\"): # check if the 1st file exists in bucket\n", + " segments_uri = upload_segments_to_gemini_bucket(segments,\n", + " source_file_name=VIDEO_FULL_PATH,\n", + " destination_blob_name=f'gemini/{FILE_NAME}')\n", + "\n", + " print(f\"Completed uploading {len(segments)} segments to {GCLOUD_BUCKET_NAME} buckets\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GnUwrHmHg8GR" + }, + "source": [ + "## Gemini Processing (Visual + Semantic)\n", + "\n", + "### Only run Option 1 or Option 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "collapsed": true, + "id": "t5oN3mppERvf" + }, + "outputs": [], + "source": [ + "# @title Option 1: Select Shots based on Transcript and Visual Description (ABCD framework)\n", + "\n", + "if send_video_to_gemini:\n", + " for counter, segment in enumerate(segments):\n", + " video_full_path = f\"gs://adclip.appspot.com/gemini/{file_name_without_extension}_{counter}.mp4\"\n", + " try:\n", + " response = send_video_to_gemini(video_full_path)\n", + " 
segment['visual_description'] = response.strip()\n", + " except:\n", + " segment['visual_description'] = \"\" # the response got blocked\n", + " descriptions = get_descriptions_of_all_shots()\n", + " print(descriptions)\n", + "\n", + " num_of_lines = int(len(segments)/2) # The higher num of lines, the longer the video will be\n", + " data = select_segments(num_of_lines=num_of_lines, descriptions=descriptions).strip(\"'\")\n", + " print(data)\n", + " print('The response is a string. Please enter the selected line numbers in the cell below.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Soaxbq_rzeew" }, "outputs": [], "source": [ "# WRITE THE SELECTED SEGMENTS FROM GEMINI IN THIS CELL\n", + "selected_segments = sorted([0,1,2,3])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4D2o3U6dW2bS" }, "outputs": [], "source": [ "# @title Option 2: Select Shots based on Transcript (no Visual Description) -- use for transcript-heavy videos.\n", + "\n", + "input_transcript = segments\n", + "full_text = \"\"\n", + "for counter, i in enumerate(input_transcript):\n", + " line = f\"{counter}: {i['text']}\"\n", + " full_text += line + '\\n'\n", + "\n", + "shortened_text = send_transcript_to_llm(full_text).strip(\" \").strip(\"'\")\n", + "print('----shortened_text-----')\n", + "print(shortened_text)\n", + "\n", + "selected_segments = []\n", + "\n", + "for line in shortened_text.split(\"\\n\"):\n", + " try:\n", + " counter = int(line.split(\": \")[0])\n", + " selected_segments.append(counter)\n", + " except:\n", + " continue\n", + "\n", + "print(selected_segments)" ] }, { "cell_type": "markdown", "metadata": { "id": "u7kI4uaSXjjx" }, "source": [ "## Video Processing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8iCzOqZF97uN" }, "outputs": [], "source": [ "# @title Generate Short-Form Videos\n", + "\n", + "processed_segments = []\n", + "for segment in selected_segments:\n", + " processed_segments.append(segments[segment])\n", + "print(f\"Number of segments from the original video: {len(segments)} | Number of segments selected by Gemini: {len(processed_segments)}\")\n", + "\n", + "import os\n", + "\n", + "local_video_path = f\"/content/{VIDEO_FULL_PATH.replace('videos/', '')}\"\n", + "if not os.path.exists(local_video_path):\n", + " download_blob(source_file_name=VIDEO_FULL_PATH,\n", + " destination_blob_name=local_video_path)\n", + "\n", + "print(f\"Start shortening videos, this process will take a while ...\")\n", + "clip_video(video_path=local_video_path,\n", + " file_name=FILE_NAME,\n", + " segments=processed_segments)\n", + "\n", + "try:\n", + " ################ Second sync with shots###############\n", + " print(\"Try processing with shots sync\")\n", + " input_transcript = processed_segments\n", + " list_of_words = list(map(lambda line: line['words'], input_transcript))\n", + " transcript_words = list(itertools.chain.from_iterable(list_of_words))\n", + " video_shots = get_video_shots(FILE_NAME)\n", + " ## process video shots to align key names ###\n", + " if video_shots is None:\n", + " # shots = process_video(\n", + " # video_gcs_uri=f\"gs://{GCLOUD_BUCKET_NAME}/{VIDEO_FULL_PATH}\") # returns videoshot only\n", + " video_shots = []\n", + " for shot in shots: # from step 1\n", + " reformed_keys = {'start_time':'startTime', 'end_time':'endTime'} # STT and shot detection use different key naming conventions\n", + " reformed_dict = dict((reformed_keys[key], 
value) for (key, value) in shot.items())\n", + " video_shots.append(reformed_dict)\n", + " shortened_text = ''\n", + " for i in segments:\n", + " shortened_text += i['text'] + '\\n'\n", + " segments_transformed = get_clips_from_transcript(transcript_words=transcript_words,\n", + " shortened_text=shortened_text,\n", + " input_transcript=input_transcript)\n", + " try:\n", + " segments_transformed = match_with_video_shots(video_shots, segments, transcript_words)\n", + " print(\"Matched with Shots successfully\")\n", + " except:\n", + " print(\"Retry matching with shots\")\n", + " shots = get_video_shots(FILE_NAME)\n", + " video_shots = []\n", + " for shot in shots:\n", + " reformed_keys = {'start_time':'startTime', 'end_time':'endTime'} # STT and shot detection use different key naming conventions\n", + " reformed_dict = dict((reformed_keys[key], value) for (key, value) in shot.items())\n", + " video_shots.append(reformed_dict)\n", + " segments_transformed = match_with_video_shots(video_shots, segments, transcript_words)\n", + " print(f\"Segments selected by Gemini and processed with video shots: {len(processed_segments)}\")\n", + " FILE_NAME = f\"{FILE_NAME.replace('.mp4','')}_sync.mp4\"\n", + " clip_video(video_path=local_video_path,\n", + " file_name=FILE_NAME,\n", + " segments=segments_transformed)\n", + "\n", + "except:\n", + " print(\"Cannot process with shot matching. This happens when the video has no voiceover or the transcript is too long to be matched with video shots.\")" ] }, { "cell_type": "markdown", "metadata": { "id": "fBaGK7NO0opG" }, "source": [ "## Copy files to Google Drive (if needed)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "0rqEshZQNgE4" }, "outputs": [], "source": [ "# from google.colab import drive\n", + "# drive.mount('/content/drive')\n", + "# !cp \"/content/output.mp4\" \"/content/drive/MyDrive/\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lPnlJm9_7LlN" }, "outputs": [], "source": [ "# Write data to JSON if needed\n", + "\n", + "import json\n", + "with open(\"segments.json\", 'w') as f:\n", + " json.dump(segments, f, indent=2)\n", + "\n", + "with open(\"segments_transformed.json\", 'w') as f:\n", + " json.dump(segments_transformed, f, indent=2)" ] } ], "metadata": { "colab": { "collapsed_sections": [ "u1pD7yVfgJsp", "GnUwrHmHg8GR" ], "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }