From 7cf51822f7416344c140a0fb438421b92811ff80 Mon Sep 17 00:00:00 2001
From: apoman38 <apoman38@gmail.com>
Date: Wed, 31 Mar 2021 11:20:43 +0300
Subject: [PATCH 1/7] add clustering

---
 analyze_visual/wrapper.py | 99 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 87 insertions(+), 12 deletions(-)

diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py
index cdfb384..d8f11bc 100644
--- a/analyze_visual/wrapper.py
+++ b/analyze_visual/wrapper.py
@@ -11,12 +11,18 @@
 """
 
 import sys
+import glob
 import argparse
 import os.path
 import numpy as np
+from numpy import unique
+from numpy import where
 import pandas as pd
 from pickle import load
-import glob
+from matplotlib import pyplot
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.cluster import KMeans
+
 sys.path.insert(0, '..')
 from analyze_visual.analyze_visual import process_video
 
@@ -28,8 +34,8 @@ def parse_arguments():
                                                  "Predict shot class")
 
     parser.add_argument("-i", "--input_videos_path",
-                        required=True, nargs=None,
-                        help="Videos folder path")
+                        action='append', nargs='+',
+                        required=True, help="Videos folder path")
 
     parser.add_argument("-m", "--model", required=True, nargs=None,
                         help="Model")
@@ -82,6 +88,68 @@ def video_class_predict(features, algorithm):
 
     return probas, classes
 
+def clustering(videos_path, model, algorithm,
+                               outfilename):
+    """
+    Clustering process
+    :param videos_path: path to video directory of filename to be analyzed
+    :param model: path name of the model
+    :param algorithm: type of the modelling algorithm (e.g. SVM)
+    :param outfilename: output csv filename (only for input folder)
+    :return:
+    """
+
+    features_all= []
+    final_df = create_dataframe(model.classes_)
+    features = np.empty((0, len(model.classes_)))
+    for video in videos_path:
+
+        f, c, ft, df = video_class_predict_folder(video, model, algorithm,
+                                outfilename)
+        features_all.append(ft)
+        final_df = final_df.append(df, ignore_index=True)
+
+        print(f,c)
+        
+    final_df.to_csv(outfilename)
+    #Convert to list of lists
+    for i in range(2):
+        features_all = [item for sublist in features_all for item in sublist]
+
+    #Convert list of lists to numpy array
+    features = np.array(features_all)
+
+    # Define scaler
+    scaler = MinMaxScaler()
+
+    # Fit scaler on the training dataset
+    scaler.fit(features)
+
+    # Transform both datasets
+    scaled_features = scaler.transform(features)
+
+    model = KMeans(n_clusters=len(videos_path))
+
+    model.fit(scaled_features)
+
+    yhat = model.predict(scaled_features)
+
+    with open("cluster_prediction.txt", "w") as output:
+        output.write(str(yhat))
+
+    print(yhat)
+
+    clusters = unique(yhat)
+
+    for cluster in clusters:
+        # get row indexes for samples with this cluster
+        row_ix = where(yhat == cluster)
+        # create scatter of these samples
+        pyplot.scatter(scaled_features[row_ix, 0], scaled_features[row_ix, 1])
+
+    # show the plot
+    pyplot.savefig('plot_clusters.png')
+
 
 def video_class_predict_folder(videos_path, model, algorithm,
                                outfilename):
@@ -93,9 +161,10 @@ def video_class_predict_folder(videos_path, model, algorithm,
     :param outfilename: output csv filename (only for input folder)
     :return:
     """
-
+    features_all = []
     final_proba = np.empty((0, len(model.classes_)))
     df = create_dataframe(model.classes_)
+    
     if os.path.exists(str(videos_path) + ".txt"):
         os.remove(str(videos_path) + ".txt")
     if os.path.isfile(videos_path):
@@ -115,15 +184,14 @@ def video_class_predict_folder(videos_path, model, algorithm,
         for files in types:
             video_files_list.extend(glob.glob(os.path.join(videos_path, files)))
         video_files_list = sorted(video_files_list)
-
         for v in video_files_list:
             features_stats = process_video(v, 2, True, True, True)
             features = features_stats[0]
             features = features.reshape(1, -1)
-            probas, classes = video_class_predict(features, algorithm)
+            features_all.append(features)
+            probas, classes = video_class_predict(features, algorithm)           
             # Save the resuls in a numpy array
             final_proba = np.append(final_proba, [probas], axis=0)
-
             # Convert format of file names
             splitting = v.split('/')
             v = splitting[-1]
@@ -132,7 +200,7 @@ def video_class_predict_folder(videos_path, model, algorithm,
 
         for i, class_name in enumerate(classes):
             df[class_name] = final_proba[:, i]
-        # Save values to csv
+        # Save results to csv
         df.to_csv(outfilename)
 
         print(final_proba)
@@ -147,7 +215,7 @@ def video_class_predict_folder(videos_path, model, algorithm,
                       f'belongs by {"{:.2%}".format(proba)} '
                       f'in {class_name} class')
 
-    return final_proba, classes
+    return final_proba, classes, features_all, df
 
 
 def main():
@@ -155,11 +223,18 @@ def main():
     videos_path = args.input_videos_path
     algorithm = args.model
     outfilename = args.output_file
+    # Convert list of lists to a single list
+    videos_path = [item for sublist in videos_path for item in sublist]    
     model = load(open('shot_classifier_' + str(algorithm)+'.pkl', 'rb'))
 
-    f, c = video_class_predict_folder(videos_path, model, algorithm,
-                                      outfilename)
-    print(f, c)
+    if (len(videos_path)) > 1:    
+        clustering(videos_path, model, algorithm, outfilename)
+    else:
+        videos_path = videos_path[-1]
+        print(videos_path)
+        f, c, _, _ = video_class_predict_folder(videos_path, model, algorithm,
+                                    outfilename)
+        print(f, c)
 
  
 

From 7c7e01f55cd9100b58222de85e370ffe50d19c7b Mon Sep 17 00:00:00 2001
From: apoman38 <apoman38@gmail.com>
Date: Wed, 31 Mar 2021 12:13:41 +0300
Subject: [PATCH 2/7] revision of cluster process

---
 analyze_visual/wrapper.py | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py
index d8f11bc..55084f1 100644
--- a/analyze_visual/wrapper.py
+++ b/analyze_visual/wrapper.py
@@ -101,24 +101,21 @@ def clustering(videos_path, model, algorithm,
 
     features_all= []
     final_df = create_dataframe(model.classes_)
-    features = np.empty((0, len(model.classes_)))
-    for video in videos_path:
 
-        f, c, ft, df = video_class_predict_folder(video, model, algorithm,
+    for movie in videos_path:
+
+        f, c, ft, df = video_class_predict_folder(movie, model, algorithm,
                                 outfilename)
         features_all.append(ft)
         final_df = final_df.append(df, ignore_index=True)
 
         print(f,c)
-        
+    
     final_df.to_csv(outfilename)
-    #Convert to list of lists
-    for i in range(2):
-        features_all = [item for sublist in features_all for item in sublist]
 
     #Convert list of lists to numpy array
     features = np.array(features_all)
-
+    
     # Define scaler
     scaler = MinMaxScaler()
 
@@ -135,10 +132,9 @@ def clustering(videos_path, model, algorithm,
     yhat = model.predict(scaled_features)
 
     with open("cluster_prediction.txt", "w") as output:
-        output.write(str(yhat))
-
-    print(yhat)
-
+        for movie,y in zip(videos_path,yhat):
+            print(f'{movie} : {y}',file = output)
+        
     clusters = unique(yhat)
 
     for cluster in clusters:
@@ -149,7 +145,7 @@ def clustering(videos_path, model, algorithm,
 
     # show the plot
     pyplot.savefig('plot_clusters.png')
-
+    
 
 def video_class_predict_folder(videos_path, model, algorithm,
                                outfilename):
@@ -188,6 +184,7 @@ def video_class_predict_folder(videos_path, model, algorithm,
             features_stats = process_video(v, 2, True, True, True)
             features = features_stats[0]
             features = features.reshape(1, -1)
+            features = features.tolist()
             features_all.append(features)
             probas, classes = video_class_predict(features, algorithm)           
             # Save the resuls in a numpy array
@@ -197,7 +194,8 @@ def video_class_predict_folder(videos_path, model, algorithm,
             v = splitting[-1]
             # Insert values to dataframe
             df = df.append({'File_name': v}, ignore_index=True)
-
+        for i in range(2):
+            features_all = [item for sublist in features_all for item in sublist]
         for i, class_name in enumerate(classes):
             df[class_name] = final_proba[:, i]
         # Save results to csv

From 90c0d15d2957bddac735ad8287258f5e1692c86b Mon Sep 17 00:00:00 2001
From: apoman38 <apoman38@gmail.com>
Date: Fri, 2 Apr 2021 19:59:04 +0300
Subject: [PATCH 3/7] change input array to cluster algorithm

---
 analyze_visual/wrapper.py | 116 +++++++++++++++++---------------------
 1 file changed, 52 insertions(+), 64 deletions(-)

diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py
index 55084f1..fc3520a 100644
--- a/analyze_visual/wrapper.py
+++ b/analyze_visual/wrapper.py
@@ -88,63 +88,6 @@ def video_class_predict(features, algorithm):
 
     return probas, classes
 
-def clustering(videos_path, model, algorithm,
-                               outfilename):
-    """
-    Clustering process
-    :param videos_path: path to video directory of filename to be analyzed
-    :param model: path name of the model
-    :param algorithm: type of the modelling algorithm (e.g. SVM)
-    :param outfilename: output csv filename (only for input folder)
-    :return:
-    """
-
-    features_all= []
-    final_df = create_dataframe(model.classes_)
-
-    for movie in videos_path:
-
-        f, c, ft, df = video_class_predict_folder(movie, model, algorithm,
-                                outfilename)
-        features_all.append(ft)
-        final_df = final_df.append(df, ignore_index=True)
-
-        print(f,c)
-    
-    final_df.to_csv(outfilename)
-
-    #Convert list of lists to numpy array
-    features = np.array(features_all)
-    
-    # Define scaler
-    scaler = MinMaxScaler()
-
-    # Fit scaler on the training dataset
-    scaler.fit(features)
-
-    # Transform both datasets
-    scaled_features = scaler.transform(features)
-
-    model = KMeans(n_clusters=len(videos_path))
-
-    model.fit(scaled_features)
-
-    yhat = model.predict(scaled_features)
-
-    with open("cluster_prediction.txt", "w") as output:
-        for movie,y in zip(videos_path,yhat):
-            print(f'{movie} : {y}',file = output)
-        
-    clusters = unique(yhat)
-
-    for cluster in clusters:
-        # get row indexes for samples with this cluster
-        row_ix = where(yhat == cluster)
-        # create scatter of these samples
-        pyplot.scatter(scaled_features[row_ix, 0], scaled_features[row_ix, 1])
-
-    # show the plot
-    pyplot.savefig('plot_clusters.png')
     
 
 def video_class_predict_folder(videos_path, model, algorithm,
@@ -157,7 +100,7 @@ def video_class_predict_folder(videos_path, model, algorithm,
     :param outfilename: output csv filename (only for input folder)
     :return:
     """
-    features_all = []
+
     final_proba = np.empty((0, len(model.classes_)))
     df = create_dataframe(model.classes_)
     
@@ -184,8 +127,6 @@ def video_class_predict_folder(videos_path, model, algorithm,
             features_stats = process_video(v, 2, True, True, True)
             features = features_stats[0]
             features = features.reshape(1, -1)
-            features = features.tolist()
-            features_all.append(features)
             probas, classes = video_class_predict(features, algorithm)           
             # Save the resuls in a numpy array
             final_proba = np.append(final_proba, [probas], axis=0)
@@ -194,8 +135,6 @@ def video_class_predict_folder(videos_path, model, algorithm,
             v = splitting[-1]
             # Insert values to dataframe
             df = df.append({'File_name': v}, ignore_index=True)
-        for i in range(2):
-            features_all = [item for sublist in features_all for item in sublist]
         for i, class_name in enumerate(classes):
             df[class_name] = final_proba[:, i]
         # Save results to csv
@@ -213,9 +152,58 @@ def video_class_predict_folder(videos_path, model, algorithm,
                       f'belongs by {"{:.2%}".format(proba)} '
                       f'in {class_name} class')
 
-    return final_proba, classes, features_all, df
+    return final_proba, classes, df
+
+def clustering(videos_path, model, algorithm,
+                               outfilename):
+    """
+    Clustering process
+    :param videos_path: path to video directory of filename to be analyzed
+    :param model: path name of the model
+    :param algorithm: type of the modelling algorithm (e.g. SVM)
+    :param outfilename: output csv filename (only for input folder)
+    :return:
+    """
+    final_proba = []
+    final_df = create_dataframe(model.classes_)
+   
+    for movie in videos_path:
+
+        f, c, df = video_class_predict_folder(movie, model, algorithm,
+                                outfilename)
+
+        final_proba.append(f)
+        final_df = final_df.append(df, ignore_index=True)
+
+        print(f,c)
+
+    final_df.to_csv(outfilename)
+    
+    final_proba = np.array(final_proba)
+
+    print(final_proba)
+       
+    model = KMeans(n_clusters=len(model.classes_))
 
+    model.fit(final_proba)
+
+    yhat = model.predict(final_proba)
+
+    with open("cluster_prediction.txt", "w") as output:
+        for movie,y in zip(videos_path,yhat):
+            print(f'{movie} : {y}',file = output)
+        
+    clusters = unique(yhat)
+    
+    for cluster in clusters:
+        # get row indexes for samples with this cluster
+        row_ix = where(yhat == cluster)
+        # create scatter of these samples
+        pyplot.scatter(final_proba[row_ix, 0], final_proba[row_ix, 1])
 
+    # show the plot
+    pyplot.savefig('plot_clusters.png')
+    
 def main():
     args = parse_arguments()
     videos_path = args.input_videos_path
@@ -230,7 +218,7 @@ def main():
     else:
         videos_path = videos_path[-1]
         print(videos_path)
-        f, c, _, _ = video_class_predict_folder(videos_path, model, algorithm,
+        f, c, _ = video_class_predict_folder(videos_path, model, algorithm,
                                     outfilename)
         print(f, c)
 

From 56444a0a6942f3acbe9de7c5567d4cf193183eb4 Mon Sep 17 00:00:00 2001
From: apoman38 <apoman38@gmail.com>
Date: Sat, 3 Apr 2021 10:20:29 +0300
Subject: [PATCH 4/7] check for corrupted videos

---
 analyze_visual/wrapper.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py
index fc3520a..340690b 100644
--- a/analyze_visual/wrapper.py
+++ b/analyze_visual/wrapper.py
@@ -124,17 +124,20 @@ def video_class_predict_folder(videos_path, model, algorithm,
             video_files_list.extend(glob.glob(os.path.join(videos_path, files)))
         video_files_list = sorted(video_files_list)
         for v in video_files_list:
-            features_stats = process_video(v, 2, True, True, True)
-            features = features_stats[0]
-            features = features.reshape(1, -1)
-            probas, classes = video_class_predict(features, algorithm)           
-            # Save the resuls in a numpy array
-            final_proba = np.append(final_proba, [probas], axis=0)
-            # Convert format of file names
-            splitting = v.split('/')
-            v = splitting[-1]
-            # Insert values to dataframe
-            df = df.append({'File_name': v}, ignore_index=True)
+            try:
+                features_stats = process_video(v, 2, True, True, True)
+                features = features_stats[0]
+                features = features.reshape(1, -1)
+                probas, classes = video_class_predict(features, algorithm)           
+                # Save the resuls in a numpy array
+                final_proba = np.append(final_proba, [probas], axis=0)
+                # Convert format of file names
+                splitting = v.split('/')
+                v = splitting[-1]
+                # Insert values to dataframe
+                df = df.append({'File_name': v}, ignore_index=True)
+            except:
+                print('This video is corrupted')
         for i, class_name in enumerate(classes):
             df[class_name] = final_proba[:, i]
         # Save results to csv

From 40e99c3eb1f08a4b8442b4fc0ba14a2abf558d29 Mon Sep 17 00:00:00 2001
From: Theodoros Giannakopoulos <tyiannak@gmail.com>
Date: Sun, 4 Apr 2021 22:08:21 +0300
Subject: [PATCH 5/7] minor: tidy whitespace in clustering() and print cluster ids

---
 analyze_visual/wrapper.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py
index 340690b..03e0b3c 100644
--- a/analyze_visual/wrapper.py
+++ b/analyze_visual/wrapper.py
@@ -157,6 +157,7 @@ def video_class_predict_folder(videos_path, model, algorithm,
 
     return final_proba, classes, df
 
+
 def clustering(videos_path, model, algorithm,
                                outfilename):
     """
@@ -173,31 +174,24 @@ def clustering(videos_path, model, algorithm,
     for movie in videos_path:
 
         f, c, df = video_class_predict_folder(movie, model, algorithm,
-                                outfilename)
+                                              outfilename)
 
         final_proba.append(f)
         final_df = final_df.append(df, ignore_index=True)
-
-        print(f,c)
+        print(f, c)
 
     final_df.to_csv(outfilename)
-    
     final_proba = np.array(final_proba)
-
     print(final_proba)
-       
     model = KMeans(n_clusters=len(model.classes_))
-
     model.fit(final_proba)
-
     yhat = model.predict(final_proba)
-
     with open("cluster_prediction.txt", "w") as output:
-        for movie,y in zip(videos_path,yhat):
+        for movie, y in zip(videos_path,yhat):
             print(f'{movie} : {y}',file = output)
         
     clusters = unique(yhat)
-    
+    print(clusters)
     for cluster in clusters:
         # get row indexes for samples with this cluster
         row_ix = where(yhat == cluster)
@@ -206,7 +200,8 @@ def clustering(videos_path, model, algorithm,
 
     # show the plot
     pyplot.savefig('plot_clusters.png')
-    
+
+
 def main():
     args = parse_arguments()
     videos_path = args.input_videos_path

From 5f8e674e27361295c67352e189ac3bd2eed1d080 Mon Sep 17 00:00:00 2001
From: tyiannak <tyiannak@gmail.com>
Date: Mon, 5 Apr 2021 10:37:26 +0300
Subject: [PATCH 6/7] minor: add debug prints and processed-video counter, split paths on os.sep

---
 analyze_visual/wrapper.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py
index 03e0b3c..db217d3 100644
--- a/analyze_visual/wrapper.py
+++ b/analyze_visual/wrapper.py
@@ -123,8 +123,13 @@ def video_class_predict_folder(videos_path, model, algorithm,
         for files in types:
             video_files_list.extend(glob.glob(os.path.join(videos_path, files)))
         video_files_list = sorted(video_files_list)
+        print(video_files_list)
+        count_processed = 0
         for v in video_files_list:
             try:
+                # TODO remove long try-except statement
+                print(algorithm)
+                print(v)
                 features_stats = process_video(v, 2, True, True, True)
                 features = features_stats[0]
                 features = features.reshape(1, -1)
@@ -132,10 +137,11 @@ def video_class_predict_folder(videos_path, model, algorithm,
                 # Save the resuls in a numpy array
                 final_proba = np.append(final_proba, [probas], axis=0)
                 # Convert format of file names
-                splitting = v.split('/')
+                splitting = v.split(os.sep)
                 v = splitting[-1]
                 # Insert values to dataframe
                 df = df.append({'File_name': v}, ignore_index=True)
+                count_processed += 1
             except:
                 print('This video is corrupted')
         for i, class_name in enumerate(classes):
@@ -158,8 +164,7 @@ def video_class_predict_folder(videos_path, model, algorithm,
     return final_proba, classes, df
 
 
-def clustering(videos_path, model, algorithm,
-                               outfilename):
+def clustering(videos_path, model, algorithm, outfilename):
     """
     Clustering process
     :param videos_path: path to video directory of filename to be analyzed

From 4206996f1a875de3e5d70f2c715220f3533a5d45 Mon Sep 17 00:00:00 2001
From: tyiannak <tyiannak@gmail.com>
Date: Mon, 5 Apr 2021 12:10:23 +0300
Subject: [PATCH 7/7] add number of clusters as parameter

---
 analyze_visual/wrapper.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py
index db217d3..fb41e97 100644
--- a/analyze_visual/wrapper.py
+++ b/analyze_visual/wrapper.py
@@ -43,6 +43,11 @@ def parse_arguments():
     parser.add_argument("-o", "--output_file", required=True, nargs=None,
                         help="Output file with results")
 
+    parser.add_argument("-c", "--num_of_clusters", required=False, nargs=None,
+                        default=2, help="number of clusters used"
+                                        "(clustering is executed only if "
+                                        "number of directories is more than 1")
+
     return parser.parse_args()
 
 
@@ -164,13 +169,14 @@ def video_class_predict_folder(videos_path, model, algorithm,
     return final_proba, classes, df
 
 
-def clustering(videos_path, model, algorithm, outfilename):
+def clustering(videos_path, model, algorithm, outfilename, nclusters=2):
     """
     Clustering process
     :param videos_path: path to video directory of filename to be analyzed
     :param model: path name of the model
     :param algorithm: type of the modelling algorithm (e.g. SVM)
     :param outfilename: output csv filename (only for input folder)
+    :param nclusters: number of clusters to use
     :return:
     """
     final_proba = []
@@ -188,7 +194,7 @@ def clustering(videos_path, model, algorithm, outfilename):
     final_df.to_csv(outfilename)
     final_proba = np.array(final_proba)
     print(final_proba)
-    model = KMeans(n_clusters=len(model.classes_))
+    model = KMeans(n_clusters=nclusters)
     model.fit(final_proba)
     yhat = model.predict(final_proba)
     with open("cluster_prediction.txt", "w") as output:
@@ -212,17 +218,18 @@ def main():
     videos_path = args.input_videos_path
     algorithm = args.model
     outfilename = args.output_file
+    nclusters = int(args.num_of_clusters)
     # Convert list of lists to a single list
     videos_path = [item for sublist in videos_path for item in sublist]    
     model = load(open('shot_classifier_' + str(algorithm)+'.pkl', 'rb'))
 
     if (len(videos_path)) > 1:    
-        clustering(videos_path, model, algorithm, outfilename)
+        clustering(videos_path, model, algorithm, outfilename, nclusters)
     else:
         videos_path = videos_path[-1]
         print(videos_path)
         f, c, _ = video_class_predict_folder(videos_path, model, algorithm,
-                                    outfilename)
+                                             outfilename)
         print(f, c)