Merge pull request #48 from tyiannak/Add-clustering-to-wrapper-#47

Add clustering to wrapper #47
tyiannak · Apr 5, 2021 · d0c2579 · d0c2579
2 parents 00ad522 + 4206996
commit d0c2579
Showing 1 changed file with 93 additions and 22 deletions.
diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py
@@ -11,12 +11,18 @@
 """
 
 import sys
+import glob
 import argparse
 import os.path
 import numpy as np
+from numpy import unique
+from numpy import where
 import pandas as pd
 from pickle import load
-import glob
+from matplotlib import pyplot
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.cluster import KMeans
+
 sys.path.insert(0, '..')
 from analyze_visual.analyze_visual import process_video
 
@@ -28,15 +34,20 @@ def parse_arguments():
                                                  "Predict shot class")
 
     parser.add_argument("-i", "--input_videos_path",
-                        required=True, nargs=None,
-                        help="Videos folder path")
+                        action='append', nargs='+',
+                        required=True, help="Videos folder path")
 
     parser.add_argument("-m", "--model", required=True, nargs=None,
                         help="Model")
 
     parser.add_argument("-o", "--output_file", required=True, nargs=None,
                         help="Output file with results")
 
+    parser.add_argument("-c", "--num_of_clusters", required=False, nargs=None,
+                        default=2, help="number of clusters used"
+                                        "(clustering is executed only if "
+                                        "number of directories is more than 1")
+
     return parser.parse_args()
 
 
@@ -82,6 +93,7 @@ def video_class_predict(features, algorithm):
 
     return probas, classes
 
+
 
 def video_class_predict_folder(videos_path, model, algorithm,
                                outfilename):
@@ -96,6 +108,7 @@ def video_class_predict_folder(videos_path, model, algorithm,
 
     final_proba = np.empty((0, len(model.classes_)))
     df = create_dataframe(model.classes_)
+
     if os.path.exists(str(videos_path) + ".txt"):
         os.remove(str(videos_path) + ".txt")
     if os.path.isfile(videos_path):
@@ -115,24 +128,30 @@ def video_class_predict_folder(videos_path, model, algorithm,
         for files in types:
             video_files_list.extend(glob.glob(os.path.join(videos_path, files)))
         video_files_list = sorted(video_files_list)
-
+        print(video_files_list)
+        count_processed = 0
         for v in video_files_list:
-            features_stats = process_video(v, 2, True, True, True)
-            features = features_stats[0]
-            features = features.reshape(1, -1)
-            probas, classes = video_class_predict(features, algorithm)
-            # Save the resuls in a numpy array
-            final_proba = np.append(final_proba, [probas], axis=0)
-
-            # Convert format of file names
-            splitting = v.split('/')
-            v = splitting[-1]
-            # Insert values to dataframe
-            df = df.append({'File_name': v}, ignore_index=True)
-
+            try:
+                # TODO remove long try-except statement
+                print(algorithm)
+                print(v)
+                features_stats = process_video(v, 2, True, True, True)
+                features = features_stats[0]
+                features = features.reshape(1, -1)
+                probas, classes = video_class_predict(features, algorithm)           
+                # Save the resuls in a numpy array
+                final_proba = np.append(final_proba, [probas], axis=0)
+                # Convert format of file names
+                splitting = v.split(os.sep)
+                v = splitting[-1]
+                # Insert values to dataframe
+                df = df.append({'File_name': v}, ignore_index=True)
+                count_processed += 1
+            except:
+                print('This video is corrupted')
         for i, class_name in enumerate(classes):
             df[class_name] = final_proba[:, i]
-        # Save values to csv
+        # Save results to csv
         df.to_csv(outfilename)
 
         print(final_proba)
@@ -147,19 +166,71 @@ def video_class_predict_folder(videos_path, model, algorithm,
                       f'belongs by {"{:.2%}".format(proba)} '
                       f'in {class_name} class')
 
-    return final_proba, classes
+    return final_proba, classes, df
+
+
+def clustering(videos_path, model, algorithm, outfilename, nclusters=2):
+    """
+    Clustering process
+    :param videos_path: path to video directory of filename to be analyzed
+    :param model: path name of the model
+    :param algorithm: type of the modelling algorithm (e.g. SVM)
+    :param outfilename: output csv filename (only for input folder)
+    :param nclusters: number of clusters to use
+    :return:
+    """
+    final_proba = []
+    final_df = create_dataframe(model.classes_)
+
+    for movie in videos_path:
+
+        f, c, df = video_class_predict_folder(movie, model, algorithm,
+                                              outfilename)
+
+        final_proba.append(f)
+        final_df = final_df.append(df, ignore_index=True)
+        print(f, c)
+
+    final_df.to_csv(outfilename)
+    final_proba = np.array(final_proba)
+    print(final_proba)
+    model = KMeans(n_clusters=nclusters)
+    model.fit(final_proba)
+    yhat = model.predict(final_proba)
+    with open("cluster_prediction.txt", "w") as output:
+        for movie, y in zip(videos_path,yhat):
+            print(f'{movie} : {y}',file = output)
+
+    clusters = unique(yhat)
+    print(clusters)
+    for cluster in clusters:
+        # get row indexes for samples with this cluster
+        row_ix = where(yhat == cluster)
+        # create scatter of these samples
+        pyplot.scatter(final_proba[row_ix, 0], final_proba[row_ix, 1])
+
+    # show the plot
+    pyplot.savefig('plot_clusters.png')
 
 
 def main():
     args = parse_arguments()
     videos_path = args.input_videos_path
     algorithm = args.model
     outfilename = args.output_file
+    nclusters = int(args.num_of_clusters)
+    # Convert list of lists to a single list
+    videos_path = [item for sublist in videos_path for item in sublist]    
     model = load(open('shot_classifier_' + str(algorithm)+'.pkl', 'rb'))
 
-    f, c = video_class_predict_folder(videos_path, model, algorithm,
-                                      outfilename)
-    print(f, c)
+    if (len(videos_path)) > 1:    
+        clustering(videos_path, model, algorithm, outfilename, nclusters)
+    else:
+        videos_path = videos_path[-1]
+        print(videos_path)
+        f, c, _ = video_class_predict_folder(videos_path, model, algorithm,
+                                             outfilename)
+        print(f, c)