From 7cf51822f7416344c140a0fb438421b92811ff80 Mon Sep 17 00:00:00 2001 From: apoman38 Date: Wed, 31 Mar 2021 11:20:43 +0300 Subject: [PATCH 1/7] add clustering --- analyze_visual/wrapper.py | 99 ++++++++++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 12 deletions(-) diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py index cdfb384..d8f11bc 100644 --- a/analyze_visual/wrapper.py +++ b/analyze_visual/wrapper.py @@ -11,12 +11,18 @@ """ import sys +import glob import argparse import os.path import numpy as np +from numpy import unique +from numpy import where import pandas as pd from pickle import load -import glob +from matplotlib import pyplot +from sklearn.preprocessing import MinMaxScaler +from sklearn.cluster import KMeans + sys.path.insert(0, '..') from analyze_visual.analyze_visual import process_video @@ -28,8 +34,8 @@ def parse_arguments(): "Predict shot class") parser.add_argument("-i", "--input_videos_path", - required=True, nargs=None, - help="Videos folder path") + action='append', nargs='+', + required=True, help="Videos folder path") parser.add_argument("-m", "--model", required=True, nargs=None, help="Model") @@ -82,6 +88,68 @@ def video_class_predict(features, algorithm): return probas, classes +def clustering(videos_path, model, algorithm, + outfilename): + """ + Clustering process + :param videos_path: path to video directory of filename to be analyzed + :param model: path name of the model + :param algorithm: type of the modelling algorithm (e.g. SVM) + :param outfilename: output csv filename (only for input folder) + :return: + """ + + features_all= [] + final_df = create_dataframe(model.classes_) + features = np.empty((0, len(model.classes_))) + for video in videos_path: + + f, c, ft, df = video_class_predict_folder(video, model, algorithm, + outfilename) + features_all.append(ft) + final_df = final_df.append(df, ignore_index=True) + + print(f,c) + + final_df.to_csv(outfilename) + #Convert to list of lists + for i in range(2): + features_all = [item for sublist in features_all for item in sublist] + + #Convert list of lists to numpy array + features = np.array(features_all) + + # Define scaler + scaler = MinMaxScaler() + + # Fit scaler on the training dataset + scaler.fit(features) + + # Transform both datasets + scaled_features = scaler.transform(features) + + model = KMeans(n_clusters=len(videos_path)) + + model.fit(scaled_features) + + yhat = model.predict(scaled_features) + + with open("cluster_prediction.txt", "w") as output: + output.write(str(yhat)) + + print(yhat) + + clusters = unique(yhat) + + for cluster in clusters: + # get row indexes for samples with this cluster + row_ix = where(yhat == cluster) + # create scatter of these samples + pyplot.scatter(scaled_features[row_ix, 0], scaled_features[row_ix, 1]) + + # show the plot + pyplot.savefig('plot_clusters.png') + def video_class_predict_folder(videos_path, model, algorithm, outfilename): @@ -93,9 +161,10 @@ def video_class_predict_folder(videos_path, model, algorithm, :param outfilename: output csv filename (only for input folder) :return: """ - + features_all = [] final_proba = np.empty((0, len(model.classes_))) df = create_dataframe(model.classes_) + if os.path.exists(str(videos_path) + ".txt"): os.remove(str(videos_path) + ".txt") if os.path.isfile(videos_path): @@ -115,15 +184,14 @@ def video_class_predict_folder(videos_path, model, algorithm, for files in types: video_files_list.extend(glob.glob(os.path.join(videos_path, files))) video_files_list = sorted(video_files_list) - for v in video_files_list: features_stats = process_video(v, 2, True, True, True) features = features_stats[0] features = features.reshape(1, -1) - probas, classes = video_class_predict(features, algorithm) + features_all.append(features) + probas, classes = video_class_predict(features, algorithm) # Save the resuls in a numpy array final_proba = np.append(final_proba, [probas], axis=0) - # Convert format of file names splitting = v.split('/') v = splitting[-1] @@ -132,7 +200,7 @@ def video_class_predict_folder(videos_path, model, algorithm, for i, class_name in enumerate(classes): df[class_name] = final_proba[:, i] - # Save values to csv + # Save results to csv df.to_csv(outfilename) print(final_proba) @@ -147,7 +215,7 @@ def video_class_predict_folder(videos_path, model, algorithm, f'belongs by {"{:.2%}".format(proba)} ' f'in {class_name} class') - return final_proba, classes + return final_proba, classes, features_all, df def main(): @@ -155,11 +223,18 @@ def main(): videos_path = args.input_videos_path algorithm = args.model outfilename = args.output_file + # Convert list of lists to a single list + videos_path = [item for sublist in videos_path for item in sublist] model = load(open('shot_classifier_' + str(algorithm)+'.pkl', 'rb')) - f, c = video_class_predict_folder(videos_path, model, algorithm, - outfilename) - print(f, c) + if (len(videos_path)) > 1: + clustering(videos_path, model, algorithm, outfilename) + else: + videos_path = videos_path[-1] + print(videos_path) + f, c, _, _ = video_class_predict_folder(videos_path, model, algorithm, + outfilename) + print(f, c) From 7c7e01f55cd9100b58222de85e370ffe50d19c7b Mon Sep 17 00:00:00 2001 From: apoman38 Date: Wed, 31 Mar 2021 12:13:41 +0300 Subject: [PATCH 2/7] revision of cluster process --- analyze_visual/wrapper.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py index d8f11bc..55084f1 100644 --- a/analyze_visual/wrapper.py +++ b/analyze_visual/wrapper.py @@ -101,24 +101,21 @@ def clustering(videos_path, model, algorithm, features_all= [] final_df = create_dataframe(model.classes_) - features = np.empty((0, len(model.classes_))) - for video in videos_path: - f, c, ft, df = video_class_predict_folder(video, model, algorithm, + for movie in videos_path: + + f, c, ft, df = video_class_predict_folder(movie, model, algorithm, outfilename) features_all.append(ft) final_df = final_df.append(df, ignore_index=True) print(f,c) - + final_df.to_csv(outfilename) - #Convert to list of lists - for i in range(2): - features_all = [item for sublist in features_all for item in sublist] #Convert list of lists to numpy array features = np.array(features_all) - + # Define scaler scaler = MinMaxScaler() @@ -135,10 +132,9 @@ def clustering(videos_path, model, algorithm, yhat = model.predict(scaled_features) with open("cluster_prediction.txt", "w") as output: - output.write(str(yhat)) - - print(yhat) - + for movie,y in zip(videos_path,yhat): + print(f'{movie} : {y}',file = output) + clusters = unique(yhat) for cluster in clusters: @@ -149,7 +145,7 @@ def clustering(videos_path, model, algorithm, # show the plot pyplot.savefig('plot_clusters.png') - + def video_class_predict_folder(videos_path, model, algorithm, outfilename): @@ -188,6 +184,7 @@ def video_class_predict_folder(videos_path, model, algorithm, features_stats = process_video(v, 2, True, True, True) features = features_stats[0] features = features.reshape(1, -1) + features = features.tolist() features_all.append(features) probas, classes = video_class_predict(features, algorithm) # Save the resuls in a numpy array @@ -197,7 +194,8 @@ def video_class_predict_folder(videos_path, model, algorithm, v = splitting[-1] # Insert values to dataframe df = df.append({'File_name': v}, ignore_index=True) - + for i in range(2): + features_all = [item for sublist in features_all for item in sublist] for i, class_name in enumerate(classes): df[class_name] = final_proba[:, i] # Save results to csv From 90c0d15d2957bddac735ad8287258f5e1692c86b Mon Sep 17 00:00:00 2001 From: apoman38 Date: Fri, 2 Apr 2021 19:59:04 +0300 Subject: [PATCH 3/7] change input array to cluster algorithm --- analyze_visual/wrapper.py | 116 +++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 64 deletions(-) diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py index 55084f1..fc3520a 100644 --- a/analyze_visual/wrapper.py +++ b/analyze_visual/wrapper.py @@ -88,63 +88,6 @@ def video_class_predict(features, algorithm): return probas, classes -def clustering(videos_path, model, algorithm, - outfilename): - """ - Clustering process - :param videos_path: path to video directory of filename to be analyzed - :param model: path name of the model - :param algorithm: type of the modelling algorithm (e.g. SVM) - :param outfilename: output csv filename (only for input folder) - :return: - """ - - features_all= [] - final_df = create_dataframe(model.classes_) - - for movie in videos_path: - - f, c, ft, df = video_class_predict_folder(movie, model, algorithm, - outfilename) - features_all.append(ft) - final_df = final_df.append(df, ignore_index=True) - - print(f,c) - - final_df.to_csv(outfilename) - - #Convert list of lists to numpy array - features = np.array(features_all) - - # Define scaler - scaler = MinMaxScaler() - - # Fit scaler on the training dataset - scaler.fit(features) - - # Transform both datasets - scaled_features = scaler.transform(features) - - model = KMeans(n_clusters=len(videos_path)) - - model.fit(scaled_features) - - yhat = model.predict(scaled_features) - - with open("cluster_prediction.txt", "w") as output: - for movie,y in zip(videos_path,yhat): - print(f'{movie} : {y}',file = output) - - clusters = unique(yhat) - - for cluster in clusters: - # get row indexes for samples with this cluster - row_ix = where(yhat == cluster) - # create scatter of these samples - pyplot.scatter(scaled_features[row_ix, 0], scaled_features[row_ix, 1]) - - # show the plot - pyplot.savefig('plot_clusters.png') def video_class_predict_folder(videos_path, model, algorithm, @@ -157,7 +100,7 @@ def video_class_predict_folder(videos_path, model, algorithm, :param outfilename: output csv filename (only for input folder) :return: """ - features_all = [] + final_proba = np.empty((0, len(model.classes_))) df = create_dataframe(model.classes_) @@ -184,8 +127,6 @@ def video_class_predict_folder(videos_path, model, algorithm, features_stats = process_video(v, 2, True, True, True) features = features_stats[0] features = features.reshape(1, -1) - features = features.tolist() - features_all.append(features) probas, classes = video_class_predict(features, algorithm) # Save the resuls in a numpy array final_proba = np.append(final_proba, [probas], axis=0) @@ -194,8 +135,6 @@ def video_class_predict_folder(videos_path, model, algorithm, v = splitting[-1] # Insert values to dataframe df = df.append({'File_name': v}, ignore_index=True) - for i in range(2): - features_all = [item for sublist in features_all for item in sublist] for i, class_name in enumerate(classes): df[class_name] = final_proba[:, i] # Save results to csv @@ -213,9 +152,58 @@ def video_class_predict_folder(videos_path, model, algorithm, f'belongs by {"{:.2%}".format(proba)} ' f'in {class_name} class') - return final_proba, classes, features_all, df + return final_proba, classes, df + +def clustering(videos_path, model, algorithm, + outfilename): + """ + Clustering process + :param videos_path: path to video directory of filename to be analyzed + :param model: path name of the model + :param algorithm: type of the modelling algorithm (e.g. SVM) + :param outfilename: output csv filename (only for input folder) + :return: + """ + final_proba = [] + final_df = create_dataframe(model.classes_) + + for movie in videos_path: + + f, c, df = video_class_predict_folder(movie, model, algorithm, + outfilename) + + final_proba.append(f) + final_df = final_df.append(df, ignore_index=True) + + print(f,c) + + final_df.to_csv(outfilename) + + final_proba = np.array(final_proba) + + print(final_proba) + + model = KMeans(n_clusters=len(model.classes_)) + model.fit(final_proba) + + yhat = model.predict(final_proba) + + with open("cluster_prediction.txt", "w") as output: + for movie,y in zip(videos_path,yhat): + print(f'{movie} : {y}',file = output) + + clusters = unique(yhat) + + for cluster in clusters: + # get row indexes for samples with this cluster + row_ix = where(yhat == cluster) + # create scatter of these samples + pyplot.scatter(final_proba[row_ix, 0], final_proba[row_ix, 1]) + # show the plot + pyplot.savefig('plot_clusters.png') + def main(): args = parse_arguments() videos_path = args.input_videos_path @@ -230,7 +218,7 @@ def main(): else: videos_path = videos_path[-1] print(videos_path) - f, c, _, _ = video_class_predict_folder(videos_path, model, algorithm, + f, c, _ = video_class_predict_folder(videos_path, model, algorithm, outfilename) print(f, c) From 56444a0a6942f3acbe9de7c5567d4cf193183eb4 Mon Sep 17 00:00:00 2001 From: apoman38 Date: Sat, 3 Apr 2021 10:20:29 +0300 Subject: [PATCH 4/7] check for corrupted videos --- analyze_visual/wrapper.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py index fc3520a..340690b 100644 --- a/analyze_visual/wrapper.py +++ b/analyze_visual/wrapper.py @@ -124,17 +124,20 @@ def video_class_predict_folder(videos_path, model, algorithm, video_files_list.extend(glob.glob(os.path.join(videos_path, files))) video_files_list = sorted(video_files_list) for v in video_files_list: - features_stats = process_video(v, 2, True, True, True) - features = features_stats[0] - features = features.reshape(1, -1) - probas, classes = video_class_predict(features, algorithm) - # Save the resuls in a numpy array - final_proba = np.append(final_proba, [probas], axis=0) - # Convert format of file names - splitting = v.split('/') - v = splitting[-1] - # Insert values to dataframe - df = df.append({'File_name': v}, ignore_index=True) + try: + features_stats = process_video(v, 2, True, True, True) + features = features_stats[0] + features = features.reshape(1, -1) + probas, classes = video_class_predict(features, algorithm) + # Save the resuls in a numpy array + final_proba = np.append(final_proba, [probas], axis=0) + # Convert format of file names + splitting = v.split('/') + v = splitting[-1] + # Insert values to dataframe + df = df.append({'File_name': v}, ignore_index=True) + except: + print('This video is corrupted') for i, class_name in enumerate(classes): df[class_name] = final_proba[:, i] # Save results to csv From 40e99c3eb1f08a4b8442b4fc0ba14a2abf558d29 Mon Sep 17 00:00:00 2001 From: Theodoros Giannakopoulos Date: Sun, 4 Apr 2021 22:08:21 +0300 Subject: [PATCH 5/7] minor --- analyze_visual/wrapper.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py index 340690b..03e0b3c 100644 --- a/analyze_visual/wrapper.py +++ b/analyze_visual/wrapper.py @@ -157,6 +157,7 @@ def video_class_predict_folder(videos_path, model, algorithm, return final_proba, classes, df + def clustering(videos_path, model, algorithm, outfilename): """ @@ -173,31 +174,24 @@ def clustering(videos_path, model, algorithm, for movie in videos_path: f, c, df = video_class_predict_folder(movie, model, algorithm, - outfilename) + outfilename) final_proba.append(f) final_df = final_df.append(df, ignore_index=True) - - print(f,c) + print(f, c) final_df.to_csv(outfilename) - final_proba = np.array(final_proba) - print(final_proba) - model = KMeans(n_clusters=len(model.classes_)) - model.fit(final_proba) - yhat = model.predict(final_proba) - with open("cluster_prediction.txt", "w") as output: - for movie,y in zip(videos_path,yhat): + for movie, y in zip(videos_path,yhat): print(f'{movie} : {y}',file = output) clusters = unique(yhat) - + print(clusters) for cluster in clusters: # get row indexes for samples with this cluster row_ix = where(yhat == cluster) @@ -206,7 +200,8 @@ def clustering(videos_path, model, algorithm, # show the plot pyplot.savefig('plot_clusters.png') - + + def main(): args = parse_arguments() videos_path = args.input_videos_path From 5f8e674e27361295c67352e189ac3bd2eed1d080 Mon Sep 17 00:00:00 2001 From: tyiannak Date: Mon, 5 Apr 2021 10:37:26 +0300 Subject: [PATCH 6/7] minor --- analyze_visual/wrapper.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py index 03e0b3c..db217d3 100644 --- a/analyze_visual/wrapper.py +++ b/analyze_visual/wrapper.py @@ -123,8 +123,13 @@ def video_class_predict_folder(videos_path, model, algorithm, for files in types: video_files_list.extend(glob.glob(os.path.join(videos_path, files))) video_files_list = sorted(video_files_list) + print(video_files_list) + count_processed = 0 for v in video_files_list: try: + # TODO remove long try-except statement + print(algorithm) + print(v) features_stats = process_video(v, 2, True, True, True) features = features_stats[0] features = features.reshape(1, -1) @@ -132,10 +137,11 @@ def video_class_predict_folder(videos_path, model, algorithm, # Save the resuls in a numpy array final_proba = np.append(final_proba, [probas], axis=0) # Convert format of file names - splitting = v.split('/') + splitting = v.split(os.sep) v = splitting[-1] # Insert values to dataframe df = df.append({'File_name': v}, ignore_index=True) + count_processed += 1 except: print('This video is corrupted') for i, class_name in enumerate(classes): @@ -158,8 +164,7 @@ def video_class_predict_folder(videos_path, model, algorithm, return final_proba, classes, df -def clustering(videos_path, model, algorithm, - outfilename): +def clustering(videos_path, model, algorithm, outfilename): """ Clustering process :param videos_path: path to video directory of filename to be analyzed From 4206996f1a875de3e5d70f2c715220f3533a5d45 Mon Sep 17 00:00:00 2001 From: tyiannak Date: Mon, 5 Apr 2021 12:10:23 +0300 Subject: [PATCH 7/7] add number of clusters as parameter --- analyze_visual/wrapper.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py index db217d3..fb41e97 100644 --- a/analyze_visual/wrapper.py +++ b/analyze_visual/wrapper.py @@ -43,6 +43,11 @@ def parse_arguments(): parser.add_argument("-o", "--output_file", required=True, nargs=None, help="Output file with results") + parser.add_argument("-c", "--num_of_clusters", required=False, nargs=None, + default=2, help="number of clusters used" + "(clustering is executed only if " + "number of directories is more than 1") + return parser.parse_args() @@ -164,13 +169,14 @@ def video_class_predict_folder(videos_path, model, algorithm, return final_proba, classes, df -def clustering(videos_path, model, algorithm, outfilename): +def clustering(videos_path, model, algorithm, outfilename, nclusters=2): """ Clustering process :param videos_path: path to video directory of filename to be analyzed :param model: path name of the model :param algorithm: type of the modelling algorithm (e.g. SVM) :param outfilename: output csv filename (only for input folder) + :param nclusters: number of clusters to use :return: """ final_proba = [] @@ -188,7 +194,7 @@ def clustering(videos_path, model, algorithm, outfilename): final_df.to_csv(outfilename) final_proba = np.array(final_proba) print(final_proba) - model = KMeans(n_clusters=len(model.classes_)) + model = KMeans(n_clusters=nclusters) model.fit(final_proba) yhat = model.predict(final_proba) with open("cluster_prediction.txt", "w") as output: @@ -212,17 +218,18 @@ def main(): videos_path = args.input_videos_path algorithm = args.model outfilename = args.output_file + nclusters = int(args.num_of_clusters) # Convert list of lists to a single list videos_path = [item for sublist in videos_path for item in sublist] model = load(open('shot_classifier_' + str(algorithm)+'.pkl', 'rb')) if (len(videos_path)) > 1: - clustering(videos_path, model, algorithm, outfilename) + clustering(videos_path, model, algorithm, outfilename, nclusters) else: videos_path = videos_path[-1] print(videos_path) f, c, _ = video_class_predict_folder(videos_path, model, algorithm, - outfilename) + outfilename) print(f, c)