diff --git a/analyze_visual/wrapper.py b/analyze_visual/wrapper.py index cdfb384..fb41e97 100644 --- a/analyze_visual/wrapper.py +++ b/analyze_visual/wrapper.py @@ -11,12 +11,18 @@ """ import sys +import glob import argparse import os.path import numpy as np +from numpy import unique +from numpy import where import pandas as pd from pickle import load -import glob +from matplotlib import pyplot +from sklearn.preprocessing import MinMaxScaler +from sklearn.cluster import KMeans + sys.path.insert(0, '..') from analyze_visual.analyze_visual import process_video @@ -28,8 +34,8 @@ def parse_arguments(): "Predict shot class") parser.add_argument("-i", "--input_videos_path", - required=True, nargs=None, - help="Videos folder path") + action='append', nargs='+', + required=True, help="Videos folder path") parser.add_argument("-m", "--model", required=True, nargs=None, help="Model") @@ -37,6 +43,11 @@ def parse_arguments(): parser.add_argument("-o", "--output_file", required=True, nargs=None, help="Output file with results") + parser.add_argument("-c", "--num_of_clusters", required=False, nargs=None, + default=2, help="number of clusters used" + "(clustering is executed only if " + "number of directories is more than 1") + return parser.parse_args() @@ -82,6 +93,7 @@ def video_class_predict(features, algorithm): return probas, classes + def video_class_predict_folder(videos_path, model, algorithm, outfilename): @@ -96,6 +108,7 @@ def video_class_predict_folder(videos_path, model, algorithm, final_proba = np.empty((0, len(model.classes_))) df = create_dataframe(model.classes_) + if os.path.exists(str(videos_path) + ".txt"): os.remove(str(videos_path) + ".txt") if os.path.isfile(videos_path): @@ -115,24 +128,30 @@ def video_class_predict_folder(videos_path, model, algorithm, for files in types: video_files_list.extend(glob.glob(os.path.join(videos_path, files))) video_files_list = sorted(video_files_list) - + print(video_files_list) + count_processed = 0 for v in video_files_list: - features_stats = process_video(v, 2, True, True, True) - features = features_stats[0] - features = features.reshape(1, -1) - probas, classes = video_class_predict(features, algorithm) - # Save the resuls in a numpy array - final_proba = np.append(final_proba, [probas], axis=0) - - # Convert format of file names - splitting = v.split('/') - v = splitting[-1] - # Insert values to dataframe - df = df.append({'File_name': v}, ignore_index=True) - + try: + # TODO remove long try-except statement + print(algorithm) + print(v) + features_stats = process_video(v, 2, True, True, True) + features = features_stats[0] + features = features.reshape(1, -1) + probas, classes = video_class_predict(features, algorithm) + # Save the resuls in a numpy array + final_proba = np.append(final_proba, [probas], axis=0) + # Convert format of file names + splitting = v.split(os.sep) + v = splitting[-1] + # Insert values to dataframe + df = df.append({'File_name': v}, ignore_index=True) + count_processed += 1 + except: + print('This video is corrupted') for i, class_name in enumerate(classes): df[class_name] = final_proba[:, i] - # Save values to csv + # Save results to csv df.to_csv(outfilename) print(final_proba) @@ -147,7 +166,51 @@ def video_class_predict_folder(videos_path, model, algorithm, f'belongs by {"{:.2%}".format(proba)} ' f'in {class_name} class') - return final_proba, classes + return final_proba, classes, df + + +def clustering(videos_path, model, algorithm, outfilename, nclusters=2): + """ + Clustering process + :param videos_path: path to video directory of filename to be analyzed + :param model: path name of the model + :param algorithm: type of the modelling algorithm (e.g. SVM) + :param outfilename: output csv filename (only for input folder) + :param nclusters: number of clusters to use + :return: + """ + final_proba = [] + final_df = create_dataframe(model.classes_) + + for movie in videos_path: + + f, c, df = video_class_predict_folder(movie, model, algorithm, + outfilename) + + final_proba.append(f) + final_df = final_df.append(df, ignore_index=True) + print(f, c) + + final_df.to_csv(outfilename) + final_proba = np.array(final_proba) + print(final_proba) + model = KMeans(n_clusters=nclusters) + model.fit(final_proba) + yhat = model.predict(final_proba) + with open("cluster_prediction.txt", "w") as output: + for movie, y in zip(videos_path,yhat): + print(f'{movie} : {y}',file = output) + + clusters = unique(yhat) + print(clusters) + for cluster in clusters: + # get row indexes for samples with this cluster + row_ix = where(yhat == cluster) + # create scatter of these samples + pyplot.scatter(final_proba[row_ix, 0], final_proba[row_ix, 1]) + + # show the plot + pyplot.savefig('plot_clusters.png') def main(): @@ -155,11 +218,19 @@ def main(): videos_path = args.input_videos_path algorithm = args.model outfilename = args.output_file + nclusters = int(args.num_of_clusters) + # Convert list of lists to a single list + videos_path = [item for sublist in videos_path for item in sublist] model = load(open('shot_classifier_' + str(algorithm)+'.pkl', 'rb')) - f, c = video_class_predict_folder(videos_path, model, algorithm, - outfilename) - print(f, c) + if (len(videos_path)) > 1: + clustering(videos_path, model, algorithm, outfilename, nclusters) + else: + videos_path = videos_path[-1] + print(videos_path) + f, c, _ = video_class_predict_folder(videos_path, model, algorithm, + outfilename) + print(f, c)