From e50d28bde9aea06344ab8a06dd1147fa6bab531f Mon Sep 17 00:00:00 2001
From: Benjamin Cretois
Date: Thu, 7 Mar 2024 13:02:49 +0100
Subject: [PATCH] [FIX] n_subsample for the POS samples

---
 evaluate/_utils_writing.py |  10 ++--
 evaluate/evaluateDCASE.py  | 102 ++++++++++++++++---------------------
 2 files changed, 50 insertions(+), 62 deletions(-)

diff --git a/evaluate/_utils_writing.py b/evaluate/_utils_writing.py
index e9d8dc6..a651900 100644
--- a/evaluate/_utils_writing.py
+++ b/evaluate/_utils_writing.py
@@ -136,7 +136,12 @@ def write_wav(
     )
     wavfile.write(output, target_fs, result_wav.T)
 
-def plot_2_d_representation():
+def plot_2_d_representation(prototypes,
+                            z_pos_supports,
+                            z_neg_supports,
+                            q_embeddings,
+                            labels,
+                            output):
     import numpy as np
     import matplotlib.pyplot as plt
     from sklearn.manifold import TSNE
@@ -180,9 +185,6 @@ def plot_2_d_representation():
     plt.ylabel('Dimension 2')
     plt.grid(True)
 
-    fig_name = os.path.basename(support_spectrograms).split("data_")[1].split(".")[0] + ".png"
-    output = os.path.join(target_path, fig_name)
-
     # Save the figure
     plt.savefig(output, bbox_inches="tight")
     plt.show()
\ No newline at end of file
diff --git a/evaluate/evaluateDCASE.py b/evaluate/evaluateDCASE.py
index 5400c5d..6efe911 100644
--- a/evaluate/evaluateDCASE.py
+++ b/evaluate/evaluateDCASE.py
@@ -26,7 +26,7 @@
 
 from statsmodels.distributions.empirical_distribution import ECDF
 
-from evaluate._utils_writing import write_wav, write_results
+from evaluate._utils_writing import write_wav, write_results, plot_2_d_representation
 from evaluate._utils_compute import (to_dataframe, get_proto_coordinates, calculate_distance,
                                      compute_scores, merge_preds, reshape_support, training,
                                      predict_labels_query, filter_outliers_by_p_values)
@@ -88,12 +88,13 @@ def compute(
     #################################################
     print("GETTING THE DISTRIBUTION OF THE POS SUPPORT SAMPLES")
     support_samples_pos = df_support[df_support["category"] == "POS"]["feature"].to_numpy()
-    support_samples_pos = reshape_support(support_samples_pos, tensor_length=cfg["data"]["tensor_length"])
+    support_samples_pos = reshape_support(support_samples_pos,
+                                          tensor_length=cfg["data"]["tensor_length"],
+                                          n_subsample=cfg["predict"]["n_subsample"])
     z_pos_supports, _ = model.get_embeddings(support_samples_pos, padding_mask=None)
 
     _, d_supports_to_POS_prototypes = calculate_distance(model_type, z_pos_supports, prototypes[pos_index])
 
-    print(f"DISTANCE TO POS = {d_supports_to_POS_prototypes}")
     ecdf = ECDF(d_supports_to_POS_prototypes.detach().numpy())
 
     ######################################
@@ -134,7 +135,36 @@ def compute(
         pos_index=pos_index,
     )
 
+    # GET THE PVALUES
+    p_values_pos = 1 - ecdf(distances_to_pos)
+
+    if cfg["predict"]["filter_by_p_values"]:
+        predicted_labels = filter_outliers_by_p_values(predicted_labels, p_values_pos, target_class=1, upper_threshold=0.05)
+
     if n_self_detected_supports > 0:
+        #######################
+        # NEW SELF_SUPERVISED #
+        #######################
+
+        # Take all the queries with a pvalue of 1
+
+
+        # Get some random samples with pvalue of 0
+
+
+        # update custom_dcasedatamodule
+        custom_dcasedatamodule = DCASEDataModule(
+            data_frame=df_support_extended,
+            tensor_length=cfg["data"]["tensor_length"],
+            n_shot=3 + n_self_detected_supports,
+            n_query=2,
+            n_subsample=cfg["data"]["n_subsample"],
+        )
+
+
+        #######################
+        # OLD SELF SUPERVISED #
+        #######################
         # find n best predictions
         n_best_ind = np.argpartition(distances_to_pos, -n_self_detected_supports)[
             -n_self_detected_supports:
@@ -154,6 +184,11 @@ def compute(
         df_support_extended = df_support_extended.append(
             df_extension_neg, ignore_index=True
         )
+
+        ##########################
+        # KEEP THIS UPDATE PART! #
+        ##########################
+
         # update custom_dcasedatamodule
         custom_dcasedatamodule = DCASEDataModule(
             data_frame=df_support_extended,
@@ -201,63 +236,14 @@ def compute(
     # PLOT PROTOTYPES AND EMBEDDINGS IN A 2D SPACE #
     ################################################
     if cfg["plot"]["tsne"]:
-
-        import numpy as np
-        import matplotlib.pyplot as plt
-        from sklearn.manifold import TSNE
-
-        # Assuming `prototypes`, `z_pos_supports`, `z_neg_supports`, `q_embeddings`, and `labels` are already defined
-        # Convert tensors to numpy arrays if they are in tensor format
-        # e.g., z_pos_supports = z_pos_supports.detach().numpy()
-
-        # Create a labels array for all points
-        # Label for prototypes, positive supports, negative supports, and query embeddings respectively
-        prototypes_labels = np.array([2] * prototypes.shape[0])  # Assuming 2 is not used in `gt_labels`
-        pos_supports_labels = np.array([3] * z_pos_supports.shape[0])  # Assuming 3 is not used in `gt_labels`
-        neg_supports_labels = np.array([4] * z_neg_supports.shape[0])  # Assuming 4 is not used in `gt_labels`
-        q_embeddings = q_embeddings.detach().numpy()
-        gt_labels = labels.detach().numpy()
-
-        # Concatenate everything into one dataset
-        feat = np.concatenate([prototypes, z_pos_supports, z_neg_supports, q_embeddings])
-        all_labels = np.concatenate([prototypes_labels, pos_supports_labels, neg_supports_labels, gt_labels])
-
-        # Run t-SNE
-        tsne = TSNE(n_components=2, perplexity=30)
-        features_2d = tsne.fit_transform(feat)
-
-        # Plot
-        plt.figure(figsize=(10, 8))
-        # Define marker for each type of point
-        markers = {2: "P", 3: "o", 4: "X"}  # P for prototypes, o for supports, X for negative supports
-
-        for label in np.unique(all_labels):
-            # Plot each class with its own color and marker
-            idx = np.where(all_labels == label)
-            if label in markers:  # Prototypes or supports
-                plt.scatter(features_2d[idx, 0], features_2d[idx, 1], label=label, alpha=1.0, marker=markers[label], s=100)  # Larger size
-            else:  # Query embeddings
-                plt.scatter(features_2d[idx, 0], features_2d[idx, 1], label=label, alpha=0.5, s=50)  # Smaller size, more transparent
-
-        plt.legend()
-        plt.title('t-SNE visualization of embeddings, prototypes, and supports')
-        plt.xlabel('Dimension 1')
-        plt.ylabel('Dimension 2')
-        plt.grid(True)
-
         fig_name = os.path.basename(support_spectrograms).split("data_")[1].split(".")[0] + ".png"
         output = os.path.join(target_path, fig_name)
-
-        # Save the figure
-        plt.savefig(output, bbox_inches="tight")
-        plt.show()
-
-
-    # GET THE PVALUES
-    p_values_pos = 1 - ecdf(distances_to_pos)
-
-    if cfg["predict"]["filter_by_p_values"]:
-        predicted_labels = filter_outliers_by_p_values(predicted_labels, p_values_pos, target_class=1, upper_threshold=0.05)
+        plot_2_d_representation(prototypes,
+                                z_pos_supports,
+                                z_neg_supports,
+                                q_embeddings,
+                                labels,
+                                output)
 
     # Compute the scores for the analysed file -- just as information
     acc, recall, precision, f1score = compute_scores(
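
Note on the p-value gating that this patch moves ahead of the self-training block: the ECDF is fitted on the distances between the POS support embeddings and the POS prototype, each query distance is turned into a p-value, and positive predictions whose p-value falls below the threshold are dropped. The sketch below is a minimal stand-in for that logic; the real helper is filter_outliers_by_p_values in evaluate/_utils_compute.py (not shown in this patch), and filter_by_p_values_sketch is a hypothetical name.

    # Minimal sketch, not the repository's filter_outliers_by_p_values.
    import numpy as np
    from statsmodels.distributions.empirical_distribution import ECDF

    def filter_by_p_values_sketch(predicted_labels, distances_to_pos,
                                  d_supports_to_pos, upper_threshold=0.05):
        # Distances from the POS support embeddings to the POS prototype
        # define the reference distribution.
        ecdf = ECDF(d_supports_to_pos)
        # Queries lying much further from the prototype than the supports do
        # get a small p-value and are treated as outliers.
        p_values_pos = 1 - ecdf(distances_to_pos)
        predicted_labels = np.asarray(predicted_labels).copy()
        predicted_labels[(predicted_labels == 1) & (p_values_pos < upper_threshold)] = 0
        return predicted_labels

The reference distribution is only as representative as the support windows fed into model.get_embeddings, which is where the n_subsample fix below comes in.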
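
The core fix is forwarding cfg["predict"]["n_subsample"] to reshape_support, so the POS supports used to build the ECDF are windowed and subsampled according to the config rather than reshape_support's default. reshape_support itself lives in evaluate/_utils_compute.py and is not part of this patch; the toy stand-in below only illustrates one plausible reading (split each support clip into tensor_length-frame windows and keep at most n_subsample windows per clip), and reshape_support_sketch is a hypothetical name.

    # Toy stand-in, NOT the repository's reshape_support.
    import numpy as np
    import torch

    def reshape_support_sketch(support_samples, tensor_length, n_subsample=None, seed=0):
        rng = np.random.default_rng(seed)
        windows = []
        for sample in support_samples:
            sample = np.asarray(sample)
            n_windows = sample.shape[0] // tensor_length
            if n_windows == 0:
                continue  # clip shorter than one window; skipped in this sketch
            chunks = [sample[i * tensor_length:(i + 1) * tensor_length]
                      for i in range(n_windows)]
            if n_subsample is not None and len(chunks) > n_subsample:
                keep = rng.choice(len(chunks), size=n_subsample, replace=False)
                chunks = [chunks[i] for i in keep]
            windows.extend(chunks)
        return torch.tensor(np.stack(windows), dtype=torch.float32)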
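
The NEW SELF_SUPERVISED block is still a placeholder: only its two comments ("Take all the queries with a pvalue of 1", "Get some random samples with pvalue of 0") and the DCASEDataModule update are in place. One way those comments could be filled in is sketched below, assuming df_query holds the query rows with the same columns as df_support; extend_supports_by_p_value is a hypothetical helper, not code from the repository.

    # Hypothetical sketch of the placeholder steps in the NEW SELF_SUPERVISED block.
    import numpy as np
    import pandas as pd

    def extend_supports_by_p_value(df_support, df_query, p_values_pos, n_neg=5, seed=0):
        rng = np.random.default_rng(seed)

        # Queries with a p-value of 1 are indistinguishable from the POS supports
        # under the ECDF, so promote them to new POS supports.
        pos_idx = np.where(p_values_pos == 1.0)[0]
        df_extension_pos = df_query.iloc[pos_idx].assign(category="POS")

        # A few random queries with a p-value of 0 are clear outliers and can
        # serve as additional NEG supports.
        neg_pool = np.where(p_values_pos == 0.0)[0]
        if len(neg_pool) > 0:
            neg_idx = rng.choice(neg_pool, size=min(n_neg, len(neg_pool)), replace=False)
            df_extension_neg = df_query.iloc[neg_idx].assign(category="NEG")
        else:
            df_extension_neg = df_query.iloc[[]]

        return pd.concat([df_support, df_extension_pos, df_extension_neg],
                         ignore_index=True)

The resulting frame would then feed the DCASEDataModule call already present in that block (n_shot=3 + n_self_detected_supports, n_query=2, n_subsample=cfg["data"]["n_subsample"]).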