treat variance threshold as the corresponding percentile value and expose to the user for variance filtering option
borauyar committed May 30, 2024
1 parent 4dfb151 commit 53dee25
Showing 2 changed files with 6 additions and 4 deletions.
4 changes: 3 additions & 1 deletion flexynesis/__main__.py
@@ -33,11 +33,12 @@ def main():
parser.add_argument("--fusion_type", help="How to fuse the omics layers", type=str, choices=["early", "intermediate"], default = 'intermediate')
parser.add_argument("--hpo_iter", help="Number of iterations for hyperparameter optimisation", type=int, default = 5)
parser.add_argument("--finetuning_samples", help="Number of samples from the test dataset to use for fine-tuning the model. Set to 0 to disable fine-tuning", type=int, default = 0)
parser.add_argument("--variance_threshold", help="Variance threshold (as percentile) to drop low variance features (default: 1; set to 0 for no variance filtering)", type=float, default = 1)
parser.add_argument("--correlation_threshold", help="Correlation threshold to drop highly redundant features (default: 0.8; set to 1 for no redundancy filtering)", type=float, default = 0.8)
parser.add_argument("--restrict_to_features", help="Restrict the analyis to the list of features provided by the user (default: None)", type = str, default = None)
parser.add_argument("--subsample", help="Downsample training set to randomly drawn N samples for training. Disabled when set to 0", type=int, default = 0)
parser.add_argument("--features_min", help="Minimum number of features to retain after feature selection", type=int, default = 500)
parser.add_argument("--features_top_percentile", help="Top percentile features to retain after feature selection", type=float, default = 20)
parser.add_argument("--features_top_percentile", help="Top percentile features (among the features remaining after variance filtering and data cleanup to retain after feature selection", type=float, default = 20)
parser.add_argument("--data_types", help="(Required) Which omic data matrices to work on, comma-separated: e.g. 'gex,cnv'", type=str, required = True)
parser.add_argument("--input_layers",
help="If model_class is set to CrossModalPred, choose which data types to use as input/encoded layers"
@@ -182,6 +183,7 @@ class AvailableModels(NamedTuple):
 data_types = datatypes,
 concatenate = concatenate,
 log_transform = args.log_transform == 'True',
+variance_threshold = args.variance_threshold/100,
 correlation_threshold = args.correlation_threshold,
 restrict_to_features = args.restrict_to_features,
 min_features= args.features_min,
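For context, a minimal sketch of the conversion introduced on the CLI side, assuming the user supplies the threshold as a percentile between 0 and 100 (the variable names below are illustrative, not the exact flexynesis code):

# --variance_threshold is read as a percentile (default 1; 0 disables variance
# filtering) and converted to a quantile fraction in [0, 1] before being passed
# on to DataImporter as variance_threshold.
cli_value = 1.0                       # e.g. --variance_threshold 1
variance_threshold = cli_value / 100  # 0.01, i.e. the 1st percentile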
6 changes: 3 additions & 3 deletions flexynesis/data.py
@@ -97,7 +97,7 @@ class DataImporter:
"""

def __init__(self, path, data_types, processed_dir="processed", log_transform = False, concatenate = False, restrict_to_features = None, min_features=None,
top_percentile=20, correlation_threshold = 0.9, variance_threshold=1e-5, na_threshold=0.1,
top_percentile=20, correlation_threshold = 0.9, variance_threshold=0.01, na_threshold=0.1,
graph=None, string_organism=9606, string_node_name="gene_name", transform=None, downsample=0):
self.path = path
self.data_types = data_types
@@ -350,9 +350,9 @@ def cleanup_data(self, df_dict):

 # Filter based on both variance and NA percentage thresholds
 # Identify features that meet both criteria
-df = df.loc[(feature_variances > self.variance_threshold) & (na_percentages < self.na_threshold)]
+df = df.loc[(feature_variances > feature_variances.quantile(self.variance_threshold)) & (na_percentages < self.na_threshold)]
 # set selected features to True
-log_df['selected'] = (log_df['variance'] > self.variance_threshold) & (log_df['na_percent'] < self.na_threshold)
+log_df['selected'] = (log_df['variance'] > feature_variances.quantile(self.variance_threshold)) & (log_df['na_percent'] < self.na_threshold)
 feature_logs[key] = log_df
 
 # Step 3: Fill NA values with the median of the feature
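For illustration, a minimal sketch of the percentile-based filter that cleanup_data now applies, assuming a pandas DataFrame with features as rows and samples as columns (the toy data and variable names are illustrative, not the actual flexynesis code):

import pandas as pd

# Toy feature matrix: rows are features, columns are samples.
df = pd.DataFrame(
    {"s1": [1.0, 5.0, 2.0, 0.0],
     "s2": [1.1, 9.0, 2.0, 0.0],
     "s3": [0.9, 1.0, 2.0, 0.0]},
    index=["f1", "f2", "f3", "f4"],
)

variance_threshold = 0.01  # --variance_threshold 1, divided by 100 on the CLI side
na_threshold = 0.1         # drop features with 10% or more missing values

feature_variances = df.var(axis=1)
na_percentages = df.isna().mean(axis=1)

# Keep features whose variance exceeds the chosen percentile of all feature
# variances, rather than comparing against an absolute variance cutoff.
cutoff = feature_variances.quantile(variance_threshold)
df = df.loc[(feature_variances > cutoff) & (na_percentages < na_threshold)]

The practical effect is that the cutoff adapts to the observed distribution of feature variances in each matrix, instead of relying on a fixed absolute value such as the previous 1e-5 default.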
