treat variance threshold as the corresponding percentile value and expose to the user for variance filtering option
borauyar committed May 30, 2024
1 parent 4dfb151 commit 53dee25
Showing 2 changed files with 6 additions and 4 deletions.
4 changes: 3 additions & 1 deletion flexynesis/__main__.py
@@ -33,11 +33,12 @@ def main():
parser.add_argument("--fusion_type", help="How to fuse the omics layers", type=str, choices=["early", "intermediate"], default = 'intermediate')
parser.add_argument("--hpo_iter", help="Number of iterations for hyperparameter optimisation", type=int, default = 5)
parser.add_argument("--finetuning_samples", help="Number of samples from the test dataset to use for fine-tuning the model. Set to 0 to disable fine-tuning", type=int, default = 0)
parser.add_argument("--variance_threshold", help="Variance threshold (as percentile) to drop low variance features (default: 1; set to 0 for no variance filtering)", type=float, default = 1)
parser.add_argument("--correlation_threshold", help="Correlation threshold to drop highly redundant features (default: 0.8; set to 1 for no redundancy filtering)", type=float, default = 0.8)
parser.add_argument("--restrict_to_features", help="Restrict the analyis to the list of features provided by the user (default: None)", type = str, default = None)
parser.add_argument("--subsample", help="Downsample training set to randomly drawn N samples for training. Disabled when set to 0", type=int, default = 0)
parser.add_argument("--features_min", help="Minimum number of features to retain after feature selection", type=int, default = 500)
parser.add_argument("--features_top_percentile", help="Top percentile features to retain after feature selection", type=float, default = 20)
parser.add_argument("--features_top_percentile", help="Top percentile features (among the features remaining after variance filtering and data cleanup to retain after feature selection", type=float, default = 20)
parser.add_argument("--data_types", help="(Required) Which omic data matrices to work on, comma-separated: e.g. 'gex,cnv'", type=str, required = True)
parser.add_argument("--input_layers",
help="If model_class is set to CrossModalPred, choose which data types to use as input/encoded layers"
@@ -182,6 +183,7 @@ class AvailableModels(NamedTuple):
 data_types = datatypes,
 concatenate = concatenate,
 log_transform = args.log_transform == 'True',
+variance_threshold = args.variance_threshold/100,
 correlation_threshold = args.correlation_threshold,
 restrict_to_features = args.restrict_to_features,
 min_features= args.features_min,
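For context, a minimal sketch of the conversion introduced on the CLI side, assuming the user supplies the threshold as a percentile between 0 and 100 (the variable names below are illustrative, not the exact flexynesis code):

# --variance_threshold is read as a percentile (default 1; 0 disables variance
# filtering) and converted to a quantile fraction in [0, 1] before being passed
# on to DataImporter as variance_threshold.
cli_value = 1.0                       # e.g. --variance_threshold 1
variance_threshold = cli_value / 100  # 0.01, i.e. the 1st percentile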
6 changes: 3 additions & 3 deletions flexynesis/data.py
@@ -97,7 +97,7 @@ class DataImporter:
"""

def __init__(self, path, data_types, processed_dir="processed", log_transform = False, concatenate = False, restrict_to_features = None, min_features=None,
top_percentile=20, correlation_threshold = 0.9, variance_threshold=1e-5, na_threshold=0.1,
top_percentile=20, correlation_threshold = 0.9, variance_threshold=0.01, na_threshold=0.1,
graph=None, string_organism=9606, string_node_name="gene_name", transform=None, downsample=0):
self.path = path
self.data_types = data_types
@@ -350,9 +350,9 @@ def cleanup_data(self, df_dict):

 # Filter based on both variance and NA percentage thresholds
 # Identify features that meet both criteria
-df = df.loc[(feature_variances > self.variance_threshold) & (na_percentages < self.na_threshold)]
+df = df.loc[(feature_variances > feature_variances.quantile(self.variance_threshold)) & (na_percentages < self.na_threshold)]
 # set selected features to True
-log_df['selected'] = (log_df['variance'] > self.variance_threshold) & (log_df['na_percent'] < self.na_threshold)
+log_df['selected'] = (log_df['variance'] > feature_variances.quantile(self.variance_threshold)) & (log_df['na_percent'] < self.na_threshold)
 feature_logs[key] = log_df
 
 # Step 3: Fill NA values with the median of the feature
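For illustration, a minimal sketch of the percentile-based filter that cleanup_data now applies, assuming a pandas DataFrame with features as rows and samples as columns (the toy data and variable names are illustrative, not the actual flexynesis code):

import pandas as pd

# Toy feature matrix: rows are features, columns are samples.
df = pd.DataFrame(
    {"s1": [1.0, 5.0, 2.0, 0.0],
     "s2": [1.1, 9.0, 2.0, 0.0],
     "s3": [0.9, 1.0, 2.0, 0.0]},
    index=["f1", "f2", "f3", "f4"],
)

variance_threshold = 0.01  # --variance_threshold 1, divided by 100 on the CLI side
na_threshold = 0.1         # drop features with 10% or more missing values

feature_variances = df.var(axis=1)
na_percentages = df.isna().mean(axis=1)

# Keep features whose variance exceeds the chosen percentile of all feature
# variances, rather than comparing against an absolute variance cutoff.
cutoff = feature_variances.quantile(variance_threshold)
df = df.loc[(feature_variances > cutoff) & (na_percentages < na_threshold)]

The practical effect is that the cutoff adapts to the observed distribution of feature variances in each matrix, instead of relying on a fixed absolute value such as the previous 1e-5 default.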
