Added an option --avg_all, to create additional output table, where t…

…he APSS are calculated using all available regions of each pair of samples.
leylabmpi · Feb 22, 2024 · ebff5fb · ebff5fb
1 parent 1935cc7
commit ebff5fb
Show file tree

Hide file tree

Showing 7 changed files with 63 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -81,7 +81,7 @@ python syntracker.py [-h] [-target target_directory_path] [-ref ref_directory_pa
                      [-out output_directory_path] [-metadata metadata_file] 
                      [-mode 'new'/'continue'] [-cores number_of_cores] [-length region_length] 
                      [--identity blast_identity] [--coverage blast_coverage] 
-                     [--save_intermediate] [--no_seed]
+                     [--save_intermediate] [--no_seed] [--avg_all]
 
 options:
   -h, --help        show this help message and exit
@@ -123,6 +123,11 @@ options:
   --no_seed         Set no seed for the subsampling of n regions per pairwise (optional). 
                     This means that the average synteny scores may change between SynTracker runs due to the subsampling. 
                     By default, a seed=1 is set to enable reproducibility between different runs.
+ 
+  --avg_all   
+                    Create an additional output table with APSS (Average Pairwise Synteny Scores), 
+                    which are based on all the available regions per each pair of samples 
+                    (in addition to the output tables, based on the subsampling of n regions).                 
 ```
 
 ## Output
@@ -134,18 +139,25 @@ All the output files for a certain reference genome are located under the direct
 The table `[genome name]_synteny_scores_per_region.tab` contains the raw results obtained by the comparison of each two homologous genomic 
 regions in each two metagenomes in which they were detected (or genomes, if those are being compared).
 
-The second type of output tables, `[genome name]_avg_synteny_scores_[subsampling length].txt`, gives the APSS 
+The second type of output tables, `[genome name]_avg_synteny_scores_[subsampling length]_regions.txt`, gives the APSS 
 (Average Pairwise Synteny Score) that was calculated by subsampling N regions per pair of samples
 from the overall regions that appear in the raw table (detailed above). 
 By default, N equals to 20, 30, 40, 60, 80, 100, 200 regions per pair of samples.
 
+In case the user has applied the --avg_all option, an additional table, 
+named `[genome name]_avg_synteny_scores_all_regions.txt` is created too. In this table, the APSS are calculated 
+using all the available regions per each pair of samples. 
+
 #### Summary output (all genomes together):
 Syntracker also creates the same output tables mentioned above for all the references genomes combined together. 
 These summary output files are located under the directory `summary_output/`.
 
 The raw (per-region synteny scores) table is called `synteny_scores_per_region.tab`. 
 
-The tables containing the APSS in different subsampling lengths are called `avg_synteny_scores_[subsampling length].txt`.
+The tables containing the APSS in different subsampling lengths are called `avg_synteny_scores_[subsampling length]_regions.txt`.
+
+The table containing the APSS using all regions (in case of applying the --avg_all option) 
+is called `avg_synteny_scores_all_regions.txt`.
 
 #### Sample output:
 

diff --git a/SynTracker_Manual.docx b/SynTracker_Manual.docx
diff --git a/SynTracker_Manual.pdf b/SynTracker_Manual.pdf
diff --git a/config.py b/config.py
@@ -49,12 +49,14 @@
 
 # R - related parameters
 save_intermediate = False
-is_set_seed = True
+is_set_seed = True  # Whether to set a seed for the subsampling process (to have same results between different runs)
 seed_num = 1
 subsampling_lengths = [20, 30, 40, 60, 80, 100, 200]
 subsampled_regions_file_names = []
 for i in range(len(subsampling_lengths)):
-    subsampled_regions_file_names.append("avg_synteny_scores_" + str(subsampling_lengths[i]) + ".txt")
+    subsampled_regions_file_names.append("avg_synteny_scores_" + str(subsampling_lengths[i]) + "_regions.txt")
+avg_all = False  # Whether to add non-subsampled output (average all the regions per pair of samples)
+avg_all_file_name = "avg_synteny_scores_all_regions.txt"
 
 # Run related parameters
 running_mode = "new"  # Mode can be 'new' or 'continue'

diff --git a/syntracker.py b/syntracker.py
@@ -70,6 +70,8 @@ def main():
             out_param.write("\nSave intermediate: " + str(config.save_intermediate) + "\n")
         if config.is_set_seed is False:
             out_param.write("No seed\n")
+        if config.avg_all:
+            out_param.write("Average all regions\n")
 
         ############################################
         # Create a directory for the summary output (all genomes together)
@@ -93,6 +95,13 @@ def main():
             out_subsampled.write("\"Ref_genome\",\"Sample1\",\"Sample2\",\"Average_score\",\"Compared_regions\"\n")
             out_subsampled.close()
 
+        # If the user added the --avg_all option, create also a file for the average-all-regions output
+        if config.avg_all:
+            file_path = config.summary_output_path + config.avg_all_file_name
+            out_avg_all = open(file_path, "w")
+            out_avg_all.write("\"Ref_genome\",\"Sample1\",\"Sample2\",\"Average_score\",\"Compared_regions\"\n")
+            out_avg_all.close()
+
         ################################################################
         # Take care of the naming issues of the target genomes:
         # a. change sample names (i.e., assemblies/genomes) to Sample.xxx
@@ -444,16 +453,16 @@ def main():
             command = "Rscript syntracker_R_scripts/SynTracker.R" + " " + ref_genome + " " + \
                         config.sample_dictionary_table_path + " " + genome_blastdbcmd_out_dir + " " + \
                         final_output_path + " " + config.summary_output_path + " " + r_temp_path + " " + " " + \
-                        intermediate_objects_path + " " + str(config.seed_num) + " " + str(config.cpu_num) + " " + \
-                        metadata_file_path
+                        intermediate_objects_path + " " + str(config.seed_num) + " " + str(config.avg_all) + " " + \
+                        str(config.cpu_num) + " " + metadata_file_path
             print("\nRunning the following Rscript command:\n" + command + "\n")
 
             try:
                 subprocess.run(["Rscript", "syntracker_R_scripts/SynTracker.R", ref_genome,
                                 config.sample_dictionary_table_path,
                                 genome_blastdbcmd_out_dir, final_output_path, config.summary_output_path,
-                                r_temp_path, intermediate_objects_path, str(config.seed_num), str(config.cpu_num),
-                                metadata_file_path], check=True)
+                                r_temp_path, intermediate_objects_path, str(config.seed_num), str(config.avg_all),
+                                str(config.cpu_num), metadata_file_path], check=True)
             except subprocess.CalledProcessError as err:
                 print("\nThe following command has failed:")
                 print(command)

diff --git a/syntracker_R_scripts/SynTracker.R b/syntracker_R_scripts/SynTracker.R
@@ -17,8 +17,9 @@ output_summary_folder <- args[5] # The destination folder for the summary output
 tmp_folder <- args[6] # A temporary folder for the db files - should be deleted in the end
 intermediate_file_folder <- args[7] # If not empty - the folder for saving R intermediate objects (if empty - do not save them)
 set_seed_arg <- as.integer(args[8]) # an integer to set the seed (if 0 - do not use seed)
-core_number <- as.integer(args[9])
-metadata_file <- args[10] # If not empty - the path of the metadata file (if empty - there is no metadata)
+avg_all_regions <- args[9] # If it's True, add an output table without subsampling
+core_number <- as.integer(args[10])
+metadata_file <- args[11] # If not empty - the path of the metadata file (if empty - there is no metadata)
 
 ######################################################################################################
 
@@ -96,7 +97,7 @@ write.table(summary_organized_dfs, file=summary_table_path, sep=",", row.names =
 # filter and subsample regions
 ##################
 # find the maximal number of regions/pairwise-comparison:
-biggest_group<-max(big_organized_dfs %>%
+biggest_group<-max(big_organized_dfs_final %>%
                    group_by(sample1, sample2) %>%
                    mutate(regions = n()) %>%
                    ungroup() %>%
@@ -108,14 +109,28 @@ regions_sampled<-c(20,30,40,60,80,100,200)
 
 for (i in 1:length(regions_sampled)) {
     if(biggest_group >= regions_sampled[i]){
-        grouped_df<-as.data.frame(mapply(subsample_regions, list(big_organized_dfs), regions_sampled[i], set_seed_arg, SIMPLIFY = F))
-        write.table(grouped_df, file=paste(output_folder, genome_name, "_avg_synteny_scores_", regions_sampled[i], ".txt", sep=""), row.names=FALSE, sep=",")
+        grouped_df<-as.data.frame(mapply(subsample_regions, list(big_organized_dfs_final), regions_sampled[i], set_seed_arg, SIMPLIFY = F))
+
+        write.table(grouped_df, file=paste(output_folder, genome_name, "_avg_synteny_scores_", regions_sampled[i], "_regions.txt", sep=""), row.names=FALSE, sep=",")
         grouped_df_with_genome<-grouped_df %>% mutate(ref_genome=genome_name, .before = sample1)
-        write.table(grouped_df_with_genome, file=paste(output_summary_folder, "avg_synteny_scores_", regions_sampled[i], ".txt", sep=""),
+        write.table(grouped_df_with_genome, file=paste(output_summary_folder, "avg_synteny_scores_", regions_sampled[i], "_regions.txt", sep=""),
         row.names=FALSE, col.names=FALSE, append=TRUE, sep=",")
     }
 }
 
+# Create an additional output table without subsampling if the user has requested for it
+if (avg_all_regions == 'True'){
+    grouped_df_avg_all<-big_organized_dfs_final %>%
+    group_by(sample1, sample2) %>%
+    mutate(regions = n()) %>%
+    summarise(average_score=mean(syn_score), compared_regions=mean(regions))
+
+    write.table(grouped_df_avg_all, file=paste(output_folder, genome_name, "_avg_synteny_scores_all_regions.txt", sep=""), row.names=FALSE, sep=",")
+    grouped_df_all_with_genome<-grouped_df_avg_all %>% mutate(ref_genome=genome_name, .before = sample1)
+    write.table(grouped_df_all_with_genome, file=paste(output_summary_folder, "avg_synteny_scores_all_regions.txt", sep=""),
+    row.names=FALSE, col.names=FALSE, append=TRUE, sep=",")
+}
+
 unlink(tmp_folder, recursive = T)
 
 cat("\nFinished synteny analysis for:", genome_name, sep = "\n" )

diff --git a/syntracker_first_stage/parser.py b/syntracker_first_stage/parser.py
@@ -50,6 +50,10 @@ def parse_arguments():
                         "the average synteny scores may change between SynTracker runs. This is an optional parameter. "
                         "By default, a seed=1 is set to enable reproducibility between different runs.",
                         action='store_true', default=False)
+    parser.add_argument("--avg_all", help="Create an additional output table with APSS (Average Pairwise Synteny "
+                                          "Scores), which are based on all the available regions per each pair of "
+                                          "samples (in addition to the output tables, based on the subsampling of n "
+                                          "regions).", action='store_true', default=False)
 
     # Parse the given arguments
     args = parser.parse_args()
@@ -160,6 +164,9 @@ def parse_arguments():
         config.is_set_seed = False
         config.seed_num = 0
 
+    if args.avg_all:
+        config.avg_all = True
+
     return error
 
 
@@ -223,6 +230,9 @@ def read_conf_file():
                 config.is_set_seed = False
                 config.seed_num = 0
 
+            elif re.search("^Average all", line):
+                config.avg_all = True
+
             elif re.search("^Reference genomes:", line):
                 in_ref_genomes_list = 1