Version 1.2.6:

- syntracker.py script can be executed now from every path. - Move the used R functions from SynTracker_functions.R file to SynTracker.R file to prevent problems if the main python script is executed not from the scripts directory. - Updated manual and README.
leylabmpi · Jun 28, 2024 · f7cc995 · f7cc995
1 parent a0ab0a1
commit f7cc995
Show file tree

Hide file tree

Showing 7 changed files with 169 additions and 301 deletions.
diff --git a/README.md b/README.md
@@ -84,7 +84,8 @@ python syntracker.py -target Sample_Data/Input_example/Target_genomes/ -ref Samp
 python syntracker.py -out SynTracker_output/ -mode continue
 ```
 
-2. Process all the reference genome again without repeating the blastDB building stage:
+2. Process all the reference genome again without repeating the blastDB building stage 
+(relevant only for datasets containing more than one reference genome):
 ```
 python syntracker.py -out SynTracker_output/ -mode continue_all_genomes
 ```

diff --git a/SynTracker_Manual.docx b/SynTracker_Manual.docx
diff --git a/SynTracker_Manual.pdf b/SynTracker_Manual.pdf
diff --git a/config.py b/config.py
@@ -1,5 +1,6 @@
 # Paths
-working_dir = ""
+scripts_dir = ""
+R_script = ""
 input_target_dir = ""  # Should be given by the user
 input_ref_dir = ""  # Should be given by the user
 output_dir = "Syntracker_output/"  # Default output dir

diff --git a/syntracker.py b/syntracker.py
@@ -13,8 +13,12 @@
 
 def main():
 
-    # Get the working directory
-    config.working_dir = os.getcwd()
+    # Get the scripts directory (it's not necessarily the working dir)
+    config.scripts_dir = os.path.dirname(os.path.abspath(__file__))
+    print("\nScript_dir: " + config.scripts_dir)
+
+    config.R_script = config.scripts_dir + "/syntracker_R_scripts/SynTracker.R"
+    print("R script path: " + config.R_script)
 
     # Parse the command-line arguments
     error = parser.parse_arguments()
@@ -412,46 +416,6 @@ def main():
                 logfile.close()
                 shutil.rmtree(genome_tmp_out_dir)
 
-        # In 'continue' mode, where only the BLAST stage of the current ref-genome was finished successfully
-        #else:
-            # Set the names of the folders that should hold the R per-genome output and intermediate files
-            #final_output_path = ref_genome_output_dir + config.final_output_dir
-            #r_temp_path = ref_genome_output_dir + config.r_temp_dir
-
-            # It can happen that the R process continue running successfully until the end, but detached from
-            # the parent python process, so it doesn't know that the R has finished (and wrote the results to the outfiles).
-            # If this is the case - there is no need to run the synteny scores calculation again for the current ref-genome.
-
-            # If the synteny calculation was finished successfully, the final_output directory with all the per-genome
-            # output files should exist and the R_temp folder should have been removed.
-            #if os.path.exists(final_output_path) is True and os.path.exists(r_temp_path) is False:
-
-                # Check the number of existing per-genome final output files
-                #outfiles_num = len(os.listdir(final_output_path))
-
-                # According to the number of output files - R has finished successfully
-                #if outfiles_num == len(config.subsampling_lengths) + 1:
-                    #print("\nFound synteny calculation output files - no need to run this stage again")
-                    #logfile = open(config.logfile_path, "a")
-                    #logfile.write("\nFound synteny calculation output files - no need to run this stage again\n")
-                    #logfile.close()
-                    #config.genomes_dict[ref_genome]['finished_R'] = 1
-                #else:
-                    #print("\nNumber of output files doesn't match the expected - run the synteny calculation again")
-                    #logfile = open(config.logfile_path, "a")
-                    #logfile.write("\nNumber of output files doesn't match the expected - "
-                     #             "run the synteny calculation again\n")
-                    #logfile.close()
-                    #config.genomes_dict[ref_genome]['finished_R'] = 0
-
-            # Probably R hasn't finished successfully -> run it again
-            #else:
-                #print("\nFound no output files of the synteny calculation - run it again")
-                #logfile = open(config.logfile_path, "a")
-                #logfile.write("\nFound no output files of the synteny calculation - run it again\n")
-                #logfile.close()
-                #config.genomes_dict[ref_genome]['finished_R'] = 0
-
         ####################################################################################
         # Step 3: Run synteny calculation using R
         if config.genomes_dict[ref_genome]['finished_R'] == 0:
@@ -515,7 +479,7 @@ def main():
             else:
                 metadata_file_path = config.metadata_file_path
 
-            command = "Rscript syntracker_R_scripts/SynTracker.R" + " " + ref_genome + " " + \
+            command = "Rscript " + config.R_script + " " + ref_genome + " " + \
                         config.sample_dictionary_table_path + " " + genome_blastdbcmd_out_dir + " " + \
                         final_output_path + " " + config.summary_output_path + " " + r_temp_path + " " + " " + \
                         intermediate_objects_path + " " + str(config.seed_num) + " " + str(config.avg_all) + " " + \
@@ -526,7 +490,7 @@ def main():
             logfile.close()
 
             try:
-                subprocess.run(["Rscript", "syntracker_R_scripts/SynTracker.R", ref_genome,
+                subprocess.run(["Rscript", config.R_script, ref_genome,
                                 config.sample_dictionary_table_path,
                                 genome_blastdbcmd_out_dir, final_output_path, config.summary_output_path,
                                 r_temp_path, intermediate_objects_path, str(config.seed_num), str(config.avg_all),

diff --git a/syntracker_R_scripts/SynTracker.R b/syntracker_R_scripts/SynTracker.R
@@ -5,7 +5,163 @@ library(parallel)
 library(tools)
 
 ###### functions file ######
-source("syntracker_R_scripts/SynTracker_functions.R")
+#source("syntracker_R_scripts/SynTracker_functions.R")
+
+############
+# fucntions
+###########
+
+# A function that combines two stages and is applied on each region:
+# First stage: Synteny analysis using DECIPHER - calculate all vs. all hits
+# Second stage: Calculate synteny score for each comparison
+synteny_analysis_per_region<-function(inpath, region_name, tmp_folder, intermediate_file_folder) {
+
+    # Define a data frame that will hold the synteny scores for all the comparisons in the region:
+    per_region_table<-data.frame("sample1" = character(),
+                               "sample2"= character(),
+                               "length1" = integer(),
+                               "length2" = integer() ,
+                               "overlap" = integer(), #accomulative length of overlapping regions
+                               "Blocks" = integer(), # number of synteny blocks per pairwise comparison
+                               "synteny_score" = integer(), stringsAsFactors = FALSE)
+
+
+    cat("\nRunning synteny analysis using Decipher for region ", region_name, sep = "\n" )
+    seqs <- readDNAStringSet(inpath)
+    db<-paste0(tmp_folder,region_name)
+    flag<-0
+    for (i in seq_along(seqs)) {
+        # The following line adds sequences to the DECIPHER DB:
+        ###########################
+        #   IMPORTANT:
+        # SPLITTING THE names(seqs[i]) yields the "sample.xxx" from the fasta header of the sequence - therefore only one sequence per sample will be used.
+        # If the str_split is removed, potentially more than one contig per sample can be compared - i.e,, substrains in the same sample.
+        Seqs2DB(seqs[i], "XStringSet", db, str_split(names(seqs[i]), "_")[[1]][1])
+        ###########################
+        flag<-flag+1
+    }
+
+    # only run DECIPHER analysis if we have more than 1 sequence matching this specific region.
+    if (flag>1) {
+
+        try_catch <- function(exprs) {!inherits(try(eval(exprs)), "try-error")}
+
+        # Synteny run for this region finished successfully and returned a synteny object
+        if (try_catch(synteny_object_region<-FindSynteny(db, maxSep=15, maxGap = 15))) {
+            cat("\nSynteny analysis for region", region_name, "finished successfully\n", sep = " " )
+
+        # Synteny analysis for this region returned an error - return an empty matrix
+        } else{
+            cat("\nSynteny analysis for region", region_name, "could not be completed\n", sep = " " )
+
+            # If the user asked to save intermediate objects - save the corrupted synteny object in an 'error' file
+            if(intermediate_file_folder != 'NA') {
+                error_file_name = paste0("DECIPHER_object_error_", region_name, ".rds")
+                saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, error_file_name))
+            }
+
+            unlink(db) # Remove the DECIPHER db file
+            return(per_region_table)
+        }
+
+    # Otherwise, return an emtpy matrix
+    } else {
+        cat("\nCannot perform synteny analysis for region", region_name, "- only one hit was found\n", sep = " " )
+        unlink(db) # Remove the DECIPHER db file
+        return(per_region_table)
+    }
+
+    # The synteny object was created but something is wrong and it contains no real information
+    if (nrow(synteny_object_region[2,1][[1]]) == 0){
+
+        # If the user asked to save intermediate objects - save the corrupted synteny object in an 'error' file
+        if(intermediate_file_folder != 'NA') {
+          error_file_name = paste0("DECIPHER_object_error_", region_name, ".rds")
+          saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, error_file_name))
+        }
+
+        unlink(db) # Remove the DECIPHER db file
+        return(per_region_table)
+
+    # Sunteny object is fine
+    } else{
+
+        # If the user asked to save intermediate objects - save the synteny object of the current region
+        if(intermediate_file_folder != 'NA') {
+          synteny_object_file_name = paste0("DECIPHER_object_", region_name, ".rds")
+          saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, synteny_object_file_name))
+        }
+    }
+
+    # Remove the DECIPHER db file
+    unlink(db)
+
+    # Continue to calculating the syntey scores only if the synteny analysis returned a valid object
+    cat("\nCalculating synteny scores for region ", region_name, sep = "\n" )
+
+    # for each two samples, create the values to be kept in the dataframe (assing to one row).
+    for (i in 2:ncol(synteny_object_region)) {
+        for (j in 1:(i-1)) {
+            sample1<-as.character(str_split(colnames(synteny_object_region)[j], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats !
+            sample2<-as.character(str_split(colnames(synteny_object_region)[i], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats !
+            length1<-synteny_object_region[j,j][[1]]
+            length2<-synteny_object_region[i,i][[1]]
+            overlap<-sum(synteny_object_region[j,i][[1]][,4])
+            blocks<-nrow(synteny_object_region[i,j][[1]])
+
+            # calculate the synteny score
+            if (length1>length2) {
+                syn_score<-1+log10((overlap/length2)/blocks)
+            }
+            else {
+                syn_score<-1+log10((overlap/length1)/blocks)
+            }
+            temprow<-data.frame(sample1,sample2,length1,length2,overlap,blocks,syn_score)
+
+            per_region_table<-rbind(per_region_table, temprow)
+        }
+    }
+
+    cat("\nSynteny scores calculation for region", region_name, "finished successfully\n", sep = " " )
+
+    # If the user asked to save intermediate objects - save the synteny scores object of the current region
+        if(intermediate_file_folder != 'NA') {
+          synteny_scores_object_file_name = paste0("synteny_scores_object_", region_name, ".rds")
+          saveRDS(per_region_table, file = paste0(intermediate_file_folder, synteny_scores_object_file_name))
+        }
+
+    return(per_region_table)
+}
+
+
+# add the names (ref_genome_header + position) to the dataframe
+add_names<-function(dfs, names) {
+  dfs %>% mutate(ref_genome_region=names)
+}
+
+
+# subsample_regions: a function to sample x regions per pairwise comparison and calculate APSS (Average Pairwise Synteny Scores)
+# inputs:
+# 1. big_organized_dfs = dataframe that holds per-region pairwise comparisons (multiple regions, multiple pairs)
+# 2. subsampling_value = how many regions/pairwise to sample.
+# output: dataframe with APSS values (col name is "average_score"), based on "subsampling_value" regions per pair. Pairs with <"subsampling_value" regions are filtererd out.
+subsample_regions<-function(big_organized_dfs, subsampling_value, set_seed_arg) {
+  if(set_seed_arg != 0) {
+    set.seed(set_seed_arg)
+    cat("Using set.seed for reproducable subsampling. Seed: ", set_seed_arg, sep = " " )
+    cat("\n\n")
+  }
+  newdf<-big_organized_dfs %>%
+    # pay attention to the grouping variable below - should match the groups specified in the "synteny_score" function
+    group_by(Sample1, Sample2) %>%
+    filter(n() > subsampling_value-1) %>%
+    sample_n(subsampling_value) %>% #subsample "subsampling_value" regions from each group
+    mutate(regions = n()) %>%
+    summarise(Average_score=mean(Synteny_score), Compared_regions=mean(regions)) %>%
+    #mutate(ref_genome=genome_name, .before = sample1)
+  return(newdf)
+}
+###########################################################################################################################
 
 # Getting the arguments from the python script
 args <- commandArgs(trailingOnly = TRUE)
@@ -131,4 +287,3 @@ if (avg_all_regions == 'True'){
 unlink(tmp_folder, recursive = T)
 
 cat("\nFinished synteny analysis for:", genome_name, sep = "\n" )
-