diff --git a/README.md b/README.md index 2df8def..05ccbe6 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,8 @@ python syntracker.py -target Sample_Data/Input_example/Target_genomes/ -ref Samp python syntracker.py -out SynTracker_output/ -mode continue ``` -2. Process all the reference genome again without repeating the blastDB building stage: +2. Process all the reference genome again without repeating the blastDB building stage +(relevant only for datasets containing more than one reference genome): ``` python syntracker.py -out SynTracker_output/ -mode continue_all_genomes ``` diff --git a/SynTracker_Manual.docx b/SynTracker_Manual.docx index 4a60f30..a1ff522 100644 Binary files a/SynTracker_Manual.docx and b/SynTracker_Manual.docx differ diff --git a/SynTracker_Manual.pdf b/SynTracker_Manual.pdf index 9a6e1b8..fe25704 100644 Binary files a/SynTracker_Manual.pdf and b/SynTracker_Manual.pdf differ diff --git a/config.py b/config.py index e57922d..fc46cec 100644 --- a/config.py +++ b/config.py @@ -1,5 +1,6 @@ # Paths -working_dir = "" +scripts_dir = "" +R_script = "" input_target_dir = "" # Should be given by the user input_ref_dir = "" # Should be given by the user output_dir = "Syntracker_output/" # Default output dir diff --git a/syntracker.py b/syntracker.py index 37b9347..b4b6e85 100755 --- a/syntracker.py +++ b/syntracker.py @@ -13,8 +13,12 @@ def main(): - # Get the working directory - config.working_dir = os.getcwd() + # Get the scripts directory (it's not necessarily the working dir) + config.scripts_dir = os.path.dirname(os.path.abspath(__file__)) + print("\nScript_dir: " + config.scripts_dir) + + config.R_script = config.scripts_dir + "/syntracker_R_scripts/SynTracker.R" + print("R script path: " + config.R_script) # Parse the command-line arguments error = parser.parse_arguments() @@ -412,46 +416,6 @@ def main(): logfile.close() shutil.rmtree(genome_tmp_out_dir) - # In 'continue' mode, where only the BLAST stage of the current ref-genome was finished successfully - #else: - # Set the names of the folders that should hold the R per-genome output and intermediate files - #final_output_path = ref_genome_output_dir + config.final_output_dir - #r_temp_path = ref_genome_output_dir + config.r_temp_dir - - # It can happen that the R process continue running successfully until the end, but detached from - # the parent python process, so it doesn't know that the R has finished (and wrote the results to the outfiles). - # If this is the case - there is no need to run the synteny scores calculation again for the current ref-genome. - - # If the synteny calculation was finished successfully, the final_output directory with all the per-genome - # output files should exist and the R_temp folder should have been removed. - #if os.path.exists(final_output_path) is True and os.path.exists(r_temp_path) is False: - - # Check the number of existing per-genome final output files - #outfiles_num = len(os.listdir(final_output_path)) - - # According to the number of output files - R has finished successfully - #if outfiles_num == len(config.subsampling_lengths) + 1: - #print("\nFound synteny calculation output files - no need to run this stage again") - #logfile = open(config.logfile_path, "a") - #logfile.write("\nFound synteny calculation output files - no need to run this stage again\n") - #logfile.close() - #config.genomes_dict[ref_genome]['finished_R'] = 1 - #else: - #print("\nNumber of output files doesn't match the expected - run the synteny calculation again") - #logfile = open(config.logfile_path, "a") - #logfile.write("\nNumber of output files doesn't match the expected - " - # "run the synteny calculation again\n") - #logfile.close() - #config.genomes_dict[ref_genome]['finished_R'] = 0 - - # Probably R hasn't finished successfully -> run it again - #else: - #print("\nFound no output files of the synteny calculation - run it again") - #logfile = open(config.logfile_path, "a") - #logfile.write("\nFound no output files of the synteny calculation - run it again\n") - #logfile.close() - #config.genomes_dict[ref_genome]['finished_R'] = 0 - #################################################################################### # Step 3: Run synteny calculation using R if config.genomes_dict[ref_genome]['finished_R'] == 0: @@ -515,7 +479,7 @@ def main(): else: metadata_file_path = config.metadata_file_path - command = "Rscript syntracker_R_scripts/SynTracker.R" + " " + ref_genome + " " + \ + command = "Rscript " + config.R_script + " " + ref_genome + " " + \ config.sample_dictionary_table_path + " " + genome_blastdbcmd_out_dir + " " + \ final_output_path + " " + config.summary_output_path + " " + r_temp_path + " " + " " + \ intermediate_objects_path + " " + str(config.seed_num) + " " + str(config.avg_all) + " " + \ @@ -526,7 +490,7 @@ def main(): logfile.close() try: - subprocess.run(["Rscript", "syntracker_R_scripts/SynTracker.R", ref_genome, + subprocess.run(["Rscript", config.R_script, ref_genome, config.sample_dictionary_table_path, genome_blastdbcmd_out_dir, final_output_path, config.summary_output_path, r_temp_path, intermediate_objects_path, str(config.seed_num), str(config.avg_all), diff --git a/syntracker_R_scripts/SynTracker.R b/syntracker_R_scripts/SynTracker.R index 3a8bd09..76b9470 100644 --- a/syntracker_R_scripts/SynTracker.R +++ b/syntracker_R_scripts/SynTracker.R @@ -5,7 +5,163 @@ library(parallel) library(tools) ###### functions file ###### -source("syntracker_R_scripts/SynTracker_functions.R") +#source("syntracker_R_scripts/SynTracker_functions.R") + +############ +# fucntions +########### + +# A function that combines two stages and is applied on each region: +# First stage: Synteny analysis using DECIPHER - calculate all vs. all hits +# Second stage: Calculate synteny score for each comparison +synteny_analysis_per_region<-function(inpath, region_name, tmp_folder, intermediate_file_folder) { + + # Define a data frame that will hold the synteny scores for all the comparisons in the region: + per_region_table<-data.frame("sample1" = character(), + "sample2"= character(), + "length1" = integer(), + "length2" = integer() , + "overlap" = integer(), #accomulative length of overlapping regions + "Blocks" = integer(), # number of synteny blocks per pairwise comparison + "synteny_score" = integer(), stringsAsFactors = FALSE) + + + cat("\nRunning synteny analysis using Decipher for region ", region_name, sep = "\n" ) + seqs <- readDNAStringSet(inpath) + db<-paste0(tmp_folder,region_name) + flag<-0 + for (i in seq_along(seqs)) { + # The following line adds sequences to the DECIPHER DB: + ########################### + # IMPORTANT: + # SPLITTING THE names(seqs[i]) yields the "sample.xxx" from the fasta header of the sequence - therefore only one sequence per sample will be used. + # If the str_split is removed, potentially more than one contig per sample can be compared - i.e,, substrains in the same sample. + Seqs2DB(seqs[i], "XStringSet", db, str_split(names(seqs[i]), "_")[[1]][1]) + ########################### + flag<-flag+1 + } + + # only run DECIPHER analysis if we have more than 1 sequence matching this specific region. + if (flag>1) { + + try_catch <- function(exprs) {!inherits(try(eval(exprs)), "try-error")} + + # Synteny run for this region finished successfully and returned a synteny object + if (try_catch(synteny_object_region<-FindSynteny(db, maxSep=15, maxGap = 15))) { + cat("\nSynteny analysis for region", region_name, "finished successfully\n", sep = " " ) + + # Synteny analysis for this region returned an error - return an empty matrix + } else{ + cat("\nSynteny analysis for region", region_name, "could not be completed\n", sep = " " ) + + # If the user asked to save intermediate objects - save the corrupted synteny object in an 'error' file + if(intermediate_file_folder != 'NA') { + error_file_name = paste0("DECIPHER_object_error_", region_name, ".rds") + saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, error_file_name)) + } + + unlink(db) # Remove the DECIPHER db file + return(per_region_table) + } + + # Otherwise, return an emtpy matrix + } else { + cat("\nCannot perform synteny analysis for region", region_name, "- only one hit was found\n", sep = " " ) + unlink(db) # Remove the DECIPHER db file + return(per_region_table) + } + + # The synteny object was created but something is wrong and it contains no real information + if (nrow(synteny_object_region[2,1][[1]]) == 0){ + + # If the user asked to save intermediate objects - save the corrupted synteny object in an 'error' file + if(intermediate_file_folder != 'NA') { + error_file_name = paste0("DECIPHER_object_error_", region_name, ".rds") + saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, error_file_name)) + } + + unlink(db) # Remove the DECIPHER db file + return(per_region_table) + + # Sunteny object is fine + } else{ + + # If the user asked to save intermediate objects - save the synteny object of the current region + if(intermediate_file_folder != 'NA') { + synteny_object_file_name = paste0("DECIPHER_object_", region_name, ".rds") + saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, synteny_object_file_name)) + } + } + + # Remove the DECIPHER db file + unlink(db) + + # Continue to calculating the syntey scores only if the synteny analysis returned a valid object + cat("\nCalculating synteny scores for region ", region_name, sep = "\n" ) + + # for each two samples, create the values to be kept in the dataframe (assing to one row). + for (i in 2:ncol(synteny_object_region)) { + for (j in 1:(i-1)) { + sample1<-as.character(str_split(colnames(synteny_object_region)[j], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats ! + sample2<-as.character(str_split(colnames(synteny_object_region)[i], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats ! + length1<-synteny_object_region[j,j][[1]] + length2<-synteny_object_region[i,i][[1]] + overlap<-sum(synteny_object_region[j,i][[1]][,4]) + blocks<-nrow(synteny_object_region[i,j][[1]]) + + # calculate the synteny score + if (length1>length2) { + syn_score<-1+log10((overlap/length2)/blocks) + } + else { + syn_score<-1+log10((overlap/length1)/blocks) + } + temprow<-data.frame(sample1,sample2,length1,length2,overlap,blocks,syn_score) + + per_region_table<-rbind(per_region_table, temprow) + } + } + + cat("\nSynteny scores calculation for region", region_name, "finished successfully\n", sep = " " ) + + # If the user asked to save intermediate objects - save the synteny scores object of the current region + if(intermediate_file_folder != 'NA') { + synteny_scores_object_file_name = paste0("synteny_scores_object_", region_name, ".rds") + saveRDS(per_region_table, file = paste0(intermediate_file_folder, synteny_scores_object_file_name)) + } + + return(per_region_table) +} + + +# add the names (ref_genome_header + position) to the dataframe +add_names<-function(dfs, names) { + dfs %>% mutate(ref_genome_region=names) +} + + +# subsample_regions: a function to sample x regions per pairwise comparison and calculate APSS (Average Pairwise Synteny Scores) +# inputs: +# 1. big_organized_dfs = dataframe that holds per-region pairwise comparisons (multiple regions, multiple pairs) +# 2. subsampling_value = how many regions/pairwise to sample. +# output: dataframe with APSS values (col name is "average_score"), based on "subsampling_value" regions per pair. Pairs with <"subsampling_value" regions are filtererd out. +subsample_regions<-function(big_organized_dfs, subsampling_value, set_seed_arg) { + if(set_seed_arg != 0) { + set.seed(set_seed_arg) + cat("Using set.seed for reproducable subsampling. Seed: ", set_seed_arg, sep = " " ) + cat("\n\n") + } + newdf<-big_organized_dfs %>% + # pay attention to the grouping variable below - should match the groups specified in the "synteny_score" function + group_by(Sample1, Sample2) %>% + filter(n() > subsampling_value-1) %>% + sample_n(subsampling_value) %>% #subsample "subsampling_value" regions from each group + mutate(regions = n()) %>% + summarise(Average_score=mean(Synteny_score), Compared_regions=mean(regions)) %>% + #mutate(ref_genome=genome_name, .before = sample1) + return(newdf) +} +########################################################################################################################### # Getting the arguments from the python script args <- commandArgs(trailingOnly = TRUE) @@ -131,4 +287,3 @@ if (avg_all_regions == 'True'){ unlink(tmp_folder, recursive = T) cat("\nFinished synteny analysis for:", genome_name, sep = "\n" ) - diff --git a/syntracker_R_scripts/SynTracker_functions.R b/syntracker_R_scripts/SynTracker_functions.R index 0d2a81a..e280792 100755 --- a/syntracker_R_scripts/SynTracker_functions.R +++ b/syntracker_R_scripts/SynTracker_functions.R @@ -1,256 +1,3 @@ -#requirements -library(DECIPHER) - -############ -# fucntions -########### - -# A function that combines two stages and is applied on each region: -# First stage: Synteny analysis using DECIPHER - calculate all vs. all hits -# Second stage: Calculate synteny score for each comparison -synteny_analysis_per_region<-function(inpath, region_name, tmp_folder, intermediate_file_folder) { - - # Define a data frame that will hold the synteny scores for all the comparisons in the region: - per_region_table<-data.frame("sample1" = character(), - "sample2"= character(), - "length1" = integer(), - "length2" = integer() , - "overlap" = integer(), #accomulative length of overlapping regions - "Blocks" = integer(), # number of synteny blocks per pairwise comparison - "synteny_score" = integer(), stringsAsFactors = FALSE) - - - cat("\nRunning synteny analysis using Decipher for region ", region_name, sep = "\n" ) - seqs <- readDNAStringSet(inpath) - db<-paste0(tmp_folder,region_name) - flag<-0 - for (i in seq_along(seqs)) { - # The following line adds sequences to the DECIPHER DB: - ########################### - # IMPORTANT: - # SPLITTING THE names(seqs[i]) yields the "sample.xxx" from the fasta header of the sequence - therefore only one sequence per sample will be used. - # If the str_split is removed, potentially more than one contig per sample can be compared - i.e,, substrains in the same sample. - Seqs2DB(seqs[i], "XStringSet", db, str_split(names(seqs[i]), "_")[[1]][1]) - ########################### - flag<-flag+1 - } - - # only run DECIPHER analysis if we have more than 1 sequence matching this specific region. - if (flag>1) { - - try_catch <- function(exprs) {!inherits(try(eval(exprs)), "try-error")} - - # Synteny run for this region finished successfully and returned a synteny object - if (try_catch(synteny_object_region<-FindSynteny(db, maxSep=15, maxGap = 15))) { - cat("\nSynteny analysis for region", region_name, "finished successfully\n", sep = " " ) - - # Synteny analysis for this region returned an error - return an empty matrix - } else{ - cat("\nSynteny analysis for region", region_name, "could not be completed\n", sep = " " ) - - # If the user asked to save intermediate objects - save the corrupted synteny object in an 'error' file - if(intermediate_file_folder != 'NA') { - error_file_name = paste0("DECIPHER_object_error_", region_name, ".rds") - saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, error_file_name)) - } - - unlink(db) # Remove the DECIPHER db file - return(per_region_table) - } - - # Otherwise, return an emtpy matrix - } else { - cat("\nCannot perform synteny analysis for region", region_name, "- only one hit was found\n", sep = " " ) - unlink(db) # Remove the DECIPHER db file - return(per_region_table) - } - - # The synteny object was created but something is wrong and it contains no real information - if (nrow(synteny_object_region[2,1][[1]]) == 0){ - - # If the user asked to save intermediate objects - save the corrupted synteny object in an 'error' file - if(intermediate_file_folder != 'NA') { - error_file_name = paste0("DECIPHER_object_error_", region_name, ".rds") - saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, error_file_name)) - } - - unlink(db) # Remove the DECIPHER db file - return(per_region_table) - - # Sunteny object is fine - } else{ - - # If the user asked to save intermediate objects - save the synteny object of the current region - if(intermediate_file_folder != 'NA') { - synteny_object_file_name = paste0("DECIPHER_object_", region_name, ".rds") - saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, synteny_object_file_name)) - } - } - - # Remove the DECIPHER db file - unlink(db) - - # Continue to calculating the syntey scores only if the synteny analysis returned a valid object - cat("\nCalculating synteny scores for region ", region_name, sep = "\n" ) - - # for each two samples, create the values to be kept in the dataframe (assing to one row). - for (i in 2:ncol(synteny_object_region)) { - for (j in 1:(i-1)) { - sample1<-as.character(str_split(colnames(synteny_object_region)[j], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats ! - sample2<-as.character(str_split(colnames(synteny_object_region)[i], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats ! - length1<-synteny_object_region[j,j][[1]] - length2<-synteny_object_region[i,i][[1]] - overlap<-sum(synteny_object_region[j,i][[1]][,4]) - blocks<-nrow(synteny_object_region[i,j][[1]]) - - # calculate the synteny score - if (length1>length2) { - syn_score<-1+log10((overlap/length2)/blocks) - } - else { - syn_score<-1+log10((overlap/length1)/blocks) - } - temprow<-data.frame(sample1,sample2,length1,length2,overlap,blocks,syn_score) - - per_region_table<-rbind(per_region_table, temprow) - } - } - - cat("\nSynteny scores calculation for region", region_name, "finished successfully\n", sep = " " ) - - # If the user asked to save intermediate objects - save the synteny scores object of the current region - if(intermediate_file_folder != 'NA') { - synteny_scores_object_file_name = paste0("synteny_scores_object_", region_name, ".rds") - saveRDS(per_region_table, file = paste0(intermediate_file_folder, synteny_scores_object_file_name)) - } - - return(per_region_table) -} - - -######################################################################################### -# The old functions - not in use anymore -######################################################################################### - -# synteny_analysis: function to run Decipher analysis for each gene -# inputs: 1. inpath = path to fasta file(s) -# 2. gene name = central region identifier -# 3. tmp_folder = temporary folder, to store the DECIPHER data. - -# output: synteny object -synteny_analysis<-function(inpath, region_name, tmp_folder) { - #fas<-inpath - seqs <- readDNAStringSet(inpath) - db<-paste0(tmp_folder,region_name) - flag<-0 - for (i in seq_along(seqs)) { - # The following line adds sequences to the DECIPHER DB: - ########################### - # IMPORTANT: - # SPLITTING THE names(seqs[i]) yields the "sample.xxx" from the fasta header of the sequence - therefore only one sequence per sample will be used. - # If the str_split is removed, potentially more than one contig per sample can be compared - i.e,, substrains in the same sample. - Seqs2DB(seqs[i], "XStringSet", db, str_split(names(seqs[i]), "_")[[1]][1]) - ########################### - flag<-flag+1 - } - - if (flag>1) { # only run DECIPHER analysis if we have more than 1 sequence matching this specific region. - synteny <- FindSynteny(db, maxSep=15, maxGap = 15) - } else { - synteny<-matrix((1:4),nrow=1) - } -} - - -# synteny_scores: a function to do the downstream analysis (i.e., assign per-region pairwise synteny scores) -# for each region that was analysed with DECIPHER syntenty. -# inputs: -# 1. synteny_object = (filtered output of "synteny_analysis") -# 2. metadata = metadata file (sea documentation for formatting instructions) - -# output: dataframe (columns specified within the function). - -synteny_scores<- function(synteny_object) { - - #cat(" starting second function - synteny scores") - - # Define a data frame that will hold all the data: change number of Groups according to number of Fields in metadata (i.e., GroupA, GroupB, etc.): here, for example, we have 4 - per_region_table<-data.frame("sample1" = character(), - "sample2"= character(), - "length1" = integer(), - "length2" = integer() , - "overlap" = integer(), #accomulative length of overlapping regions - "Blocks" = integer(), # number of synteny blocks per pairwise comparison - "synteny_score" = integer(), stringsAsFactors = FALSE) - - # for each two samples, create the values to be kept in the dataframe (assing to one row). - for (i in 2:ncol(synteny_object)) { - for (j in 1:(i-1)) { - sample1<-as.character(str_split(colnames(synteny_object)[j], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats ! - sample2<-as.character(str_split(colnames(synteny_object)[i], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats ! - length1<-synteny_object[j,j][[1]] - length2<-synteny_object[i,i][[1]] - overlap<-sum(synteny_object[j,i][[1]][,4]) - blocks<-nrow(synteny_object[i,j][[1]]) - - # calculate the synteny score - if (length1>length2) { - syn_score<-1+log10((overlap/length2)/blocks) - } else { - syn_score<-1+log10((overlap/length1)/blocks) - } - temprow<-data.frame(sample1,sample2,length1,length2,overlap,blocks,syn_score) - - per_region_table<-rbind(per_region_table, temprow) - } - } - return(per_region_table) -} - - - -# add_names: a function to add the ID of the central regions that it's homologs are compared (and other information) to each dataframe outputed by "synteny_scores" -# inputs: -# 1. dfs = list of dataframes returned by "synteny_scores" -# 2.names = namas(dfs) - -# output: the same list of dataframes, but with addtional columns - -add_names<-function(dfs, names) { - #newnames<-str_extract_all(names, "[[:digit:]]+") # extract the numeric parts of the region name - #nth<-length(newnames[[1]]) #find the number of the last element - #newernames<-sapply(newnames,'[', nth) #extract the last element - #contig<-str_split(names, "_")[[1]][9] - #dfs %>% mutate(contig=contig, position_counter=newernames, region=paste(contig, position_counter, sep="_" )) # add to the dataframe - - # add the names (ref_genome_header + position) to the dataframe - dfs %>% mutate(ref_genome_region=names) -} - - -# subsample_regions: a function to sample x regions per pairwise comparison and calculate APSS (Average Pairwise Synteny Scores) -# inputs: -# 1. big_organized_dfs = dataframe that holds per-region pairwise comparisons (multiple regions, multiple pairs) -# 2. subsampling_value = how many regions/pairwise to sample. - -# output: dataframe with APSS values (col name is "average_score"), based on "subsampling_value" regions per pair. Pairs with <"subsampling_value" regions are filtererd out. -subsample_regions<-function(big_organized_dfs, subsampling_value, set_seed_arg) { - if(set_seed_arg != 0) { - set.seed(set_seed_arg) - cat("Using set.seed for reproducable subsampling. Seed: ", set_seed_arg, sep = " " ) - cat("\n\n") - } - newdf<-big_organized_dfs %>% - # pay attention to the grouping variable below - should match the groups specified in the "synteny_score" function - group_by(Sample1, Sample2) %>% - filter(n() > subsampling_value-1) %>% - sample_n(subsampling_value) %>% #subsample "subsampling_value" regions from each group - mutate(regions = n()) %>% - summarise(Average_score=mean(Synteny_score), Compared_regions=mean(regions)) %>% - #mutate(ref_genome=genome_name, .before = sample1) - return(newdf) -} - # add metadata: a function to add metadata to the list of comparisons # input: