diff --git a/README.md b/README.md index da93c7f..08ab1b0 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Create a new conda environment using the ‘SynTracker_env.yml’ file (located `conda env create -f SynTracker_env.yml` Activate the newly created environment: - `conda activate SynTracker_1_2` + `conda activate SynTracker_1_3` ## Input diff --git a/SynTracker_Manual.docx b/SynTracker_Manual.docx index 7a32ef7..f484804 100644 Binary files a/SynTracker_Manual.docx and b/SynTracker_Manual.docx differ diff --git a/SynTracker_Manual.pdf b/SynTracker_Manual.pdf index 11dce30..9f7da6d 100644 Binary files a/SynTracker_Manual.pdf and b/SynTracker_Manual.pdf differ diff --git a/SynTracker_env.yml b/SynTracker_env.yml index fbeb261..d77d871 100755 --- a/SynTracker_env.yml +++ b/SynTracker_env.yml @@ -1,4 +1,4 @@ -name: SynTracker_1_2 +name: SynTracker_1_3 channels: - conda-forge - bioconda @@ -6,5 +6,4 @@ dependencies: - python - r-base=4.0.5 - bioconductor-decipher - - biopython - r-tidyverse=1.3.0 diff --git a/syntracker.py b/syntracker.py index 4a80143..b18539d 100755 --- a/syntracker.py +++ b/syntracker.py @@ -347,7 +347,7 @@ def main(): config.flanking_length, config.minimal_flanking_length, config.minimal_full_length, config.minimal_identity, config.minimal_coverage, - config.blast_num_threads)) + config.blast_num_threads, config.logfile_path)) # Start the process process.start() diff --git a/syntracker_first_stage/blast.py b/syntracker_first_stage/blast.py index 0b762b3..8221e24 100644 --- a/syntracker_first_stage/blast.py +++ b/syntracker_first_stage/blast.py @@ -1,42 +1,47 @@ -from Bio.Blast.Applications import NcbimakeblastdbCommandline -from Bio.Blast.Applications import NcbiblastnCommandline +#from Bio.Blast.Applications import NcbimakeblastdbCommandline +#from Bio.Blast.Applications import NcbiblastnCommandline import os import sys import csv import re import config +import subprocess def make_blast_db(logfile): input_file = config.combined_renamed_genomes_file_path - command = NcbimakeblastdbCommandline(input_file=input_file, out=config.blast_db_file_path, dbtype="nucl", - parse_seqids=True, title=config.blast_db_file) + command = "makeblastdb -in " + input_file + " -dbtype nucl -title " + config.blast_db_file + " -parse_seqids -out " \ + + config.blast_db_file_path + print("\nExecuting the following BLAST command:") print(command) - logfile = open(config.logfile_path, "a") - logfile.write("\nExecuting the following BLAST command:\n" + str(command) + "\n") - logfile.close() + logfile.write("\nExecuting the following BLAST command:\n" + command + "\n") try: - stdout, stderr = command() - print(stdout) + subprocess.run(["makeblastdb", "-in", input_file, "-dbtype", "nucl", "-title", config.blast_db_file, + "-parse_seqids", "-out", config.blast_db_file_path], check=True) + except subprocess.CalledProcessError as err: + print("\nThe following command has failed:") + print(command) + print(err) + logfile.write("\nThe following command has failed:\n" + command + "\n") + exit() except Exception as err: - print("\nmakeblastdb command has failed for input file: " + input_file) - logfile = open(config.logfile_path, "a") - logfile.write("\nmakeblastdb command has failed for input file: " + input_file + "\n") - logfile.close() + print("\nThe following command has failed:") + print(command) print(err) + logfile.write("\nThe following command has failed:\n" + command + "\n") exit() - print("...Done!") + print("...Done!\n") def blast_per_region_process(full_path_region_file, blast_region_outfile, blastdbcmd_region_outfile, blastdbcmd_region_outfile_tmp, blast_db_file_path, flanking_length, minimal_flanking_length, minimal_full_length, minimal_identity, minimal_coverage, - num_threads): + num_threads, logfile_path): # Run blast for each region exit_code = run_blastn(full_path_region_file, blast_region_outfile, blast_db_file_path, minimal_identity, @@ -60,7 +65,7 @@ def blast_per_region_process(full_path_region_file, blast_region_outfile, blastd end = row[2] strand = row[3] - m = re.search("^(Sample\.\d+)_contig", sample_name) + m = re.search(r'^(Sample\.\d+)_contig', sample_name) if m: sample_name_only = m.group(1) @@ -130,25 +135,43 @@ def blast_per_region_process(full_path_region_file, blast_region_outfile, blastd else: print("\nBLAST output file " + blast_region_outfile + " contains less than two valid samples - " "skip it...\n") + logfile = open(logfile_path, "a") + logfile.write("\nBLAST output file " + blast_region_outfile + " contains less than two valid samples - " + "skip it...\n") + logfile.close() sys.exit(1) # The BLAST outfile contains less than 2 hits -> return failure for this region else: print("\nBLAST output file " + blast_region_outfile + " contains less than two samples - skip it...\n") + logfile = open(logfile_path, "a") + logfile.write("\nBLAST output file " + blast_region_outfile + " contains less than two valid samples - " + "skip it...\n") + logfile.close() sys.exit(1) def run_blastn(query_file, outfile, blast_db_file_path, minimal_identity, minimal_coverage, num_threads): - command = NcbiblastnCommandline(query=query_file, db=blast_db_file_path, out=outfile, - outfmt="6 sseqid sstart send sstrand", - max_target_seqs=10000, perc_identity=minimal_identity, - qcov_hsp_perc=minimal_coverage, num_threads=num_threads) + + command = "blastn -query " + query_file + " -db " + blast_db_file_path + " -out " + outfile + \ + " -outfmt 6 sseqid sstart send sstrand -max_target_seqs 10000 -perc_identity " + str(minimal_identity) + \ + " -qcov_hsp_perc " + str(minimal_coverage) + " -num_threads " + str(num_threads) try: - command() + subprocess.run(["blastn", "-query", query_file, "-db", blast_db_file_path, "-out", outfile, + "-outfmt", "6 sseqid sstart send sstrand", "-max_target_seqs", "10000", + "-perc_identity", str(minimal_identity), "-qcov_hsp_perc", str(minimal_coverage), "-num_threads" + , str(num_threads)], check=True) + + except subprocess.CalledProcessError as err: + print("\nThe following command has failed:") + print(command) + print(err) + return 1 + except Exception as err: print("\nThe following command has failed:") - print(str(command)) + print(command) print(err) return 1 diff --git a/syntracker_first_stage/parser.py b/syntracker_first_stage/parser.py index 646ef08..53905ff 100644 --- a/syntracker_first_stage/parser.py +++ b/syntracker_first_stage/parser.py @@ -192,7 +192,7 @@ def read_conf_file(old_conf_file, new_conf_file, mode): in_processed_genomes_list = 0 for line in read_conf: if re.search("^Reference genomes directory:", line): - m = re.search("^Reference.+:\s(\S+)\n", line) + m = re.search(r'^Reference.+: (\S+)\n', line) if m: ref_dir = m.group(1) @@ -200,7 +200,7 @@ def read_conf_file(old_conf_file, new_conf_file, mode): out_param.write("\n" + line) elif re.search("^Target", line): - m = re.search("^Target.+:\s(\S+)\n", line) + m = re.search(r'^Target.+: (\S+)\n', line) if m: target_dir = m.group(1) @@ -208,7 +208,7 @@ def read_conf_file(old_conf_file, new_conf_file, mode): out_param.write("\n" + line) elif re.search("^Output", line): - m = re.search("^Output.+:\s(\S+)\n", line) + m = re.search(r'^Output.+: (\S+)\n', line) if m: output_dir = m.group(1) @@ -216,7 +216,7 @@ def read_conf_file(old_conf_file, new_conf_file, mode): out_param.write("\n" + line) elif re.search("^Metadata", line): - m = re.search("^Metadata.+:\s(\S+)\n", line) + m = re.search(r'^Metadata.+: (\S+)\n', line) if m: metadata_file = m.group(1) @@ -224,7 +224,7 @@ def read_conf_file(old_conf_file, new_conf_file, mode): out_param.write("\n" + line) elif re.search("^Full", line): - m = re.search("^Full.+:\s(\d+)\n", line) + m = re.search(r'^Full.+: (\d+)\n', line) if m: full_length = m.group(1) @@ -232,7 +232,7 @@ def read_conf_file(old_conf_file, new_conf_file, mode): out_param.write("\n" + line) elif re.search("^Minimal coverage", line): - m = re.search("^Minimal coverage:\s(\d+)\n", line) + m = re.search(r'^Minimal coverage: (\d+)\n', line) if m: minimal_coverage = m.group(1) @@ -240,7 +240,7 @@ def read_conf_file(old_conf_file, new_conf_file, mode): out_param.write("\n" + line) elif re.search("^Minimal identity", line): - m = re.search("^Minimal identity:\s(\d+)\n", line) + m = re.search(r'^Minimal identity: (\d+)\n', line) if m: minimal_identity = m.group(1) @@ -267,8 +267,8 @@ def read_conf_file(old_conf_file, new_conf_file, mode): out_param.write("\n" + line) out_param.write("--------------------\n\n") - elif re.search("^\S+\t\S+\n", line) and in_ref_genomes_list and in_processed_genomes_list == 0: - m = re.search("^(\S+)\t(\S+)\n", line) + elif re.search(r'^\S+\t\S+\n', line) and in_ref_genomes_list and in_processed_genomes_list == 0: + m = re.search(r'^(\S+)\t(\S+)\n', line) genome_name = m.group(1) genome_file = m.group(2) config.genomes_dict[genome_name] = dict() @@ -293,7 +293,7 @@ def read_conf_file(old_conf_file, new_conf_file, mode): break elif re.search("^ref_genome:", line) and in_processed_genomes_list: - m = re.search("^ref_genome:\s+(\S+)\n$", line) + m = re.search(r'^ref_genome:\s+(\S+)\n$', line) current_genome_name = m.group(1) elif re.search("BLAST finished", line) and in_processed_genomes_list: