Version 1.3:

- Removed the biopython package from the SynTracker_env.yml file - Changed the execution of Blast processes in blast.py so that Bio.Blast.Applications module (deprecated) is not needed anymore and they are executed directly by the Subprocess module.
leylabmpi · Jul 15, 2024 · 99da7ba · 99da7ba
1 parent e58cdfc
commit 99da7ba
Show file tree

Hide file tree

Showing 7 changed files with 58 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@ Create a new conda environment using the ‘SynTracker_env.yml’ file (located
       `conda env create -f SynTracker_env.yml`
 
 Activate the newly created environment: 
-      `conda activate SynTracker_1_2`
+      `conda activate SynTracker_1_3`
 
 
 ## Input

diff --git a/SynTracker_Manual.docx b/SynTracker_Manual.docx
diff --git a/SynTracker_Manual.pdf b/SynTracker_Manual.pdf
diff --git a/SynTracker_env.yml b/SynTracker_env.yml
@@ -1,10 +1,9 @@
-name: SynTracker_1_2
+name: SynTracker_1_3
 channels:
   - conda-forge
   - bioconda
 dependencies:
   - python
   - r-base=4.0.5
   - bioconductor-decipher
-  - biopython
   - r-tidyverse=1.3.0
diff --git a/syntracker.py b/syntracker.py
@@ -347,7 +347,7 @@ def main():
                                                                 config.flanking_length, config.minimal_flanking_length,
                                                                 config.minimal_full_length,
                                                                 config.minimal_identity, config.minimal_coverage,
-                                                                config.blast_num_threads))
+                                                                config.blast_num_threads, config.logfile_path))
                         # Start the process
                         process.start()
 

diff --git a/syntracker_first_stage/blast.py b/syntracker_first_stage/blast.py
@@ -1,42 +1,47 @@
-from Bio.Blast.Applications import NcbimakeblastdbCommandline
-from Bio.Blast.Applications import NcbiblastnCommandline
+#from Bio.Blast.Applications import NcbimakeblastdbCommandline
+#from Bio.Blast.Applications import NcbiblastnCommandline
 import os
 import sys
 import csv
 import re
 import config
+import subprocess
 
 
 def make_blast_db(logfile):
 
     input_file = config.combined_renamed_genomes_file_path
 
-    command = NcbimakeblastdbCommandline(input_file=input_file, out=config.blast_db_file_path, dbtype="nucl",
-                                         parse_seqids=True, title=config.blast_db_file)
+    command = "makeblastdb -in " + input_file + " -dbtype nucl -title " + config.blast_db_file + "  -parse_seqids -out " \
+              + config.blast_db_file_path
+
     print("\nExecuting the following BLAST command:")
     print(command)
-    logfile = open(config.logfile_path, "a")
-    logfile.write("\nExecuting the following BLAST command:\n" + str(command) + "\n")
-    logfile.close()
+    logfile.write("\nExecuting the following BLAST command:\n" + command + "\n")
 
     try:
-        stdout, stderr = command()
-        print(stdout)
+        subprocess.run(["makeblastdb", "-in", input_file, "-dbtype", "nucl", "-title", config.blast_db_file,
+                        "-parse_seqids", "-out", config.blast_db_file_path], check=True)
+    except subprocess.CalledProcessError as err:
+        print("\nThe following command has failed:")
+        print(command)
+        print(err)
+        logfile.write("\nThe following command has failed:\n" + command + "\n")
+        exit()
     except Exception as err:
-        print("\nmakeblastdb command has failed for input file: " + input_file)
-        logfile = open(config.logfile_path, "a")
-        logfile.write("\nmakeblastdb command has failed for input file: " + input_file + "\n")
-        logfile.close()
+        print("\nThe following command has failed:")
+        print(command)
         print(err)
+        logfile.write("\nThe following command has failed:\n" + command + "\n")
         exit()
 
-    print("...Done!")
+    print("...Done!\n")
 
 
 def blast_per_region_process(full_path_region_file, blast_region_outfile, blastdbcmd_region_outfile,
                              blastdbcmd_region_outfile_tmp, blast_db_file_path, flanking_length,
                              minimal_flanking_length, minimal_full_length, minimal_identity, minimal_coverage,
-                             num_threads):
+                             num_threads, logfile_path):
 
     # Run blast for each region
     exit_code = run_blastn(full_path_region_file, blast_region_outfile, blast_db_file_path, minimal_identity,
@@ -60,7 +65,7 @@ def blast_per_region_process(full_path_region_file, blast_region_outfile, blastd
                 end = row[2]
                 strand = row[3]
 
-                m = re.search("^(Sample\.\d+)_contig", sample_name)
+                m = re.search(r'^(Sample\.\d+)_contig', sample_name)
                 if m:
                     sample_name_only = m.group(1)
 
@@ -130,25 +135,43 @@ def blast_per_region_process(full_path_region_file, blast_region_outfile, blastd
         else:
             print("\nBLAST output file " + blast_region_outfile + " contains less than two valid samples - "
                                                                   "skip it...\n")
+            logfile = open(logfile_path, "a")
+            logfile.write("\nBLAST output file " + blast_region_outfile + " contains less than two valid samples - "
+                                                                  "skip it...\n")
+            logfile.close()
             sys.exit(1)
 
     # The BLAST outfile contains less than 2 hits -> return failure for this region
     else:
         print("\nBLAST output file " + blast_region_outfile + " contains less than two samples - skip it...\n")
+        logfile = open(logfile_path, "a")
+        logfile.write("\nBLAST output file " + blast_region_outfile + " contains less than two valid samples - "
+                                                                      "skip it...\n")
+        logfile.close()
         sys.exit(1)
 
 
 def run_blastn(query_file, outfile, blast_db_file_path, minimal_identity, minimal_coverage, num_threads):
-    command = NcbiblastnCommandline(query=query_file, db=blast_db_file_path, out=outfile,
-                                    outfmt="6 sseqid sstart send sstrand",
-                                    max_target_seqs=10000, perc_identity=minimal_identity,
-                                    qcov_hsp_perc=minimal_coverage, num_threads=num_threads)
+
+    command = "blastn -query " + query_file + " -db " + blast_db_file_path + " -out " + outfile + \
+              " -outfmt 6 sseqid sstart send sstrand -max_target_seqs 10000 -perc_identity " + str(minimal_identity) + \
+              " -qcov_hsp_perc " + str(minimal_coverage) + " -num_threads " + str(num_threads)
 
     try:
-        command()
+        subprocess.run(["blastn", "-query", query_file, "-db", blast_db_file_path, "-out", outfile,
+                        "-outfmt", "6 sseqid sstart send sstrand", "-max_target_seqs", "10000",
+                        "-perc_identity", str(minimal_identity), "-qcov_hsp_perc", str(minimal_coverage), "-num_threads"
+                        , str(num_threads)], check=True)
+
+    except subprocess.CalledProcessError as err:
+        print("\nThe following command has failed:")
+        print(command)
+        print(err)
+        return 1
+
     except Exception as err:
         print("\nThe following command has failed:")
-        print(str(command))
+        print(command)
         print(err)
         return 1
 

diff --git a/syntracker_first_stage/parser.py b/syntracker_first_stage/parser.py
@@ -192,55 +192,55 @@ def read_conf_file(old_conf_file, new_conf_file, mode):
         in_processed_genomes_list = 0
         for line in read_conf:
             if re.search("^Reference genomes directory:", line):
-                m = re.search("^Reference.+:\s(\S+)\n", line)
+                m = re.search(r'^Reference.+: (\S+)\n', line)
                 if m:
                     ref_dir = m.group(1)
 
                 if mode == "continue_all_genomes":
                     out_param.write("\n" + line)
 
             elif re.search("^Target", line):
-                m = re.search("^Target.+:\s(\S+)\n", line)
+                m = re.search(r'^Target.+: (\S+)\n', line)
                 if m:
                     target_dir = m.group(1)
 
                 if mode == "continue_all_genomes":
                     out_param.write("\n" + line)
 
             elif re.search("^Output", line):
-                m = re.search("^Output.+:\s(\S+)\n", line)
+                m = re.search(r'^Output.+: (\S+)\n', line)
                 if m:
                     output_dir = m.group(1)
 
                 if mode == "continue_all_genomes":
                     out_param.write("\n" + line)
 
             elif re.search("^Metadata", line):
-                m = re.search("^Metadata.+:\s(\S+)\n", line)
+                m = re.search(r'^Metadata.+: (\S+)\n', line)
                 if m:
                     metadata_file = m.group(1)
 
                 if mode == "continue_all_genomes":
                     out_param.write("\n" + line)
 
             elif re.search("^Full", line):
-                m = re.search("^Full.+:\s(\d+)\n", line)
+                m = re.search(r'^Full.+: (\d+)\n', line)
                 if m:
                     full_length = m.group(1)
 
                 if mode == "continue_all_genomes":
                     out_param.write("\n" + line)
 
             elif re.search("^Minimal coverage", line):
-                m = re.search("^Minimal coverage:\s(\d+)\n", line)
+                m = re.search(r'^Minimal coverage: (\d+)\n', line)
                 if m:
                     minimal_coverage = m.group(1)
 
                 if mode == "continue_all_genomes":
                     out_param.write("\n" + line)
 
             elif re.search("^Minimal identity", line):
-                m = re.search("^Minimal identity:\s(\d+)\n", line)
+                m = re.search(r'^Minimal identity: (\d+)\n', line)
                 if m:
                     minimal_identity = m.group(1)
 
@@ -267,8 +267,8 @@ def read_conf_file(old_conf_file, new_conf_file, mode):
                     out_param.write("\n" + line)
                     out_param.write("--------------------\n\n")
 
-            elif re.search("^\S+\t\S+\n", line) and in_ref_genomes_list and in_processed_genomes_list == 0:
-                m = re.search("^(\S+)\t(\S+)\n", line)
+            elif re.search(r'^\S+\t\S+\n', line) and in_ref_genomes_list and in_processed_genomes_list == 0:
+                m = re.search(r'^(\S+)\t(\S+)\n', line)
                 genome_name = m.group(1)
                 genome_file = m.group(2)
                 config.genomes_dict[genome_name] = dict()
@@ -293,7 +293,7 @@ def read_conf_file(old_conf_file, new_conf_file, mode):
                     break
 
             elif re.search("^ref_genome:", line) and in_processed_genomes_list:
-                m = re.search("^ref_genome:\s+(\S+)\n$", line)
+                m = re.search(r'^ref_genome:\s+(\S+)\n$', line)
                 current_genome_name = m.group(1)
 
             elif re.search("BLAST finished", line) and in_processed_genomes_list: