updates

ndaniel · Nov 8, 2019 · 9574f3d · 9574f3d
1 parent be0b04e
commit 9574f3d
Show file tree

Hide file tree

Showing 13 changed files with 11,138 additions and 16,147 deletions.
diff --git a/NEWS b/NEWS
@@ -1,12 +1,13 @@
 Version Release History for FusionCatcher
 =========================================
 
-Version 1.20 -- November 15, 2019
+Version 1.20 -- November 8, 2019
 --------------------------------
  * added support for Ensembl version 96, 97, and 98
  * increased sensitivity by adding an extra anchor
  * added --limitOutSJcollapsed for better control of STAR aligner
  * fixed memory bug leading to seg-fault in STAR aligner
+ * improved handling of low entropy reads
  * size of built files for different organisms has been reduced
  * fixed several minor bugs
  * updated manual

diff --git a/bin/FC b/bin/FC
diff --git a/bin/bootstrap.py b/bin/bootstrap.py
@@ -187,7 +187,7 @@ def PATHS(exe = None, prefix = None, installdir = None, internet = True):
         FUSIONCATCHER_PATH = expand(FUSIONCATCHER_PREFIX,'fusioncatcher')
 
     FUSIONCATCHER_BIN = expand(FUSIONCATCHER_PATH,'bin')
-    FUSIONCATCHER_URL = 'http://sourceforge.net/projects/fusioncatcher/files/fusioncatcher_v1.10.zip'
+    FUSIONCATCHER_URL = 'http://sourceforge.net/projects/fusioncatcher/files/fusioncatcher_v1.20.zip'
     FUSIONCATCHER_VERSION = "1.20"
     FUSIONCATCHER_DATA = expand(FUSIONCATCHER_PATH,'data')
     FUSIONCATCHER_CURRENT = expand(FUSIONCATCHER_DATA,'current')
@@ -2062,7 +2062,6 @@ def update_path(SOME_PATH,executable,subdir='src'):
         txt.append("wget --no-check-certificate http://sourceforge.net/projects/fusioncatcher/files/data/%s.tar.gz.ab -O %s.tar.gz.ab" % (v,os.path.join(FUSIONCATCHER_DATA.replace(" ","\\ "),v)))
         txt.append("wget --no-check-certificate http://sourceforge.net/projects/fusioncatcher/files/data/%s.tar.gz.ac -O %s.tar.gz.ac" % (v,os.path.join(FUSIONCATCHER_DATA.replace(" ","\\ "),v)))
         txt.append("wget --no-check-certificate http://sourceforge.net/projects/fusioncatcher/files/data/%s.tar.gz.ad -O %s.tar.gz.ad" % (v,os.path.join(FUSIONCATCHER_DATA.replace(" ","\\ "),v)))
-        txt.append("wget --no-check-certificate http://sourceforge.net/projects/fusioncatcher/files/data/%s.tar.gz.ae -O %s.tar.gz.ae" % (v,os.path.join(FUSIONCATCHER_DATA.replace(" ","\\ "),v)))
         txt.append("wget --no-check-certificate http://sourceforge.net/projects/fusioncatcher/files/data/%s.md5 -O %s/%s.md5" % (v,FUSIONCATCHER_DATA.replace(" ","\\ "),v))
         txt.append("cd %s" % (FUSIONCATCHER_DATA.replace(" ","\\ "),))
         txt.append("md5sum -c %s/%s.md5" % (FUSIONCATCHER_DATA.replace(" ","\\ "),v))

diff --git a/bin/build_report_fusions_map.py b/bin/build_report_fusions_map.py
@@ -658,7 +658,7 @@ def give_me_assembly(fasta, kmer = 31, velvet_dir = None ,tmp_dir = None):
     candidate_fusions = dict([( tuple(myorder(line[0], line[1])), line[2]) for line in candidate_fusions if line[6].lower() == 'further_analysis'])
 
     unmapped_reads = set()
-    if options.input_unmapped_reads_filename:
+    if options.input_unmapped_reads_filename and os.path.isfile(options.input_unmapped_reads_filename):
         print "Reading...",options.input_unmapped_reads_filename
         unmapped_reads = set([line.rstrip('\r\n') for line in file(options.input_unmapped_reads_filename,'r').readlines()])
 

diff --git a/bin/find_homolog_genes.py b/bin/find_homolog_genes.py
@@ -575,7 +575,7 @@ def homology(stuff):
     homolog = [k+'\t'+str(v)+'\n' for (k,v) in homolog.items() if v >= options.reads]
     file(options.output_filename,'w').writelines(homolog)
 
-    print >> sys.stderr, "Read '%s' found mapping on %d genes!" % (max_genes_per_read_id,max_genes_per_read)
+    #print >> sys.stderr, "Read '%s' found mapping on %d genes!" % (max_genes_per_read_id,max_genes_per_read)
 
     #if options.output_offending_reads_filename:
         #print "Writing...",options.output_offending_reads_filename

diff --git a/bin/fusioncatcher b/bin/fusioncatcher
diff --git a/bin/fusioncatcher-batch b/bin/fusioncatcher-batch
diff --git a/bin/fusioncatcher-build b/bin/fusioncatcher-build
diff --git a/bin/fusioncatcher.py b/bin/fusioncatcher.py
@@ -527,6 +527,25 @@ def is_known_extension(something):
 #                             "here for quality trimming is 0.05 (which is the default value of 'seqtk trimfq') or 0.10."
                     )
 
+    parser.add_option("--skip-trim-multiple-5",
+                      action = "store_true",
+                      dest = "skip_trim_multiple_5",
+                      default = False,
+                      help = optparse.SUPPRESS_HELP
+#                             "It trims the 3' ends of the reads to a multiple of 5, "+
+#                             "for example 51bp to 50bp. For Illumina reads, the last "+
+#                             "base (position 51, 76, 101, or 151) is often of very bad quality."
+                    )
+
+    parser.add_option("--skip-filter-low-entropy",
+                      action = "store_true",
+                      dest = "skip_filter_low_entropy",
+                      default = False,
+                      help = optparse.SUPPRESS_HELP
+#                             "It masks the low entropy regions in reads with Ns."
+                    )
+
+
 
     mydefault = sorted([
             "paralogs",
@@ -3407,7 +3426,7 @@ def is_known_extension(something):
     job.add('>>',info_file,kind='output')
     job.run()
 
-    output_file = outdir('orig.fq')
+    output_file = outdir('orig__.fq')
     # concatenate reads before trimming
     if len(list_input_files) > 1:
         #job.add('concatenate.py',kind='program')
@@ -3424,6 +3443,29 @@ def is_known_extension(something):
     else:
         job.link(new_list_input_files[0], output_file, temp_path=temp_flag)
 
+    if not options.skip_trim_multiple_5:
+        #bbduk.sh in=reads.fq out=clean.fq ftm=5
+        job.add(_BP_+'bbduk.sh',kind='program')
+        job.add('forcetrimmod=','5',kind='parameter',space='no')
+        job.add('in=',outdir('orig__.fq'),kind='input',space='no', temp_path=temp_flag)
+        job.add('out=',outdir('orig__x.fq'),kind='output',space='no')
+        job.run()
+    else:
+        job.link(outdir('orig__.fq'), outdir('orig__x.fq'), temp_path=temp_flag)
+
+    if not options.skip_filter_low_entropy:
+        #bbduk.sh in=r.fq out=o.fq entropy=0.1 entropymask=t entropyk=2 entropywindow=40
+        job.add(_BP_+'bbduk.sh',kind='program')
+        job.add('entropy=','0.1',kind='parameter',space='no')
+        job.add('entropymask=','t',kind='parameter',space='no')
+        job.add('entropyk=','2',kind='parameter',space='no')
+        job.add('entropywindow=','40',kind='parameter',space='no')
+        job.add('in=',outdir('orig__x.fq'),kind='input',space='no', temp_path=temp_flag)
+        job.add('out=',outdir('orig.fq'),kind='output',space='no')
+        job.run()
+    else:
+        job.link(outdir('orig__x.fq'), outdir('orig.fq'), temp_path=temp_flag)
+
     if not options.skip_deduplication:
         job.add('LC_ALL=C',kind='program')
         job.add('cat',kind='parameter')