From 760bafc0c728192024d893c5be6dd059046e429e Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Thu, 19 Dec 2019 17:41:46 +1100 Subject: [PATCH 1/9] translate to python3 bumped new version number fixed a bug where if you requested more minExons than you had regions in a target, the program would not terminate changed the output table to be rounded to 3 digits fro log ratios and 6 digits for pValues --- .gitignore | 2 + README.md | 36 + contra.py | 1198 +++++++++++++++------------ scripts/assign_bin_number_v2.py | 274 +++--- scripts/average_count.py | 341 ++++---- scripts/cn_analysis.v3.R | 31 +- scripts/cn_apply_threshold.py | 177 ++-- scripts/convert_gene_coordinate.py | 437 +++++----- scripts/convert_targeted_regions.py | 79 +- scripts/count_libsize.py | 31 +- scripts/get_chr_length.py | 27 +- scripts/split_chromosome.py | 133 +-- scripts/target_breakdown.py | 70 +- scripts/vcf_out.py | 76 +- 14 files changed, 1585 insertions(+), 1327 deletions(-) diff --git a/.gitignore b/.gitignore index 0d20b64..5a77cbc 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ *.pyc +test +TestFiles diff --git a/README.md b/README.md index 7e0933c..a742196 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,39 @@ # CONTRA +CONTRA: copy number analysis for targeted resequencing. +Li J, Lupat R, Amarasinghe KC, Thompson ER, Doyle MA, Ryland GL, Tothill RW, Halgamuge SK, Campbell IG, Gorringe KL. +Bioinformatics Core Facility, Peter MacCallum Cancer Centre, VIC 3002, Australia. Jason.Li@petermac.org + +Method published 2012 in Bioinformatics doi: [10.1093/bioinformatics/bts146]{https://doi.org/10.1093/bioinformatics/bts146} + + +previously available through http://contra-cnv.sourceforge.net/ + + +# Changelog +v2.1.0 +Uses python3 +Improved performance through multithreading and restructuring of loops +Autodetection of file formats (work in progress) + +v2.0.8 +Included NDE and WGCNV workflows. +Removed FASTA dependency. Removed VCF support. Removed PDF file. 
+Updated online documentation. + +v2.0.7 +Added --version + +v2.0.6 +Added option to remove Duplicates + + +v2.0.4 +Fixed a bug with "chr" named chromosomes in cn_apply_threshold +Catch errors with bam files do not contain reads in a targeted chromosome. + + +v2.0.3 +Fixed baseline.py scalability to large sample size +Now compatable with R2.14+ diff --git a/contra.py b/contra.py index abfed0a..fc84f7e 100755 --- a/contra.py +++ b/contra.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # ----------------------------------------------------------------------# # Copyright (c) 2011, Richard Lupat & Jason Li. @@ -19,13 +19,13 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . # -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 23 July 2012 16:43PM import os -from optparse import OptionParser +import argparse import sys import subprocess import shlex @@ -42,585 +42,701 @@ from scripts.count_libsize import * from scripts.target_breakdown import * -#VERSION -VERSION="2.0.8" +# VERSION +VERSION = "2.0.10" -#Absolute Path +# Absolute Path scriptPath = os.path.realpath(os.path.dirname(sys.argv[0])) + class Params: - """ - Class for top-level system parameters - """ - - def __init__(self): - # command-line option definition - self.parser = OptionParser() - self.parser.add_option("-t", "--target", - help="Target region definition file [REQUIRED] [BED Format]", - action="store", type="string", dest="target") - self.parser.add_option("-s", "--test", - help="Alignment file for the test sample [REQUIRED] [BAM/SAM]", - action="store", type="string", dest="test") - self.parser.add_option("-c", "--control", - help="Alignment file for the control sample [REQUIRED] [BAM/SAM]", - action="store", type="string", dest="control") - self.parser.add_option("-f", "--fasta", - help="Reference Genome 
[NOT REQUIRED since v2.0.8][FASTA]", - action="store", type="string", dest="fasta") - self.parser.add_option("-o", "--outFolder", - help="the output folder path name to store the output of analysis [REQUIRED]", - action="store", type="string", dest="outFolder") - self.parser.add_option("--numBin", - help="Numbers of bins to group regions. User can specify multiple experiments with different number of bins (comma separated) [20]", - action="store", type="string", dest="numBin", default="20") - self.parser.add_option("--minReadDepth", - help="The threshold for minimum read depth for each bases [10]", - action="store", type="string", dest="minReadDepth", default=10) - self.parser.add_option("--minNBases", - help="The threshold for minimum number of bases for each target regions [10]", - action="store", type="string", dest="minNBases", default= 10) - self.parser.add_option("--sam", - help="If the specified, test and control sample are in SAM [False]", - action="store_true", dest="sam", default="False") - self.parser.add_option("--bed", - help="if specified, control will be in BED format [False]", - action="store_true", dest = "bedInput", default="False") - self.parser.add_option("--pval", - help="The p-value threshold for filtering [0.05]. 
Applies to Adjusted P-Value.", - action="store", type="string", dest="pval", default=0.05) - self.parser.add_option("--sampleName", - help ="The name to be appended to the front of default output name ['']", - action="store", type="string", dest="sampleName", default='') - self.parser.add_option("--nomultimapped", - help="The option to remove multi-mapped reads [False]", - action="store_true", dest="nomultimapped",default="False") - self.parser.add_option("-p", "--plot", - help="Plots log-ratio distribution for each bin [False]", - action="store_true", dest="plot", default="False") - self.parser.add_option("--minExon", - help="Minimum number of Exons in one bin (if less than this, bin that contains small number of exons" - +"will be moved to the adjacent bins) [2000] ", - action="store", type="string", dest="minExon", default="2000") - self.parser.add_option("--minControlRdForCall", - help="Minimum control readdepth for call [5]", - action="store", type="string", dest="minControl", default="5") - - self.parser.add_option("--minTestRdForCall", - help="Minimum test readdepth for call [0]", - action="store", type="string", dest="minTest", default="0") - - self.parser.add_option("--minAvgForCall", - help="Minimum average coverage for call [20]", - action="store", type="string", dest="minAvg", default="20") - - self.parser.add_option("--maxRegionSize", - help="Maximum Region Size in target region (for breaking large region into smaller region. By default, maxRegionSize 0 means no breakdown) [0]", - action="store", type="string", dest="maxRegionSize", default="0") - - self.parser.add_option("--targetRegionSize", - help="Target Region Size for breakdown (if maxRegionSize is non zero) [200]", - action="store", type="string", dest="targetRegionSize", default="200") - - self.parser.add_option("-l", "--largeDeletion", - help="if specified, CONTRA will run large deletion analysis (CBS). User must have DNAcopy R-library installed to run the analysis. 
[False]", - action="store_true", dest = "large", default="False") - - self.parser.add_option("--smallSegment", - help="CBS segment size for calling large variations [1]", - action="store", type="string", dest="smallSegment", default="1") - - self.parser.add_option("--largeSegment", - help="CBS segment size for calling large variations [25]", - action="store", type="string", dest="largeSegment", default="25") - - self.parser.add_option("--lrCallStart", - help="Log ratios start range that will be used to call CNV [-0.3]", - action="store", type="string", dest="lrs", default="-0.3") - - self.parser.add_option("--lrCallEnd", - help="Log ratios end range that will be used to call CNV [0.3]", - action="store", type="string", dest="lre", default="0.3") - - self.parser.add_option("--passSize", - help="Size of exons that passed the p-value threshold compare to the original exon size [0.35]", - action="store", type="string", dest="passSize", default="0.35") - - ### - self.parser.add_option("--removeDups", - help="if specified, will remove PCR duplicates [False]", - action="store_true", dest = "removeDups", default="False") - self.parser.add_option("--version", - help="Returns version", - action="store_true", dest = "version") - - # required parameters list - self.ERRORLIST = [] - - # change system parameters based on any command line - (options, args) = self.parser.parse_args() - if options.target: - self.TARGET = options.target - else: - #self.parser.print_help() - #self.parser.error("--target not supplied") - self.ERRORLIST.append("target") - - if options.test: - self.TEST = options.test - else: - #self.parser.error("--test not supplied") - self.ERRORLIST.append("test") - - if options.control: - self.CONTROL = options.control - else: - #self.parser.error("--control not supplied") - self.ERRORLIST.append("control") - -# if options.fasta: -# self.FASTA = options.fasta -# else: -# #self.parser.error("--fasta not supplied") -# self.ERRORLIST.append("fasta") - - if 
options.outFolder: - self.OUTFOLDER = options.outFolder - else: - #self.parser.error("--outFolder not supplied") - self.ERRORLIST.append("outfolder") - - if len(self.ERRORLIST) != 0: - self.parser.print_help() - self.parser.error("Missing required parameters") - - if options.numBin: - binsNumber = options.numBin.split(",") - try: - self.NUMBIN = [int(j) for j in binsNumber] - except: - self.NUMBIN = [20] - if options.minReadDepth: - self.MINREADDEPTH = int(options.minReadDepth) - if options.minNBases: - self.MINNBASES = int(options.minNBases) - if options.sam: - self.SAM = str(options.sam) - if options.pval: - self.PVAL = options.pval - if options.sampleName: - self.SAMPLENAME = options.sampleName - else: - self.SAMPLENAME = 'No-SampleName' - if options.nomultimapped: - self.NOMULTIMAPPED = str(options.nomultimapped) - if options.plot: - self.PLOT = str(options.plot) - if options.bedInput: - self.BEDINPUT = options.bedInput - if options.minExon: - self.MINEXON = int(options.minExon) - if options.minControl: - self.MINCONTROL = options.minControl - if options.minTest: - self.MINTEST = options.minTest - if options.minAvg: - self.MINAVG = options.minAvg - if options.maxRegionSize: - self.MAXREGIONSIZE = int(options.maxRegionSize) - if options.targetRegionSize: - self.TARGETREGIONSIZE = int(options.targetRegionSize) - if options.large: - self.LARGE = str(options.large) - if options.smallSegment: - self.SMALLSEGMENT = options.smallSegment - if options.largeSegment: - self.LARGESEGMENT = options.largeSegment - if options.lre: - self.LRE = options.lre - if options.lrs: - self.LRS = options.lrs - if options.passSize: - self.PASSSIZE = options.passSize - - ### either "False" or True atn - if options.removeDups: - self.REMOVEDUPS = str(options.removeDups) - - def repeat(self): - # params test - print "target :", self.TARGET - print "test :", self.TEST - print "control :", self.CONTROL -# print "fasta :", self.FASTA - print "outfolder :", self.OUTFOLDER - print "numBin :", 
self.NUMBIN - print "minreaddepth :", self.MINREADDEPTH - print "minNBases :", self.MINNBASES - print "sam :", self.SAM - print "pval :", self.PVAL - print "sampleName :", self.SAMPLENAME - print "nomultimapped :", self.NOMULTIMAPPED - print "plot :", self.PLOT - print "bedInput :", self.BEDINPUT - print "minExon :", self.MINEXON - print "largeDeletion :", self.LARGE - print "removeDups :", self.REMOVEDUPS + """ + Class for top-level system parameters + """ + + def __init__(self): + # command-line option definition + self.parser = argparse.ArgumentParser() + # we remove the optionals so the required come first + self.parser._action_groups.pop() + + # create the group of required params + self.required = self.parser.add_argument_group("required arguments") + + # create the group of optional + self.optional = self.parser.add_argument_group("optional arguments") + + self.required.add_argument( + "-t", + "--target", + help="Target region definition file [BED Format]", + required=True, + ) + self.required.add_argument( + "-s", + "--test", + help="Alignment file for the test sample [BAM/SAM]", + required=True, + ) + self.required.add_argument( + "-c", + "--control", + help="Alignment file for the control sample [BAM/SAM]", + required=True, + ) + self.optional.add_argument("-f", "--fasta", help="Reference Genome [FASTA]") + self.required.add_argument( + "-o", + "--outFolder", + help="the output folder path name to store the output of analysis", + required=True, + ) + self.optional.add_argument( + "--numBin", + help="Numbers of bins to group regions. 
User can specify multiple experiments with different number of bins (comma separated) [20]", + default="20", + ) + self.optional.add_argument( + "--minReadDepth", + help="The threshold for minimum read depth for each bases [10]", + type=int, + default=10, + ) + self.optional.add_argument( + "--minNBases", + help="The threshold for minimum number of bases for each target regions [10]", + type=int, + default=10, + ) + self.optional.add_argument( + "--sam", + help="If the specified, test and control sample are in SAM [False]", + action="store_true", + dest="sam", + default=False, + ) + self.optional.add_argument( + "--bed", + help="if specified, control will be in BED format [False]", + action="store_true", + dest="bedInput", + default=False, + ) + self.optional.add_argument( + "--pval", + help="The p-value threshold for filtering [0.05]. Applies to Adjusted P-Value.", + type=float, + dest="pval", + default=0.05, + ) + self.optional.add_argument( + "--sampleName", + help="The name to be appended to the front of default output name ['']", + default="", + ) + self.optional.add_argument( + "--nomultimapped", + help="The option to remove multi-mapped reads [False]", + action="store_true", + default=False, + ) + self.optional.add_argument( + "-p", + "--plot", + help="Plots log-ratio distribution for each bin [False]", + action="store_true", + default=False, + ) + self.optional.add_argument( + "--minExon", + help="Minimum number of Exons in one bin (if less than this, bin that contains small number of exons" + + "will be moved to the adjacent bins) [2000] ", + type=int, + default=100, + ) + self.optional.add_argument( + "--minControlRdForCall", + help="Minimum control readdepth for call [5]", + type=int, + dest="minControl", + default=5, + ) + + self.optional.add_argument( + "--minTestRdForCall", + help="Minimum test readdepth for call [0]", + type=int, + dest="minTest", + default=0, + ) + + self.optional.add_argument( + "--minAvgForCall", + help="Minimum average coverage 
for call [20]", + type=int, + dest="minAvg", + default=20, + ) + + self.optional.add_argument( + "--maxRegionSize", + help="Maximum Region Size in target region (for breaking large region into smaller region. By default, maxRegionSize 0 means no breakdown) [0]", + default=0, + ) + + self.optional.add_argument( + "--targetRegionSize", + help="Target Region Size for breakdown (if maxRegionSize is non zero) [200]", + type=int, + default=100, + ) + + self.optional.add_argument( + "-l", + "--largeDeletion", + help="if specified, CONTRA will run large deletion analysis (CBS). User must have DNAcopy R-library installed to run the analysis. [False]", + action="store_true", + dest="large", + default=False, + ) + + self.optional.add_argument( + "--smallSegment", + help="CBS segment size for calling large variations [1]", + type=int, + default=1, + ) + + self.optional.add_argument( + "--largeSegment", + help="CBS segment size for calling large variations [25]", + type=int, + default=25, + ) + + self.optional.add_argument( + "--lrCallStart", + help="Log ratios start range that will be used to call CNV [-0.3]", + type=float, + dest="lrs", + default=-0.3, + ) + + self.optional.add_argument( + "--lrCallEnd", + help="Log ratios end range that will be used to call CNV [0.3]", + type=float, + dest="lre", + default=0.3, + ) + + self.optional.add_argument( + "--passSize", + help="Size of exons that passed the p-value threshold compare to the original exon size [0.35]", + type=float, + default=0.35, + ) + + ### + self.optional.add_argument( + "--removeDups", + help="if specified, will remove PCR duplicates [False]", + action="store_true", + default=False, + ) + self.optional.add_argument("--version", action="version", version=VERSION) + + # required parameters list + self.ERRORLIST = [] + + # change system parameters based on any command line + options = self.parser.parse_args() + print(options) + + if options.target: + self.TARGET = options.target + else: + # self.parser.print_help() 
+ # self.parser.error("--target not supplied") + self.ERRORLIST.append("target") + + if options.test: + if not os.path.isfile(options.test): + self.ERRORLIST.append("test") + self.TEST = options.test + + else: + # self.parser.error("--test not supplied") + self.ERRORLIST.append("test") + + if options.control: + if not os.path.isfile(options.test): + self.ERRORLIST.append("control") + self.CONTROL = options.control + else: + # self.parser.error("--control not supplied") + self.ERRORLIST.append("control") + + if options.outFolder: + self.OUTFOLDER = options.outFolder + else: + # self.parser.error("--outFolder not supplied") + self.ERRORLIST.append("outfolder") + + if len(self.ERRORLIST) != 0: + # self.parser.print_help() + self.parser.error("Missing required parameters: " + str(self.ERRORLIST)) + + if options.numBin: + binsNumber = options.numBin.split(",") + try: + self.NUMBIN = [int(j) for j in binsNumber] + except: + self.NUMBIN = [20] + + self.MINREADDEPTH = options.minReadDepth + self.MINNBASES = options.minNBases + self.SAM = options.sam + self.PVAL = options.pval + + if options.sampleName: + self.SAMPLENAME = options.sampleName + else: + self.SAMPLENAME = "No-SampleName" + + self.NOMULTIMAPPED = options.nomultimapped + self.PLOT = options.plot + self.BEDINPUT = options.bedInput + self.MINEXON = options.minExon + self.MINCONTROL = options.minControl + self.MINTEST = options.minTest + self.MINAVG = options.minAvg + self.MAXREGIONSIZE = options.maxRegionSize + self.TARGETREGIONSIZE = options.targetRegionSize + self.LARGE = options.large + self.SMALLSEGMENT = options.smallSegment + self.LARGESEGMENT = options.largeSegment + self.LRE = options.lre + self.LRS = options.lrs + self.PASSSIZE = options.passSize + self.REMOVEDUPS = options.removeDups + + if self.BEDINPUT and self.SAM: + print("cant parse sam and BED input, setting input to BED") + self.SAM = False + + def repeat(self): + # params test + print("target :", self.TARGET) + print("test :", self.TEST) + 
print("control :", self.CONTROL) + # print "fasta :", self.FASTA + print("outfolder :", self.OUTFOLDER) + print("numBin :", self.NUMBIN) + print("minreaddepth :", self.MINREADDEPTH) + print("minNBases :", self.MINNBASES) + print("sam :", self.SAM) + print("pval :", self.PVAL) + print("sampleName :", self.SAMPLENAME) + print("nomultimapped :", self.NOMULTIMAPPED) + print("plot :", self.PLOT) + print("bedInput :", self.BEDINPUT) + print("minExon :", self.MINEXON) + print("largeDeletion :", self.LARGE) + print("removeDups :", self.REMOVEDUPS) + + def checkOutputFolder(outF): - print "Creating Output Folder :", - - if outF[len(outF)-1] == "/": - outF = outF[:len(outF)-1] - - try: - os.mkdir(outF) - except: - print "output folder already exists: " + outF -# print "cannot create folder '%s'" %outF -# print "if folder already exist, please specify other folder" -# sys.exit(1) - - try: - os.mkdir(outF+"/table") - os.mkdir(outF+"/plot") - os.mkdir(outF+"/buf") - os.mkdir(outF+"/buf/ctrData/") - os.mkdir(outF+"/buf/testData/") - except: - print "[ERROR: CANNOT CREATE SUBFOLDERS]" - sys.exit(1) - - print " Done." 
- - return outF - - -#BEDINPUT + print("Creating Output Folder :"), + + if outF[len(outF) - 1] == "/": + outF = outF[: len(outF) - 1] + + try: + os.makedirs(outF, exist_ok=True) + except: + raise IOError("Could not create outputFolder") + # print "cannot create folder '{}'" %outF + # print "if folder already exist, please specify other folder" + # sys.exit(1) + + try: + os.makedirs(outF + "/table", exist_ok=True) + os.makedirs(outF + "/plot", exist_ok=True) + + # dont need to create parent folder with makedirs + os.makedirs(outF + "/buf/ctrData/", exist_ok=True) + os.makedirs(outF + "/buf/testData/", exist_ok=True) + except: + raise IOError("[ERROR: CANNOT CREATE SUBFOLDERS]") + + print(" Done.") + + return outF + + +# BEDINPUT def countTotalReads3(params, folder): - tempFileName = folder + "/temp.txt" - tempReadFile = open(tempFileName, "w") - libsize = get_libsize(params.BEDINPUT) - tempReadFile.write(libsize) - #tempReadFile.write(params.CONTROLREADCOUNT) - tempReadFile.close() + tempFileName = folder + "/temp.txt" + tempReadFile = open(tempFileName, "w") + libsize = get_libsize(params.BEDINPUT) + tempReadFile.write(libsize) + tempReadFile.close() def countTotalReads(params, folder): - if 'testData' in folder: - inF = params.TEST - else: - inF = params.CONTROL + if "testData" in folder: + inF = params.TEST + else: + inF = params.CONTROL + + # Get Total ReadCount TODO: use + getreadcount = os.system( + "samtools view {} | wc -l > {}/temp.txt".format(inF, folder) + ) - # Get Total ReadCount - getreadcount = os.system("samtools view %s | wc -l > %s/temp.txt" %(inF,folder)) def samToBam(samfile, bamfile): - args = shlex.split("samtools view -bS %s -o %s" %(samfile, bamfile)) - samtobam = subprocess.call(args) + args = shlex.split("samtools view -bS {} -o {}".format(samfile, bamfile)) + samtobam = subprocess.call(args) + + return bamfile - return bamfile def removeMultiMapped(inF, newBAM): - # Get New BAM Files with mapping quality > 0 - args = shlex.split("samtools 
view -bq 1 %s -o %s" %(inF, newBAM)) - removeMM = subprocess.call(args) - print "Multi mapped reads removed. " - -def removeDups(inF, newBAM): - # Remove - args = shlex.split("samtools view -b -F 0x400 %s -o %s" %(inF, newBAM)) - removeDupsCall = subprocess.call(args) - print "Removed PCR duplicates. " - -#BEDINPUT -def convertBamSimple(params, folder, targetList, genomeFile): - if 'testData' in folder: - inF = params.TEST - print "Converting TEST Sample... " - else: - inF = params.CONTROL - print "Converting CONTROL Sample... " - - #Copy file to working folder - os.system("cp %s %s" %(inF, folder+"sample.BEDGRAPH")) + # Get New BAM Files with mapping quality > 0 + args = shlex.split("samtools view -bq 1 {} -o {}".format(inF, newBAM)) + removeMM = subprocess.call(args) + print("Multi mapped reads removed.") - # Split Bedgraph by its chromosomes - splitByChromosome(folder) - # Slice the coverage files to only cover the targeted regions - print "Getting targeted regions DOC..." - convertGeneCoordinate(targetList, folder) - - # LIBSIZE - libsize = str(get_libsize(folder+"geneRefCoverage2.txt")) - tempLibSize = open(folder + "/temp.txt", "w") - tempLibSize.write(libsize) - tempLibSize.close() +def removeDups(inF, newBAM): + # Remove dups + args = shlex.split("samtools view -b -F 0x400 {} -o {}".format(inF, newBAM)) + removeDupsCall = subprocess.call(args) + print("Removed PCR duplicates.") - print "Targeted regions pre-processing: Done" +# BEDINPUT +def convertBamSimple(params, folder, targetList, genomeFile): + if "testData" in folder: + inF = params.TEST + print("Converting TEST Sample... ") + else: + inF = params.CONTROL + print("Converting CONTROL Sample... ") -def convertBam(params, folder, targetList, genomeFile): - if 'testData' in folder: - inF = params.TEST - print "Converting TEST Sample... " - else: - inF = params.CONTROL - print "Converting CONTROL Sample... 
" - - # Convert BAM Files to BEDGRAPH - bedgraph = folder + "sample.BEDGRAPH" - args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" %(inF, genomeFile)) - print "DEBUG 123 " + " ".join(args) - #output = subprocess.Popen(args, stdout = subprocess.PIPE).communicate()[0] - iOutFile = open(bedgraph, "w") - #iOutFile.write(output) - output = subprocess.Popen(args, stdout = iOutFile).wait() - iOutFile.close() - - # Split Bedgraph by its chromosomes - splitByChromosome(folder) - - # Slice the coverage files to only cover the targeted regions - print "Getting targeted regions DOC..." - convertGeneCoordinate(targetList, folder) - - # LIBSIZE - libsize = str(get_libsize(folder+"geneRefCoverage2.txt")) - tempLibSize = open(folder + "temp.txt", "w") - tempLibSize.write(libsize) - tempLibSize.close() - - print "Targeted regions pre-processing: Done" + # Copy file to working folder + os.system("cp {} {}".format(inF, folder + "sample.BEDGRAPH")) -def analysisPerBin(params, num_bin, outFolder, targetList): - import shutil - - bufLoc = outFolder + "/buf" - # Assign bin number to the median and average file - numBin = assignBin(num_bin, bufLoc+"/average.txt", bufLoc+"/bin", targetList, params.MINEXON) - - #copy bin_boundary to plot folder - #outBounFile = os.path.join(outFolder, "plot", "bin_boundary"+str(num_bin)) - #curBounFile = os.path.join(bufLoc, "bin" + str(num_bin) + ".boundaries.txt") - #shutil.copy(curBounFile, outBounFile) - - - print "Significance Test ... " - rScriptName = os.path.join(scriptPath, "scripts", "cn_analysis.v3.R") - args = shlex.split("Rscript %s %s %s %s %s %s %s %s %s %s %s" - %(rScriptName, num_bin, params.MINREADDEPTH, params.MINNBASES, outFolder, params.SAMPLENAME,params.PLOT, numBin, params.MINCONTROL, params.MINTEST, params.MINAVG)) - rscr = subprocess.call(args) - - - print "Generating Output Files ... 
" - # Analysis of CNV - tNameList = os.listdir(outFolder+"/table/") - if num_bin > 1: - tNameId = str(num_bin) + "bins" - else: - tNameId = str(num_bin) + "bin" - for tName in tNameList: - if tNameId in tName: - break - - if "CNATable" in tName: - tName = tName[:len(tName)-4] - tableName = outFolder + "/table/" + tName - bufTable = bufLoc + "/" + tName - applyThreshold(tableName, bufTable, params.PVAL, 100000) #params.MAXGAP = 100000 - - # Large Region CBS - if (params.LARGE != "False"): - print "DEBUG 266a" - rScriptName2 = os.path.join(scriptPath, "scripts", "large_region_cbs.R") - args = shlex.split("Rscript %s %s %s %s %s %s %s %s %s" - %(rScriptName2, tableName+".txt", params.SMALLSEGMENT, params.LARGESEGMENT, params.PVAL, params.PASSSIZE, params.LRS, params.LRE, bufLoc)) - rscr2 = subprocess.call(args) - print str(args) - else: - print "DEBUG 266b" - - # Generate the DNA sequence (for VCF file) - print "Skipping VCF generation.. use tabular file instead." -# bedFile = bufTable + ".BED" -# bedFasta = bufTable + ".fastaOut.txt" -# fastaFile = params.FASTA -# args = shlex.split("fastaFromBed -fi %s -bed %s -fo %s -name" -# %(fastaFile, bedFile, bedFasta)) -# print (" ".join(args)) -# fastaBED = subprocess.call(args) -# -# # Write VCF -# print "Creating VCF file ... " -# vcfFile = tableName + ".vcf" -# vcf_out(bedFasta, vcfFile) -# -# print "%s created. 
" %(vcfFile) + # Split Bedgraph by its chromosomes + splitByChromosome(folder) - else: - print "Table not found" + # Slice the coverage files to only cover the targeted regions + print("Getting targeted regions DOC...") + convertGeneCoordinate(targetList, folder) -def removeTempFolder(tempFolderPath): - import shutil - - shutil.rmtree(tempFolderPath) + # LIBSIZE + libsize = str(get_libsize(folder + "geneRefCoverage2.txt")) + tempLibSize = open(folder + "/temp.txt", "w") + tempLibSize.write(libsize) + tempLibSize.close() - print "Temp Folder Removed" + print("Targeted regions pre-processing: Done") -def main(): - if len(sys.argv) == 2 and sys.argv[1] == "--version": - print VERSION - sys.exit(1) - - # option handling - params = Params() - params.repeat() +def convertBam(params, folder, targetList, genomeFile): + if "testData" in folder: + inF = params.TEST + print("Converting TEST Sample... ") + else: + inF = params.CONTROL + print("Converting CONTROL Sample... ") - # output folder handling - outFolder = checkOutputFolder(params.OUTFOLDER) - bufLoc = outFolder + "/buf" + # Convert BAM Files to BEDGRAPH + bedgraph = folder + "sample.BEDGRAPH" + args = shlex.split("genomeCoverageBed -ibam {} -bga -g {}".format(inF, genomeFile)) - # convert target file - sorted_target = os.path.join(bufLoc, "target.BED") - os.system("sort -k1,1 -k2n %s > %s" %(params.TARGET, sorted_target)) + # output = subprocess.Popen(args, stdout = subprocess.PIPE).communicate()[0] + iOutFile = open(bedgraph, "w") + # iOutFile.write(output) + output = subprocess.Popen(args, stdout=iOutFile).wait() + iOutFile.close() - # target breakdown - if params.MAXREGIONSIZE > 0: - new_target = os.path.join(bufLoc, "target_breakdown.BED") - target_breakdown(sorted_target, params.MAXREGIONSIZE, params.TARGETREGIONSIZE, new_target) - sorted_target = new_target + # Split Bedgraph by its chromosomes + splitByChromosome(folder) - targetList = convertTarget(sorted_target) + # Slice the coverage files to only cover the 
targeted regions + print("Getting targeted regions DOC...") + convertGeneCoordinate(targetList, folder) - # convert sam to bam if -sam specified - if (params.SAM == "True"): - print "Pre-processing SAM files" + # LIBSIZE + libsize = str(get_libsize(folder + "geneRefCoverage2.txt")) + tempLibSize = open(folder + "temp.txt", "w") + tempLibSize.write(libsize) + tempLibSize.close() - test_bam = bufLoc + "/test.BAM" - ctr_bam = bufLoc + "/control.BAM" + print("Targeted regions pre-processing: Done") - samTest = Process(target= samToBam, args=(params.TEST, test_bam)) - if params.BEDINPUT == "False": - samCtr = Process(target= samToBam, args=(params.CONTROL, ctr_bam)) - samTest.start() - if params.BEDINPUT == "False": - samCtr.start() +def analysisPerBin(params, num_bin, outFolder, targetList): + import shutil + + bufLoc = outFolder + "/buf" + # Assign bin number to the median and average file + numBin = assignBin( + num_bin, bufLoc + "/average.txt", bufLoc + "/bin", targetList, params.MINEXON + ) + + print("Significance Test ... ") + rScriptName = os.path.join(scriptPath, "scripts", "cn_analysis.v3.R") + args = shlex.split( + "Rscript {} {} {} {} {} {} {} {} {} {} {}".format( + rScriptName, + num_bin, + params.MINREADDEPTH, + params.MINNBASES, + outFolder, + params.SAMPLENAME, + params.PLOT, + numBin, + params.MINCONTROL, + params.MINTEST, + params.MINAVG, + ) + ) + rscr = subprocess.call(args) + + print("Generating Output Files ... 
") + # Analysis of CNV + tNameList = os.listdir(outFolder + "/table/") + if num_bin > 1: + tNameId = str(num_bin) + "bins" + else: + tNameId = str(num_bin) + "bin" + + for tName in tNameList: + if tNameId in tName: + break + + if "CNATable" in tName: + tName = tName[: len(tName) - 4] + tableName = outFolder + "/table/" + tName + bufTable = bufLoc + "/" + tName + applyThreshold( + tableName, bufTable, params.PVAL, 100000 + ) # params.MAXGAP = 100000 + + # Large Region CBS + if params.LARGE != "False": + print("DEBUG 266a") + rScriptName2 = os.path.join(scriptPath, "scripts", "large_region_cbs.R") + args = shlex.split( + "Rscript {} {} {} {} {} {} {} {} {}".format( + rScriptName2, + tableName + ".txt", + params.SMALLSEGMENT, + params.LARGESEGMENT, + params.PVAL, + params.PASSSIZE, + params.LRS, + params.LRE, + bufLoc, + ) + ) + rscr2 = subprocess.call(args) + print(str(args)) + else: + print("DEBUG 266b") - samTest.join() - if params.BEDINPUT == "False": - samCtr.join() + # Generate the DNA sequence (for VCF file) + print("Skipping VCF generation.. 
use tabular file instead.") - params.TEST = test_bam - if params.BEDINPUT == "False": - params.CONTROL = ctr_bam + else: + print("Table not found") - # remove multi mapped reads if --nomultimapped is specified - if (params.NOMULTIMAPPED == "True"): - print "Removing multi-mapped reads" - test_bam = bufLoc + "/test_reliable.BAM" - ctr_bam = bufLoc + "/control_reliable.BAM" +def removeTempFolder(tempFolderPath): + import shutil - bamTest = Process(target= removeMultiMapped, args=(params.TEST, test_bam)) - if params.BEDINPUT == "False": - bamCtr = Process(target= removeMultiMapped, args=(params.CONTROL, ctr_bam)) + shutil.rmtree(tempFolderPath) - bamTest.start() - if params.BEDINPUT == "False": - bamCtr.start() + print("Temp Folder Removed") - bamTest.join() - if params.BEDINPUT == "False": - bamCtr.join() - params.TEST = test_bam - if params.BEDINPUT == "False": - params.CONTROL = ctr_bam +def main(): + + # TODO: use argparse + # if len(sys.argv) == 2 and sys.argv[1] == "--version": + # print(VERSION) + # sys.exit(1) + + # option handling + params = Params() + params.repeat() + + # output folder handling + outFolder = checkOutputFolder(params.OUTFOLDER) + bufLoc = outFolder + "/buf" + + # convert target file + sorted_target = os.path.join(bufLoc, "target.BED") + os.system("sort -k1,1 -k2n {} > {}".format(params.TARGET, sorted_target)) + + # target breakdown + if params.MAXREGIONSIZE > 0: + new_target = os.path.join(bufLoc, "target_breakdown.BED") + target_breakdown( + sorted_target, params.MAXREGIONSIZE, params.TARGETREGIONSIZE, new_target + ) + sorted_target = new_target + + targetList = convertTarget(sorted_target) + + # convert sam to bam if -sam specified + if params.SAM: + print("Pre-processing SAM files") + + test_bam = bufLoc + "/test.BAM" + ctr_bam = bufLoc + "/control.BAM" + + samTest = Process(target=samToBam, args=(params.TEST, test_bam)) + if not params.BEDINPUT: + samCtr = Process(target=samToBam, args=(params.CONTROL, ctr_bam)) + + samTest.start() + if 
not params.BEDINPUT: + samCtr.start() + + samTest.join() + if not params.BEDINPUT: + samCtr.join() + + params.TEST = test_bam + if not params.BEDINPUT: + params.CONTROL = ctr_bam + + # remove multi mapped reads if --nomultimapped is specified + if params.NOMULTIMAPPED: + print("Removing multi-mapped reads") + test_bam = bufLoc + "/test_reliable.BAM" + ctr_bam = bufLoc + "/control_reliable.BAM" + + bamTest = Process(target=removeMultiMapped, args=(params.TEST, test_bam)) + if not params.BEDINPUT: + bamCtr = Process(target=removeMultiMapped, args=(params.CONTROL, ctr_bam)) + bamTest.start() + + if not params.BEDINPUT: + bamCtr.start() + bamTest.join() + + if not params.BEDINPUT: + bamCtr.join() + params.TEST = test_bam + if not params.BEDINPUT: + params.CONTROL = ctr_bam ### - # Remove PCR duplicates if --removeDups specified - if (params.REMOVEDUPS == "True"): - print "Removing reads marked as duplicates (PCR)" - - test_bam = bufLoc + "/test_removedups.BAM" - ctr_bam = bufLoc + "/control_removedups.BAM" - - bamTest = Process(target = removeDups, args=(params.TEST, test_bam)) - if params.BEDINPUT == "False": - bamCtr = Process(target = removeDups, args=(params.CONTROL, ctr_bam)) - - bamTest.start() - if params.BEDINPUT == "False": - bamCtr.start() - - bamTest.join() - if params.BEDINPUT == "False": - bamCtr.join() - - params.TEST = test_bam - if params.BEDINPUT == "False": - params.CONTROL = ctr_bam - - - # Get Chromosome Length - genomeFile = bufLoc + '/sample.Genome' - get_genome(params.TEST, genomeFile) - - # spawn bam converting scripts - pTest = Process(target= convertBam, - args=(params, bufLoc+'/testData/', targetList, genomeFile)) - - #BEDINPUT - if params.BEDINPUT == "False": - - cTest = Process(target= convertBam, - args=(params, bufLoc+'/ctrData/' , targetList, genomeFile)) - else: - cTest = Process(target= convertBamSimple, - args=(params, bufLoc+'/ctrData/', targetList, genomeFile)) - # start the processes - pTest.start() - cTest.start() - - # wait for 
all the processes to finish before continuing - pTest.join() - cTest.join() - - # Get the read depth count from temporary folder - for folder in [bufLoc+'/testData/', bufLoc+'/ctrData/']: - if 'testData' in folder: - t1 = int(file.readlines(open(folder+"temp.txt"))[0].strip("\n")) - else: - n1 = int(file.readlines(open(folder+"temp.txt"))[0].strip("\n")) - print "Test file read depth = ", t1 - print "Control file read depth = ", n1 - print "Pre-processing Completed. " - - # Get the Average of the Log Ratio - print "Getting the Log Ratio ... " - testListName = bufLoc + '/testData/geneRefCoverage.txt' - controlListName = bufLoc + '/ctrData/geneRefCoverage.txt' - avOut = bufLoc + "/average.txt" - averageCount(testListName, controlListName, avOut, t1, n1, params.MINREADDEPTH, params.MINNBASES) - - # Analysis. [Bin, significance test, large deletion, vcf output] - print "Binning ... " - binProc = [] - for numBin in params.NUMBIN: - binProc.append(Process(target= analysisPerBin, args=(params,numBin,outFolder,targetList))) - - for proc in binProc: - proc.start() - - for proc in binProc: - proc.join() - - # Removed Temp Folder - removeTempFolder(bufLoc) - + # Remove PCR duplicates if --removeDups specified + if params.REMOVEDUPS: + print("Removing reads marked as duplicates (PCR)") + + test_bam = bufLoc + "/test_removedups.BAM" + ctr_bam = bufLoc + "/control_removedups.BAM" + + bamTest = Process(target=removeDups, args=(params.TEST, test_bam)) + + if not params.BEDINPUT: + bamCtr = Process(target=removeDups, args=(params.CONTROL, ctr_bam)) + + bamTest.start() + if not params.BEDINPUT: + bamCtr.start() + bamTest.join() + if not params.BEDINPUT: + bamCtr.join() + + params.TEST = test_bam + if not params.BEDINPUT: + params.CONTROL = ctr_bam + + # Get Chromosome Length + genomeFile = bufLoc + "/sample.Genome" + get_genome(params.TEST, genomeFile) + + # spawn bam converting scripts + pTest = Process( + target=convertBam, args=(params, bufLoc + "/testData/", targetList, 
genomeFile) + ) + + # BEDINPUT + if not params.BEDINPUT: + + cTest = Process( + target=convertBam, + args=(params, bufLoc + "/ctrData/", targetList, genomeFile), + ) + else: + cTest = Process( + target=convertBamSimple, + args=(params, bufLoc + "/ctrData/", targetList, genomeFile), + ) + # start the processes + pTest.start() + cTest.start() + + # wait for all the processes to finish before continuing + pTest.join() + cTest.join() + + # Get the read depth count from temporary folder + for folder in [bufLoc + "/testData/", bufLoc + "/ctrData/"]: + fh = open(folder + "temp.txt") + tmpDP = int(fh.readlines()[0].strip("\n")) + + if "testData" in folder: + t1 = tmpDP + else: + n1 = tmpDP + + print("Test file read depth = ", t1) + print("Control file read depth = ", n1) + print("Pre-processing Completed. ") + + # Get the Average of the Log Ratio + print("Getting the Log Ratio ... ") + testListName = bufLoc + "/testData/geneRefCoverage.txt" + controlListName = bufLoc + "/ctrData/geneRefCoverage.txt" + avOut = bufLoc + "/average.txt" + averageCount( + testListName, + controlListName, + avOut, + t1, + n1, + params.MINREADDEPTH, + params.MINNBASES, + ) + + # Analysis. [Bin, significance test, large deletion, vcf output] + print("Binning ... ") + binProc = [] + for numBin in params.NUMBIN: + binProc.append( + Process(target=analysisPerBin, args=(params, numBin, outFolder, targetList)) + ) + + for proc in binProc: + proc.start() + + for proc in binProc: + proc.join() + + # Removed Temp Folder + removeTempFolder(bufLoc) + + if __name__ == "__main__": - main() - print "Done... " + main() + print("Done... ") diff --git a/scripts/assign_bin_number_v2.py b/scripts/assign_bin_number_v2.py index 4eb2660..be19d15 100755 --- a/scripts/assign_bin_number_v2.py +++ b/scripts/assign_bin_number_v2.py @@ -17,137 +17,151 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 12 October 2011 16:43PM def assignBin(binNumber, srcFile, binFile, targetList, minExons): -#def assignBin(binNumber, srcFile, binFile, minExons): - from math import log - - src = file.readlines(open(srcFile)) - #binOut = open(binFile, "w") - - minExons = int(minExons) - count = 0 - logcov_list = [] - - # Get the Log Coverage List for the normal sample - for exon in src: - exon = exon.split() - cov1 = float(exon[7]) - cov2 = float(exon[8]) - cov = (cov1 + cov2) / 2 - if (cov > 0): - logcov = log(cov, 2) - else: - logcov = 0 - logcov_list.append(logcov) - #print "Len of logcov_list:", len(logcov_list) - - # Calculate the boundaries of the bins - minLog = min(logcov_list) - maxLog = max(logcov_list) - boundary = (maxLog-minLog)/binNumber - #print "Min, Max, Boundary, BinNumber: ", minLog, maxLog, boundary, binNumber - - - # Split exons to different bins - bin_list = [] - boundary_dict = {} - for logcov in logcov_list: - i = 1 - set_boundary = minLog + boundary - while (logcov > set_boundary): - i += 1 - set_boundary = minLog + (boundary * i) - #boundary_dict[i] = set_boundary - bin_list.append(i) - - for i in range(binNumber+2): - boundary_dict[i] = minLog + (boundary * i) - - - # Check the number of exons for each bin - # Merge with the adjacent bins if it is too small - for z in range(1, binNumber+1): - element = bin_list.count(z) - #print "Bin", z, "has", element, "elements" - if (element < minExons): - while (bin_list.count(z) != 0): - if (z != binNumber): - bin_list[bin_list.index(z)]+=1 - else: - bin_list[bin_list.index(z)]-=1 - - - # Check the number of exons in the last bin - last_bin_number = sorted(set(bin_list))[-1] - if len(set(bin_list)) > 1: - second_last_bin = sorted(set(bin_list))[-2] - else: - second_last_bin = last_bin_number - element = bin_list.count(last_bin_number) - if 
(element < minExons): - while (bin_list.count(last_bin_number) != 0): - if (last_bin_number != 1): - #bin_list[bin_list.index(last_bin_number)] -= 1 - bin_list[bin_list.index(last_bin_number)] = second_last_bin - - final_number_of_bin = len(set(bin_list)) - - # List out the binning boundaries in a txt file - boundary_list = [boundary_dict[x] for x in sorted(set(bin_list))] - i = 1 - - #boundary_file = binFile + str(final_number_of_bin) + ".boundaries.txt" - boundary_file = binFile + str(binNumber) + ".boundaries.txt" - boOut = open(boundary_file, "w") - boOut.write("\t".join([str(0), str(minLog)])+"\n") - for z in boundary_list: - if (i==final_number_of_bin): - z = maxLog - boOut.write("\t".join([str(i), str(z)])+"\n") - i += 1 - boOut.close() - - # Re-sort the bin numbers - get rid of gaps - curr_z = 1 - bin_number_dict = {} - for z in sorted(set(bin_list)): - bin_number_dict[z] = curr_z - curr_z += 1 - - - # Append the bin number to the original file - #binFile = binFile + str(final_number_of_bin)+".txt" - binFile = binFile + str(binNumber) + ".txt" - binOut = open(binFile, "w") - - for exons in src: - exon = exons.split() - id = int(exon[0]) - gene = exon[1] - exonNumber = exon[5] - - target = targetList[int(id)-1] - if target.id == id: - chr = target.chr - oriStart = target.start - oriEnd = target.end - - else: - print "ERROR..." 
- - f_bin_number = str(bin_number_dict[bin_list[count]]) - binOut.write("\t".join([exons.strip("\n"), f_bin_number,chr, oriStart, oriEnd])) - binOut.write("\n") - count += 1 - - binOut.close() - print "End of assign.bin.number.py with %s exons in %s bins" %(count, final_number_of_bin) - - - return final_number_of_bin - + # def assignBin(binNumber, srcFile, binFile, minExons): + from math import log + + sourceFH = open(srcFile) + src = sourceFH.readlines() + # binOut = open(binFile, "w") + + minExons = int(minExons) + count = 0 + logcov_list = [] + + # Get the Log Coverage List for the normal sample + for exon in src: + exon = exon.split() + cov1 = float(exon[7]) + cov2 = float(exon[8]) + cov = (cov1 + cov2) / 2 + if cov > 0: + logcov = log(cov, 2) + else: + logcov = 0 + logcov_list.append(logcov) + # print "Len of logcov_list:", len(logcov_list) + + # Calculate the boundaries of the bins + minLog = min(logcov_list) + maxLog = max(logcov_list) + boundary = (maxLog - minLog) / binNumber + # print "Min, Max, Boundary, BinNumber: ", minLog, maxLog, boundary, binNumber + + # Split exons to different bins + bin_list = [] + boundary_dict = {} + for logcov in logcov_list: + i = 1 + set_boundary = minLog + boundary + while logcov > set_boundary: + i += 1 + set_boundary = minLog + (boundary * i) + # boundary_dict[i] = set_boundary + bin_list.append(i) + + for i in range(binNumber + 2): + boundary_dict[i] = minLog + (boundary * i) + + # Check the number of exons for each bin + # Merge with the adjacent bins if it is too small + if len(bin_list) < minExons: + newMinExons = int(len(bin_list) / 3) + print( + f"The total amount of exons ({len(bin_list)}) is smaller than the specified minExons ({minExons})\nSetting minExons to one third of the amount of targeted areas ({newMinExons})" + ) + minExons = newMinExons + + print("before" + str(bin_list)) + for z in range(1, binNumber + 1): + element = bin_list.count(z) + print("Bin", z, "has", element, "elements") + if element < 
minExons: + while bin_list.count(z) != 0: + if z != binNumber: + bin_list[bin_list.index(z)] += 1 + else: + bin_list[bin_list.index(z)] -= 1 + + print("after" + str(bin_list)) + # Check the number of exons in the last bin + last_bin_number = sorted(set(bin_list))[-1] + + if len(set(bin_list)) > 1: + second_last_bin = sorted(set(bin_list))[-2] + else: + second_last_bin = last_bin_number + + element = bin_list.count(last_bin_number) + if element < minExons: + while bin_list.count(last_bin_number) != 0: + if last_bin_number != 1: + # bin_list[bin_list.index(last_bin_number)] -= 1 + bin_list[bin_list.index(last_bin_number)] = second_last_bin + print("checking through bins") + print(bin_list) + + final_number_of_bin = len(set(bin_list)) + + # List out the binning boundaries in a txt file + boundary_list = [boundary_dict[x] for x in sorted(set(bin_list))] + i = 1 + + # boundary_file = binFile + str(final_number_of_bin) + ".boundaries.txt" + boundary_file = binFile + str(binNumber) + ".boundaries.txt" + boOut = open(boundary_file, "w") + boOut.write("\t".join([str(0), str(minLog)]) + "\n") + for z in boundary_list: + if i == final_number_of_bin: + z = maxLog + boOut.write("\t".join([str(i), str(z)]) + "\n") + i += 1 + boOut.close() + + # Re-sort the bin numbers - get rid of gaps + curr_z = 1 + bin_number_dict = {} + for z in sorted(set(bin_list)): + bin_number_dict[z] = curr_z + curr_z += 1 + + # Append the bin number to the original file + # binFile = binFile + str(final_number_of_bin)+".txt" + binFile = binFile + str(binNumber) + ".txt" + binOut = open(binFile, "w") + + for exons in src: + exon = exons.split() + id = int(exon[0]) + gene = exon[1] + exonNumber = exon[5] + + target = targetList[int(id) - 1] + if target.id == id: + chr = target.chr + oriStart = target.start + oriEnd = target.end + + else: + print("ERROR...") + + f_bin_number = str(bin_number_dict[bin_list[count]]) + binOut.write( + "\t".join([exons.strip("\n"), f_bin_number, chr, oriStart, oriEnd]) + ) + 
binOut.write("\n") + count += 1 + + binOut.close() + print( + "End of assign.bin.number.py with {} exons in {} bins".format( + count, final_number_of_bin + ) + ) + + return final_number_of_bin diff --git a/scripts/average_count.py b/scripts/average_count.py index dd4cb32..a6a7567 100755 --- a/scripts/average_count.py +++ b/scripts/average_count.py @@ -17,179 +17,202 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . # -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 28 Sep 2011 11:00AM import sys import math + def getAverage(list1): - if len(list1) > 0: - return float(sum(list1))/len(list1) - - return 0.0 + if len(list1) > 0: + return float(sum(list1)) / len(list1) + + return 0.0 + def getStdDev(list1, avg): - var = 0.0 - for x in list1: - var += (avg - x) ** 2 + var = 0.0 + for x in list1: + var += (avg - x) ** 2 - if (len(list1)-1) > 0: - var /= (len(list1)-1) + if (len(list1) - 1) > 0: + var /= len(list1) - 1 + + return math.sqrt(var) - return math.sqrt(var) def getMinMax(list1): - length = len(list1) - if length != 0: - min = list1[0] - max = list1[length-1] - else: - min = 0 - max = 0 + length = len(list1) + if length != 0: + min = list1[0] + max = list1[length - 1] + else: + min = 0 + max = 0 + + return min, max - return min, max def getMedian(list1): - length = len(list1) - if length == 0: - median = 0 - elif length % 2 == 0: - median = (list1[length/2]+list1[(length/2) - 1])/2 - else: - median = list1[length/2] - return median + length = len(list1) + center = int(length / 2) + if length == 0: + median = 0 + elif length % 2 == 0: + median = (list1[center] + list1[center - 1]) / 2 + else: + median = list1[center] + return median + def createDataDict(count, list1, r, offset, id_check, exon_check): - tDict = {} - tDictOri = {} - - while count < len(list1): - t = list1[count].split() - 
tId = t[5] - tExon = t[6] - - if (tId != id_check) or (tExon != exon_check): - return count, tDict, tDictOri - - tStart = int(t[2]) - tEnd = int(t[3]) - tCov = float(t[4]) / r + offset #GeoMean Normalisation - tCovOri = float(t[4]) + offset #without scaling - - #filling dict - while tStart < tEnd: - tDict[tStart] = tCov - tDictOri[tStart] = tCovOri #without scaling - tStart += 1 - - count += 1 - - return count, tDict, tDictOri - -def getFactor (val1, val2): - r = math.sqrt(val1 * val2) - r1 = val1/r - r2 = val2/r - return r1, r2 - -def averageCount(tFile, nFile, averageOut, tReadCount, nReadCount, rd_threshold, minNBases): - tList = file.readlines(open(tFile)) - nList = file.readlines(open(nFile)) - # constant & counter - OFF = 1 - tCount = 0 - nCount = 0 - - # create and open files - output = open(averageOut, "w") - - # Offset and Ratio for Geometric Mean Normalisation - r1, r2 = getFactor(tReadCount, nReadCount) - if rd_threshold > 0: - #OFF = 0 - OFF = 0.5 - - #big loop - while (nCount < len(nList)): - # initialisation, get the chr, geneID, geneName - init = tList[tCount].split() - initial = init[5] - _exon = init[6] - chr = init[1] - gene = init[0] - _start = int(init[2]) - - # check if t-gene and n-gene refer to the same gene - check_init = nList[nCount].split() - if check_init[5] != initial or check_init[6] != _exon: - print "Initial: ", initial - print "Check_Init.id: ", check_init[5] - print "_Exon: ", _exon - print "Check_Init.exon: ", check_init[6] - print "Error. Comparing different Gene" - sys.exit(1) - - # create data dictionary for tumour and normal data (per each regions/ exon) - tCount, tDict, tDictOri = createDataDict(tCount, tList, r1, OFF, initial, _exon) - nCount, nDict, nDictOri = createDataDict(nCount, nList, r2, OFF, initial, _exon) - # check number of bases in the both gene dict - if len(nDict) != len(tDict): - print "N:", len(nDict) - print "T:", len(tDict) - print "Error. 
Different length of dict" - sys.exit(1) - - # compare coverage - count = _start - _max = max(nDict.keys()) - ratioList = [] - tumourList = [] - normalList = [] - tumourOriList = [] - normalOriList = [] - while count <= _max: - # get ratio - if (nDict[count] < rd_threshold) and (tDict[count] < rd_threshold): - ratio = 0.0 - else: - if tDict[count] == 0: - tDict[count] = 0.5 - - ratio = math.log((float(tDict[count]) / nDict[count]),2) - tumourList.append(tDict[count]) - tumourOriList.append(tDictOri[count]) - normalList.append(nDict[count]) - normalOriList.append(nDictOri[count]) - ratioList.append(ratio) - - count += 1 - - ratioLen = len(ratioList) - - # get average - avg = getAverage(ratioList) - sd = getStdDev(ratioList, avg) - tumourAvg= str(round(getAverage(tumourList),3)) - normalAvg= str(round(getAverage(normalList),3)) - tumourOriAvg = str(round(getAverage(tumourOriList),3)) - normalOriAvg = str(round(getAverage(normalOriList),3)) - - # get median - ratioList.sort() - min_logratio, max_logratio = getMinMax(ratioList) - median = getMedian(ratioList) - - # write output - if ratioLen >= minNBases: - output.write(initial + "\t" + gene + "\t" + str(ratioLen) + "\t") - output.write(str(round(avg,3))+ "\t"+ str(count)+ "\t" + _exon + "\t") - output.write(str(round(sd ,3))+ "\t"+ tumourAvg + "\t" + normalAvg +"\t") - output.write(tumourOriAvg + "\t" + normalOriAvg + "\t") - output.write(str(round(median,3)) + "\t" + str(round(min_logratio,3)) + "\t") - output.write(str(round(max_logratio,3)) + "\n") - - output.close() - - #print "End of averageCount.py with the last target = '%s'" %(initial) + tDict = {} + tDictOri = {} + + while count < len(list1): + t = list1[count].split() + tId = t[5] + tExon = t[6] + + if (tId != id_check) or (tExon != exon_check): + return count, tDict, tDictOri + + tStart = int(t[2]) + tEnd = int(t[3]) + tCov = float(t[4]) / r + offset # GeoMean Normalisation + tCovOri = float(t[4]) + offset # without scaling + + # filling dict + while tStart 
< tEnd: + tDict[tStart] = tCov + tDictOri[tStart] = tCovOri # without scaling + tStart += 1 + + count += 1 + + return count, tDict, tDictOri + + +def getFactor(val1, val2): + r = math.sqrt(val1 * val2) + r1 = val1 / r + r2 = val2 / r + return r1, r2 + + +def averageCount( + tFile, nFile, averageOut, tReadCount, nReadCount, rd_threshold, minNBases +): + tFH = open(tFile) + tList = tFH.readlines() + + nFH = open(nFile) + nList = nFH.readlines() + # constant & counter + OFF = 1 + tCount = 0 + nCount = 0 + + # create and open files + output = open(averageOut, "w") + + # Offset and Ratio for Geometric Mean Normalisation + r1, r2 = getFactor(tReadCount, nReadCount) + if rd_threshold > 0: + # OFF = 0 + OFF = 0.5 + + # big loop + while nCount < len(nList): + # initialisation, get the chr, geneID, geneName + init = tList[tCount].split() + initial = init[5] + _exon = init[6] + chr = init[1] + gene = init[0] + _start = int(init[2]) + + # print("initial: " + initial) + # print("Check_Init.id: ", check_init[5]) + # print("_Exon: ", _exon) + # print("Check_Init.exon: ", check_init[6]) + + # check if t-gene and n-gene refer to the same gene + check_init = nList[nCount].split() + if check_init[5] != initial or check_init[6] != _exon: + print("Initial: ", initial) + print("Check_Init.id: ", check_init[5]) + print("_Exon: ", _exon) + print("Check_Init.exon: ", check_init[6]) + print("Error. Comparing different Gene") + sys.exit(1) + + # create data dictionary for tumour and normal data (per each regions/ exon) + tCount, tDict, tDictOri = createDataDict(tCount, tList, r1, OFF, initial, _exon) + nCount, nDict, nDictOri = createDataDict(nCount, nList, r2, OFF, initial, _exon) + # check number of bases in the both gene dict + if len(nDict) != len(tDict): + print("N:", len(nDict)) + print("T:", len(tDict)) + print("Error. 
Different length of dict") + sys.exit(1) + + # compare coverage + count = _start + _max = max(nDict.keys()) + ratioList = [] + tumourList = [] + normalList = [] + tumourOriList = [] + normalOriList = [] + while count <= _max: + # get ratio + if (nDict[count] < rd_threshold) and (tDict[count] < rd_threshold): + ratio = 0.0 + else: + if tDict[count] == 0: + tDict[count] = 0.5 + + ratio = math.log((float(tDict[count]) / nDict[count]), 2) + tumourList.append(tDict[count]) + tumourOriList.append(tDictOri[count]) + normalList.append(nDict[count]) + normalOriList.append(nDictOri[count]) + ratioList.append(ratio) + + count += 1 + + assert len(ratioList) == len(tumourList), "List do not have the same length" + + ratioLen = len(ratioList) + + # get average + avg = getAverage(ratioList) + sd = getStdDev(ratioList, avg) + tumourAvg = str(round(getAverage(tumourList), 3)) + normalAvg = str(round(getAverage(normalList), 3)) + tumourOriAvg = str(round(getAverage(tumourOriList), 3)) + normalOriAvg = str(round(getAverage(normalOriList), 3)) + + # get median + ratioList.sort() + min_logratio, max_logratio = getMinMax(ratioList) + print(ratioList) + median = getMedian(ratioList) + + # write output + if ratioLen >= minNBases: + output.write(initial + "\t" + gene + "\t" + str(ratioLen) + "\t") + output.write(str(round(avg, 3)) + "\t" + str(count) + "\t" + _exon + "\t") + output.write(str(round(sd, 3)) + "\t" + tumourAvg + "\t" + normalAvg + "\t") + output.write(tumourOriAvg + "\t" + normalOriAvg + "\t") + output.write( + str(round(median, 3)) + "\t" + str(round(min_logratio, 3)) + "\t" + ) + output.write(str(round(max_logratio, 3)) + "\n") + + output.close() + + # print "End of averageCount.py with the last target = '%s'" %(initial) diff --git a/scripts/cn_analysis.v3.R b/scripts/cn_analysis.v3.R index 480024f..9e65f61 100755 --- a/scripts/cn_analysis.v3.R +++ b/scripts/cn_analysis.v3.R @@ -17,7 +17,7 @@ # You should have received a copy of the GNU General Public License # along with 
CONTRA. If not, see . # -# +# #-----------------------------------------------------------------------# # Last Updated : 31 Oct 2011 17:00PM @@ -49,7 +49,7 @@ pdf.out.f = paste(outf, "/plot/", sample.name, "densityplot.", bins, "bins.pdf", # cnAverageFile = paste("bin", bins, ".txt", sep="") cnAverageFile = paste(outf,"/buf/bin",bins,".txt",sep="") boundariesFile = paste(outf,"/buf/bin",bins,".boundaries.txt",sep="") -print (cnAverageFile) +#print (cnAverageFile) cn.average = read.delim(cnAverageFile, as.is=F, header=F) cn.boundary= read.delim(boundariesFile,as.is=F, header=F) @@ -57,7 +57,7 @@ cn.boundary= read.delim(boundariesFile,as.is=F, header=F) cn.average.aboveTs = cn.average[cn.average$V3>min.bases,] cn.average.list = as.matrix(cn.average.aboveTs$V4) -# Get the mean and sd for each bins +# Get the mean and sd for each bins cn.average.mean = c() cn.average.sd = c() cn.average.log= c() @@ -136,7 +136,7 @@ if (bins > 1 ){ } -# Put the data's details into matrices +# Put the data's details into matrices ids = as.matrix(cn.average.aboveTs$V1) exons = as.matrix(cn.average.aboveTs$V6) exons.pos = as.matrix(cn.average.aboveTs$V5) @@ -211,7 +211,7 @@ fit.sd.fn <- function(x, fit.a, fit.b){ result = 2 ^ (fit.mean.fn(x, fit.a, fit.b)) return (result) } - + # Get the P Values, called the gain/loss # with average and sd from each bins pVal.list = c() @@ -225,7 +225,7 @@ for (i in 1:nrow(cn.average.list)){ logcov = logcov.mean[i] exon.bin = Bin[i] - if (length(logratios.sd) > 1){ + if (length(logratios.sd) > 1){ #pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), fit.sd.fn(logcov, fit.sd.a, fit.sd.b)) pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), sd.fn(logcov)) } else { @@ -259,11 +259,21 @@ outdf$P.Value[wh.to.excl]=NA outdf$Adjusted.P.Value[wh.to.excl]=NA +# rounding numbers to 3 digits +outdf$Mean.of.LogRatio <- round(outdf$Mean.of.LogRatio, digits=3) +outdf$Adjusted.Mean.of.LogRatio <- round(outdf$Adjusted.Mean.of.LogRatio, 
digits=3) +outdf$SD.of.LogRatio <- round(outdf$SD.of.LogRatio, digits=3) +outdf$Median.of.LogRatio <- round(outdf$Median.of.LogRatio, digits=3) + +#I dont know if it is smart to round the pValue... but maybe like 6 digits instead? +outdf$P.Value <- round(outdf$P.Value, digits=6) +outdf$Adjusted.P.Value <- round(outdf$Adjusted.P.Value, digits=6) + write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T) #Plotting SD #a.sd.fn = rep(fit.sd.a, length(logratios.sd.ori)) -#b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori)) +#b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori)) #sd.after.fit = fit.sd.fn(logcov.bins.mean.ori, fit.sd.a, fit.sd.b) #sd.out.f = paste(outf, "/plot/", sample.name, "sd.data_fit.", bins, "bins.txt", sep="") #sd.outdf = data.frame(SD.Before.Fit = logratios.sd.ori, Log.Coverage = logcov.bins.mean.ori, SD.After.Fit = sd.after.fit, a.for.fitting=a.sd.fn, b.for.fitting=b.sd.fn) @@ -271,8 +281,5 @@ write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T) #End of the script -print ("End of cn_analysis.R") -print (i) - - - +message("End of cn_analysis.R") +#print (i) diff --git a/scripts/cn_apply_threshold.py b/scripts/cn_apply_threshold.py index ee7afd0..3961c08 100755 --- a/scripts/cn_apply_threshold.py +++ b/scripts/cn_apply_threshold.py @@ -17,98 +17,113 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 12 Oct 2011 11:00AM + def applyThreshold(outputName, bufTable, threshold, maxGap): - srcFile = outputName + ".txt" - outFile = bufTable + ".LargeVariations.txt" - bedFile = bufTable + ".BED" - fFile = outputName + ".DetailsFILTERED.txt" - ts = float(threshold) + srcFile = outputName + ".txt" + outFile = bufTable + ".LargeVariations.txt" + bedFile = bufTable + ".BED" + fFile = outputName + ".DetailsFILTERED.txt" + ts = float(threshold) + + # Read and open files + sourceFH = open(srcFile) + srcTable = sourceFH.readlines() + outTable = open(outFile, "w") + bedOut = open(bedFile, "w") + filteredTable = open(fFile, "w") - # Read and open files - srcTable = file.readlines(open(srcFile)) - outTable = open(outFile, "w") - bedOut = open(bedFile, "w") - filteredTable = open(fFile, "w") + # header + outTable.write("Chr \tStartCoordinate \tEndCoordinate \tGenes \tGain.Loss \n") + filteredTable.write(srcTable[0]) + prevChr = "" + prevStatus = "" + prevEnd = -1 + genes = [] + chrList = [] - #header - outTable.write("Chr \tStartCoordinate \tEndCoordinate \tGenes \tGain.Loss \n") - filteredTable.write(srcTable[0]) + for exons in srcTable: + exon = exons.split() + try: + adjPVal = float(exon[12]) + except: + continue - prevChr = '' - prevStatus = '' - prevEnd = -1 - genes = [] - chrList = [] + # print (str(adjPVal)+" <= "+str(ts)) + if adjPVal <= ts: + chr = exon[3] + gene = exon[2] + status = exon[13] + start = exon[4] + end = exon[5] - for exons in srcTable: - exon = exons.split() - try: - adjPVal = float(exon[12]) - except: - continue + # For first row + if prevEnd == -1: + gap = 0 + else: + gap = int(prevEnd) - int(start) - #print (str(adjPVal)+" <= "+str(ts)) - if adjPVal <= ts: - chr = exon[3] - gene = exon[2] - status = exon[13] - start = exon[4] - end = exon[5] + # Write Filtered Table + 
filteredTable.write(exons) - # For first row - if prevEnd == -1: - gap = 0 - else: - gap = int(prevEnd) - int(start) + # Write Bed File + # bedOut.write(chr.strip("chr") +"\t" +start +"\t"+ end+"\t"+ + # chr.strip("chr")+":"+start+"-"+end+":"+str(adjPVal)+"\n") + bedOut.write( + chr + + "\t" + + start + + "\t" + + end + + "\t" + + chr + + ":" + + start + + "-" + + end + + ":" + + str(adjPVal) + + "\n" + ) - # Write Filtered Table - filteredTable.write(exons) + if prevChr == "" and prevStatus == "": + if chr not in chrList: + print(chr) + chrList.append(chr) + elif (chr == prevChr) and (status == prevStatus) and (gap < maxGap): + start = prevStart + else: + outTable.write(prevChr + "\t" + prevStart + "\t" + prevEnd + "\t") + for gsym in genes: + outTable.write(gsym + ", ") + outTable.write("\t" + prevStatus + "\n") + genes = [] - # Write Bed File - #bedOut.write(chr.strip("chr") +"\t" +start +"\t"+ end+"\t"+ - # chr.strip("chr")+":"+start+"-"+end+":"+str(adjPVal)+"\n") - bedOut.write(chr +"\t" +start +"\t"+ end+"\t"+ - chr+":"+start+"-"+end+":"+str(adjPVal)+"\n") - - if prevChr == '' and prevStatus == '': - if chr not in chrList: - print chr - chrList.append(chr) - elif (chr == prevChr) and (status == prevStatus) and (gap < maxGap): - start = prevStart - else: - outTable.write(prevChr +"\t" +prevStart +"\t" +prevEnd + "\t") - for gsym in genes: - outTable.write(gsym + ", ") - outTable.write("\t" + prevStatus + "\n") - genes=[] - - if gene not in genes: - genes.append(gene) - prevChr = chr - prevStatus = status - prevStart = start - prevEnd = end - elif len(genes) > 0: - outTable.write(prevChr +"\t" +prevStart +"\t" +prevEnd + "\t") - for gsym in genes: - outTable.write(gsym + ", " ) - outTable.write("\t" + prevStatus + "\n") - prevChr = '' - prevStatus = '' - genes = [] + if gene not in genes: + genes.append(gene) + prevChr = chr + prevStatus = status + prevStart = start + prevEnd = end + elif len(genes) > 0: + outTable.write(prevChr + "\t" + prevStart + "\t" + 
prevEnd + "\t") + for gsym in genes: + outTable.write(gsym + ", ") + outTable.write("\t" + prevStatus + "\n") + prevChr = "" + prevStatus = "" + genes = [] - if len(genes) > 0: - outTable.write(prevChr +"\t" +prevStart +"\t" +prevEnd + "\t") - for gsym in genes: - outTable.write(gsym + ", ") - outTable.write("\t" + prevStatus + "\n") + if len(genes) > 0: + outTable.write(prevChr + "\t" + prevStart + "\t" + prevEnd + "\t") + for gsym in genes: + outTable.write(gsym + ", ") + outTable.write("\t" + prevStatus + "\n") - filteredTable.close() - bedOut.close() - outTable.close() + filteredTable.close() + bedOut.close() + outTable.close() diff --git a/scripts/convert_gene_coordinate.py b/scripts/convert_gene_coordinate.py index 08a2ec7..6f20858 100755 --- a/scripts/convert_gene_coordinate.py +++ b/scripts/convert_gene_coordinate.py @@ -17,219 +17,258 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . # -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 16 May 2013 11:00AM import sys import os -def outputwrite(output, gene,chr,a,b,c,id,n,sOri, eOri): - id = str(id) - n = str(n) - output.write(gene+"\t"+chr+"\t"+a+"\t"+b+"\t"+c+"\t"+id+"\t"+n+"\t"+sOri+"\t"+eOri+"\n") - -def outputwrite2(output2, chr,c,sOri, eOri): - output2.write(chr+"\t"+sOri+"\t"+eOri+"\t"+c+"\n") - - -def convertGeneCoordinate(targetList, bufLocFolder): - inputfile2 = bufLocFolder + "chr/" - outputfile = bufLocFolder + "geneRefCoverage.txt" - outputfile2= bufLocFolder + "geneRefCoverage2.txt" - - output= open(outputfile,"w") - output2 = open(outputfile2 , "w") - - rn = 1 - prevchr = "" - for target in targetList: - chr = target.chr - starts = target.oriStart.split(",") - ends = target.oriEnd.split(",") - - if ((len(chr) > 5) or (chr[len(chr)-1] == "M")): - continue - - if (prevchr != chr): - print chr #progress checking - 
prevchr = chr - t = 0 - ff=inputfile2+chr+".txt" - if not os.path.isfile(ff): - sys.stderr.write("WARNING: %s NOT FOUND (should be generated by splitByChromosome3). \ -Likely due to no reads found in a targeted chromosome (%s), in turn likely due to currupted bam files\n" % - (ff,chr)) - #sys.exit(1) - covFile=[] - else: - covFile = file.readlines(open(inputfile2+chr+".txt","r")) - - for n in range(0,target.numberExon): - if t >= len(covFile): - break - cov = covFile[t].split() - while ((int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2]))): - if (int(cov[1]) > int(starts[n])): - t-=1 - else: - t+=1 - cov = covFile[t].split() - - while ((int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2]))): - # print output # - if (rn == 1): - prev = target.id - nID = target.id - if (nID != prev): - rn = 1 - ref1 = str(rn) - ref2 = str(int(cov[2]) - int(starts[n]) + rn) - outputwrite(output, target.gene,chr,ref1,ref2,cov[3],target.id,n,starts[n],cov[2]) - - outputwrite2(output2, chr,cov[3],starts[n],cov[2]) - - - rn = int(ref2) - prev = nID - # -- # - - # get to the next line of inputfile# - t+= 1 - cov = covFile[t].split() - starts[n] = cov[1] - - #print output # - if (t == 0) and (t >= len(covFile)): - continue - - if (rn == 1): - prev = target.id - nID = target.id - if (nID != prev): - rn = 1 - ref1 = str(rn) - ref2 = str(int(ends[n]) - int(starts[n]) + rn) - outputwrite(output, target.gene, chr, ref1, ref2, cov[3], target.id, n, starts[n], ends[n]) - - outputwrite2(output2, chr, cov[3], starts[n], ends[n]) - - - rn = int(ref2) - prev = nID - # -- # - output.close() - output2.close() - -def convertGeneCoordinate2(targetList, bufLocFolder): - inputfile2 = bufLocFolder + "chr/" - outputfile = bufLocFolder + "geneRefCoverage.txt" - outputfile_avg = bufLocFolder + "geneRefCoverage_targetAverage.txt" - - output= open(outputfile,"w") - output_avg = open(outputfile_avg,"w") - - rn = 1 - prevchr = "" - for target in targetList: - chr = target.chr - starts = 
target.oriStart.split(",") - ends = target.oriEnd.split(",") - target_ttl_readdepth = 0 - starts_leftmost = starts[0] - - - if ((len(chr) > 5) or (chr[len(chr)-1] == "M")): - continue - - if (prevchr != chr): - print chr #progress checking - prevchr = chr - t = 0 - ff=inputfile2+chr+".txt" - if not os.path.isfile(ff): - sys.stderr.write("WARNING: %s NOT FOUND (should be generated by splitByChromosome3). \ -Likely due to no reads found in a targeted chromosome (%s), in turn likely due to currupted bam files\n" % - (ff,chr)) - #sys.exit(1) - covFile=[] - else: - covFile = file.readlines(open(inputfile2+chr+".txt","r")) - - for n in range(0,target.numberExon): - if t >= len(covFile): - break - cov = covFile[t].split() - while ((int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2]))): - if (int(cov[1]) > int(starts[n])): t-=1 - else: - t+=1 - cov = covFile[t].split() - - while ((int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2]))): - # print output # - if (rn == 1): - prev = target.id - nID = target.id - if (nID != prev): - rn = 1 - ref1 = str(rn) - ref2 = str(int(cov[2]) - int(starts[n]) + rn) - outputwrite2(output, chr,cov[3],starts[n],cov[2]) - tmprange=int(cov[2])-int(starts[n])+1 - target_ttl_readdepth+=int(cov[3])*tmprange - #target_length+=tmprange - - rn = int(ref2) - prev = nID - # -- # - - # get to the next line of inputfile# - t+= 1 - cov = covFile[t].split() - starts[n] = cov[1] - - #print output # - if (t == 0) and (t >= len(covFile)): - continue - - if (rn == 1): - prev = target.id +def outputwrite(output, gene, chr, a, b, c, id, n, sOri, eOri): + id = str(id) + n = str(n) + output.write( + gene + + "\t" + + chr + + "\t" + + a + + "\t" + + b + + "\t" + + c + + "\t" + + id + + "\t" + + n + + "\t" + + sOri + + "\t" + + eOri + + "\n" + ) + + +def outputwrite2(output2, chr, c, sOri, eOri): + output2.write(chr + "\t" + sOri + "\t" + eOri + "\t" + c + "\n") + + +def convertGeneCoordinate(targetList, bufLocFolder): + inputfile2 = bufLocFolder + 
"chr/" + outputfile = bufLocFolder + "geneRefCoverage.txt" + outputfile2 = bufLocFolder + "geneRefCoverage2.txt" + + output = open(outputfile, "w") + output2 = open(outputfile2, "w") + + rn = 1 + prevchr = "" + for target in targetList: + chr = target.chr + starts = target.oriStart.split(",") + ends = target.oriEnd.split(",") + + if (len(chr) > 5) or (chr[len(chr) - 1] == "M"): + continue + + if prevchr != chr: + print(chr) # progress checking + prevchr = chr + t = 0 + ff = inputfile2 + chr + ".txt" + if not os.path.isfile(ff): + sys.stderr.write( + "WARNING: {} NOT FOUND (should be generated by splitByChromosome3). \ +Likely due to no reads found in a targeted chromosome ({}), in turn likely due to currupted bam files\n".format( + ff, chr + ) + ) + # sys.exit(1) + covFile = [] + else: + ffh = open(ff, "r") + covFile = ffh.readlines() + + for n in range(0, target.numberExon): + if t >= len(covFile): + break + cov = covFile[t].split() + while (int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2])): + if int(cov[1]) > int(starts[n]): + t -= 1 + else: + t += 1 + cov = covFile[t].split() + + while (int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2])): + # print output # + if rn == 1: + prev = target.id nID = target.id - if (nID != prev): - rn = 1 + if nID != prev: + rn = 1 + ref1 = str(rn) - ref2 = str(int(ends[n]) - int(starts[n]) + rn) - outputwrite2(output, chr, cov[3], starts[n], ends[n]) - tmprange=int(ends[n])-int(starts[n])+1 - target_ttl_readdepth+=int(cov[3])*tmprange - #target_length+=tmprange - target_length = int(ends[n])-int(starts_leftmost)+1 - output_avg.write("\t".join([chr,starts_leftmost,ends[n],str(target_ttl_readdepth/target_length),str(target_length)])+"\n") + ref2 = str(int(cov[2]) - int(starts[n]) + rn) + outputwrite( + output, + target.gene, + chr, + ref1, + ref2, + cov[3], + target.id, + n, + starts[n], + cov[2], + ) + + outputwrite2(output2, chr, cov[3], starts[n], cov[2]) rn = int(ref2) prev = nID # -- # - output.close() - 
output_avg.close() - - - - - - - - - - - - - - - + # get to the next line of inputfile# + t += 1 + cov = covFile[t].split() + starts[n] = cov[1] + + # print output # + if (t == 0) and (t >= len(covFile)): + continue + + if rn == 1: + prev = target.id + nID = target.id + if nID != prev: + rn = 1 + ref1 = str(rn) + ref2 = str(int(ends[n]) - int(starts[n]) + rn) + outputwrite( + output, + target.gene, + chr, + ref1, + ref2, + cov[3], + target.id, + n, + starts[n], + ends[n], + ) + + outputwrite2(output2, chr, cov[3], starts[n], ends[n]) + + rn = int(ref2) + prev = nID + # -- # + output.close() + output2.close() +def convertGeneCoordinate2(targetList, bufLocFolder): + inputfile2 = bufLocFolder + "chr/" + outputfile = bufLocFolder + "geneRefCoverage.txt" + outputfile_avg = bufLocFolder + "geneRefCoverage_targetAverage.txt" + + output = open(outputfile, "w") + output_avg = open(outputfile_avg, "w") + + rn = 1 + prevchr = "" + for target in targetList: + chr = target.chr + starts = target.oriStart.split(",") + ends = target.oriEnd.split(",") + target_ttl_readdepth = 0 + starts_leftmost = starts[0] + + if (len(chr) > 5) or (chr[len(chr) - 1] == "M"): + continue + + if prevchr != chr: + print(chr) # progress checking + prevchr = chr + t = 0 + ff = inputfile2 + chr + ".txt" + if not os.path.isfile(ff): + sys.stderr.write( + "WARNING: %s NOT FOUND (should be generated by splitByChromosome3). 
\ + Likely due to no reads found in a targeted chromosome (%s), in turn likely due to currupted bam files\n" + % (ff, chr) + ) + # sys.exit(1) + covFile = [] + else: + covFile = file.readlines(open(inputfile2 + chr + ".txt", "r")) + + for n in range(0, target.numberExon): + if t >= len(covFile): + break + cov = covFile[t].split() + while (int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2])): + if int(cov[1]) > int(starts[n]): + t -= 1 + else: + t += 1 + cov = covFile[t].split() + + while (int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2])): + # print output # + if rn == 1: + prev = target.id + nID = target.id + if nID != prev: + rn = 1 + ref1 = str(rn) + ref2 = str(int(cov[2]) - int(starts[n]) + rn) + outputwrite2(output, chr, cov[3], starts[n], cov[2]) + tmprange = int(cov[2]) - int(starts[n]) + 1 + target_ttl_readdepth += int(cov[3]) * tmprange + # target_length+=tmprange + rn = int(ref2) + prev = nID + # -- # + # get to the next line of inputfile# + t += 1 + cov = covFile[t].split() + starts[n] = cov[1] + + # print output # + if (t == 0) and (t >= len(covFile)): + continue + + if rn == 1: + prev = target.id + nID = target.id + if nID != prev: + rn = 1 + ref1 = str(rn) + ref2 = str(int(ends[n]) - int(starts[n]) + rn) + outputwrite2(output, chr, cov[3], starts[n], ends[n]) + tmprange = int(ends[n]) - int(starts[n]) + 1 + target_ttl_readdepth += int(cov[3]) * tmprange + # target_length+=tmprange + target_length = int(ends[n]) - int(starts_leftmost) + 1 + output_avg.write( + "\t".join( + [ + chr, + starts_leftmost, + ends[n], + str(target_ttl_readdepth / target_length), + str(target_length), + ] + ) + + "\n" + ) + + rn = int(ref2) + prev = nID + # -- # + output.close() + output_avg.close() diff --git a/scripts/convert_targeted_regions.py b/scripts/convert_targeted_regions.py index 209a18c..830f011 100755 --- a/scripts/convert_targeted_regions.py +++ b/scripts/convert_targeted_regions.py @@ -17,55 +17,56 @@ # You should have received a copy of the 
GNU General Public License # along with CONTRA. If not, see . # -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 28 Mar 2011 11:00AM + class Target: - """ + """ Class for target regions """ - - population = 0 - def __init__(self): - self.id = 0 - self.gene = "unknown" - self.chr = "chr1" - self.start = 0 - self.end = 0 - self.numberExon = 0 - self.oriStart = 0 - self.oriEnd = 0 + population = 0 + + def __init__(self): + self.id = 0 + self.gene = "unknown" + self.chr = "chr1" + self.start = 0 + self.end = 0 + self.numberExon = 0 + self.oriStart = 0 + self.oriEnd = 0 -def convertTarget(target): - targets = open(target) - targetList = [] +def convertTarget(target): + targets = open(target) - count = 0 - for region in targets: - region = region.split() - chr = "chr" + region[0].strip("chr") - start = region[1] - end = region[2] - try: - gene = region[3] - except: - gene = "unknown" - count += 1 + targetList = [] - aTarget = Target() - aTarget.id = count - aTarget.gene = gene - aTarget.chr = chr - aTarget.start = start - aTarget.end = end - aTarget.numberExon = 1 - aTarget.oriStart = start - aTarget.oriEnd = end + count = 0 + for region in targets: + region = region.split() + chr = "chr" + region[0].strip("chr") + start = region[1] + end = region[2] + try: + gene = region[3] + except: + gene = "unknown" + count += 1 - targetList.append(aTarget) + aTarget = Target() + aTarget.id = count + aTarget.gene = gene + aTarget.chr = chr + aTarget.start = start + aTarget.end = end + aTarget.numberExon = 1 + aTarget.oriStart = start + aTarget.oriEnd = end + targetList.append(aTarget) - return targetList + return targetList diff --git a/scripts/count_libsize.py b/scripts/count_libsize.py index 618ef38..fa6363e 100755 --- a/scripts/count_libsize.py +++ b/scripts/count_libsize.py @@ -17,22 +17,23 @@ # You should have received a copy of the GNU General Public 
License # along with CONTRA. If not, see . # -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 05 October 2011 16:43PM + def get_libsize(bedgraph_file): - bedgraph = open(bedgraph_file) - libsize = 0 - for line in bedgraph: - line = line.split() - chr = line[0] - start = int(line[1]) - end = int(line[2]) - cov = float(line[3]) - #cov = int(line[3]) - - libsize += (end-start)*cov + bedgraph = open(bedgraph_file) + libsize = 0 + for line in bedgraph: + line = line.split() + chr = line[0] + start = int(line[1]) + end = int(line[2]) + cov = float(line[3]) + # cov = int(line[3]) + + libsize += (end - start) * cov - bedgraph.close() - return int(libsize) + bedgraph.close() + return int(libsize) diff --git a/scripts/get_chr_length.py b/scripts/get_chr_length.py index 35923e3..2ceb204 100755 --- a/scripts/get_chr_length.py +++ b/scripts/get_chr_length.py @@ -17,27 +17,26 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 05 October 2011 16:43PM import subprocess, shlex -def get_genome(srcFile, genomeOut): - genome = open(genomeOut, "w") - args = shlex.split("samtools view -H %s" %(srcFile)) - raw_header = subprocess.Popen(args, stdout = subprocess.PIPE).communicate()[0] - headers = raw_header.split("\n") +def get_genome(srcFile, genomeOut): + genome = open(genomeOut, "w") - for header in headers: - header = header.split("\t") - if header[0][1:] != "SQ": - continue + args = shlex.split("samtools view -H {}".format(srcFile)) + raw_header = subprocess.Popen(args, stdout=subprocess.PIPE).communicate()[0] + headers = str(raw_header).split("\n") - genome.write(header[1].strip("SN:") + "\t" + header[2].strip("LN:") + "\n") + for header in headers: + header = header.split("\t") + if header[0][1:] != "SQ": + continue - genome.close() + genome.write(header[1].strip("SN:") + "\t" + header[2].strip("LN:") + "\n") - + genome.close() diff --git a/scripts/split_chromosome.py b/scripts/split_chromosome.py index 78d9198..176043c 100755 --- a/scripts/split_chromosome.py +++ b/scripts/split_chromosome.py @@ -17,89 +17,90 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 03 Sep 2011 15:00PM import os + def splitByChromosome(destFolder): - try: - os.mkdir(destFolder + "chr/") - except: - print "folder exist" + try: + os.mkdir(destFolder + "chr/") + except: + print("folder exist") + + inputfile = destFolder + "sample.BEDGRAPH" + outputfile = destFolder + "chr/chr1.txt" + file = open(inputfile, "r") + output = open(outputfile, "w") + check = "1" - inputfile = destFolder + "sample.BEDGRAPH" - outputfile = destFolder + "chr/chr1.txt" - file = open(inputfile,"r") - output = open(outputfile,"w") - check = "1" + for row in file: + if row[0] == "#": + continue - for row in file: - if row[0] == '#': - continue + cols = row.split() + chr = cols[0].strip("chr") + if chr != check: + output.close() + check = chr + output = open(destFolder + "chr/chr" + check + ".txt", "w") + output.write(row) - cols = row.split() - chr = cols[0].strip("chr") - if (chr != check): - output.close() - check = chr - output = open(destFolder+ "chr/chr"+check+".txt","w") - output.write(row) + output.close() - output.close() -#JLMod +# JLMod def splitByChromosome3(infile): - destFolder = os.path.dirname(infile)+"/" - - try: - os.mkdir(destFolder + "chr/") - except: - print "folder exist" - - #inputfile = destFolder + "sample.BEDGRAPH" - inputfile=infile - outputfile = destFolder + "chr/chr1.txt" - file = open(inputfile,"r") - output = open(outputfile,"w") - check = "1" - - for row in file: - cols = row.split() - chr = cols[0].strip("chr") - if (chr != check): - output.close() - check = chr - output = open(destFolder+ "chr/chr"+check+".txt","w") - output.write(row) - - output.close() + destFolder = os.path.dirname(infile) + "/" -def splitByChromosome2(folder): + try: + os.mkdir(destFolder + "chr/") + except: + print("folder exist") - try: - os.mkdir(folder + "target/") - except: - print "folder 
exist" + # inputfile = destFolder + "sample.BEDGRAPH" + inputfile = infile + outputfile = destFolder + "chr/chr1.txt" + file = open(inputfile, "r") + output = open(outputfile, "w") + check = "1" - inputfile = folder + "geneRefCoverage.txt" - outputfile = folder + "target/chr1.txt" - file = open(inputfile,"r") - output = open(outputfile,"w") - check = "1" + for row in file: + cols = row.split() + chr = cols[0].strip("chr") + if chr != check: + output.close() + check = chr + output = open(destFolder + "chr/chr" + check + ".txt", "w") + output.write(row) - for row in file: - cols = row.split() - chr = cols[0].strip("chr") - if (chr != check): - output.close() - check = chr - output = open(folder+ "target/chr"+check+".txt","w") - output.write(row) + output.close() - output.close() +def splitByChromosome2(folder): + try: + os.mkdir(folder + "target/") + except: + print("folder exist") + + inputfile = folder + "geneRefCoverage.txt" + outputfile = folder + "target/chr1.txt" + file = open(inputfile, "r") + output = open(outputfile, "w") + check = "1" + + for row in file: + cols = row.split() + chr = cols[0].strip("chr") + if chr != check: + output.close() + check = chr + output = open(folder + "target/chr" + check + ".txt", "w") + output.write(row) + + output.close() diff --git a/scripts/target_breakdown.py b/scripts/target_breakdown.py index c86e373..82b3805 100755 --- a/scripts/target_breakdown.py +++ b/scripts/target_breakdown.py @@ -17,40 +17,42 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 05 October 2011 16:43PM + def target_breakdown(filename, maxRegionSize, targetRegionSize, out_fname): - from math import ceil - - a = open(filename) - output = open(out_fname, "w") - - for x in a: - x = x.split() - chr = x[0] - start = int(x[1]) - end = int(x[2]) - info = "" - if (len(x) > 3): - info = x[3] - - regionSize = end-start - - if regionSize > maxRegionSize: - seg = ceil(regionSize/float(targetRegionSize)) - limit = ceil(regionSize/float(seg)) - - s_end = start - for i in range(int(seg)): - s_start = s_end - s_end = int(start + (limit*(i+1))) - if s_end > end: - s_end = end - output.write("\t".join([chr, str(s_start), str(s_end), info+"(SPLIT)"])+"\n") - else: - output.write("\t".join([chr, str(start), str(end), info])+"\n") - - output.close() - + from math import ceil + + a = open(filename) + output = open(out_fname, "w") + + for x in a: + x = x.split() + chr = x[0] + start = int(x[1]) + end = int(x[2]) + info = "" + if len(x) > 3: + info = x[3] + + regionSize = end - start + + if regionSize > maxRegionSize: + seg = ceil(regionSize / float(targetRegionSize)) + limit = ceil(regionSize / float(seg)) + + s_end = start + for i in range(int(seg)): + s_start = s_end + s_end = int(start + (limit * (i + 1))) + if s_end > end: + s_end = end + output.write( + "\t".join([chr, str(s_start), str(s_end), info + "(SPLIT)"]) + "\n" + ) + else: + output.write("\t".join([chr, str(start), str(end), info]) + "\n") + + output.close() diff --git a/scripts/vcf_out.py b/scripts/vcf_out.py index c0e2706..2ae3fde 100755 --- a/scripts/vcf_out.py +++ b/scripts/vcf_out.py @@ -17,48 +17,50 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 09 Apr 2011 11:00AM -def vcf_out(inF, outF): - import math - f = file.readlines(open(inF)) - vcf = open(outF, "w") - - #header - vcf.write("##fileformat=VCFv4.0\n") - vcf.write("##reference=1000GenomesPilot-NCBI36\n") - vcf.write('##INFO=\n") - region = region.split(":") - chr = region[0] +def vcf_out(inF, outF): + import math - adjPVal = float(region[2]) - if adjPVal <= 0: - adjPVal = 0 - else: - adjPVal = -10 * math.log(adjPVal, 10) - adjPVal = str(round(adjPVal,3)) - region[1] = region[1].split("-") - start = region[1][0] - end = region[1][1] - else: - ref = f[count].strip("\n") - vcf.write(chr +"\t"+ start + "\t" + "." + "\t" + ref + "\t") - vcf.write("" + "\t" + adjPVal + "\t" + "PASS" + "\t") - vcf.write("SVTYPE=CNV;END="+ end + "\n") - count += 1 + f = file.readlines(open(inF)) + vcf = open(outF, "w") - vcf.close() - + # header + vcf.write("##fileformat=VCFv4.0\n") + vcf.write("##reference=1000GenomesPilot-NCBI36\n") + vcf.write( + '##INFO=\n") + region = region.split(":") + chr = region[0] + adjPVal = float(region[2]) + if adjPVal <= 0: + adjPVal = 0 + else: + adjPVal = -10 * math.log(adjPVal, 10) + adjPVal = str(round(adjPVal, 3)) + region[1] = region[1].split("-") + start = region[1][0] + end = region[1][1] + else: + ref = f[count].strip("\n") + vcf.write(chr + "\t" + start + "\t" + "." 
+ "\t" + ref + "\t") + vcf.write("" + "\t" + adjPVal + "\t" + "PASS" + "\t") + vcf.write("SVTYPE=CNV;END=" + end + "\n") + count += 1 + vcf.close() From 5f83f557e97c31cc8e482d80b11e4401c2030515 Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 10:58:25 +1100 Subject: [PATCH 2/9] converted the WGCNV parts to python 3 --- WGCNV-NDE/1_renorm.py | 42 +++++---- WGCNV-NDE/2_process.py | 179 ++++++++++++++++++++---------------- WGCNV-NDE/3_wgsummary.R | 50 +++++----- WGCNV-NDE/4_plot.R | 2 +- WGCNV-NDE/wgcnv_wrapper.py | 83 ++++++++--------- WGCNV-SINGLE/wgcnv.py | 78 ++++++++-------- WGCNV-SINGLE/wgcnv_noBAF.py | 78 ++++++++-------- scripts/cn_analysis.v4.R | 15 ++- scripts/large_region_cbs.R | 27 ++---- 9 files changed, 281 insertions(+), 273 deletions(-) diff --git a/WGCNV-NDE/1_renorm.py b/WGCNV-NDE/1_renorm.py index b225a49..376d28b 100755 --- a/WGCNV-NDE/1_renorm.py +++ b/WGCNV-NDE/1_renorm.py @@ -5,7 +5,8 @@ import os import sys -def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files + +def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files f_CONTRA_out = open(path_CONTRA_out, "r") f_out = open(path_out, "w") @@ -16,23 +17,24 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files headers = lines[0] contra = [] for line in lines[1:]: - contra.append(line.split('\t')) + contra.append(line.split("\t")) # For each line in contra: # 0 = region ID, 6 = Mean.of.LogRatio, 7 = Adjusted.Mean.of.LogRatio, 14 = tumour.rd, 15 = normal.rd, 16 = tumour.rd.ori, 17 = normal.rd.ori - + # 2. 
Go through data, calculating x_T, N_T, x_N, N_N # as well as creating list of genes to remove (big gains) - removed_regions = [] # Listed by region ID - remove_threshold = 2 # Arbitrarily chosen - can be adjusted + removed_regions = [] # Listed by region ID + remove_threshold = 2 # Arbitrarily chosen - can be adjusted ##### PROBLEM: due to threshold might only take some regions of a particular gene when in reality, want all of them?? # Does this make a difference? x_T = N_T = x_N = N_N = 0.0 # 4 = OriStCoordinate, 5 = OriEndCoordinate - for line in contra: - k = float(line[5]) - float(line[4]) + 1 # Width / number of bases. Inclusive, so add 1. + k = ( + float(line[5]) - float(line[4]) + 1 + ) # Width / number of bases. Inclusive, so add 1. if float(line[7]) > remove_threshold: removed_regions.append(line[0]) x_T += float(line[16]) * k @@ -43,29 +45,33 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files # print removed_regions # 3. Calculate a_2, b_2 - n = 1 - float(x_N)/N_N - t = 1 - float(x_T)/N_T - a_2 = math.sqrt(n/t) - b_2 = math.sqrt(t/n) # or 1/a_2 - + n = 1 - float(x_N) / N_N + t = 1 - float(x_T) / N_T + a_2 = math.sqrt(n / t) + b_2 = math.sqrt(t / n) # or 1/a_2 # 4. Rescale read depths (and recalculate log ratio??) # Add to new columns # Need to multiply necessary tumour.rd (d_t) by a_2, normal.rd (d_n) by b_2 - headers = "Gene.Sym\tChr\tOriStCoordinate\tOriEndCoordinate\tMean.of.Log.Ratio.readj\n" # fix headers + headers = ( + "Gene.Sym\tChr\tOriStCoordinate\tOriEndCoordinate\tMean.of.Log.Ratio.readj\n" + ) # fix headers f_out.write(headers) for line in contra: temp = [line[2], line[3], line[4], line[5]] # Recalculate Mean.of.Log.Ratio - temp.append(str(math.log((float(line[14]) * a_2) /(float(line[15]) * b_2 ), 2))+'\n') + temp.append( + str(math.log((float(line[14]) * a_2) / (float(line[15]) * b_2), 2)) + "\n" + ) # Write to output - f_out.write('\t'.join(temp)) + f_out.write("\t".join(temp)) # 5. 
Close file f_out.close() + if len(sys.argv) < 3: - print "1_renorm.py: not enough input arguments" + print("1_renorm.py: not enough input arguments") sys.exit(1) contraF = sys.argv[1] @@ -73,5 +79,5 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files renorm_file(contraF, outF) -if sys.argv[3] == 'T': - print "1_renorm.py: created "+outF +if sys.argv[3] == "T": + print(f"1_renorm.py: created {outF}") diff --git a/WGCNV-NDE/2_process.py b/WGCNV-NDE/2_process.py index 708a980..873f318 100755 --- a/WGCNV-NDE/2_process.py +++ b/WGCNV-NDE/2_process.py @@ -4,7 +4,7 @@ import copy import sys -#----------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------------- # Based on the 27/08 version (Ca2015_Histograms/renorm.py) # Modified to just be run on one file # More descriptive for the exon level (coordinates in columns) @@ -12,7 +12,7 @@ # 17/09 - removed "Name" column # - edited _exon.txt output to use matched/"right" coordinates # 28/10 - no longer outputs NA -#----------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------------- # Global dictionary - {: { : [List of log ratios]}} # where Exon = startcoord_endcoord (Should be chromosome_startcoord_endcoord but thats not in renorm) @@ -20,38 +20,38 @@ # Error file (write out errors here) # JKS WHO NEEDS ERRORS -#error_out = "ALL_get_exons_err.txt" -#if testing_flag: +# error_out = "ALL_get_exons_err.txt" +# if testing_flag: # error_out = "TEST_get_exons_err.txt" -#errors = open(error_out, 'w') +# errors = open(error_out, 'w') if len(sys.argv) < 4: - print "2_process.py: not enough input arguments" + print("2_process.py: not enough input arguments") sys.exit(1) filename = sys.argv[1] wholegene_output = sys.argv[2] exon_output = sys.argv[3] -debug='F' +debug = "F" if len(sys.argv) == 6: - 
debug=sys.argv[5] + debug = sys.argv[5] # Create gdict entries based on myTSCA.bed.annotated.txt # Format: chromosome startcoord endcoord gene if len(sys.argv) >= 5: - myTSCA_in = sys.argv[4] + myTSCA_in = sys.argv[4] else: - myTSCA_in = "/mnt/Storage/NextGenSequencing/Data/Yeap/myTSCA.bed.annotated.txt" + myTSCA_in = "/mnt/Storage/NextGenSequencing/Data/Yeap/myTSCA.bed.annotated.txt" -myTSCA = open(myTSCA_in, 'r') +myTSCA = open(myTSCA_in, "r") for line in myTSCA.readlines(): - entries = line.split('\t') - entries[3] = entries[3].rstrip('\n') + entries = line.split("\t") + entries[3] = entries[3].rstrip("\n") if entries[3] not in gdict: gdict[entries[3]] = {} -# gdict[entries[3]]['_'.join(entries[1:3])] = [] - gdict[entries[3]][entries[0]+":"+'_'.join(entries[1:3])] = [] + # gdict[entries[3]]['_'.join(entries[1:3])] = [] + gdict[entries[3]][entries[0] + ":" + "_".join(entries[1:3])] = [] myTSCA.close() # Helper function @@ -61,6 +61,7 @@ def calculate_average(numblist): total += float(item) return total / float(len(numblist)) + # Helper function - list may contain "NA" entries def calculate_average_withmissing(numblist): total = 0.0 @@ -74,15 +75,16 @@ def calculate_average_withmissing(numblist): else: return "NA" + # Given a CONTRA output file, generate a list of exons with average log ratio per exon. 
def process_file(f_in): - GENE_IND=0 - CHR_IND=1 - START_IND=2 - END_IND=3 - LR_IND=4 + GENE_IND = 0 + CHR_IND = 1 + START_IND = 2 + END_IND = 3 + LR_IND = 4 - f = open(f_in, 'r') + f = open(f_in, "r") # 0 - Gene Sym # 1 - OriStCoord, 2 - OriEndCoord # 3 - Mean.of.LogRatio (renormalised) @@ -94,20 +96,25 @@ def process_file(f_in): # Temp list of all genes in gdict all_genes = gdict.keys() - + # Preprocessing contra = [] for line in f.readlines()[1:]: - newline = line.split('\t') - name = newline[GENE_IND].split('(')[0] + newline = line.split("\t") + name = newline[GENE_IND].split("(")[0] newline[GENE_IND] = name - newline[-1] = newline[-1].rstrip('\n') + newline[-1] = newline[-1].rstrip("\n") contra.append(newline) f.close() currgene = contra[0][GENE_IND] - temp = [contra[0][START_IND], contra[0][END_IND], [contra[0][LR_IND]], contra[0][CHR_IND]] # start end list of averages + temp = [ + contra[0][START_IND], + contra[0][END_IND], + [contra[0][LR_IND]], + contra[0][CHR_IND], + ] # start end list of averages exons[currgene] = {} exon_n = 1 for line in contra[1:]: @@ -118,14 +125,14 @@ def process_file(f_in): exons[currgene][exon_n] = temp currgene = line[GENE_IND] temp = [line[START_IND], line[END_IND], [line[LR_IND]], line[CHR_IND]] - if currgene not in exons: # if exon data gets broken up... + if currgene not in exons: # if exon data gets broken up... exon_n = 1 exons[currgene] = {} else: exon_n = len(exons[currgene].keys()) + 1 - + # If its a new exon... - elif line[START_IND] != temp[1]: # ie coordinates don't match up + elif line[START_IND] != temp[1]: # ie coordinates don't match up # Add old to exons dictionary, after calculating average log ratio over the exon. 
temp[2] = calculate_average(temp[2]) exons[currgene][exon_n] = temp @@ -146,8 +153,8 @@ def process_file(f_in): for exon in exons[gene].keys(): start = int(exons[gene][exon][0]) end = int(exons[gene][exon][1]) - ch = exons[gene][exon][3].replace("chr","") - exon_name = ch+":"+str(start)+'_'+str(end) + ch = exons[gene][exon][3].replace("chr", "") + exon_name = ch + ":" + str(start) + "_" + str(end) if exon_name in exon_list: gdict[gene][exon_name].append(exons[gene][exon][2]) exon_list.remove(exon_name) @@ -155,88 +162,98 @@ def process_file(f_in): # Attempt to match it to something in exon_list: up to 20 bases missing from each side is OK matched = False for possible_match in exon_list: - components = possible_match.split(":")[1].split('_') + components = possible_match.split(":")[1].split("_") possible_start = int(components[0]) possible_end = int(components[1]) start_diff = start - possible_start end_diff = possible_end - end - if start_diff < 21 and end_diff < 21 and start_diff >= 0 and end_diff >= 0: + if ( + start_diff < 21 + and end_diff < 21 + and start_diff >= 0 + and end_diff >= 0 + ): # Can match these two together - #errors.write('\t'+gene+'_'+possible_match+" is matched with incomplete "+exon_name+'\n') + # errors.write('\t'+gene+'_'+possible_match+" is matched with incomplete "+exon_name+'\n') gdict[gene][possible_match].append(exons[gene][exon][2]) exon_list.remove(possible_match) matched = True # Couldn't find a match for it: output error - #errors.write(gene+'_'+exon_name+" could not be found\n") + # errors.write(gene+'_'+exon_name+" could not be found\n") # Take care of missing data: remaining exons in exonlist for missing_exon in exon_list: gdict[gene][missing_exon].append("NA") - #errors.write(gene+'_'+missing_exon+" marked as missing\n") + # errors.write(gene+'_'+missing_exon+" marked as missing\n") all_genes.remove(gene) for gene in all_genes: # These genes had nothing in the input data. So, add NA to each exon of the gene. 
for exon in gdict[gene]: gdict[gene][exon].append("NA") - #errors.write(gene+" not found in input data\n") - + # errors.write(gene+" not found in input data\n") + # print 'Finished '+f_in[:9] + process_file(filename) +# Write it out to a file? +# if writeout: +# f2 = open(exon_output, 'w') +# headers = ['Gene', 'Start Coord', 'End Coord', 'Mean Log Ratio\n'] +# f2.write('\t'.join(headers)) +# for gene in sorted(exons.keys()): +# for exon in sorted(exons[gene].keys()): +# f2.write('\t'.join([gene, str(exons[gene][exon][0]), str(exons[gene][exon][1]), str(exons[gene][exon][2])+'\n'])) - # Write it out to a file? - #if writeout: - # f2 = open(exon_output, 'w') - # headers = ['Gene', 'Start Coord', 'End Coord', 'Mean Log Ratio\n'] - # f2.write('\t'.join(headers)) - #for gene in sorted(exons.keys()): - # for exon in sorted(exons[gene].keys()): - # f2.write('\t'.join([gene, str(exons[gene][exon][0]), str(exons[gene][exon][1]), str(exons[gene][exon][2])+'\n'])) +# f2.close() - #f2.close() - # write out gdict... -#output_name = "ALL_exons.txt" -#out = open(output_name, 'w') +# output_name = "ALL_exons.txt" +# out = open(output_name, 'w') # Write headers -#out.write("Exon Name\t"+'\t'.join(good_filenames)+'\n') # dont have bad headers! -#for gene in sorted(gdict.keys()): +# out.write("Exon Name\t"+'\t'.join(good_filenames)+'\n') # dont have bad headers! 
+# for gene in sorted(gdict.keys()): # for exon in sorted(gdict[gene].keys()): # out.write(gene+'_'+str(exon)) # for ratio in gdict[gene][exon]: # out.write('\t'+str(ratio)) - # out.write('\n') +# out.write('\n') -#out.close() -#print "Finished processing out to "+output_name +# out.close() +# print "Finished processing out to "+output_name -f2 = open(exon_output, 'w') -headers = ['Gene','Chr', 'Start Coord', 'End Coord', 'Mean Log Ratio\n'] -f2.write('\t'.join(headers)) +f2 = open(exon_output, "w") +headers = ["Gene", "Chr", "Start Coord", "End Coord", "Mean Log Ratio\n"] +f2.write("\t".join(headers)) for gene in sorted(gdict.keys()): for exon in sorted(gdict[gene].keys()): log_ratio = str(gdict[gene][exon][0]) - if (log_ratio != "NA"): - ch,coord=exon.split(":") - out = [gene, ch, coord.split('_')[0], coord.split('_')[1], str(gdict[gene][exon][0])] - f2.write('\t'.join(out)) - f2.write('\n') + if log_ratio != "NA": + ch, coord = exon.split(":") + out = [ + gene, + ch, + coord.split("_")[0], + coord.split("_")[1], + str(gdict[gene][exon][0]), + ] + f2.write("\t".join(out)) + f2.write("\n") f2.close() - # Duplicate gdict, just in case... 
gdict2 = copy.deepcopy(gdict) for gene in gdict2.keys(): - if ';' in gene: # If gene needs to be split - #print 'splitting',gene - components = gene.split(';') + if ";" in gene: # If gene needs to be split + # print 'splitting',gene + components = gene.split(";") for component_gene in components: if component_gene not in gdict2: gdict2[component_gene] = {} @@ -245,25 +262,27 @@ def process_file(f_in): gdict2[components[1]][exon] = gdict[gene][exon][:] del gdict2[gene] -wholegene = open(wholegene_output, 'w') +wholegene = open(wholegene_output, "w") # write headers wholegene.write("Gene\tRatio\n") for gene in sorted(gdict2.keys()): sample_temp = [] - for exon in gdict2[gene].keys(): # For each exon (in that gene): - #print len(gdict[gene][exon]) - #print "i = "+str(i)+" len(gdict[gene][exon])"+ str(len(gdict[gene][exon])) - sample_temp.append(gdict2[gene][exon][0]) # collect the log ratio of that exon of that sample (of that gene) + for exon in gdict2[gene].keys(): # For each exon (in that gene): + # print len(gdict[gene][exon]) + # print "i = "+str(i)+" len(gdict[gene][exon])"+ str(len(gdict[gene][exon])) + sample_temp.append( + gdict2[gene][exon][0] + ) # collect the log ratio of that exon of that sample (of that gene) log_ratio = calculate_average_withmissing(sample_temp) if log_ratio != "NA": wholegene.write(gene) - wholegene.write('\t'+str(log_ratio)) # whole gene log ratio = average of log ratios across all exons of that gene - wholegene.write('\n') + wholegene.write( + "\t" + str(log_ratio) + ) # whole gene log ratio = average of log ratios across all exons of that gene + wholegene.write("\n") wholegene.close() -#if sys.argv[4] == 'T': -if debug == 'T': - print "2_process.py: created "+wholegene_output - print "2_process.py: created "+exon_output - - +# if sys.argv[4] == 'T': +if debug == "T": + print(f"2_process.py: created {wholegene_output}") + print(f"2_process.py: created {exon_output}") diff --git a/WGCNV-NDE/3_wgsummary.R b/WGCNV-NDE/3_wgsummary.R 
index 794807d..5be5491 100644 --- a/WGCNV-NDE/3_wgsummary.R +++ b/WGCNV-NDE/3_wgsummary.R @@ -5,7 +5,7 @@ # command line input based on Jason Li's wgcnv_withBAFplot.R x=commandArgs() -args=x[match("--args",x)+1:length(x)] # gets the arguments after --args...? +args=x[match("--args",x)+1:length(x)] # gets the arguments after --args...? scriptPath = args[1] f_out = args[4] @@ -55,7 +55,7 @@ mu = 0.05 getExonStat = function(exon, ex_av) { ex_all_av = data_thresh_exon[exon,]$Cropped.Mean ex_all_var = data_thresh_exon[exon,]$Cropped.Var - + # Calculate the pval. If log ratio below average use lower tail, otherwise use upper # print(exon) # print(ex_av) @@ -66,7 +66,7 @@ getExonStat = function(exon, ex_av) { tail = FALSE } ex_pval = pnorm(ex_av, ex_all_av, sqrt(ex_all_var), lower.tail = tail) - + ex_gainloss = "." # Is it a gain or a loss? Using significance level of mu if (ex_pval < mu) { @@ -77,10 +77,10 @@ getExonStat = function(exon, ex_av) { ex_gainloss = "gain" } } - - ex_outdf = data.frame(Exon = exon, Av.LR = ex_av, Overall.Av.LR = ex_all_av, Overall.Var.LR = ex_all_var, + + ex_outdf = data.frame(Exon = exon, Av.LR = ex_av, Overall.Av.LR = ex_all_av, Overall.Var.LR = ex_all_var, pval = ex_pval, GainLoss = ex_gainloss) - + return(ex_outdf) } @@ -103,7 +103,7 @@ getGeneStat = function(gene) { tail = FALSE } pval = pnorm(av, all_av, sqrt(all_var), lower.tail = tail) - + gainloss = "." # Is it a gain or a loss? Using significance level of mu if (pval < mu) { @@ -114,12 +114,12 @@ getGeneStat = function(gene) { gainloss = "gain" } } - - + + # Now make it into a data frame and return it outdf = data.frame(Gene = gene, Av.LR = av, Overall.Av.LR = all_av, Overall.Var.LR = all_var, pval = pval, GainLoss = gainloss) - + # Technically don't have to do this, but do it anyway return (outdf) } @@ -137,13 +137,13 @@ for (i in 1:length(exons)) { exon = paste(ex_row$Gene, "_",ex_row$Chr,":",ex_row$Start.Coord, "_",ex_row$End.Coord, sep = "") # print(exon) - + if ( ! 
is.na( data_thresh_exon[exon,"Cropped.Var"]) ) { # Get the data ex_tmpdf = getExonStat(exon, ex_row$Mean.Log.Ratio) # Want to have column names and not append (ie overwrite) for the first gene # But for the rest, want to have no column names (already have!) and append (so we dont lose data) - write.table(ex_tmpdf, f_ex_out, col.names = !app, append = app, sep = "\t", + write.table(ex_tmpdf, f_ex_out, col.names = !app, append = app, sep = "\t", row.names = FALSE, quote = FALSE) app = TRUE } @@ -153,7 +153,7 @@ for (i in 1:length(exons)) { # Now, we want to adjust the pvalues and then overwrite the table with new data ex_outdf = read.delim(f_ex_out, as.is = TRUE) ex_outdf$adj.pval = p.adjust(ex_outdf$pval, method = "BH") -write.table(ex_outdf, f_ex_out, col.names = TRUE, append = FALSE, sep = "\t", +write.table(ex_outdf, f_ex_out, col.names = TRUE, append = FALSE, sep = "\t", row.names = FALSE, quote = FALSE) # Then go calculate the adj.GainLoss ex_outdf = read.delim(f_ex_out, as.is = TRUE) @@ -167,7 +167,7 @@ for (i in 1:length(row.names(ex_outdf))) { } } } -write.table(ex_outdf, f_ex_out, col.names = TRUE, append = FALSE, sep = "\t", +write.table(ex_outdf, f_ex_out, col.names = TRUE, append = FALSE, sep = "\t", row.names = FALSE, quote = FALSE) @@ -183,13 +183,13 @@ for (i in 1:length(genes)) { # Check gene actually exists #print(is.na(data_wg[gene,])) if(is.na(data_wg[gene,]) == FALSE) { - + if (! is.na(data_thresh_wg[gene,]$Cropped.Var) ) { # Get the data tmpdf = getGeneStat(gene) # Want to have column names and not append (ie overwrite) for the first gene # But for the rest, want to have no column names (already have!) 
and append (so we dont lose data) - write.table(tmpdf, f_out, col.names = !app, append = app, sep = "\t", + write.table(tmpdf, f_out, col.names = !app, append = app, sep = "\t", row.names = FALSE, quote = FALSE) app = TRUE } @@ -199,7 +199,7 @@ for (i in 1:length(genes)) { # Now, we want to adjust the pvalues and then overwrite the table with new data outdf = read.delim(f_out, as.is = TRUE) outdf$adj.pval = p.adjust(outdf$pval, method = "BH") -write.table(outdf, f_out, col.names = TRUE, append = FALSE, sep = "\t", +write.table(outdf, f_out, col.names = TRUE, append = FALSE, sep = "\t", row.names = FALSE, quote = FALSE) # Now calculate the exon level information for each gene @@ -216,15 +216,15 @@ for (i in 1:length(genes)) { gene.exons.sig.gain.adj = 0 gene.exons.sig.loss.adj = 0 gene.n.exons = 0 - + print (paste("Processing gene ",i,"out of",length(genes))) - + # Go through each exon in f_ex_out for (j in 1:length(exons)) { # print(genes[i]) - + tmp_ex_name = ex_outdf[j,]$Exon - #print(tmp_ex_name) + #print(tmp_ex_name) if (!is.na(tmp_ex_name)) { # why is this even a problem?? 
if (tmp_ex_name == genes[i] || grepl(genes[i], tmp_ex_name)) { # An exon of the gene @@ -243,13 +243,13 @@ print (paste("Processing gene ",i,"out of",length(genes))) } } - + outdf[i,]$exons.sig.gain = gene.exons.sig.gain outdf[i,]$exons.sig.loss = gene.exons.sig.loss outdf[i,]$exons.sig.gain.adj = gene.exons.sig.gain.adj outdf[i,]$exons.sig.loss.adj = gene.exons.sig.loss.adj outdf[i,]$n.exons = gene.n.exons - + # adj.GainLoss stuff here if (outdf[i,]$adj.pval < mu) { if (outdf[i,]$Av.LR < outdf[i,]$Overall.Av.LR) { @@ -258,10 +258,10 @@ print (paste("Processing gene ",i,"out of",length(genes))) outdf[i,]$adj.GainLoss = "gain" } } - + } -write.table(outdf, f_out, col.names = TRUE, append = FALSE, sep = "\t", +write.table(outdf, f_out, col.names = TRUE, append = FALSE, sep = "\t", row.names = FALSE, quote = FALSE) diff --git a/WGCNV-NDE/4_plot.R b/WGCNV-NDE/4_plot.R index 5e31561..78f3171 100644 --- a/WGCNV-NDE/4_plot.R +++ b/WGCNV-NDE/4_plot.R @@ -4,7 +4,7 @@ #g.dat=read.delim("1003-Tumour_wgSummary.txt",as.is=T) x=commandArgs() -args=x[match("--args",x)+1:length(x)] +args=x[match("--args",x)+1:length(x)] #ex.f="14K0038_exSummary.txt" #g.f="14K0038_wgSummary.txt" diff --git a/WGCNV-NDE/wgcnv_wrapper.py b/WGCNV-NDE/wgcnv_wrapper.py index abaefa4..23e3978 100755 --- a/WGCNV-NDE/wgcnv_wrapper.py +++ b/WGCNV-NDE/wgcnv_wrapper.py @@ -9,44 +9,45 @@ import sys import os -USAGE = "python %s [debug output T/F] [human|mouse]" %sys.argv[0] + +USAGE = f"python {sys.argv[0]} [debug output T/F] [human|mouse]" # If not enough arguments, print how to use the file if len(sys.argv) < 5: - print USAGE + print(USAGE) sys.exit(1) if len(sys.argv) < 7: - debug = 'F' + debug = "F" else: debug = sys.argv[6] if len(sys.argv) < 8: - species="human" + species = "human" else: - species=sys.argv[7] + species = sys.argv[7] -bedAnnotatedTxt=sys.argv[5] +bedAnnotatedTxt = sys.argv[5] # Grab the sample path and name from command line input sampPath = sys.argv[1] sampName = 
os.path.basename(sampPath) -scriptPath=os.path.realpath(os.path.dirname(sys.argv[0])) +scriptPath = os.path.realpath(os.path.dirname(sys.argv[0])) -contraF=None +contraF = None # Attempt to find the appropriate CONTRA table file -for root,dirs,files in os.walk(sampPath): +for root, dirs, files in os.walk(sampPath): # find the table folder - if os.path.basename(root)=="table": + if os.path.basename(root) == "table": for f in files: if f.endswith("bins.txt"): if contraF is not None: - print "WARNING: multiple contra output files detected." + print("WARNING: multiple contra output files detected.") break - contraF=os.path.join(root,f) + contraF = os.path.join(root, f) if contraF: break # If there isnt a CONTRA file, exit if contraF == None: - print "ERROR: Could not find CONTRA output file." + print("ERROR: Could not find CONTRA output file.") sys.exit(1) # Create output directory if needed @@ -55,28 +56,27 @@ os.makedirs(outPath) - # 1. RENORM.PY # makes _renorm.txt # This may not be necessary, depending if renorm already exists... <- i guess just remake it to ensure no tampering? # Perhaps come back and tidy this up later? renorm_out = os.path.normpath(os.path.join(outPath, sampName + "_renorm.txt")) -renorm_cmd = "python %s/1_renorm.py %s %s %s" % (scriptPath, contraF, renorm_out, debug) -if debug == 'T': - print "............" - print "Step 1: 1_renorm.py" - print renorm_cmd +renorm_cmd = f"python {scriptPath}/1_renorm.py {contraF} {renorm_out} {debug}" +if debug == "T": + print("............") + print("Step 1: 1_renorm.py") + print(renorm_cmd) os.system(renorm_cmd) # 2. PROCESS.PY # makes _exon.txt, _wg.txt process_exon_out = os.path.normpath(os.path.join(outPath, sampName + "_exon.txt")) process_wg_out = os.path.normpath(os.path.join(outPath, sampName + "_wg.txt")) -process_cmd = "python %s/2_process.py %s %s %s %s %s" % (scriptPath, renorm_out, process_wg_out, process_exon_out, bedAnnotatedTxt, debug) -if debug == 'T': - print "............" 
- print "Step 2: 2_process.py" - print process_cmd +process_cmd = f"python {scriptPath}/2_process.py {renorm_out} {process_wg_out} {process_exon_out} {bedAnnotatedTxt} {debug}" +if debug == "T": + print("............") + print("Step 2: 2_process.py") + print(process_cmd) os.system(process_cmd) # 3. WGSUMMARY.R @@ -85,34 +85,31 @@ ex_summary_out = os.path.normpath(os.path.join(outPath, sampName + "_exSummary.txt")) thresh_wg = sys.argv[3] thresh_exon = sys.argv[4] -summary_cmd = "Rscript %s %s %s %s %s %s %s %s %s" % (os.path.join(scriptPath, "3_wgsummary.R"), \ - scriptPath, process_wg_out, process_exon_out, summary_out, ex_summary_out, - thresh_wg, thresh_exon, debug) -if debug == 'T': - print "............" - print "Step 3: 3_wgsummary.R" - print summary_cmd +summary_cmd = f"Rscript {scriptPath}/3_wgsummary.R {scriptPath} {process_wg_out} {process_exon_out} {summary_out} {ex_summary_out} {thresh_wg} {thresh_exon} {debug}" + +if debug == "T": + print("............") + print("Step 3: 3_wgsummary.R") + print(summary_cmd) os.system(summary_cmd) -print "Finished: created %s %s" % (summary_out, ex_summary_out) +print(f"Finished: created {summary_out} {ex_summary_out}") # 4. PLOT.R -outf = os.path.normpath(os.path.join(outPath,sampName+"_plot.pdf")) +outf = os.path.normpath(os.path.join(outPath, sampName + "_plot.pdf")) if species == "human": - species_f = os.path.join(scriptPath,"hg19_chrlen.txt") + species_f = os.path.join(scriptPath, "hg19_chrlen.txt") else: - raise Exception("species not implemented: "+species) + raise Exception("species not implemented: " + species) -plotScript = os.path.join(scriptPath,"4_plot.R") -plot_cmd = "Rscript %s %s %s %s %s %s %s" % (plotScript, ex_summary_out,summary_out,sampName,species,species_f,outf) +plotScript = os.path.join(scriptPath, "4_plot.R") +plot_cmd = f"Rscript {plotScript} {ex_summary_out} {summary_out} {sampName} {species} {species_f} {outf}" -print "............" 
-print "Step 4: 4_plot.R" -print plot_cmd +print("............") +print("Step 4: 4_plot.R") +print(plot_cmd) os.system(plot_cmd) -print "Finished: created %s" % outf - - +print("Finished: created {outf}") diff --git a/WGCNV-SINGLE/wgcnv.py b/WGCNV-SINGLE/wgcnv.py index 6f5edf6..6f819a0 100755 --- a/WGCNV-SINGLE/wgcnv.py +++ b/WGCNV-SINGLE/wgcnv.py @@ -2,68 +2,70 @@ import sys import os -USAGE="%s [outDir] [outPrefix]" % sys.argv[0] + +USAGE = ( + f"{sys.argv[0]} [outDir] [outPrefix]" +) if len(sys.argv) < 4: - print USAGE - sys.exit(1) + print(USAGE) + sys.exit(1) -sampPath=sys.argv[1] -samp=os.path.basename(sampPath) +sampPath = sys.argv[1] +samp = os.path.basename(sampPath) -if sys.argv[2]=="human": - chrlen_f="hg19_chrlen.txt" -elif sys.argv[2]=="mouse": - chrlen_f="mm9_chrlen.txt" +if sys.argv[2] == "human": + chrlen_f = "hg19_chrlen.txt" +elif sys.argv[2] == "mouse": + chrlen_f = "mm9_chrlen.txt" else: - print "Invalid input - %s" % sys.argv[2] + print(f"Invalid input - {sys.argv[2]}") variantF = sys.argv[3] if len(sys.argv) > 4: - outPath=sys.argv[4] + outPath = sys.argv[4] else: - outPath="." + outPath = "." if len(sys.argv) > 5: - outPrefix=sys.argv[5] + outPrefix = sys.argv[5] else: - outPrefix=samp + outPrefix = samp + +scriptPath = os.path.realpath(os.path.dirname(sys.argv[0])) +rscript = os.path.join(scriptPath, "wgcnv_withBAFplot.R") -scriptPath=os.path.realpath(os.path.dirname(sys.argv[0])) -rscript=os.path.join(scriptPath,"wgcnv_withBAFplot.R") +contraF = None +for root, dirs, files in os.walk(sampPath): + if os.path.basename(root) == "table": + for f in files: + if f.endswith("bins.txt"): + if contraF is not None: + print("WARNING: multiple contra output files are detected.") + break + contraF = os.path.join(root, f) -contraF=None -for root,dirs,files in os.walk(sampPath): - if os.path.basename(root)=="table": - for f in files: - if f.endswith("bins.txt"): - if contraF is not None: - print "WARNING: multiple contra output files is detected." 
- break - contraF=os.path.join(root,f) - - if contraF: - break + if contraF: + break -print contraF +print(contraF) -#variantF = os.path.join(sampPath,samp+"_Ensembl_annotated.tsv") +# variantF = os.path.join(sampPath,samp+"_Ensembl_annotated.tsv") if not os.path.isfile(variantF): - raise Exception("Variant file not found %s" % variantF) + raise Exception(f"Variant file not found {variantF}") if not os.path.exists(outPath): - os.makedirs(outPath) + os.makedirs(outPath) -outp=os.path.join(outPath,outPrefix) -cmd="Rscript %s %s %s %s %s" % (rscript,contraF,variantF,chrlen_f,outp) -print cmd +outp = os.path.join(outPath, outPrefix) +cmd = f"Rscript {rscript} {contraF} {variantF} {chrlen_f} {outp}" +print(cmd) os.system(cmd) -cmd2="convert %s*byChr*png %s_byChr.pdf" % (outp,outp) +cmd2 = f"convert {outp}*byChr*png {outp}_byChr.pdf" os.system(cmd2) -os.system("rm %s*byChr*png" %outp) - +os.system(f"rm {outp}*byChr*png") diff --git a/WGCNV-SINGLE/wgcnv_noBAF.py b/WGCNV-SINGLE/wgcnv_noBAF.py index b626693..d8b97b0 100755 --- a/WGCNV-SINGLE/wgcnv_noBAF.py +++ b/WGCNV-SINGLE/wgcnv_noBAF.py @@ -2,66 +2,66 @@ import sys import os -USAGE="%s [outDir] [outPrefix]" % sys.argv[0] + +USAGE = f"{sys.argv[0]} [outDir] [outPrefix]" if len(sys.argv) < 3: - print USAGE - sys.exit(1) + print(USAGE) + sys.exit(1) -sampPath=sys.argv[1] -samp=os.path.basename(sampPath) +sampPath = sys.argv[1] +samp = os.path.basename(sampPath) -if sys.argv[2]=="human": - chrlen_f="hg19_chrlen.txt" -elif sys.argv[2]=="mouse": - chrlen_f="mm9_chrlen.txt" +if sys.argv[2] == "human": + chrlen_f = "hg19_chrlen.txt" +elif sys.argv[2] == "mouse": + chrlen_f = "mm9_chrlen.txt" else: - print "Invalid input - %s" % sys.argv[2] + print(f"Invalid input - {sys.argv[2]}") if len(sys.argv) > 3: - outPath=sys.argv[3] + outPath = sys.argv[3] else: - outPath="." + outPath = "." 
if len(sys.argv) > 4: - outPrefix=sys.argv[4] + outPrefix = sys.argv[4] else: - outPrefix=samp + outPrefix = samp + +scriptPath = os.path.realpath(os.path.dirname(sys.argv[0])) +rscript = os.path.join(scriptPath, "wgcnv_withBAFplot.R") -scriptPath=os.path.realpath(os.path.dirname(sys.argv[0])) -rscript=os.path.join(scriptPath,"wgcnv_withBAFplot.R") +contraF = None +for root, dirs, files in os.walk(sampPath): + if os.path.basename(root) == "table": + for f in files: + if f.endswith("bins.txt"): + if contraF is not None: + print("WARNING: multiple contra output files is detected.") + break + contraF = os.path.join(root, f) -contraF=None -for root,dirs,files in os.walk(sampPath): - if os.path.basename(root)=="table": - for f in files: - if f.endswith("bins.txt"): - if contraF is not None: - print "WARNING: multiple contra output files is detected." - break - contraF=os.path.join(root,f) - - if contraF: - break + if contraF: + break -print contraF +print(contraF) -variantF = "NA"#os.path.join(sampPath,samp+"_Ensembl_annotated.tsv") -#if not os.path.isfile(variantF): -# raise Exception("Variant file not found %s" % variantF) +variantF = "NA" # os.path.join(sampPath,samp+"_Ensembl_annotated.tsv") +# if not os.path.isfile(variantF): +# raise Exception("Variant file not found %s" % variantF) if not os.path.exists(outPath): - os.makedirs(outPath) + os.makedirs(outPath) -outp=os.path.join(outPath,outPrefix) -cmd="Rscript %s %s %s %s %s" % (rscript,contraF,variantF,chrlen_f,outp) -print cmd +outp = os.path.join(outPath, outPrefix) +cmd = f"Rscript {rscript} {contraF} {variantF} {chrlen_f} {outp}" +print(cmd) os.system(cmd) -cmd2="convert %s*byChr*png %s_byChr.pdf" % (outp,outp) +cmd2 = f"convert {outp}*byChr*png {outp}_byChr.pdf" os.system(cmd2) -os.system("rm %s*byChr*png" %outp) - +os.system(f"rm {outp}*byChr*png") diff --git a/scripts/cn_analysis.v4.R b/scripts/cn_analysis.v4.R index 14db147..c99d714 100755 --- a/scripts/cn_analysis.v4.R +++ b/scripts/cn_analysis.v4.R @@ 
-17,7 +17,7 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . # -# +# #-----------------------------------------------------------------------# # Last Updated : 30 Sept 2011 17:00PM @@ -57,7 +57,7 @@ cn.boundary= read.delim(boundariesFile,as.is=F, header=F) cn.average.aboveTs = cn.average[cn.average$V3>min.bases,] cn.average.list = as.matrix(cn.average.aboveTs$V4) -# Get the mean and sd for each bins +# Get the mean and sd for each bins cn.average.mean = c() cn.average.sd = c() cn.average.log= c() @@ -87,7 +87,7 @@ if (plotoption == "True"){ dev.off() } -# Put the data's details into matrices +# Put the data's details into matrices ids = as.matrix(cn.average.aboveTs$V1) exons = as.matrix(cn.average.aboveTs$V6) exons.pos = as.matrix(cn.average.aboveTs$V5) @@ -162,7 +162,7 @@ fit.sd.fn <- function(x, fit.a, fit.b){ result = 2 ^ (fit.mean.fn(x, fit.a, fit.b)) return (result) } - + # Get the P Values, called the gain/loss # with average and sd from each bins pVal.list = c() @@ -176,7 +176,7 @@ for (i in 1:nrow(cn.average.list)){ logcov = logcov.mean[i] exon.bin = Bin[i] - if (length(logratios.sd) > 1){ + if (length(logratios.sd) > 1){ pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), fit.sd.fn(logcov, fit.sd.a, fit.sd.b)) } else { pVal <- pnorm(logratio, 0, logratios.sd[exon.bin]) @@ -213,7 +213,7 @@ write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T) #Plotting SD #a.sd.fn = rep(fit.sd.a, length(logratios.sd.ori)) -#b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori)) +#b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori)) #sd.after.fit = fit.sd.fn(logcov.bins.mean.ori, fit.sd.a, fit.sd.b) #sd.out.f = paste(outf, "/plot/", sample.name, "sd.data_fit.", bins, "bins.txt", sep="") #sd.outdf = data.frame(SD.Before.Fit = logratios.sd.ori, Log.Coverage = logcov.bins.mean.ori, SD.After.Fit = sd.after.fit, a.for.fitting=a.sd.fn, b.for.fitting=b.sd.fn) @@ -223,6 +223,3 @@ 
write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T) #End of the script print ("End of cn_analysis.R") print (i) - - - diff --git a/scripts/large_region_cbs.R b/scripts/large_region_cbs.R index 64c9797..476b859 100755 --- a/scripts/large_region_cbs.R +++ b/scripts/large_region_cbs.R @@ -17,7 +17,7 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . # -# +# #-----------------------------------------------------------------------# # Last Updated : 12 October 2011 16:43PM @@ -73,7 +73,7 @@ for (i in kmax.range){ cna.start.coord = dat[dat$Targeted.Region.ID==(cna.start.id),]$OriStCoordinate cna.end.coord = dat[dat$Targeted.Region.ID==(cna.end.id), ]$OriEndCoordinate - dat.inrange = dat[dat$Targeted.Region.ID<=cna.end.id&dat$Targeted.Region.ID>=cna.start.id,] + dat.inrange = dat[dat$Targeted.Region.ID<=cna.end.id&dat$Targeted.Region.ID>=cna.start.id,] cna.segment.logratios = dat.inrange$Adjusted.Mean.of.LogRatio cna.segment.pvalues = dat.inrange$Adjusted.P.Value segment.pvals.above = dat.inrange[dat.inrange$Adjusted.P.Value<=param.pval.cutoff,]$Adjusted.P.Value @@ -97,7 +97,7 @@ for (i in kmax.range){ cna.segment.calls = c(cna.segment.calls, "CNV") } } - } + } outdf = data.frame(Chr=cna.segment.out$chrom, Target.Start=cna.segment.out$loc.start, Target.End=cna.segment.out$loc.end, NumberOfTargets=cna.segment.out$num.mark, OriStCoordinate=cna.segment.start, OriEndCoordinate=cna.segment.end, CBS.Mean = cna.segment.out$seg.mean, LogRatios = cna.segment.mean, Above.PValues.Cutoff=cna.segment.pvals.size, Calls = cna.segment.calls) @@ -123,11 +123,11 @@ for (i in 1:nrow(small.call)){ start.small = small.segment$Target.Start end.small = small.segment$Target.End match.large.nocall = large.nocall[large.nocall$Chr==chr.small,] - - match.large.nocall = match.large.nocall[match.large.nocall$Target.End>=start.small,] - match.large.nocall = match.large.nocall[match.large.nocall$Target.Start<=start.small,] + + 
match.large.nocall = match.large.nocall[match.large.nocall$Target.End>=start.small,] + match.large.nocall = match.large.nocall[match.large.nocall$Target.Start<=start.small,] merged.hole = match.large.nocall[match.large.nocall$Target.End>=start.small,] - + if (nrow(merged.hole) > 0){ included.segment = merge(included.segment, small.segment, all=T) } @@ -135,16 +135,3 @@ for (i in 1:nrow(small.call)){ out.f = paste(logcov.file, ".LargeDeletion.txt", sep="") write.table(included.segment, out.f, sep="\t", quote=F, row.names=F) - - - - - - - - - - - - - From ac944e3d5504eaae5c5d6d316322587306d408d1 Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 11:04:31 +1100 Subject: [PATCH 3/9] too much work to do autodetection currently --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index a742196..0c75a48 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ http://contra-cnv.sourceforge.net/ v2.1.0 Uses python3 Improved performance through multithreading and restructuring of loops -Autodetection of file formats (work in progress) v2.0.8 Included NDE and WGCNV workflows. 
From e1e9ceda8489df62fadc3abdbe6059fabd41ed86 Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 11:13:29 +1100 Subject: [PATCH 4/9] updated NDE part to python3 --- .gitignore | 3 - NullDistEstimation/1_renormAll.py | 82 +++++++------ NullDistEstimation/2_getExons.py | 189 ++++++++++++++++-------------- NullDistEstimation/3_density.R | 18 +-- NullDistEstimation/nde_wrapper.py | 76 ++++++------ 5 files changed, 201 insertions(+), 167 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 5a77cbc..0000000 --- a/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.pyc -test -TestFiles diff --git a/NullDistEstimation/1_renormAll.py b/NullDistEstimation/1_renormAll.py index fa83e2e..3ed6508 100644 --- a/NullDistEstimation/1_renormAll.py +++ b/NullDistEstimation/1_renormAll.py @@ -6,7 +6,7 @@ import sys if len(sys.argv) < 5: - print "Error: 1_renormAll.py Not enough arguments" + print("Error: 1_renormAll.py Not enough arguments") sys.exit(0) contraList = sys.argv[1] @@ -14,11 +14,14 @@ renormList = sys.argv[3] debug = sys.argv[4] -if debug == 'T': - a_and_b = open(os.path.join(os.path.split(renormList)[0],'debug/1_abstats.txt'), 'w') - a_and_b.write('SampleName\ta_2\tb_2\t\n') +if debug == "T": + a_and_b = open( + os.path.join(os.path.split(renormList)[0], "debug/1_abstats.txt"), "w" + ) + a_and_b.write("SampleName\ta_2\tb_2\t\n") -def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files + +def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files f_CONTRA_out = open(path_CONTRA_out, "r") f_out = open(path_out, "w") @@ -29,23 +32,24 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files headers = lines[0] contra = [] for line in lines[1:]: - contra.append(line.split('\t')) + contra.append(line.split("\t")) # For each line in contra: # 0 = region ID, 6 = Mean.of.LogRatio, 7 = Adjusted.Mean.of.LogRatio, 14 = tumour.rd, 15 = normal.rd, 16 = tumour.rd.ori, 
17 = normal.rd.ori - + # 2. Go through data, calculating x_T, N_T, x_N, N_N # as well as creating list of genes to remove (big gains) - removed_regions = [] # Listed by region ID - remove_threshold = 2 # Arbitrarily chosen - can be adjusted + removed_regions = [] # Listed by region ID + remove_threshold = 2 # Arbitrarily chosen - can be adjusted ##### PROBLEM: due to threshold might only take some regions of a particular gene when in reality, want all of them?? # Does this make a difference? x_T = N_T = x_N = N_N = 0.0 # 4 = OriStCoordinate, 5 = OriEndCoordinate - for line in contra: - k = float(line[5]) - float(line[4]) + 1 # Width / number of bases. Inclusive, so add 1. + k = ( + float(line[5]) - float(line[4]) + 1 + ) # Width / number of bases. Inclusive, so add 1. if float(line[7]) > remove_threshold: removed_regions.append(line[0]) x_T += float(line[16]) * k @@ -56,42 +60,47 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files # print removed_regions # 3. Calculate a_2, b_2 - n = 1 - float(x_N)/N_N - t = 1 - float(x_T)/N_T - a_2 = math.sqrt(n/t) - b_2 = math.sqrt(t/n) # or 1/a_2 - + n = 1 - float(x_N) / N_N + t = 1 - float(x_T) / N_T + a_2 = math.sqrt(n / t) + b_2 = math.sqrt(t / n) # or 1/a_2 # 4. Rescale read depths (and recalculate log ratio??) 
# Add to new columns # Need to multiply necessary tumour.rd (d_t) by a_2, normal.rd (d_n) by b_2 - headers = "Gene.Sym\tChr\tOriStCoordinate\tOriEndCoordinate\tMean.of.Log.Ratio.readj\n" # fix headers + headers = ( + "Gene.Sym\tChr\tOriStCoordinate\tOriEndCoordinate\tMean.of.Log.Ratio.readj\n" + ) # fix headers f_out.write(headers) for line in contra: temp = [line[2], line[3], line[4], line[5]] # calculate new read depths (actually you dont need these) - #temp.append(str(float(line[14]) * a_2)) - #temp.append(str(float(line[15]) * b_2)) + # temp.append(str(float(line[14]) * a_2)) + # temp.append(str(float(line[15]) * b_2)) # Recalculate Mean.of.Log.Ratio - temp.append(str(math.log((float(line[14]) * a_2) /(float(line[15]) * b_2 ), 2))+'\n') + temp.append( + str(math.log((float(line[14]) * a_2) / (float(line[15]) * b_2), 2)) + "\n" + ) # Write to output - f_out.write('\t'.join(temp)) + f_out.write("\t".join(temp)) # 5. Close file f_out.close() - if debug == 'T': - a_and_b.write('\t'.join([os.path.basename(path_CONTRA_out), str(a_2), str(b_2)+'\n'])) + if debug == "T": + a_and_b.write( + "\t".join([os.path.basename(path_CONTRA_out), str(a_2), str(b_2) + "\n"]) + ) # get file paths from list # note that these are paths, not just names: makes it a bit trickier -dirs = open(contraList, 'r') -renormGood = open(renormList, 'w') +dirs = open(contraList, "r") +renormGood = open(renormList, "w") filepaths = [] bad_filepaths = [] for line in dirs.readlines(): - filepaths.append(line.rstrip('\n\r')) + filepaths.append(line.rstrip("\n\r")) dirs.close() middle = "/table/" @@ -102,21 +111,20 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files count = 0 for filepath in filepaths: count += 1 - if count % 100 == 0 and debug == 'T': - print "...Now processing "+str(count)+"..." 
-    filename = os.path.basename(filepath.rstrip("/\\ ")) # want to get file
-    cfilepath = os.path.join(filepath+middle, filename+suffix)
+    if count % 100 == 0 and debug == "T":
+        print(f"...Now processing {str(count)}...")
+    filename = os.path.basename(filepath.rstrip("/\\ "))  # want to get file
+    cfilepath = os.path.join(filepath + middle, filename + suffix)
     # print cfilepath
-    outpath = os.path.join(renormPath, filename+outsuffix)
+    outpath = os.path.join(renormPath, filename + outsuffix)
     if os.path.exists(cfilepath):
         renorm_file(cfilepath, outpath)
-        renormGood.write(filename+'\n')
+        renormGood.write(filename + "\n")
     else:
         bad_filepaths.append(filename)
 
-if debug == 'T':
+if debug == "T":
     a_and_b.close()
-    print "Finished processing out to %s" % renormList
-    print "Bad paths:"
-    print " ".join(bad_filepaths)
-    
+    print(f"Finished processing out to {renormList}")
+    print("Bad paths:")
+    print(" ".join(bad_filepaths))
diff --git a/NullDistEstimation/2_getExons.py b/NullDistEstimation/2_getExons.py
index c594247..4c5b09f 100644
--- a/NullDistEstimation/2_getExons.py
+++ b/NullDistEstimation/2_getExons.py
@@ -2,15 +2,15 @@
 import os
 import sys
 
-#-----------------------------------------------------------------------------------
+# -----------------------------------------------------------------------------------
 # Parts based off 10/09/13 version of get_exons_current.py
 # Modified to take command line input
 # removed deepcopy cos that was stupid
-#-----------------------------------------------------------------------------------
+# -----------------------------------------------------------------------------------
 # getExons_cmd = "python 2_getExons.py %s %s %s %s %s %s" % (renormPath, renormList, exonSummary, wgSummary, debug, bed)
-if (sys.argv < 7):
-    print "step 2 error not enough arguments"
+if len(sys.argv) < 7:
+    print("step 2 error not enough arguments")
     sys.exit(1)
 
 renormPath = sys.argv[1]
@@ -26,24 +26,26 @@
 gdict = {}
 
 # Error file (write out errors here)
-if 
debug == 'T': - error_out = os.path.join(os.path.split(exonSummary)[0], "debug/2_getExons_errors.txt") - errors = open(error_out, 'w') +if debug == "T": + error_out = os.path.join( + os.path.split(exonSummary)[0], "debug/2_getExons_errors.txt" + ) + errors = open(error_out, "w") # Create gdict entries based on myTSCA.bed.annotated.txt # Format: chromosome startcoord endcoord gene -#myTSCA_in = "/home/lij/Current_Routine_Support/TSCA_WGCNV/myTSCA.bed.annotated.txt" -#myTSCA_in = "/home/eyeap/haloplex/halo.bed.annotated.txt" # halo bed? +# myTSCA_in = "/home/lij/Current_Routine_Support/TSCA_WGCNV/myTSCA.bed.annotated.txt" +# myTSCA_in = "/home/eyeap/haloplex/halo.bed.annotated.txt" # halo bed? myTSCA_in = bed -myTSCA = open(myTSCA_in, 'r') +myTSCA = open(myTSCA_in, "r") for line in myTSCA.readlines(): - entries = line.split('\t') - entries[3] = entries[3].rstrip('\n') + entries = line.split("\t") + entries[3] = entries[3].rstrip("\n") if entries[3] not in gdict: gdict[entries[3]] = {} - #gdict[entries[3]]['_'.join(entries[1:3])] = [] - gdict[entries[3]][entries[0]+":"+'_'.join(entries[1:3])] = [] + # gdict[entries[3]]['_'.join(entries[1:3])] = [] + gdict[entries[3]][entries[0] + ":" + "_".join(entries[1:3])] = [] myTSCA.close() # Helper function @@ -53,6 +55,7 @@ def calculate_average(numblist): total += float(item) return total / float(len(numblist)) + # Helper function - list may contain "NA" entries def calculate_average_withmissing(numblist): total = 0.0 @@ -66,17 +69,18 @@ def calculate_average_withmissing(numblist): else: return "NA" + # Given a CONTRA output file, generate a list of exons with average log ratio per exon. 
def process_file(f_in): - GENE_IND=0 - CHR_IND=1 - START_IND=2 - END_IND=3 - LR_IND=4 - - f = open(f_in, 'r') - if debug == 'T': - errors.write('\n\n'+f_in+'\n') + GENE_IND = 0 + CHR_IND = 1 + START_IND = 2 + END_IND = 3 + LR_IND = 4 + + f = open(f_in, "r") + if debug == "T": + errors.write("\n\n" + f_in + "\n") # 0 - Gene Sym # 1 - OriStCoord, 2 - OriEndCoord # 3 - Mean.of.LogRatio (renormalised) @@ -88,20 +92,25 @@ def process_file(f_in): # Temp list of all genes in gdict all_genes = gdict.keys() - + # Preprocessing contra = [] for line in f.readlines()[1:]: - newline = line.split('\t') - name = newline[GENE_IND].split('(')[0] + newline = line.split("\t") + name = newline[GENE_IND].split("(")[0] newline[GENE_IND] = name - newline[-1] = newline[-1].rstrip('\n') + newline[-1] = newline[-1].rstrip("\n") contra.append(newline) f.close() currgene = contra[0][GENE_IND] - temp = [contra[0][START_IND], contra[0][END_IND], [contra[0][LR_IND]],contra[0][CHR_IND]] # start end list of averages + temp = [ + contra[0][START_IND], + contra[0][END_IND], + [contra[0][LR_IND]], + contra[0][CHR_IND], + ] # start end list of averages exons[currgene] = {} exon_n = 1 for line in contra[1:]: @@ -112,14 +121,14 @@ def process_file(f_in): exons[currgene][exon_n] = temp currgene = line[GENE_IND] temp = [line[START_IND], line[END_IND], [line[LR_IND]], line[CHR_IND]] - if currgene not in exons: # if exon data gets broken up... + if currgene not in exons: # if exon data gets broken up... exon_n = 1 exons[currgene] = {} else: exon_n = len(exons[currgene].keys()) + 1 - + # If its a new exon... - elif line[START_IND] != temp[1]: # ie coordinates don't match up + elif line[START_IND] != temp[1]: # ie coordinates don't match up # Add old to exons dictionary, after calculating average log ratio over the exon. 
temp[2] = calculate_average(temp[2]) exons[currgene][exon_n] = temp @@ -136,16 +145,16 @@ def process_file(f_in): # Add appropriate data to global dictionary gdict # If gdict not yet populated (empty): for gene in exons.keys(): - #print "gene %s " % gene - #print "gdict %s " % str(gdict) - #print "gdict[gene] %s " % str(gdict[gene]) + # print "gene %s " % gene + # print "gdict %s " % str(gdict) + # print "gdict[gene] %s " % str(gdict[gene]) exon_list = gdict[gene].keys() for exon in exons[gene].keys(): start = int(exons[gene][exon][0]) end = int(exons[gene][exon][1]) - ch= exons[gene][exon][3].replace("chr","") - exon_name = ch+":"+str(start)+'_'+str(end) -# print "DEBUG 33333333 %s in %s" % ( exon_name , str(exon_list[0:10])) + ch = exons[gene][exon][3].replace("chr", "") + exon_name = ch + ":" + str(start) + "_" + str(end) + # print "DEBUG 33333333 %s in %s" % ( exon_name , str(exon_list[0:10])) if exon_name in exon_list: gdict[gene][exon_name].append(exons[gene][exon][2]) exon_list.remove(exon_name) @@ -153,82 +162,95 @@ def process_file(f_in): # Attempt to match it to something in exon_list: up to 20 bases missing from each side is OK matched = False for possible_match in exon_list: - components = possible_match.split(":")[1].split('_') + components = possible_match.split(":")[1].split("_") possible_start = int(components[0]) possible_end = int(components[1]) start_diff = start - possible_start end_diff = possible_end - end - if start_diff < 21 and end_diff < 21 and start_diff >= 0 and end_diff >= 0: + if ( + start_diff < 21 + and end_diff < 21 + and start_diff >= 0 + and end_diff >= 0 + ): # Can match these two together - if debug == 'T': - errors.write('\t'+gene+'_'+possible_match+" is matched with incomplete "+exon_name+'\n') + if debug == "T": + errors.write( + "\t" + + gene + + "_" + + possible_match + + " is matched with incomplete " + + exon_name + + "\n" + ) gdict[gene][possible_match].append(exons[gene][exon][2]) exon_list.remove(possible_match) 
matched = True
             if not matched:
                 # Couldn't find a match for it: output error
-                if debug == 'T':
-                    errors.write(gene+'_'+exon_name+" could not be found\n")
+                if debug == "T":
+                    errors.write(gene + "_" + exon_name + " could not be found\n")
 
         # Take care of missing data: remaining exons in exonlist
         for missing_exon in exon_list:
             gdict[gene][missing_exon].append("NA")
-            if debug == 'T':
-                errors.write(gene+'_'+missing_exon+" marked as missing\n")
+            if debug == "T":
+                errors.write(gene + "_" + missing_exon + " marked as missing\n")
         all_genes.remove(gene)
 
     for gene in all_genes:
         # These genes had nothing in the input data. So, add NA to each exon of the gene.
         for exon in gdict[gene]:
             gdict[gene][exon].append("NA")
-        if debug == 'T':
-            errors.write(gene+" not found in input data\n")
-    
-    
+        if debug == "T":
+            errors.write(gene + " not found in input data\n")
+
+
 # get filenames
-dirs = open(renormList, 'r')
+dirs = open(renormList, "r")
 filenames = []
 good_filenames = []
 bad_filenames = []
 for line in dirs.readlines():
-    filenames.append(line.rstrip('\n\r'))
+    filenames.append(line.rstrip("\n\r"))
 dirs.close()
 
 count = 0
 for filename in filenames:
     count += 1
-    if count % 100 == 0 and debug == 'T':
-        print "...Now processing "+str(count)+"..."
-    filepath = os.path.join(renormPath, filename+"_renorm.txt")
+    if count % 100 == 0 and debug == "T":
+        print(f"...Now processing {str(count)}...")
+    filepath = os.path.join(renormPath, filename + "_renorm.txt")
     if os.path.exists(filepath):
         process_file(filepath)
         good_filenames.append(filename)
     else:
-        if debug == 'T':
-            print "Error: Could not find "+filename
-        bad_filenames.append(filename)
+        if debug == "T":
+            print(f"Error: Could not find {filename}")
+        bad_filenames.append(filename)
 
 # write out gdict...
 output_name = exonSummary
-out = open(output_name, 'w')
+out = open(output_name, "w")
 
 # Write headers
-out.write("Exon Name\t"+'\t'.join(good_filenames)+'\n') # dont have bad headers! 
+out.write("Exon Name\t" + "\t".join(good_filenames) + "\n") # dont have bad headers! for gene in sorted(gdict.keys()): for exon in sorted(gdict[gene].keys()): - out.write(gene+'_'+str(exon)) + out.write(gene + "_" + str(exon)) for ratio in gdict[gene][exon]: - out.write('\t'+str(ratio)) - out.write('\n') + out.write("\t" + str(ratio)) + out.write("\n") out.close() -if debug == 'T': - print "Finished processing out to "+output_name +if debug == "T": + print(f"Finished processing out to {output_name}") for gene in gdict.keys(): - if ';' in gene: # If gene needs to be split - #print 'splitting',gene - components = gene.split(';') + if ";" in gene: # If gene needs to be split + # print 'splitting',gene + components = gene.split(";") for component_gene in components: if component_gene not in gdict: gdict[component_gene] = {} @@ -238,30 +260,27 @@ def process_file(f_in): del gdict[gene] -wholegene = open(wgSummary, 'w') +wholegene = open(wgSummary, "w") # write headers -wholegene.write("Gene\t"+'\t'.join(good_filenames)+'\n') +wholegene.write("Gene\t" + "\t".join(good_filenames) + "\n") for gene in sorted(gdict.keys()): wholegene.write(gene) - for i in range(0, len(good_filenames)): # For each sample... + for i in range(0, len(good_filenames)): # For each sample... 
sample_temp = [] - for exon in gdict[gene].keys(): # For each exon (in that gene): - #print len(gdict[gene][exon]) - #print "i = "+str(i)+" len(gdict[gene][exon])"+ str(len(gdict[gene][exon])) - sample_temp.append(gdict[gene][exon][i]) # collect the log ratio of that exon of that sample (of that gene) - wholegene.write('\t'+str(calculate_average_withmissing(sample_temp))) # whole gene log ratio = average of log ratios across all exons of that gene - wholegene.write('\n') + for exon in gdict[gene].keys(): # For each exon (in that gene): + # print len(gdict[gene][exon]) + # print "i = "+str(i)+" len(gdict[gene][exon])"+ str(len(gdict[gene][exon])) + sample_temp.append( + gdict[gene][exon][i] + ) # collect the log ratio of that exon of that sample (of that gene) + wholegene.write( + "\t" + str(calculate_average_withmissing(sample_temp)) + ) # whole gene log ratio = average of log ratios across all exons of that gene + wholegene.write("\n") wholegene.close() if debug == "T": - print "Finished writing whole-gene data to " + wgSummary + print(f"Finished writing whole-gene data to {wgSummary}") errors.close() - print "Finished writing errors to " + error_out - - - - - - - + print(f"Finished writing errors to {error_out}") diff --git a/NullDistEstimation/3_density.R b/NullDistEstimation/3_density.R index d30ea20..84598d9 100644 --- a/NullDistEstimation/3_density.R +++ b/NullDistEstimation/3_density.R @@ -5,7 +5,7 @@ # command line input based on Jason Li's wgcnv_withBAFplot.R x=commandArgs() -args=x[match("--args",x)+1:length(x)] # gets the arguments after --args...? +args=x[match("--args",x)+1:length(x)] # gets the arguments after --args...? 
summaryin= args[1] threshout= args[2] @@ -62,13 +62,13 @@ print(i) # from .75 to 3.5 #for (i in 5:175) { # thresh = i/50 - + n.steps=100 #num_range = max(numrow) - min(numrow) #num_range_step = num_range/100 - + #num_range_step = best_thresh / n.steps - + num_range_step = sd1*2*2 / n.steps for (i2 in 1:n.steps) { thresh = i2 * num_range_step @@ -94,14 +94,14 @@ print(i) best_p = shapiro.test(numrow[!is.na(numrow)])$p.value best_thresh = max ( max(numrow) - mean, mean - min(numrow)) } - } + } if (hists == 'T' && length(numrow[!is.na(numrow)]) > 1) { #x11() - plot(density(numrow, na.rm = TRUE), + plot(density(numrow, na.rm = TRUE), main = rowname, sub = paste("Mean: ", round(mean,4), " || Threshholds:", round(mean-best_thresh,4), round(mean+best_thresh,4)), - xlab = paste("Variance:", round(var(numrow, na.rm = TRUE),4), + xlab = paste("Variance:", round(var(numrow, na.rm = TRUE),4), " || Cropped variance: ", round(var(numrow[numrow > mean-best_thresh & numrow < mean+best_thresh], na.rm = TRUE),4) ) ) @@ -136,10 +136,10 @@ dataout = data.frame(rownames, means, vars, left_threshs, right_threshs, cropped colnames(dataout) <- c("Gene Name", "Mean(trimmed 0.2)", "Variance", "Left Thresh", "Right Thresh", "Cropped Mean", "Cropped Var") -write.table(dataout, threshout, quote = FALSE, sep = '\t', +write.table(dataout, threshout, quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE) -if (hists == 'T') { +if (hists == 'T') { dev.off() } diff --git a/NullDistEstimation/nde_wrapper.py b/NullDistEstimation/nde_wrapper.py index 18af87c..d44074e 100755 --- a/NullDistEstimation/nde_wrapper.py +++ b/NullDistEstimation/nde_wrapper.py @@ -8,14 +8,15 @@ import sys import os -USAGE = "%s [debug output T/F] [histograms T/F]" % sys.argv[0] + +USAGE = f"{sys.argv[0]} [debug output T/F] [histograms T/F]" # CONTRA_out_list = list of CONTRA output folder names / path to, inc name # If not enough arguments, print how to use the file -debug = 'F' -histograms = 'F' +debug = "F" 
+histograms = "F" if len(sys.argv) < 4: - print USAGE + print(USAGE) sys.exit(1) if len(sys.argv) > 4: debug = sys.argv[4] @@ -29,13 +30,13 @@ # technically vulnerable since only checking at one point... contraList = sys.argv[1] if not os.path.exists(contraList): - print "Error: couldn't find contra list" + print("Error: couldn't find contra list") sys.exit(1) # Create output folders if needed # Structure: outPath/renorm, and outpath/histograms if required outPath = sys.argv[2] -bed=sys.argv[3] +bed = sys.argv[3] renormPath = os.path.join(outPath, "renorm") histPath = os.path.join(outPath, "histograms") debugPath = os.path.join(outPath, "debug") @@ -43,29 +44,43 @@ os.makedirs(outPath) if not os.path.exists(renormPath): os.makedirs(renormPath) -if histograms == 'T' and not os.path.exists(histPath): +if histograms == "T" and not os.path.exists(histPath): os.makedirs(histPath) -if debug == 'T' and not os.path.exists(debugPath): +if debug == "T" and not os.path.exists(debugPath): os.makedirs(debugPath) # Step 1 - 1_renormAll.py # Takes in the list of all contra files, renormalises them to renormPath, and creates a new # list renormList renormList = os.path.join(outPath, "renorm_list.txt") -renorm_cmd = "python %s/1_renormAll.py %s %s %s %s" % (scriptPath, contraList, renormPath, renormList, debug) -if debug == 'T': - print "Step 1: 1_renormAll.py" - print renorm_cmd +renorm_cmd = "python %s/1_renormAll.py %s %s %s %s" % ( + scriptPath, + contraList, + renormPath, + renormList, + debug, +) +if debug == "T": + print("Step 1: 1_renormAll.py") + print(renorm_cmd) os.system(renorm_cmd) # Step 2 - 2_getExons.py # Takes in renormPath and renormList, and creates summary tables exonSummary wgSummary (as well as errors if debug) exonSummary = os.path.join(outPath, "summary_ex.txt") wgSummary = os.path.join(outPath, "summary_wg.txt") -getExons_cmd = "python %s/2_getExons.py %s %s %s %s %s %s" % (scriptPath, renormPath, renormList, exonSummary, wgSummary, debug, bed) -if debug == 
'T':
-    print "Step 2: 2_getExons.py"
-    print getExons_cmd
+getExons_cmd = "python %s/2_getExons.py %s %s %s %s %s %s" % (
+    scriptPath,
+    renormPath,
+    renormList,
+    exonSummary,
+    wgSummary,
+    debug,
+    bed,
+)
+if debug == "T":
+    print("Step 2: 2_getExons.py")
+    print(getExons_cmd)
 
 os.system(getExons_cmd)
 
@@ -75,26 +90,21 @@
 # Requires exonSummary wgSummary + paths out
 exonThresh = os.path.join(outPath, "thresh_exons.txt")
 wgThresh = os.path.join(outPath, "thresh_wg.txt")
-density_wg_cmd = "Rscript %s %s %s %s %s" % (os.path.join(scriptPath, "3_density.R"), wgSummary, wgThresh, debug, histograms)
-density_ex_cmd = "Rscript %s %s %s %s %s" % (os.path.join(scriptPath, "3_density.R"), exonSummary, exonThresh, debug, histograms)
-if histograms == 'T':
+density_wg_cmd = (
+    f"Rscript {scriptPath}/3_density.R {wgSummary} {wgThresh} {debug} {histograms}"
+)
+density_ex_cmd = (
+    f"Rscript {scriptPath}/3_density.R {exonSummary} {exonThresh} {debug} {histograms}"
+)
+
+if histograms == "T":
     density_wg_cmd = density_wg_cmd + " " + histPath + "/wg.pdf"
     density_ex_cmd = density_ex_cmd + " " + histPath + "/exon.pdf"
-if debug == 'T':
-    print "Step 3: 3_density.R"
-    print density_wg_cmd
-    print density_ex_cmd
+if debug == "T":
+    print("Step 3: 3_density.R")
+    print(density_wg_cmd)
+    print(density_ex_cmd)
 
 os.system(density_wg_cmd)
 os.system(density_ex_cmd)
-
-
-
-
-
-
-
-
-
-

From 9e37e34caea63f39774b9c2353ebc4a0955f6517 Mon Sep 17 00:00:00 2001
From: SHollizeck
Date: Fri, 3 Jan 2020 11:43:47 +1100
Subject: [PATCH 5/9] baseline converted to python 3

---
 baseline.py | 386 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 213 insertions(+), 173 deletions(-)

diff --git a/baseline.py b/baseline.py
index f1e0c5d..7cc41ad 100755
--- a/baseline.py
+++ b/baseline.py
@@ -11,168 +11,202 @@
 from scripts.split_chromosome import splitByChromosome3 as splitByChromosome
 from scripts.get_chr_length import *
 from scripts.convert_targeted_regions import *
-from 
scripts.convert_gene_coordinate import convertGeneCoordinate2 as convertGeneCoordinate +from scripts.convert_gene_coordinate import ( + convertGeneCoordinate2 as convertGeneCoordinate, +) from multiprocessing import Pool + class Params: - """ + """ Class for top-level system parameters: """ - def __init__(self): - - def mult_files(option, opt_str, value, parser): - args=[] - for arg in parser.rargs: - if arg[0] != "-": - args.append(arg) - else: - del parser.rargs[:len(args)] - break - if getattr(parser.values, option.dest): - args.extend(getattr(parser.values, option.dest)) - setattr(parser.values, option.dest, args) - - - #command-line option definition - self.parser = OptionParser() - self.parser.add_option("-t", "--target", - help="Target region definition file [REQUIRED] [BED Format]", - action="store", type="string", dest="target") - self.parser.add_option("-f", "--files", - help="Files to be converted to baselines [REQUIRED] [BAM]", - action="callback", callback=mult_files, dest="files") - self.parser.add_option("-o", "--output", - help="Output folder [REQUIRED]", - action="store", type="string", dest="output") - self.parser.add_option("-c","--trim", - help="Portion of outliers to be removed before calculating average [Default: 0.2]", - action="store", dest="trim", default="0.2") - self.parser.add_option("-n","--name", - help="Output baseline name [Default: baseline]", - action="store", dest="name", default="baseline") - - # required parameters list: - self.ERRORLIST = [] - - #change system parameters based on any command line arguments - (options, args) = self.parser.parse_args() - if options.target: - self.TARGET=options.target - else: - self.ERRORLIST.append("target") - - if options.files: - self.FILES=options.files - else: - self.ERRORLIST.append("files") - - if options.output: - self.OUTPUT=options.output - else: - self.ERRORLIST.append("output") - - if options.trim: - self.TRIM=float(options.trim) - - if options.name: - self.NAME=options.name - - if 
len(self.ERRORLIST) != 0: - self.parser.print_help() - self.parser.error("Missing required parameters") - - - def repeat(self): - #params test - print "target :", self.TARGET - print "files :", self.FILES - print "output :", self.OUTPUT - print "trim :", self.TRIM - print "name :", self.NAME + def __init__(self): + def mult_files(option, opt_str, value, parser): + args = [] + for arg in parser.rargs: + if arg[0] != "-": + args.append(arg) + else: + del parser.rargs[: len(args)] + break + if getattr(parser.values, option.dest): + args.extend(getattr(parser.values, option.dest)) + setattr(parser.values, option.dest, args) + + # command-line option definition + self.parser = OptionParser() + self.parser.add_option( + "-t", + "--target", + help="Target region definition file [REQUIRED] [BED Format]", + action="store", + type="string", + dest="target", + ) + self.parser.add_option( + "-f", + "--files", + help="Files to be converted to baselines [REQUIRED] [BAM]", + action="callback", + callback=mult_files, + dest="files", + ) + self.parser.add_option( + "-o", + "--output", + help="Output folder [REQUIRED]", + action="store", + type="string", + dest="output", + ) + self.parser.add_option( + "-c", + "--trim", + help="Portion of outliers to be removed before calculating average [Default: 0.2]", + action="store", + dest="trim", + default="0.2", + ) + self.parser.add_option( + "-n", + "--name", + help="Output baseline name [Default: baseline]", + action="store", + dest="name", + default="baseline", + ) + + # required parameters list: + self.ERRORLIST = [] + + # change system parameters based on any command line arguments + (options, args) = self.parser.parse_args() + if options.target: + self.TARGET = options.target + else: + self.ERRORLIST.append("target") + + if options.files: + self.FILES = options.files + else: + self.ERRORLIST.append("files") + + if options.output: + self.OUTPUT = options.output + else: + self.ERRORLIST.append("output") + + if options.trim: + self.TRIM 
= float(options.trim) + + if options.name: + self.NAME = options.name + + if len(self.ERRORLIST) != 0: + self.parser.print_help() + self.parser.error("Missing required parameters") + + def repeat(self): + # params test + print(f"target : {self.TARGET}") + print(f"files : {self.FILES}") + print(f"output : {self.OUTPUT}") + print(f"trim : {self.TRIM}") + print(f"name : {self.NAME}") # option handling -params = Params() -#params.repeat() -targetFile = params.TARGET -infiles = params.FILES -output_dir = params.OUTPUT - -#Debug -print " ------ baseline.py ------- " -print "Target:", targetFile +params = Params() +# params.repeat() +targetFile = params.TARGET +infiles = params.FILES +output_dir = params.OUTPUT + +# Debug +print(" ------ baseline.py ------- ") +print(f"Target: {targetFile}") for files in infiles: - print "File:", files -print "Output Directory: ", output_dir + print(f"File: {files}") +print(f"Output Directory: {output_dir}") + def make_new_directory(outdir): - print outdir - if not os.path.exists(outdir): - os.mkdir(outdir) + print(outdir) + if not os.path.exists(outdir): + os.mkdir(outdir) -print " ----- creating output directory -----" + +print(" ----- creating output directory -----") make_new_directory(output_dir) -outdir = os.path.join(output_dir, "buf") +outdir = os.path.join(output_dir, "buf") make_new_directory(outdir) -targetFile2 = os.path.join(outdir, os.path.basename(targetFile)+".sorted") -os.system("sort -k1,1 -k2n %s > %s" %(targetFile,targetFile2)) +targetFile2 = os.path.join(outdir, os.path.basename(targetFile) + ".sorted") +os.system("sort -k1,1 -k2n %s > %s" % (targetFile, targetFile2)) + def processInFile(infile): - infilename = os.path.basename(infile) - s_outdir = os.path.join(outdir, infilename) - make_new_directory(s_outdir) - genomeFile = os.path.join(s_outdir,infilename +'.chrsummary') - get_genome(infile, genomeFile) - - bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH") - args = shlex.split("genomeCoverageBed -ibam %s 
-bga -g %s" %(infile, genomeFile)) - iOutFile = open(bedgraph, "w") - output = subprocess.Popen(args, stdout = iOutFile).wait() - iOutFile.close() - - targetList = convertTarget(targetFile2) - - splitByChromosome(bedgraph) - - convertGeneCoordinate(targetList, os.path.dirname(bedgraph)+"/") - bedgraph_tgtonly = bedgraph+".TARGETONLY" - bedgraph_tgtonly_avg = bedgraph+".TARGETONLY.AVERAGE" - os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage.txt"),bedgraph_tgtonly) - os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage_targetAverage.txt"),bedgraph_tgtonly_avg) - shutil.copy(bedgraph_tgtonly,outdir) - shutil.copy(bedgraph_tgtonly_avg,outdir) - -print "----- Processing Files -----" + infilename = os.path.basename(infile) + s_outdir = os.path.join(outdir, infilename) + make_new_directory(s_outdir) + genomeFile = os.path.join(s_outdir, infilename + ".chrsummary") + get_genome(infile, genomeFile) + + bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH") + args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" % (infile, genomeFile)) + iOutFile = open(bedgraph, "w") + output = subprocess.Popen(args, stdout=iOutFile).wait() + iOutFile.close() + + targetList = convertTarget(targetFile2) + + splitByChromosome(bedgraph) + + convertGeneCoordinate(targetList, os.path.dirname(bedgraph) + "/") + bedgraph_tgtonly = bedgraph + ".TARGETONLY" + bedgraph_tgtonly_avg = bedgraph + ".TARGETONLY.AVERAGE" + os.rename( + os.path.join(os.path.dirname(bedgraph), "geneRefCoverage.txt"), bedgraph_tgtonly + ) + os.rename( + os.path.join(os.path.dirname(bedgraph), "geneRefCoverage_targetAverage.txt"), + bedgraph_tgtonly_avg, + ) + shutil.copy(bedgraph_tgtonly, outdir) + shutil.copy(bedgraph_tgtonly_avg, outdir) + + +print("----- Processing Files -----") pool = Pool(5) -pool.map(processInFile,infiles) +pool.map(processInFile, infiles) # STEP 2 -> Create Union BedGraph -allfiles_names = [x for x in os.listdir(outdir) if x.endswith("TARGETONLY")] -allfiles_path 
= [os.path.join(outdir,x) for x in allfiles_names] +allfiles_names = [x for x in os.listdir(outdir) if x.endswith("TARGETONLY")] +allfiles_path = [os.path.join(outdir, x) for x in allfiles_names] -args=["unionBedGraphs","-header","-i"]+allfiles_path+["-names"]+allfiles_names +args = ["unionBedGraphs", "-header", "-i"] + allfiles_path + ["-names"] + allfiles_names -print (str(args)) -fo = os.path.join(outdir,"TARGETONLY.union.txt") -foh = open(fo,"w") +print(str(args)) +fo = os.path.join(outdir, "TARGETONLY.union.txt") +foh = open(fo, "w") -subprocess.Popen(args,stdout=foh).wait() +subprocess.Popen(args, stdout=foh).wait() foh.close() # STEP 3 -> POOL METHOD -TRIM = params.TRIM #TRIM = 0.2 -f = fo -fh = open(f) +TRIM = params.TRIM # TRIM = 0.2 +f = fo +fh = open(f) -fo=os.path.join(output_dir, params.NAME+".pooled2_TRIM"+str(TRIM)+".txt") -fo_libsize=fo+".LIBSIZE" +fo = os.path.join(output_dir, params.NAME + ".pooled2_TRIM" + str(TRIM) + ".txt") +fo_libsize = fo + ".LIBSIZE" -foh=open(fo,'w') -fo_libsize_h=open(fo_libsize,'w') +foh = open(fo, "w") +fo_libsize_h = open(fo_libsize, "w") fh.readline() @@ -180,61 +214,65 @@ def processInFile(infile): Ns = None nSamples = None -lineCount=0 +lineCount = 0 for line in fh: - lineCount+=1 - line_elems=line.rstrip().split("\t") - rangeLen=int(line_elems[2])-int(line_elems[1]) - if not Ns: - nSamples=len(line_elems)-3 - Ns = [0 for x in range(nSamples)] - for i in range(nSamples): - ind=i+3 - Ns[i] += int(line_elems[ind])*rangeLen + lineCount += 1 + line_elems = line.rstrip().split("\t") + rangeLen = int(line_elems[2]) - int(line_elems[1]) + if not Ns: + nSamples = len(line_elems) - 3 + Ns = [0 for x in range(nSamples)] + for i in range(nSamples): + ind = i + 3 + Ns[i] += int(line_elems[ind]) * rangeLen fh.close() -fh=open(f) +fh = open(f) fh.readline() -lineCount=0 -nSamples_exclude = int(math.floor(TRIM*nSamples)) +lineCount = 0 +nSamples_exclude = int(math.floor(TRIM * nSamples)) + def gm_mean(xs): - tmpprod = 1 - p = 
1.0/len(xs) - for x in xs: - tmpprod = tmpprod * math.pow(x,p) - return tmpprod + tmpprod = 1 + p = 1.0 / len(xs) + for x in xs: + tmpprod = tmpprod * math.pow(x, p) + return tmpprod + Ns_gmean = gm_mean(Ns) + def meanstdv(x): - from math import sqrt - n, mean, std = len(x), 0, 0 - for a in x: - mean = mean+a - mean = mean / float(n) - for a in x: - std = std+(a-mean)**2 - std=sqrt(std/float(n-1)) - return mean, std + from math import sqrt + + n, mean, std = len(x), 0, 0 + for a in x: + mean = mean + a + mean = mean / float(n) + for a in x: + std = std + (a - mean) ** 2 + std = sqrt(std / float(n - 1)) + return mean, std libsize = 0 for line in fh: - lineCount+=1 - line_elems=line.rstrip().split("\t") - rangeLen=int(line_elems[2])-int(line_elems[1]) - xs_tmp=[int(x) for x in line_elems[3:]] - xs = [float(xs_tmp[i])*Ns_gmean/Ns[i] for i in range(len(xs_tmp))] - xs.sort() - xs_trimmed=xs[nSamples_exclude:(nSamples-nSamples_exclude)] - #trimmed_mean=sum(xs_trimmed)/float(len(xs_trimmed)) - trimmed_mean,std=meanstdv(xs_trimmed) - libsize+=rangeLen*trimmed_mean - foh.write("\t".join(line_elems[0:3]+[str(trimmed_mean),str(std)])+"\n") + lineCount += 1 + line_elems = line.rstrip().split("\t") + rangeLen = int(line_elems[2]) - int(line_elems[1]) + xs_tmp = [int(x) for x in line_elems[3:]] + xs = [float(xs_tmp[i]) * Ns_gmean / Ns[i] for i in range(len(xs_tmp))] + xs.sort() + xs_trimmed = xs[nSamples_exclude : (nSamples - nSamples_exclude)] + # trimmed_mean=sum(xs_trimmed)/float(len(xs_trimmed)) + trimmed_mean, std = meanstdv(xs_trimmed) + libsize += rangeLen * trimmed_mean + foh.write("\t".join(line_elems[0:3] + [str(trimmed_mean), str(std)]) + "\n") fo_libsize_h.write(str(int(round(libsize)))) @@ -243,11 +281,13 @@ def meanstdv(x): foh.close() fo_libsize_h.close() + def removeTempFolder(tempFolderPath): - import shutil + import shutil + + shutil.rmtree(tempFolderPath) + print("Temp Folder Removed") - shutil.rmtree(tempFolderPath) - print "Temp Folder Removed" # Removed 
Temp Folder -#removeTempFolder(outdir) +# removeTempFolder(outdir) From 734ce44d89bb110f8c3cacc901d3bcb4a4fc80c2 Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 12:08:31 +1100 Subject: [PATCH 6/9] reworked parameter parsing --- baseline.py | 72 +++++++++++++++++++++-------------------------------- contra.py | 1 + 2 files changed, 29 insertions(+), 44 deletions(-) diff --git a/baseline.py b/baseline.py index 7cc41ad..f1c72a6 100755 --- a/baseline.py +++ b/baseline.py @@ -7,7 +7,7 @@ import subprocess import shutil import math -from optparse import OptionParser +import argparse from scripts.split_chromosome import splitByChromosome3 as splitByChromosome from scripts.get_chr_length import * from scripts.convert_targeted_regions import * @@ -23,58 +23,41 @@ class Params: """ def __init__(self): - def mult_files(option, opt_str, value, parser): - args = [] - for arg in parser.rargs: - if arg[0] != "-": - args.append(arg) - else: - del parser.rargs[: len(args)] - break - if getattr(parser.values, option.dest): - args.extend(getattr(parser.values, option.dest)) - setattr(parser.values, option.dest, args) # command-line option definition - self.parser = OptionParser() - self.parser.add_option( + self.parser = argparse.ArgumentParser() + self.parser.add_argument( "-t", "--target", help="Target region definition file [REQUIRED] [BED Format]", - action="store", - type="string", - dest="target", + required=True, ) - self.parser.add_option( + self.parser.add_argument( "-f", "--files", help="Files to be converted to baselines [REQUIRED] [BAM]", - action="callback", - callback=mult_files, + nargs="+", dest="files", + required=True, ) - self.parser.add_option( + self.parser.add_argument( "-o", "--output", help="Output folder [REQUIRED]", - action="store", - type="string", dest="output", + required=True, ) - self.parser.add_option( + self.parser.add_argument( "-c", "--trim", help="Portion of outliers to be removed before calculating average [Default: 0.2]", - 
action="store", - dest="trim", - default="0.2", + type=float, + default=0.2, ) - self.parser.add_option( + self.parser.add_argument( "-n", "--name", help="Output baseline name [Default: baseline]", - action="store", - dest="name", default="baseline", ) @@ -82,13 +65,20 @@ def mult_files(option, opt_str, value, parser): self.ERRORLIST = [] # change system parameters based on any command line arguments - (options, args) = self.parser.parse_args() + options = self.parser.parse_args() if options.target: + if not os.path.isfile(options.target): + print(f"Could not find bed file {options.target}") + self.ERRORLIST.append("control") self.TARGET = options.target else: self.ERRORLIST.append("target") if options.files: + for bam in options.files: + if not os.path.isfile(options.target): + print(f"Could not find bam file {bam}") + self.ERRORLIST.append("files") self.FILES = options.files else: self.ERRORLIST.append("files") @@ -98,23 +88,17 @@ def mult_files(option, opt_str, value, parser): else: self.ERRORLIST.append("output") - if options.trim: - self.TRIM = float(options.trim) - - if options.name: - self.NAME = options.name - if len(self.ERRORLIST) != 0: - self.parser.print_help() - self.parser.error("Missing required parameters") + # self.parser.print_help() + self.parser.error("Missing required parameters: " + str(self.ERRORLIST)) def repeat(self): # params test - print(f"target : {self.TARGET}") - print(f"files : {self.FILES}") - print(f"output : {self.OUTPUT}") - print(f"trim : {self.TRIM}") - print(f"name : {self.NAME}") + print(f"target :{self.TARGET}") + print(f"files :{self.FILES}") + print(f"output :{self.OUTPUT}") + print(f"trim :{self.TRIM}") + print(f"name :{self.NAME}") # option handling diff --git a/contra.py b/contra.py index fc84f7e..39a77b8 100755 --- a/contra.py +++ b/contra.py @@ -94,6 +94,7 @@ def __init__(self): self.optional.add_argument( "--numBin", help="Numbers of bins to group regions. 
User can specify multiple experiments with different number of bins (comma separated) [20]", + type=int, default="20", ) self.optional.add_argument( From daeac1261bef60e8f2e99b6c9bc1e3a878936208 Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 12:09:58 +1100 Subject: [PATCH 7/9] string formating changed to python43 notation --- baseline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baseline.py b/baseline.py index f1c72a6..e402e47 100755 --- a/baseline.py +++ b/baseline.py @@ -128,7 +128,7 @@ def make_new_directory(outdir): make_new_directory(outdir) targetFile2 = os.path.join(outdir, os.path.basename(targetFile) + ".sorted") -os.system("sort -k1,1 -k2n %s > %s" % (targetFile, targetFile2)) +os.system(f"sort -k1,1 -k2n {targetFile} > {targetFile2}") def processInFile(infile): @@ -139,7 +139,7 @@ def processInFile(infile): get_genome(infile, genomeFile) bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH") - args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" % (infile, genomeFile)) + args = shlex.split(f"genomeCoverageBed -ibam {infile} -bga -g {genomeFile}") iOutFile = open(bedgraph, "w") output = subprocess.Popen(args, stdout=iOutFile).wait() iOutFile.close() From ca9606ea4d5d26da0c1073320fa67db3b35169ba Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 13:31:42 +1100 Subject: [PATCH 8/9] added a todo for furture parameter optimisation --- contra.py | 1 + 1 file changed, 1 insertion(+) diff --git a/contra.py b/contra.py index 39a77b8..115ae7f 100755 --- a/contra.py +++ b/contra.py @@ -109,6 +109,7 @@ def __init__(self): type=int, default=10, ) + # TODO: change this to a choice argument with sam/bed self.optional.add_argument( "--sam", help="If the specified, test and control sample are in SAM [False]", From 3908a02d2f508e1b03b2894bf7787cdeacb696cf Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 13:36:10 +1100 Subject: [PATCH 9/9] changes shebang line to use python3 --- 
NullDistEstimation/1_renormAll.py | 2 +- NullDistEstimation/2_getExons.py | 2 +- NullDistEstimation/nde_wrapper.py | 2 +- WGCNV-NDE/1_renorm.py | 2 +- WGCNV-NDE/2_process.py | 2 +- WGCNV-NDE/wgcnv_wrapper.py | 2 +- WGCNV-SINGLE/wgcnv.py | 2 +- WGCNV-SINGLE/wgcnv_noBAF.py | 2 +- baseline.py | 2 +- contra.py | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/NullDistEstimation/1_renormAll.py b/NullDistEstimation/1_renormAll.py index 3ed6508..4d11051 100644 --- a/NullDistEstimation/1_renormAll.py +++ b/NullDistEstimation/1_renormAll.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # Usage: "python 1_renormAll.py %s %s %s %s" % (contraList, renormPath, renormList, debug) import math diff --git a/NullDistEstimation/2_getExons.py b/NullDistEstimation/2_getExons.py index 4c5b09f..05f69b7 100644 --- a/NullDistEstimation/2_getExons.py +++ b/NullDistEstimation/2_getExons.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import os import sys diff --git a/NullDistEstimation/nde_wrapper.py b/NullDistEstimation/nde_wrapper.py index d44074e..aa982e6 100755 --- a/NullDistEstimation/nde_wrapper.py +++ b/NullDistEstimation/nde_wrapper.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # 24/09 wrapper script for estimating the null distribution # on a per exon and per gene basis from a bunch of CONTRA outputs diff --git a/WGCNV-NDE/1_renorm.py b/WGCNV-NDE/1_renorm.py index 376d28b..a417e92 100755 --- a/WGCNV-NDE/1_renorm.py +++ b/WGCNV-NDE/1_renorm.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # print '.................inside renorm' import math diff --git a/WGCNV-NDE/2_process.py b/WGCNV-NDE/2_process.py index 873f318..7c57c47 100755 --- a/WGCNV-NDE/2_process.py +++ b/WGCNV-NDE/2_process.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # Usage: python 2_process.py [_renorm.txt] [_wg.txt] [_exon.txt] [bed.annotated.txt] [debug] import os import copy diff --git a/WGCNV-NDE/wgcnv_wrapper.py 
b/WGCNV-NDE/wgcnv_wrapper.py index 23e3978..a177880 100755 --- a/WGCNV-NDE/wgcnv_wrapper.py +++ b/WGCNV-NDE/wgcnv_wrapper.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # 10/10 wrapper script for WGCNV - server edit # based on wgcnv_noBAF.py by Jason Li diff --git a/WGCNV-SINGLE/wgcnv.py b/WGCNV-SINGLE/wgcnv.py index 6f819a0..72af93f 100755 --- a/WGCNV-SINGLE/wgcnv.py +++ b/WGCNV-SINGLE/wgcnv.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import sys import os diff --git a/WGCNV-SINGLE/wgcnv_noBAF.py b/WGCNV-SINGLE/wgcnv_noBAF.py index d8b97b0..180e9c8 100755 --- a/WGCNV-SINGLE/wgcnv_noBAF.py +++ b/WGCNV-SINGLE/wgcnv_noBAF.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import sys import os diff --git a/baseline.py b/baseline.py index e402e47..1d501ef 100755 --- a/baseline.py +++ b/baseline.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import os import sys diff --git a/contra.py b/contra.py index 115ae7f..4f68aa5 100755 --- a/contra.py +++ b/contra.py @@ -21,7 +21,7 @@ # # # -----------------------------------------------------------------------# -# Last Updated : 23 July 2012 16:43PM +# Last Updated : 03 January 2020 13:43 import os