From 760bafc0c728192024d893c5be6dd059046e429e Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Thu, 19 Dec 2019 17:41:46 +1100 Subject: [PATCH 1/9] translate to python3 bumped new version number fixed a bug where if you requested more minExons than you had regions in a target, the program would not terminate changed the output table to be rounded to 3 digits fro log ratios and 6 digits for pValues --- .gitignore | 2 + README.md | 36 + contra.py | 1198 +++++++++++++++------------ scripts/assign_bin_number_v2.py | 274 +++--- scripts/average_count.py | 341 ++++---- scripts/cn_analysis.v3.R | 31 +- scripts/cn_apply_threshold.py | 177 ++-- scripts/convert_gene_coordinate.py | 437 +++++----- scripts/convert_targeted_regions.py | 79 +- scripts/count_libsize.py | 31 +- scripts/get_chr_length.py | 27 +- scripts/split_chromosome.py | 133 +-- scripts/target_breakdown.py | 70 +- scripts/vcf_out.py | 76 +- 14 files changed, 1585 insertions(+), 1327 deletions(-) diff --git a/.gitignore b/.gitignore index 0d20b64..5a77cbc 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ *.pyc +test +TestFiles diff --git a/README.md b/README.md index 7e0933c..a742196 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,39 @@ # CONTRA +CONTRA: copy number analysis for targeted resequencing. +Li J, Lupat R, Amarasinghe KC, Thompson ER, Doyle MA, Ryland GL, Tothill RW, Halgamuge SK, Campbell IG, Gorringe KL. +Bioinformatics Core Facility, Peter MacCallum Cancer Centre, VIC 3002, Australia. Jason.Li@petermac.org + +Method published 2012 in Bioinformatics doi: [10.1093/bioinformatics/bts146]{https://doi.org/10.1093/bioinformatics/bts146} + + +previously available through http://contra-cnv.sourceforge.net/ + + +# Changelog +v2.1.0 +Uses python3 +Improved performance through multithreading and restructuring of loops +Autodetection of file formats (work in progress) + +v2.0.8 +Included NDE and WGCNV workflows. +Removed FASTA dependency. Removed VCF support. Removed PDF file. 
+Updated online documentation. + +v2.0.7 +Added --version + +v2.0.6 +Added option to remove Duplicates + + +v2.0.4 +Fixed a bug with "chr" named chromosomes in cn_apply_threshold +Catch errors with bam files do not contain reads in a targeted chromosome. + + +v2.0.3 +Fixed baseline.py scalability to large sample size +Now compatable with R2.14+ diff --git a/contra.py b/contra.py index abfed0a..fc84f7e 100755 --- a/contra.py +++ b/contra.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # ----------------------------------------------------------------------# # Copyright (c) 2011, Richard Lupat & Jason Li. @@ -19,13 +19,13 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . # -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 23 July 2012 16:43PM import os -from optparse import OptionParser +import argparse import sys import subprocess import shlex @@ -42,585 +42,701 @@ from scripts.count_libsize import * from scripts.target_breakdown import * -#VERSION -VERSION="2.0.8" +# VERSION +VERSION = "2.0.10" -#Absolute Path +# Absolute Path scriptPath = os.path.realpath(os.path.dirname(sys.argv[0])) + class Params: - """ - Class for top-level system parameters - """ - - def __init__(self): - # command-line option definition - self.parser = OptionParser() - self.parser.add_option("-t", "--target", - help="Target region definition file [REQUIRED] [BED Format]", - action="store", type="string", dest="target") - self.parser.add_option("-s", "--test", - help="Alignment file for the test sample [REQUIRED] [BAM/SAM]", - action="store", type="string", dest="test") - self.parser.add_option("-c", "--control", - help="Alignment file for the control sample [REQUIRED] [BAM/SAM]", - action="store", type="string", dest="control") - self.parser.add_option("-f", "--fasta", - help="Reference Genome 
[NOT REQUIRED since v2.0.8][FASTA]", - action="store", type="string", dest="fasta") - self.parser.add_option("-o", "--outFolder", - help="the output folder path name to store the output of analysis [REQUIRED]", - action="store", type="string", dest="outFolder") - self.parser.add_option("--numBin", - help="Numbers of bins to group regions. User can specify multiple experiments with different number of bins (comma separated) [20]", - action="store", type="string", dest="numBin", default="20") - self.parser.add_option("--minReadDepth", - help="The threshold for minimum read depth for each bases [10]", - action="store", type="string", dest="minReadDepth", default=10) - self.parser.add_option("--minNBases", - help="The threshold for minimum number of bases for each target regions [10]", - action="store", type="string", dest="minNBases", default= 10) - self.parser.add_option("--sam", - help="If the specified, test and control sample are in SAM [False]", - action="store_true", dest="sam", default="False") - self.parser.add_option("--bed", - help="if specified, control will be in BED format [False]", - action="store_true", dest = "bedInput", default="False") - self.parser.add_option("--pval", - help="The p-value threshold for filtering [0.05]. 
Applies to Adjusted P-Value.", - action="store", type="string", dest="pval", default=0.05) - self.parser.add_option("--sampleName", - help ="The name to be appended to the front of default output name ['']", - action="store", type="string", dest="sampleName", default='') - self.parser.add_option("--nomultimapped", - help="The option to remove multi-mapped reads [False]", - action="store_true", dest="nomultimapped",default="False") - self.parser.add_option("-p", "--plot", - help="Plots log-ratio distribution for each bin [False]", - action="store_true", dest="plot", default="False") - self.parser.add_option("--minExon", - help="Minimum number of Exons in one bin (if less than this, bin that contains small number of exons" - +"will be moved to the adjacent bins) [2000] ", - action="store", type="string", dest="minExon", default="2000") - self.parser.add_option("--minControlRdForCall", - help="Minimum control readdepth for call [5]", - action="store", type="string", dest="minControl", default="5") - - self.parser.add_option("--minTestRdForCall", - help="Minimum test readdepth for call [0]", - action="store", type="string", dest="minTest", default="0") - - self.parser.add_option("--minAvgForCall", - help="Minimum average coverage for call [20]", - action="store", type="string", dest="minAvg", default="20") - - self.parser.add_option("--maxRegionSize", - help="Maximum Region Size in target region (for breaking large region into smaller region. By default, maxRegionSize 0 means no breakdown) [0]", - action="store", type="string", dest="maxRegionSize", default="0") - - self.parser.add_option("--targetRegionSize", - help="Target Region Size for breakdown (if maxRegionSize is non zero) [200]", - action="store", type="string", dest="targetRegionSize", default="200") - - self.parser.add_option("-l", "--largeDeletion", - help="if specified, CONTRA will run large deletion analysis (CBS). User must have DNAcopy R-library installed to run the analysis. 
[False]", - action="store_true", dest = "large", default="False") - - self.parser.add_option("--smallSegment", - help="CBS segment size for calling large variations [1]", - action="store", type="string", dest="smallSegment", default="1") - - self.parser.add_option("--largeSegment", - help="CBS segment size for calling large variations [25]", - action="store", type="string", dest="largeSegment", default="25") - - self.parser.add_option("--lrCallStart", - help="Log ratios start range that will be used to call CNV [-0.3]", - action="store", type="string", dest="lrs", default="-0.3") - - self.parser.add_option("--lrCallEnd", - help="Log ratios end range that will be used to call CNV [0.3]", - action="store", type="string", dest="lre", default="0.3") - - self.parser.add_option("--passSize", - help="Size of exons that passed the p-value threshold compare to the original exon size [0.35]", - action="store", type="string", dest="passSize", default="0.35") - - ### - self.parser.add_option("--removeDups", - help="if specified, will remove PCR duplicates [False]", - action="store_true", dest = "removeDups", default="False") - self.parser.add_option("--version", - help="Returns version", - action="store_true", dest = "version") - - # required parameters list - self.ERRORLIST = [] - - # change system parameters based on any command line - (options, args) = self.parser.parse_args() - if options.target: - self.TARGET = options.target - else: - #self.parser.print_help() - #self.parser.error("--target not supplied") - self.ERRORLIST.append("target") - - if options.test: - self.TEST = options.test - else: - #self.parser.error("--test not supplied") - self.ERRORLIST.append("test") - - if options.control: - self.CONTROL = options.control - else: - #self.parser.error("--control not supplied") - self.ERRORLIST.append("control") - -# if options.fasta: -# self.FASTA = options.fasta -# else: -# #self.parser.error("--fasta not supplied") -# self.ERRORLIST.append("fasta") - - if 
options.outFolder: - self.OUTFOLDER = options.outFolder - else: - #self.parser.error("--outFolder not supplied") - self.ERRORLIST.append("outfolder") - - if len(self.ERRORLIST) != 0: - self.parser.print_help() - self.parser.error("Missing required parameters") - - if options.numBin: - binsNumber = options.numBin.split(",") - try: - self.NUMBIN = [int(j) for j in binsNumber] - except: - self.NUMBIN = [20] - if options.minReadDepth: - self.MINREADDEPTH = int(options.minReadDepth) - if options.minNBases: - self.MINNBASES = int(options.minNBases) - if options.sam: - self.SAM = str(options.sam) - if options.pval: - self.PVAL = options.pval - if options.sampleName: - self.SAMPLENAME = options.sampleName - else: - self.SAMPLENAME = 'No-SampleName' - if options.nomultimapped: - self.NOMULTIMAPPED = str(options.nomultimapped) - if options.plot: - self.PLOT = str(options.plot) - if options.bedInput: - self.BEDINPUT = options.bedInput - if options.minExon: - self.MINEXON = int(options.minExon) - if options.minControl: - self.MINCONTROL = options.minControl - if options.minTest: - self.MINTEST = options.minTest - if options.minAvg: - self.MINAVG = options.minAvg - if options.maxRegionSize: - self.MAXREGIONSIZE = int(options.maxRegionSize) - if options.targetRegionSize: - self.TARGETREGIONSIZE = int(options.targetRegionSize) - if options.large: - self.LARGE = str(options.large) - if options.smallSegment: - self.SMALLSEGMENT = options.smallSegment - if options.largeSegment: - self.LARGESEGMENT = options.largeSegment - if options.lre: - self.LRE = options.lre - if options.lrs: - self.LRS = options.lrs - if options.passSize: - self.PASSSIZE = options.passSize - - ### either "False" or True atn - if options.removeDups: - self.REMOVEDUPS = str(options.removeDups) - - def repeat(self): - # params test - print "target :", self.TARGET - print "test :", self.TEST - print "control :", self.CONTROL -# print "fasta :", self.FASTA - print "outfolder :", self.OUTFOLDER - print "numBin :", 
self.NUMBIN - print "minreaddepth :", self.MINREADDEPTH - print "minNBases :", self.MINNBASES - print "sam :", self.SAM - print "pval :", self.PVAL - print "sampleName :", self.SAMPLENAME - print "nomultimapped :", self.NOMULTIMAPPED - print "plot :", self.PLOT - print "bedInput :", self.BEDINPUT - print "minExon :", self.MINEXON - print "largeDeletion :", self.LARGE - print "removeDups :", self.REMOVEDUPS + """ + Class for top-level system parameters + """ + + def __init__(self): + # command-line option definition + self.parser = argparse.ArgumentParser() + # we remove the optionals so the required come first + self.parser._action_groups.pop() + + # create the group of required params + self.required = self.parser.add_argument_group("required arguments") + + # create the group of optional + self.optional = self.parser.add_argument_group("optional arguments") + + self.required.add_argument( + "-t", + "--target", + help="Target region definition file [BED Format]", + required=True, + ) + self.required.add_argument( + "-s", + "--test", + help="Alignment file for the test sample [BAM/SAM]", + required=True, + ) + self.required.add_argument( + "-c", + "--control", + help="Alignment file for the control sample [BAM/SAM]", + required=True, + ) + self.optional.add_argument("-f", "--fasta", help="Reference Genome [FASTA]") + self.required.add_argument( + "-o", + "--outFolder", + help="the output folder path name to store the output of analysis", + required=True, + ) + self.optional.add_argument( + "--numBin", + help="Numbers of bins to group regions. 
User can specify multiple experiments with different number of bins (comma separated) [20]", + default="20", + ) + self.optional.add_argument( + "--minReadDepth", + help="The threshold for minimum read depth for each bases [10]", + type=int, + default=10, + ) + self.optional.add_argument( + "--minNBases", + help="The threshold for minimum number of bases for each target regions [10]", + type=int, + default=10, + ) + self.optional.add_argument( + "--sam", + help="If the specified, test and control sample are in SAM [False]", + action="store_true", + dest="sam", + default=False, + ) + self.optional.add_argument( + "--bed", + help="if specified, control will be in BED format [False]", + action="store_true", + dest="bedInput", + default=False, + ) + self.optional.add_argument( + "--pval", + help="The p-value threshold for filtering [0.05]. Applies to Adjusted P-Value.", + type=float, + dest="pval", + default=0.05, + ) + self.optional.add_argument( + "--sampleName", + help="The name to be appended to the front of default output name ['']", + default="", + ) + self.optional.add_argument( + "--nomultimapped", + help="The option to remove multi-mapped reads [False]", + action="store_true", + default=False, + ) + self.optional.add_argument( + "-p", + "--plot", + help="Plots log-ratio distribution for each bin [False]", + action="store_true", + default=False, + ) + self.optional.add_argument( + "--minExon", + help="Minimum number of Exons in one bin (if less than this, bin that contains small number of exons" + + "will be moved to the adjacent bins) [2000] ", + type=int, + default=100, + ) + self.optional.add_argument( + "--minControlRdForCall", + help="Minimum control readdepth for call [5]", + type=int, + dest="minControl", + default=5, + ) + + self.optional.add_argument( + "--minTestRdForCall", + help="Minimum test readdepth for call [0]", + type=int, + dest="minTest", + default=0, + ) + + self.optional.add_argument( + "--minAvgForCall", + help="Minimum average coverage 
for call [20]", + type=int, + dest="minAvg", + default=20, + ) + + self.optional.add_argument( + "--maxRegionSize", + help="Maximum Region Size in target region (for breaking large region into smaller region. By default, maxRegionSize 0 means no breakdown) [0]", + default=0, + ) + + self.optional.add_argument( + "--targetRegionSize", + help="Target Region Size for breakdown (if maxRegionSize is non zero) [200]", + type=int, + default=100, + ) + + self.optional.add_argument( + "-l", + "--largeDeletion", + help="if specified, CONTRA will run large deletion analysis (CBS). User must have DNAcopy R-library installed to run the analysis. [False]", + action="store_true", + dest="large", + default=False, + ) + + self.optional.add_argument( + "--smallSegment", + help="CBS segment size for calling large variations [1]", + type=int, + default=1, + ) + + self.optional.add_argument( + "--largeSegment", + help="CBS segment size for calling large variations [25]", + type=int, + default=25, + ) + + self.optional.add_argument( + "--lrCallStart", + help="Log ratios start range that will be used to call CNV [-0.3]", + type=float, + dest="lrs", + default=-0.3, + ) + + self.optional.add_argument( + "--lrCallEnd", + help="Log ratios end range that will be used to call CNV [0.3]", + type=float, + dest="lre", + default=0.3, + ) + + self.optional.add_argument( + "--passSize", + help="Size of exons that passed the p-value threshold compare to the original exon size [0.35]", + type=float, + default=0.35, + ) + + ### + self.optional.add_argument( + "--removeDups", + help="if specified, will remove PCR duplicates [False]", + action="store_true", + default=False, + ) + self.optional.add_argument("--version", action="version", version=VERSION) + + # required parameters list + self.ERRORLIST = [] + + # change system parameters based on any command line + options = self.parser.parse_args() + print(options) + + if options.target: + self.TARGET = options.target + else: + # self.parser.print_help() 
+ # self.parser.error("--target not supplied") + self.ERRORLIST.append("target") + + if options.test: + if not os.path.isfile(options.test): + self.ERRORLIST.append("test") + self.TEST = options.test + + else: + # self.parser.error("--test not supplied") + self.ERRORLIST.append("test") + + if options.control: + if not os.path.isfile(options.test): + self.ERRORLIST.append("control") + self.CONTROL = options.control + else: + # self.parser.error("--control not supplied") + self.ERRORLIST.append("control") + + if options.outFolder: + self.OUTFOLDER = options.outFolder + else: + # self.parser.error("--outFolder not supplied") + self.ERRORLIST.append("outfolder") + + if len(self.ERRORLIST) != 0: + # self.parser.print_help() + self.parser.error("Missing required parameters: " + str(self.ERRORLIST)) + + if options.numBin: + binsNumber = options.numBin.split(",") + try: + self.NUMBIN = [int(j) for j in binsNumber] + except: + self.NUMBIN = [20] + + self.MINREADDEPTH = options.minReadDepth + self.MINNBASES = options.minNBases + self.SAM = options.sam + self.PVAL = options.pval + + if options.sampleName: + self.SAMPLENAME = options.sampleName + else: + self.SAMPLENAME = "No-SampleName" + + self.NOMULTIMAPPED = options.nomultimapped + self.PLOT = options.plot + self.BEDINPUT = options.bedInput + self.MINEXON = options.minExon + self.MINCONTROL = options.minControl + self.MINTEST = options.minTest + self.MINAVG = options.minAvg + self.MAXREGIONSIZE = options.maxRegionSize + self.TARGETREGIONSIZE = options.targetRegionSize + self.LARGE = options.large + self.SMALLSEGMENT = options.smallSegment + self.LARGESEGMENT = options.largeSegment + self.LRE = options.lre + self.LRS = options.lrs + self.PASSSIZE = options.passSize + self.REMOVEDUPS = options.removeDups + + if self.BEDINPUT and self.SAM: + print("cant parse sam and BED input, setting input to BED") + self.SAM = False + + def repeat(self): + # params test + print("target :", self.TARGET) + print("test :", self.TEST) + 
print("control :", self.CONTROL) + # print "fasta :", self.FASTA + print("outfolder :", self.OUTFOLDER) + print("numBin :", self.NUMBIN) + print("minreaddepth :", self.MINREADDEPTH) + print("minNBases :", self.MINNBASES) + print("sam :", self.SAM) + print("pval :", self.PVAL) + print("sampleName :", self.SAMPLENAME) + print("nomultimapped :", self.NOMULTIMAPPED) + print("plot :", self.PLOT) + print("bedInput :", self.BEDINPUT) + print("minExon :", self.MINEXON) + print("largeDeletion :", self.LARGE) + print("removeDups :", self.REMOVEDUPS) + + def checkOutputFolder(outF): - print "Creating Output Folder :", - - if outF[len(outF)-1] == "/": - outF = outF[:len(outF)-1] - - try: - os.mkdir(outF) - except: - print "output folder already exists: " + outF -# print "cannot create folder '%s'" %outF -# print "if folder already exist, please specify other folder" -# sys.exit(1) - - try: - os.mkdir(outF+"/table") - os.mkdir(outF+"/plot") - os.mkdir(outF+"/buf") - os.mkdir(outF+"/buf/ctrData/") - os.mkdir(outF+"/buf/testData/") - except: - print "[ERROR: CANNOT CREATE SUBFOLDERS]" - sys.exit(1) - - print " Done." 
- - return outF - - -#BEDINPUT + print("Creating Output Folder :"), + + if outF[len(outF) - 1] == "/": + outF = outF[: len(outF) - 1] + + try: + os.makedirs(outF, exist_ok=True) + except: + raise IOError("Could not create outputFolder") + # print "cannot create folder '{}'" %outF + # print "if folder already exist, please specify other folder" + # sys.exit(1) + + try: + os.makedirs(outF + "/table", exist_ok=True) + os.makedirs(outF + "/plot", exist_ok=True) + + # dont need to create parent folder with makedirs + os.makedirs(outF + "/buf/ctrData/", exist_ok=True) + os.makedirs(outF + "/buf/testData/", exist_ok=True) + except: + raise IOError("[ERROR: CANNOT CREATE SUBFOLDERS]") + + print(" Done.") + + return outF + + +# BEDINPUT def countTotalReads3(params, folder): - tempFileName = folder + "/temp.txt" - tempReadFile = open(tempFileName, "w") - libsize = get_libsize(params.BEDINPUT) - tempReadFile.write(libsize) - #tempReadFile.write(params.CONTROLREADCOUNT) - tempReadFile.close() + tempFileName = folder + "/temp.txt" + tempReadFile = open(tempFileName, "w") + libsize = get_libsize(params.BEDINPUT) + tempReadFile.write(libsize) + tempReadFile.close() def countTotalReads(params, folder): - if 'testData' in folder: - inF = params.TEST - else: - inF = params.CONTROL + if "testData" in folder: + inF = params.TEST + else: + inF = params.CONTROL + + # Get Total ReadCount TODO: use + getreadcount = os.system( + "samtools view {} | wc -l > {}/temp.txt".format(inF, folder) + ) - # Get Total ReadCount - getreadcount = os.system("samtools view %s | wc -l > %s/temp.txt" %(inF,folder)) def samToBam(samfile, bamfile): - args = shlex.split("samtools view -bS %s -o %s" %(samfile, bamfile)) - samtobam = subprocess.call(args) + args = shlex.split("samtools view -bS {} -o {}".format(samfile, bamfile)) + samtobam = subprocess.call(args) + + return bamfile - return bamfile def removeMultiMapped(inF, newBAM): - # Get New BAM Files with mapping quality > 0 - args = shlex.split("samtools 
view -bq 1 %s -o %s" %(inF, newBAM)) - removeMM = subprocess.call(args) - print "Multi mapped reads removed. " - -def removeDups(inF, newBAM): - # Remove - args = shlex.split("samtools view -b -F 0x400 %s -o %s" %(inF, newBAM)) - removeDupsCall = subprocess.call(args) - print "Removed PCR duplicates. " - -#BEDINPUT -def convertBamSimple(params, folder, targetList, genomeFile): - if 'testData' in folder: - inF = params.TEST - print "Converting TEST Sample... " - else: - inF = params.CONTROL - print "Converting CONTROL Sample... " - - #Copy file to working folder - os.system("cp %s %s" %(inF, folder+"sample.BEDGRAPH")) + # Get New BAM Files with mapping quality > 0 + args = shlex.split("samtools view -bq 1 {} -o {}".format(inF, newBAM)) + removeMM = subprocess.call(args) + print("Multi mapped reads removed.") - # Split Bedgraph by its chromosomes - splitByChromosome(folder) - # Slice the coverage files to only cover the targeted regions - print "Getting targeted regions DOC..." - convertGeneCoordinate(targetList, folder) - - # LIBSIZE - libsize = str(get_libsize(folder+"geneRefCoverage2.txt")) - tempLibSize = open(folder + "/temp.txt", "w") - tempLibSize.write(libsize) - tempLibSize.close() +def removeDups(inF, newBAM): + # Remove dups + args = shlex.split("samtools view -b -F 0x400 {} -o {}".format(inF, newBAM)) + removeDupsCall = subprocess.call(args) + print("Removed PCR duplicates.") - print "Targeted regions pre-processing: Done" +# BEDINPUT +def convertBamSimple(params, folder, targetList, genomeFile): + if "testData" in folder: + inF = params.TEST + print("Converting TEST Sample... ") + else: + inF = params.CONTROL + print("Converting CONTROL Sample... ") -def convertBam(params, folder, targetList, genomeFile): - if 'testData' in folder: - inF = params.TEST - print "Converting TEST Sample... " - else: - inF = params.CONTROL - print "Converting CONTROL Sample... 
" - - # Convert BAM Files to BEDGRAPH - bedgraph = folder + "sample.BEDGRAPH" - args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" %(inF, genomeFile)) - print "DEBUG 123 " + " ".join(args) - #output = subprocess.Popen(args, stdout = subprocess.PIPE).communicate()[0] - iOutFile = open(bedgraph, "w") - #iOutFile.write(output) - output = subprocess.Popen(args, stdout = iOutFile).wait() - iOutFile.close() - - # Split Bedgraph by its chromosomes - splitByChromosome(folder) - - # Slice the coverage files to only cover the targeted regions - print "Getting targeted regions DOC..." - convertGeneCoordinate(targetList, folder) - - # LIBSIZE - libsize = str(get_libsize(folder+"geneRefCoverage2.txt")) - tempLibSize = open(folder + "temp.txt", "w") - tempLibSize.write(libsize) - tempLibSize.close() - - print "Targeted regions pre-processing: Done" + # Copy file to working folder + os.system("cp {} {}".format(inF, folder + "sample.BEDGRAPH")) -def analysisPerBin(params, num_bin, outFolder, targetList): - import shutil - - bufLoc = outFolder + "/buf" - # Assign bin number to the median and average file - numBin = assignBin(num_bin, bufLoc+"/average.txt", bufLoc+"/bin", targetList, params.MINEXON) - - #copy bin_boundary to plot folder - #outBounFile = os.path.join(outFolder, "plot", "bin_boundary"+str(num_bin)) - #curBounFile = os.path.join(bufLoc, "bin" + str(num_bin) + ".boundaries.txt") - #shutil.copy(curBounFile, outBounFile) - - - print "Significance Test ... " - rScriptName = os.path.join(scriptPath, "scripts", "cn_analysis.v3.R") - args = shlex.split("Rscript %s %s %s %s %s %s %s %s %s %s %s" - %(rScriptName, num_bin, params.MINREADDEPTH, params.MINNBASES, outFolder, params.SAMPLENAME,params.PLOT, numBin, params.MINCONTROL, params.MINTEST, params.MINAVG)) - rscr = subprocess.call(args) - - - print "Generating Output Files ... 
" - # Analysis of CNV - tNameList = os.listdir(outFolder+"/table/") - if num_bin > 1: - tNameId = str(num_bin) + "bins" - else: - tNameId = str(num_bin) + "bin" - for tName in tNameList: - if tNameId in tName: - break - - if "CNATable" in tName: - tName = tName[:len(tName)-4] - tableName = outFolder + "/table/" + tName - bufTable = bufLoc + "/" + tName - applyThreshold(tableName, bufTable, params.PVAL, 100000) #params.MAXGAP = 100000 - - # Large Region CBS - if (params.LARGE != "False"): - print "DEBUG 266a" - rScriptName2 = os.path.join(scriptPath, "scripts", "large_region_cbs.R") - args = shlex.split("Rscript %s %s %s %s %s %s %s %s %s" - %(rScriptName2, tableName+".txt", params.SMALLSEGMENT, params.LARGESEGMENT, params.PVAL, params.PASSSIZE, params.LRS, params.LRE, bufLoc)) - rscr2 = subprocess.call(args) - print str(args) - else: - print "DEBUG 266b" - - # Generate the DNA sequence (for VCF file) - print "Skipping VCF generation.. use tabular file instead." -# bedFile = bufTable + ".BED" -# bedFasta = bufTable + ".fastaOut.txt" -# fastaFile = params.FASTA -# args = shlex.split("fastaFromBed -fi %s -bed %s -fo %s -name" -# %(fastaFile, bedFile, bedFasta)) -# print (" ".join(args)) -# fastaBED = subprocess.call(args) -# -# # Write VCF -# print "Creating VCF file ... " -# vcfFile = tableName + ".vcf" -# vcf_out(bedFasta, vcfFile) -# -# print "%s created. 
" %(vcfFile) + # Split Bedgraph by its chromosomes + splitByChromosome(folder) - else: - print "Table not found" + # Slice the coverage files to only cover the targeted regions + print("Getting targeted regions DOC...") + convertGeneCoordinate(targetList, folder) -def removeTempFolder(tempFolderPath): - import shutil - - shutil.rmtree(tempFolderPath) + # LIBSIZE + libsize = str(get_libsize(folder + "geneRefCoverage2.txt")) + tempLibSize = open(folder + "/temp.txt", "w") + tempLibSize.write(libsize) + tempLibSize.close() - print "Temp Folder Removed" + print("Targeted regions pre-processing: Done") -def main(): - if len(sys.argv) == 2 and sys.argv[1] == "--version": - print VERSION - sys.exit(1) - - # option handling - params = Params() - params.repeat() +def convertBam(params, folder, targetList, genomeFile): + if "testData" in folder: + inF = params.TEST + print("Converting TEST Sample... ") + else: + inF = params.CONTROL + print("Converting CONTROL Sample... ") - # output folder handling - outFolder = checkOutputFolder(params.OUTFOLDER) - bufLoc = outFolder + "/buf" + # Convert BAM Files to BEDGRAPH + bedgraph = folder + "sample.BEDGRAPH" + args = shlex.split("genomeCoverageBed -ibam {} -bga -g {}".format(inF, genomeFile)) - # convert target file - sorted_target = os.path.join(bufLoc, "target.BED") - os.system("sort -k1,1 -k2n %s > %s" %(params.TARGET, sorted_target)) + # output = subprocess.Popen(args, stdout = subprocess.PIPE).communicate()[0] + iOutFile = open(bedgraph, "w") + # iOutFile.write(output) + output = subprocess.Popen(args, stdout=iOutFile).wait() + iOutFile.close() - # target breakdown - if params.MAXREGIONSIZE > 0: - new_target = os.path.join(bufLoc, "target_breakdown.BED") - target_breakdown(sorted_target, params.MAXREGIONSIZE, params.TARGETREGIONSIZE, new_target) - sorted_target = new_target + # Split Bedgraph by its chromosomes + splitByChromosome(folder) - targetList = convertTarget(sorted_target) + # Slice the coverage files to only cover the 
targeted regions + print("Getting targeted regions DOC...") + convertGeneCoordinate(targetList, folder) - # convert sam to bam if -sam specified - if (params.SAM == "True"): - print "Pre-processing SAM files" + # LIBSIZE + libsize = str(get_libsize(folder + "geneRefCoverage2.txt")) + tempLibSize = open(folder + "temp.txt", "w") + tempLibSize.write(libsize) + tempLibSize.close() - test_bam = bufLoc + "/test.BAM" - ctr_bam = bufLoc + "/control.BAM" + print("Targeted regions pre-processing: Done") - samTest = Process(target= samToBam, args=(params.TEST, test_bam)) - if params.BEDINPUT == "False": - samCtr = Process(target= samToBam, args=(params.CONTROL, ctr_bam)) - samTest.start() - if params.BEDINPUT == "False": - samCtr.start() +def analysisPerBin(params, num_bin, outFolder, targetList): + import shutil + + bufLoc = outFolder + "/buf" + # Assign bin number to the median and average file + numBin = assignBin( + num_bin, bufLoc + "/average.txt", bufLoc + "/bin", targetList, params.MINEXON + ) + + print("Significance Test ... ") + rScriptName = os.path.join(scriptPath, "scripts", "cn_analysis.v3.R") + args = shlex.split( + "Rscript {} {} {} {} {} {} {} {} {} {} {}".format( + rScriptName, + num_bin, + params.MINREADDEPTH, + params.MINNBASES, + outFolder, + params.SAMPLENAME, + params.PLOT, + numBin, + params.MINCONTROL, + params.MINTEST, + params.MINAVG, + ) + ) + rscr = subprocess.call(args) + + print("Generating Output Files ... 
") + # Analysis of CNV + tNameList = os.listdir(outFolder + "/table/") + if num_bin > 1: + tNameId = str(num_bin) + "bins" + else: + tNameId = str(num_bin) + "bin" + + for tName in tNameList: + if tNameId in tName: + break + + if "CNATable" in tName: + tName = tName[: len(tName) - 4] + tableName = outFolder + "/table/" + tName + bufTable = bufLoc + "/" + tName + applyThreshold( + tableName, bufTable, params.PVAL, 100000 + ) # params.MAXGAP = 100000 + + # Large Region CBS + if params.LARGE != "False": + print("DEBUG 266a") + rScriptName2 = os.path.join(scriptPath, "scripts", "large_region_cbs.R") + args = shlex.split( + "Rscript {} {} {} {} {} {} {} {} {}".format( + rScriptName2, + tableName + ".txt", + params.SMALLSEGMENT, + params.LARGESEGMENT, + params.PVAL, + params.PASSSIZE, + params.LRS, + params.LRE, + bufLoc, + ) + ) + rscr2 = subprocess.call(args) + print(str(args)) + else: + print("DEBUG 266b") - samTest.join() - if params.BEDINPUT == "False": - samCtr.join() + # Generate the DNA sequence (for VCF file) + print("Skipping VCF generation.. 
use tabular file instead.") - params.TEST = test_bam - if params.BEDINPUT == "False": - params.CONTROL = ctr_bam + else: + print("Table not found") - # remove multi mapped reads if --nomultimapped is specified - if (params.NOMULTIMAPPED == "True"): - print "Removing multi-mapped reads" - test_bam = bufLoc + "/test_reliable.BAM" - ctr_bam = bufLoc + "/control_reliable.BAM" +def removeTempFolder(tempFolderPath): + import shutil - bamTest = Process(target= removeMultiMapped, args=(params.TEST, test_bam)) - if params.BEDINPUT == "False": - bamCtr = Process(target= removeMultiMapped, args=(params.CONTROL, ctr_bam)) + shutil.rmtree(tempFolderPath) - bamTest.start() - if params.BEDINPUT == "False": - bamCtr.start() + print("Temp Folder Removed") - bamTest.join() - if params.BEDINPUT == "False": - bamCtr.join() - params.TEST = test_bam - if params.BEDINPUT == "False": - params.CONTROL = ctr_bam +def main(): + + # TODO: use argparse + # if len(sys.argv) == 2 and sys.argv[1] == "--version": + # print(VERSION) + # sys.exit(1) + + # option handling + params = Params() + params.repeat() + + # output folder handling + outFolder = checkOutputFolder(params.OUTFOLDER) + bufLoc = outFolder + "/buf" + + # convert target file + sorted_target = os.path.join(bufLoc, "target.BED") + os.system("sort -k1,1 -k2n {} > {}".format(params.TARGET, sorted_target)) + + # target breakdown + if params.MAXREGIONSIZE > 0: + new_target = os.path.join(bufLoc, "target_breakdown.BED") + target_breakdown( + sorted_target, params.MAXREGIONSIZE, params.TARGETREGIONSIZE, new_target + ) + sorted_target = new_target + + targetList = convertTarget(sorted_target) + + # convert sam to bam if -sam specified + if params.SAM: + print("Pre-processing SAM files") + + test_bam = bufLoc + "/test.BAM" + ctr_bam = bufLoc + "/control.BAM" + + samTest = Process(target=samToBam, args=(params.TEST, test_bam)) + if not params.BEDINPUT: + samCtr = Process(target=samToBam, args=(params.CONTROL, ctr_bam)) + + samTest.start() + if 
not params.BEDINPUT: + samCtr.start() + + samTest.join() + if not params.BEDINPUT: + samCtr.join() + + params.TEST = test_bam + if not params.BEDINPUT: + params.CONTROL = ctr_bam + + # remove multi mapped reads if --nomultimapped is specified + if params.NOMULTIMAPPED: + print("Removing multi-mapped reads") + test_bam = bufLoc + "/test_reliable.BAM" + ctr_bam = bufLoc + "/control_reliable.BAM" + + bamTest = Process(target=removeMultiMapped, args=(params.TEST, test_bam)) + if not params.BEDINPUT: + bamCtr = Process(target=removeMultiMapped, args=(params.CONTROL, ctr_bam)) + bamTest.start() + + if not params.BEDINPUT: + bamCtr.start() + bamTest.join() + + if not params.BEDINPUT: + bamCtr.join() + params.TEST = test_bam + if not params.BEDINPUT: + params.CONTROL = ctr_bam ### - # Remove PCR duplicates if --removeDups specified - if (params.REMOVEDUPS == "True"): - print "Removing reads marked as duplicates (PCR)" - - test_bam = bufLoc + "/test_removedups.BAM" - ctr_bam = bufLoc + "/control_removedups.BAM" - - bamTest = Process(target = removeDups, args=(params.TEST, test_bam)) - if params.BEDINPUT == "False": - bamCtr = Process(target = removeDups, args=(params.CONTROL, ctr_bam)) - - bamTest.start() - if params.BEDINPUT == "False": - bamCtr.start() - - bamTest.join() - if params.BEDINPUT == "False": - bamCtr.join() - - params.TEST = test_bam - if params.BEDINPUT == "False": - params.CONTROL = ctr_bam - - - # Get Chromosome Length - genomeFile = bufLoc + '/sample.Genome' - get_genome(params.TEST, genomeFile) - - # spawn bam converting scripts - pTest = Process(target= convertBam, - args=(params, bufLoc+'/testData/', targetList, genomeFile)) - - #BEDINPUT - if params.BEDINPUT == "False": - - cTest = Process(target= convertBam, - args=(params, bufLoc+'/ctrData/' , targetList, genomeFile)) - else: - cTest = Process(target= convertBamSimple, - args=(params, bufLoc+'/ctrData/', targetList, genomeFile)) - # start the processes - pTest.start() - cTest.start() - - # wait for 
all the processes to finish before continuing - pTest.join() - cTest.join() - - # Get the read depth count from temporary folder - for folder in [bufLoc+'/testData/', bufLoc+'/ctrData/']: - if 'testData' in folder: - t1 = int(file.readlines(open(folder+"temp.txt"))[0].strip("\n")) - else: - n1 = int(file.readlines(open(folder+"temp.txt"))[0].strip("\n")) - print "Test file read depth = ", t1 - print "Control file read depth = ", n1 - print "Pre-processing Completed. " - - # Get the Average of the Log Ratio - print "Getting the Log Ratio ... " - testListName = bufLoc + '/testData/geneRefCoverage.txt' - controlListName = bufLoc + '/ctrData/geneRefCoverage.txt' - avOut = bufLoc + "/average.txt" - averageCount(testListName, controlListName, avOut, t1, n1, params.MINREADDEPTH, params.MINNBASES) - - # Analysis. [Bin, significance test, large deletion, vcf output] - print "Binning ... " - binProc = [] - for numBin in params.NUMBIN: - binProc.append(Process(target= analysisPerBin, args=(params,numBin,outFolder,targetList))) - - for proc in binProc: - proc.start() - - for proc in binProc: - proc.join() - - # Removed Temp Folder - removeTempFolder(bufLoc) - + # Remove PCR duplicates if --removeDups specified + if params.REMOVEDUPS: + print("Removing reads marked as duplicates (PCR)") + + test_bam = bufLoc + "/test_removedups.BAM" + ctr_bam = bufLoc + "/control_removedups.BAM" + + bamTest = Process(target=removeDups, args=(params.TEST, test_bam)) + + if not params.BEDINPUT: + bamCtr = Process(target=removeDups, args=(params.CONTROL, ctr_bam)) + + bamTest.start() + if not params.BEDINPUT: + bamCtr.start() + bamTest.join() + if not params.BEDINPUT: + bamCtr.join() + + params.TEST = test_bam + if not params.BEDINPUT: + params.CONTROL = ctr_bam + + # Get Chromosome Length + genomeFile = bufLoc + "/sample.Genome" + get_genome(params.TEST, genomeFile) + + # spawn bam converting scripts + pTest = Process( + target=convertBam, args=(params, bufLoc + "/testData/", targetList, 
genomeFile) + ) + + # BEDINPUT + if not params.BEDINPUT: + + cTest = Process( + target=convertBam, + args=(params, bufLoc + "/ctrData/", targetList, genomeFile), + ) + else: + cTest = Process( + target=convertBamSimple, + args=(params, bufLoc + "/ctrData/", targetList, genomeFile), + ) + # start the processes + pTest.start() + cTest.start() + + # wait for all the processes to finish before continuing + pTest.join() + cTest.join() + + # Get the read depth count from temporary folder + for folder in [bufLoc + "/testData/", bufLoc + "/ctrData/"]: + fh = open(folder + "temp.txt") + tmpDP = int(fh.readlines()[0].strip("\n")) + + if "testData" in folder: + t1 = tmpDP + else: + n1 = tmpDP + + print("Test file read depth = ", t1) + print("Control file read depth = ", n1) + print("Pre-processing Completed. ") + + # Get the Average of the Log Ratio + print("Getting the Log Ratio ... ") + testListName = bufLoc + "/testData/geneRefCoverage.txt" + controlListName = bufLoc + "/ctrData/geneRefCoverage.txt" + avOut = bufLoc + "/average.txt" + averageCount( + testListName, + controlListName, + avOut, + t1, + n1, + params.MINREADDEPTH, + params.MINNBASES, + ) + + # Analysis. [Bin, significance test, large deletion, vcf output] + print("Binning ... ") + binProc = [] + for numBin in params.NUMBIN: + binProc.append( + Process(target=analysisPerBin, args=(params, numBin, outFolder, targetList)) + ) + + for proc in binProc: + proc.start() + + for proc in binProc: + proc.join() + + # Removed Temp Folder + removeTempFolder(bufLoc) + + if __name__ == "__main__": - main() - print "Done... " + main() + print("Done... ") diff --git a/scripts/assign_bin_number_v2.py b/scripts/assign_bin_number_v2.py index 4eb2660..be19d15 100755 --- a/scripts/assign_bin_number_v2.py +++ b/scripts/assign_bin_number_v2.py @@ -17,137 +17,151 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 12 October 2011 16:43PM def assignBin(binNumber, srcFile, binFile, targetList, minExons): -#def assignBin(binNumber, srcFile, binFile, minExons): - from math import log - - src = file.readlines(open(srcFile)) - #binOut = open(binFile, "w") - - minExons = int(minExons) - count = 0 - logcov_list = [] - - # Get the Log Coverage List for the normal sample - for exon in src: - exon = exon.split() - cov1 = float(exon[7]) - cov2 = float(exon[8]) - cov = (cov1 + cov2) / 2 - if (cov > 0): - logcov = log(cov, 2) - else: - logcov = 0 - logcov_list.append(logcov) - #print "Len of logcov_list:", len(logcov_list) - - # Calculate the boundaries of the bins - minLog = min(logcov_list) - maxLog = max(logcov_list) - boundary = (maxLog-minLog)/binNumber - #print "Min, Max, Boundary, BinNumber: ", minLog, maxLog, boundary, binNumber - - - # Split exons to different bins - bin_list = [] - boundary_dict = {} - for logcov in logcov_list: - i = 1 - set_boundary = minLog + boundary - while (logcov > set_boundary): - i += 1 - set_boundary = minLog + (boundary * i) - #boundary_dict[i] = set_boundary - bin_list.append(i) - - for i in range(binNumber+2): - boundary_dict[i] = minLog + (boundary * i) - - - # Check the number of exons for each bin - # Merge with the adjacent bins if it is too small - for z in range(1, binNumber+1): - element = bin_list.count(z) - #print "Bin", z, "has", element, "elements" - if (element < minExons): - while (bin_list.count(z) != 0): - if (z != binNumber): - bin_list[bin_list.index(z)]+=1 - else: - bin_list[bin_list.index(z)]-=1 - - - # Check the number of exons in the last bin - last_bin_number = sorted(set(bin_list))[-1] - if len(set(bin_list)) > 1: - second_last_bin = sorted(set(bin_list))[-2] - else: - second_last_bin = last_bin_number - element = bin_list.count(last_bin_number) - if 
(element < minExons): - while (bin_list.count(last_bin_number) != 0): - if (last_bin_number != 1): - #bin_list[bin_list.index(last_bin_number)] -= 1 - bin_list[bin_list.index(last_bin_number)] = second_last_bin - - final_number_of_bin = len(set(bin_list)) - - # List out the binning boundaries in a txt file - boundary_list = [boundary_dict[x] for x in sorted(set(bin_list))] - i = 1 - - #boundary_file = binFile + str(final_number_of_bin) + ".boundaries.txt" - boundary_file = binFile + str(binNumber) + ".boundaries.txt" - boOut = open(boundary_file, "w") - boOut.write("\t".join([str(0), str(minLog)])+"\n") - for z in boundary_list: - if (i==final_number_of_bin): - z = maxLog - boOut.write("\t".join([str(i), str(z)])+"\n") - i += 1 - boOut.close() - - # Re-sort the bin numbers - get rid of gaps - curr_z = 1 - bin_number_dict = {} - for z in sorted(set(bin_list)): - bin_number_dict[z] = curr_z - curr_z += 1 - - - # Append the bin number to the original file - #binFile = binFile + str(final_number_of_bin)+".txt" - binFile = binFile + str(binNumber) + ".txt" - binOut = open(binFile, "w") - - for exons in src: - exon = exons.split() - id = int(exon[0]) - gene = exon[1] - exonNumber = exon[5] - - target = targetList[int(id)-1] - if target.id == id: - chr = target.chr - oriStart = target.start - oriEnd = target.end - - else: - print "ERROR..." 
- - f_bin_number = str(bin_number_dict[bin_list[count]]) - binOut.write("\t".join([exons.strip("\n"), f_bin_number,chr, oriStart, oriEnd])) - binOut.write("\n") - count += 1 - - binOut.close() - print "End of assign.bin.number.py with %s exons in %s bins" %(count, final_number_of_bin) - - - return final_number_of_bin - + # def assignBin(binNumber, srcFile, binFile, minExons): + from math import log + + sourceFH = open(srcFile) + src = sourceFH.readlines() + # binOut = open(binFile, "w") + + minExons = int(minExons) + count = 0 + logcov_list = [] + + # Get the Log Coverage List for the normal sample + for exon in src: + exon = exon.split() + cov1 = float(exon[7]) + cov2 = float(exon[8]) + cov = (cov1 + cov2) / 2 + if cov > 0: + logcov = log(cov, 2) + else: + logcov = 0 + logcov_list.append(logcov) + # print "Len of logcov_list:", len(logcov_list) + + # Calculate the boundaries of the bins + minLog = min(logcov_list) + maxLog = max(logcov_list) + boundary = (maxLog - minLog) / binNumber + # print "Min, Max, Boundary, BinNumber: ", minLog, maxLog, boundary, binNumber + + # Split exons to different bins + bin_list = [] + boundary_dict = {} + for logcov in logcov_list: + i = 1 + set_boundary = minLog + boundary + while logcov > set_boundary: + i += 1 + set_boundary = minLog + (boundary * i) + # boundary_dict[i] = set_boundary + bin_list.append(i) + + for i in range(binNumber + 2): + boundary_dict[i] = minLog + (boundary * i) + + # Check the number of exons for each bin + # Merge with the adjacent bins if it is too small + if len(bin_list) < minExons: + newMinExons = int(len(bin_list) / 3) + print( + f"The total amount of exons ({len(bin_list)}) is smaller than the specified minExons ({minExons})\nSetting minExons to one third of the amount of targeted areas ({newMinExons})" + ) + minExons = newMinExons + + print("before" + str(bin_list)) + for z in range(1, binNumber + 1): + element = bin_list.count(z) + print("Bin", z, "has", element, "elements") + if element < 
minExons: + while bin_list.count(z) != 0: + if z != binNumber: + bin_list[bin_list.index(z)] += 1 + else: + bin_list[bin_list.index(z)] -= 1 + + print("after" + str(bin_list)) + # Check the number of exons in the last bin + last_bin_number = sorted(set(bin_list))[-1] + + if len(set(bin_list)) > 1: + second_last_bin = sorted(set(bin_list))[-2] + else: + second_last_bin = last_bin_number + + element = bin_list.count(last_bin_number) + if element < minExons: + while bin_list.count(last_bin_number) != 0: + if last_bin_number != 1: + # bin_list[bin_list.index(last_bin_number)] -= 1 + bin_list[bin_list.index(last_bin_number)] = second_last_bin + print("checking through bins") + print(bin_list) + + final_number_of_bin = len(set(bin_list)) + + # List out the binning boundaries in a txt file + boundary_list = [boundary_dict[x] for x in sorted(set(bin_list))] + i = 1 + + # boundary_file = binFile + str(final_number_of_bin) + ".boundaries.txt" + boundary_file = binFile + str(binNumber) + ".boundaries.txt" + boOut = open(boundary_file, "w") + boOut.write("\t".join([str(0), str(minLog)]) + "\n") + for z in boundary_list: + if i == final_number_of_bin: + z = maxLog + boOut.write("\t".join([str(i), str(z)]) + "\n") + i += 1 + boOut.close() + + # Re-sort the bin numbers - get rid of gaps + curr_z = 1 + bin_number_dict = {} + for z in sorted(set(bin_list)): + bin_number_dict[z] = curr_z + curr_z += 1 + + # Append the bin number to the original file + # binFile = binFile + str(final_number_of_bin)+".txt" + binFile = binFile + str(binNumber) + ".txt" + binOut = open(binFile, "w") + + for exons in src: + exon = exons.split() + id = int(exon[0]) + gene = exon[1] + exonNumber = exon[5] + + target = targetList[int(id) - 1] + if target.id == id: + chr = target.chr + oriStart = target.start + oriEnd = target.end + + else: + print("ERROR...") + + f_bin_number = str(bin_number_dict[bin_list[count]]) + binOut.write( + "\t".join([exons.strip("\n"), f_bin_number, chr, oriStart, oriEnd]) + ) + 
binOut.write("\n") + count += 1 + + binOut.close() + print( + "End of assign.bin.number.py with {} exons in {} bins".format( + count, final_number_of_bin + ) + ) + + return final_number_of_bin diff --git a/scripts/average_count.py b/scripts/average_count.py index dd4cb32..a6a7567 100755 --- a/scripts/average_count.py +++ b/scripts/average_count.py @@ -17,179 +17,202 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . # -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 28 Sep 2011 11:00AM import sys import math + def getAverage(list1): - if len(list1) > 0: - return float(sum(list1))/len(list1) - - return 0.0 + if len(list1) > 0: + return float(sum(list1)) / len(list1) + + return 0.0 + def getStdDev(list1, avg): - var = 0.0 - for x in list1: - var += (avg - x) ** 2 + var = 0.0 + for x in list1: + var += (avg - x) ** 2 - if (len(list1)-1) > 0: - var /= (len(list1)-1) + if (len(list1) - 1) > 0: + var /= len(list1) - 1 + + return math.sqrt(var) - return math.sqrt(var) def getMinMax(list1): - length = len(list1) - if length != 0: - min = list1[0] - max = list1[length-1] - else: - min = 0 - max = 0 + length = len(list1) + if length != 0: + min = list1[0] + max = list1[length - 1] + else: + min = 0 + max = 0 + + return min, max - return min, max def getMedian(list1): - length = len(list1) - if length == 0: - median = 0 - elif length % 2 == 0: - median = (list1[length/2]+list1[(length/2) - 1])/2 - else: - median = list1[length/2] - return median + length = len(list1) + center = int(length / 2) + if length == 0: + median = 0 + elif length % 2 == 0: + median = (list1[center] + list1[center - 1]) / 2 + else: + median = list1[center] + return median + def createDataDict(count, list1, r, offset, id_check, exon_check): - tDict = {} - tDictOri = {} - - while count < len(list1): - t = list1[count].split() - 
tId = t[5] - tExon = t[6] - - if (tId != id_check) or (tExon != exon_check): - return count, tDict, tDictOri - - tStart = int(t[2]) - tEnd = int(t[3]) - tCov = float(t[4]) / r + offset #GeoMean Normalisation - tCovOri = float(t[4]) + offset #without scaling - - #filling dict - while tStart < tEnd: - tDict[tStart] = tCov - tDictOri[tStart] = tCovOri #without scaling - tStart += 1 - - count += 1 - - return count, tDict, tDictOri - -def getFactor (val1, val2): - r = math.sqrt(val1 * val2) - r1 = val1/r - r2 = val2/r - return r1, r2 - -def averageCount(tFile, nFile, averageOut, tReadCount, nReadCount, rd_threshold, minNBases): - tList = file.readlines(open(tFile)) - nList = file.readlines(open(nFile)) - # constant & counter - OFF = 1 - tCount = 0 - nCount = 0 - - # create and open files - output = open(averageOut, "w") - - # Offset and Ratio for Geometric Mean Normalisation - r1, r2 = getFactor(tReadCount, nReadCount) - if rd_threshold > 0: - #OFF = 0 - OFF = 0.5 - - #big loop - while (nCount < len(nList)): - # initialisation, get the chr, geneID, geneName - init = tList[tCount].split() - initial = init[5] - _exon = init[6] - chr = init[1] - gene = init[0] - _start = int(init[2]) - - # check if t-gene and n-gene refer to the same gene - check_init = nList[nCount].split() - if check_init[5] != initial or check_init[6] != _exon: - print "Initial: ", initial - print "Check_Init.id: ", check_init[5] - print "_Exon: ", _exon - print "Check_Init.exon: ", check_init[6] - print "Error. Comparing different Gene" - sys.exit(1) - - # create data dictionary for tumour and normal data (per each regions/ exon) - tCount, tDict, tDictOri = createDataDict(tCount, tList, r1, OFF, initial, _exon) - nCount, nDict, nDictOri = createDataDict(nCount, nList, r2, OFF, initial, _exon) - # check number of bases in the both gene dict - if len(nDict) != len(tDict): - print "N:", len(nDict) - print "T:", len(tDict) - print "Error. 
Different length of dict" - sys.exit(1) - - # compare coverage - count = _start - _max = max(nDict.keys()) - ratioList = [] - tumourList = [] - normalList = [] - tumourOriList = [] - normalOriList = [] - while count <= _max: - # get ratio - if (nDict[count] < rd_threshold) and (tDict[count] < rd_threshold): - ratio = 0.0 - else: - if tDict[count] == 0: - tDict[count] = 0.5 - - ratio = math.log((float(tDict[count]) / nDict[count]),2) - tumourList.append(tDict[count]) - tumourOriList.append(tDictOri[count]) - normalList.append(nDict[count]) - normalOriList.append(nDictOri[count]) - ratioList.append(ratio) - - count += 1 - - ratioLen = len(ratioList) - - # get average - avg = getAverage(ratioList) - sd = getStdDev(ratioList, avg) - tumourAvg= str(round(getAverage(tumourList),3)) - normalAvg= str(round(getAverage(normalList),3)) - tumourOriAvg = str(round(getAverage(tumourOriList),3)) - normalOriAvg = str(round(getAverage(normalOriList),3)) - - # get median - ratioList.sort() - min_logratio, max_logratio = getMinMax(ratioList) - median = getMedian(ratioList) - - # write output - if ratioLen >= minNBases: - output.write(initial + "\t" + gene + "\t" + str(ratioLen) + "\t") - output.write(str(round(avg,3))+ "\t"+ str(count)+ "\t" + _exon + "\t") - output.write(str(round(sd ,3))+ "\t"+ tumourAvg + "\t" + normalAvg +"\t") - output.write(tumourOriAvg + "\t" + normalOriAvg + "\t") - output.write(str(round(median,3)) + "\t" + str(round(min_logratio,3)) + "\t") - output.write(str(round(max_logratio,3)) + "\n") - - output.close() - - #print "End of averageCount.py with the last target = '%s'" %(initial) + tDict = {} + tDictOri = {} + + while count < len(list1): + t = list1[count].split() + tId = t[5] + tExon = t[6] + + if (tId != id_check) or (tExon != exon_check): + return count, tDict, tDictOri + + tStart = int(t[2]) + tEnd = int(t[3]) + tCov = float(t[4]) / r + offset # GeoMean Normalisation + tCovOri = float(t[4]) + offset # without scaling + + # filling dict + while tStart 
< tEnd: + tDict[tStart] = tCov + tDictOri[tStart] = tCovOri # without scaling + tStart += 1 + + count += 1 + + return count, tDict, tDictOri + + +def getFactor(val1, val2): + r = math.sqrt(val1 * val2) + r1 = val1 / r + r2 = val2 / r + return r1, r2 + + +def averageCount( + tFile, nFile, averageOut, tReadCount, nReadCount, rd_threshold, minNBases +): + tFH = open(tFile) + tList = tFH.readlines() + + nFH = open(nFile) + nList = nFH.readlines() + # constant & counter + OFF = 1 + tCount = 0 + nCount = 0 + + # create and open files + output = open(averageOut, "w") + + # Offset and Ratio for Geometric Mean Normalisation + r1, r2 = getFactor(tReadCount, nReadCount) + if rd_threshold > 0: + # OFF = 0 + OFF = 0.5 + + # big loop + while nCount < len(nList): + # initialisation, get the chr, geneID, geneName + init = tList[tCount].split() + initial = init[5] + _exon = init[6] + chr = init[1] + gene = init[0] + _start = int(init[2]) + + # print("initial: " + initial) + # print("Check_Init.id: ", check_init[5]) + # print("_Exon: ", _exon) + # print("Check_Init.exon: ", check_init[6]) + + # check if t-gene and n-gene refer to the same gene + check_init = nList[nCount].split() + if check_init[5] != initial or check_init[6] != _exon: + print("Initial: ", initial) + print("Check_Init.id: ", check_init[5]) + print("_Exon: ", _exon) + print("Check_Init.exon: ", check_init[6]) + print("Error. Comparing different Gene") + sys.exit(1) + + # create data dictionary for tumour and normal data (per each regions/ exon) + tCount, tDict, tDictOri = createDataDict(tCount, tList, r1, OFF, initial, _exon) + nCount, nDict, nDictOri = createDataDict(nCount, nList, r2, OFF, initial, _exon) + # check number of bases in the both gene dict + if len(nDict) != len(tDict): + print("N:", len(nDict)) + print("T:", len(tDict)) + print("Error. 
Different length of dict") + sys.exit(1) + + # compare coverage + count = _start + _max = max(nDict.keys()) + ratioList = [] + tumourList = [] + normalList = [] + tumourOriList = [] + normalOriList = [] + while count <= _max: + # get ratio + if (nDict[count] < rd_threshold) and (tDict[count] < rd_threshold): + ratio = 0.0 + else: + if tDict[count] == 0: + tDict[count] = 0.5 + + ratio = math.log((float(tDict[count]) / nDict[count]), 2) + tumourList.append(tDict[count]) + tumourOriList.append(tDictOri[count]) + normalList.append(nDict[count]) + normalOriList.append(nDictOri[count]) + ratioList.append(ratio) + + count += 1 + + assert len(ratioList) == len(tumourList), "List do not have the same length" + + ratioLen = len(ratioList) + + # get average + avg = getAverage(ratioList) + sd = getStdDev(ratioList, avg) + tumourAvg = str(round(getAverage(tumourList), 3)) + normalAvg = str(round(getAverage(normalList), 3)) + tumourOriAvg = str(round(getAverage(tumourOriList), 3)) + normalOriAvg = str(round(getAverage(normalOriList), 3)) + + # get median + ratioList.sort() + min_logratio, max_logratio = getMinMax(ratioList) + print(ratioList) + median = getMedian(ratioList) + + # write output + if ratioLen >= minNBases: + output.write(initial + "\t" + gene + "\t" + str(ratioLen) + "\t") + output.write(str(round(avg, 3)) + "\t" + str(count) + "\t" + _exon + "\t") + output.write(str(round(sd, 3)) + "\t" + tumourAvg + "\t" + normalAvg + "\t") + output.write(tumourOriAvg + "\t" + normalOriAvg + "\t") + output.write( + str(round(median, 3)) + "\t" + str(round(min_logratio, 3)) + "\t" + ) + output.write(str(round(max_logratio, 3)) + "\n") + + output.close() + + # print "End of averageCount.py with the last target = '%s'" %(initial) diff --git a/scripts/cn_analysis.v3.R b/scripts/cn_analysis.v3.R index 480024f..9e65f61 100755 --- a/scripts/cn_analysis.v3.R +++ b/scripts/cn_analysis.v3.R @@ -17,7 +17,7 @@ # You should have received a copy of the GNU General Public License # along with 
CONTRA. If not, see . # -# +# #-----------------------------------------------------------------------# # Last Updated : 31 Oct 2011 17:00PM @@ -49,7 +49,7 @@ pdf.out.f = paste(outf, "/plot/", sample.name, "densityplot.", bins, "bins.pdf", # cnAverageFile = paste("bin", bins, ".txt", sep="") cnAverageFile = paste(outf,"/buf/bin",bins,".txt",sep="") boundariesFile = paste(outf,"/buf/bin",bins,".boundaries.txt",sep="") -print (cnAverageFile) +#print (cnAverageFile) cn.average = read.delim(cnAverageFile, as.is=F, header=F) cn.boundary= read.delim(boundariesFile,as.is=F, header=F) @@ -57,7 +57,7 @@ cn.boundary= read.delim(boundariesFile,as.is=F, header=F) cn.average.aboveTs = cn.average[cn.average$V3>min.bases,] cn.average.list = as.matrix(cn.average.aboveTs$V4) -# Get the mean and sd for each bins +# Get the mean and sd for each bins cn.average.mean = c() cn.average.sd = c() cn.average.log= c() @@ -136,7 +136,7 @@ if (bins > 1 ){ } -# Put the data's details into matrices +# Put the data's details into matrices ids = as.matrix(cn.average.aboveTs$V1) exons = as.matrix(cn.average.aboveTs$V6) exons.pos = as.matrix(cn.average.aboveTs$V5) @@ -211,7 +211,7 @@ fit.sd.fn <- function(x, fit.a, fit.b){ result = 2 ^ (fit.mean.fn(x, fit.a, fit.b)) return (result) } - + # Get the P Values, called the gain/loss # with average and sd from each bins pVal.list = c() @@ -225,7 +225,7 @@ for (i in 1:nrow(cn.average.list)){ logcov = logcov.mean[i] exon.bin = Bin[i] - if (length(logratios.sd) > 1){ + if (length(logratios.sd) > 1){ #pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), fit.sd.fn(logcov, fit.sd.a, fit.sd.b)) pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), sd.fn(logcov)) } else { @@ -259,11 +259,21 @@ outdf$P.Value[wh.to.excl]=NA outdf$Adjusted.P.Value[wh.to.excl]=NA +# rounding numbers to 3 digits +outdf$Mean.of.LogRatio <- round(outdf$Mean.of.LogRatio, digits=3) +outdf$Adjusted.Mean.of.LogRatio <- round(outdf$Adjusted.Mean.of.LogRatio, 
digits=3) +outdf$SD.of.LogRatio <- round(outdf$SD.of.LogRatio, digits=3) +outdf$Median.of.LogRatio <- round(outdf$Median.of.LogRatio, digits=3) + +#I dont know if it is smart to round the pValue... but maybe like 6 digits instead? +outdf$P.Value <- round(outdf$P.Value, digits=6) +outdf$Adjusted.P.Value <- round(outdf$Adjusted.P.Value, digits=6) + write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T) #Plotting SD #a.sd.fn = rep(fit.sd.a, length(logratios.sd.ori)) -#b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori)) +#b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori)) #sd.after.fit = fit.sd.fn(logcov.bins.mean.ori, fit.sd.a, fit.sd.b) #sd.out.f = paste(outf, "/plot/", sample.name, "sd.data_fit.", bins, "bins.txt", sep="") #sd.outdf = data.frame(SD.Before.Fit = logratios.sd.ori, Log.Coverage = logcov.bins.mean.ori, SD.After.Fit = sd.after.fit, a.for.fitting=a.sd.fn, b.for.fitting=b.sd.fn) @@ -271,8 +281,5 @@ write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T) #End of the script -print ("End of cn_analysis.R") -print (i) - - - +message("End of cn_analysis.R") +#print (i) diff --git a/scripts/cn_apply_threshold.py b/scripts/cn_apply_threshold.py index ee7afd0..3961c08 100755 --- a/scripts/cn_apply_threshold.py +++ b/scripts/cn_apply_threshold.py @@ -17,98 +17,113 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 12 Oct 2011 11:00AM + def applyThreshold(outputName, bufTable, threshold, maxGap): - srcFile = outputName + ".txt" - outFile = bufTable + ".LargeVariations.txt" - bedFile = bufTable + ".BED" - fFile = outputName + ".DetailsFILTERED.txt" - ts = float(threshold) + srcFile = outputName + ".txt" + outFile = bufTable + ".LargeVariations.txt" + bedFile = bufTable + ".BED" + fFile = outputName + ".DetailsFILTERED.txt" + ts = float(threshold) + + # Read and open files + sourceFH = open(srcFile) + srcTable = sourceFH.readlines() + outTable = open(outFile, "w") + bedOut = open(bedFile, "w") + filteredTable = open(fFile, "w") - # Read and open files - srcTable = file.readlines(open(srcFile)) - outTable = open(outFile, "w") - bedOut = open(bedFile, "w") - filteredTable = open(fFile, "w") + # header + outTable.write("Chr \tStartCoordinate \tEndCoordinate \tGenes \tGain.Loss \n") + filteredTable.write(srcTable[0]) + prevChr = "" + prevStatus = "" + prevEnd = -1 + genes = [] + chrList = [] - #header - outTable.write("Chr \tStartCoordinate \tEndCoordinate \tGenes \tGain.Loss \n") - filteredTable.write(srcTable[0]) + for exons in srcTable: + exon = exons.split() + try: + adjPVal = float(exon[12]) + except: + continue - prevChr = '' - prevStatus = '' - prevEnd = -1 - genes = [] - chrList = [] + # print (str(adjPVal)+" <= "+str(ts)) + if adjPVal <= ts: + chr = exon[3] + gene = exon[2] + status = exon[13] + start = exon[4] + end = exon[5] - for exons in srcTable: - exon = exons.split() - try: - adjPVal = float(exon[12]) - except: - continue + # For first row + if prevEnd == -1: + gap = 0 + else: + gap = int(prevEnd) - int(start) - #print (str(adjPVal)+" <= "+str(ts)) - if adjPVal <= ts: - chr = exon[3] - gene = exon[2] - status = exon[13] - start = exon[4] - end = exon[5] + # Write Filtered Table + 
filteredTable.write(exons) - # For first row - if prevEnd == -1: - gap = 0 - else: - gap = int(prevEnd) - int(start) + # Write Bed File + # bedOut.write(chr.strip("chr") +"\t" +start +"\t"+ end+"\t"+ + # chr.strip("chr")+":"+start+"-"+end+":"+str(adjPVal)+"\n") + bedOut.write( + chr + + "\t" + + start + + "\t" + + end + + "\t" + + chr + + ":" + + start + + "-" + + end + + ":" + + str(adjPVal) + + "\n" + ) - # Write Filtered Table - filteredTable.write(exons) + if prevChr == "" and prevStatus == "": + if chr not in chrList: + print(chr) + chrList.append(chr) + elif (chr == prevChr) and (status == prevStatus) and (gap < maxGap): + start = prevStart + else: + outTable.write(prevChr + "\t" + prevStart + "\t" + prevEnd + "\t") + for gsym in genes: + outTable.write(gsym + ", ") + outTable.write("\t" + prevStatus + "\n") + genes = [] - # Write Bed File - #bedOut.write(chr.strip("chr") +"\t" +start +"\t"+ end+"\t"+ - # chr.strip("chr")+":"+start+"-"+end+":"+str(adjPVal)+"\n") - bedOut.write(chr +"\t" +start +"\t"+ end+"\t"+ - chr+":"+start+"-"+end+":"+str(adjPVal)+"\n") - - if prevChr == '' and prevStatus == '': - if chr not in chrList: - print chr - chrList.append(chr) - elif (chr == prevChr) and (status == prevStatus) and (gap < maxGap): - start = prevStart - else: - outTable.write(prevChr +"\t" +prevStart +"\t" +prevEnd + "\t") - for gsym in genes: - outTable.write(gsym + ", ") - outTable.write("\t" + prevStatus + "\n") - genes=[] - - if gene not in genes: - genes.append(gene) - prevChr = chr - prevStatus = status - prevStart = start - prevEnd = end - elif len(genes) > 0: - outTable.write(prevChr +"\t" +prevStart +"\t" +prevEnd + "\t") - for gsym in genes: - outTable.write(gsym + ", " ) - outTable.write("\t" + prevStatus + "\n") - prevChr = '' - prevStatus = '' - genes = [] + if gene not in genes: + genes.append(gene) + prevChr = chr + prevStatus = status + prevStart = start + prevEnd = end + elif len(genes) > 0: + outTable.write(prevChr + "\t" + prevStart + "\t" + 
prevEnd + "\t") + for gsym in genes: + outTable.write(gsym + ", ") + outTable.write("\t" + prevStatus + "\n") + prevChr = "" + prevStatus = "" + genes = [] - if len(genes) > 0: - outTable.write(prevChr +"\t" +prevStart +"\t" +prevEnd + "\t") - for gsym in genes: - outTable.write(gsym + ", ") - outTable.write("\t" + prevStatus + "\n") + if len(genes) > 0: + outTable.write(prevChr + "\t" + prevStart + "\t" + prevEnd + "\t") + for gsym in genes: + outTable.write(gsym + ", ") + outTable.write("\t" + prevStatus + "\n") - filteredTable.close() - bedOut.close() - outTable.close() + filteredTable.close() + bedOut.close() + outTable.close() diff --git a/scripts/convert_gene_coordinate.py b/scripts/convert_gene_coordinate.py index 08a2ec7..6f20858 100755 --- a/scripts/convert_gene_coordinate.py +++ b/scripts/convert_gene_coordinate.py @@ -17,219 +17,258 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . # -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 16 May 2013 11:00AM import sys import os -def outputwrite(output, gene,chr,a,b,c,id,n,sOri, eOri): - id = str(id) - n = str(n) - output.write(gene+"\t"+chr+"\t"+a+"\t"+b+"\t"+c+"\t"+id+"\t"+n+"\t"+sOri+"\t"+eOri+"\n") - -def outputwrite2(output2, chr,c,sOri, eOri): - output2.write(chr+"\t"+sOri+"\t"+eOri+"\t"+c+"\n") - - -def convertGeneCoordinate(targetList, bufLocFolder): - inputfile2 = bufLocFolder + "chr/" - outputfile = bufLocFolder + "geneRefCoverage.txt" - outputfile2= bufLocFolder + "geneRefCoverage2.txt" - - output= open(outputfile,"w") - output2 = open(outputfile2 , "w") - - rn = 1 - prevchr = "" - for target in targetList: - chr = target.chr - starts = target.oriStart.split(",") - ends = target.oriEnd.split(",") - - if ((len(chr) > 5) or (chr[len(chr)-1] == "M")): - continue - - if (prevchr != chr): - print chr #progress checking - 
prevchr = chr - t = 0 - ff=inputfile2+chr+".txt" - if not os.path.isfile(ff): - sys.stderr.write("WARNING: %s NOT FOUND (should be generated by splitByChromosome3). \ -Likely due to no reads found in a targeted chromosome (%s), in turn likely due to currupted bam files\n" % - (ff,chr)) - #sys.exit(1) - covFile=[] - else: - covFile = file.readlines(open(inputfile2+chr+".txt","r")) - - for n in range(0,target.numberExon): - if t >= len(covFile): - break - cov = covFile[t].split() - while ((int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2]))): - if (int(cov[1]) > int(starts[n])): - t-=1 - else: - t+=1 - cov = covFile[t].split() - - while ((int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2]))): - # print output # - if (rn == 1): - prev = target.id - nID = target.id - if (nID != prev): - rn = 1 - ref1 = str(rn) - ref2 = str(int(cov[2]) - int(starts[n]) + rn) - outputwrite(output, target.gene,chr,ref1,ref2,cov[3],target.id,n,starts[n],cov[2]) - - outputwrite2(output2, chr,cov[3],starts[n],cov[2]) - - - rn = int(ref2) - prev = nID - # -- # - - # get to the next line of inputfile# - t+= 1 - cov = covFile[t].split() - starts[n] = cov[1] - - #print output # - if (t == 0) and (t >= len(covFile)): - continue - - if (rn == 1): - prev = target.id - nID = target.id - if (nID != prev): - rn = 1 - ref1 = str(rn) - ref2 = str(int(ends[n]) - int(starts[n]) + rn) - outputwrite(output, target.gene, chr, ref1, ref2, cov[3], target.id, n, starts[n], ends[n]) - - outputwrite2(output2, chr, cov[3], starts[n], ends[n]) - - - rn = int(ref2) - prev = nID - # -- # - output.close() - output2.close() - -def convertGeneCoordinate2(targetList, bufLocFolder): - inputfile2 = bufLocFolder + "chr/" - outputfile = bufLocFolder + "geneRefCoverage.txt" - outputfile_avg = bufLocFolder + "geneRefCoverage_targetAverage.txt" - - output= open(outputfile,"w") - output_avg = open(outputfile_avg,"w") - - rn = 1 - prevchr = "" - for target in targetList: - chr = target.chr - starts = 
target.oriStart.split(",") - ends = target.oriEnd.split(",") - target_ttl_readdepth = 0 - starts_leftmost = starts[0] - - - if ((len(chr) > 5) or (chr[len(chr)-1] == "M")): - continue - - if (prevchr != chr): - print chr #progress checking - prevchr = chr - t = 0 - ff=inputfile2+chr+".txt" - if not os.path.isfile(ff): - sys.stderr.write("WARNING: %s NOT FOUND (should be generated by splitByChromosome3). \ -Likely due to no reads found in a targeted chromosome (%s), in turn likely due to currupted bam files\n" % - (ff,chr)) - #sys.exit(1) - covFile=[] - else: - covFile = file.readlines(open(inputfile2+chr+".txt","r")) - - for n in range(0,target.numberExon): - if t >= len(covFile): - break - cov = covFile[t].split() - while ((int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2]))): - if (int(cov[1]) > int(starts[n])): t-=1 - else: - t+=1 - cov = covFile[t].split() - - while ((int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2]))): - # print output # - if (rn == 1): - prev = target.id - nID = target.id - if (nID != prev): - rn = 1 - ref1 = str(rn) - ref2 = str(int(cov[2]) - int(starts[n]) + rn) - outputwrite2(output, chr,cov[3],starts[n],cov[2]) - tmprange=int(cov[2])-int(starts[n])+1 - target_ttl_readdepth+=int(cov[3])*tmprange - #target_length+=tmprange - - rn = int(ref2) - prev = nID - # -- # - - # get to the next line of inputfile# - t+= 1 - cov = covFile[t].split() - starts[n] = cov[1] - - #print output # - if (t == 0) and (t >= len(covFile)): - continue - - if (rn == 1): - prev = target.id +def outputwrite(output, gene, chr, a, b, c, id, n, sOri, eOri): + id = str(id) + n = str(n) + output.write( + gene + + "\t" + + chr + + "\t" + + a + + "\t" + + b + + "\t" + + c + + "\t" + + id + + "\t" + + n + + "\t" + + sOri + + "\t" + + eOri + + "\n" + ) + + +def outputwrite2(output2, chr, c, sOri, eOri): + output2.write(chr + "\t" + sOri + "\t" + eOri + "\t" + c + "\n") + + +def convertGeneCoordinate(targetList, bufLocFolder): + inputfile2 = bufLocFolder + 
"chr/" + outputfile = bufLocFolder + "geneRefCoverage.txt" + outputfile2 = bufLocFolder + "geneRefCoverage2.txt" + + output = open(outputfile, "w") + output2 = open(outputfile2, "w") + + rn = 1 + prevchr = "" + for target in targetList: + chr = target.chr + starts = target.oriStart.split(",") + ends = target.oriEnd.split(",") + + if (len(chr) > 5) or (chr[len(chr) - 1] == "M"): + continue + + if prevchr != chr: + print(chr) # progress checking + prevchr = chr + t = 0 + ff = inputfile2 + chr + ".txt" + if not os.path.isfile(ff): + sys.stderr.write( + "WARNING: {} NOT FOUND (should be generated by splitByChromosome3). \ +Likely due to no reads found in a targeted chromosome ({}), in turn likely due to currupted bam files\n".format( + ff, chr + ) + ) + # sys.exit(1) + covFile = [] + else: + ffh = open(ff, "r") + covFile = ffh.readlines() + + for n in range(0, target.numberExon): + if t >= len(covFile): + break + cov = covFile[t].split() + while (int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2])): + if int(cov[1]) > int(starts[n]): + t -= 1 + else: + t += 1 + cov = covFile[t].split() + + while (int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2])): + # print output # + if rn == 1: + prev = target.id nID = target.id - if (nID != prev): - rn = 1 + if nID != prev: + rn = 1 + ref1 = str(rn) - ref2 = str(int(ends[n]) - int(starts[n]) + rn) - outputwrite2(output, chr, cov[3], starts[n], ends[n]) - tmprange=int(ends[n])-int(starts[n])+1 - target_ttl_readdepth+=int(cov[3])*tmprange - #target_length+=tmprange - target_length = int(ends[n])-int(starts_leftmost)+1 - output_avg.write("\t".join([chr,starts_leftmost,ends[n],str(target_ttl_readdepth/target_length),str(target_length)])+"\n") + ref2 = str(int(cov[2]) - int(starts[n]) + rn) + outputwrite( + output, + target.gene, + chr, + ref1, + ref2, + cov[3], + target.id, + n, + starts[n], + cov[2], + ) + + outputwrite2(output2, chr, cov[3], starts[n], cov[2]) rn = int(ref2) prev = nID # -- # - output.close() - 
output_avg.close() - - - - - - - - - - - - - - - + # get to the next line of inputfile# + t += 1 + cov = covFile[t].split() + starts[n] = cov[1] + + # print output # + if (t == 0) and (t >= len(covFile)): + continue + + if rn == 1: + prev = target.id + nID = target.id + if nID != prev: + rn = 1 + ref1 = str(rn) + ref2 = str(int(ends[n]) - int(starts[n]) + rn) + outputwrite( + output, + target.gene, + chr, + ref1, + ref2, + cov[3], + target.id, + n, + starts[n], + ends[n], + ) + + outputwrite2(output2, chr, cov[3], starts[n], ends[n]) + + rn = int(ref2) + prev = nID + # -- # + output.close() + output2.close() +def convertGeneCoordinate2(targetList, bufLocFolder): + inputfile2 = bufLocFolder + "chr/" + outputfile = bufLocFolder + "geneRefCoverage.txt" + outputfile_avg = bufLocFolder + "geneRefCoverage_targetAverage.txt" + + output = open(outputfile, "w") + output_avg = open(outputfile_avg, "w") + + rn = 1 + prevchr = "" + for target in targetList: + chr = target.chr + starts = target.oriStart.split(",") + ends = target.oriEnd.split(",") + target_ttl_readdepth = 0 + starts_leftmost = starts[0] + + if (len(chr) > 5) or (chr[len(chr) - 1] == "M"): + continue + + if prevchr != chr: + print(chr) # progress checking + prevchr = chr + t = 0 + ff = inputfile2 + chr + ".txt" + if not os.path.isfile(ff): + sys.stderr.write( + "WARNING: %s NOT FOUND (should be generated by splitByChromosome3). 
\ + Likely due to no reads found in a targeted chromosome (%s), in turn likely due to currupted bam files\n" + % (ff, chr) + ) + # sys.exit(1) + covFile = [] + else: + covFile = file.readlines(open(inputfile2 + chr + ".txt", "r")) + + for n in range(0, target.numberExon): + if t >= len(covFile): + break + cov = covFile[t].split() + while (int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2])): + if int(cov[1]) > int(starts[n]): + t -= 1 + else: + t += 1 + cov = covFile[t].split() + + while (int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2])): + # print output # + if rn == 1: + prev = target.id + nID = target.id + if nID != prev: + rn = 1 + ref1 = str(rn) + ref2 = str(int(cov[2]) - int(starts[n]) + rn) + outputwrite2(output, chr, cov[3], starts[n], cov[2]) + tmprange = int(cov[2]) - int(starts[n]) + 1 + target_ttl_readdepth += int(cov[3]) * tmprange + # target_length+=tmprange + rn = int(ref2) + prev = nID + # -- # + # get to the next line of inputfile# + t += 1 + cov = covFile[t].split() + starts[n] = cov[1] + + # print output # + if (t == 0) and (t >= len(covFile)): + continue + + if rn == 1: + prev = target.id + nID = target.id + if nID != prev: + rn = 1 + ref1 = str(rn) + ref2 = str(int(ends[n]) - int(starts[n]) + rn) + outputwrite2(output, chr, cov[3], starts[n], ends[n]) + tmprange = int(ends[n]) - int(starts[n]) + 1 + target_ttl_readdepth += int(cov[3]) * tmprange + # target_length+=tmprange + target_length = int(ends[n]) - int(starts_leftmost) + 1 + output_avg.write( + "\t".join( + [ + chr, + starts_leftmost, + ends[n], + str(target_ttl_readdepth / target_length), + str(target_length), + ] + ) + + "\n" + ) + + rn = int(ref2) + prev = nID + # -- # + output.close() + output_avg.close() diff --git a/scripts/convert_targeted_regions.py b/scripts/convert_targeted_regions.py index 209a18c..830f011 100755 --- a/scripts/convert_targeted_regions.py +++ b/scripts/convert_targeted_regions.py @@ -17,55 +17,56 @@ # You should have received a copy of the 
GNU General Public License # along with CONTRA. If not, see . # -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 28 Mar 2011 11:00AM + class Target: - """ + """ Class for target regions """ - - population = 0 - def __init__(self): - self.id = 0 - self.gene = "unknown" - self.chr = "chr1" - self.start = 0 - self.end = 0 - self.numberExon = 0 - self.oriStart = 0 - self.oriEnd = 0 + population = 0 + + def __init__(self): + self.id = 0 + self.gene = "unknown" + self.chr = "chr1" + self.start = 0 + self.end = 0 + self.numberExon = 0 + self.oriStart = 0 + self.oriEnd = 0 -def convertTarget(target): - targets = open(target) - targetList = [] +def convertTarget(target): + targets = open(target) - count = 0 - for region in targets: - region = region.split() - chr = "chr" + region[0].strip("chr") - start = region[1] - end = region[2] - try: - gene = region[3] - except: - gene = "unknown" - count += 1 + targetList = [] - aTarget = Target() - aTarget.id = count - aTarget.gene = gene - aTarget.chr = chr - aTarget.start = start - aTarget.end = end - aTarget.numberExon = 1 - aTarget.oriStart = start - aTarget.oriEnd = end + count = 0 + for region in targets: + region = region.split() + chr = "chr" + region[0].strip("chr") + start = region[1] + end = region[2] + try: + gene = region[3] + except: + gene = "unknown" + count += 1 - targetList.append(aTarget) + aTarget = Target() + aTarget.id = count + aTarget.gene = gene + aTarget.chr = chr + aTarget.start = start + aTarget.end = end + aTarget.numberExon = 1 + aTarget.oriStart = start + aTarget.oriEnd = end + targetList.append(aTarget) - return targetList + return targetList diff --git a/scripts/count_libsize.py b/scripts/count_libsize.py index 618ef38..fa6363e 100755 --- a/scripts/count_libsize.py +++ b/scripts/count_libsize.py @@ -17,22 +17,23 @@ # You should have received a copy of the GNU General Public 
License # along with CONTRA. If not, see . # -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 05 October 2011 16:43PM + def get_libsize(bedgraph_file): - bedgraph = open(bedgraph_file) - libsize = 0 - for line in bedgraph: - line = line.split() - chr = line[0] - start = int(line[1]) - end = int(line[2]) - cov = float(line[3]) - #cov = int(line[3]) - - libsize += (end-start)*cov + bedgraph = open(bedgraph_file) + libsize = 0 + for line in bedgraph: + line = line.split() + chr = line[0] + start = int(line[1]) + end = int(line[2]) + cov = float(line[3]) + # cov = int(line[3]) + + libsize += (end - start) * cov - bedgraph.close() - return int(libsize) + bedgraph.close() + return int(libsize) diff --git a/scripts/get_chr_length.py b/scripts/get_chr_length.py index 35923e3..2ceb204 100755 --- a/scripts/get_chr_length.py +++ b/scripts/get_chr_length.py @@ -17,27 +17,26 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 05 October 2011 16:43PM import subprocess, shlex -def get_genome(srcFile, genomeOut): - genome = open(genomeOut, "w") - args = shlex.split("samtools view -H %s" %(srcFile)) - raw_header = subprocess.Popen(args, stdout = subprocess.PIPE).communicate()[0] - headers = raw_header.split("\n") +def get_genome(srcFile, genomeOut): + genome = open(genomeOut, "w") - for header in headers: - header = header.split("\t") - if header[0][1:] != "SQ": - continue + args = shlex.split("samtools view -H {}".format(srcFile)) + raw_header = subprocess.Popen(args, stdout=subprocess.PIPE).communicate()[0] + headers = str(raw_header).split("\n") - genome.write(header[1].strip("SN:") + "\t" + header[2].strip("LN:") + "\n") + for header in headers: + header = header.split("\t") + if header[0][1:] != "SQ": + continue - genome.close() + genome.write(header[1].strip("SN:") + "\t" + header[2].strip("LN:") + "\n") - + genome.close() diff --git a/scripts/split_chromosome.py b/scripts/split_chromosome.py index 78d9198..176043c 100755 --- a/scripts/split_chromosome.py +++ b/scripts/split_chromosome.py @@ -17,89 +17,90 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 03 Sep 2011 15:00PM import os + def splitByChromosome(destFolder): - try: - os.mkdir(destFolder + "chr/") - except: - print "folder exist" + try: + os.mkdir(destFolder + "chr/") + except: + print("folder exist") + + inputfile = destFolder + "sample.BEDGRAPH" + outputfile = destFolder + "chr/chr1.txt" + file = open(inputfile, "r") + output = open(outputfile, "w") + check = "1" - inputfile = destFolder + "sample.BEDGRAPH" - outputfile = destFolder + "chr/chr1.txt" - file = open(inputfile,"r") - output = open(outputfile,"w") - check = "1" + for row in file: + if row[0] == "#": + continue - for row in file: - if row[0] == '#': - continue + cols = row.split() + chr = cols[0].strip("chr") + if chr != check: + output.close() + check = chr + output = open(destFolder + "chr/chr" + check + ".txt", "w") + output.write(row) - cols = row.split() - chr = cols[0].strip("chr") - if (chr != check): - output.close() - check = chr - output = open(destFolder+ "chr/chr"+check+".txt","w") - output.write(row) + output.close() - output.close() -#JLMod +# JLMod def splitByChromosome3(infile): - destFolder = os.path.dirname(infile)+"/" - - try: - os.mkdir(destFolder + "chr/") - except: - print "folder exist" - - #inputfile = destFolder + "sample.BEDGRAPH" - inputfile=infile - outputfile = destFolder + "chr/chr1.txt" - file = open(inputfile,"r") - output = open(outputfile,"w") - check = "1" - - for row in file: - cols = row.split() - chr = cols[0].strip("chr") - if (chr != check): - output.close() - check = chr - output = open(destFolder+ "chr/chr"+check+".txt","w") - output.write(row) - - output.close() + destFolder = os.path.dirname(infile) + "/" -def splitByChromosome2(folder): + try: + os.mkdir(destFolder + "chr/") + except: + print("folder exist") - try: - os.mkdir(folder + "target/") - except: - print "folder 
exist" + # inputfile = destFolder + "sample.BEDGRAPH" + inputfile = infile + outputfile = destFolder + "chr/chr1.txt" + file = open(inputfile, "r") + output = open(outputfile, "w") + check = "1" - inputfile = folder + "geneRefCoverage.txt" - outputfile = folder + "target/chr1.txt" - file = open(inputfile,"r") - output = open(outputfile,"w") - check = "1" + for row in file: + cols = row.split() + chr = cols[0].strip("chr") + if chr != check: + output.close() + check = chr + output = open(destFolder + "chr/chr" + check + ".txt", "w") + output.write(row) - for row in file: - cols = row.split() - chr = cols[0].strip("chr") - if (chr != check): - output.close() - check = chr - output = open(folder+ "target/chr"+check+".txt","w") - output.write(row) + output.close() - output.close() +def splitByChromosome2(folder): + try: + os.mkdir(folder + "target/") + except: + print("folder exist") + + inputfile = folder + "geneRefCoverage.txt" + outputfile = folder + "target/chr1.txt" + file = open(inputfile, "r") + output = open(outputfile, "w") + check = "1" + + for row in file: + cols = row.split() + chr = cols[0].strip("chr") + if chr != check: + output.close() + check = chr + output = open(folder + "target/chr" + check + ".txt", "w") + output.write(row) + + output.close() diff --git a/scripts/target_breakdown.py b/scripts/target_breakdown.py index c86e373..82b3805 100755 --- a/scripts/target_breakdown.py +++ b/scripts/target_breakdown.py @@ -17,40 +17,42 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 05 October 2011 16:43PM + def target_breakdown(filename, maxRegionSize, targetRegionSize, out_fname): - from math import ceil - - a = open(filename) - output = open(out_fname, "w") - - for x in a: - x = x.split() - chr = x[0] - start = int(x[1]) - end = int(x[2]) - info = "" - if (len(x) > 3): - info = x[3] - - regionSize = end-start - - if regionSize > maxRegionSize: - seg = ceil(regionSize/float(targetRegionSize)) - limit = ceil(regionSize/float(seg)) - - s_end = start - for i in range(int(seg)): - s_start = s_end - s_end = int(start + (limit*(i+1))) - if s_end > end: - s_end = end - output.write("\t".join([chr, str(s_start), str(s_end), info+"(SPLIT)"])+"\n") - else: - output.write("\t".join([chr, str(start), str(end), info])+"\n") - - output.close() - + from math import ceil + + a = open(filename) + output = open(out_fname, "w") + + for x in a: + x = x.split() + chr = x[0] + start = int(x[1]) + end = int(x[2]) + info = "" + if len(x) > 3: + info = x[3] + + regionSize = end - start + + if regionSize > maxRegionSize: + seg = ceil(regionSize / float(targetRegionSize)) + limit = ceil(regionSize / float(seg)) + + s_end = start + for i in range(int(seg)): + s_start = s_end + s_end = int(start + (limit * (i + 1))) + if s_end > end: + s_end = end + output.write( + "\t".join([chr, str(s_start), str(s_end), info + "(SPLIT)"]) + "\n" + ) + else: + output.write("\t".join([chr, str(start), str(end), info]) + "\n") + + output.close() diff --git a/scripts/vcf_out.py b/scripts/vcf_out.py index c0e2706..2ae3fde 100755 --- a/scripts/vcf_out.py +++ b/scripts/vcf_out.py @@ -17,48 +17,50 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . 
# -# -#-----------------------------------------------------------------------# +# +# -----------------------------------------------------------------------# # Last Updated : 09 Apr 2011 11:00AM -def vcf_out(inF, outF): - import math - f = file.readlines(open(inF)) - vcf = open(outF, "w") - - #header - vcf.write("##fileformat=VCFv4.0\n") - vcf.write("##reference=1000GenomesPilot-NCBI36\n") - vcf.write('##INFO=\n") - region = region.split(":") - chr = region[0] +def vcf_out(inF, outF): + import math - adjPVal = float(region[2]) - if adjPVal <= 0: - adjPVal = 0 - else: - adjPVal = -10 * math.log(adjPVal, 10) - adjPVal = str(round(adjPVal,3)) - region[1] = region[1].split("-") - start = region[1][0] - end = region[1][1] - else: - ref = f[count].strip("\n") - vcf.write(chr +"\t"+ start + "\t" + "." + "\t" + ref + "\t") - vcf.write("" + "\t" + adjPVal + "\t" + "PASS" + "\t") - vcf.write("SVTYPE=CNV;END="+ end + "\n") - count += 1 + f = file.readlines(open(inF)) + vcf = open(outF, "w") - vcf.close() - + # header + vcf.write("##fileformat=VCFv4.0\n") + vcf.write("##reference=1000GenomesPilot-NCBI36\n") + vcf.write( + '##INFO=\n") + region = region.split(":") + chr = region[0] + adjPVal = float(region[2]) + if adjPVal <= 0: + adjPVal = 0 + else: + adjPVal = -10 * math.log(adjPVal, 10) + adjPVal = str(round(adjPVal, 3)) + region[1] = region[1].split("-") + start = region[1][0] + end = region[1][1] + else: + ref = f[count].strip("\n") + vcf.write(chr + "\t" + start + "\t" + "." 
+ "\t" + ref + "\t") + vcf.write("" + "\t" + adjPVal + "\t" + "PASS" + "\t") + vcf.write("SVTYPE=CNV;END=" + end + "\n") + count += 1 + vcf.close() From 5f83f557e97c31cc8e482d80b11e4401c2030515 Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 10:58:25 +1100 Subject: [PATCH 2/9] converted the WGCNV parts to python 3 --- WGCNV-NDE/1_renorm.py | 42 +++++---- WGCNV-NDE/2_process.py | 179 ++++++++++++++++++++---------------- WGCNV-NDE/3_wgsummary.R | 50 +++++----- WGCNV-NDE/4_plot.R | 2 +- WGCNV-NDE/wgcnv_wrapper.py | 83 ++++++++--------- WGCNV-SINGLE/wgcnv.py | 78 ++++++++-------- WGCNV-SINGLE/wgcnv_noBAF.py | 78 ++++++++-------- scripts/cn_analysis.v4.R | 15 ++- scripts/large_region_cbs.R | 27 ++---- 9 files changed, 281 insertions(+), 273 deletions(-) diff --git a/WGCNV-NDE/1_renorm.py b/WGCNV-NDE/1_renorm.py index b225a49..376d28b 100755 --- a/WGCNV-NDE/1_renorm.py +++ b/WGCNV-NDE/1_renorm.py @@ -5,7 +5,8 @@ import os import sys -def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files + +def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files f_CONTRA_out = open(path_CONTRA_out, "r") f_out = open(path_out, "w") @@ -16,23 +17,24 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files headers = lines[0] contra = [] for line in lines[1:]: - contra.append(line.split('\t')) + contra.append(line.split("\t")) # For each line in contra: # 0 = region ID, 6 = Mean.of.LogRatio, 7 = Adjusted.Mean.of.LogRatio, 14 = tumour.rd, 15 = normal.rd, 16 = tumour.rd.ori, 17 = normal.rd.ori - + # 2. 
Go through data, calculating x_T, N_T, x_N, N_N # as well as creating list of genes to remove (big gains) - removed_regions = [] # Listed by region ID - remove_threshold = 2 # Arbitrarily chosen - can be adjusted + removed_regions = [] # Listed by region ID + remove_threshold = 2 # Arbitrarily chosen - can be adjusted ##### PROBLEM: due to threshold might only take some regions of a particular gene when in reality, want all of them?? # Does this make a difference? x_T = N_T = x_N = N_N = 0.0 # 4 = OriStCoordinate, 5 = OriEndCoordinate - for line in contra: - k = float(line[5]) - float(line[4]) + 1 # Width / number of bases. Inclusive, so add 1. + k = ( + float(line[5]) - float(line[4]) + 1 + ) # Width / number of bases. Inclusive, so add 1. if float(line[7]) > remove_threshold: removed_regions.append(line[0]) x_T += float(line[16]) * k @@ -43,29 +45,33 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files # print removed_regions # 3. Calculate a_2, b_2 - n = 1 - float(x_N)/N_N - t = 1 - float(x_T)/N_T - a_2 = math.sqrt(n/t) - b_2 = math.sqrt(t/n) # or 1/a_2 - + n = 1 - float(x_N) / N_N + t = 1 - float(x_T) / N_T + a_2 = math.sqrt(n / t) + b_2 = math.sqrt(t / n) # or 1/a_2 # 4. Rescale read depths (and recalculate log ratio??) # Add to new columns # Need to multiply necessary tumour.rd (d_t) by a_2, normal.rd (d_n) by b_2 - headers = "Gene.Sym\tChr\tOriStCoordinate\tOriEndCoordinate\tMean.of.Log.Ratio.readj\n" # fix headers + headers = ( + "Gene.Sym\tChr\tOriStCoordinate\tOriEndCoordinate\tMean.of.Log.Ratio.readj\n" + ) # fix headers f_out.write(headers) for line in contra: temp = [line[2], line[3], line[4], line[5]] # Recalculate Mean.of.Log.Ratio - temp.append(str(math.log((float(line[14]) * a_2) /(float(line[15]) * b_2 ), 2))+'\n') + temp.append( + str(math.log((float(line[14]) * a_2) / (float(line[15]) * b_2), 2)) + "\n" + ) # Write to output - f_out.write('\t'.join(temp)) + f_out.write("\t".join(temp)) # 5. 
Close file f_out.close() + if len(sys.argv) < 3: - print "1_renorm.py: not enough input arguments" + print("1_renorm.py: not enough input arguments") sys.exit(1) contraF = sys.argv[1] @@ -73,5 +79,5 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files renorm_file(contraF, outF) -if sys.argv[3] == 'T': - print "1_renorm.py: created "+outF +if sys.argv[3] == "T": + print(f"1_renorm.py: created {outF}") diff --git a/WGCNV-NDE/2_process.py b/WGCNV-NDE/2_process.py index 708a980..873f318 100755 --- a/WGCNV-NDE/2_process.py +++ b/WGCNV-NDE/2_process.py @@ -4,7 +4,7 @@ import copy import sys -#----------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------------- # Based on the 27/08 version (Ca2015_Histograms/renorm.py) # Modified to just be run on one file # More descriptive for the exon level (coordinates in columns) @@ -12,7 +12,7 @@ # 17/09 - removed "Name" column # - edited _exon.txt output to use matched/"right" coordinates # 28/10 - no longer outputs NA -#----------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------------- # Global dictionary - {: { : [List of log ratios]}} # where Exon = startcoord_endcoord (Should be chromosome_startcoord_endcoord but thats not in renorm) @@ -20,38 +20,38 @@ # Error file (write out errors here) # JKS WHO NEEDS ERRORS -#error_out = "ALL_get_exons_err.txt" -#if testing_flag: +# error_out = "ALL_get_exons_err.txt" +# if testing_flag: # error_out = "TEST_get_exons_err.txt" -#errors = open(error_out, 'w') +# errors = open(error_out, 'w') if len(sys.argv) < 4: - print "2_process.py: not enough input arguments" + print("2_process.py: not enough input arguments") sys.exit(1) filename = sys.argv[1] wholegene_output = sys.argv[2] exon_output = sys.argv[3] -debug='F' +debug = "F" if len(sys.argv) == 6: - 
debug=sys.argv[5] + debug = sys.argv[5] # Create gdict entries based on myTSCA.bed.annotated.txt # Format: chromosome startcoord endcoord gene if len(sys.argv) >= 5: - myTSCA_in = sys.argv[4] + myTSCA_in = sys.argv[4] else: - myTSCA_in = "/mnt/Storage/NextGenSequencing/Data/Yeap/myTSCA.bed.annotated.txt" + myTSCA_in = "/mnt/Storage/NextGenSequencing/Data/Yeap/myTSCA.bed.annotated.txt" -myTSCA = open(myTSCA_in, 'r') +myTSCA = open(myTSCA_in, "r") for line in myTSCA.readlines(): - entries = line.split('\t') - entries[3] = entries[3].rstrip('\n') + entries = line.split("\t") + entries[3] = entries[3].rstrip("\n") if entries[3] not in gdict: gdict[entries[3]] = {} -# gdict[entries[3]]['_'.join(entries[1:3])] = [] - gdict[entries[3]][entries[0]+":"+'_'.join(entries[1:3])] = [] + # gdict[entries[3]]['_'.join(entries[1:3])] = [] + gdict[entries[3]][entries[0] + ":" + "_".join(entries[1:3])] = [] myTSCA.close() # Helper function @@ -61,6 +61,7 @@ def calculate_average(numblist): total += float(item) return total / float(len(numblist)) + # Helper function - list may contain "NA" entries def calculate_average_withmissing(numblist): total = 0.0 @@ -74,15 +75,16 @@ def calculate_average_withmissing(numblist): else: return "NA" + # Given a CONTRA output file, generate a list of exons with average log ratio per exon. 
def process_file(f_in): - GENE_IND=0 - CHR_IND=1 - START_IND=2 - END_IND=3 - LR_IND=4 + GENE_IND = 0 + CHR_IND = 1 + START_IND = 2 + END_IND = 3 + LR_IND = 4 - f = open(f_in, 'r') + f = open(f_in, "r") # 0 - Gene Sym # 1 - OriStCoord, 2 - OriEndCoord # 3 - Mean.of.LogRatio (renormalised) @@ -94,20 +96,25 @@ def process_file(f_in): # Temp list of all genes in gdict all_genes = gdict.keys() - + # Preprocessing contra = [] for line in f.readlines()[1:]: - newline = line.split('\t') - name = newline[GENE_IND].split('(')[0] + newline = line.split("\t") + name = newline[GENE_IND].split("(")[0] newline[GENE_IND] = name - newline[-1] = newline[-1].rstrip('\n') + newline[-1] = newline[-1].rstrip("\n") contra.append(newline) f.close() currgene = contra[0][GENE_IND] - temp = [contra[0][START_IND], contra[0][END_IND], [contra[0][LR_IND]], contra[0][CHR_IND]] # start end list of averages + temp = [ + contra[0][START_IND], + contra[0][END_IND], + [contra[0][LR_IND]], + contra[0][CHR_IND], + ] # start end list of averages exons[currgene] = {} exon_n = 1 for line in contra[1:]: @@ -118,14 +125,14 @@ def process_file(f_in): exons[currgene][exon_n] = temp currgene = line[GENE_IND] temp = [line[START_IND], line[END_IND], [line[LR_IND]], line[CHR_IND]] - if currgene not in exons: # if exon data gets broken up... + if currgene not in exons: # if exon data gets broken up... exon_n = 1 exons[currgene] = {} else: exon_n = len(exons[currgene].keys()) + 1 - + # If its a new exon... - elif line[START_IND] != temp[1]: # ie coordinates don't match up + elif line[START_IND] != temp[1]: # ie coordinates don't match up # Add old to exons dictionary, after calculating average log ratio over the exon. 
temp[2] = calculate_average(temp[2]) exons[currgene][exon_n] = temp @@ -146,8 +153,8 @@ def process_file(f_in): for exon in exons[gene].keys(): start = int(exons[gene][exon][0]) end = int(exons[gene][exon][1]) - ch = exons[gene][exon][3].replace("chr","") - exon_name = ch+":"+str(start)+'_'+str(end) + ch = exons[gene][exon][3].replace("chr", "") + exon_name = ch + ":" + str(start) + "_" + str(end) if exon_name in exon_list: gdict[gene][exon_name].append(exons[gene][exon][2]) exon_list.remove(exon_name) @@ -155,88 +162,98 @@ def process_file(f_in): # Attempt to match it to something in exon_list: up to 20 bases missing from each side is OK matched = False for possible_match in exon_list: - components = possible_match.split(":")[1].split('_') + components = possible_match.split(":")[1].split("_") possible_start = int(components[0]) possible_end = int(components[1]) start_diff = start - possible_start end_diff = possible_end - end - if start_diff < 21 and end_diff < 21 and start_diff >= 0 and end_diff >= 0: + if ( + start_diff < 21 + and end_diff < 21 + and start_diff >= 0 + and end_diff >= 0 + ): # Can match these two together - #errors.write('\t'+gene+'_'+possible_match+" is matched with incomplete "+exon_name+'\n') + # errors.write('\t'+gene+'_'+possible_match+" is matched with incomplete "+exon_name+'\n') gdict[gene][possible_match].append(exons[gene][exon][2]) exon_list.remove(possible_match) matched = True # Couldn't find a match for it: output error - #errors.write(gene+'_'+exon_name+" could not be found\n") + # errors.write(gene+'_'+exon_name+" could not be found\n") # Take care of missing data: remaining exons in exonlist for missing_exon in exon_list: gdict[gene][missing_exon].append("NA") - #errors.write(gene+'_'+missing_exon+" marked as missing\n") + # errors.write(gene+'_'+missing_exon+" marked as missing\n") all_genes.remove(gene) for gene in all_genes: # These genes had nothing in the input data. So, add NA to each exon of the gene. 
for exon in gdict[gene]: gdict[gene][exon].append("NA") - #errors.write(gene+" not found in input data\n") - + # errors.write(gene+" not found in input data\n") + # print 'Finished '+f_in[:9] + process_file(filename) +# Write it out to a file? +# if writeout: +# f2 = open(exon_output, 'w') +# headers = ['Gene', 'Start Coord', 'End Coord', 'Mean Log Ratio\n'] +# f2.write('\t'.join(headers)) +# for gene in sorted(exons.keys()): +# for exon in sorted(exons[gene].keys()): +# f2.write('\t'.join([gene, str(exons[gene][exon][0]), str(exons[gene][exon][1]), str(exons[gene][exon][2])+'\n'])) - # Write it out to a file? - #if writeout: - # f2 = open(exon_output, 'w') - # headers = ['Gene', 'Start Coord', 'End Coord', 'Mean Log Ratio\n'] - # f2.write('\t'.join(headers)) - #for gene in sorted(exons.keys()): - # for exon in sorted(exons[gene].keys()): - # f2.write('\t'.join([gene, str(exons[gene][exon][0]), str(exons[gene][exon][1]), str(exons[gene][exon][2])+'\n'])) +# f2.close() - #f2.close() - # write out gdict... -#output_name = "ALL_exons.txt" -#out = open(output_name, 'w') +# output_name = "ALL_exons.txt" +# out = open(output_name, 'w') # Write headers -#out.write("Exon Name\t"+'\t'.join(good_filenames)+'\n') # dont have bad headers! -#for gene in sorted(gdict.keys()): +# out.write("Exon Name\t"+'\t'.join(good_filenames)+'\n') # dont have bad headers! 
+# for gene in sorted(gdict.keys()): # for exon in sorted(gdict[gene].keys()): # out.write(gene+'_'+str(exon)) # for ratio in gdict[gene][exon]: # out.write('\t'+str(ratio)) - # out.write('\n') +# out.write('\n') -#out.close() -#print "Finished processing out to "+output_name +# out.close() +# print "Finished processing out to "+output_name -f2 = open(exon_output, 'w') -headers = ['Gene','Chr', 'Start Coord', 'End Coord', 'Mean Log Ratio\n'] -f2.write('\t'.join(headers)) +f2 = open(exon_output, "w") +headers = ["Gene", "Chr", "Start Coord", "End Coord", "Mean Log Ratio\n"] +f2.write("\t".join(headers)) for gene in sorted(gdict.keys()): for exon in sorted(gdict[gene].keys()): log_ratio = str(gdict[gene][exon][0]) - if (log_ratio != "NA"): - ch,coord=exon.split(":") - out = [gene, ch, coord.split('_')[0], coord.split('_')[1], str(gdict[gene][exon][0])] - f2.write('\t'.join(out)) - f2.write('\n') + if log_ratio != "NA": + ch, coord = exon.split(":") + out = [ + gene, + ch, + coord.split("_")[0], + coord.split("_")[1], + str(gdict[gene][exon][0]), + ] + f2.write("\t".join(out)) + f2.write("\n") f2.close() - # Duplicate gdict, just in case... 
gdict2 = copy.deepcopy(gdict) for gene in gdict2.keys(): - if ';' in gene: # If gene needs to be split - #print 'splitting',gene - components = gene.split(';') + if ";" in gene: # If gene needs to be split + # print 'splitting',gene + components = gene.split(";") for component_gene in components: if component_gene not in gdict2: gdict2[component_gene] = {} @@ -245,25 +262,27 @@ def process_file(f_in): gdict2[components[1]][exon] = gdict[gene][exon][:] del gdict2[gene] -wholegene = open(wholegene_output, 'w') +wholegene = open(wholegene_output, "w") # write headers wholegene.write("Gene\tRatio\n") for gene in sorted(gdict2.keys()): sample_temp = [] - for exon in gdict2[gene].keys(): # For each exon (in that gene): - #print len(gdict[gene][exon]) - #print "i = "+str(i)+" len(gdict[gene][exon])"+ str(len(gdict[gene][exon])) - sample_temp.append(gdict2[gene][exon][0]) # collect the log ratio of that exon of that sample (of that gene) + for exon in gdict2[gene].keys(): # For each exon (in that gene): + # print len(gdict[gene][exon]) + # print "i = "+str(i)+" len(gdict[gene][exon])"+ str(len(gdict[gene][exon])) + sample_temp.append( + gdict2[gene][exon][0] + ) # collect the log ratio of that exon of that sample (of that gene) log_ratio = calculate_average_withmissing(sample_temp) if log_ratio != "NA": wholegene.write(gene) - wholegene.write('\t'+str(log_ratio)) # whole gene log ratio = average of log ratios across all exons of that gene - wholegene.write('\n') + wholegene.write( + "\t" + str(log_ratio) + ) # whole gene log ratio = average of log ratios across all exons of that gene + wholegene.write("\n") wholegene.close() -#if sys.argv[4] == 'T': -if debug == 'T': - print "2_process.py: created "+wholegene_output - print "2_process.py: created "+exon_output - - +# if sys.argv[4] == 'T': +if debug == "T": + print(f"2_process.py: created {wholegene_output}") + print(f"2_process.py: created {exon_output}") diff --git a/WGCNV-NDE/3_wgsummary.R b/WGCNV-NDE/3_wgsummary.R 
index 794807d..5be5491 100644 --- a/WGCNV-NDE/3_wgsummary.R +++ b/WGCNV-NDE/3_wgsummary.R @@ -5,7 +5,7 @@ # command line input based on Jason Li's wgcnv_withBAFplot.R x=commandArgs() -args=x[match("--args",x)+1:length(x)] # gets the arguments after --args...? +args=x[match("--args",x)+1:length(x)] # gets the arguments after --args...? scriptPath = args[1] f_out = args[4] @@ -55,7 +55,7 @@ mu = 0.05 getExonStat = function(exon, ex_av) { ex_all_av = data_thresh_exon[exon,]$Cropped.Mean ex_all_var = data_thresh_exon[exon,]$Cropped.Var - + # Calculate the pval. If log ratio below average use lower tail, otherwise use upper # print(exon) # print(ex_av) @@ -66,7 +66,7 @@ getExonStat = function(exon, ex_av) { tail = FALSE } ex_pval = pnorm(ex_av, ex_all_av, sqrt(ex_all_var), lower.tail = tail) - + ex_gainloss = "." # Is it a gain or a loss? Using significance level of mu if (ex_pval < mu) { @@ -77,10 +77,10 @@ getExonStat = function(exon, ex_av) { ex_gainloss = "gain" } } - - ex_outdf = data.frame(Exon = exon, Av.LR = ex_av, Overall.Av.LR = ex_all_av, Overall.Var.LR = ex_all_var, + + ex_outdf = data.frame(Exon = exon, Av.LR = ex_av, Overall.Av.LR = ex_all_av, Overall.Var.LR = ex_all_var, pval = ex_pval, GainLoss = ex_gainloss) - + return(ex_outdf) } @@ -103,7 +103,7 @@ getGeneStat = function(gene) { tail = FALSE } pval = pnorm(av, all_av, sqrt(all_var), lower.tail = tail) - + gainloss = "." # Is it a gain or a loss? Using significance level of mu if (pval < mu) { @@ -114,12 +114,12 @@ getGeneStat = function(gene) { gainloss = "gain" } } - - + + # Now make it into a data frame and return it outdf = data.frame(Gene = gene, Av.LR = av, Overall.Av.LR = all_av, Overall.Var.LR = all_var, pval = pval, GainLoss = gainloss) - + # Technically don't have to do this, but do it anyway return (outdf) } @@ -137,13 +137,13 @@ for (i in 1:length(exons)) { exon = paste(ex_row$Gene, "_",ex_row$Chr,":",ex_row$Start.Coord, "_",ex_row$End.Coord, sep = "") # print(exon) - + if ( ! 
is.na( data_thresh_exon[exon,"Cropped.Var"]) ) { # Get the data ex_tmpdf = getExonStat(exon, ex_row$Mean.Log.Ratio) # Want to have column names and not append (ie overwrite) for the first gene # But for the rest, want to have no column names (already have!) and append (so we dont lose data) - write.table(ex_tmpdf, f_ex_out, col.names = !app, append = app, sep = "\t", + write.table(ex_tmpdf, f_ex_out, col.names = !app, append = app, sep = "\t", row.names = FALSE, quote = FALSE) app = TRUE } @@ -153,7 +153,7 @@ for (i in 1:length(exons)) { # Now, we want to adjust the pvalues and then overwrite the table with new data ex_outdf = read.delim(f_ex_out, as.is = TRUE) ex_outdf$adj.pval = p.adjust(ex_outdf$pval, method = "BH") -write.table(ex_outdf, f_ex_out, col.names = TRUE, append = FALSE, sep = "\t", +write.table(ex_outdf, f_ex_out, col.names = TRUE, append = FALSE, sep = "\t", row.names = FALSE, quote = FALSE) # Then go calculate the adj.GainLoss ex_outdf = read.delim(f_ex_out, as.is = TRUE) @@ -167,7 +167,7 @@ for (i in 1:length(row.names(ex_outdf))) { } } } -write.table(ex_outdf, f_ex_out, col.names = TRUE, append = FALSE, sep = "\t", +write.table(ex_outdf, f_ex_out, col.names = TRUE, append = FALSE, sep = "\t", row.names = FALSE, quote = FALSE) @@ -183,13 +183,13 @@ for (i in 1:length(genes)) { # Check gene actually exists #print(is.na(data_wg[gene,])) if(is.na(data_wg[gene,]) == FALSE) { - + if (! is.na(data_thresh_wg[gene,]$Cropped.Var) ) { # Get the data tmpdf = getGeneStat(gene) # Want to have column names and not append (ie overwrite) for the first gene # But for the rest, want to have no column names (already have!) 
and append (so we dont lose data) - write.table(tmpdf, f_out, col.names = !app, append = app, sep = "\t", + write.table(tmpdf, f_out, col.names = !app, append = app, sep = "\t", row.names = FALSE, quote = FALSE) app = TRUE } @@ -199,7 +199,7 @@ for (i in 1:length(genes)) { # Now, we want to adjust the pvalues and then overwrite the table with new data outdf = read.delim(f_out, as.is = TRUE) outdf$adj.pval = p.adjust(outdf$pval, method = "BH") -write.table(outdf, f_out, col.names = TRUE, append = FALSE, sep = "\t", +write.table(outdf, f_out, col.names = TRUE, append = FALSE, sep = "\t", row.names = FALSE, quote = FALSE) # Now calculate the exon level information for each gene @@ -216,15 +216,15 @@ for (i in 1:length(genes)) { gene.exons.sig.gain.adj = 0 gene.exons.sig.loss.adj = 0 gene.n.exons = 0 - + print (paste("Processing gene ",i,"out of",length(genes))) - + # Go through each exon in f_ex_out for (j in 1:length(exons)) { # print(genes[i]) - + tmp_ex_name = ex_outdf[j,]$Exon - #print(tmp_ex_name) + #print(tmp_ex_name) if (!is.na(tmp_ex_name)) { # why is this even a problem?? 
if (tmp_ex_name == genes[i] || grepl(genes[i], tmp_ex_name)) { # An exon of the gene @@ -243,13 +243,13 @@ print (paste("Processing gene ",i,"out of",length(genes))) } } - + outdf[i,]$exons.sig.gain = gene.exons.sig.gain outdf[i,]$exons.sig.loss = gene.exons.sig.loss outdf[i,]$exons.sig.gain.adj = gene.exons.sig.gain.adj outdf[i,]$exons.sig.loss.adj = gene.exons.sig.loss.adj outdf[i,]$n.exons = gene.n.exons - + # adj.GainLoss stuff here if (outdf[i,]$adj.pval < mu) { if (outdf[i,]$Av.LR < outdf[i,]$Overall.Av.LR) { @@ -258,10 +258,10 @@ print (paste("Processing gene ",i,"out of",length(genes))) outdf[i,]$adj.GainLoss = "gain" } } - + } -write.table(outdf, f_out, col.names = TRUE, append = FALSE, sep = "\t", +write.table(outdf, f_out, col.names = TRUE, append = FALSE, sep = "\t", row.names = FALSE, quote = FALSE) diff --git a/WGCNV-NDE/4_plot.R b/WGCNV-NDE/4_plot.R index 5e31561..78f3171 100644 --- a/WGCNV-NDE/4_plot.R +++ b/WGCNV-NDE/4_plot.R @@ -4,7 +4,7 @@ #g.dat=read.delim("1003-Tumour_wgSummary.txt",as.is=T) x=commandArgs() -args=x[match("--args",x)+1:length(x)] +args=x[match("--args",x)+1:length(x)] #ex.f="14K0038_exSummary.txt" #g.f="14K0038_wgSummary.txt" diff --git a/WGCNV-NDE/wgcnv_wrapper.py b/WGCNV-NDE/wgcnv_wrapper.py index abaefa4..23e3978 100755 --- a/WGCNV-NDE/wgcnv_wrapper.py +++ b/WGCNV-NDE/wgcnv_wrapper.py @@ -9,44 +9,45 @@ import sys import os -USAGE = "python %s [debug output T/F] [human|mouse]" %sys.argv[0] + +USAGE = f"python {sys.argv[0]} [debug output T/F] [human|mouse]" # If not enough arguments, print how to use the file if len(sys.argv) < 5: - print USAGE + print(USAGE) sys.exit(1) if len(sys.argv) < 7: - debug = 'F' + debug = "F" else: debug = sys.argv[6] if len(sys.argv) < 8: - species="human" + species = "human" else: - species=sys.argv[7] + species = sys.argv[7] -bedAnnotatedTxt=sys.argv[5] +bedAnnotatedTxt = sys.argv[5] # Grab the sample path and name from command line input sampPath = sys.argv[1] sampName = 
os.path.basename(sampPath) -scriptPath=os.path.realpath(os.path.dirname(sys.argv[0])) +scriptPath = os.path.realpath(os.path.dirname(sys.argv[0])) -contraF=None +contraF = None # Attempt to find the appropriate CONTRA table file -for root,dirs,files in os.walk(sampPath): +for root, dirs, files in os.walk(sampPath): # find the table folder - if os.path.basename(root)=="table": + if os.path.basename(root) == "table": for f in files: if f.endswith("bins.txt"): if contraF is not None: - print "WARNING: multiple contra output files detected." + print("WARNING: multiple contra output files detected.") break - contraF=os.path.join(root,f) + contraF = os.path.join(root, f) if contraF: break # If there isnt a CONTRA file, exit if contraF == None: - print "ERROR: Could not find CONTRA output file." + print("ERROR: Could not find CONTRA output file.") sys.exit(1) # Create output directory if needed @@ -55,28 +56,27 @@ os.makedirs(outPath) - # 1. RENORM.PY # makes _renorm.txt # This may not be necessary, depending if renorm already exists... <- i guess just remake it to ensure no tampering? # Perhaps come back and tidy this up later? renorm_out = os.path.normpath(os.path.join(outPath, sampName + "_renorm.txt")) -renorm_cmd = "python %s/1_renorm.py %s %s %s" % (scriptPath, contraF, renorm_out, debug) -if debug == 'T': - print "............" - print "Step 1: 1_renorm.py" - print renorm_cmd +renorm_cmd = f"python {scriptPath}/1_renorm.py {contraF} {renorm_out} {debug}" +if debug == "T": + print("............") + print("Step 1: 1_renorm.py") + print(renorm_cmd) os.system(renorm_cmd) # 2. PROCESS.PY # makes _exon.txt, _wg.txt process_exon_out = os.path.normpath(os.path.join(outPath, sampName + "_exon.txt")) process_wg_out = os.path.normpath(os.path.join(outPath, sampName + "_wg.txt")) -process_cmd = "python %s/2_process.py %s %s %s %s %s" % (scriptPath, renorm_out, process_wg_out, process_exon_out, bedAnnotatedTxt, debug) -if debug == 'T': - print "............" 
- print "Step 2: 2_process.py" - print process_cmd +process_cmd = f"python {scriptPath}/2_process.py {renorm_out} {process_wg_out} {process_exon_out} {bedAnnotatedTxt} {debug}" +if debug == "T": + print("............") + print("Step 2: 2_process.py") + print(process_cmd) os.system(process_cmd) # 3. WGSUMMARY.R @@ -85,34 +85,31 @@ ex_summary_out = os.path.normpath(os.path.join(outPath, sampName + "_exSummary.txt")) thresh_wg = sys.argv[3] thresh_exon = sys.argv[4] -summary_cmd = "Rscript %s %s %s %s %s %s %s %s %s" % (os.path.join(scriptPath, "3_wgsummary.R"), \ - scriptPath, process_wg_out, process_exon_out, summary_out, ex_summary_out, - thresh_wg, thresh_exon, debug) -if debug == 'T': - print "............" - print "Step 3: 3_wgsummary.R" - print summary_cmd +summary_cmd = f"Rscript {scriptPath}/3_wgsummary.R {scriptPath} {process_wg_out} {process_exon_out} {summary_out} {ex_summary_out} {thresh_wg} {thresh_exon} {debug}" + +if debug == "T": + print("............") + print("Step 3: 3_wgsummary.R") + print(summary_cmd) os.system(summary_cmd) -print "Finished: created %s %s" % (summary_out, ex_summary_out) +print(f"Finished: created {summary_out} {ex_summary_out}") # 4. PLOT.R -outf = os.path.normpath(os.path.join(outPath,sampName+"_plot.pdf")) +outf = os.path.normpath(os.path.join(outPath, sampName + "_plot.pdf")) if species == "human": - species_f = os.path.join(scriptPath,"hg19_chrlen.txt") + species_f = os.path.join(scriptPath, "hg19_chrlen.txt") else: - raise Exception("species not implemented: "+species) + raise Exception("species not implemented: " + species) -plotScript = os.path.join(scriptPath,"4_plot.R") -plot_cmd = "Rscript %s %s %s %s %s %s %s" % (plotScript, ex_summary_out,summary_out,sampName,species,species_f,outf) +plotScript = os.path.join(scriptPath, "4_plot.R") +plot_cmd = f"Rscript {plotScript} {ex_summary_out} {summary_out} {sampName} {species} {species_f} {outf}" -print "............" 
-print "Step 4: 4_plot.R" -print plot_cmd +print("............") +print("Step 4: 4_plot.R") +print(plot_cmd) os.system(plot_cmd) -print "Finished: created %s" % outf - - +print("Finished: created {outf}") diff --git a/WGCNV-SINGLE/wgcnv.py b/WGCNV-SINGLE/wgcnv.py index 6f5edf6..6f819a0 100755 --- a/WGCNV-SINGLE/wgcnv.py +++ b/WGCNV-SINGLE/wgcnv.py @@ -2,68 +2,70 @@ import sys import os -USAGE="%s [outDir] [outPrefix]" % sys.argv[0] + +USAGE = ( + f"{sys.argv[0]} [outDir] [outPrefix]" +) if len(sys.argv) < 4: - print USAGE - sys.exit(1) + print(USAGE) + sys.exit(1) -sampPath=sys.argv[1] -samp=os.path.basename(sampPath) +sampPath = sys.argv[1] +samp = os.path.basename(sampPath) -if sys.argv[2]=="human": - chrlen_f="hg19_chrlen.txt" -elif sys.argv[2]=="mouse": - chrlen_f="mm9_chrlen.txt" +if sys.argv[2] == "human": + chrlen_f = "hg19_chrlen.txt" +elif sys.argv[2] == "mouse": + chrlen_f = "mm9_chrlen.txt" else: - print "Invalid input - %s" % sys.argv[2] + print(f"Invalid input - {sys.argv[2]}") variantF = sys.argv[3] if len(sys.argv) > 4: - outPath=sys.argv[4] + outPath = sys.argv[4] else: - outPath="." + outPath = "." if len(sys.argv) > 5: - outPrefix=sys.argv[5] + outPrefix = sys.argv[5] else: - outPrefix=samp + outPrefix = samp + +scriptPath = os.path.realpath(os.path.dirname(sys.argv[0])) +rscript = os.path.join(scriptPath, "wgcnv_withBAFplot.R") -scriptPath=os.path.realpath(os.path.dirname(sys.argv[0])) -rscript=os.path.join(scriptPath,"wgcnv_withBAFplot.R") +contraF = None +for root, dirs, files in os.walk(sampPath): + if os.path.basename(root) == "table": + for f in files: + if f.endswith("bins.txt"): + if contraF is not None: + print("WARNING: multiple contra output files are detected.") + break + contraF = os.path.join(root, f) -contraF=None -for root,dirs,files in os.walk(sampPath): - if os.path.basename(root)=="table": - for f in files: - if f.endswith("bins.txt"): - if contraF is not None: - print "WARNING: multiple contra output files is detected." 
- break - contraF=os.path.join(root,f) - - if contraF: - break + if contraF: + break -print contraF +print(contraF) -#variantF = os.path.join(sampPath,samp+"_Ensembl_annotated.tsv") +# variantF = os.path.join(sampPath,samp+"_Ensembl_annotated.tsv") if not os.path.isfile(variantF): - raise Exception("Variant file not found %s" % variantF) + raise Exception(f"Variant file not found {variantF}") if not os.path.exists(outPath): - os.makedirs(outPath) + os.makedirs(outPath) -outp=os.path.join(outPath,outPrefix) -cmd="Rscript %s %s %s %s %s" % (rscript,contraF,variantF,chrlen_f,outp) -print cmd +outp = os.path.join(outPath, outPrefix) +cmd = f"Rscript {rscript} {contraF} {variantF} {chrlen_f} {outp}" +print(cmd) os.system(cmd) -cmd2="convert %s*byChr*png %s_byChr.pdf" % (outp,outp) +cmd2 = f"convert {outp}*byChr*png {outp}_byChr.pdf" os.system(cmd2) -os.system("rm %s*byChr*png" %outp) - +os.system(f"rm {outp}*byChr*png") diff --git a/WGCNV-SINGLE/wgcnv_noBAF.py b/WGCNV-SINGLE/wgcnv_noBAF.py index b626693..d8b97b0 100755 --- a/WGCNV-SINGLE/wgcnv_noBAF.py +++ b/WGCNV-SINGLE/wgcnv_noBAF.py @@ -2,66 +2,66 @@ import sys import os -USAGE="%s [outDir] [outPrefix]" % sys.argv[0] + +USAGE = f"{sys.argv[0]} [outDir] [outPrefix]" if len(sys.argv) < 3: - print USAGE - sys.exit(1) + print(USAGE) + sys.exit(1) -sampPath=sys.argv[1] -samp=os.path.basename(sampPath) +sampPath = sys.argv[1] +samp = os.path.basename(sampPath) -if sys.argv[2]=="human": - chrlen_f="hg19_chrlen.txt" -elif sys.argv[2]=="mouse": - chrlen_f="mm9_chrlen.txt" +if sys.argv[2] == "human": + chrlen_f = "hg19_chrlen.txt" +elif sys.argv[2] == "mouse": + chrlen_f = "mm9_chrlen.txt" else: - print "Invalid input - %s" % sys.argv[2] + print(f"Invalid input - {sys.argv[2]}") if len(sys.argv) > 3: - outPath=sys.argv[3] + outPath = sys.argv[3] else: - outPath="." + outPath = "." 
if len(sys.argv) > 4: - outPrefix=sys.argv[4] + outPrefix = sys.argv[4] else: - outPrefix=samp + outPrefix = samp + +scriptPath = os.path.realpath(os.path.dirname(sys.argv[0])) +rscript = os.path.join(scriptPath, "wgcnv_withBAFplot.R") -scriptPath=os.path.realpath(os.path.dirname(sys.argv[0])) -rscript=os.path.join(scriptPath,"wgcnv_withBAFplot.R") +contraF = None +for root, dirs, files in os.walk(sampPath): + if os.path.basename(root) == "table": + for f in files: + if f.endswith("bins.txt"): + if contraF is not None: + print("WARNING: multiple contra output files is detected.") + break + contraF = os.path.join(root, f) -contraF=None -for root,dirs,files in os.walk(sampPath): - if os.path.basename(root)=="table": - for f in files: - if f.endswith("bins.txt"): - if contraF is not None: - print "WARNING: multiple contra output files is detected." - break - contraF=os.path.join(root,f) - - if contraF: - break + if contraF: + break -print contraF +print(contraF) -variantF = "NA"#os.path.join(sampPath,samp+"_Ensembl_annotated.tsv") -#if not os.path.isfile(variantF): -# raise Exception("Variant file not found %s" % variantF) +variantF = "NA" # os.path.join(sampPath,samp+"_Ensembl_annotated.tsv") +# if not os.path.isfile(variantF): +# raise Exception("Variant file not found %s" % variantF) if not os.path.exists(outPath): - os.makedirs(outPath) + os.makedirs(outPath) -outp=os.path.join(outPath,outPrefix) -cmd="Rscript %s %s %s %s %s" % (rscript,contraF,variantF,chrlen_f,outp) -print cmd +outp = os.path.join(outPath, outPrefix) +cmd = f"Rscript {rscript} {contraF} {variantF} {chrlen_f} {outp}" +print(cmd) os.system(cmd) -cmd2="convert %s*byChr*png %s_byChr.pdf" % (outp,outp) +cmd2 = f"convert {outp}*byChr*png {outp}_byChr.pdf" os.system(cmd2) -os.system("rm %s*byChr*png" %outp) - +os.system(f"rm {outp}*byChr*png") diff --git a/scripts/cn_analysis.v4.R b/scripts/cn_analysis.v4.R index 14db147..c99d714 100755 --- a/scripts/cn_analysis.v4.R +++ b/scripts/cn_analysis.v4.R @@ 
-17,7 +17,7 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . # -# +# #-----------------------------------------------------------------------# # Last Updated : 30 Sept 2011 17:00PM @@ -57,7 +57,7 @@ cn.boundary= read.delim(boundariesFile,as.is=F, header=F) cn.average.aboveTs = cn.average[cn.average$V3>min.bases,] cn.average.list = as.matrix(cn.average.aboveTs$V4) -# Get the mean and sd for each bins +# Get the mean and sd for each bins cn.average.mean = c() cn.average.sd = c() cn.average.log= c() @@ -87,7 +87,7 @@ if (plotoption == "True"){ dev.off() } -# Put the data's details into matrices +# Put the data's details into matrices ids = as.matrix(cn.average.aboveTs$V1) exons = as.matrix(cn.average.aboveTs$V6) exons.pos = as.matrix(cn.average.aboveTs$V5) @@ -162,7 +162,7 @@ fit.sd.fn <- function(x, fit.a, fit.b){ result = 2 ^ (fit.mean.fn(x, fit.a, fit.b)) return (result) } - + # Get the P Values, called the gain/loss # with average and sd from each bins pVal.list = c() @@ -176,7 +176,7 @@ for (i in 1:nrow(cn.average.list)){ logcov = logcov.mean[i] exon.bin = Bin[i] - if (length(logratios.sd) > 1){ + if (length(logratios.sd) > 1){ pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), fit.sd.fn(logcov, fit.sd.a, fit.sd.b)) } else { pVal <- pnorm(logratio, 0, logratios.sd[exon.bin]) @@ -213,7 +213,7 @@ write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T) #Plotting SD #a.sd.fn = rep(fit.sd.a, length(logratios.sd.ori)) -#b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori)) +#b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori)) #sd.after.fit = fit.sd.fn(logcov.bins.mean.ori, fit.sd.a, fit.sd.b) #sd.out.f = paste(outf, "/plot/", sample.name, "sd.data_fit.", bins, "bins.txt", sep="") #sd.outdf = data.frame(SD.Before.Fit = logratios.sd.ori, Log.Coverage = logcov.bins.mean.ori, SD.After.Fit = sd.after.fit, a.for.fitting=a.sd.fn, b.for.fitting=b.sd.fn) @@ -223,6 +223,3 @@ 
write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T) #End of the script print ("End of cn_analysis.R") print (i) - - - diff --git a/scripts/large_region_cbs.R b/scripts/large_region_cbs.R index 64c9797..476b859 100755 --- a/scripts/large_region_cbs.R +++ b/scripts/large_region_cbs.R @@ -17,7 +17,7 @@ # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see . # -# +# #-----------------------------------------------------------------------# # Last Updated : 12 October 2011 16:43PM @@ -73,7 +73,7 @@ for (i in kmax.range){ cna.start.coord = dat[dat$Targeted.Region.ID==(cna.start.id),]$OriStCoordinate cna.end.coord = dat[dat$Targeted.Region.ID==(cna.end.id), ]$OriEndCoordinate - dat.inrange = dat[dat$Targeted.Region.ID<=cna.end.id&dat$Targeted.Region.ID>=cna.start.id,] + dat.inrange = dat[dat$Targeted.Region.ID<=cna.end.id&dat$Targeted.Region.ID>=cna.start.id,] cna.segment.logratios = dat.inrange$Adjusted.Mean.of.LogRatio cna.segment.pvalues = dat.inrange$Adjusted.P.Value segment.pvals.above = dat.inrange[dat.inrange$Adjusted.P.Value<=param.pval.cutoff,]$Adjusted.P.Value @@ -97,7 +97,7 @@ for (i in kmax.range){ cna.segment.calls = c(cna.segment.calls, "CNV") } } - } + } outdf = data.frame(Chr=cna.segment.out$chrom, Target.Start=cna.segment.out$loc.start, Target.End=cna.segment.out$loc.end, NumberOfTargets=cna.segment.out$num.mark, OriStCoordinate=cna.segment.start, OriEndCoordinate=cna.segment.end, CBS.Mean = cna.segment.out$seg.mean, LogRatios = cna.segment.mean, Above.PValues.Cutoff=cna.segment.pvals.size, Calls = cna.segment.calls) @@ -123,11 +123,11 @@ for (i in 1:nrow(small.call)){ start.small = small.segment$Target.Start end.small = small.segment$Target.End match.large.nocall = large.nocall[large.nocall$Chr==chr.small,] - - match.large.nocall = match.large.nocall[match.large.nocall$Target.End>=start.small,] - match.large.nocall = match.large.nocall[match.large.nocall$Target.Start<=start.small,] + + 
match.large.nocall = match.large.nocall[match.large.nocall$Target.End>=start.small,] + match.large.nocall = match.large.nocall[match.large.nocall$Target.Start<=start.small,] merged.hole = match.large.nocall[match.large.nocall$Target.End>=start.small,] - + if (nrow(merged.hole) > 0){ included.segment = merge(included.segment, small.segment, all=T) } @@ -135,16 +135,3 @@ for (i in 1:nrow(small.call)){ out.f = paste(logcov.file, ".LargeDeletion.txt", sep="") write.table(included.segment, out.f, sep="\t", quote=F, row.names=F) - - - - - - - - - - - - - From ac944e3d5504eaae5c5d6d316322587306d408d1 Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 11:04:31 +1100 Subject: [PATCH 3/9] too much work to do autodetection currently --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index a742196..0c75a48 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ http://contra-cnv.sourceforge.net/ v2.1.0 Uses python3 Improved performance through multithreading and restructuring of loops -Autodetection of file formats (work in progress) v2.0.8 Included NDE and WGCNV workflows. 
From e1e9ceda8489df62fadc3abdbe6059fabd41ed86 Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 11:13:29 +1100 Subject: [PATCH 4/9] updated NDE part to python3 --- .gitignore | 3 - NullDistEstimation/1_renormAll.py | 82 +++++++------ NullDistEstimation/2_getExons.py | 189 ++++++++++++++++-------------- NullDistEstimation/3_density.R | 18 +-- NullDistEstimation/nde_wrapper.py | 76 ++++++------ 5 files changed, 201 insertions(+), 167 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 5a77cbc..0000000 --- a/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.pyc -test -TestFiles diff --git a/NullDistEstimation/1_renormAll.py b/NullDistEstimation/1_renormAll.py index fa83e2e..3ed6508 100644 --- a/NullDistEstimation/1_renormAll.py +++ b/NullDistEstimation/1_renormAll.py @@ -6,7 +6,7 @@ import sys if len(sys.argv) < 5: - print "Error: 1_renormAll.py Not enough arguments" + print("Error: 1_renormAll.py Not enough arguments") sys.exit(0) contraList = sys.argv[1] @@ -14,11 +14,14 @@ renormList = sys.argv[3] debug = sys.argv[4] -if debug == 'T': - a_and_b = open(os.path.join(os.path.split(renormList)[0],'debug/1_abstats.txt'), 'w') - a_and_b.write('SampleName\ta_2\tb_2\t\n') +if debug == "T": + a_and_b = open( + os.path.join(os.path.split(renormList)[0], "debug/1_abstats.txt"), "w" + ) + a_and_b.write("SampleName\ta_2\tb_2\t\n") -def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files + +def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files f_CONTRA_out = open(path_CONTRA_out, "r") f_out = open(path_out, "w") @@ -29,23 +32,24 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files headers = lines[0] contra = [] for line in lines[1:]: - contra.append(line.split('\t')) + contra.append(line.split("\t")) # For each line in contra: # 0 = region ID, 6 = Mean.of.LogRatio, 7 = Adjusted.Mean.of.LogRatio, 14 = tumour.rd, 15 = normal.rd, 16 = tumour.rd.ori, 
17 = normal.rd.ori - + # 2. Go through data, calculating x_T, N_T, x_N, N_N # as well as creating list of genes to remove (big gains) - removed_regions = [] # Listed by region ID - remove_threshold = 2 # Arbitrarily chosen - can be adjusted + removed_regions = [] # Listed by region ID + remove_threshold = 2 # Arbitrarily chosen - can be adjusted ##### PROBLEM: due to threshold might only take some regions of a particular gene when in reality, want all of them?? # Does this make a difference? x_T = N_T = x_N = N_N = 0.0 # 4 = OriStCoordinate, 5 = OriEndCoordinate - for line in contra: - k = float(line[5]) - float(line[4]) + 1 # Width / number of bases. Inclusive, so add 1. + k = ( + float(line[5]) - float(line[4]) + 1 + ) # Width / number of bases. Inclusive, so add 1. if float(line[7]) > remove_threshold: removed_regions.append(line[0]) x_T += float(line[16]) * k @@ -56,42 +60,47 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files # print removed_regions # 3. Calculate a_2, b_2 - n = 1 - float(x_N)/N_N - t = 1 - float(x_T)/N_T - a_2 = math.sqrt(n/t) - b_2 = math.sqrt(t/n) # or 1/a_2 - + n = 1 - float(x_N) / N_N + t = 1 - float(x_T) / N_T + a_2 = math.sqrt(n / t) + b_2 = math.sqrt(t / n) # or 1/a_2 # 4. Rescale read depths (and recalculate log ratio??) 
# Add to new columns # Need to multiply necessary tumour.rd (d_t) by a_2, normal.rd (d_n) by b_2 - headers = "Gene.Sym\tChr\tOriStCoordinate\tOriEndCoordinate\tMean.of.Log.Ratio.readj\n" # fix headers + headers = ( + "Gene.Sym\tChr\tOriStCoordinate\tOriEndCoordinate\tMean.of.Log.Ratio.readj\n" + ) # fix headers f_out.write(headers) for line in contra: temp = [line[2], line[3], line[4], line[5]] # calculate new read depths (actually you dont need these) - #temp.append(str(float(line[14]) * a_2)) - #temp.append(str(float(line[15]) * b_2)) + # temp.append(str(float(line[14]) * a_2)) + # temp.append(str(float(line[15]) * b_2)) # Recalculate Mean.of.Log.Ratio - temp.append(str(math.log((float(line[14]) * a_2) /(float(line[15]) * b_2 ), 2))+'\n') + temp.append( + str(math.log((float(line[14]) * a_2) / (float(line[15]) * b_2), 2)) + "\n" + ) # Write to output - f_out.write('\t'.join(temp)) + f_out.write("\t".join(temp)) # 5. Close file f_out.close() - if debug == 'T': - a_and_b.write('\t'.join([os.path.basename(path_CONTRA_out), str(a_2), str(b_2)+'\n'])) + if debug == "T": + a_and_b.write( + "\t".join([os.path.basename(path_CONTRA_out), str(a_2), str(b_2) + "\n"]) + ) # get file paths from list # note that these are paths, not just names: makes it a bit trickier -dirs = open(contraList, 'r') -renormGood = open(renormList, 'w') +dirs = open(contraList, "r") +renormGood = open(renormList, "w") filepaths = [] bad_filepaths = [] for line in dirs.readlines(): - filepaths.append(line.rstrip('\n\r')) + filepaths.append(line.rstrip("\n\r")) dirs.close() middle = "/table/" @@ -102,21 +111,20 @@ def renorm_file(path_CONTRA_out, path_out): # Arguments: two paths to files count = 0 for filepath in filepaths: count += 1 - if count % 100 == 0 and debug == 'T': - print "...Now processing "+str(count)+"..." 
-    filename = os.path.basename(filepath.rstrip("/\\ ")) # want to get file
-    cfilepath = os.path.join(filepath+middle, filename+suffix)
+    if count % 100 == 0 and debug == "T":
+        print(f"...Now processing {str(count)}...")
+    filename = os.path.basename(filepath.rstrip("/\\ "))  # want to get file
+    cfilepath = os.path.join(filepath + middle, filename + suffix)
     # print cfilepath
-    outpath = os.path.join(renormPath, filename+outsuffix)
+    outpath = os.path.join(renormPath, filename + outsuffix)
     if os.path.exists(cfilepath):
         renorm_file(cfilepath, outpath)
-        renormGood.write(filename+'\n')
+        renormGood.write(filename + "\n")
     else:
         bad_filepaths.append(filename)
 
-if debug == 'T':
+if debug == "T":
     a_and_b.close()
-    print "Finished processing out to %s" % renormList
-    print "Bad paths:"
-    print " ".join(bad_filepaths)
-    
+    print(f"Finished processing out to {renormList}")
+    print("Bad paths:")
+    print(" ".join(bad_filepaths))
diff --git a/NullDistEstimation/2_getExons.py b/NullDistEstimation/2_getExons.py
index c594247..4c5b09f 100644
--- a/NullDistEstimation/2_getExons.py
+++ b/NullDistEstimation/2_getExons.py
@@ -2,15 +2,15 @@
 import os
 import sys
 
-#-----------------------------------------------------------------------------------
+# -----------------------------------------------------------------------------------
 # Parts based off 10/09/13 version of get_exons_current.py
 # Modified to take command line input
 # removed deepcopy cos that was stupid
-#-----------------------------------------------------------------------------------
+# -----------------------------------------------------------------------------------
 # getExons_cmd = "python 2_getExons.py %s %s %s %s %s %s" % (renormPath, renormList, exonSummary, wgSummary, debug, bed)
-if (sys.argv < 7):
-    print "step 2 error not enough arguments"
+if len(sys.argv) < 7:
+    print("step 2 error not enough arguments")
     sys.exit(1)
 
 renormPath = sys.argv[1]
@@ -26,24 +26,26 @@
 gdict = {}
 
 # Error file (write out errors here)
-if 
debug == 'T': - error_out = os.path.join(os.path.split(exonSummary)[0], "debug/2_getExons_errors.txt") - errors = open(error_out, 'w') +if debug == "T": + error_out = os.path.join( + os.path.split(exonSummary)[0], "debug/2_getExons_errors.txt" + ) + errors = open(error_out, "w") # Create gdict entries based on myTSCA.bed.annotated.txt # Format: chromosome startcoord endcoord gene -#myTSCA_in = "/home/lij/Current_Routine_Support/TSCA_WGCNV/myTSCA.bed.annotated.txt" -#myTSCA_in = "/home/eyeap/haloplex/halo.bed.annotated.txt" # halo bed? +# myTSCA_in = "/home/lij/Current_Routine_Support/TSCA_WGCNV/myTSCA.bed.annotated.txt" +# myTSCA_in = "/home/eyeap/haloplex/halo.bed.annotated.txt" # halo bed? myTSCA_in = bed -myTSCA = open(myTSCA_in, 'r') +myTSCA = open(myTSCA_in, "r") for line in myTSCA.readlines(): - entries = line.split('\t') - entries[3] = entries[3].rstrip('\n') + entries = line.split("\t") + entries[3] = entries[3].rstrip("\n") if entries[3] not in gdict: gdict[entries[3]] = {} - #gdict[entries[3]]['_'.join(entries[1:3])] = [] - gdict[entries[3]][entries[0]+":"+'_'.join(entries[1:3])] = [] + # gdict[entries[3]]['_'.join(entries[1:3])] = [] + gdict[entries[3]][entries[0] + ":" + "_".join(entries[1:3])] = [] myTSCA.close() # Helper function @@ -53,6 +55,7 @@ def calculate_average(numblist): total += float(item) return total / float(len(numblist)) + # Helper function - list may contain "NA" entries def calculate_average_withmissing(numblist): total = 0.0 @@ -66,17 +69,18 @@ def calculate_average_withmissing(numblist): else: return "NA" + # Given a CONTRA output file, generate a list of exons with average log ratio per exon. 
def process_file(f_in): - GENE_IND=0 - CHR_IND=1 - START_IND=2 - END_IND=3 - LR_IND=4 - - f = open(f_in, 'r') - if debug == 'T': - errors.write('\n\n'+f_in+'\n') + GENE_IND = 0 + CHR_IND = 1 + START_IND = 2 + END_IND = 3 + LR_IND = 4 + + f = open(f_in, "r") + if debug == "T": + errors.write("\n\n" + f_in + "\n") # 0 - Gene Sym # 1 - OriStCoord, 2 - OriEndCoord # 3 - Mean.of.LogRatio (renormalised) @@ -88,20 +92,25 @@ def process_file(f_in): # Temp list of all genes in gdict all_genes = gdict.keys() - + # Preprocessing contra = [] for line in f.readlines()[1:]: - newline = line.split('\t') - name = newline[GENE_IND].split('(')[0] + newline = line.split("\t") + name = newline[GENE_IND].split("(")[0] newline[GENE_IND] = name - newline[-1] = newline[-1].rstrip('\n') + newline[-1] = newline[-1].rstrip("\n") contra.append(newline) f.close() currgene = contra[0][GENE_IND] - temp = [contra[0][START_IND], contra[0][END_IND], [contra[0][LR_IND]],contra[0][CHR_IND]] # start end list of averages + temp = [ + contra[0][START_IND], + contra[0][END_IND], + [contra[0][LR_IND]], + contra[0][CHR_IND], + ] # start end list of averages exons[currgene] = {} exon_n = 1 for line in contra[1:]: @@ -112,14 +121,14 @@ def process_file(f_in): exons[currgene][exon_n] = temp currgene = line[GENE_IND] temp = [line[START_IND], line[END_IND], [line[LR_IND]], line[CHR_IND]] - if currgene not in exons: # if exon data gets broken up... + if currgene not in exons: # if exon data gets broken up... exon_n = 1 exons[currgene] = {} else: exon_n = len(exons[currgene].keys()) + 1 - + # If its a new exon... - elif line[START_IND] != temp[1]: # ie coordinates don't match up + elif line[START_IND] != temp[1]: # ie coordinates don't match up # Add old to exons dictionary, after calculating average log ratio over the exon. 
temp[2] = calculate_average(temp[2]) exons[currgene][exon_n] = temp @@ -136,16 +145,16 @@ def process_file(f_in): # Add appropriate data to global dictionary gdict # If gdict not yet populated (empty): for gene in exons.keys(): - #print "gene %s " % gene - #print "gdict %s " % str(gdict) - #print "gdict[gene] %s " % str(gdict[gene]) + # print "gene %s " % gene + # print "gdict %s " % str(gdict) + # print "gdict[gene] %s " % str(gdict[gene]) exon_list = gdict[gene].keys() for exon in exons[gene].keys(): start = int(exons[gene][exon][0]) end = int(exons[gene][exon][1]) - ch= exons[gene][exon][3].replace("chr","") - exon_name = ch+":"+str(start)+'_'+str(end) -# print "DEBUG 33333333 %s in %s" % ( exon_name , str(exon_list[0:10])) + ch = exons[gene][exon][3].replace("chr", "") + exon_name = ch + ":" + str(start) + "_" + str(end) + # print "DEBUG 33333333 %s in %s" % ( exon_name , str(exon_list[0:10])) if exon_name in exon_list: gdict[gene][exon_name].append(exons[gene][exon][2]) exon_list.remove(exon_name) @@ -153,82 +162,95 @@ def process_file(f_in): # Attempt to match it to something in exon_list: up to 20 bases missing from each side is OK matched = False for possible_match in exon_list: - components = possible_match.split(":")[1].split('_') + components = possible_match.split(":")[1].split("_") possible_start = int(components[0]) possible_end = int(components[1]) start_diff = start - possible_start end_diff = possible_end - end - if start_diff < 21 and end_diff < 21 and start_diff >= 0 and end_diff >= 0: + if ( + start_diff < 21 + and end_diff < 21 + and start_diff >= 0 + and end_diff >= 0 + ): # Can match these two together - if debug == 'T': - errors.write('\t'+gene+'_'+possible_match+" is matched with incomplete "+exon_name+'\n') + if debug == "T": + errors.write( + "\t" + + gene + + "_" + + possible_match + + " is matched with incomplete " + + exon_name + + "\n" + ) gdict[gene][possible_match].append(exons[gene][exon][2]) exon_list.remove(possible_match) 
matched = True
             if not matched:
                 # Couldn't find a match for it: output error
-                if debug == 'T':
-                    errors.write(gene+'_'+exon_name+" could not be found\n")
+                if debug == "T":
+                    errors.write(gene + "_" + exon_name + " could not be found\n")
 
         # Take care of missing data: remaining exons in exonlist
         for missing_exon in exon_list:
             gdict[gene][missing_exon].append("NA")
-            if debug == 'T':
-                errors.write(gene+'_'+missing_exon+" marked as missing\n")
+            if debug == "T":
+                errors.write(gene + "_" + missing_exon + " marked as missing\n")
         all_genes.remove(gene)
 
     for gene in all_genes:
         # These genes had nothing in the input data. So, add NA to each exon of the gene.
         for exon in gdict[gene]:
             gdict[gene][exon].append("NA")
-        if debug == 'T':
-            errors.write(gene+" not found in input data\n")
-    
-    
+        if debug == "T":
+            errors.write(gene + " not found in input data\n")
+
+
 # get filenames
-dirs = open(renormList, 'r')
+dirs = open(renormList, "r")
 filenames = []
 good_filenames = []
 bad_filenames = []
 for line in dirs.readlines():
-    filenames.append(line.rstrip('\n\r'))
+    filenames.append(line.rstrip("\n\r"))
 dirs.close()
 
 count = 0
 for filename in filenames:
     count += 1
-    if count % 100 == 0 and debug == 'T':
-        print "...Now processing "+str(count)+"..."
-    filepath = os.path.join(renormPath, filename+"_renorm.txt")
+    if count % 100 == 0 and debug == "T":
+        print(f"...Now processing {str(count)}...")
+    filepath = os.path.join(renormPath, filename + "_renorm.txt")
     if os.path.exists(filepath):
         process_file(filepath)
         good_filenames.append(filename)
     else:
-        if debug == 'T':
-            print "Error: Could not find "+filename
-        bad_filenames.append(filename)
+        if debug == "T":
+            print(f"Error: Could not find {filename}")
+        bad_filenames.append(filename)
 
 # write out gdict...
 output_name = exonSummary
-out = open(output_name, 'w')
+out = open(output_name, "w")
 
 # Write headers
-out.write("Exon Name\t"+'\t'.join(good_filenames)+'\n') # dont have bad headers! 
+out.write("Exon Name\t" + "\t".join(good_filenames) + "\n") # dont have bad headers! for gene in sorted(gdict.keys()): for exon in sorted(gdict[gene].keys()): - out.write(gene+'_'+str(exon)) + out.write(gene + "_" + str(exon)) for ratio in gdict[gene][exon]: - out.write('\t'+str(ratio)) - out.write('\n') + out.write("\t" + str(ratio)) + out.write("\n") out.close() -if debug == 'T': - print "Finished processing out to "+output_name +if debug == "T": + print(f"Finished processing out to {output_name}") for gene in gdict.keys(): - if ';' in gene: # If gene needs to be split - #print 'splitting',gene - components = gene.split(';') + if ";" in gene: # If gene needs to be split + # print 'splitting',gene + components = gene.split(";") for component_gene in components: if component_gene not in gdict: gdict[component_gene] = {} @@ -238,30 +260,27 @@ def process_file(f_in): del gdict[gene] -wholegene = open(wgSummary, 'w') +wholegene = open(wgSummary, "w") # write headers -wholegene.write("Gene\t"+'\t'.join(good_filenames)+'\n') +wholegene.write("Gene\t" + "\t".join(good_filenames) + "\n") for gene in sorted(gdict.keys()): wholegene.write(gene) - for i in range(0, len(good_filenames)): # For each sample... + for i in range(0, len(good_filenames)): # For each sample... 
sample_temp = [] - for exon in gdict[gene].keys(): # For each exon (in that gene): - #print len(gdict[gene][exon]) - #print "i = "+str(i)+" len(gdict[gene][exon])"+ str(len(gdict[gene][exon])) - sample_temp.append(gdict[gene][exon][i]) # collect the log ratio of that exon of that sample (of that gene) - wholegene.write('\t'+str(calculate_average_withmissing(sample_temp))) # whole gene log ratio = average of log ratios across all exons of that gene - wholegene.write('\n') + for exon in gdict[gene].keys(): # For each exon (in that gene): + # print len(gdict[gene][exon]) + # print "i = "+str(i)+" len(gdict[gene][exon])"+ str(len(gdict[gene][exon])) + sample_temp.append( + gdict[gene][exon][i] + ) # collect the log ratio of that exon of that sample (of that gene) + wholegene.write( + "\t" + str(calculate_average_withmissing(sample_temp)) + ) # whole gene log ratio = average of log ratios across all exons of that gene + wholegene.write("\n") wholegene.close() if debug == "T": - print "Finished writing whole-gene data to " + wgSummary + print(f"Finished writing whole-gene data to {wgSummary}") errors.close() - print "Finished writing errors to " + error_out - - - - - - - + print(f"Finished writing errors to {error_out}") diff --git a/NullDistEstimation/3_density.R b/NullDistEstimation/3_density.R index d30ea20..84598d9 100644 --- a/NullDistEstimation/3_density.R +++ b/NullDistEstimation/3_density.R @@ -5,7 +5,7 @@ # command line input based on Jason Li's wgcnv_withBAFplot.R x=commandArgs() -args=x[match("--args",x)+1:length(x)] # gets the arguments after --args...? +args=x[match("--args",x)+1:length(x)] # gets the arguments after --args...? 
summaryin= args[1] threshout= args[2] @@ -62,13 +62,13 @@ print(i) # from .75 to 3.5 #for (i in 5:175) { # thresh = i/50 - + n.steps=100 #num_range = max(numrow) - min(numrow) #num_range_step = num_range/100 - + #num_range_step = best_thresh / n.steps - + num_range_step = sd1*2*2 / n.steps for (i2 in 1:n.steps) { thresh = i2 * num_range_step @@ -94,14 +94,14 @@ print(i) best_p = shapiro.test(numrow[!is.na(numrow)])$p.value best_thresh = max ( max(numrow) - mean, mean - min(numrow)) } - } + } if (hists == 'T' && length(numrow[!is.na(numrow)]) > 1) { #x11() - plot(density(numrow, na.rm = TRUE), + plot(density(numrow, na.rm = TRUE), main = rowname, sub = paste("Mean: ", round(mean,4), " || Threshholds:", round(mean-best_thresh,4), round(mean+best_thresh,4)), - xlab = paste("Variance:", round(var(numrow, na.rm = TRUE),4), + xlab = paste("Variance:", round(var(numrow, na.rm = TRUE),4), " || Cropped variance: ", round(var(numrow[numrow > mean-best_thresh & numrow < mean+best_thresh], na.rm = TRUE),4) ) ) @@ -136,10 +136,10 @@ dataout = data.frame(rownames, means, vars, left_threshs, right_threshs, cropped colnames(dataout) <- c("Gene Name", "Mean(trimmed 0.2)", "Variance", "Left Thresh", "Right Thresh", "Cropped Mean", "Cropped Var") -write.table(dataout, threshout, quote = FALSE, sep = '\t', +write.table(dataout, threshout, quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE) -if (hists == 'T') { +if (hists == 'T') { dev.off() } diff --git a/NullDistEstimation/nde_wrapper.py b/NullDistEstimation/nde_wrapper.py index 18af87c..d44074e 100755 --- a/NullDistEstimation/nde_wrapper.py +++ b/NullDistEstimation/nde_wrapper.py @@ -8,14 +8,15 @@ import sys import os -USAGE = "%s [debug output T/F] [histograms T/F]" % sys.argv[0] + +USAGE = f"{sys.argv[0]} [debug output T/F] [histograms T/F]" # CONTRA_out_list = list of CONTRA output folder names / path to, inc name # If not enough arguments, print how to use the file -debug = 'F' -histograms = 'F' +debug = "F" 
+histograms = "F" if len(sys.argv) < 4: - print USAGE + print(USAGE) sys.exit(1) if len(sys.argv) > 4: debug = sys.argv[4] @@ -29,13 +30,13 @@ # technically vulnerable since only checking at one point... contraList = sys.argv[1] if not os.path.exists(contraList): - print "Error: couldn't find contra list" + print("Error: couldn't find contra list") sys.exit(1) # Create output folders if needed # Structure: outPath/renorm, and outpath/histograms if required outPath = sys.argv[2] -bed=sys.argv[3] +bed = sys.argv[3] renormPath = os.path.join(outPath, "renorm") histPath = os.path.join(outPath, "histograms") debugPath = os.path.join(outPath, "debug") @@ -43,29 +44,43 @@ os.makedirs(outPath) if not os.path.exists(renormPath): os.makedirs(renormPath) -if histograms == 'T' and not os.path.exists(histPath): +if histograms == "T" and not os.path.exists(histPath): os.makedirs(histPath) -if debug == 'T' and not os.path.exists(debugPath): +if debug == "T" and not os.path.exists(debugPath): os.makedirs(debugPath) # Step 1 - 1_renormAll.py # Takes in the list of all contra files, renormalises them to renormPath, and creates a new # list renormList renormList = os.path.join(outPath, "renorm_list.txt") -renorm_cmd = "python %s/1_renormAll.py %s %s %s %s" % (scriptPath, contraList, renormPath, renormList, debug) -if debug == 'T': - print "Step 1: 1_renormAll.py" - print renorm_cmd +renorm_cmd = "python %s/1_renormAll.py %s %s %s %s" % ( + scriptPath, + contraList, + renormPath, + renormList, + debug, +) +if debug == "T": + print("Step 1: 1_renormAll.py") + print(renorm_cmd) os.system(renorm_cmd) # Step 2 - 2_getExons.py # Takes in renormPath and renormList, and creates summary tables exonSummary wgSummary (as well as errors if debug) exonSummary = os.path.join(outPath, "summary_ex.txt") wgSummary = os.path.join(outPath, "summary_wg.txt") -getExons_cmd = "python %s/2_getExons.py %s %s %s %s %s %s" % (scriptPath, renormPath, renormList, exonSummary, wgSummary, debug, bed) -if debug == 
'T':
-    print "Step 2: 2_getExons.py"
-    print getExons_cmd
+getExons_cmd = "python %s/2_getExons.py %s %s %s %s %s %s" % (
+    scriptPath,
+    renormPath,
+    renormList,
+    exonSummary,
+    wgSummary,
+    debug,
+    bed,
+)
+if debug == "T":
+    print("Step 2: 2_getExons.py")
+    print(getExons_cmd)
 
 os.system(getExons_cmd)
 
@@ -75,26 +90,21 @@
 # Requires exonSummary wgSummary + paths out
 exonThresh = os.path.join(outPath, "thresh_exons.txt")
 wgThresh = os.path.join(outPath, "thresh_wg.txt")
-density_wg_cmd = "Rscript %s %s %s %s %s" % (os.path.join(scriptPath, "3_density.R"), wgSummary, wgThresh, debug, histograms)
-density_ex_cmd = "Rscript %s %s %s %s %s" % (os.path.join(scriptPath, "3_density.R"), exonSummary, exonThresh, debug, histograms)
-if histograms == 'T':
+density_wg_cmd = (
+    f"Rscript {scriptPath}/3_density.R {wgSummary} {wgThresh} {debug} {histograms}"
+)
+density_ex_cmd = (
+    f"Rscript {scriptPath}/3_density.R {exonSummary} {exonThresh} {debug} {histograms}"
+)
+
+if histograms == "T":
     density_wg_cmd = density_wg_cmd + " " + histPath + "/wg.pdf"
     density_ex_cmd = density_ex_cmd + " " + histPath + "/exon.pdf"
-if debug == 'T':
-    print "Step 3: 3_density.R"
-    print density_wg_cmd
-    print density_ex_cmd
+if debug == "T":
+    print("Step 3: 3_density.R")
+    print(density_wg_cmd)
+    print(density_ex_cmd)
 
 os.system(density_wg_cmd)
 os.system(density_ex_cmd)
-
-
-
-
-
-
-
-
-
-

From 9e37e34caea63f39774b9c2353ebc4a0955f6517 Mon Sep 17 00:00:00 2001
From: SHollizeck
Date: Fri, 3 Jan 2020 11:43:47 +1100
Subject: [PATCH 5/9] baseline converted to python 3

---
 baseline.py | 386 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 213 insertions(+), 173 deletions(-)

diff --git a/baseline.py b/baseline.py
index f1e0c5d..7cc41ad 100755
--- a/baseline.py
+++ b/baseline.py
@@ -11,168 +11,202 @@
 from scripts.split_chromosome import splitByChromosome3 as splitByChromosome
 from scripts.get_chr_length import *
 from scripts.convert_targeted_regions import *
-from 
scripts.convert_gene_coordinate import convertGeneCoordinate2 as convertGeneCoordinate +from scripts.convert_gene_coordinate import ( + convertGeneCoordinate2 as convertGeneCoordinate, +) from multiprocessing import Pool + class Params: - """ + """ Class for top-level system parameters: """ - def __init__(self): - - def mult_files(option, opt_str, value, parser): - args=[] - for arg in parser.rargs: - if arg[0] != "-": - args.append(arg) - else: - del parser.rargs[:len(args)] - break - if getattr(parser.values, option.dest): - args.extend(getattr(parser.values, option.dest)) - setattr(parser.values, option.dest, args) - - - #command-line option definition - self.parser = OptionParser() - self.parser.add_option("-t", "--target", - help="Target region definition file [REQUIRED] [BED Format]", - action="store", type="string", dest="target") - self.parser.add_option("-f", "--files", - help="Files to be converted to baselines [REQUIRED] [BAM]", - action="callback", callback=mult_files, dest="files") - self.parser.add_option("-o", "--output", - help="Output folder [REQUIRED]", - action="store", type="string", dest="output") - self.parser.add_option("-c","--trim", - help="Portion of outliers to be removed before calculating average [Default: 0.2]", - action="store", dest="trim", default="0.2") - self.parser.add_option("-n","--name", - help="Output baseline name [Default: baseline]", - action="store", dest="name", default="baseline") - - # required parameters list: - self.ERRORLIST = [] - - #change system parameters based on any command line arguments - (options, args) = self.parser.parse_args() - if options.target: - self.TARGET=options.target - else: - self.ERRORLIST.append("target") - - if options.files: - self.FILES=options.files - else: - self.ERRORLIST.append("files") - - if options.output: - self.OUTPUT=options.output - else: - self.ERRORLIST.append("output") - - if options.trim: - self.TRIM=float(options.trim) - - if options.name: - self.NAME=options.name - - if 
len(self.ERRORLIST) != 0: - self.parser.print_help() - self.parser.error("Missing required parameters") - - - def repeat(self): - #params test - print "target :", self.TARGET - print "files :", self.FILES - print "output :", self.OUTPUT - print "trim :", self.TRIM - print "name :", self.NAME + def __init__(self): + def mult_files(option, opt_str, value, parser): + args = [] + for arg in parser.rargs: + if arg[0] != "-": + args.append(arg) + else: + del parser.rargs[: len(args)] + break + if getattr(parser.values, option.dest): + args.extend(getattr(parser.values, option.dest)) + setattr(parser.values, option.dest, args) + + # command-line option definition + self.parser = OptionParser() + self.parser.add_option( + "-t", + "--target", + help="Target region definition file [REQUIRED] [BED Format]", + action="store", + type="string", + dest="target", + ) + self.parser.add_option( + "-f", + "--files", + help="Files to be converted to baselines [REQUIRED] [BAM]", + action="callback", + callback=mult_files, + dest="files", + ) + self.parser.add_option( + "-o", + "--output", + help="Output folder [REQUIRED]", + action="store", + type="string", + dest="output", + ) + self.parser.add_option( + "-c", + "--trim", + help="Portion of outliers to be removed before calculating average [Default: 0.2]", + action="store", + dest="trim", + default="0.2", + ) + self.parser.add_option( + "-n", + "--name", + help="Output baseline name [Default: baseline]", + action="store", + dest="name", + default="baseline", + ) + + # required parameters list: + self.ERRORLIST = [] + + # change system parameters based on any command line arguments + (options, args) = self.parser.parse_args() + if options.target: + self.TARGET = options.target + else: + self.ERRORLIST.append("target") + + if options.files: + self.FILES = options.files + else: + self.ERRORLIST.append("files") + + if options.output: + self.OUTPUT = options.output + else: + self.ERRORLIST.append("output") + + if options.trim: + self.TRIM 
= float(options.trim) + + if options.name: + self.NAME = options.name + + if len(self.ERRORLIST) != 0: + self.parser.print_help() + self.parser.error("Missing required parameters") + + def repeat(self): + # params test + print(f"target : {self.TARGET}") + print(f"files : {self.FILES}") + print(f"output : {self.OUTPUT}") + print(f"trim : {self.TRIM}") + print(f"name : {self.NAME}") # option handling -params = Params() -#params.repeat() -targetFile = params.TARGET -infiles = params.FILES -output_dir = params.OUTPUT - -#Debug -print " ------ baseline.py ------- " -print "Target:", targetFile +params = Params() +# params.repeat() +targetFile = params.TARGET +infiles = params.FILES +output_dir = params.OUTPUT + +# Debug +print(" ------ baseline.py ------- ") +print(f"Target: {targetFile}") for files in infiles: - print "File:", files -print "Output Directory: ", output_dir + print(f"File: {files}") +print(f"Output Directory: {output_dir}") + def make_new_directory(outdir): - print outdir - if not os.path.exists(outdir): - os.mkdir(outdir) + print(outdir) + if not os.path.exists(outdir): + os.mkdir(outdir) -print " ----- creating output directory -----" + +print(" ----- creating output directory -----") make_new_directory(output_dir) -outdir = os.path.join(output_dir, "buf") +outdir = os.path.join(output_dir, "buf") make_new_directory(outdir) -targetFile2 = os.path.join(outdir, os.path.basename(targetFile)+".sorted") -os.system("sort -k1,1 -k2n %s > %s" %(targetFile,targetFile2)) +targetFile2 = os.path.join(outdir, os.path.basename(targetFile) + ".sorted") +os.system("sort -k1,1 -k2n %s > %s" % (targetFile, targetFile2)) + def processInFile(infile): - infilename = os.path.basename(infile) - s_outdir = os.path.join(outdir, infilename) - make_new_directory(s_outdir) - genomeFile = os.path.join(s_outdir,infilename +'.chrsummary') - get_genome(infile, genomeFile) - - bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH") - args = shlex.split("genomeCoverageBed -ibam %s 
-bga -g %s" %(infile, genomeFile)) - iOutFile = open(bedgraph, "w") - output = subprocess.Popen(args, stdout = iOutFile).wait() - iOutFile.close() - - targetList = convertTarget(targetFile2) - - splitByChromosome(bedgraph) - - convertGeneCoordinate(targetList, os.path.dirname(bedgraph)+"/") - bedgraph_tgtonly = bedgraph+".TARGETONLY" - bedgraph_tgtonly_avg = bedgraph+".TARGETONLY.AVERAGE" - os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage.txt"),bedgraph_tgtonly) - os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage_targetAverage.txt"),bedgraph_tgtonly_avg) - shutil.copy(bedgraph_tgtonly,outdir) - shutil.copy(bedgraph_tgtonly_avg,outdir) - -print "----- Processing Files -----" + infilename = os.path.basename(infile) + s_outdir = os.path.join(outdir, infilename) + make_new_directory(s_outdir) + genomeFile = os.path.join(s_outdir, infilename + ".chrsummary") + get_genome(infile, genomeFile) + + bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH") + args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" % (infile, genomeFile)) + iOutFile = open(bedgraph, "w") + output = subprocess.Popen(args, stdout=iOutFile).wait() + iOutFile.close() + + targetList = convertTarget(targetFile2) + + splitByChromosome(bedgraph) + + convertGeneCoordinate(targetList, os.path.dirname(bedgraph) + "/") + bedgraph_tgtonly = bedgraph + ".TARGETONLY" + bedgraph_tgtonly_avg = bedgraph + ".TARGETONLY.AVERAGE" + os.rename( + os.path.join(os.path.dirname(bedgraph), "geneRefCoverage.txt"), bedgraph_tgtonly + ) + os.rename( + os.path.join(os.path.dirname(bedgraph), "geneRefCoverage_targetAverage.txt"), + bedgraph_tgtonly_avg, + ) + shutil.copy(bedgraph_tgtonly, outdir) + shutil.copy(bedgraph_tgtonly_avg, outdir) + + +print("----- Processing Files -----") pool = Pool(5) -pool.map(processInFile,infiles) +pool.map(processInFile, infiles) # STEP 2 -> Create Union BedGraph -allfiles_names = [x for x in os.listdir(outdir) if x.endswith("TARGETONLY")] -allfiles_path 
= [os.path.join(outdir,x) for x in allfiles_names] +allfiles_names = [x for x in os.listdir(outdir) if x.endswith("TARGETONLY")] +allfiles_path = [os.path.join(outdir, x) for x in allfiles_names] -args=["unionBedGraphs","-header","-i"]+allfiles_path+["-names"]+allfiles_names +args = ["unionBedGraphs", "-header", "-i"] + allfiles_path + ["-names"] + allfiles_names -print (str(args)) -fo = os.path.join(outdir,"TARGETONLY.union.txt") -foh = open(fo,"w") +print(str(args)) +fo = os.path.join(outdir, "TARGETONLY.union.txt") +foh = open(fo, "w") -subprocess.Popen(args,stdout=foh).wait() +subprocess.Popen(args, stdout=foh).wait() foh.close() # STEP 3 -> POOL METHOD -TRIM = params.TRIM #TRIM = 0.2 -f = fo -fh = open(f) +TRIM = params.TRIM # TRIM = 0.2 +f = fo +fh = open(f) -fo=os.path.join(output_dir, params.NAME+".pooled2_TRIM"+str(TRIM)+".txt") -fo_libsize=fo+".LIBSIZE" +fo = os.path.join(output_dir, params.NAME + ".pooled2_TRIM" + str(TRIM) + ".txt") +fo_libsize = fo + ".LIBSIZE" -foh=open(fo,'w') -fo_libsize_h=open(fo_libsize,'w') +foh = open(fo, "w") +fo_libsize_h = open(fo_libsize, "w") fh.readline() @@ -180,61 +214,65 @@ def processInFile(infile): Ns = None nSamples = None -lineCount=0 +lineCount = 0 for line in fh: - lineCount+=1 - line_elems=line.rstrip().split("\t") - rangeLen=int(line_elems[2])-int(line_elems[1]) - if not Ns: - nSamples=len(line_elems)-3 - Ns = [0 for x in range(nSamples)] - for i in range(nSamples): - ind=i+3 - Ns[i] += int(line_elems[ind])*rangeLen + lineCount += 1 + line_elems = line.rstrip().split("\t") + rangeLen = int(line_elems[2]) - int(line_elems[1]) + if not Ns: + nSamples = len(line_elems) - 3 + Ns = [0 for x in range(nSamples)] + for i in range(nSamples): + ind = i + 3 + Ns[i] += int(line_elems[ind]) * rangeLen fh.close() -fh=open(f) +fh = open(f) fh.readline() -lineCount=0 -nSamples_exclude = int(math.floor(TRIM*nSamples)) +lineCount = 0 +nSamples_exclude = int(math.floor(TRIM * nSamples)) + def gm_mean(xs): - tmpprod = 1 - p = 
1.0/len(xs) - for x in xs: - tmpprod = tmpprod * math.pow(x,p) - return tmpprod + tmpprod = 1 + p = 1.0 / len(xs) + for x in xs: + tmpprod = tmpprod * math.pow(x, p) + return tmpprod + Ns_gmean = gm_mean(Ns) + def meanstdv(x): - from math import sqrt - n, mean, std = len(x), 0, 0 - for a in x: - mean = mean+a - mean = mean / float(n) - for a in x: - std = std+(a-mean)**2 - std=sqrt(std/float(n-1)) - return mean, std + from math import sqrt + + n, mean, std = len(x), 0, 0 + for a in x: + mean = mean + a + mean = mean / float(n) + for a in x: + std = std + (a - mean) ** 2 + std = sqrt(std / float(n - 1)) + return mean, std libsize = 0 for line in fh: - lineCount+=1 - line_elems=line.rstrip().split("\t") - rangeLen=int(line_elems[2])-int(line_elems[1]) - xs_tmp=[int(x) for x in line_elems[3:]] - xs = [float(xs_tmp[i])*Ns_gmean/Ns[i] for i in range(len(xs_tmp))] - xs.sort() - xs_trimmed=xs[nSamples_exclude:(nSamples-nSamples_exclude)] - #trimmed_mean=sum(xs_trimmed)/float(len(xs_trimmed)) - trimmed_mean,std=meanstdv(xs_trimmed) - libsize+=rangeLen*trimmed_mean - foh.write("\t".join(line_elems[0:3]+[str(trimmed_mean),str(std)])+"\n") + lineCount += 1 + line_elems = line.rstrip().split("\t") + rangeLen = int(line_elems[2]) - int(line_elems[1]) + xs_tmp = [int(x) for x in line_elems[3:]] + xs = [float(xs_tmp[i]) * Ns_gmean / Ns[i] for i in range(len(xs_tmp))] + xs.sort() + xs_trimmed = xs[nSamples_exclude : (nSamples - nSamples_exclude)] + # trimmed_mean=sum(xs_trimmed)/float(len(xs_trimmed)) + trimmed_mean, std = meanstdv(xs_trimmed) + libsize += rangeLen * trimmed_mean + foh.write("\t".join(line_elems[0:3] + [str(trimmed_mean), str(std)]) + "\n") fo_libsize_h.write(str(int(round(libsize)))) @@ -243,11 +281,13 @@ def meanstdv(x): foh.close() fo_libsize_h.close() + def removeTempFolder(tempFolderPath): - import shutil + import shutil + + shutil.rmtree(tempFolderPath) + print("Temp Folder Removed") - shutil.rmtree(tempFolderPath) - print "Temp Folder Removed" # Removed 
Temp Folder -#removeTempFolder(outdir) +# removeTempFolder(outdir) From 734ce44d89bb110f8c3cacc901d3bcb4a4fc80c2 Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 12:08:31 +1100 Subject: [PATCH 6/9] reworked parameter parsing --- baseline.py | 72 +++++++++++++++++++++-------------------------------- contra.py | 1 + 2 files changed, 29 insertions(+), 44 deletions(-) diff --git a/baseline.py b/baseline.py index 7cc41ad..f1c72a6 100755 --- a/baseline.py +++ b/baseline.py @@ -7,7 +7,7 @@ import subprocess import shutil import math -from optparse import OptionParser +import argparse from scripts.split_chromosome import splitByChromosome3 as splitByChromosome from scripts.get_chr_length import * from scripts.convert_targeted_regions import * @@ -23,58 +23,41 @@ class Params: """ def __init__(self): - def mult_files(option, opt_str, value, parser): - args = [] - for arg in parser.rargs: - if arg[0] != "-": - args.append(arg) - else: - del parser.rargs[: len(args)] - break - if getattr(parser.values, option.dest): - args.extend(getattr(parser.values, option.dest)) - setattr(parser.values, option.dest, args) # command-line option definition - self.parser = OptionParser() - self.parser.add_option( + self.parser = argparse.ArgumentParser() + self.parser.add_argument( "-t", "--target", help="Target region definition file [REQUIRED] [BED Format]", - action="store", - type="string", - dest="target", + required=True, ) - self.parser.add_option( + self.parser.add_argument( "-f", "--files", help="Files to be converted to baselines [REQUIRED] [BAM]", - action="callback", - callback=mult_files, + nargs="+", dest="files", + required=True, ) - self.parser.add_option( + self.parser.add_argument( "-o", "--output", help="Output folder [REQUIRED]", - action="store", - type="string", dest="output", + required=True, ) - self.parser.add_option( + self.parser.add_argument( "-c", "--trim", help="Portion of outliers to be removed before calculating average [Default: 0.2]", - 
action="store", - dest="trim", - default="0.2", + type=float, + default=0.2, ) - self.parser.add_option( + self.parser.add_argument( "-n", "--name", help="Output baseline name [Default: baseline]", - action="store", - dest="name", default="baseline", ) @@ -82,13 +65,20 @@ def mult_files(option, opt_str, value, parser): self.ERRORLIST = [] # change system parameters based on any command line arguments - (options, args) = self.parser.parse_args() + options = self.parser.parse_args() if options.target: + if not os.path.isfile(options.target): + print(f"Could not find bed file {options.target}") + self.ERRORLIST.append("control") self.TARGET = options.target else: self.ERRORLIST.append("target") if options.files: + for bam in options.files: + if not os.path.isfile(options.target): + print(f"Could not find bam file {bam}") + self.ERRORLIST.append("files") self.FILES = options.files else: self.ERRORLIST.append("files") @@ -98,23 +88,17 @@ def mult_files(option, opt_str, value, parser): else: self.ERRORLIST.append("output") - if options.trim: - self.TRIM = float(options.trim) - - if options.name: - self.NAME = options.name - if len(self.ERRORLIST) != 0: - self.parser.print_help() - self.parser.error("Missing required parameters") + # self.parser.print_help() + self.parser.error("Missing required parameters: " + str(self.ERRORLIST)) def repeat(self): # params test - print(f"target : {self.TARGET}") - print(f"files : {self.FILES}") - print(f"output : {self.OUTPUT}") - print(f"trim : {self.TRIM}") - print(f"name : {self.NAME}") + print(f"target :{self.TARGET}") + print(f"files :{self.FILES}") + print(f"output :{self.OUTPUT}") + print(f"trim :{self.TRIM}") + print(f"name :{self.NAME}") # option handling diff --git a/contra.py b/contra.py index fc84f7e..39a77b8 100755 --- a/contra.py +++ b/contra.py @@ -94,6 +94,7 @@ def __init__(self): self.optional.add_argument( "--numBin", help="Numbers of bins to group regions. 
User can specify multiple experiments with different number of bins (comma separated) [20]", + type=int, default="20", ) self.optional.add_argument( From daeac1261bef60e8f2e99b6c9bc1e3a878936208 Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 12:09:58 +1100 Subject: [PATCH 7/9] string formating changed to python43 notation --- baseline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baseline.py b/baseline.py index f1c72a6..e402e47 100755 --- a/baseline.py +++ b/baseline.py @@ -128,7 +128,7 @@ def make_new_directory(outdir): make_new_directory(outdir) targetFile2 = os.path.join(outdir, os.path.basename(targetFile) + ".sorted") -os.system("sort -k1,1 -k2n %s > %s" % (targetFile, targetFile2)) +os.system(f"sort -k1,1 -k2n {targetFile} > {targetFile2}") def processInFile(infile): @@ -139,7 +139,7 @@ def processInFile(infile): get_genome(infile, genomeFile) bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH") - args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" % (infile, genomeFile)) + args = shlex.split(f"genomeCoverageBed -ibam {infile} -bga -g {genomeFile}") iOutFile = open(bedgraph, "w") output = subprocess.Popen(args, stdout=iOutFile).wait() iOutFile.close() From ca9606ea4d5d26da0c1073320fa67db3b35169ba Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 13:31:42 +1100 Subject: [PATCH 8/9] added a todo for furture parameter optimisation --- contra.py | 1 + 1 file changed, 1 insertion(+) diff --git a/contra.py b/contra.py index 39a77b8..115ae7f 100755 --- a/contra.py +++ b/contra.py @@ -109,6 +109,7 @@ def __init__(self): type=int, default=10, ) + # TODO: change this to a choice argument with sam/bed self.optional.add_argument( "--sam", help="If the specified, test and control sample are in SAM [False]", From 3908a02d2f508e1b03b2894bf7787cdeacb696cf Mon Sep 17 00:00:00 2001 From: SHollizeck Date: Fri, 3 Jan 2020 13:36:10 +1100 Subject: [PATCH 9/9] changes shebang line to use python3 --- 
NullDistEstimation/1_renormAll.py | 2 +- NullDistEstimation/2_getExons.py | 2 +- NullDistEstimation/nde_wrapper.py | 2 +- WGCNV-NDE/1_renorm.py | 2 +- WGCNV-NDE/2_process.py | 2 +- WGCNV-NDE/wgcnv_wrapper.py | 2 +- WGCNV-SINGLE/wgcnv.py | 2 +- WGCNV-SINGLE/wgcnv_noBAF.py | 2 +- baseline.py | 2 +- contra.py | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/NullDistEstimation/1_renormAll.py b/NullDistEstimation/1_renormAll.py index 3ed6508..4d11051 100644 --- a/NullDistEstimation/1_renormAll.py +++ b/NullDistEstimation/1_renormAll.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # Usage: "python 1_renormAll.py %s %s %s %s" % (contraList, renormPath, renormList, debug) import math diff --git a/NullDistEstimation/2_getExons.py b/NullDistEstimation/2_getExons.py index 4c5b09f..05f69b7 100644 --- a/NullDistEstimation/2_getExons.py +++ b/NullDistEstimation/2_getExons.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import os import sys diff --git a/NullDistEstimation/nde_wrapper.py b/NullDistEstimation/nde_wrapper.py index d44074e..aa982e6 100755 --- a/NullDistEstimation/nde_wrapper.py +++ b/NullDistEstimation/nde_wrapper.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # 24/09 wrapper script for estimating the null distribution # on a per exon and per gene basis from a bunch of CONTRA outputs diff --git a/WGCNV-NDE/1_renorm.py b/WGCNV-NDE/1_renorm.py index 376d28b..a417e92 100755 --- a/WGCNV-NDE/1_renorm.py +++ b/WGCNV-NDE/1_renorm.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # print '.................inside renorm' import math diff --git a/WGCNV-NDE/2_process.py b/WGCNV-NDE/2_process.py index 873f318..7c57c47 100755 --- a/WGCNV-NDE/2_process.py +++ b/WGCNV-NDE/2_process.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # Usage: python 2_process.py [_renorm.txt] [_wg.txt] [_exon.txt] [bed.annotated.txt] [debug] import os import copy diff --git a/WGCNV-NDE/wgcnv_wrapper.py 
b/WGCNV-NDE/wgcnv_wrapper.py index 23e3978..a177880 100755 --- a/WGCNV-NDE/wgcnv_wrapper.py +++ b/WGCNV-NDE/wgcnv_wrapper.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # 10/10 wrapper script for WGCNV - server edit # based on wgcnv_noBAF.py by Jason Li diff --git a/WGCNV-SINGLE/wgcnv.py b/WGCNV-SINGLE/wgcnv.py index 6f819a0..72af93f 100755 --- a/WGCNV-SINGLE/wgcnv.py +++ b/WGCNV-SINGLE/wgcnv.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import sys import os diff --git a/WGCNV-SINGLE/wgcnv_noBAF.py b/WGCNV-SINGLE/wgcnv_noBAF.py index d8b97b0..180e9c8 100755 --- a/WGCNV-SINGLE/wgcnv_noBAF.py +++ b/WGCNV-SINGLE/wgcnv_noBAF.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import sys import os diff --git a/baseline.py b/baseline.py index e402e47..1d501ef 100755 --- a/baseline.py +++ b/baseline.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import os import sys diff --git a/contra.py b/contra.py index 115ae7f..4f68aa5 100755 --- a/contra.py +++ b/contra.py @@ -21,7 +21,7 @@ # # # -----------------------------------------------------------------------# -# Last Updated : 23 July 2012 16:43PM +# Last Updated : 03 January 2020 13:43 import os