diff --git a/README.md b/README.md index cebc9f6..b9201db 100755 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ conda install -f -c conda-forge -c bioconda wisecondorx ### Running WisecondorX There are three main stages (converting, reference creating and predicting) when using WisecondorX: -- Convert .bam to .npz files (for both reference and test samples) +- Convert aligned reads to .npz files (for both reference and test samples) - Create a reference (using reference .npz files) - **Important notes** - Automated gender prediction, required to consistently analyze sex chromosomes, is based on a Gaussian mixture @@ -56,11 +56,11 @@ There are three main stages (converting, reference creating and predicting) when observe additional improvement concerning normalization. - Predict copy number alterations (using the reference file and test .npz cases of interest) -### Stage (1) Convert .bam to .npz +### Stage (1) Convert aligned reads (sam/bam/cram) to .npz ```bash -WisecondorX convert input.bam output.npz [--optional arguments] +WisecondorX convert input.sam/bam/cram output.npz [--optional arguments] ```
Optional argument

| Function diff --git a/wisecondorX/convert_tools.py b/wisecondorX/convert_tools.py index ce5488e..41966d6 100644 --- a/wisecondorX/convert_tools.py +++ b/wisecondorX/convert_tools.py @@ -4,21 +4,31 @@ import numpy as np import pysam +import sys ''' -Converts bam file to numpy array by transforming +Converts aligned reads file to numpy array by transforming individual reads to counts per bin. ''' -def convert_bam(args): +def convert_reads(args): bins_per_chr = dict() for chr in range(1, 25): bins_per_chr[str(chr)] = None logging.info('Importing data ...') - bam_file = pysam.AlignmentFile(args.infile, 'rb') + if args.infile.endswith(".sam"): + reads_file = pysam.AlignmentFile(args.infile, 'r') + elif args.infile.endswith(".bam"): + reads_file = pysam.AlignmentFile(args.infile, 'rb') + elif args.infile.endswith(".cram"): + reads_file = pysam.AlignmentFile(args.infile, 'rc') + else: + logging.error( + "Unsupported input file type. Make sure your input filename has a correct extension (sam/bam/cram)") + sys.exit(1) reads_seen = 0 reads_kept = 0 @@ -28,9 +38,9 @@ def convert_bam(args): larp = -1 larp2 = -1 - logging.info('Converting bam ... This might take a while ...') + logging.info('Converting aligned reads ... This might take a while ...') - for index, chr in enumerate(bam_file.references): + for index, chr in enumerate(reads_file.references): chr_name = chr if chr_name[:3].lower() == 'chr': @@ -39,9 +49,9 @@ def convert_bam(args): continue logging.info('Working at {}; processing {} bins' - .format(chr, int(bam_file.lengths[index] / float(args.binsize) + 1))) - counts = np.zeros(int(bam_file.lengths[index] / float(args.binsize) + 1), dtype=np.int32) - bam_chr = bam_file.fetch(chr) + .format(chr, int(reads_file.lengths[index] / float(args.binsize) + 1))) + counts = np.zeros(int(reads_file.lengths[index] / float(args.binsize) + 1), dtype=np.int32) + bam_chr = reads_file.fetch(chr) if chr_name == 'X': chr_name = '23' @@ -81,9 +91,9 @@ def convert_bam(args): bins_per_chr[chr_name] = counts reads_kept += sum(counts) - qual_info = {'mapped': bam_file.mapped, - 'unmapped': bam_file.unmapped, - 'no_coordinate': bam_file.nocoordinate, + qual_info = {'mapped': reads_file.mapped, + 'unmapped': reads_file.unmapped, + 'no_coordinate': reads_file.nocoordinate, 'filter_rmdup': reads_rmdup, 'filter_mapq': reads_mapq, 'pre_retro': reads_seen, diff --git a/wisecondorX/main.py b/wisecondorX/main.py index 49888d6..01183a0 100755 --- a/wisecondorX/main.py +++ b/wisecondorX/main.py @@ -8,7 +8,7 @@ import numpy as np -from wisecondorX.convert_tools import convert_bam +from wisecondorX.convert_tools import convert_reads from wisecondorX.newref_control import tool_newref_prep, tool_newref_main, tool_newref_merge from wisecondorX.newref_tools import train_gender_model, get_mask from wisecondorX.overall_tools import gender_correct, scale_sample @@ -20,7 +20,7 @@ def tool_convert(args): logging.info('Starting conversion') - sample, qual_info = convert_bam(args) + sample, qual_info = convert_reads(args) np.savez_compressed(args.outfile, binsize=args.binsize, sample=sample,