From a870763e3f965b143882f5d823ab990342ad5938 Mon Sep 17 00:00:00 2001 From: Matthias De Smet <11850640+matthdsm@users.noreply.github.com> Date: Fri, 3 Feb 2023 11:33:08 +0100 Subject: [PATCH] lint code with black --- LICENSE.md | 4 +- README.md | 189 +++++------ setup.py | 73 +++-- wisecondorX/convert_tools.py | 68 ++-- wisecondorX/main.py | 573 +++++++++++++++++++-------------- wisecondorX/newref_control.py | 198 ++++++------ wisecondorX/newref_tools.py | 96 ++++-- wisecondorX/overall_tools.py | 88 ++--- wisecondorX/predict_control.py | 42 ++- wisecondorX/predict_output.py | 238 ++++++++------ wisecondorX/predict_tools.py | 173 +++++----- 11 files changed, 985 insertions(+), 757 deletions(-) diff --git a/LICENSE.md b/LICENSE.md index 054777c..6fd25a7 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,7 +1,7 @@ Copyright (C) 2016 VU University Medical Center Amsterdam Author: Roy Straver (github.com/rstraver) -Mod: Lennart Raman (github.com/leraman) +Mod: Lennart Raman (github.com/leraman) WISECONDOR is distributed under the following license: -[Attribution-NonCommercial-ShareAlike CC BY-NC-SA]( https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode) +[Attribution-NonCommercial-ShareAlike CC BY-NC-SA](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode) This license is governed by Dutch law and this license is subject to the exclusive jurisdiction of the courts of the Netherlands. diff --git a/README.md b/README.md index 150a8e0..88ac46e 100755 --- a/README.md +++ b/README.md @@ -1,38 +1,41 @@ -# Background -After extensively comparing different (shallow) whole-genome sequencing-based copy number detection tools, -including [WISECONDOR](https://github.com/VUmcCGP/wisecondor), [QDNAseq](https://github.com/ccagc/QDNAseq), -[CNVkit](https://github.com/etal/cnvkit), [Control-FREEC](https://github.com/BoevaLab/FREEC), -[BIC-seq2](http://compbio.med.harvard.edu/BIC-seq/) and -[cn.MOPS](https://bioconductor.org/packages/release/bioc/html/cn.mops.html), -WISECONDOR appeared to normalize sequencing data in the most consistent way, as shown by -[our paper](https://www.ncbi.nlm.nih.gov/pubmed/30566647). Nevertheless, WISECONDOR has limitations: -Stouffer's Z-score approach is error-prone when dealing with large amounts of aberrations, the algorithm -is extremely slow (24h) when operating at small bin sizes (15 kb), and sex chromosomes are not part of the analysis. -Here, we present WisecondorX, an evolved WISECONDOR that aims at dealing with previous difficulties, resulting -in overall superior results and significantly lower computing times, allowing daily diagnostic use. WisecondorX is +# Background + +After extensively comparing different (shallow) whole-genome sequencing-based copy number detection tools, +including [WISECONDOR](https://github.com/VUmcCGP/wisecondor), [QDNAseq](https://github.com/ccagc/QDNAseq), +[CNVkit](https://github.com/etal/cnvkit), [Control-FREEC](https://github.com/BoevaLab/FREEC), +[BIC-seq2](http://compbio.med.harvard.edu/BIC-seq/) and +[cn.MOPS](https://bioconductor.org/packages/release/bioc/html/cn.mops.html), +WISECONDOR appeared to normalize sequencing data in the most consistent way, as shown by +[our paper](https://www.ncbi.nlm.nih.gov/pubmed/30566647). Nevertheless, WISECONDOR has limitations: +Stouffer's Z-score approach is error-prone when dealing with large amounts of aberrations, the algorithm +is extremely slow (24h) when operating at small bin sizes (15 kb), and sex chromosomes are not part of the analysis. +Here, we present WisecondorX, an evolved WISECONDOR that aims at dealing with previous difficulties, resulting +in overall superior results and significantly lower computing times, allowing daily diagnostic use. WisecondorX is meant to be applicable not only to NIPT, but also gDNA, PGT, FFPE, LQB, ... etc. # Manual ## Mapping -We found superior results through WisecondorX when using [bowtie2](https://github.com/BenLangmead/bowtie2) as a mapper. -Note that it is important that **no** read quality filtering is executed prior to running WisecondorX: this software +We found superior results through WisecondorX when using [bowtie2](https://github.com/BenLangmead/bowtie2) as a mapper. +Note that it is important that **no** read quality filtering is executed prior to running WisecondorX: this software requires low-quality reads to distinguish informative bins from non-informative ones. ## WisecondorX ### Installation -Stable releases can be installed through pip install. This option ascertains the latest version is -downloaded, however, it does not install R [dependencies](#dependencies). +Stable releases can be installed through pip install. This option ascertains the latest version is +downloaded, however, it does not install R [dependencies](#dependencies). + ```bash pip install -U git+https://github.com/CenterForMedicalGeneticsGhent/WisecondorX ``` -Alternatively, [Conda](https://conda.io/docs/) additionally installs all necessary [depedencies](#dependencies), +Alternatively, [Conda](https://conda.io/docs/) additionally installs all necessary [depedencies](#dependencies), however, the latest version might not be downloaded. + ```bash conda install -f -c conda-forge -c bioconda wisecondorx @@ -40,21 +43,22 @@ conda install -f -c conda-forge -c bioconda wisecondorx ### Running WisecondorX -There are three main stages (converting, reference creating and predicting) when using WisecondorX: +There are three main stages (converting, reference creating and predicting) when using WisecondorX: + - Convert aligned reads to .npz files (for both reference and test samples) -- Create a reference (using reference .npz files) - - **Important notes** - - Automated gender prediction, required to consistently analyze sex chromosomes, is based on a Gaussian mixture - model. If few samples (<20) are included during reference creation, or not both male and female samples (for - NIPT, this means male and female feti) are represented, this process might not be accurate. Therefore, - alternatively, one can manually tweak the [`--yfrac`](#stage-2-create-reference) parameter. - - It is of paramount importance that the reference set consists of exclusively negative control samples that - originate from the same sequencer, mapper, reference genome, type of material, ... etc, as the test samples. - As a rule of thumb, think of all laboratory and *in silico* steps: the more sources of bias that can be omitted, - the better. - - Try to include at least 50 samples per reference. The more the better, yet, from 500 on it is unlikely to - observe additional improvement concerning normalization. -- Predict copy number alterations (using the reference file and test .npz cases of interest) +- Create a reference (using reference .npz files) + - **Important notes** + - Automated gender prediction, required to consistently analyze sex chromosomes, is based on a Gaussian mixture + model. If few samples (<20) are included during reference creation, or not both male and female samples (for + NIPT, this means male and female feti) are represented, this process might not be accurate. Therefore, + alternatively, one can manually tweak the [`--yfrac`](#stage-2-create-reference) parameter. + - It is of paramount importance that the reference set consists of exclusively negative control samples that + originate from the same sequencer, mapper, reference genome, type of material, ... etc, as the test samples. + As a rule of thumb, think of all laboratory and _in silico_ steps: the more sources of bias that can be omitted, + the better. + - Try to include at least 50 samples per reference. The more the better, yet, from 500 on it is unlikely to + observe additional improvement concerning normalization. +- Predict copy number alterations (using the reference file and test .npz cases of interest) ### Stage (1) Convert aligned reads (bam/cram) to .npz @@ -63,12 +67,11 @@ There are three main stages (converting, reference creating and predicting) when WisecondorX convert input.bam/cram output.npz [--optional arguments] ``` -
Optional argument

| Function -:--- | :--- -`--reference x` | Fasta reference to be used with cram inputs -`--binsize x` | Size per bin in bp; the reference bin size should be a multiple of this value. Note that this parameter does not impact the resolution, yet it can be used to optimize processing speed (default: x=5e3) -`--normdup` | Use this flag to avoid duplicate removal - +|
Optional argument

| Function | +| :----------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--reference x` | Fasta reference to be used with cram inputs | +| `--binsize x` | Size per bin in bp; the reference bin size should be a multiple of this value. Note that this parameter does not impact the resolution, yet it can be used to optimize processing speed (default: x=5e3) | +| `--normdup` | Use this flag to avoid duplicate removal | → Bash recipe at `./pipeline/convert.sh` @@ -79,39 +82,39 @@ WisecondorX convert input.bam/cram output.npz [--optional arguments] WisecondorX newref reference_input_dir/*.npz reference_output.npz [--optional arguments] ``` -
Optional argument

| Function -:--- | :--- -`--nipt` | **Always include this flag for the generation of a NIPT reference** -`--binsize x` | Size per bin in bp, defines the resolution of the output (default: x=1e5) -`--refsize x` | Amount of reference locations per target; should generally not be tweaked (default: x=300) -`--yfrac x` | Y read fraction cutoff, in order to manually define gender. Setting this to 1 will treat all samples as female -`--plotyfrac x` | plots Y read fraction histogram and Gaussian mixture fit to file x, can help when setting `--yfrac` manually; software quits after plotting -`--cpus x` | Number of threads requested (default: x=1) +|
Optional argument

| Function | +| :----------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------ | +| `--nipt` | **Always include this flag for the generation of a NIPT reference** | +| `--binsize x` | Size per bin in bp, defines the resolution of the output (default: x=1e5) | +| `--refsize x` | Amount of reference locations per target; should generally not be tweaked (default: x=300) | +| `--yfrac x` | Y read fraction cutoff, in order to manually define gender. Setting this to 1 will treat all samples as female | +| `--plotyfrac x` | plots Y read fraction histogram and Gaussian mixture fit to file x, can help when setting `--yfrac` manually; software quits after plotting | +| `--cpus x` | Number of threads requested (default: x=1) | → Bash recipe at `./pipeline/newref.sh` -### Stage (3) Predict copy number alterations +### Stage (3) Predict copy number alterations ```bash WisecondorX predict test_input.npz reference_input.npz output_id [--optional arguments] ``` - -
Optional argument

| Function -:--- | :--- -`--minrefbins x` | Minimum amount of sensible reference bins per target bin; should generally not be tweaked (default: x=150) -`--maskrepeats x` | Bins with distances > mean + sd * 3 in the reference will be masked. This parameter represents the number of masking cycles and defines the stringency of the blacklist (default: x=5) -`--zscore x` | Z-score cutoff to call segments as aberrations (default: x=5) -`--alpha x` | P-value cutoff for calling circular binary segmentation breakpoints (default: x=1e-4) -`--beta x` | When beta is given, `--zscore` is ignored. Beta sets a ratio cutoff for aberration calling. It's a number between 0 (liberal) and 1 (conservative) and, when used, is optimally close to the purity (e.g. fetal/tumor fraction) -`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; examples at `./example.blacklist/`) -`--gender x` | Force WisecondorX to analyze this case as male (M) or female (F). Useful when e.g. dealing with a loss of chromosome Y, which causes erroneous gender predictions (choices: x=F or x=M) -`--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information **(\*)** -`--plot` | Outputs custom .png plots (trisomy 21 NIPT example at `./example.plot`), directly interpretable **(\*)** -`--ylim [a,b]` | Force WisecondorX to use y-axis interval [a,b] during plotting, e.g. [-2,2] -`--cairo` | Some operating systems require the cairo bitmap type to write plots - -**(\*)** At least one of these output formats should be selected + +|
Optional argument

| Function | +| :----------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--minrefbins x` | Minimum amount of sensible reference bins per target bin; should generally not be tweaked (default: x=150) | +| `--maskrepeats x` | Bins with distances > mean + sd \* 3 in the reference will be masked. This parameter represents the number of masking cycles and defines the stringency of the blacklist (default: x=5) | +| `--zscore x` | Z-score cutoff to call segments as aberrations (default: x=5) | +| `--alpha x` | P-value cutoff for calling circular binary segmentation breakpoints (default: x=1e-4) | +| `--beta x` | When beta is given, `--zscore` is ignored. Beta sets a ratio cutoff for aberration calling. It's a number between 0 (liberal) and 1 (conservative) and, when used, is optimally close to the purity (e.g. fetal/tumor fraction) | +| `--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; examples at `./example.blacklist/`) | +| `--gender x` | Force WisecondorX to analyze this case as male (M) or female (F). Useful when e.g. dealing with a loss of chromosome Y, which causes erroneous gender predictions (choices: x=F or x=M) | +| `--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information **(\*)** | +| `--plot` | Outputs custom .png plots (trisomy 21 NIPT example at `./example.plot`), directly interpretable **(\*)** | +| `--ylim [a,b]` | Force WisecondorX to use y-axis interval [a,b] during plotting, e.g. [-2,2] | +| `--cairo` | Some operating systems require the cairo bitmap type to write plots | + +**(\*)** At least one of these output formats should be selected → Bash recipe at `./pipeline/predict.sh` @@ -122,72 +125,72 @@ WisecondorX predict test_input.npz reference_input.npz output_id [--optional arg WisecondorX gender test_input.npz reference_input.npz ``` -Returns gender according to the reference. +Returns gender according to the reference. # Parameters -The default parameters are optimized for shallow whole-genome sequencing data (0.1x - 1x coverage) and reference bin -sizes ranging from 50 to 500 kb. +The default parameters are optimized for shallow whole-genome sequencing data (0.1x - 1x coverage) and reference bin +sizes ranging from 50 to 500 kb. # Underlying algorithm -To understand the underlying algorithm, I highly recommend reading -[Straver et al (2014)](https://www.ncbi.nlm.nih.gov/pubmed/24170809); and its update shortly introduced in -[Huijsdens-van Amsterdam et al (2018)](https://www.nature.com/articles/gim201832.epdf). Numerous adaptations to this -algorithm have been made, yet the central normalization principles remain. Changes include e.g. the inclusion of a gender -prediction algorithm, gender handling prior to normalization (ultimately enabling X and Y predictions), between-sample -Z-scoring, inclusion of a weighted circular binary segmentation algorithm and improved codes for obtaining tables and -plots. +To understand the underlying algorithm, I highly recommend reading +[Straver et al (2014)](https://www.ncbi.nlm.nih.gov/pubmed/24170809); and its update shortly introduced in +[Huijsdens-van Amsterdam et al (2018)](https://www.nature.com/articles/gim201832.epdf). Numerous adaptations to this +algorithm have been made, yet the central normalization principles remain. Changes include e.g. the inclusion of a gender +prediction algorithm, gender handling prior to normalization (ultimately enabling X and Y predictions), between-sample +Z-scoring, inclusion of a weighted circular binary segmentation algorithm and improved codes for obtaining tables and +plots. # Interpretation results ## Plots -Every dot represents a bin. The dots range across the X-axis from chromosome 1 to X (or Y, in case of a male). The -vertical position of a dot represents the ratio between the observed number of reads and the expected number of reads, -the latter being the 'normal' state. As these values are log2-transformed, copy neutral dots should be close-to 0. -Importantly, notice that the dots are always subject to Gaussian noise. Therefore, segments, indicated by horizontal -white lines, cover bins of predicted equal copy number. The size of the dots represents the variability at the reference -set. Thus, the size increases with the certainty of an observation. The same goes for the line width of the segments. -Vertical grey bars represent the blacklist, which matches mostly hypervariable loci and repeats. Finally, the horizontal -colored dotted lines show where the constitutional 1n and 3n states are expected (when constitutional DNA is at 100% -purity). Often, an aberration does not reach these limits, which has several potential causes: depending on your type -of analysis, the sample could be subject to tumor fraction, fetal fraction, a mosaicism, ... etc. Sometimes, the +Every dot represents a bin. The dots range across the X-axis from chromosome 1 to X (or Y, in case of a male). The +vertical position of a dot represents the ratio between the observed number of reads and the expected number of reads, +the latter being the 'normal' state. As these values are log2-transformed, copy neutral dots should be close-to 0. +Importantly, notice that the dots are always subject to Gaussian noise. Therefore, segments, indicated by horizontal +white lines, cover bins of predicted equal copy number. The size of the dots represents the variability at the reference +set. Thus, the size increases with the certainty of an observation. The same goes for the line width of the segments. +Vertical grey bars represent the blacklist, which matches mostly hypervariable loci and repeats. Finally, the horizontal +colored dotted lines show where the constitutional 1n and 3n states are expected (when constitutional DNA is at 100% +purity). Often, an aberration does not reach these limits, which has several potential causes: depending on your type +of analysis, the sample could be subject to tumor fraction, fetal fraction, a mosaicism, ... etc. Sometimes, the segments do surpass these limits: here it's likely you are dealing with 0n, 4n, 5n, 6n, ... ## Tables ### ID_bins.bed -This file contains all bin-wise information. When data is 'NaN', the corresponding bin is included in the blacklist. -The Z-scores are calculated as default using the within-sample reference bins as a null set. +This file contains all bin-wise information. When data is 'NaN', the corresponding bin is included in the blacklist. +The Z-scores are calculated as default using the within-sample reference bins as a null set. ### ID_segments.bed This file contains all segment-wise information. A combined Z-score is calculated using a between-sample Z-scoring -technique (the test case vs the reference cases). +technique (the test case vs the reference cases). ### ID_aberrations.bed -This file contains aberrant segments, defined by the [`--beta`](#stage-3-predict-copy-number-alterations) or -[`--zscore`](#stage-3-predict-copy-number-alterations) parameters. +This file contains aberrant segments, defined by the [`--beta`](#stage-3-predict-copy-number-alterations) or +[`--zscore`](#stage-3-predict-copy-number-alterations) parameters. ### ID_statistics.bed -This file contains some interesting statistics (per chromosome and overall). The definition of the Z-scores matches the one from -the 'ID_segments.bed'. Particularly interesting for NIPT. +This file contains some interesting statistics (per chromosome and overall). The definition of the Z-scores matches the one from +the 'ID_segments.bed'. Particularly interesting for NIPT. # Dependencies - R (v3.4) packages - - jsonlite (v1.5) + - jsonlite (v1.5) - R Bioconductor (v3.5) packages - - DNAcopy (v1.50.1) + - DNAcopy (v1.50.1) - Python (v3.6) libraries - - scipy (v1.1.0) + - scipy (v1.1.0) - scikit-learn (v0.20.0) - pysam (v0.15.1) - numpy (v1.15.2) - matplotlib (v2.2.3) -And of course, other versions are very likely to work as well. +And of course, other versions are very likely to work as well. diff --git a/setup.py b/setup.py index 418a5a8..778a94a 100644 --- a/setup.py +++ b/setup.py @@ -1,48 +1,55 @@ #! /usr/bin/env python from setuptools import setup, find_packages -version = '1.2.4' -dl_version = 'master' if 'dev' in version else '{}'.format(version) +version = "1.2.4" +dl_version = "master" if "dev" in version else "{}".format(version) setup( - name='WisecondorX', + name="WisecondorX", version=version, - author='Matthias De Smet, Lennart Raman', - author_email='Lennart.raman@ugent.be', + author="Matthias De Smet, Lennart Raman", + author_email="Lennart.raman@ugent.be", description="WisecondorX -- an evolved WISECONDOR", long_description=__doc__, - keywords=['bioinformatics', 'biology', 'sequencing', 'NGS', 'next generation sequencing', - 'CNV', 'SWGS', 'Shallow Whole Genome Sequencing'], - download_url='https://github.com/CenterForMedicalGeneticsGhent/WisecondorX/archive/v{}.tar.gz'.format( - dl_version), - license='Attribution-NonCommercial-ShareAlike CC BY-NC-SA', - packages=find_packages('.'), - python_requires='>=2.7', + keywords=[ + "bioinformatics", + "biology", + "sequencing", + "NGS", + "next generation sequencing", + "CNV", + "SWGS", + "Shallow Whole Genome Sequencing", + ], + download_url="https://github.com/CenterForMedicalGeneticsGhent/WisecondorX/archive/v{}.tar.gz".format( + dl_version + ), + license="Attribution-NonCommercial-ShareAlike CC BY-NC-SA", + packages=find_packages("."), + python_requires=">=2.7", include_package_data=True, zip_safe=False, install_requires=[ 'futures;python_version<"3"', - 'scipy', - 'scikit-learn', - 'pysam', - 'numpy' + "scipy", + "scikit-learn", + "pysam", + "numpy", ], - entry_points={ - 'console_scripts': ['WisecondorX = wisecondorX.main:main'] - }, + entry_points={"console_scripts": ["WisecondorX = wisecondorX.main:main"]}, classifiers=[ - 'Development Status :: 3 - Alpha', - 'Environment :: Console', - 'Intended Audience :: Science/Research', - 'Natural Language :: English', - 'Operating System :: MacOS :: MacOS X', - 'Operating System :: POSIX', - 'Operating System :: Unix', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Bio-Informatics' - ] + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: Science/Research", + "Natural Language :: English", + "Operating System :: MacOS :: MacOS X", + "Operating System :: POSIX", + "Operating System :: Unix", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], ) diff --git a/wisecondorX/convert_tools.py b/wisecondorX/convert_tools.py index 4af6a54..d3585e5 100644 --- a/wisecondorX/convert_tools.py +++ b/wisecondorX/convert_tools.py @@ -6,10 +6,10 @@ import pysam import sys -''' +""" Converts aligned reads file to numpy array by transforming individual reads to counts per bin. -''' +""" def convert_reads(args): @@ -17,19 +17,24 @@ def convert_reads(args): for chr in range(1, 25): bins_per_chr[str(chr)] = None - logging.info('Importing data ...') + logging.info("Importing data ...") if args.infile.endswith(".bam"): - reads_file = pysam.AlignmentFile(args.infile, 'rb') + reads_file = pysam.AlignmentFile(args.infile, "rb") elif args.infile.endswith(".cram"): if args.reference is not None: - reads_file = pysam.AlignmentFile(args.infile, 'rc', reference_filename=args.reference) + reads_file = pysam.AlignmentFile( + args.infile, "rc", reference_filename=args.reference + ) else: - logging.error("Cram support requires a reference file, please use the --reference argument") + logging.error( + "Cram support requires a reference file, please use the --reference argument" + ) sys.exit(1) else: logging.error( - "Unsupported input file type. Make sure your input filename has a correct extension ( bam or cram)") + "Unsupported input file type. Make sure your input filename has a correct extension ( bam or cram)" + ) sys.exit(1) reads_seen = 0 @@ -40,32 +45,41 @@ def convert_reads(args): larp = -1 larp2 = -1 - logging.info('Converting aligned reads ... This might take a while ...') + logging.info("Converting aligned reads ... This might take a while ...") for index, chr in enumerate(reads_file.references): chr_name = chr - if chr_name[:3].lower() == 'chr': + if chr_name[:3].lower() == "chr": chr_name = chr_name[3:] - if chr_name not in bins_per_chr and chr_name != 'X' and chr_name != 'Y': + if chr_name not in bins_per_chr and chr_name != "X" and chr_name != "Y": continue - logging.info('Working at {}; processing {} bins' - .format(chr, int(reads_file.lengths[index] / float(args.binsize) + 1))) - counts = np.zeros(int(reads_file.lengths[index] / float(args.binsize) + 1), dtype=np.int32) + logging.info( + "Working at {}; processing {} bins".format( + chr, int(reads_file.lengths[index] / float(args.binsize) + 1) + ) + ) + counts = np.zeros( + int(reads_file.lengths[index] / float(args.binsize) + 1), dtype=np.int32 + ) bam_chr = reads_file.fetch(chr) - if chr_name == 'X': - chr_name = '23' - if chr_name == 'Y': - chr_name = '24' + if chr_name == "X": + chr_name = "23" + if chr_name == "Y": + chr_name = "24" for read in bam_chr: if read.is_paired: if not read.is_proper_pair: reads_pairf += 1 continue - if not args.normdup and larp == read.pos and larp2 == read.next_reference_start: + if ( + not args.normdup + and larp == read.pos + and larp2 == read.next_reference_start + ): reads_rmdup += 1 else: if read.mapping_quality >= 1: @@ -93,12 +107,14 @@ def convert_reads(args): bins_per_chr[chr_name] = counts reads_kept += sum(counts) - qual_info = {'mapped': reads_file.mapped, - 'unmapped': reads_file.unmapped, - 'no_coordinate': reads_file.nocoordinate, - 'filter_rmdup': reads_rmdup, - 'filter_mapq': reads_mapq, - 'pre_retro': reads_seen, - 'post_retro': reads_kept, - 'pair_fail': reads_pairf} + qual_info = { + "mapped": reads_file.mapped, + "unmapped": reads_file.unmapped, + "no_coordinate": reads_file.nocoordinate, + "filter_rmdup": reads_rmdup, + "filter_mapq": reads_mapq, + "pre_retro": reads_seen, + "post_retro": reads_kept, + "pair_fail": reads_pairf, + } return bins_per_chr, qual_info diff --git a/wisecondorX/main.py b/wisecondorX/main.py index 63e61d5..6a1d3b9 100755 --- a/wisecondorX/main.py +++ b/wisecondorX/main.py @@ -9,151 +9,170 @@ import numpy as np from wisecondorX.convert_tools import convert_reads -from wisecondorX.newref_control import tool_newref_prep, tool_newref_main, tool_newref_merge +from wisecondorX.newref_control import ( + tool_newref_prep, + tool_newref_main, + tool_newref_merge, +) from wisecondorX.newref_tools import train_gender_model, get_mask from wisecondorX.overall_tools import gender_correct, scale_sample from wisecondorX.predict_control import normalize, get_post_processed_result from wisecondorX.predict_output import generate_output_tables, exec_write_plots -from wisecondorX.predict_tools import log_trans, exec_cbs, apply_blacklist, predict_gender +from wisecondorX.predict_tools import ( + log_trans, + exec_cbs, + apply_blacklist, + predict_gender, +) def tool_convert(args): - logging.info('Starting conversion') + logging.info("Starting conversion") sample, qual_info = convert_reads(args) - np.savez_compressed(args.outfile, - binsize=args.binsize, - sample=sample, - quality=qual_info) + np.savez_compressed( + args.outfile, binsize=args.binsize, sample=sample, quality=qual_info + ) - logging.info('Finished conversion') + logging.info("Finished conversion") def tool_newref(args): - logging.info('Creating new reference') + logging.info("Creating new reference") if args.yfrac is not None: if args.yfrac < 0 or args.yfrac > 1: logging.critical( - 'Parameter --yfrac should be a positive number lower than or equal to 1') + "Parameter --yfrac should be a positive number lower than or equal to 1" + ) sys.exit() split_path = list(os.path.split(args.outfile)) - if split_path[-1][-4:] == '.npz': + if split_path[-1][-4:] == ".npz": split_path[-1] = split_path[-1][:-4] base_path = os.path.join(split_path[0], split_path[1]) args.basepath = base_path - args.prepfile = '{}_prep.npz'.format(base_path) - args.partfile = '{}_part'.format(base_path) + args.prepfile = "{}_prep.npz".format(base_path) + args.partfile = "{}_part".format(base_path) samples = [] - logging.info('Importing data ...') + logging.info("Importing data ...") for infile in args.infiles: - logging.info('Loading: {}'.format(infile)) - npzdata = np.load(infile, encoding='latin1', allow_pickle=True) - sample = npzdata['sample'].item() - binsize = int(npzdata['binsize']) - logging.info('Binsize: {}'.format(int(binsize))) + logging.info("Loading: {}".format(infile)) + npzdata = np.load(infile, encoding="latin1", allow_pickle=True) + sample = npzdata["sample"].item() + binsize = int(npzdata["binsize"]) + logging.info("Binsize: {}".format(int(binsize))) samples.append(scale_sample(sample, binsize, args.binsize)) samples = np.array(samples) genders, trained_cutoff = train_gender_model(args, samples) - if genders.count('F') < 5 and args.nipt: + if genders.count("F") < 5 and args.nipt: logging.warning( - 'A NIPT reference should have at least 5 female feti samples. Removing --nipt flag.') + "A NIPT reference should have at least 5 female feti samples. Removing --nipt flag." + ) args.nipt = False if not args.nipt: for i, sample in enumerate(samples): samples[i] = gender_correct(sample, genders[i]) total_mask, bins_per_chr = get_mask(samples) - if genders.count('F') > 4: - mask_F, _ = get_mask(samples[np.array(genders) == 'F']) + if genders.count("F") > 4: + mask_F, _ = get_mask(samples[np.array(genders) == "F"]) total_mask = total_mask & mask_F - if genders.count('M') > 4 and not args.nipt: - mask_M, _ = get_mask(samples[np.array(genders) == 'M']) + if genders.count("M") > 4 and not args.nipt: + mask_M, _ = get_mask(samples[np.array(genders) == "M"]) total_mask = total_mask & mask_M outfiles = [] if len(genders) > 9: - logging.info('Starting autosomal reference creation ...') - args.tmpoutfile = '{}.tmp.A.npz'.format(args.basepath) + logging.info("Starting autosomal reference creation ...") + args.tmpoutfile = "{}.tmp.A.npz".format(args.basepath) outfiles.append(args.tmpoutfile) - tool_newref_prep(args, samples, 'A', total_mask, bins_per_chr) - logging.info('This might take a while ...') + tool_newref_prep(args, samples, "A", total_mask, bins_per_chr) + logging.info("This might take a while ...") tool_newref_main(args, args.cpus) else: logging.critical( - 'Provide at least 10 samples to enable the generation of a reference.') + "Provide at least 10 samples to enable the generation of a reference." + ) sys.exit() - if genders.count('F') > 4: - logging.info('Starting female gonosomal reference creation ...') - args.tmpoutfile = '{}.tmp.F.npz'.format(args.basepath) + if genders.count("F") > 4: + logging.info("Starting female gonosomal reference creation ...") + args.tmpoutfile = "{}.tmp.F.npz".format(args.basepath) outfiles.append(args.tmpoutfile) - tool_newref_prep(args, samples[np.array( - genders) == 'F'], 'F', total_mask, bins_per_chr) - logging.info('This might take a while ...') + tool_newref_prep( + args, samples[np.array(genders) == "F"], "F", total_mask, bins_per_chr + ) + logging.info("This might take a while ...") tool_newref_main(args, 1) else: logging.warning( - 'Provide at least 5 female samples to enable normalization of female gonosomes.') + "Provide at least 5 female samples to enable normalization of female gonosomes." + ) if not args.nipt: - if genders.count('M') > 4: - logging.info('Starting male gonosomal reference creation ...') - args.tmpoutfile = '{}.tmp.M.npz'.format(args.basepath) + if genders.count("M") > 4: + logging.info("Starting male gonosomal reference creation ...") + args.tmpoutfile = "{}.tmp.M.npz".format(args.basepath) outfiles.append(args.tmpoutfile) - tool_newref_prep(args, samples[np.array( - genders) == 'M'], 'M', total_mask, bins_per_chr) + tool_newref_prep( + args, samples[np.array(genders) == "M"], "M", total_mask, bins_per_chr + ) tool_newref_main(args, 1) else: logging.warning( - 'Provide at least 5 male samples to enable normalization of male gonosomes.') + "Provide at least 5 male samples to enable normalization of male gonosomes." + ) tool_newref_merge(args, outfiles, trained_cutoff) - logging.info('Finished creating reference') + logging.info("Finished creating reference") def tool_test(args): - logging.info('Starting CNA prediction') + logging.info("Starting CNA prediction") if not args.bed and not args.plot: - logging.critical('No output format selected. ' - 'Select at least one of the supported output formats (--bed, --plot)') + logging.critical( + "No output format selected. " + "Select at least one of the supported output formats (--bed, --plot)" + ) sys.exit() if args.zscore <= 0: - logging.critical( - 'Parameter --zscore should be a strictly positive number') + logging.critical("Parameter --zscore should be a strictly positive number") sys.exit() if args.beta is not None: if args.beta <= 0 or args.beta > 1: logging.critical( - 'Parameter --beta should be a strictly positive number lower than or equal to 1') + "Parameter --beta should be a strictly positive number lower than or equal to 1" + ) sys.exit() if args.alpha <= 0 or args.alpha > 1: logging.critical( - 'Parameter --alpha should be a strictly positive number lower than or equal to 1') + "Parameter --alpha should be a strictly positive number lower than or equal to 1" + ) sys.exit() - logging.info('Importing data ...') - ref_file = np.load(args.reference, encoding='latin1', allow_pickle=True) - sample_file = np.load(args.infile, encoding='latin1', allow_pickle=True) + logging.info("Importing data ...") + ref_file = np.load(args.reference, encoding="latin1", allow_pickle=True) + sample_file = np.load(args.infile, encoding="latin1", allow_pickle=True) - sample = sample_file['sample'].item() + sample = sample_file["sample"].item() n_reads = sum([sum(sample[x]) for x in sample.keys()]) - sample = scale_sample(sample, int( - sample_file['binsize'].item()), int(ref_file['binsize'])) + sample = scale_sample( + sample, int(sample_file["binsize"].item()), int(ref_file["binsize"]) + ) - gender = predict_gender(sample, ref_file['trained_cutoff']) - if not ref_file['is_nipt']: + gender = predict_gender(sample, ref_file["trained_cutoff"]) + if not ref_file["is_nipt"]: if args.gender: gender = args.gender sample = gender_correct(sample, gender) @@ -161,254 +180,312 @@ def tool_test(args): else: if args.gender: gender = args.gender - ref_gender = 'F' + ref_gender = "F" - logging.info('Normalizing autosomes ...') + logging.info("Normalizing autosomes ...") results_r, results_z, results_w, ref_sizes, m_lr, m_z = normalize( - args, sample, ref_file, 'A') + args, sample, ref_file, "A" + ) - if not ref_file['is_nipt']: - if not ref_file['has_male'] and gender == 'M': - logging.warning('This sample is male, whilst the reference is created with fewer than 5 males. ' - 'The female gonosomal reference will be used for X predictions. Note that these might ' - 'not be accurate. If the latter is desired, create a new reference and include more ' - 'male samples.') - ref_gender = 'F' - - elif not ref_file['has_female'] and gender == 'F': - logging.warning('This sample is female, whilst the reference is created with fewer than 5 females. ' - 'The male gonosomal reference will be used for XY predictions. Note that these might ' - 'not be accurate. If the latter is desired, create a new reference and include more ' - 'female samples.') - ref_gender = 'M' + if not ref_file["is_nipt"]: + if not ref_file["has_male"] and gender == "M": + logging.warning( + "This sample is male, whilst the reference is created with fewer than 5 males. " + "The female gonosomal reference will be used for X predictions. Note that these might " + "not be accurate. If the latter is desired, create a new reference and include more " + "male samples." + ) + ref_gender = "F" + + elif not ref_file["has_female"] and gender == "F": + logging.warning( + "This sample is female, whilst the reference is created with fewer than 5 females. " + "The male gonosomal reference will be used for XY predictions. Note that these might " + "not be accurate. If the latter is desired, create a new reference and include more " + "female samples." + ) + ref_gender = "M" - logging.info('Normalizing gonosomes ...') + logging.info("Normalizing gonosomes ...") - null_ratios_aut_per_bin = ref_file['null_ratios'] - null_ratios_gon_per_bin = ref_file['null_ratios.{}'.format( - ref_gender)][len(null_ratios_aut_per_bin):] + null_ratios_aut_per_bin = ref_file["null_ratios"] + null_ratios_gon_per_bin = ref_file["null_ratios.{}".format(ref_gender)][ + len(null_ratios_aut_per_bin) : + ] results_r_2, results_z_2, results_w_2, ref_sizes_2, _, _ = normalize( - args, sample, ref_file, ref_gender) - - rem_input = {'args': args, - 'wd': str(os.path.dirname(os.path.realpath(__file__))), - 'binsize': int(ref_file['binsize']), - 'n_reads': n_reads, - 'ref_gender': ref_gender, - 'gender': gender, - 'mask': ref_file['mask.{}'.format(ref_gender)], - 'bins_per_chr': ref_file['bins_per_chr.{}'.format(ref_gender)], - 'masked_bins_per_chr': ref_file['masked_bins_per_chr.{}'.format(ref_gender)], - 'masked_bins_per_chr_cum': ref_file['masked_bins_per_chr_cum.{}'.format(ref_gender)]} + args, sample, ref_file, ref_gender + ) + + rem_input = { + "args": args, + "wd": str(os.path.dirname(os.path.realpath(__file__))), + "binsize": int(ref_file["binsize"]), + "n_reads": n_reads, + "ref_gender": ref_gender, + "gender": gender, + "mask": ref_file["mask.{}".format(ref_gender)], + "bins_per_chr": ref_file["bins_per_chr.{}".format(ref_gender)], + "masked_bins_per_chr": ref_file["masked_bins_per_chr.{}".format(ref_gender)], + "masked_bins_per_chr_cum": ref_file[ + "masked_bins_per_chr_cum.{}".format(ref_gender) + ], + } del ref_file results_r = np.append(results_r, results_r_2) results_z = np.append(results_z, results_z_2) - m_z - results_w = np.append(results_w * np.nanmean(results_w_2), - results_w_2 * np.nanmean(results_w)) + results_w = np.append( + results_w * np.nanmean(results_w_2), results_w_2 * np.nanmean(results_w) + ) results_w = results_w / np.nanmean(results_w) if np.isnan(results_w).any() or np.isinf(results_w).any(): - logging.warning('Non-numeric values found in weights -- reference too small. Circular binary segmentation and z-scoring will be unweighted') + logging.warning( + "Non-numeric values found in weights -- reference too small. Circular binary segmentation and z-scoring will be unweighted" + ) results_w = np.ones(len(results_w)) ref_sizes = np.append(ref_sizes, ref_sizes_2) null_ratios = np.array( - [x.tolist() for x in null_ratios_aut_per_bin] + [x.tolist() for x in null_ratios_gon_per_bin]) + [x.tolist() for x in null_ratios_aut_per_bin] + + [x.tolist() for x in null_ratios_gon_per_bin] + ) - results = {'results_r': results_r, - 'results_z': results_z, - 'results_w': results_w, - 'results_nr': null_ratios} + results = { + "results_r": results_r, + "results_z": results_z, + "results_w": results_w, + "results_nr": null_ratios, + } for result in results.keys(): results[result] = get_post_processed_result( - args, results[result], ref_sizes, rem_input) + args, results[result], ref_sizes, rem_input + ) log_trans(results, m_lr) if args.blacklist: - logging.info('Applying blacklist ...') + logging.info("Applying blacklist ...") apply_blacklist(rem_input, results) - logging.info('Executing circular binary segmentation ...') + logging.info("Executing circular binary segmentation ...") - results['results_c'] = exec_cbs(rem_input, results) + results["results_c"] = exec_cbs(rem_input, results) if args.bed: - logging.info('Writing tables ...') + logging.info("Writing tables ...") generate_output_tables(rem_input, results) if args.plot: - logging.info('Writing plots ...') + logging.info("Writing plots ...") exec_write_plots(rem_input, results) - logging.info('Finished prediction') + logging.info("Finished prediction") def output_gender(args): - ref_file = np.load(args.reference, encoding='latin1', allow_pickle=True) - sample_file = np.load(args.infile, encoding='latin1', allow_pickle=True) - gender = predict_gender( - sample_file['sample'].item(), ref_file['trained_cutoff']) - if gender == 'M': - print('male') + ref_file = np.load(args.reference, encoding="latin1", allow_pickle=True) + sample_file = np.load(args.infile, encoding="latin1", allow_pickle=True) + gender = predict_gender(sample_file["sample"].item(), ref_file["trained_cutoff"]) + if gender == "M": + print("male") else: - print('female') + print("female") def main(): - warnings.filterwarnings('ignore') - - parser = argparse.ArgumentParser(description='WisecondorX') - parser.add_argument('--loglevel', - type=str, - default='INFO', - choices=['info', 'warning', 'debug', 'error', 'critical']) + warnings.filterwarnings("ignore") + + parser = argparse.ArgumentParser(description="WisecondorX") + parser.add_argument( + "--loglevel", + type=str, + default="INFO", + choices=["info", "warning", "debug", "error", "critical"], + ) subparsers = parser.add_subparsers() - parser_convert = subparsers.add_parser('convert', - description='Convert and filter a aligned reads to .npz', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser_convert.add_argument('infile', - type=str, - help='aligned reads input for conversion') - parser_convert.add_argument('outfile', - type=str, - help='Output .npz file') - parser_convert.add_argument('-r', '--reference', - type=str, - help='Fasta reference to be used during cram conversion') - parser_convert.add_argument('--binsize', - type=float, - default=5e3, - help='Bin size (bp)') - parser_convert.add_argument('--normdup', - action='store_true', - help='Do not remove duplicates') + parser_convert = subparsers.add_parser( + "convert", + description="Convert and filter a aligned reads to .npz", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser_convert.add_argument( + "infile", type=str, help="aligned reads input for conversion" + ) + parser_convert.add_argument("outfile", type=str, help="Output .npz file") + parser_convert.add_argument( + "-r", + "--reference", + type=str, + help="Fasta reference to be used during cram conversion", + ) + parser_convert.add_argument( + "--binsize", type=float, default=5e3, help="Bin size (bp)" + ) + parser_convert.add_argument( + "--normdup", action="store_true", help="Do not remove duplicates" + ) parser_convert.set_defaults(func=tool_convert) - parser_newref = subparsers.add_parser('newref', - description='Create a new reference using healthy reference samples', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser_newref.add_argument('infiles', - type=str, - nargs='+', - help='Path to all reference data files (e.g. path/to/reference/*.npz)') - parser_newref.add_argument('outfile', - type=str, - help='Path and filename for the reference output (e.g. path/to/myref.npz)') - parser_newref.add_argument('--nipt', - action='store_true', - help='Use flag for NIPT (e.g. path/to/reference/*.npz)' - ) - parser_newref.add_argument('--yfrac', - type=float, - default=None, - help='Use to manually set the y read fraction cutoff, which defines gender' - ) - parser_newref.add_argument('--plotyfrac', - type=str, - default=None, - help='Path to yfrac .png plot for --yfrac optimization (e.g. path/to/plot.png); software will stop after plotting after which --yfrac can be set manually' - ) - parser_newref.add_argument('--refsize', - type=int, - default=300, - help='Amount of reference locations per target') - parser_newref.add_argument('--binsize', - type=int, - default=1e5, - help='Scale samples to this binsize, multiples of existing binsize only') - parser_newref.add_argument('--cpus', - type=int, - default=1, - help='Use multiple cores to find reference bins') + parser_newref = subparsers.add_parser( + "newref", + description="Create a new reference using healthy reference samples", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser_newref.add_argument( + "infiles", + type=str, + nargs="+", + help="Path to all reference data files (e.g. path/to/reference/*.npz)", + ) + parser_newref.add_argument( + "outfile", + type=str, + help="Path and filename for the reference output (e.g. path/to/myref.npz)", + ) + parser_newref.add_argument( + "--nipt", + action="store_true", + help="Use flag for NIPT (e.g. path/to/reference/*.npz)", + ) + parser_newref.add_argument( + "--yfrac", + type=float, + default=None, + help="Use to manually set the y read fraction cutoff, which defines gender", + ) + parser_newref.add_argument( + "--plotyfrac", + type=str, + default=None, + help="Path to yfrac .png plot for --yfrac optimization (e.g. path/to/plot.png); software will stop after plotting after which --yfrac can be set manually", + ) + parser_newref.add_argument( + "--refsize", + type=int, + default=300, + help="Amount of reference locations per target", + ) + parser_newref.add_argument( + "--binsize", + type=int, + default=1e5, + help="Scale samples to this binsize, multiples of existing binsize only", + ) + parser_newref.add_argument( + "--cpus", type=int, default=1, help="Use multiple cores to find reference bins" + ) parser_newref.set_defaults(func=tool_newref) - parser_gender = subparsers.add_parser('gender', - description='Returns the gender of a .npz resulting from convert, ' - 'based on a Gaussian mixture model trained during the newref phase', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser_gender.add_argument('infile', - type=str, - help='.npz input file') - parser_gender.add_argument('reference', - type=str, - help='Reference .npz, as previously created with newref') + parser_gender = subparsers.add_parser( + "gender", + description="Returns the gender of a .npz resulting from convert, " + "based on a Gaussian mixture model trained during the newref phase", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser_gender.add_argument("infile", type=str, help=".npz input file") + parser_gender.add_argument( + "reference", type=str, help="Reference .npz, as previously created with newref" + ) parser_gender.set_defaults(func=output_gender) - parser_test = subparsers.add_parser('predict', - description='Find copy number aberrations', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser_test.add_argument('infile', - type=str, - help='.npz input file') - parser_test.add_argument('reference', - type=str, - help='Reference .npz, as previously created with newref') - parser_test.add_argument('outid', - type=str, - help='Basename (w/o extension) of output files (paths are allowed, e.g. path/to/ID_1)') - parser_test.add_argument('--minrefbins', - type=int, - default=150, - help='Minimum amount of sensible reference bins per target bin.') - parser_test.add_argument('--maskrepeats', - type=int, - default=5, - help='Regions with distances > mean + sd * 3 will be masked. Number of masking cycles.') - parser_test.add_argument('--alpha', - type=float, - default=1e-4, - help='p-value cut-off for calling a CBS breakpoint.') - parser_test.add_argument('--zscore', - type=float, - default=5, - help='z-score cut-off for aberration calling.') - parser_test.add_argument('--beta', - type=float, - default=None, - help='When beta is given, --zscore is ignored and a ratio cut-off is used to call aberrations. Beta is a number between 0 (liberal) and 1 (conservative) and is optimally close to the purity.') - parser_test.add_argument('--blacklist', - type=str, - default=None, - help='Blacklist that masks regions in output, structure of header-less ' - 'file: chr...(/t)startpos(/t)endpos(/n)') - parser_test.add_argument('--gender', - type=str, - choices=["F", "M"], - help='Force WisecondorX to analyze this case as a male (M) or a female (F)') - parser_test.add_argument('--ylim', - type=str, - default='def', - help='y-axis limits for plotting. e.g. [-2,2]') - parser_test.add_argument('--bed', - action='store_true', - help='Outputs tab-delimited .bed files, containing the most important information') - parser_test.add_argument('--plot', - action='store_true', - help='Outputs .png plots') - parser_test.add_argument('--cairo', - action='store_true', - help='Uses cairo bitmap type for plotting. Might be necessary for certain setups.') - parser_test.add_argument("--add-plot-title", - action="store_true", - help="Add the output name as plot title") + parser_test = subparsers.add_parser( + "predict", + description="Find copy number aberrations", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser_test.add_argument("infile", type=str, help=".npz input file") + parser_test.add_argument( + "reference", type=str, help="Reference .npz, as previously created with newref" + ) + parser_test.add_argument( + "outid", + type=str, + help="Basename (w/o extension) of output files (paths are allowed, e.g. path/to/ID_1)", + ) + parser_test.add_argument( + "--minrefbins", + type=int, + default=150, + help="Minimum amount of sensible reference bins per target bin.", + ) + parser_test.add_argument( + "--maskrepeats", + type=int, + default=5, + help="Regions with distances > mean + sd * 3 will be masked. Number of masking cycles.", + ) + parser_test.add_argument( + "--alpha", + type=float, + default=1e-4, + help="p-value cut-off for calling a CBS breakpoint.", + ) + parser_test.add_argument( + "--zscore", + type=float, + default=5, + help="z-score cut-off for aberration calling.", + ) + parser_test.add_argument( + "--beta", + type=float, + default=None, + help="When beta is given, --zscore is ignored and a ratio cut-off is used to call aberrations. Beta is a number between 0 (liberal) and 1 (conservative) and is optimally close to the purity.", + ) + parser_test.add_argument( + "--blacklist", + type=str, + default=None, + help="Blacklist that masks regions in output, structure of header-less " + "file: chr...(/t)startpos(/t)endpos(/n)", + ) + parser_test.add_argument( + "--gender", + type=str, + choices=["F", "M"], + help="Force WisecondorX to analyze this case as a male (M) or a female (F)", + ) + parser_test.add_argument( + "--ylim", + type=str, + default="def", + help="y-axis limits for plotting. e.g. [-2,2]", + ) + parser_test.add_argument( + "--bed", + action="store_true", + help="Outputs tab-delimited .bed files, containing the most important information", + ) + parser_test.add_argument("--plot", action="store_true", help="Outputs .png plots") + parser_test.add_argument( + "--cairo", + action="store_true", + help="Uses cairo bitmap type for plotting. Might be necessary for certain setups.", + ) + parser_test.add_argument( + "--add-plot-title", + action="store_true", + help="Add the output name as plot title", + ) parser_test.set_defaults(func=tool_test) args = parser.parse_args(sys.argv[1:]) - logging.basicConfig(format='[%(levelname)s - %(asctime)s]: %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=getattr(logging, args.loglevel.upper(), None)) - logging.debug('args are: {}'.format(args)) + logging.basicConfig( + format="[%(levelname)s - %(asctime)s]: %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=getattr(logging, args.loglevel.upper(), None), + ) + logging.debug("args are: {}".format(args)) args.func(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/wisecondorX/newref_control.py b/wisecondorX/newref_control.py index 31091f5..fab4433 100644 --- a/wisecondorX/newref_control.py +++ b/wisecondorX/newref_control.py @@ -11,56 +11,58 @@ from wisecondorX.newref_tools import normalize_and_mask, train_pca, get_reference -''' +""" Outputs preparation files of read depth normalized data and contains PCA information to execute between- sample normalization during testing. Function is executed three times. Once for autosomes, once for XX gonosomes (if enough females are included) and once for XY gonosomes (if enough males are included). -''' +""" def tool_newref_prep(args, samples, gender, mask, bins_per_chr): - if gender == 'A': + if gender == "A": last_chr = 22 - elif gender == 'F': + elif gender == "F": last_chr = 23 else: last_chr = 24 bins_per_chr = bins_per_chr[:last_chr] - mask = mask[:np.sum(bins_per_chr)] + mask = mask[: np.sum(bins_per_chr)] masked_data = normalize_and_mask(samples, range(1, last_chr + 1), mask) pca_corrected_data, pca = train_pca(masked_data) - masked_bins_per_chr = [sum(mask[sum(bins_per_chr[:i]):sum(bins_per_chr[:i]) + x]) - for i, x in enumerate(bins_per_chr)] - masked_bins_per_chr_cum = [sum(masked_bins_per_chr[:x + 1]) - for x in range(len(masked_bins_per_chr))] - - np.savez_compressed(args.prepfile, - binsize=args.binsize, - gender=gender, - - mask=mask, - masked_data=masked_data, - - bins_per_chr=bins_per_chr, - masked_bins_per_chr=masked_bins_per_chr, - masked_bins_per_chr_cum=masked_bins_per_chr_cum, - - pca_corrected_data=pca_corrected_data, - pca_components=pca.components_, - pca_mean=pca.mean_) - - -''' + masked_bins_per_chr = [ + sum(mask[sum(bins_per_chr[:i]) : sum(bins_per_chr[:i]) + x]) + for i, x in enumerate(bins_per_chr) + ] + masked_bins_per_chr_cum = [ + sum(masked_bins_per_chr[: x + 1]) for x in range(len(masked_bins_per_chr)) + ] + + np.savez_compressed( + args.prepfile, + binsize=args.binsize, + gender=gender, + mask=mask, + masked_data=masked_data, + bins_per_chr=bins_per_chr, + masked_bins_per_chr=masked_bins_per_chr, + masked_bins_per_chr_cum=masked_bins_per_chr_cum, + pca_corrected_data=pca_corrected_data, + pca_components=pca.components_, + pca_mean=pca.mean_, + ) + + +""" Prepares subfiles if multi-threading is requested. Main file is split in 'cpus' subfiles, each subfile is processed by a separate thread. -''' +""" def tool_newref_main(args, cpus): @@ -80,84 +82,94 @@ def tool_newref_main(args, cpus): os.remove(args.prepfile) for part in range(1, cpus + 1): - os.remove('{}_{}.npz'.format(args.partfile, str(part))) + os.remove("{}_{}.npz".format(args.partfile, str(part))) -''' +""" Function executed once for each thread. Controls within-sample reference creation. -''' +""" def _tool_newref_part(args): if args.part[0] > args.part[1]: - logging.critical('Part should be smaller or equal to total parts:{} > {} is wrong' - .format(args.part[0], args.part[1])) + logging.critical( + "Part should be smaller or equal to total parts:{} > {} is wrong".format( + args.part[0], args.part[1] + ) + ) sys.exit() if args.part[0] < 0: logging.critical( - 'Part should be at least zero: {} < 0 is wrong'.format(args.part[0])) + "Part should be at least zero: {} < 0 is wrong".format(args.part[0]) + ) sys.exit() - npzdata = np.load(args.prepfile, encoding='latin1', allow_pickle=True) - pca_corrected_data = npzdata['pca_corrected_data'] - masked_bins_per_chr = npzdata['masked_bins_per_chr'] - masked_bins_per_chr_cum = npzdata['masked_bins_per_chr_cum'] - - indexes, distances, null_ratios = get_reference(pca_corrected_data, masked_bins_per_chr, masked_bins_per_chr_cum, - ref_size=args.refsize, part=args.part[0], split_parts=args.part[1]) - - np.savez_compressed('{}_{}.npz'.format(args.partfile, str(args.part[0])), - indexes=indexes, - distances=distances, - null_ratios=null_ratios) - - -''' + npzdata = np.load(args.prepfile, encoding="latin1", allow_pickle=True) + pca_corrected_data = npzdata["pca_corrected_data"] + masked_bins_per_chr = npzdata["masked_bins_per_chr"] + masked_bins_per_chr_cum = npzdata["masked_bins_per_chr_cum"] + + indexes, distances, null_ratios = get_reference( + pca_corrected_data, + masked_bins_per_chr, + masked_bins_per_chr_cum, + ref_size=args.refsize, + part=args.part[0], + split_parts=args.part[1], + ) + + np.savez_compressed( + "{}_{}.npz".format(args.partfile, str(args.part[0])), + indexes=indexes, + distances=distances, + null_ratios=null_ratios, + ) + + +""" Merges separate subfiles (one for each thread) to a new temporary output file. -''' +""" def tool_newref_post(args, cpus): - npzdata_prep = np.load(args.prepfile, encoding='latin1', allow_pickle=True) + npzdata_prep = np.load(args.prepfile, encoding="latin1", allow_pickle=True) big_indexes = [] big_distances = [] big_null_ratios = [] for part in range(1, cpus + 1): - infile = '{}_{}.npz'.format(args.partfile, str(part)) - npzdata_part = np.load(infile, encoding='latin1') - big_indexes.extend(npzdata_part['indexes']) - big_distances.extend(npzdata_part['distances']) - big_null_ratios.extend(npzdata_part['null_ratios']) + infile = "{}_{}.npz".format(args.partfile, str(part)) + npzdata_part = np.load(infile, encoding="latin1") + big_indexes.extend(npzdata_part["indexes"]) + big_distances.extend(npzdata_part["distances"]) + big_null_ratios.extend(npzdata_part["null_ratios"]) indexes = np.array(big_indexes) distances = np.array(big_distances) null_ratios = np.array(big_null_ratios) - np.savez_compressed(args.tmpoutfile, - binsize=npzdata_prep['binsize'].item(), - gender=npzdata_prep['gender'].item(), - - mask=npzdata_prep['mask'], - - bins_per_chr=npzdata_prep['bins_per_chr'], - masked_bins_per_chr=npzdata_prep['masked_bins_per_chr'], - masked_bins_per_chr_cum=npzdata_prep['masked_bins_per_chr_cum'], - - pca_components=npzdata_prep['pca_components'], - pca_mean=npzdata_prep['pca_mean'], - - indexes=indexes, - distances=distances, - null_ratios=null_ratios) - - -''' + np.savez_compressed( + args.tmpoutfile, + binsize=npzdata_prep["binsize"].item(), + gender=npzdata_prep["gender"].item(), + mask=npzdata_prep["mask"], + bins_per_chr=npzdata_prep["bins_per_chr"], + masked_bins_per_chr=npzdata_prep["masked_bins_per_chr"], + masked_bins_per_chr_cum=npzdata_prep["masked_bins_per_chr_cum"], + pca_components=npzdata_prep["pca_components"], + pca_mean=npzdata_prep["pca_mean"], + indexes=indexes, + distances=distances, + null_ratios=null_ratios, + ) + + +""" Tries to remove text file, when it is busy, until becomes successful. This function, prevents OSError: [Errno 26] Text file busy... -''' +""" def force_remove(file_id): @@ -167,32 +179,36 @@ def force_remove(file_id): os.remove(file_id) break except: - print('Attemp #{}: Cannot remove {}, because it is busy, trying again...'.format(attemp, file_id)) + print( + "Attemp #{}: Cannot remove {}, because it is busy, trying again...".format( + attemp, file_id + ) + ) attemp = attemp + 1 time.sleep(5) -''' +""" Merges separate subfiles (A, F, M) to one final reference file. -''' +""" def tool_newref_merge(args, outfiles, trained_cutoff): - final_ref = {'has_female': False, 'has_male': False} + final_ref = {"has_female": False, "has_male": False} for file_id in outfiles: - npz_file = np.load(file_id, encoding='latin1', allow_pickle=True) - gender = str(npz_file['gender']) - for component in [x for x in npz_file.keys() if x != 'gender']: - if gender == 'F': - final_ref['has_female'] = True - final_ref['{}.F'.format(str(component))] = npz_file[component] - elif gender == 'M': - final_ref['has_male'] = True - final_ref['{}.M'.format(str(component))] = npz_file[component] + npz_file = np.load(file_id, encoding="latin1", allow_pickle=True) + gender = str(npz_file["gender"]) + for component in [x for x in npz_file.keys() if x != "gender"]: + if gender == "F": + final_ref["has_female"] = True + final_ref["{}.F".format(str(component))] = npz_file[component] + elif gender == "M": + final_ref["has_male"] = True + final_ref["{}.M".format(str(component))] = npz_file[component] else: final_ref[str(component)] = npz_file[component] force_remove(file_id) - final_ref['is_nipt'] = args.nipt - final_ref['trained_cutoff'] = trained_cutoff + final_ref["is_nipt"] = args.nipt + final_ref["trained_cutoff"] = trained_cutoff np.savez_compressed(args.outfile, **final_ref) diff --git a/wisecondorX/newref_tools.py b/wisecondorX/newref_tools.py index 4f03567..e0149f5 100644 --- a/wisecondorX/newref_tools.py +++ b/wisecondorX/newref_tools.py @@ -9,36 +9,46 @@ from sklearn.decomposition import PCA from sklearn.mixture import GaussianMixture -''' +""" A Gaussian mixture model is fitted against all one-dimensional reference y-fractions. Two components are expected: one for males, and one for females. The local minimum will serve as the cut-off point. -''' +""" def train_gender_model(args, samples): - genders = np.empty(len(samples), dtype='object') + genders = np.empty(len(samples), dtype="object") y_fractions = [] for sample in samples: - y_fractions.append(float(np.sum(sample['24'])) / float(np.sum([np.sum(sample[x]) for x in sample.keys()]))) + y_fractions.append( + float(np.sum(sample["24"])) + / float(np.sum([np.sum(sample[x]) for x in sample.keys()])) + ) y_fractions = np.array(y_fractions) - gmm = GaussianMixture(n_components=2, covariance_type='full', reg_covar=1e-99, max_iter=10000, tol=1e-99) + gmm = GaussianMixture( + n_components=2, + covariance_type="full", + reg_covar=1e-99, + max_iter=10000, + tol=1e-99, + ) gmm.fit(X=y_fractions.reshape(-1, 1)) gmm_x = np.linspace(0, 0.02, 5000) gmm_y = np.exp(gmm.score_samples(gmm_x.reshape(-1, 1))) if args.plotyfrac is not None: import matplotlib.pyplot as plt + fig, ax = plt.subplots(figsize=(16, 6)) ax.hist(y_fractions, bins=100, density=True) - ax.plot(gmm_x, gmm_y, 'r-', label='Gaussian mixture fit') + ax.plot(gmm_x, gmm_y, "r-", label="Gaussian mixture fit") ax.set_xlim([0, 0.02]) - ax.legend(loc='best') + ax.legend(loc="best") plt.savefig(args.plotyfrac) - logging.info('Image written to {}, now quitting ...'.format(args.plotyfrac)) + logging.info("Image written to {}, now quitting ...".format(args.plotyfrac)) exit() if args.yfrac is not None: @@ -50,18 +60,18 @@ def train_gender_model(args, samples): local_min_i = argrelextrema(sorted_gmm_y, np.less) cut_off = gmm_x[local_min_i][0] - logging.info('Determined --yfrac cutoff: {}'.format(str(round(cut_off, 4)))) + logging.info("Determined --yfrac cutoff: {}".format(str(round(cut_off, 4)))) - genders[y_fractions > cut_off] = 'M' - genders[y_fractions < cut_off] = 'F' + genders[y_fractions > cut_off] = "M" + genders[y_fractions < cut_off] = "F" return genders.tolist(), cut_off -''' +""" Finds mask (locations of bins without data) in the subset 'samples'. -''' +""" def get_mask(samples): @@ -89,9 +99,9 @@ def get_mask(samples): return mask, bins_per_chr -''' +""" Normalizes samples for read depth and applies mask. -''' +""" def normalize_and_mask(samples, chrs, mask): @@ -116,10 +126,10 @@ def normalize_and_mask(samples, chrs, mask): return masked_data -''' +""" Executes PCA. Rotations are saved which enable between sample normalization in the test phase. -''' +""" def train_pca(ref_data, pcacomp=5): @@ -134,21 +144,30 @@ def train_pca(ref_data, pcacomp=5): return corrected.T, pca -''' +""" Calculates within-sample reference. -''' +""" -def get_reference(pca_corrected_data, masked_bins_per_chr, masked_bins_per_chr_cum, - ref_size, part, split_parts): +def get_reference( + pca_corrected_data, + masked_bins_per_chr, + masked_bins_per_chr_cum, + ref_size, + part, + split_parts, +): big_indexes = [] big_distances = [] bincount = masked_bins_per_chr_cum[-1] start_num, end_num = _get_part(part - 1, split_parts, bincount) - logging.info('Working on thread {} of {}, meaning bins {} up to {}' - .format(part, split_parts, start_num, end_num)) + logging.info( + "Working on thread {} of {}, meaning bins {} up to {}".format( + part, split_parts, start_num, end_num + ) + ) regions = _split_by_chr(start_num, end_num, masked_bins_per_chr_cum) for region in regions: @@ -167,22 +186,33 @@ def get_reference(pca_corrected_data, masked_bins_per_chr, masked_bins_per_chr_c big_indexes.extend(part_indexes) big_distances.extend(part_distances) continue - chr_data = np.concatenate((pca_corrected_data[:masked_bins_per_chr_cum[chr] - - masked_bins_per_chr[chr], :], - pca_corrected_data[masked_bins_per_chr_cum[chr]:, :])) - - part_indexes, part_distances = get_ref_for_bins(ref_size, start, end, - pca_corrected_data, chr_data) + chr_data = np.concatenate( + ( + pca_corrected_data[ + : masked_bins_per_chr_cum[chr] - masked_bins_per_chr[chr], : + ], + pca_corrected_data[masked_bins_per_chr_cum[chr] :, :], + ) + ) + + part_indexes, part_distances = get_ref_for_bins( + ref_size, start, end, pca_corrected_data, chr_data + ) big_indexes.extend(part_indexes) big_distances.extend(part_distances) index_array = np.array(big_indexes) distance_array = np.array(big_distances) - null_ratio_array = np.zeros((len(distance_array), min(len(pca_corrected_data[0]), 100))) + null_ratio_array = np.zeros( + (len(distance_array), min(len(pca_corrected_data[0]), 100)) + ) samples = np.transpose(pca_corrected_data) for null_i, case_i in enumerate( - random.sample(range(len(pca_corrected_data[0])), min(len(pca_corrected_data[0]), 100))): + random.sample( + range(len(pca_corrected_data[0])), min(len(pca_corrected_data[0]), 100) + ) + ): sample = samples[case_i] for bin_i in list(range(len(sample)))[start_num:end_num]: ref = sample[index_array[bin_i - start_num]] @@ -214,9 +244,9 @@ def _get_part(partnum, outof, bincount): return start_bin, end_bin -''' +""" Calculates within-sample reference for a particular chromosome. -''' +""" def get_ref_for_bins(ref_size, start, end, pca_corrected_data, chr_data): diff --git a/wisecondorX/overall_tools.py b/wisecondorX/overall_tools.py index 078b63d..cc3ee11 100644 --- a/wisecondorX/overall_tools.py +++ b/wisecondorX/overall_tools.py @@ -9,10 +9,10 @@ import numpy as np -''' +""" Scales the bin size of a sample.npz to the one requested for the reference -''' +""" def scale_sample(sample, from_size, to_size): @@ -20,8 +20,11 @@ def scale_sample(sample, from_size, to_size): return sample if to_size == 0 or from_size == 0 or to_size < from_size or to_size % from_size > 0: - logging.critical('Impossible binsize scaling requested: {} to {}'.format(int(from_size), - int(to_size))) + logging.critical( + "Impossible binsize scaling requested: {} to {}".format( + int(from_size), int(to_size) + ) + ) sys.exit() return_sample = dict() @@ -31,90 +34,99 @@ def scale_sample(sample, from_size, to_size): new_len = int(np.ceil(len(chr_data) / float(scale))) scaled_chr = np.zeros(new_len, dtype=np.int32) for i in range(new_len): - scaled_chr[i] = np.sum(chr_data[int(i * scale):int(i * scale + scale)]) + scaled_chr[i] = np.sum(chr_data[int(i * scale) : int(i * scale + scale)]) return_sample[chr_name] = scaled_chr return return_sample -''' +""" Levels gonosomal reads with the one at the autosomes. -''' +""" def gender_correct(sample, gender): - if gender == 'M': - sample['23'] = sample['23'] * 2 - sample['24'] = sample['24'] * 2 + if gender == "M": + sample["23"] = sample["23"] * 2 + sample["24"] = sample["24"] * 2 return sample -''' +""" Communicates with R. Outputs new json dictionary, resulting from R, if 'outfile' is a key in the input json. 'infile' and 'R_script' are mandatory keys and correspond to the input file required to execute the R_script, respectively. -''' +""" def exec_R(json_dict): - json.dump(json_dict, open(json_dict['infile'], 'w')) + json.dump(json_dict, open(json_dict["infile"], "w")) - r_cmd = ['Rscript', json_dict['R_script'], - '--infile', json_dict['infile']] - logging.debug('CBS cmd: {}'.format(r_cmd)) + r_cmd = ["Rscript", json_dict["R_script"], "--infile", json_dict["infile"]] + logging.debug("CBS cmd: {}".format(r_cmd)) try: subprocess.check_call(r_cmd) except subprocess.CalledProcessError as e: - logging.critical('Rscript failed: {}'.format(e)) + logging.critical("Rscript failed: {}".format(e)) sys.exit() - os.remove(json_dict['infile']) - if 'outfile' in json_dict.keys(): - json_out = json.load(open(json_dict['outfile'])) - os.remove(json_dict['outfile']) + os.remove(json_dict["infile"]) + if "outfile" in json_dict.keys(): + json_out = json.load(open(json_dict["outfile"])) + os.remove(json_dict["outfile"]) return json_out -''' +""" Calculates between sample z-score. -''' +""" def get_z_score(results_c, results): - results_nr, results_r, results_w = results['results_nr'], results['results_r'], results['results_w'] + results_nr, results_r, results_w = ( + results["results_nr"], + results["results_r"], + results["results_w"], + ) zs = [] for segment in results_c: - segment_nr = results_nr[segment[0]][segment[1]:segment[2]] - segment_rr = results_r[segment[0]][segment[1]:segment[2]] - segment_nr = [segment_nr[i] for i in range(len(segment_nr)) if segment_rr[i] != 0] + segment_nr = results_nr[segment[0]][segment[1] : segment[2]] + segment_rr = results_r[segment[0]][segment[1] : segment[2]] + segment_nr = [ + segment_nr[i] for i in range(len(segment_nr)) if segment_rr[i] != 0 + ] for i in range(len(segment_nr)): for ii in range(len(segment_nr[i])): if not np.isfinite(segment_nr[i][ii]): segment_nr[i][ii] = np.nan - segment_w = results_w[segment[0]][segment[1]:segment[2]] + segment_w = results_w[segment[0]][segment[1] : segment[2]] segment_w = [segment_w[i] for i in range(len(segment_w)) if segment_rr[i] != 0] - null_segments = [np.ma.average(np.ma.masked_array(x, np.isnan(x)), weights=segment_w) for x in np.transpose(segment_nr)] + null_segments = [ + np.ma.average(np.ma.masked_array(x, np.isnan(x)), weights=segment_w) + for x in np.transpose(segment_nr) + ] null_mean = np.ma.mean([x for x in null_segments if np.isfinite(x)]) null_sd = np.ma.std([x for x in null_segments if np.isfinite(x)]) z = (segment[3] - null_mean) / null_sd - z = min(z, 1000) ; z = max(z, -1000) + z = min(z, 1000) + z = max(z, -1000) if math.isnan(null_mean) or math.isnan(null_sd): - z = 'nan' + z = "nan" zs.append(z) return zs -''' +""" Returns MSV, measure for sample-wise noise. -''' +""" def get_median_segment_variance(results_c, results_r): vars = [] for segment in results_c: - segment_r = results_r[segment[0]][int(segment[1]):int(segment[2])] + segment_r = results_r[segment[0]][int(segment[1]) : int(segment[2])] segment_r = [x for x in segment_r if x != 0] if segment_r: var = np.var(segment_r) @@ -122,14 +134,14 @@ def get_median_segment_variance(results_c, results_r): return np.median(vars) -''' +""" Returns CPA, measure for sample-wise abnormality. -''' +""" def get_cpa(results_c, binsize): x = 0 for segment in results_c: x += (segment[2] - segment[1] + 1) * binsize * abs(segment[3]) - CPA = x / len(results_c) * (10 ** -8) - return(CPA) \ No newline at end of file + CPA = x / len(results_c) * (10**-8) + return CPA diff --git a/wisecondorX/predict_control.py b/wisecondorX/predict_control.py index 8c1d574..5bc2224 100644 --- a/wisecondorX/predict_control.py +++ b/wisecondorX/predict_control.py @@ -1,51 +1,63 @@ # WisecondorX -from wisecondorX.predict_tools import coverage_normalize_and_mask, project_pc, \ - get_optimal_cutoff, get_weights, normalize_repeat, inflate_results - -''' +from wisecondorX.predict_tools import ( + coverage_normalize_and_mask, + project_pc, + get_optimal_cutoff, + get_weights, + normalize_repeat, + inflate_results, +) + +""" Control function that executes following normalization strategies: - coverage normalization - between-sample normalization - within-sample normalization -''' +""" def normalize(args, sample, ref_file, ref_gender): - if ref_gender == 'A': - ap = '' + if ref_gender == "A": + ap = "" cp = 0 ct = 0 else: - ap = '.{}'.format(ref_gender) + ap = ".{}".format(ref_gender) cp = 22 - ct = ref_file['masked_bins_per_chr_cum{}'.format(ap)][cp - 1] + ct = ref_file["masked_bins_per_chr_cum{}".format(ap)][cp - 1] sample = coverage_normalize_and_mask(sample, ref_file, ap) sample = project_pc(sample, ref_file, ap) results_w = get_weights(ref_file, ap)[ct:] optimal_cutoff = get_optimal_cutoff(ref_file, args.maskrepeats) - results_z, results_r, ref_sizes, m_lr, m_z = normalize_repeat(sample, ref_file, optimal_cutoff, ct, cp, ap) + results_z, results_r, ref_sizes, m_lr, m_z = normalize_repeat( + sample, ref_file, optimal_cutoff, ct, cp, ap + ) return results_r, results_z, results_w, ref_sizes, m_lr, m_z -''' +""" Function processes a result (e.g. results_r) to an easy-to-interpret format. Bins without information are set to 0. -''' +""" def get_post_processed_result(args, result, ref_sizes, rem_input): - infinite_mask = (ref_sizes < args.minrefbins) + infinite_mask = ref_sizes < args.minrefbins result[infinite_mask] = 0 inflated_results = inflate_results(result, rem_input) final_results = [] - for chr in range(len(rem_input['bins_per_chr'])): - chr_data = inflated_results[sum(rem_input['bins_per_chr'][:chr]):sum(rem_input['bins_per_chr'][:chr + 1])] + for chr in range(len(rem_input["bins_per_chr"])): + chr_data = inflated_results[ + sum(rem_input["bins_per_chr"][:chr]) : sum( + rem_input["bins_per_chr"][: chr + 1] + ) + ] final_results.append(chr_data) return final_results diff --git a/wisecondorX/predict_output.py b/wisecondorX/predict_output.py index 4586966..b295a0b 100644 --- a/wisecondorX/predict_output.py +++ b/wisecondorX/predict_output.py @@ -4,43 +4,47 @@ import numpy as np -from wisecondorX.overall_tools import exec_R, get_z_score, get_median_segment_variance, get_cpa - -''' +from wisecondorX.overall_tools import ( + exec_R, + get_z_score, + get_median_segment_variance, + get_cpa, +) + +""" Writes plots. -''' +""" def exec_write_plots(rem_input, results): - json_plot_dir = os.path.abspath(rem_input['args'].outid + '_plot_tmp') + json_plot_dir = os.path.abspath(rem_input["args"].outid + "_plot_tmp") json_dict = { - 'R_script': str('{}/include/plotter.R'.format(rem_input['wd'])), - 'ref_gender': str(rem_input['ref_gender']), - 'beta': str(rem_input['args'].beta), - 'zscore': str(rem_input['args'].zscore), - 'binsize': str(rem_input['binsize']), - 'n_reads': str(rem_input['n_reads']), - 'cairo': str(rem_input['args'].cairo), - 'results_r': results['results_r'], - 'results_w': results['results_w'], - 'results_c': results['results_c'], - 'ylim': str(rem_input['args'].ylim), - 'infile': str('{}.json'.format(json_plot_dir)), - 'out_dir': str('{}.plots'.format(rem_input['args'].outid)), + "R_script": str("{}/include/plotter.R".format(rem_input["wd"])), + "ref_gender": str(rem_input["ref_gender"]), + "beta": str(rem_input["args"].beta), + "zscore": str(rem_input["args"].zscore), + "binsize": str(rem_input["binsize"]), + "n_reads": str(rem_input["n_reads"]), + "cairo": str(rem_input["args"].cairo), + "results_r": results["results_r"], + "results_w": results["results_w"], + "results_c": results["results_c"], + "ylim": str(rem_input["args"].ylim), + "infile": str("{}.json".format(json_plot_dir)), + "out_dir": str("{}.plots".format(rem_input["args"].outid)), } if rem_input["args"].add_plot_title: # Strip away paths from the outid if need be - json_dict["plot_title"] = str( - os.path.basename(rem_input["args"].outid)) + json_dict["plot_title"] = str(os.path.basename(rem_input["args"].outid)) exec_R(json_dict) -''' +""" Calculates zz-scores, marks aberrations and writes tables. -''' +""" def generate_output_tables(rem_input, results): @@ -50,66 +54,83 @@ def generate_output_tables(rem_input, results): def _generate_bins_bed(rem_input, results): - bins_file = open('{}_bins.bed'.format(rem_input['args'].outid), 'w') - bins_file.write('chr\tstart\tend\tid\tratio\tzscore\n') - results_r = results['results_r'] - results_z = results['results_z'] - binsize = rem_input['binsize'] + bins_file = open("{}_bins.bed".format(rem_input["args"].outid), "w") + bins_file.write("chr\tstart\tend\tid\tratio\tzscore\n") + results_r = results["results_r"] + results_z = results["results_z"] + binsize = rem_input["binsize"] for chr in range(len(results_r)): chr_name = str(chr + 1) - if chr_name == '23': - chr_name = 'X' - if chr_name == '24': - chr_name = 'Y' + if chr_name == "23": + chr_name = "X" + if chr_name == "24": + chr_name = "Y" feat = 1 for i in range(len(results_r[chr])): r = results_r[chr][i] z = results_z[chr][i] if r == 0: - r = 'nan' + r = "nan" if z == 0: - z = 'nan' - feat_str = '{}:{}-{}'.format(chr_name, str(feat), str(feat + binsize - 1)) + z = "nan" + feat_str = "{}:{}-{}".format(chr_name, str(feat), str(feat + binsize - 1)) row = [chr_name, feat, feat + binsize - 1, feat_str, r, z] - bins_file.write('{}\n'.format('\t'.join([str(x) for x in row]))) + bins_file.write("{}\n".format("\t".join([str(x) for x in row]))) feat += binsize bins_file.close() def _generate_segments_and_aberrations_bed(rem_input, results): - segments_file = open('{}_segments.bed'.format(rem_input['args'].outid), 'w') - abberations_file = open('{}_aberrations.bed'.format(rem_input['args'].outid), 'w') - segments_file.write('chr\tstart\tend\tratio\tzscore\n') - abberations_file.write('chr\tstart\tend\tratio\tzscore\ttype\n') + segments_file = open("{}_segments.bed".format(rem_input["args"].outid), "w") + abberations_file = open("{}_aberrations.bed".format(rem_input["args"].outid), "w") + segments_file.write("chr\tstart\tend\tratio\tzscore\n") + abberations_file.write("chr\tstart\tend\tratio\tzscore\ttype\n") - for segment in results['results_c']: + for segment in results["results_c"]: chr_name = str(segment[0] + 1) - if chr_name == '23': - chr_name = 'X' - if chr_name == '24': - chr_name = 'Y' - row = [chr_name, - int(segment[1] * rem_input['binsize'] + 1), - int(segment[2] * rem_input['binsize']), - segment[4], segment[3]] - segments_file.write('{}\n'.format('\t'.join([str(x) for x in row]))) + if chr_name == "23": + chr_name = "X" + if chr_name == "24": + chr_name = "Y" + row = [ + chr_name, + int(segment[1] * rem_input["binsize"] + 1), + int(segment[2] * rem_input["binsize"]), + segment[4], + segment[3], + ] + segments_file.write("{}\n".format("\t".join([str(x) for x in row]))) ploidy = 2 - if (chr_name == 'X' or chr_name == 'Y') and rem_input['ref_gender'] == 'M': + if (chr_name == "X" or chr_name == "Y") and rem_input["ref_gender"] == "M": ploidy = 1 - if rem_input['args'].beta is not None: - if float(segment[4]) > __get_aberration_cutoff(rem_input['args'].beta, ploidy)[1]: - abberations_file.write('{}\tgain\n'.format('\t'.join([str(x) for x in row]))) - elif float(segment[4]) < __get_aberration_cutoff(rem_input['args'].beta, ploidy)[0]: - abberations_file.write('{}\tloss\n'.format('\t'.join([str(x) for x in row]))) + if rem_input["args"].beta is not None: + if ( + float(segment[4]) + > __get_aberration_cutoff(rem_input["args"].beta, ploidy)[1] + ): + abberations_file.write( + "{}\tgain\n".format("\t".join([str(x) for x in row])) + ) + elif ( + float(segment[4]) + < __get_aberration_cutoff(rem_input["args"].beta, ploidy)[0] + ): + abberations_file.write( + "{}\tloss\n".format("\t".join([str(x) for x in row])) + ) elif isinstance(segment[3], str): continue else: - if float(segment[3]) > rem_input['args'].zscore: - abberations_file.write('{}\tgain\n'.format('\t'.join([str(x) for x in row]))) - elif float(segment[3]) < - rem_input['args'].zscore: - abberations_file.write('{}\tloss\n'.format('\t'.join([str(x) for x in row]))) + if float(segment[3]) > rem_input["args"].zscore: + abberations_file.write( + "{}\tgain\n".format("\t".join([str(x) for x in row])) + ) + elif float(segment[3]) < -rem_input["args"].zscore: + abberations_file.write( + "{}\tloss\n".format("\t".join([str(x) for x in row])) + ) segments_file.close() abberations_file.close() @@ -122,48 +143,69 @@ def __get_aberration_cutoff(beta, ploidy): def _generate_chr_statistics_file(rem_input, results): - stats_file = open('{}_statistics.txt'.format(rem_input['args'].outid), 'w') - stats_file.write('chr\tratio.mean\tratio.median\tzscore\n') - chr_ratio_means = [np.ma.average(results['results_r'][chr], weights=results['results_w'][chr]) - for chr in range(len(results['results_r']))] - chr_ratio_medians = [np.median([x for x in results['results_r'][chr] if x != 0]) - for chr in range(len(results['results_r']))] - - results_c_chr = [[x, 0, rem_input['bins_per_chr'][x] - 1, chr_ratio_means[x]] - for x in range(len(results['results_r']))] - - msv = round(get_median_segment_variance(results['results_c'], results['results_r']), 5) - cpa = round(get_cpa(results['results_c'], rem_input['binsize']), 5) + stats_file = open("{}_statistics.txt".format(rem_input["args"].outid), "w") + stats_file.write("chr\tratio.mean\tratio.median\tzscore\n") + chr_ratio_means = [ + np.ma.average(results["results_r"][chr], weights=results["results_w"][chr]) + for chr in range(len(results["results_r"])) + ] + chr_ratio_medians = [ + np.median([x for x in results["results_r"][chr] if x != 0]) + for chr in range(len(results["results_r"])) + ] + + results_c_chr = [ + [x, 0, rem_input["bins_per_chr"][x] - 1, chr_ratio_means[x]] + for x in range(len(results["results_r"])) + ] + + msv = round( + get_median_segment_variance(results["results_c"], results["results_r"]), 5 + ) + cpa = round(get_cpa(results["results_c"], rem_input["binsize"]), 5) chr_z_scores = get_z_score(results_c_chr, results) - for chr in range(len(results['results_r'])): + for chr in range(len(results["results_r"])): chr_name = str(chr + 1) - if chr_name == '23': - chr_name = 'X' - if chr_name == '24': - chr_name = 'Y' - - row = [chr_name, - chr_ratio_means[chr], - chr_ratio_medians[chr], - chr_z_scores[chr]] - - stats_file.write('\t'.join([str(x) for x in row]) + '\n') - - stats_file.write('Gender based on --yfrac (or manually overridden by --gender): {}\n' - .format(str(rem_input['gender']))) - - stats_file.write('Number of reads: {}\n' - .format(str(rem_input['n_reads']))) - - stats_file.write('Standard deviation of the ratios per chromosome: {}\n' - .format(str(round(float(np.nanstd(chr_ratio_means)), 5)))) - - stats_file.write('Median segment variance per bin (doi: 10.1093/nar/gky1263): {}\n' - .format(str(msv))) - - stats_file.write('Copy number profile abnormality (CPA) score (doi: 10.1186/s13073-020-00735-4): {}\n' - .format(str(cpa))) + if chr_name == "23": + chr_name = "X" + if chr_name == "24": + chr_name = "Y" + + row = [ + chr_name, + chr_ratio_means[chr], + chr_ratio_medians[chr], + chr_z_scores[chr], + ] + + stats_file.write("\t".join([str(x) for x in row]) + "\n") + + stats_file.write( + "Gender based on --yfrac (or manually overridden by --gender): {}\n".format( + str(rem_input["gender"]) + ) + ) + + stats_file.write("Number of reads: {}\n".format(str(rem_input["n_reads"]))) + + stats_file.write( + "Standard deviation of the ratios per chromosome: {}\n".format( + str(round(float(np.nanstd(chr_ratio_means)), 5)) + ) + ) + + stats_file.write( + "Median segment variance per bin (doi: 10.1093/nar/gky1263): {}\n".format( + str(msv) + ) + ) + + stats_file.write( + "Copy number profile abnormality (CPA) score (doi: 10.1186/s13073-020-00735-4): {}\n".format( + str(cpa) + ) + ) stats_file.close() diff --git a/wisecondorX/predict_tools.py b/wisecondorX/predict_tools.py index d659c52..91a0548 100644 --- a/wisecondorX/predict_tools.py +++ b/wisecondorX/predict_tools.py @@ -8,51 +8,55 @@ from wisecondorX.overall_tools import exec_R, get_z_score -''' +""" Returns gender based on Gaussian mixture model trained during newref phase. -''' +""" def predict_gender(sample, trained_cutoff): - Y_fraction = float(np.sum(sample['24'])) / float(np.sum([np.sum(sample[x]) for x in sample.keys()])) + Y_fraction = float(np.sum(sample["24"])) / float( + np.sum([np.sum(sample[x]) for x in sample.keys()]) + ) if Y_fraction > trained_cutoff: - return 'M' + return "M" else: - return 'F' + return "F" -''' +""" Normalize sample for read depth and apply mask. -''' +""" def coverage_normalize_and_mask(sample, ref_file, ap): by_chr = [] - chrs = range(1, len(ref_file['bins_per_chr{}'.format(ap)]) + 1) + chrs = range(1, len(ref_file["bins_per_chr{}".format(ap)]) + 1) for chr in chrs: - this_chr = np.zeros(ref_file['bins_per_chr{}'.format(ap)][chr - 1], dtype=float) - min_len = min(ref_file['bins_per_chr{}'.format(ap)][chr - 1], len(sample[str(chr)])) + this_chr = np.zeros(ref_file["bins_per_chr{}".format(ap)][chr - 1], dtype=float) + min_len = min( + ref_file["bins_per_chr{}".format(ap)][chr - 1], len(sample[str(chr)]) + ) this_chr[:min_len] = sample[str(chr)][:min_len] by_chr.append(this_chr) all_data = np.concatenate(by_chr, axis=0) all_data = all_data / np.sum(all_data) - masked_data = all_data[ref_file['mask{}'.format(ap)]] + masked_data = all_data[ref_file["mask{}".format(ap)]] return masked_data -''' +""" Project test sample to PCA space. -''' +""" def project_pc(sample_data, ref_file, ap): - pca = PCA(n_components=ref_file['pca_components{}'.format(ap)].shape[0]) - pca.components_ = ref_file['pca_components{}'.format(ap)] - pca.mean_ = ref_file['pca_mean{}'.format(ap)] + pca = PCA(n_components=ref_file["pca_components{}".format(ap)].shape[0]) + pca.components_ = ref_file["pca_components{}".format(ap)] + pca.mean_ = ref_file["pca_mean{}".format(ap)] transform = pca.transform(np.array([sample_data])) @@ -61,15 +65,15 @@ def project_pc(sample_data, ref_file, ap): return sample_data / reconstructed -''' +""" Defines cutoff that will add bins to a blacklist depending on the within reference distances. -''' +""" def get_optimal_cutoff(ref_file, repeats): - distances = ref_file['distances'] - cutoff = float('inf') + distances = ref_file["distances"] + cutoff = float("inf") for i in range(0, repeats): mask = distances < cutoff average = np.average(distances[mask]) @@ -78,13 +82,13 @@ def get_optimal_cutoff(ref_file, repeats): return cutoff -''' +""" Within sample normalization. Cycles through a number of repeats where z-scores define whether a bin is seen as 'normal' in a sample (in most cases this means 'non-aberrant') -- if it is, it can be used as a reference for the other bins. -''' +""" def normalize_repeat(test_data, ref_file, optimal_cutoff, ct, cp, ap): @@ -93,8 +97,9 @@ def normalize_repeat(test_data, ref_file, optimal_cutoff, ct, cp, ap): ref_sizes = None test_copy = np.copy(test_data) for i in range(3): - results_z, results_r, ref_sizes = _normalize_once(test_data, test_copy, ref_file, - optimal_cutoff, ct, cp, ap) + results_z, results_r, ref_sizes = _normalize_once( + test_data, test_copy, ref_file, optimal_cutoff, ct, cp, ap + ) test_copy[ct:][np.abs(results_z) >= norm.ppf(0.99)] = -1 m_lr = np.nanmedian(np.log2(results_r)) @@ -104,13 +109,13 @@ def normalize_repeat(test_data, ref_file, optimal_cutoff, ct, cp, ap): def _normalize_once(test_data, test_copy, ref_file, optimal_cutoff, ct, cp, ap): - masked_bins_per_chr = ref_file['masked_bins_per_chr{}'.format(ap)] - masked_bins_per_chr_cum = ref_file['masked_bins_per_chr_cum{}'.format(ap)] + masked_bins_per_chr = ref_file["masked_bins_per_chr{}".format(ap)] + masked_bins_per_chr_cum = ref_file["masked_bins_per_chr_cum{}".format(ap)] results_z = np.zeros(masked_bins_per_chr_cum[-1])[ct:] results_r = np.zeros(masked_bins_per_chr_cum[-1])[ct:] ref_sizes = np.zeros(masked_bins_per_chr_cum[-1])[ct:] - indexes = ref_file['indexes{}'.format(ap)] - distances = ref_file['distances{}'.format(ap)] + indexes = ref_file["indexes{}".format(ap)] + distances = ref_file["distances{}".format(ap)] i = ct i2 = 0 @@ -118,8 +123,11 @@ def _normalize_once(test_data, test_copy, ref_file, optimal_cutoff, ct, cp, ap): start = masked_bins_per_chr_cum[chr] - masked_bins_per_chr[chr] end = masked_bins_per_chr_cum[chr] chr_data = np.concatenate( - (test_copy[:masked_bins_per_chr_cum[chr] - masked_bins_per_chr[chr]], - test_copy[masked_bins_per_chr_cum[chr]:])) + ( + test_copy[: masked_bins_per_chr_cum[chr] - masked_bins_per_chr[chr]], + test_copy[masked_bins_per_chr_cum[chr] :], + ) + ) for index in indexes[start:end]: ref_data = chr_data[index[distances[i] < optimal_cutoff]] @@ -134,61 +142,61 @@ def _normalize_once(test_data, test_copy, ref_file, optimal_cutoff, ct, cp, ap): return results_z, results_r, ref_sizes -''' +""" The means of sets of within-sample reference distances can serve as inverse weights for CBS, Z-scoring and plotting. -''' +""" def get_weights(ref_file, ap): - inverse_weights = [np.mean(np.sqrt(x)) for x in ref_file['distances{}'.format(ap)]] + inverse_weights = [np.mean(np.sqrt(x)) for x in ref_file["distances{}".format(ap)]] weights = np.array([1 / x for x in inverse_weights]) return weights -''' +""" Unmasks results array. -''' +""" def inflate_results(results, rem_input): - temp = [0 for x in rem_input['mask']] + temp = [0 for x in rem_input["mask"]] j = 0 - for i, val in enumerate(rem_input['mask']): + for i, val in enumerate(rem_input["mask"]): if val: temp[i] = results[j] j += 1 return temp -''' +""" Log2-transforms results_r. If resulting elements are infinite, all corresponding possible positions (at results_r, results_z and results_w are set to 0 (blacklist)). -''' +""" def log_trans(results, log_r_median): - for chr in range(len(results['results_r'])): - results['results_r'][chr] = np.log2(results['results_r'][chr]) + for chr in range(len(results["results_r"])): + results["results_r"][chr] = np.log2(results["results_r"][chr]) - results['results_r'] = [x.tolist() for x in results['results_r']] + results["results_r"] = [x.tolist() for x in results["results_r"]] - for c in range(len(results['results_r'])): - for i, rR in enumerate(results['results_r'][c]): + for c in range(len(results["results_r"])): + for i, rR in enumerate(results["results_r"][c]): if not np.isfinite(rR): - results['results_r'][c][i] = 0 - results['results_z'][c][i] = 0 - results['results_w'][c][i] = 0 - if results['results_r'][c][i] != 0: - results['results_r'][c][i] = results['results_r'][c][i] - log_r_median + results["results_r"][c][i] = 0 + results["results_z"][c][i] = 0 + results["results_w"][c][i] = 0 + if results["results_r"][c][i] != 0: + results["results_r"][c][i] = results["results_r"][c][i] - log_r_median -''' +""" Applies additional blacklist to results_r, results_z and results_w if requested. -''' +""" def apply_blacklist(rem_input, results): @@ -197,65 +205,70 @@ def apply_blacklist(rem_input, results): for chr in blacklist.keys(): for s_e in blacklist[chr]: for pos in range(s_e[0], s_e[1]): - if len(results['results_r']) < 24 and chr == 23: + if len(results["results_r"]) < 24 and chr == 23: continue - if pos >= len(results['results_r'][chr]) or pos < 0: + if pos >= len(results["results_r"][chr]) or pos < 0: continue - results['results_r'][chr][pos] = 0 - results['results_z'][chr][pos] = 0 - results['results_w'][chr][pos] = 0 + results["results_r"][chr][pos] = 0 + results["results_z"][chr][pos] = 0 + results["results_w"][chr][pos] = 0 def _import_bed(rem_input): bed = {} - for line in open(rem_input['args'].blacklist): - chr_name, s, e = line.strip().split('\t') - if chr_name[:3] == 'chr': + for line in open(rem_input["args"].blacklist): + chr_name, s, e = line.strip().split("\t") + if chr_name[:3] == "chr": chr_name = chr_name[3:] - if chr_name == 'X': - chr_name = '23' - if chr_name == 'Y': - chr_name = '24' + if chr_name == "X": + chr_name = "23" + if chr_name == "Y": + chr_name = "24" chr = int(chr_name) - 1 if chr not in bed.keys(): bed[chr] = [] - bed[chr].append([int(int(s) / rem_input['binsize']), int(int(e) / rem_input['binsize']) + 1]) + bed[chr].append( + [int(int(s) / rem_input["binsize"]), int(int(e) / rem_input["binsize"]) + 1] + ) return bed -''' +""" Executed CBS on results_r using results_w as weights. Calculates segmental zz-scores. -''' +""" def exec_cbs(rem_input, results): - json_cbs_dir = os.path.abspath(rem_input['args'].outid + '_CBS_tmp') + json_cbs_dir = os.path.abspath(rem_input["args"].outid + "_CBS_tmp") json_dict = { - 'R_script': str('{}/include/CBS.R'.format(rem_input['wd'])), - 'ref_gender': str(rem_input['ref_gender']), - 'alpha': str(rem_input['args'].alpha), - 'binsize': str(rem_input['binsize']), - 'results_r': results['results_r'], - 'results_w': results['results_w'], - 'infile': str('{}_01.json'.format(json_cbs_dir)), - 'outfile': str('{}_02.json'.format(json_cbs_dir)) + "R_script": str("{}/include/CBS.R".format(rem_input["wd"])), + "ref_gender": str(rem_input["ref_gender"]), + "alpha": str(rem_input["args"].alpha), + "binsize": str(rem_input["binsize"]), + "results_r": results["results_r"], + "results_w": results["results_w"], + "infile": str("{}_01.json".format(json_cbs_dir)), + "outfile": str("{}_02.json".format(json_cbs_dir)), } results_c = _get_processed_cbs(exec_R(json_dict)) segment_z = get_z_score(results_c, results) - results_c = [results_c[i][:3] + [segment_z[i]] + [results_c[i][3]] for i in range(len(results_c))] + results_c = [ + results_c[i][:3] + [segment_z[i]] + [results_c[i][3]] + for i in range(len(results_c)) + ] return results_c def _get_processed_cbs(cbs_data): results_c = [] for i, segment in enumerate(cbs_data): - chr = int(segment['chr']) - 1 - s = int(segment['s']) - e = int(segment['e']) - r = segment['r'] + chr = int(segment["chr"]) - 1 + s = int(segment["s"]) + e = int(segment["e"]) + r = segment["r"] results_c.append([chr, s, e, r]) return results_c