From be7f2cae1a5a5125a1ff288372e31bb6a1158e53 Mon Sep 17 00:00:00 2001 From: Lennart Date: Tue, 5 Jan 2021 12:45:44 +0100 Subject: [PATCH 1/3] Added --plotyfrac option and CPA calculation feature --- README.md | 4 +- example.blacklist/acen.and.gvar.hg38.txt | 65 +++++++++++++++++++ .../{centromere.hg38.txt => acen.hg38.txt} | 0 setup.py | 2 +- wisecondorX/include/plotter.R | 7 +- wisecondorX/main.py | 7 +- wisecondorX/newref_tools.py | 26 ++++---- wisecondorX/overall_tools.py | 15 ++++- wisecondorX/predict_output.py | 27 +++++--- 9 files changed, 126 insertions(+), 27 deletions(-) create mode 100644 example.blacklist/acen.and.gvar.hg38.txt rename example.blacklist/{centromere.hg38.txt => acen.hg38.txt} (100%) diff --git a/README.md b/README.md index 185011b..f1c9828 100755 --- a/README.md +++ b/README.md @@ -84,6 +84,7 @@ WisecondorX newref reference_input_dir/*.npz reference_output.npz [--optional ar `--binsize x` | Size per bin in bp, defines the resolution of the output (default: x=1e5) `--refsize x` | Amount of reference locations per target; should generally not be tweaked (default: x=300) `--yfrac x` | Y read fraction cutoff, in order to manually define gender. Setting this to 1 will treat all samples as female +`--plotyfrac x` | plots Y read fraction histogram and Gaussian mixture fit to file x, can help when setting `--yfrac` manually; software quits after plotting `--cpus x` | Number of threads requested (default: x=1) → Bash recipe at `./pipeline/newref.sh` @@ -102,7 +103,7 @@ WisecondorX predict test_input.npz reference_input.npz output_id [--optional arg `--zscore x` | Z-score cutoff to call segments as aberrations (default: x=5) `--alpha x` | P-value cutoff for calling circular binary segmentation breakpoints (default: x=1e-4) `--beta x` | When beta is given, `--zscore` is ignored. Beta sets a ratio cutoff for aberration calling. It's a number between 0 (liberal) and 1 (conservative) and, when used, is optimally close to the purity (e.g. fetal/tumor fraction) -`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; example at `./example.blacklist/centromere.hg38.txt`) (no default) +`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; example at `./example.blacklist/centromere.hg38.txt`) `--gender x` | Force WisecondorX to analyze this case as male (M) or female (F). Useful when e.g. dealing with a loss of chromosome Y, which causes erroneous gender predictions (choices: x=F or x=M) `--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information **(\*)** `--plot` | Outputs custom .png plots (trisomy 21 NIPT example at `./example.plot`), directly interpretable **(\*)** @@ -186,5 +187,6 @@ the 'ID_segments.bed'. Particularly interesting for NIPT. - scikit-learn (v0.20.0) - pysam (v0.15.1) - numpy (v1.15.2) + - matplotlib (v2.2.3) And of course, other versions are very likely to work as well. diff --git a/example.blacklist/acen.and.gvar.hg38.txt b/example.blacklist/acen.and.gvar.hg38.txt new file mode 100644 index 0000000..74ae6a5 --- /dev/null +++ b/example.blacklist/acen.and.gvar.hg38.txt @@ -0,0 +1,65 @@ +chr1 121700000 123400000 +chr1 123400000 125100000 +chr1 125100000 143200000 +chr10 38000000 39800000 +chr10 39800000 41600000 +chr11 51000000 53400000 +chr11 53400000 55800000 +chr12 33200000 35500000 +chr12 35500000 37800000 +chr13 0 4600000 +chr13 10100000 16500000 +chr13 16500000 17700000 +chr13 17700000 18900000 +chr14 0 3600000 +chr14 8000000 16100000 +chr14 16100000 17200000 +chr14 17200000 18200000 +chr15 0 4200000 +chr15 9700000 17500000 +chr15 17500000 19000000 +chr15 19000000 20500000 +chr16 35300000 36800000 +chr16 36800000 38400000 +chr16 38400000 47000000 +chr17 22700000 25100000 +chr17 25100000 27400000 +chr18 15400000 18500000 +chr18 18500000 21500000 +chr19 19900000 24200000 +chr19 24200000 26200000 +chr19 26200000 28100000 +chr19 28100000 31900000 +chr2 91800000 93900000 +chr2 93900000 96000000 +chr20 25700000 28100000 +chr20 28100000 30400000 +chr21 0 3100000 +chr21 7000000 10900000 +chr21 10900000 12000000 +chr21 12000000 13000000 +chr22 0 4300000 +chr22 9400000 13700000 +chr22 13700000 15000000 +chr22 15000000 17400000 +chr3 87800000 90900000 +chr3 90900000 94000000 +chr3 94000000 98600000 +chr4 48200000 50000000 +chr4 50000000 51800000 +chr5 46100000 48800000 +chr5 48800000 51400000 +chr6 58500000 59800000 +chr6 59800000 62600000 +chr7 58100000 60100000 +chr7 60100000 62100000 +chr8 43200000 45200000 +chr8 45200000 47200000 +chr9 42200000 43000000 +chr9 43000000 45500000 +chr9 45500000 61500000 +chrX 58100000 61000000 +chrX 61000000 63800000 +chrY 10300000 10400000 +chrY 10400000 10600000 +chrY 26600000 57227415 diff --git a/example.blacklist/centromere.hg38.txt b/example.blacklist/acen.hg38.txt similarity index 100% rename from example.blacklist/centromere.hg38.txt rename to example.blacklist/acen.hg38.txt diff --git a/setup.py b/setup.py index 6cd90bd..8f0c863 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #! /usr/bin/env python from setuptools import setup, find_packages -version = '1.1.6' +version = '1.2.0' dl_version = 'master' if 'dev' in version else '{}'.format(version) setup( diff --git a/wisecondorX/include/plotter.R b/wisecondorX/include/plotter.R index 7f9ffe1..dab12ba 100644 --- a/wisecondorX/include/plotter.R +++ b/wisecondorX/include/plotter.R @@ -124,8 +124,8 @@ layout(l.matrix) par(mar=c(2.5,4,1,0), mgp=c(2.4,0.5,0), oma=c(0,0,3,0)) plot(1, main="", axes=F, # plots nothing -- enables segments function - xlab="", ylab="", col="white", xlim=c(chr.ends[1], chr.ends[length(chr.ends)]), - cex=0.0001, ylim=c(chr.wide.lower.limit,chr.wide.upper.limit)) + xlab="", ylab="", xlim=c(chr.ends[1], chr.ends[length(chr.ends)]), + cex=0, ylim=c(chr.wide.lower.limit,chr.wide.upper.limit)) plot.constitutionals <- function(ploidy, start, end){ segments(start, log2(1/ploidy), end, log2(1/ploidy), col=color.B, lwd=2, lty=3) @@ -291,8 +291,7 @@ for (c in chrs){ par(mar=c(2.5,4,1,0), mgp=c(2.4,0.5,0)) plot(1, main="", axes=F, # plots nothing -- enables segments function - xlab="", ylab="", col="white", - cex=0.0001, ylim=c(lower.limit,upper.limit), xlim=margins) + xlab="", ylab="", cex=0, ylim=c(lower.limit,upper.limit), xlim=margins) if (gender == "F"){ plot.constitutionals(2, chr.ends[c] - bins.per.chr[c] * 0.02, chr.ends[c+1] + bins.per.chr[c] * 0.02) diff --git a/wisecondorX/main.py b/wisecondorX/main.py index 6e5c2aa..e4f69cd 100755 --- a/wisecondorX/main.py +++ b/wisecondorX/main.py @@ -58,7 +58,7 @@ def tool_newref(args): samples.append(scale_sample(sample, binsize, args.binsize)) samples = np.array(samples) - genders, trained_cutoff = train_gender_model(samples, args.yfrac) + genders, trained_cutoff = train_gender_model(args, samples) if genders.count('F') < 5 and args.nipt: logging.warning( @@ -309,6 +309,11 @@ def main(): default=None, help='Use to manually set the y read fraction cutoff, which defines gender' ) + parser_newref.add_argument('--plotyfrac', + type=str, + default=None, + help='Path to yfrac .png plot for --yfrac optimization (e.g. path/to/plot.png); software will stop after plotting after which --yfrac can be set manually' + ) parser_newref.add_argument('--refsize', type=int, default=300, diff --git a/wisecondorX/newref_tools.py b/wisecondorX/newref_tools.py index 69d29c6..4724890 100644 --- a/wisecondorX/newref_tools.py +++ b/wisecondorX/newref_tools.py @@ -18,7 +18,7 @@ ''' -def train_gender_model(samples, yfrac): +def train_gender_model(args, samples): genders = np.empty(len(samples), dtype='object') y_fractions = [] for sample in samples: @@ -30,16 +30,19 @@ def train_gender_model(samples, yfrac): gmm_x = np.linspace(0, 0.02, 5000) gmm_y = np.exp(gmm.score_samples(gmm_x.reshape(-1, 1))) - # import matplotlib.pyplot as plt - # fig, ax = plt.subplots(figsize=(10, 6)) - # ax.hist(y_fractions, bins=50, normed=True) - # ax.plot(gmm_x, gmm_y, 'r-', label='Gaussian mixture fit') - # ax.set_xlim([0.001, 0.01]) - # ax.legend(loc='best') - # plt.show() - - if yfrac is not None: - cut_off = yfrac + if args.plotyfrac is not None: + import matplotlib.pyplot as plt + fig, ax = plt.subplots(figsize=(16, 6)) + ax.hist(y_fractions, bins=50, normed=True) + ax.plot(gmm_x, gmm_y, 'r-', label='Gaussian mixture fit') + ax.set_xlim([0, 0.02]) + ax.legend(loc='best') + plt.savefig(args.plotyfrac) + logging.info('Image written to {}, now quitting ...'.format(args.plotyfrac)) + exit() + + if args.yfrac is not None: + cut_off = args.yfrac else: sort_idd = np.argsort(gmm_x) sorted_gmm_y = gmm_y[sort_idd] @@ -47,6 +50,7 @@ def train_gender_model(samples, yfrac): local_min_i = argrelextrema(sorted_gmm_y, np.less) cut_off = gmm_x[local_min_i][0] + logging.info('Determined --yfrac cutoff: {}'.format(str(round(cut_off, 4)))) genders[y_fractions > cut_off] = 'M' genders[y_fractions < cut_off] = 'F' diff --git a/wisecondorX/overall_tools.py b/wisecondorX/overall_tools.py index 7579295..078b63d 100644 --- a/wisecondorX/overall_tools.py +++ b/wisecondorX/overall_tools.py @@ -101,7 +101,7 @@ def get_z_score(results_c, results): z = (segment[3] - null_mean) / null_sd z = min(z, 1000) ; z = max(z, -1000) if math.isnan(null_mean) or math.isnan(null_sd): - z = 'NaN' + z = 'nan' zs.append(z) return zs @@ -120,3 +120,16 @@ def get_median_segment_variance(results_c, results_r): var = np.var(segment_r) vars.append(var) return np.median(vars) + + +''' +Returns CPA, measure for sample-wise abnormality. +''' + + +def get_cpa(results_c, binsize): + x = 0 + for segment in results_c: + x += (segment[2] - segment[1] + 1) * binsize * abs(segment[3]) + CPA = x / len(results_c) * (10 ** -8) + return(CPA) \ No newline at end of file diff --git a/wisecondorX/predict_output.py b/wisecondorX/predict_output.py index 89f213d..4a5ef9f 100644 --- a/wisecondorX/predict_output.py +++ b/wisecondorX/predict_output.py @@ -4,7 +4,7 @@ import numpy as np -from wisecondorX.overall_tools import exec_R, get_z_score, get_median_segment_variance +from wisecondorX.overall_tools import exec_R, get_z_score, get_median_segment_variance, get_cpa ''' Writes plots. @@ -61,9 +61,9 @@ def _generate_bins_bed(rem_input, results): r = results_r[chr][i] z = results_z[chr][i] if r == 0: - r = 'NaN' + r = 'nan' if z == 0: - z = 'NaN' + z = 'nan' feat_str = '{}:{}-{}'.format(chr_name, str(feat), str(feat + binsize - 1)) row = [chr_name, feat, feat + binsize - 1, feat_str, r, z] bins_file.write('{}\n'.format('\t'.join([str(x) for x in row]))) @@ -116,7 +116,7 @@ def __get_aberration_cutoff(beta, ploidy): def _generate_chr_statistics_file(rem_input, results): - stats_file = open('{}_chr_statistics.txt'.format(rem_input['args'].outid), 'w') + stats_file = open('{}_statistics.txt'.format(rem_input['args'].outid), 'w') stats_file.write('chr\tratio.mean\tratio.median\tzscore\n') chr_ratio_means = [np.ma.average(results['results_r'][chr], weights=results['results_w'][chr]) for chr in range(len(results['results_r']))] @@ -126,7 +126,8 @@ def _generate_chr_statistics_file(rem_input, results): results_c_chr = [[x, 0, rem_input['bins_per_chr'][x] - 1, chr_ratio_means[x]] for x in range(len(results['results_r']))] - msv = get_median_segment_variance(results['results_c'], results['results_r']) + msv = round(get_median_segment_variance(results['results_c'], results['results_r']), 5) + cpa = round(get_cpa(results['results_c'], rem_input['binsize']), 5) chr_z_scores = get_z_score(results_c_chr, results) for chr in range(len(results['results_r'])): @@ -144,9 +145,19 @@ def _generate_chr_statistics_file(rem_input, results): stats_file.write('\t'.join([str(x) for x in row]) + '\n') - stats_file.write('Standard deviation ratios (per chromosome): {}\n' - .format(str(np.nanstd(chr_ratio_means)))) + stats_file.write('Gender based on --yfrac (or manually overridden by --gender): {}\n' + .format(str(rem_input['actual_gender']))) - stats_file.write('Median segment variance (per bin): {}\n' + stats_file.write('Number of reads: {}\n' + .format(str(rem_input['n_reads']))) + + stats_file.write('Standard deviation of the ratios per chromosome: {}\n' + .format(str(round(float(np.nanstd(chr_ratio_means)), 5)))) + + stats_file.write('Median segment variance per bin (doi: 10.1093/nar/gky1263): {}\n' .format(str(msv))) + + stats_file.write('Copy number profile abnormality (CPA) score (doi: 10.1186/s13073-020-00735-4): {}\n' + .format(str(cpa))) + stats_file.close() From 733002858a27c41b27f8575a34b853f970b640d8 Mon Sep 17 00:00:00 2001 From: Lennart Date: Tue, 5 Jan 2021 12:57:31 +0100 Subject: [PATCH 2/3] Updated manual --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f1c9828..a7e6ec4 100755 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ WisecondorX predict test_input.npz reference_input.npz output_id [--optional arg `--zscore x` | Z-score cutoff to call segments as aberrations (default: x=5) `--alpha x` | P-value cutoff for calling circular binary segmentation breakpoints (default: x=1e-4) `--beta x` | When beta is given, `--zscore` is ignored. Beta sets a ratio cutoff for aberration calling. It's a number between 0 (liberal) and 1 (conservative) and, when used, is optimally close to the purity (e.g. fetal/tumor fraction) -`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; example at `./example.blacklist/centromere.hg38.txt`) +`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; examples at `./example.blacklist/`) `--gender x` | Force WisecondorX to analyze this case as male (M) or female (F). Useful when e.g. dealing with a loss of chromosome Y, which causes erroneous gender predictions (choices: x=F or x=M) `--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information **(\*)** `--plot` | Outputs custom .png plots (trisomy 21 NIPT example at `./example.plot`), directly interpretable **(\*)** From b2f9cef6888f94940d6ee4960321397008847f7e Mon Sep 17 00:00:00 2001 From: Lennart Date: Tue, 5 Jan 2021 14:35:31 +0100 Subject: [PATCH 3/3] 50 bins -> 100 bins histogram --- wisecondorX/newref_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wisecondorX/newref_tools.py b/wisecondorX/newref_tools.py index 4724890..90898b0 100644 --- a/wisecondorX/newref_tools.py +++ b/wisecondorX/newref_tools.py @@ -33,7 +33,7 @@ def train_gender_model(args, samples): if args.plotyfrac is not None: import matplotlib.pyplot as plt fig, ax = plt.subplots(figsize=(16, 6)) - ax.hist(y_fractions, bins=50, normed=True) + ax.hist(y_fractions, bins=100, normed=True) ax.plot(gmm_x, gmm_y, 'r-', label='Gaussian mixture fit') ax.set_xlim([0, 0.02]) ax.legend(loc='best')