Merge pull request #67 from CenterForMedicalGeneticsGhent/v1.2.0

Added --plotyfrac option and automatic CPA calculation
CenterForMedicalGeneticsGhent · Jan 5, 2021 · d994f78 · d994f78
2 parents 37ed043 + b2f9cef
commit d994f78
Show file tree

Hide file tree

Showing 9 changed files with 126 additions and 27 deletions.
diff --git a/README.md b/README.md
@@ -84,6 +84,7 @@ WisecondorX newref reference_input_dir/*.npz reference_output.npz [--optional ar
 `--binsize x` | Size per bin in bp, defines the resolution of the output (default: x=1e5)  
 `--refsize x` | Amount of reference locations per target; should generally not be tweaked (default: x=300)  
 `--yfrac x` | Y read fraction cutoff, in order to manually define gender. Setting this to 1 will treat all samples as female  
+`--plotyfrac x` | plots Y read fraction histogram and Gaussian mixture fit to file x, can help when setting `--yfrac` manually; software quits after plotting  
 `--cpus x` | Number of threads requested (default: x=1)  
 
 &rarr; Bash recipe at `./pipeline/newref.sh`
@@ -102,7 +103,7 @@ WisecondorX predict test_input.npz reference_input.npz output_id [--optional arg
 `--zscore x` | Z-score cutoff to call segments as aberrations (default: x=5)  
 `--alpha x` | P-value cutoff for calling circular binary segmentation breakpoints (default: x=1e-4)  
 `--beta x` | When beta is given, `--zscore` is ignored. Beta sets a ratio cutoff for aberration calling. It's a number between 0 (liberal) and 1 (conservative) and, when used, is optimally close to the purity (e.g. fetal/tumor fraction)  
-`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; example at `./example.blacklist/centromere.hg38.txt`) (no default)  
+`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; examples at `./example.blacklist/`)  
 `--gender x` | Force WisecondorX to analyze this case as male (M) or female (F). Useful when e.g. dealing with a loss of chromosome Y, which causes erroneous gender predictions (choices: x=F or x=M)
 `--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information  **(\*)**  
 `--plot` | Outputs custom .png plots (trisomy 21 NIPT example at `./example.plot`), directly interpretable  **(\*)**  
@@ -186,5 +187,6 @@ the 'ID_segments.bed'. Particularly interesting for NIPT.
     - scikit-learn (v0.20.0)
     - pysam (v0.15.1)
     - numpy (v1.15.2)
+    - matplotlib (v2.2.3)
 
 And of course, other versions are very likely to work as well.  
diff --git a/example.blacklist/acen.and.gvar.hg38.txt b/example.blacklist/acen.and.gvar.hg38.txt
@@ -0,0 +1,65 @@
+chr1	121700000	123400000
+chr1	123400000	125100000
+chr1	125100000	143200000
+chr10	38000000	39800000
+chr10	39800000	41600000
+chr11	51000000	53400000
+chr11	53400000	55800000
+chr12	33200000	35500000
+chr12	35500000	37800000
+chr13	0	4600000
+chr13	10100000	16500000
+chr13	16500000	17700000
+chr13	17700000	18900000
+chr14	0	3600000
+chr14	8000000	16100000
+chr14	16100000	17200000
+chr14	17200000	18200000
+chr15	0	4200000
+chr15	9700000	17500000
+chr15	17500000	19000000
+chr15	19000000	20500000
+chr16	35300000	36800000
+chr16	36800000	38400000
+chr16	38400000	47000000
+chr17	22700000	25100000
+chr17	25100000	27400000
+chr18	15400000	18500000
+chr18	18500000	21500000
+chr19	19900000	24200000
+chr19	24200000	26200000
+chr19	26200000	28100000
+chr19	28100000	31900000
+chr2	91800000	93900000
+chr2	93900000	96000000
+chr20	25700000	28100000
+chr20	28100000	30400000
+chr21	0	3100000
+chr21	7000000	10900000
+chr21	10900000	12000000
+chr21	12000000	13000000
+chr22	0	4300000
+chr22	9400000	13700000
+chr22	13700000	15000000
+chr22	15000000	17400000
+chr3	87800000	90900000
+chr3	90900000	94000000
+chr3	94000000	98600000
+chr4	48200000	50000000
+chr4	50000000	51800000
+chr5	46100000	48800000
+chr5	48800000	51400000
+chr6	58500000	59800000
+chr6	59800000	62600000
+chr7	58100000	60100000
+chr7	60100000	62100000
+chr8	43200000	45200000
+chr8	45200000	47200000
+chr9	42200000	43000000
+chr9	43000000	45500000
+chr9	45500000	61500000
+chrX	58100000	61000000
+chrX	61000000	63800000
+chrY	10300000	10400000
+chrY	10400000	10600000
+chrY	26600000	57227415
diff --git a/example.blacklist/centromere.hg38.txt → example.blacklist/acen.hg38.txt b/example.blacklist/centromere.hg38.txt → example.blacklist/acen.hg38.txt
diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python
 from setuptools import setup, find_packages
 
-version = '1.1.6'
+version = '1.2.0'
 dl_version = 'master' if 'dev' in version else '{}'.format(version)
 
 setup(

diff --git a/wisecondorX/include/plotter.R b/wisecondorX/include/plotter.R
@@ -124,8 +124,8 @@ layout(l.matrix)
 par(mar=c(2.5,4,1,0), mgp=c(2.4,0.5,0), oma=c(0,0,3,0))
 
 plot(1, main="", axes=F, # plots nothing -- enables segments function
-     xlab="", ylab="", col="white", xlim=c(chr.ends[1], chr.ends[length(chr.ends)]),
-     cex=0.0001, ylim=c(chr.wide.lower.limit,chr.wide.upper.limit))
+     xlab="", ylab="", xlim=c(chr.ends[1], chr.ends[length(chr.ends)]),
+     cex=0, ylim=c(chr.wide.lower.limit,chr.wide.upper.limit))
 
 plot.constitutionals <- function(ploidy, start, end){
   segments(start, log2(1/ploidy), end, log2(1/ploidy), col=color.B, lwd=2, lty=3)
@@ -291,8 +291,7 @@ for (c in chrs){
   par(mar=c(2.5,4,1,0), mgp=c(2.4,0.5,0))
 
   plot(1, main="", axes=F, # plots nothing -- enables segments function
-       xlab="", ylab="", col="white",
-       cex=0.0001, ylim=c(lower.limit,upper.limit), xlim=margins)
+       xlab="", ylab="", cex=0, ylim=c(lower.limit,upper.limit), xlim=margins)
 
   if (gender == "F"){
     plot.constitutionals(2, chr.ends[c] - bins.per.chr[c] * 0.02, chr.ends[c+1] + bins.per.chr[c] * 0.02)

diff --git a/wisecondorX/main.py b/wisecondorX/main.py
@@ -58,7 +58,7 @@ def tool_newref(args):
         samples.append(scale_sample(sample, binsize, args.binsize))
 
     samples = np.array(samples)
-    genders, trained_cutoff = train_gender_model(samples, args.yfrac)
+    genders, trained_cutoff = train_gender_model(args, samples)
 
     if genders.count('F') < 5 and args.nipt:
         logging.warning(
@@ -309,6 +309,11 @@ def main():
                                default=None,
                                help='Use to manually set the y read fraction cutoff, which defines gender'
                                )
+    parser_newref.add_argument('--plotyfrac',
+                               type=str,
+                               default=None,
+                               help='Path to yfrac .png plot for --yfrac optimization (e.g. path/to/plot.png); software will stop after plotting after which --yfrac can be set manually'
+                               )
     parser_newref.add_argument('--refsize',
                                type=int,
                                default=300,

diff --git a/wisecondorX/newref_tools.py b/wisecondorX/newref_tools.py
@@ -18,7 +18,7 @@
 '''
 
 
-def train_gender_model(samples, yfrac):
+def train_gender_model(args, samples):
     genders = np.empty(len(samples), dtype='object')
     y_fractions = []
     for sample in samples:
@@ -30,23 +30,27 @@ def train_gender_model(samples, yfrac):
     gmm_x = np.linspace(0, 0.02, 5000)
     gmm_y = np.exp(gmm.score_samples(gmm_x.reshape(-1, 1)))
 
-    # import matplotlib.pyplot as plt
-    # fig, ax = plt.subplots(figsize=(10, 6))
-    # ax.hist(y_fractions, bins=50, normed=True)
-    # ax.plot(gmm_x, gmm_y, 'r-', label='Gaussian mixture fit')
-    # ax.set_xlim([0.001, 0.01])
-    # ax.legend(loc='best')
-    # plt.show()
-
-    if yfrac is not None:
-        cut_off = yfrac
+    if args.plotyfrac is not None:
+        import matplotlib.pyplot as plt
+        fig, ax = plt.subplots(figsize=(16, 6))
+        ax.hist(y_fractions, bins=100, normed=True)
+        ax.plot(gmm_x, gmm_y, 'r-', label='Gaussian mixture fit')
+        ax.set_xlim([0, 0.02])
+        ax.legend(loc='best')
+        plt.savefig(args.plotyfrac)
+        logging.info('Image written to {}, now quitting ...'.format(args.plotyfrac))
+        exit()
+
+    if args.yfrac is not None:
+        cut_off = args.yfrac
     else:
         sort_idd = np.argsort(gmm_x)
         sorted_gmm_y = gmm_y[sort_idd]
 
         local_min_i = argrelextrema(sorted_gmm_y, np.less)
 
         cut_off = gmm_x[local_min_i][0]
+        logging.info('Determined --yfrac cutoff: {}'.format(str(round(cut_off, 4))))
 
     genders[y_fractions > cut_off] = 'M'
     genders[y_fractions < cut_off] = 'F'

diff --git a/wisecondorX/overall_tools.py b/wisecondorX/overall_tools.py
@@ -101,7 +101,7 @@ def get_z_score(results_c, results):
         z = (segment[3] - null_mean) / null_sd
         z = min(z, 1000) ; z = max(z, -1000)
         if math.isnan(null_mean) or math.isnan(null_sd):
-            z = 'NaN'
+            z = 'nan'
         zs.append(z)
     return zs
 
@@ -120,3 +120,16 @@ def get_median_segment_variance(results_c, results_r):
             var = np.var(segment_r)
             vars.append(var)
     return np.median(vars)
+
+
+'''
+Returns CPA, measure for sample-wise abnormality.
+'''
+
+
+def get_cpa(results_c, binsize):
+    x = 0
+    for segment in results_c:
+        x += (segment[2] - segment[1] + 1) * binsize * abs(segment[3])
+    CPA = x / len(results_c) * (10 ** -8)
+    return(CPA)
diff --git a/wisecondorX/predict_output.py b/wisecondorX/predict_output.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 
-from wisecondorX.overall_tools import exec_R, get_z_score, get_median_segment_variance
+from wisecondorX.overall_tools import exec_R, get_z_score, get_median_segment_variance, get_cpa
 
 '''
 Writes plots.
@@ -61,9 +61,9 @@ def _generate_bins_bed(rem_input, results):
             r = results_r[chr][i]
             z = results_z[chr][i]
             if r == 0:
-                r = 'NaN'
+                r = 'nan'
             if z == 0:
-                z = 'NaN'
+                z = 'nan'
             feat_str = '{}:{}-{}'.format(chr_name, str(feat), str(feat + binsize - 1))
             row = [chr_name, feat, feat + binsize - 1, feat_str, r, z]
             bins_file.write('{}\n'.format('\t'.join([str(x) for x in row])))
@@ -116,7 +116,7 @@ def __get_aberration_cutoff(beta, ploidy):
 
 
 def _generate_chr_statistics_file(rem_input, results):
-    stats_file = open('{}_chr_statistics.txt'.format(rem_input['args'].outid), 'w')
+    stats_file = open('{}_statistics.txt'.format(rem_input['args'].outid), 'w')
     stats_file.write('chr\tratio.mean\tratio.median\tzscore\n')
     chr_ratio_means = [np.ma.average(results['results_r'][chr], weights=results['results_w'][chr])
                        for chr in range(len(results['results_r']))]
@@ -126,7 +126,8 @@ def _generate_chr_statistics_file(rem_input, results):
     results_c_chr = [[x, 0, rem_input['bins_per_chr'][x] - 1, chr_ratio_means[x]]
                      for x in range(len(results['results_r']))]
 
-    msv = get_median_segment_variance(results['results_c'], results['results_r'])
+    msv = round(get_median_segment_variance(results['results_c'], results['results_r']), 5)
+    cpa = round(get_cpa(results['results_c'], rem_input['binsize']), 5)
     chr_z_scores = get_z_score(results_c_chr, results)
 
     for chr in range(len(results['results_r'])):
@@ -144,9 +145,19 @@ def _generate_chr_statistics_file(rem_input, results):
 
         stats_file.write('\t'.join([str(x) for x in row]) + '\n')
 
-    stats_file.write('Standard deviation ratios (per chromosome): {}\n'
-                     .format(str(np.nanstd(chr_ratio_means))))
+    stats_file.write('Gender based on --yfrac (or manually overridden by --gender): {}\n'
+                     .format(str(rem_input['actual_gender'])))
 
-    stats_file.write('Median segment variance (per bin): {}\n'
+    stats_file.write('Number of reads: {}\n'
+                     .format(str(rem_input['n_reads'])))
+
+    stats_file.write('Standard deviation of the ratios per chromosome: {}\n'
+                     .format(str(round(float(np.nanstd(chr_ratio_means)), 5))))
+
+    stats_file.write('Median segment variance per bin (doi: 10.1093/nar/gky1263): {}\n'
                      .format(str(msv)))
+
+    stats_file.write('Copy number profile abnormality (CPA) score (doi: 10.1186/s13073-020-00735-4): {}\n'
+                     .format(str(cpa)))
+
     stats_file.close()