From be7f2cae1a5a5125a1ff288372e31bb6a1158e53 Mon Sep 17 00:00:00 2001
From: Lennart <leraman@MacBook-Pro-Lennart.local>
Date: Tue, 5 Jan 2021 12:45:44 +0100
Subject: [PATCH 1/3] Added --plotyfrac option and CPA calculation feature

---
 README.md                                     |  4 +-
 example.blacklist/acen.and.gvar.hg38.txt      | 65 +++++++++++++++++++
 .../{centromere.hg38.txt => acen.hg38.txt}    |  0
 setup.py                                      |  2 +-
 wisecondorX/include/plotter.R                 |  7 +-
 wisecondorX/main.py                           |  7 +-
 wisecondorX/newref_tools.py                   | 26 ++++----
 wisecondorX/overall_tools.py                  | 15 ++++-
 wisecondorX/predict_output.py                 | 27 +++++---
 9 files changed, 126 insertions(+), 27 deletions(-)
 create mode 100644 example.blacklist/acen.and.gvar.hg38.txt
 rename example.blacklist/{centromere.hg38.txt => acen.hg38.txt} (100%)

diff --git a/README.md b/README.md
index 185011b..f1c9828 100755
--- a/README.md
+++ b/README.md
@@ -84,6 +84,7 @@ WisecondorX newref reference_input_dir/*.npz reference_output.npz [--optional ar
 `--binsize x` | Size per bin in bp, defines the resolution of the output (default: x=1e5)  
 `--refsize x` | Amount of reference locations per target; should generally not be tweaked (default: x=300)  
 `--yfrac x` | Y read fraction cutoff, in order to manually define gender. Setting this to 1 will treat all samples as female  
+`--plotyfrac x` | plots Y read fraction histogram and Gaussian mixture fit to file x, can help when setting `--yfrac` manually; software quits after plotting  
 `--cpus x` | Number of threads requested (default: x=1)  
 
 &rarr; Bash recipe at `./pipeline/newref.sh`
@@ -102,7 +103,7 @@ WisecondorX predict test_input.npz reference_input.npz output_id [--optional arg
 `--zscore x` | Z-score cutoff to call segments as aberrations (default: x=5)  
 `--alpha x` | P-value cutoff for calling circular binary segmentation breakpoints (default: x=1e-4)  
 `--beta x` | When beta is given, `--zscore` is ignored. Beta sets a ratio cutoff for aberration calling. It's a number between 0 (liberal) and 1 (conservative) and, when used, is optimally close to the purity (e.g. fetal/tumor fraction)  
-`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; example at `./example.blacklist/centromere.hg38.txt`) (no default)  
+`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; example at `./example.blacklist/centromere.hg38.txt`)  
 `--gender x` | Force WisecondorX to analyze this case as male (M) or female (F). Useful when e.g. dealing with a loss of chromosome Y, which causes erroneous gender predictions (choices: x=F or x=M)
 `--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information  **(\*)**  
 `--plot` | Outputs custom .png plots (trisomy 21 NIPT example at `./example.plot`), directly interpretable  **(\*)**  
@@ -186,5 +187,6 @@ the 'ID_segments.bed'. Particularly interesting for NIPT.
     - scikit-learn (v0.20.0)
     - pysam (v0.15.1)
     - numpy (v1.15.2)
+    - matplotlib (v2.2.3)
 
 And of course, other versions are very likely to work as well.  
diff --git a/example.blacklist/acen.and.gvar.hg38.txt b/example.blacklist/acen.and.gvar.hg38.txt
new file mode 100644
index 0000000..74ae6a5
--- /dev/null
+++ b/example.blacklist/acen.and.gvar.hg38.txt
@@ -0,0 +1,65 @@
+chr1	121700000	123400000
+chr1	123400000	125100000
+chr1	125100000	143200000
+chr10	38000000	39800000
+chr10	39800000	41600000
+chr11	51000000	53400000
+chr11	53400000	55800000
+chr12	33200000	35500000
+chr12	35500000	37800000
+chr13	0	4600000
+chr13	10100000	16500000
+chr13	16500000	17700000
+chr13	17700000	18900000
+chr14	0	3600000
+chr14	8000000	16100000
+chr14	16100000	17200000
+chr14	17200000	18200000
+chr15	0	4200000
+chr15	9700000	17500000
+chr15	17500000	19000000
+chr15	19000000	20500000
+chr16	35300000	36800000
+chr16	36800000	38400000
+chr16	38400000	47000000
+chr17	22700000	25100000
+chr17	25100000	27400000
+chr18	15400000	18500000
+chr18	18500000	21500000
+chr19	19900000	24200000
+chr19	24200000	26200000
+chr19	26200000	28100000
+chr19	28100000	31900000
+chr2	91800000	93900000
+chr2	93900000	96000000
+chr20	25700000	28100000
+chr20	28100000	30400000
+chr21	0	3100000
+chr21	7000000	10900000
+chr21	10900000	12000000
+chr21	12000000	13000000
+chr22	0	4300000
+chr22	9400000	13700000
+chr22	13700000	15000000
+chr22	15000000	17400000
+chr3	87800000	90900000
+chr3	90900000	94000000
+chr3	94000000	98600000
+chr4	48200000	50000000
+chr4	50000000	51800000
+chr5	46100000	48800000
+chr5	48800000	51400000
+chr6	58500000	59800000
+chr6	59800000	62600000
+chr7	58100000	60100000
+chr7	60100000	62100000
+chr8	43200000	45200000
+chr8	45200000	47200000
+chr9	42200000	43000000
+chr9	43000000	45500000
+chr9	45500000	61500000
+chrX	58100000	61000000
+chrX	61000000	63800000
+chrY	10300000	10400000
+chrY	10400000	10600000
+chrY	26600000	57227415
diff --git a/example.blacklist/centromere.hg38.txt b/example.blacklist/acen.hg38.txt
similarity index 100%
rename from example.blacklist/centromere.hg38.txt
rename to example.blacklist/acen.hg38.txt
diff --git a/setup.py b/setup.py
index 6cd90bd..8f0c863 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python
 from setuptools import setup, find_packages
 
-version = '1.1.6'
+version = '1.2.0'
 dl_version = 'master' if 'dev' in version else '{}'.format(version)
 
 setup(
diff --git a/wisecondorX/include/plotter.R b/wisecondorX/include/plotter.R
index 7f9ffe1..dab12ba 100644
--- a/wisecondorX/include/plotter.R
+++ b/wisecondorX/include/plotter.R
@@ -124,8 +124,8 @@ layout(l.matrix)
 par(mar=c(2.5,4,1,0), mgp=c(2.4,0.5,0), oma=c(0,0,3,0))
 
 plot(1, main="", axes=F, # plots nothing -- enables segments function
-     xlab="", ylab="", col="white", xlim=c(chr.ends[1], chr.ends[length(chr.ends)]),
-     cex=0.0001, ylim=c(chr.wide.lower.limit,chr.wide.upper.limit))
+     xlab="", ylab="", xlim=c(chr.ends[1], chr.ends[length(chr.ends)]),
+     cex=0, ylim=c(chr.wide.lower.limit,chr.wide.upper.limit))
 
 plot.constitutionals <- function(ploidy, start, end){
   segments(start, log2(1/ploidy), end, log2(1/ploidy), col=color.B, lwd=2, lty=3)
@@ -291,8 +291,7 @@ for (c in chrs){
   par(mar=c(2.5,4,1,0), mgp=c(2.4,0.5,0))
   
   plot(1, main="", axes=F, # plots nothing -- enables segments function
-       xlab="", ylab="", col="white",
-       cex=0.0001, ylim=c(lower.limit,upper.limit), xlim=margins)
+       xlab="", ylab="", cex=0, ylim=c(lower.limit,upper.limit), xlim=margins)
 
   if (gender == "F"){
     plot.constitutionals(2, chr.ends[c] - bins.per.chr[c] * 0.02, chr.ends[c+1] + bins.per.chr[c] * 0.02)
diff --git a/wisecondorX/main.py b/wisecondorX/main.py
index 6e5c2aa..e4f69cd 100755
--- a/wisecondorX/main.py
+++ b/wisecondorX/main.py
@@ -58,7 +58,7 @@ def tool_newref(args):
         samples.append(scale_sample(sample, binsize, args.binsize))
 
     samples = np.array(samples)
-    genders, trained_cutoff = train_gender_model(samples, args.yfrac)
+    genders, trained_cutoff = train_gender_model(args, samples)
 
     if genders.count('F') < 5 and args.nipt:
         logging.warning(
@@ -309,6 +309,11 @@ def main():
                                default=None,
                                help='Use to manually set the y read fraction cutoff, which defines gender'
                                )
+    parser_newref.add_argument('--plotyfrac',
+                               type=str,
+                               default=None,
+                               help='Path to yfrac .png plot for --yfrac optimization (e.g. path/to/plot.png); software will stop after plotting after which --yfrac can be set manually'
+                               )
     parser_newref.add_argument('--refsize',
                                type=int,
                                default=300,
diff --git a/wisecondorX/newref_tools.py b/wisecondorX/newref_tools.py
index 69d29c6..4724890 100644
--- a/wisecondorX/newref_tools.py
+++ b/wisecondorX/newref_tools.py
@@ -18,7 +18,7 @@
 '''
 
 
-def train_gender_model(samples, yfrac):
+def train_gender_model(args, samples):
     genders = np.empty(len(samples), dtype='object')
     y_fractions = []
     for sample in samples:
@@ -30,16 +30,19 @@ def train_gender_model(samples, yfrac):
     gmm_x = np.linspace(0, 0.02, 5000)
     gmm_y = np.exp(gmm.score_samples(gmm_x.reshape(-1, 1)))
 
-    # import matplotlib.pyplot as plt
-    # fig, ax = plt.subplots(figsize=(10, 6))
-    # ax.hist(y_fractions, bins=50, normed=True)
-    # ax.plot(gmm_x, gmm_y, 'r-', label='Gaussian mixture fit')
-    # ax.set_xlim([0.001, 0.01])
-    # ax.legend(loc='best')
-    # plt.show()
-
-    if yfrac is not None:
-        cut_off = yfrac
+    if args.plotyfrac is not None:
+        import matplotlib.pyplot as plt
+        fig, ax = plt.subplots(figsize=(16, 6))
+        ax.hist(y_fractions, bins=50, normed=True)
+        ax.plot(gmm_x, gmm_y, 'r-', label='Gaussian mixture fit')
+        ax.set_xlim([0, 0.02])
+        ax.legend(loc='best')
+        plt.savefig(args.plotyfrac)
+        logging.info('Image written to {}, now quitting ...'.format(args.plotyfrac))
+        exit()
+
+    if args.yfrac is not None:
+        cut_off = args.yfrac
     else:
         sort_idd = np.argsort(gmm_x)
         sorted_gmm_y = gmm_y[sort_idd]
@@ -47,6 +50,7 @@ def train_gender_model(samples, yfrac):
         local_min_i = argrelextrema(sorted_gmm_y, np.less)
 
         cut_off = gmm_x[local_min_i][0]
+        logging.info('Determined --yfrac cutoff: {}'.format(str(round(cut_off, 4))))
 
     genders[y_fractions > cut_off] = 'M'
     genders[y_fractions < cut_off] = 'F'
diff --git a/wisecondorX/overall_tools.py b/wisecondorX/overall_tools.py
index 7579295..078b63d 100644
--- a/wisecondorX/overall_tools.py
+++ b/wisecondorX/overall_tools.py
@@ -101,7 +101,7 @@ def get_z_score(results_c, results):
         z = (segment[3] - null_mean) / null_sd
         z = min(z, 1000) ; z = max(z, -1000)
         if math.isnan(null_mean) or math.isnan(null_sd):
-            z = 'NaN'
+            z = 'nan'
         zs.append(z)
     return zs
 
@@ -120,3 +120,16 @@ def get_median_segment_variance(results_c, results_r):
             var = np.var(segment_r)
             vars.append(var)
     return np.median(vars)
+
+
+'''
+Returns CPA, measure for sample-wise abnormality.
+'''
+
+
+def get_cpa(results_c, binsize):
+    x = 0
+    for segment in results_c:
+        x += (segment[2] - segment[1] + 1) * binsize * abs(segment[3])
+    CPA = x / len(results_c) * (10 ** -8)
+    return(CPA)
\ No newline at end of file
diff --git a/wisecondorX/predict_output.py b/wisecondorX/predict_output.py
index 89f213d..4a5ef9f 100644
--- a/wisecondorX/predict_output.py
+++ b/wisecondorX/predict_output.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 
-from wisecondorX.overall_tools import exec_R, get_z_score, get_median_segment_variance
+from wisecondorX.overall_tools import exec_R, get_z_score, get_median_segment_variance, get_cpa
 
 '''
 Writes plots.
@@ -61,9 +61,9 @@ def _generate_bins_bed(rem_input, results):
             r = results_r[chr][i]
             z = results_z[chr][i]
             if r == 0:
-                r = 'NaN'
+                r = 'nan'
             if z == 0:
-                z = 'NaN'
+                z = 'nan'
             feat_str = '{}:{}-{}'.format(chr_name, str(feat), str(feat + binsize - 1))
             row = [chr_name, feat, feat + binsize - 1, feat_str, r, z]
             bins_file.write('{}\n'.format('\t'.join([str(x) for x in row])))
@@ -116,7 +116,7 @@ def __get_aberration_cutoff(beta, ploidy):
 
 
 def _generate_chr_statistics_file(rem_input, results):
-    stats_file = open('{}_chr_statistics.txt'.format(rem_input['args'].outid), 'w')
+    stats_file = open('{}_statistics.txt'.format(rem_input['args'].outid), 'w')
     stats_file.write('chr\tratio.mean\tratio.median\tzscore\n')
     chr_ratio_means = [np.ma.average(results['results_r'][chr], weights=results['results_w'][chr])
                        for chr in range(len(results['results_r']))]
@@ -126,7 +126,8 @@ def _generate_chr_statistics_file(rem_input, results):
     results_c_chr = [[x, 0, rem_input['bins_per_chr'][x] - 1, chr_ratio_means[x]]
                      for x in range(len(results['results_r']))]
 
-    msv = get_median_segment_variance(results['results_c'], results['results_r'])
+    msv = round(get_median_segment_variance(results['results_c'], results['results_r']), 5)
+    cpa = round(get_cpa(results['results_c'], rem_input['binsize']), 5)
     chr_z_scores = get_z_score(results_c_chr, results)
 
     for chr in range(len(results['results_r'])):
@@ -144,9 +145,19 @@ def _generate_chr_statistics_file(rem_input, results):
 
         stats_file.write('\t'.join([str(x) for x in row]) + '\n')
 
-    stats_file.write('Standard deviation ratios (per chromosome): {}\n'
-                     .format(str(np.nanstd(chr_ratio_means))))
+    stats_file.write('Gender based on --yfrac (or manually overridden by --gender): {}\n'
+                     .format(str(rem_input['actual_gender'])))
 
-    stats_file.write('Median segment variance (per bin): {}\n'
+    stats_file.write('Number of reads: {}\n'
+                     .format(str(rem_input['n_reads'])))
+
+    stats_file.write('Standard deviation of the ratios per chromosome: {}\n'
+                     .format(str(round(float(np.nanstd(chr_ratio_means)), 5))))
+
+    stats_file.write('Median segment variance per bin (doi: 10.1093/nar/gky1263): {}\n'
                      .format(str(msv)))
+
+    stats_file.write('Copy number profile abnormality (CPA) score (doi: 10.1186/s13073-020-00735-4): {}\n'
+                     .format(str(cpa)))
+
     stats_file.close()

From 733002858a27c41b27f8575a34b853f970b640d8 Mon Sep 17 00:00:00 2001
From: Lennart <leraman@MacBook-Pro-Lennart.local>
Date: Tue, 5 Jan 2021 12:57:31 +0100
Subject: [PATCH 2/3] Updated manual

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f1c9828..a7e6ec4 100755
--- a/README.md
+++ b/README.md
@@ -103,7 +103,7 @@ WisecondorX predict test_input.npz reference_input.npz output_id [--optional arg
 `--zscore x` | Z-score cutoff to call segments as aberrations (default: x=5)  
 `--alpha x` | P-value cutoff for calling circular binary segmentation breakpoints (default: x=1e-4)  
 `--beta x` | When beta is given, `--zscore` is ignored. Beta sets a ratio cutoff for aberration calling. It's a number between 0 (liberal) and 1 (conservative) and, when used, is optimally close to the purity (e.g. fetal/tumor fraction)  
-`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; example at `./example.blacklist/centromere.hg38.txt`)  
+`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; examples at `./example.blacklist/`)  
 `--gender x` | Force WisecondorX to analyze this case as male (M) or female (F). Useful when e.g. dealing with a loss of chromosome Y, which causes erroneous gender predictions (choices: x=F or x=M)
 `--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information  **(\*)**  
 `--plot` | Outputs custom .png plots (trisomy 21 NIPT example at `./example.plot`), directly interpretable  **(\*)**  

From b2f9cef6888f94940d6ee4960321397008847f7e Mon Sep 17 00:00:00 2001
From: Lennart <leraman@MacBook-Pro-Lennart.local>
Date: Tue, 5 Jan 2021 14:35:31 +0100
Subject: [PATCH 3/3] 50 bins -> 100 bins histogram

---
 wisecondorX/newref_tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wisecondorX/newref_tools.py b/wisecondorX/newref_tools.py
index 4724890..90898b0 100644
--- a/wisecondorX/newref_tools.py
+++ b/wisecondorX/newref_tools.py
@@ -33,7 +33,7 @@ def train_gender_model(args, samples):
     if args.plotyfrac is not None:
         import matplotlib.pyplot as plt
         fig, ax = plt.subplots(figsize=(16, 6))
-        ax.hist(y_fractions, bins=50, normed=True)
+        ax.hist(y_fractions, bins=100, normed=True)
         ax.plot(gmm_x, gmm_y, 'r-', label='Gaussian mixture fit')
         ax.set_xlim([0, 0.02])
         ax.legend(loc='best')