Skip to content

Commit

Permalink
Merge pull request #48 from CenterForMedicalGeneticsGhent/yfrac
Browse files Browse the repository at this point in the history
Yfrac
  • Loading branch information
matthdsm authored Sep 11, 2019
2 parents 2ec85d7 + 6061e1b commit d2f5a8b
Show file tree
Hide file tree
Showing 35 changed files with 27,300 additions and 27,287 deletions.
146 changes: 71 additions & 75 deletions README.md

Large diffs are not rendered by default.

5 changes: 1 addition & 4 deletions example.bed/ID_aberrations.bed
Original file line number Diff line number Diff line change
@@ -1,5 +1,2 @@
chr start end ratio zscore type
9 63600001 64100000 0.0759 -0.231133214620813 gain
16 34000001 36300000 0.0496 -0.1199119851304625 gain
21 8100001 9400000 0.0406 -0.4677773165898967 gain
21 12900001 46700000 0.1058 12.717698165549878 gain
21 13100001 46700000 0.0923 16.608291452927805 gain
54,210 changes: 27,105 additions & 27,105 deletions example.bed/ID_bins.bed

Large diffs are not rendered by default.

50 changes: 25 additions & 25 deletions example.bed/ID_chr_statistics.txt
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
chr ratio.mean ratio.median zscore
1 0.004801024173039574 0.004708641284525177 -0.1943411218777867
2 0.00904796528295331 0.009609029395419568 0.48988222904782713
3 0.0070667998784118315 0.006120352019274799 -0.04117431086520306
4 0.014308374998224929 0.013171808868526196 0.7679143972769512
5 0.005721646042904271 0.0067685835413810545 -0.018580262175933095
6 0.009585405233715245 0.00908471378696553 0.22573391898745276
7 0.008317050672746367 0.007692366189364621 0.0037107023278718913
8 0.005160050242513214 0.005713241514037909 -0.2042135192016979
9 0.01121468971966588 0.00977756579857901 0.047472187159729694
10 0.0030692311654508158 0.004583848280579345 -0.34837807476958
11 0.00688030906128233 0.007086721970776834 -0.10424614526746451
12 0.00549203759265961 0.005864243262039401 -0.2160328206663196
13 0.013675673399040739 0.013656105668541224 0.34217825239829247
14 0.0029720591963596945 0.003580148355038131 -0.5530616685579286
15 -0.0009506167889229975 0.00010174786641035236 -0.8054144166374848
16 0.0034939010454746585 0.0028736750549622245 -0.6702359144390323
17 0.006734104272005228 0.004098578784556573 -0.5477376361073563
18 0.00344428643908555 0.005665617680618514 -0.38259039482492824
19 0.01286934573127321 0.013759142662575248 -0.10135617298405078
20 0.006757472304359755 0.004402958567255848 -0.5488829694781852
21 0.10461425283698339 0.10551232650812012 10.316924768074003
22 0.01291675254414625 0.013535007662770668 -0.19161787807505576
X -0.006535316513306532 -0.007609995157314061 -1.3683545619974293
Standard deviation ratios (per chromosome): 0.02071953091750753
Median segment variance (per bin): 0.003717839883081593
1 -0.0032609199439199567 -0.0030574749279125464 -0.7181555916155843
2 -0.00222586397436516 -0.0013685886009461151 -0.4948482469949836
3 -0.0023969547839590264 -0.0011334488095482856 -0.6990839443004558
4 -0.0006541784524630243 0.0006980480764811368 0.07011094194494766
5 -0.002027482027326054 -0.0029393041209732663 -0.45940806974679793
6 0.0003975992265176495 0.0008366829702791123 0.6996009429776066
7 -0.0013405186922353768 0.00015858560133438651 -0.26070705932894234
8 -0.0018417550644656312 -0.0007432575407652726 -0.29488018108963754
9 -0.0004837345131196531 0.0014788938989835924 0.09001923578037066
10 -0.0015900220429246811 0.00015297606052258302 -0.14923486447099735
11 -0.0008582369348535444 -0.0010502284634716433 0.0448766391972359
12 -0.00025677831553574574 3.019246228672481e-05 0.15274375810537785
13 0.00010740967984628051 0.0010327967421443886 0.09509981977368882
14 -0.000769248617836592 0.001469153008167432 0.05796904654663658
15 -0.002433002616849088 -0.0021174826085320793 -0.29353041238958605
16 -0.0012881535434355064 -0.0010316510071278879 -0.09030886457732079
17 0.0026513354040225124 -0.0001100300013302676 0.5224786008875596
18 -0.0011608661300810323 -0.0005435053108220631 -0.06299183690262584
19 -0.005387259215980009 -0.007701648079880146 -0.08427206994165705
20 -0.0003655276543721735 -0.0007153791600984443 0.054850498488133136
21 0.08936088385621849 0.09039306322638224 16.291170671998824
22 -0.005828358472404008 -0.004006435094853288 -0.39925252564061936
X -0.010416143849382876 -0.00890714719904051 -0.5860122754447978
Standard deviation ratios (per chromosome): 0.018776073523327164
Median segment variance (per bin): 0.0028141000784848895
94 changes: 50 additions & 44 deletions example.bed/ID_segments.bed
Original file line number Diff line number Diff line change
@@ -1,45 +1,51 @@
chr start end ratio zscore
1 1500001 121600000 0.0002 -0.5395072376267049
1 145300001 248900000 0.0081 0.30459878873132157
2 1 89000000 0.0072 0.1367763758989364
2 91600001 92100000 -0.0271 -0.8562607942568246
2 94100001 241800000 0.0103 0.7868223823749212
3 1 91500000 0.0039 -0.18283000303325758
3 93700001 198100000 0.0068 0.19737583751164328
4 200001 49000000 0.0155 0.5459786931822127
4 51800001 190000000 0.0119 0.9287021405930612
5 100001 46400000 0.0122 0.26024999111297736
5 50100001 181300000 0.0041 -0.05882213461466398
6 400001 170600000 0.0086 0.6492843497284643
7 600001 58100000 0.008 0.022500534120313095
7 60800001 159300000 0.0064 0.09037951161445841
8 200001 43900000 0.007 -0.1252528999784414
8 45900001 145000000 0.0043 -0.11313793668249253
9 400001 38800000 0.0066 -0.18366700098732888
9 63600001 64100000 0.0759 -0.231133214620813
9 68200001 138100000 0.0099 0.281763763237288
10 1 38500000 0.0001 -0.6078005057190251
10 41600001 132600000 0.0055 -0.01757697408120559
11 200001 50800000 0.0082 0.009465295731846238
11 54500001 135100000 0.0053 -0.07562141723123818
12 100001 34200000 0.0023 -0.48737810672795356
12 37200001 133200000 0.0062 0.08865822546440758
13 18500001 114300000 0.0131 0.7988774227192919
14 18200001 106900000 0.0023 -0.33736788569132403
15 23400001 101900000 -0.0013 -0.6816000498882985
16 1600001 31900000 -0.0061 -0.9297593120177745
16 34000001 36300000 0.0496 -0.1199119851304625
16 46300001 90100000 0.0034 -0.3523900862540029
17 300001 21800000 0.0008 -0.5998312930566669
17 26600001 82900000 0.0038 -0.3278717786267936
18 100001 15400000 0.0081 -0.3185240131224995
18 20900001 80300000 0.0045 -0.24274630463611976
19 2400001 24200000 0.0171 0.05299272324196385
19 27200001 58500000 0.0102 -0.08637394776619045
20 100001 26300000 0.002 -0.5255374427060611
20 29500001 64300000 0.004 -0.3573887827136666
21 8100001 9400000 0.0406 -0.4677773165898967
21 12900001 46700000 0.1058 12.717698165549878
22 15100001 50800000 0.0116 0.09385249408905262
X 2900001 58400000 -0.0121 -1.3517040398386777
X 62400001 156000000 -0.0049 -0.9973236242906572
1 1800001 120500000 -0.0047 -1.056167328339317
1 124900001 125200000 0.0031 0.025868113152485
1 143000001 143300000 -0.0069 -0.1640725183584622
1 145300001 248900000 -0.0016 -0.25233416680728865
2 1 89000000 -0.0022 -0.293517826995595
2 91700001 92100000 0.0404 0.911861625111566
2 94100001 241800000 -0.0023 -0.4991782025123656
3 1 91500000 -0.0031 -0.6350590012594971
3 93700001 198100000 -0.0018 -0.36116967183169585
4 200001 49000000 -0.0053 -0.9855052748230904
4 51800001 190000000 0.0009 0.37467791309011167
5 100001 46400000 -0.0019 -0.17862035969056309
5 50100001 181300000 -0.0021 -0.45025286578556517
6 400001 170600000 0.0004 0.7006941034402193
7 600001 58100000 -0.0017 -0.3724598839131979
7 60800001 159300000 -0.0011 -0.13592123618109778
8 300001 43900000 -0.0001 0.07244426135762981
8 45900001 145100000 -0.0025 -0.2727898550049748
9 300001 38800000 0.0017 0.4790995535108751
9 43100001 43300000 0.0439 0.8848940097463155
9 63600001 63900000 0.0341 0.6366046862067515
9 68300001 138100000 -0.0019 -0.3202222334881715
10 100001 39500000 0.002 0.6259913126384714
10 41600001 132400000 -0.0032 -0.7442691851316088
11 900001 50800000 -0.002 -0.1663741141464333
11 54500001 135100000 -0.0001 0.209377263214418
12 100001 34200000 0.0019 0.4689276480386167
12 37200001 133100000 -0.001 -0.0112872115438825
13 18500001 114300000 0.0001 0.09381228506602132
14 18200001 106600000 -0.0008 0.050205762842884465
15 19800001 20000000 0.0102 0.22967876180626007
15 23300001 101700000 -0.0025 -0.30650224880446886
16 100001 31900000 -0.0016 -0.12066278499457352
16 33900001 36200000 0.0167 0.6817695488121762
16 46400001 90100000 -0.0016 -0.1572399361130969
17 200001 22700000 0.004 0.50903626654136
17 26600001 83200000 0.0021 0.3467494513526086
18 200001 15400000 0.0059 1.126802241716368
18 20900001 80200000 -0.0029 -0.4973857243626394
19 2400001 24300000 -0.0098 -0.3010273270741896
19 27200001 58500000 -0.0027 0.08938792970248823
20 100001 26400000 -0.0025 -0.18962034693116278
20 29500001 64300000 0.0014 0.16072260906557312
21 6300001 9400000 -0.0195 -0.6928045959344472
21 12900001 13100000 -0.0223 -0.2727467001285998
21 13100001 46700000 0.0923 16.608291452927805
22 12600001 12900000 -0.1316 -2.348890688250206
22 15900001 50800000 -0.005 -0.31613415132031625
X 400001 58500000 -0.0116 -0.6560723451986354
X 62400001 156000000 -0.0097 -0.5317932094740334
Binary file modified example.plot/chr1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr10.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr11.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr12.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr13.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr14.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr15.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr16.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr17.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr18.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr19.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr20.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr21.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr22.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr4.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr6.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr7.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr8.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chr9.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/chrX.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified example.plot/genome_wide.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 5 additions & 3 deletions pipeline/convert.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#!/usr/bin/env bash

# PARAMETERS
BAM_FILES="path/to/bam_files.txt" # reference or test cases

BAM_FILES="path/to/bam_files.txt" # file
# Example of the structure of this file:
# ID_1 path/to/ID_1.bam
# ID_2 path/to/ID_2.bam
OUTPUT_DIR="path/to/convert.npz"
# ...
OUTPUT_DIR="path/to/converts" # existing folder

# SCRIPT

Expand All @@ -17,4 +19,4 @@ while read LINE; do
echo "Creating 5kb bins for sample ${SAMPLE}"
WisecondorX convert ${BAM} ${OUTPUT_DIR}/${SAMPLE}.npz

done < ${BAM_FILES}
done < ${BAM_FILES}
10 changes: 5 additions & 5 deletions pipeline/newref.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#!/usr/bin/env bash

# PARAMETERS
CORES=6
INPUT_DIR="path/to/convert.npz" # existing (non-empty) folder, containing reference .npz files
OUTPUT_DIR="path/to/newref.npz" # existing (empty) folder

CORES=8
INPUT_DIR="path/to/converts_ref" # folder containing reference .npz files
OUTPUT_DIR="path/to/references" # existing folder
REF_SIZES="1000 500 200 100" # space separated list (kb)
RELEASE="hg38" # reference used to create bam files (solely used for reference filename)

# SCRIPT

Expand All @@ -14,6 +14,6 @@ do
echo "Creating reference at bins size ${REF} kb"

WisecondorX newref ${INPUT_DIR}/*.npz \
${OUTPUT_DIR}/reference.${RELEASE}.${REF}kb.npz \
${OUTPUT_DIR}/reference.${REF}kb.npz \
--binsize ${REF}000 --cpus ${CORES}
done
18 changes: 7 additions & 11 deletions pipeline/predict.sh
Original file line number Diff line number Diff line change
@@ -1,18 +1,14 @@
#!/usr/bin/env bash

# PARAMETERS
NPZ_FILES="path/to/samples.txt" # cases that will be tested versus the given reference

NPZ_FILES="path/to/npz_files.txt" # file
# Example of the structure of this file:
# ID_1 path/to/convert.npz/ID_1.npz
# ID_2 path/to/convert.npz/ID_2.npz
# ID_1 path/to/converts_test/ID_1.npz
# ID_2 path/to/converts_test/ID_2.npz
# ...
REF="path/to/newref.npz/reference.hg38.50kb.npz"
OUTPUT_DIR="path/to/predict.output" # existing output folder


# OPTIONAL PARAMETERS

OPT="--plot --bed"
REF="path/to/references/reference.100kb.npz"
OUTPUT_DIR="path/to/output_100kb" # existing folder

# SCRIPT

Expand All @@ -22,6 +18,6 @@ while read LINE; do
NPZ=$(echo $LINE | awk -F ' ' '{print $2}')

echo "Predicting sample ${SAMPLE}"
WisecondorX predict ${NPZ} ${REF} ${OUTPUT_DIR}/${SAMPLE} ${OPT}
WisecondorX predict ${NPZ} ${REF} ${OUTPUT_DIR}/${SAMPLE} --plot --bed

done <${NPZ_FILES}
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /usr/bin/env python
from setuptools import setup, find_packages

version = '1.1.2'
version = '1.1.3'
dl_version = 'master' if 'dev' in version else '{}'.format(version)

setup(
Expand Down
29 changes: 22 additions & 7 deletions wisecondorX/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ def tool_convert(args):
def tool_newref(args):
logging.info('Creating new reference')

if args.yfrac is not None:
if args.yfrac < 0 or args.yfrac > 1:
logging.critical(
'Parameter --yfrac should be a positive number lower than or equal to 1')
sys.exit()

split_path = list(os.path.split(args.outfile))
if split_path[-1][-4:] == '.npz':
split_path[-1] = split_path[-1][:-4]
Expand All @@ -52,8 +58,12 @@ def tool_newref(args):
samples.append(scale_sample(sample, binsize, args.binsize))

samples = np.array(samples)
genders, trained_cutoff = train_gender_model(samples)
genders, trained_cutoff = train_gender_model(samples, args.yfrac)

if genders.count('F') < 5 and args.nipt:
logging.warning(
'A NIPT reference should have at least 5 female feti samples. Removing --nipt flag.')
args.nipt = False
if not args.nipt:
for i, sample in enumerate(samples):
samples[i] = gender_correct(sample, genders[i])
Expand Down Expand Up @@ -124,12 +134,12 @@ def tool_test(args):
if args.beta is not None:
if args.beta <= 0 or args.beta > 1:
logging.critical(
'Parameter --beta should be a strictly positive number lower than 1')
'Parameter --beta should be a strictly positive number lower than or equal to 1')
sys.exit()

if args.alpha <= 0 or args.alpha > 1:
logging.critical(
'Parameter --alpha should be a strictly positive number lower than 1')
'Parameter --alpha should be a strictly positive number lower than or equal to 1')
sys.exit()

logging.info('Importing data ...')
Expand Down Expand Up @@ -198,9 +208,9 @@ def tool_test(args):

results_r = np.append(results_r, results_r_2)
results_z = np.append(results_z, results_z_2) - m_z
results_w = np.append(results_w * np.nanmedian(results_w_2),
results_w_2 * np.nanmedian(results_w))
results_w = results_w / np.nanmedian(results_w)
results_w = np.append(results_w * np.nanmean(results_w_2),
results_w_2 * np.nanmean(results_w))
results_w = results_w / np.nanmean(results_w)

if np.isnan(results_w).any() or np.isinf(results_w).any():
logging.warning('Non-numeric values found in weights -- reference too small. Circular binary segmentation and z-scoring will be unweighted')
Expand Down Expand Up @@ -292,7 +302,12 @@ def main():
help='Path and filename for the reference output (e.g. path/to/myref.npz)')
parser_newref.add_argument('--nipt',
action='store_true',
help='Use only for NIPT! (e.g. path/to/reference/*.npz)'
help='Use flag for NIPT (e.g. path/to/reference/*.npz)'
)
parser_newref.add_argument('--yfrac',
type=float,
default=None,
help='Use to manually set the y read fraction cutoff, which defines gender'
)
parser_newref.add_argument('--refsize',
type=int,
Expand Down
15 changes: 8 additions & 7 deletions wisecondorX/newref_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
'''


def train_gender_model(samples):
def train_gender_model(samples, yfrac):
genders = np.empty(len(samples), dtype='object')
y_fractions = []
for sample in samples:
Expand All @@ -38,14 +38,15 @@ def train_gender_model(samples):
# ax.legend(loc='best')
# plt.show()

sort_idd = np.argsort(gmm_x)
sorted_gmm_y = gmm_y[sort_idd]
if yfrac is not None:
cut_off = yfrac
else:
sort_idd = np.argsort(gmm_x)
sorted_gmm_y = gmm_y[sort_idd]

local_min_i = argrelextrema(sorted_gmm_y, np.less)
local_min_i = argrelextrema(sorted_gmm_y, np.less)

cut_off = gmm_x[local_min_i][0]

# plot(cut_off)
cut_off = gmm_x[local_min_i][0]

genders[y_fractions > cut_off] = 'M'
genders[y_fractions < cut_off] = 'F'
Expand Down

0 comments on commit d2f5a8b

Please sign in to comment.