-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path04_runVEP.pl
executable file
·759 lines (672 loc) · 28.3 KB
/
04_runVEP.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
#!/usr/bin/perl
############################################################################################
# Copyright (C) Nicolas Thierry-Mieg, 2019-2025
#
# This file is part of grexome-TIMC-Secondary, written by Nicolas Thierry-Mieg
# (CNRS, France) [email protected]
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with this program.
# If not, see <https://www.gnu.org/licenses/>.
############################################################################################
# 24/03/2018
# NTM
# Read on stdin a VCF file, write to stdout a similar VCF file
# with added VEP annotations.
# The VEP command-line args and VEP plugins used are hard-coded
# in &vepCommand(), edit that sub if you want different VEP fields
# (and then you should also edit @goodVeps at the top of 05_vcf2tsv.pl)
#
# 11/08/2019: adding a cache file (VEP is slow).
# CSQ are taken from the cachefile when the variant is in it,
# otherwise run VEP and add CSQ to cache.
# If VEP and/or the VEP-provided cache are updated, script dies and explains that
# cachefile must be manually removed. It will then be rebuilt from scratch
# on the next execution.
use strict;
use warnings;
use File::Basename qw(basename);
use Getopt::Long;
use POSIX qw(strftime);
# import LOCK_* constants
use Fcntl qw(:flock);
# Storable for the cache
use Storable qw(store retrieve lock_retrieve);
# we use $0 in every stderr message but we really only want
# the program name, not the path
$0 = basename($0);
### enclosing block for the "main", to enforce that the subs don't use any
### global/file-level-my variables except $0
{
#########################################################################
## options / params from the command-line
# our VEP cachefile, with path
my $cacheFile;
# genome fasta, filename with path, for VEP --hgvs
my $genome;
# species for VEP --species
my $species = "homo_sapiens";
# dir containing subdirs with the data required by the VEP plugins
# we use (eg dbNSFP/)
my $dataDir;
# tmp dir, preferably on a RAMdisk or at least SSD
my $tmpDir;
# debug: if true don't use any annotations from our cacheFile, just compare
# the VEP annotations with those in the cache and report any diffs to stderr
my $debug = '';
# vep "binary", with path if it's not in PATH
my $vepBin = "vep";
# number of jobs that VEP can run in parallel (--fork)
my $vepJobs = 6;
# help: if true just print $USAGE and exit
my $help = '';
my $USAGE = "\nRead on stdin a VCF file, write to stdout a similar VCF file with added VEP annotations.\n
Arguments (all can be abbreviated to shortest unambiguous prefixes):
--cacheFile string : filename with path where this script stores its cachefile
--genome string : ref genome fasta, with path
--species string [default $species]: species, as expected by VEP (eg mus_musculus)
--dataDir string : dir containing subdirs with the data required by the VEP plugins we use (eg dbNSFP)
--tmpDir string : tmp dir, must not pre-exist, will be removed after running
--debug : don't use the cacheFile, just report discrepancies between it and VEP
--vep [default $vepBin] : VEP program, with path if it's not in PATH
--jobs [default $vepJobs] : number of jobs that VEP can run in parallel
--help : print this USAGE";
GetOptions ("cacheFile=s" => \$cacheFile,
"genome=s" => \$genome,
"species=s" => \$species,
"dataDir=s" => \$dataDir,
"tmpDir=s" => \$tmpDir,
"debug" => \$debug,
"vep=s" => \$vepBin,
"jobs=i" => \$vepJobs,
"help" => \$help)
or die("E $0: Error in command line arguments\n$USAGE\n");
# make sure required options were provided and sanity check them
($help) && die "$USAGE\n\n";
($cacheFile) || die "E $0: you must provide a cacheFile\n$USAGE\n";
# if cacheFile exists we need write-access to it
if (-e $cacheFile) {
(-w $cacheFile) ||
die "E $0: provided cacheFile $cacheFile exists but isn't writable\n";
}
# otherwise it's OK we'll create it, but we need write-access to the subdir
else {
open(TEST, ">>$cacheFile") ||
die "E $0: provided cacheFile $cacheFile doesn't exist and we can't create it\n";
close(TEST);
unlink($cacheFile);
}
($genome) || die "E $0: you must provide a ref genome fasta file\n";
(-f $genome) || die "E $0: provided genome fasta file doesn't exist\n";
($species =~ /^\w+$/) || die "E $0: VEP species must be alphanumeric+underscores\n";
($dataDir) ||
die "E $0: you must provide a dataDir containing the data required by the VEP plugins we use\n";
(-d $dataDir) ||
die "E $0: the provided dataDir $dataDir doesn't exist or isn't a dir\n";
($tmpDir) || die "E $0: you must provide a non-existing tmpDir\n";
(-e $tmpDir) &&
die "E $0: tmpDir $tmpDir exists, please rm -r $tmpDir or use another tmpDir as arg\n";
mkdir($tmpDir) || die "E $0: cannot create tmpDir $tmpDir\n";
system("which $vepBin &> /dev/null") &&
die "E $0: the VEP executable $vepBin can't be found\n";
($vepJobs =~ /^\d+$/) || die "E $0: --jobs must be an int, you provided $vepJobs\n";
my $now = strftime("%F %T", localtime);
warn "I $now: $0 - starting to run\n";
##########################################################################
# vep --no_stats results in buggy VEP output (VEP git issue 1034), so we produce
# stats in $vepStats and remove the file when done
my $vepStats = "$tmpDir/vepStats.html";
# construct full VEP command-line
my $vepCommand = &vepCommand($vepBin, $dataDir, $genome, $species, $vepJobs, $vepStats);
# read $cacheFile if provided
# $cache is a hashref. key=="chr:pos:ref:alt" (for SNVs/indels) or
# "chr:pos:ref:alt:end" (for CNVs), value==CSQ, + 2 special keys
# "VEPVERSION" and "INFOCSQ" used to store the VEP version and fields
my $cache = {};
# grab previously cached data except in debug mode
if ((!$debug) && (-f $cacheFile) && (! -z $cacheFile)) {
$cache = &lock_retrieve($cacheFile) ||
die "E $0: cachefile $cacheFile exists and is not empty but I can't retrieve hash from it.\n";
}
# $cacheUpdate is a hashref, same key->value pairs as $cache but it
# will hold new VEP CSQs for keys that were missing in $cache
my $cacheUpdate = {};
#################################
# headers:
# - send header of VCF on stdin to VEP, to make sure the cache isn't stale;
# - if AOK, print VEP-processed header immediately to stdout;
# - store the #CHROM header line in $headerLine, we'll need it later for VEP.
my $headerLine = &checkHeaders($cache, $cacheUpdate, $tmpDir, $vepCommand);
if (!$headerLine) {
# VEP version or CSQ mismatch detected
unlink($vepStats) || die "E $0: cannot unlink vepStats file $vepStats\n";
rmdir($tmpDir) ||
warn "W $0: VEP version or INFO-CSQ mismatch but can't rmdir tmpDir $tmpDir\n";
die "E $0: VEP version or INFO-CSQ mismatch. If you updated your VEP cache or VEP ".
"options/plugins this script's cachefile is now stale, you need to rm $cacheFile ".
"and rebuild a fresh cache.\n";
}
#################################
# data
# - $headerLine and data lines absent from cache are sent to VEP, and result
# is printed to $vcfFromVep;
# - data lines found in cache are annotated using cache and printed
# to $vcfFromCache;
# - whenever we change chromosomes, start a thread that merges the fromVep
# and fromCache files from previous chrom and prints results to stdout.
# Both $vcf temp files are gzipped --fast, because they can get large and
# $tmpDir can be a smallish ramdisk.
# filenames and FILEHANDLES are per-chromosome
my ($vcfFromVep, $vcfFromCache);
my ($VCFVEP, $VCFCACHE);
# chrom we were working on in previous stdin line
my $prevChr;
# parse data lines from stdin
while (my $line = <STDIN>) {
chomp($line);
my @f = split(/\t/,$line,-1);
(@f >= 8) || die "E $0: VCF line doesn't have >= 8 columns:\n$line\n";
my $chr = $f[0];
if (!defined $prevChr) {
# this is the first data line
$vcfFromVep = "$tmpDir/vcfFromVep_$chr.vcf.gz";
(-e $vcfFromVep) &&
die "E $0: vcfFromVep $vcfFromVep already exists\n";
open($VCFVEP, "| $vepCommand | gzip -c --fast > $vcfFromVep") ||
die "E $0: cannot run VEP on new data for chrom $chr\n";
$vcfFromCache = "$tmpDir/vcfFromCache_$chr.vcf.gz";
(-e $vcfFromCache) &&
die "E $0: vcfFromCache $vcfFromCache already exists\n";
open($VCFCACHE, "| gzip -c --fast > $vcfFromCache") ||
die "E $0: cannot open gzip pipe to vcfFromCache $vcfFromCache\n";
# send minimal header to VEP
print $VCFVEP $headerLine;
$prevChr = $chr;
}
elsif ($prevChr ne $chr) {
# we changed chrom, start merging and printing results from prevChr
close($VCFVEP);
close($VCFCACHE);
$now = strftime("%F %T", localtime);
warn "I $now: $0 - finished parsing/processing chrom $prevChr\n";
# merge prevChr files
&mergeAndPrint($vcfFromVep, $vcfFromCache, $cacheUpdate);
unlink($vcfFromVep, $vcfFromCache);
# set up new outputs
$vcfFromVep = "$tmpDir/vcfFromVep_$chr.vcf.gz";
(-e $vcfFromVep) &&
die "E $0: vcfFromVep $vcfFromVep already exists\n";
open($VCFVEP, "| $vepCommand | gzip -c --fast > $vcfFromVep") ||
die "E $0: cannot run VEP on new data for chrom $chr\n";
$vcfFromCache = "$tmpDir/vcfFromCache_$chr.vcf.gz";
(-e $vcfFromCache) &&
die "E $0: vcfFromCache $vcfFromCache already exists\n";
open($VCFCACHE, "| gzip -c --fast > $vcfFromCache") ||
die "E $0: cannot open gzip pipe to vcfFromCache $vcfFromCache\n";
# send minimal header to VEP
print $VCFVEP $headerLine;
$prevChr = $chr;
}
# no else: process $line whether we changed chrom or not
# key: chrom:pos:ref:alt for SNVs/indels, chrom:pos:ref:alt:end for CNVs
my $key = "$f[0]:$f[1]:$f[3]:$f[4]";
if (($f[4] eq '<DUP>') || ($f[4] eq '<DEL>')) {
# CNV: grab END
($f[7] =~ /END=(\d+)/) ||
die "E $0: cannot grab END from CNV line:\n$line\n";
$key .= ":$1";
}
if (defined $cache->{$key}) {
# just copy CSQ from cache
($f[7]) && ($f[7] ne '.') && ($f[7] .= ';');
($f[7]) && ($f[7] eq '.') && ($f[7] = '');
$f[7] .= 'CSQ='.$cache->{$key};
print $VCFCACHE join("\t",@f)."\n";
}
else {
print $VCFVEP "$line\n";
}
}
# merge VCFs for last chrom, unless stdin was completely empty
if ($prevChr) {
$now = strftime("%F %T", localtime);
warn "I $now: $0 - finished parsing/processing chrom $prevChr\n";
close($VCFVEP);
close($VCFCACHE);
# merge last chr files
&mergeAndPrint($vcfFromVep, $vcfFromCache, $cacheUpdate);
unlink($vcfFromVep, $vcfFromCache);
# update cache with new CSQs
&updateCache($cacheFile, $cacheUpdate, $debug);
}
# clean up
unlink($vepStats) || die "E $0: cannot unlink vepStats file $vepStats\n";
rmdir($tmpDir) || die "E $0: cannot rmdir tmpdir $tmpDir\n";
$now = strftime("%F %T", localtime);
warn "I $now: $0 - ALL DONE, completed successfully!\n";
}
##########################################################################
### SUBS
##########################################################################
#######################
# construct and return the full VEP command-line
sub vepCommand {
(@_ == 6) || die "E $0: vepCommand needs 6 args";
my ($vepBin, $dataDir, $genome, $species, $vepJobs, $vepStats) = @_;
# --no_stats results in buggy VEP output (VEP git issue 1034), so we produce
# stats in $vepStats and the caller must remove the file when done
# VEP command, reading on stdin and printing to stdout
my $vepCommand = $vepBin;
$vepCommand .= " --offline --format vcf --vcf";
# cache to use: refseq, merged, or ensembl (default)
# $vepCommand .= " --merged";
$vepCommand .= " --species $species";
$vepCommand .= " --force_overwrite";
$vepCommand .= " --stats_file $vepStats";
$vepCommand .= " --allele_number"; # for knowing which CSQ annotates which ALT
$vepCommand .= " --canonical --biotype --symbol";
$vepCommand .= " --numbers --total_length --variant_class";
# report where the variant lies in the miRNA secondary structure
$vepCommand .= " --mirna";
# reduce default distance (5000) for annotating [upstream|downstream]_gene_variant
$vepCommand .= " --distance 1000";
# right-align indels before consequence calculation, see:
# https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#shifting
$vepCommand .= " --shift_3prime 1";
# commenting out "--domains", it's a bit massive and in non-deterministic order
# and we don't currently look at it
# also removing --pubmed, don't think anyone looks at that either
# I also tried out --regulatory but it's not really usable IMO, eg we obtain
# ENSR and ENSM features but we can't know the target genes...
## other possibilities to consider: --tsl --appris
if (($species eq 'homo_sapiens') || ($species eq 'human')) {
# human-only options
# limit to "GENCODE basic" transcripts ("removes fragmented or problematic transcripts")
# in v113 this is 158k transcripts, including all 19k MANE, almost all 41k "GENCODE Primary",
# and 72.7k/78.7k "Ensembl canonical"
$vepCommand .= " --gencode_basic";
$vepCommand .= " --mane --sift b --polyphen b";
# --af is 1KG-phase3 global AF, --af_1kg is per continent AFs
$vepCommand .= " --af --af_gnomade --af_gnomadg";
$vepCommand .= " --check_existing";
# Don't URI escape HGVS strings
$vepCommand .= " --no_escape";
$vepCommand .= " --fasta $genome --hgvs";
# plugins: all are human-only
# full --plugin string with all VEP plugins we want to use and
# associated datafiles (with paths)
my $vepPlugins = "";
# CADD - dbNSFP provides it (among many other things) for coding and consensus
# splice site variants, but not for deeper intronic variants
my $caddPath = "$dataDir/CADD/";
if (-d $caddPath) {
my $caddSnvs = "$caddPath/whole_genome_SNVs.tsv.gz";
my $caddIndels = "$caddPath/gnomad.genomes.r4.0.indel.tsv.gz";
if (-f $caddSnvs) {
if (-f $caddIndels) {
$vepPlugins .= " --plugin CADD,snv=$caddSnvs,indels=$caddIndels";
}
else {
warn "W: $0 - not using CADD plugin: cannot find CADD indels file, looking for $caddIndels\n";
}
}
else {
warn "W: $0 - not using CADD plugin: cannot find CADD SNVs file, looking for $caddSnvs\n";
}
}
else {
warn "W $0 - not using CADD plugin: CADD datadir doesn't exist, looking for $caddPath\n";
}
# dbNSFP and dbscSNV
my $dbNsfpPath = "$dataDir/dbNSFP/";
if (-d $dbNsfpPath) {
my $dbNsfpFile = "$dbNsfpPath/dbNSFP4.8a.gz";
if (-f $dbNsfpFile) {
# comma-separated list of fields to retrieve from dbNSFP, there are MANY
# possibilities, check the README in $dbNsfpPath
my $dbNsfpFields = "MutationTaster_pred,REVEL_rankscore,CADD_raw_rankscore";
# MetaRNN: both MetaRNN_rankscore and MetaRNN_pred: T(olerated) or D(amaging)
$dbNsfpFields .= ",MetaRNN_rankscore,MetaRNN_pred";
# ALFA minor allele freq: grab from dbNSFP, it's not in VEP cache despite
# https://github.com/Ensembl/ensembl-vep/issues/1043
$dbNsfpFields .= ",ALFA_Total_AF";
$vepPlugins .= " --plugin dbNSFP,$dbNsfpFile,transcript_match=1,$dbNsfpFields";
}
else {
warn "W: $0 - not using dbNSFP plugin: cannot find dbNSFP file, looking for $dbNsfpFile\n";
}
# dbscSNV (splicing), data is with dbNSFP (same authors)
my $dbscSNVfile = "$dbNsfpPath/dbscSNV1.1_GRCh38.txt.gz";
if (-f $dbscSNVfile) {
# specify assembly GRCh38 as second param because the plugin can't figure it out
$vepPlugins .= " --plugin dbscSNV,$dbscSNVfile,GRCh38";
}
else {
warn "W: $0 - not using dbscSNV plugin: cannot find dbscSNV file, looking for $dbscSNVfile\n";
}
}
else {
warn "W $0 - not using dbNSFP and dbscSNV plugins: datadir doesn't exist, looking for $dbNsfpPath\n";
}
# spliceAI - data must be DL'd, for that you have to create an account, provide your email
# and personal details... see eg:
# https://github.com/Ensembl/VEP_plugins/blob/release/113/SpliceAI.pm
my $spliceAIPath = "$dataDir/SpliceAI/";
if (-d $spliceAIPath) {
my $spliceAISnvs = "$spliceAIPath/spliceai_scores.raw.snv.hg38.vcf.gz";
my $spliceAIIndels = "$spliceAIPath/spliceai_scores.raw.indel.hg38.vcf.gz";
if (-f $spliceAISnvs) {
if (-f $spliceAIIndels) {
$vepPlugins .= " --plugin SpliceAI,snv=$spliceAISnvs,indel=$spliceAIIndels";
}
else {
warn "W: $0 - not using spliceAI plugin: cannot find spliceAI indels file, looking for $spliceAIIndels\n";
}
}
else {
warn "W: $0 - not using spliceAI plugin: cannot find spliceAI SNVs file, looking for $spliceAISnvs\n";
}
}
else {
warn "W $0 - not using spliceAI plugin: spliceAI datadir doesn't exist, looking for $spliceAIPath\n";
}
# AlphaMissense, data file must be DL'd as $alphaMSfile and tabix-indexed, see:
# https://github.com/Ensembl/VEP_plugins/blob/release/113/AlphaMissense.pm
my $alphaMSfile = "$dataDir/AlphaMissense/AlphaMissense_hg38.tsv.gz";
if (-f $alphaMSfile) {
$vepPlugins .= " --plugin AlphaMissense,file=$alphaMSfile";
}
else {
warn "W: $0 - not using AlphaMissense plugin:cannot find alphaMissense file, looking for $alphaMSfile\n";
}
# pLI, data file must be DL'd as $pLIfile, see:
# https://github.com/Ensembl/VEP_plugins/blob/release/113/pLI.pm
my $pLIfile = "$dataDir/pLI/pLI_values.txt";
if (-f $pLIfile) {
$vepPlugins .= " --plugin pLI,$pLIfile";
}
else {
warn "W: $0 - not using pLI plugin:cannot find pLI file, looking for $pLIfile\n";
}
$vepCommand .= $vepPlugins;
}
# --fork borks when $vepJobs==1
($vepJobs > 1) && ($vepCommand .= " --fork $vepJobs");
# write output to stdout so we can pipe it to another program
$vepCommand .= " -o STDOUT";
return($vepCommand);
}
#######################
# checkHeaders:
# - read VCF headers from STDIN;
# - run them through VEP, make sure the cache isn't stale;
# - if AOK, print VEP-processed header to stdout;
# - if $cache was empty, populate VEPVERSION and INFOCSQ in $cacheUpdate;
# - return the #CHROM header line, or "" if the cache is stale
sub checkHeaders {
(@_ == 4) || die "E $0: checkHeaders needs 4 args";
my ($cache, $cacheUpdate, $tmpDir, $vepCommand) = @_;
# #CHROM header line to return, will be set to "" if cache is stale
my $headerLine;
# make a small VCF containing all the headers from STDIN
my $vcfHeader = "$tmpDir/vcfHeader.vcf";
open(my $VCFHEAD, "> $vcfHeader") ||
die "E $0: cannot open vcfHeader $vcfHeader for writing\n";
while (my $line = <STDIN>) {
($line =~ /^#/) ||
die "E $0 in checkHeaders: STDIN isn't a correct VCF, found line:\n$line";
# header lines go to VCFHEAD
print $VCFHEAD $line;
# line #CHROM is always last header line
($line =~ /^#CHROM/) && ($headerLine = $line) && last;
}
close($VCFHEAD);
($headerLine) ||
die "E $0 in checkHeaders: STDIN is not a VCF, comment lines don't end with #CHROM line\n";
# run VEP on header file
open(my $VCFHEAD_OUT, "$vepCommand < $vcfHeader |") ||
die "E $0: cannot run VEP on header file with:\n$vepCommand < $vcfHeader\n";
# check that the cache matches the VEP version and has the correct VEP columns,
# if AOK print processed headers to stdout
my $headerToPrint = "";
while (my $line = <$VCFHEAD_OUT>) {
$headerToPrint .= $line;
chomp($line);
if ($line =~ /^##VEP=/) {
# make sure VEP version + DBs match the cache
# need to remove timestamp
my $lineClean = $line;
($lineClean =~ s/time="[^"]+" //) ||
die "E $0: cannot remove timestamp from ##VEP line:\n$line\n";
# also remove any path before /.vep/ in cache= so different users can use the
# same cacheFile if they have their own VEP install with same cache versions
$lineClean =~ s~(cache=")[^"]+(/.vep/)~$1$2~; # no "||die", eg if the user changed his VEPDIR
# also remove ensembl* fields, they appear in a random order that changes from run to run:
# https://github.com/Ensembl/ensembl-vep/issues/1540
$lineClean =~ s/\sensembl\S+//g ||
die "E $0: cannot remove ensembl* version info from ##VEP line:\n$line\n";
if (defined $cache->{"VEPversion"}) {
my $cacheLine = $cache->{"VEPversion"};
if ($cacheLine ne $lineClean) {
# version mismatch
$headerLine = "";
warn "E: $0 - cached VEP version and ##VEP line from VCF are different:\n$cacheLine\n$lineClean\n";
last;
}
}
else {
$cacheUpdate->{"VEPversion"} = $lineClean;
}
}
elsif ($line =~ /^##INFO=<ID=CSQ/) {
# make sure the INFO->CSQ fields from file and cache match
if (defined $cache->{"INFOCSQ"}) {
my $cacheLine = $cache->{"INFOCSQ"};
if ($cacheLine ne $line) {
$headerLine = "";
warn "E: $0 - cacheLine and INFO-CSQ line from VCF are different:\n$cacheLine\n$line\n";
last;
}
}
else {
$cacheUpdate->{"INFOCSQ"} = $line;
}
}
}
close($VCFHEAD_OUT);
unlink($vcfHeader);
# print headers unless there was a VEP version or INFO-CSQ mismatch
($headerLine) && (print $headerToPrint);
# return $headerLine, whether empty or not
return($headerLine);
}
#######################
# Merge $vcfFromVep and $vcfFromCache, printing resulting VCF to stdout.
# Header lines are ignored.
# $vcfFromVep and $vcfFromCache must both concern a single identical CHROM.
# All CSQs in $vcfFromVep are added to $cacheUpdate, a hashref similar
# to $cache: key=="chr:pos:ref:alt" for SNVs/indels and "chrom:pos:ref:alt:end"
# for CNVs, value==CSQ
sub mergeAndPrint {
(@_ == 3) || die "E $0: mergeAndPrint needs 3 args";
my ($vcfFromVep, $vcfFromCache, $cacheUpdate) = @_;
open(my $FROMVEP, "gunzip -c $vcfFromVep |") ||
die "E $0: cannot gunzip-open FROMVEP $vcfFromVep\n";
open (my $FROMCACHE, "gunzip -c $vcfFromCache |") ||
die "E $0: cannot gunzip-open FROMCACHE $vcfFromCache\n";
# next data lines from each file
my ($nextVep, $nextCache);
# POS from each next line (undef if no more data)
my ($nextVepPos, $nextCachePos);
# also need ALT and END (set to -1 if absent), to obtain deterministic merged output
my ($nextVepAlt, $nextCacheAlt, $nextVepEnd, $nextCacheEnd);
# CHROM, must be the same for both files and all lines
my $chr;
#######################
# prime with first lines, skipping headers
do {
$nextVep = <$FROMVEP>;
} while(($nextVep) && ($nextVep =~ /^#/));
if ($nextVep) {
# grab next* values for deterministic merging of fromVep and fromCache, and
# also populate $cacheUpdate for updating cache at the end
my @f = split(/\t/,$nextVep);
(@f >= 8) || die "E $0: nextVep line doesn't have >=8 columns:\n$nextVep";
($chr,$nextVepPos,$nextVepAlt)=($f[0],$f[1],$f[4]);
# key for $cacheUpdate: chrom:pos:ref:alt for SNVs/indels, chrom:pos:ref:alt:end for CNVs
my $key = "$f[0]:$f[1]:$f[3]:$f[4]";
$nextVepEnd = -1;
if (($nextVepAlt eq '<DUP>') || ($nextVepAlt eq '<DEL>')) {
# CNV: grab END
($f[7] =~ /END=(\d+)/) ||
die "E $0: cannot grab END in nextVep line:\n$nextVep";
$nextVepEnd = $1;
$key .= ":$1";
}
my $csq = "";
if ($f[7] =~ /CSQ=([^;]+)/) {
$csq = $1;
}
else {
warn "I $0: cannot grab CSQ in nextVep line, this is expected for huge SVs only:\n$nextVep";
}
$cacheUpdate->{$key} = $csq;
}
$nextCache = <$FROMCACHE>;
if ($nextCache) {
my @f = split(/\t/,$nextCache);
my $thisChr = $f[0];
if (!defined $chr) {
$chr = $thisChr;
}
else {
($chr eq $thisChr) ||
die"E $0: vcfFromCache has line with bad chrom, expected $chr:\n$nextCache\n";
}
($nextCachePos,$nextCacheAlt) = ($f[1],$f[4]);
$nextCacheEnd = -1;
if (($nextCacheAlt eq '<DUP>') || ($nextCacheAlt eq '<DEL>')) {
# CNV: grab END
($f[7] =~ /END=(\d+)/) ||
die "E $0: cannot grab END in nextCache line:\n$nextCache";
$nextCacheEnd = $1;
}
}
#######################
# as long as there is data
while ($nextVep || $nextCache) {
if (($nextVep) && ((! $nextCache) ||
($nextVepPos < $nextCachePos) ||
(($nextVepPos == $nextCachePos) && ($nextVepAlt lt $nextCacheAlt)) ||
(($nextVepPos == $nextCachePos) && ($nextVepAlt eq $nextCacheAlt) && ($nextVepEnd < $nextCacheEnd)) )) {
# still have VEP data and it must be printed now
print $nextVep;
# read next VEP line
$nextVep = <$FROMVEP>;
if ($nextVep) {
# grab next* values for deterministic merging of fromVep and fromCache, and
# also populate $cacheUpdate for updating cache at the end
my @f = split(/\t/,$nextVep);
(@f >= 8) || die "E $0: nextVep line doesn't have >=8 columns:\n$nextVep";
my $thisChr = $f[0];
($chr eq $thisChr) ||
die "E $0: vcfFromVep has line with bad chrom, expected $chr:\n$nextVep\n";
($nextVepPos,$nextVepAlt)=($f[1],$f[4]);
# key for $cacheUpdate: chrom:pos:ref:alt for SNVs/indels, chrom:pos:ref:alt:end for CNVs
my $key = "$f[0]:$f[1]:$f[3]:$f[4]";
$nextVepEnd = -1;
if (($nextVepAlt eq '<DUP>') || ($nextVepAlt eq '<DEL>')) {
# CNV: grab END
($f[7] =~ /END=(\d+)/) ||
die "E $0: cannot grab END in nextVep line:\n$nextVep";
$nextVepEnd = $1;
$key .= ":$1";
}
my $csq = "";
if ($f[7] =~ /CSQ=([^;]+)/) {
$csq = $1;
}
else {
warn "I $0: cannot grab CSQ in nextVep line, this is expected for huge SVs only:\n$nextVep";
}
$cacheUpdate->{$key} = $csq;
}
}
else {
# cache data must be printed
print $nextCache;
$nextCache = <$FROMCACHE>;
if ($nextCache) {
my @f = split(/\t/,$nextCache);
my $thisChr = $f[0];
($chr eq $thisChr) ||
die"E $0: vcfFromCache has line with bad chrom, expected $chr:\n$nextCache\n";
($nextCachePos,$nextCacheAlt) = ($f[1],$f[4]);
$nextCacheEnd = -1;
if (($nextCacheAlt eq '<DUP>') || ($nextCacheAlt eq '<DEL>')) {
# CNV: grab END
($f[7] =~ /END=(\d+)/) ||
die "E $0: cannot grab END in nextCache line:\n$nextCache";
$nextCacheEnd = $1;
}
}
}
}
#######################
# clean up
close($FROMVEP);
close($FROMCACHE);
}
#######################
# - Read cache from $cacheFile and compare with entries from %$cacheupdate;
# - save new entries from %$cacheupdate to $cacheFile (unless $debug);
# - if hashrefs have common keys, warn if the values disagree.
# This sub is thread-safe (can be called by several jobs running in parallel).
sub updateCache {
(@_ == 3) || die "E $0: updateCache needs 3 args";
my ($cacheFile, $cacheUpdate, $debug) = @_;
# return immediately if there's nothing to update
(%$cacheUpdate) || return();
# lock $cacheFile and retrieve a fresh copy (which will include any
# updates made by other processes while this process was running), then
# add any new entries from %$cacheUpdate and store
# -> works correctly even if several jobs are running in parallel
open(CACHELOCK, ">>$cacheFile") ||
die "E $0: I want to update cacheFile but I can't open it for locking\n";
flock(CACHELOCK, LOCK_EX) || die "E $0: cannot get lock on cachefile\n";
my $cache = {};
if (! -z $cacheFile) {
$cache = &retrieve($cacheFile) ||
die "E $0: I can't retrieve fresh copy of cache from cachefile $cacheFile\n";
}
foreach my $k (keys(%$cacheUpdate)) {
if (defined($cache->{$k})) {
# annotation for key $k has been added to $cacheFile while we were running
# (or was already there if $debug), make sure it's consistent
($cache->{$k} eq $cacheUpdate->{$k}) ||
warn "W $0: cacheFile entry for $k was added while we were running (or we are in debugVep mode)".
" and disagrees with our entry, we keep the cached version:\n".
$cache->{$k}."\n".$cacheUpdate->{$k}."\n";
}
else {
$cache->{$k} = $cacheUpdate->{$k};
}
}
if (! $debug) {
&store($cache, $cacheFile) ||
die "E $0: produced/updated cache but cannot store to cachefile $cacheFile\n";
}
# remove file if it exists but is empty (can happen in debug mode with
# no pre-existing cachefile)
(-z $cacheFile) && unlink($cacheFile);
# ok, release lock
flock(CACHELOCK, LOCK_UN);
close(CACHELOCK);
}