From 37efc0dac7fe31bc1973a8d407e8a2886546cb26 Mon Sep 17 00:00:00 2001
From: "Philip R. Kensche"
Date: Wed, 15 May 2019 12:54:23 +0200
Subject: [PATCH 1/4] code cleanup annotate_vcf.pl.
---
.../indelCallingWorkflow/annotate_vcf.pl | 677 +++++++++---------
1 file changed, 339 insertions(+), 338 deletions(-)
diff --git a/resources/analysisTools/indelCallingWorkflow/annotate_vcf.pl b/resources/analysisTools/indelCallingWorkflow/annotate_vcf.pl
index 2613fc9..2ee798b 100755
--- a/resources/analysisTools/indelCallingWorkflow/annotate_vcf.pl
+++ b/resources/analysisTools/indelCallingWorkflow/annotate_vcf.pl
@@ -1,8 +1,8 @@
#!/usr/bin/env perl
#
-# Copyright (c) 2018 German Cancer Research Center (DKFZ).
+# Copyright (c) 2018 German Cancer Research Center (Deutsches Krebsforschungszentrum, DKFZ).
#
-# Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow).
+# Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow/LICENSE).
#
use strict;
@@ -13,194 +13,188 @@
my @standardColnames;
BEGIN {
- use Getopt::Long;
- use Pod::Usage;
-
- if (! @ARGV) {
- pod2usage({-verbose => 1, -exitval => 2, -output => \*STDERR});
- }
-
- %opts = ('tabix_bin' => 'tabix',
- #'bgzip_bin' => 'bgzip',
- 'aFileType' => 'vcf',
- 'bFileType' => 'vcf',
- 'padding' => 0,
- 'maxBorderDistanceSum' => -1, # negative values in one of these disable this filter
- 'minOverlapFraction' => -1, # CAVE: it is one filter --- both conditions are connected by OR
- 'reportLevel' => 2, # 1: exact or position or all; 2: exact or all; 3: all,
- 'aPosOffset' => 0,
- 'aEndOffset' => 0,
- 'maxNrOfMatches' => 0,
- 'aColNameLineStart' => '#CHR',
- 'noMatchString' => '.',
- 'emptyBFields' => 0,
- );
-
-
-
- GetOptions( 'a=s' => \$opts{afile},
- 'b=s' => \$opts{bfile},
- 'columnName=s' => \$opts{columnname},
- 'padding:i' => \$opts{padding},
- 'tabix_bin:s' => \$opts{tabix_bin},
- #'bgzip_bin:s' => \$opts{bgzip_bin},
- 'bReportColumn:i' => \$opts{bReportColumn},
- 'bAdditionalColumns:s' => \$opts{bcolumns},
- 'minOverlapFraction:f' => \$opts{minOverlapFraction},
- 'maxBorderDistanceSum:i' => \$opts{maxBorderDistanceSum},
- 'bFileType:s' => \$opts{bFileType},
- 'aFileType:s' => \$opts{aFileType},
- 'aChromColumn:s' => \$opts{aChromColumn},
- 'aPosColumn:s' => \$opts{aPosColumn},
- 'aEndColumn:s' => \$opts{aEndColumn},
- 'aPosOffset:i' => \$opts{aPosOffset},
- 'aEndOffset:i' => \$opts{aEndOffset},
- 'aColNameLineStart:s' => \$opts{aColNameLineStart},
- 'reportLevel:i' => \$opts{reportLevel},
- 'reportMatchType!' => \$opts{reportMatchType},
- 'reportBFeatCoord!' => \$opts{reportBFeatCoord},
- 'reportOnlyMatches!' => \$opts{reportOnlyMatches},
- 'maxNrOfMatches:i' => \$opts{maxNrOfMatches},
- 'noMatchString:s' => \$opts{noMatchString},
- 'breakPointMode!' => \$opts{breakPointMode},
- 'chromXtr:s' => \$opts{chromXtr},
- 'chromYtr:s' => \$opts{chromYtr},
- 'emptyBFields' => \$opts{emptyBFields},
- 'debug!' => \$opts{debug},
- 'help' => \$opts{help},
- ) || pod2usage({-verbose => 1, -message => "$!", -exitval => 2, -output => \*STDERR});
-
- if ($opts{help}) {
- pod2usage({-verbose => 2, -exitval => 1});
- }
-
- ### Process user input and check validity
- if (! $opts{afile} || ! $opts{bfile} || ! $opts{columnname}) {
- pod2usage({-verbose => 1, -message => "Required parameter missing (a, b and columnname are required)", -exitval => 2, -output => \*STDERR});
- }
-
- $opts{bFileType} = lc($opts{bFileType});
-
- my %valid_bFileTypes = ( 'vcf' => 1,
- 'bed' => 1,
- 'gff3' => 1,
- 'bed_refvar' => 1,
- );
- $valid_bFileTypes{$opts{bFileType}} || die "Unknown b-file type $opts{bFileType}. Only b-files of type ". join(" ", sort keys %valid_bFileTypes). " are supported";
-
- $opts{aFileType} = lc($opts{aFileType});
- my %valid_aFileTypes = ( 'vcf' => 1,
- 'custom' => 1,
- );
- $valid_aFileTypes{$opts{aFileType}} || die "Only a-files of type ". join(" ", sort keys %valid_aFileTypes). " are supported";
-
- if (! defined($opts{bReportColumn})) {
- $opts{bReportColumn} = ($opts{bFileType} eq 'vcf') ? 7 : ($opts{bFileType} eq 'gff3') ? 8 : 3;
- }
-
- ### Now make constants
- require constant;
- for (keys %opts) {
- constant->import(uc($_), $opts{$_});
- }
- if ($opts{aFileType} eq 'vcf') {
- constant->import(CHROMCOL => 'CHROM');
- constant->import(POSCOL => 'POS');
- } elsif ($opts{aFileType} eq 'custom') {
- constant->import(CHROMCOL => $opts{aChromColumn});
- constant->import(POSCOL => $opts{aPosColumn});
- constant->import(ENDCOL => $opts{aEndColumn});
- }
+ use Getopt::Long;
+ use Pod::Usage;
+
+ if (!@ARGV) {
+ pod2usage({ -verbose => 1, -exitval => 2, -output => \*STDERR });
+ }
+
+ %opts = ('tabix_bin' => 'tabix',
+ #'bgzip_bin' => 'bgzip',
+ 'aFileType' => 'vcf',
+ 'bFileType' => 'vcf',
+ 'padding' => 0,
+ 'maxBorderDistanceSum' => -1, # negative values in one of these disable this filter
+ 'minOverlapFraction' => -1, # CAVE: it is one filter --- both conditions are connected by OR
+ 'reportLevel' => 2, # 1: exact or position or all; 2: exact or all; 3: all,
+ 'aPosOffset' => 0,
+ 'aEndOffset' => 0,
+ 'maxNrOfMatches' => 0,
+ 'aColNameLineStart' => '#CHR',
+ 'noMatchString' => '.',
+ 'emptyBFields' => 0,
+ );
+
+ GetOptions('a=s' => \$opts{afile},
+ 'b=s' => \$opts{bfile},
+ 'columnName=s' => \$opts{columnname},
+ 'padding:i' => \$opts{padding},
+ 'tabix_bin:s' => \$opts{tabix_bin},
+ #'bgzip_bin:s' => \$opts{bgzip_bin},
+ 'bReportColumn:i' => \$opts{bReportColumn},
+ 'bAdditionalColumns:s' => \$opts{bcolumns},
+ 'minOverlapFraction:f' => \$opts{minOverlapFraction},
+ 'maxBorderDistanceSum:i' => \$opts{maxBorderDistanceSum},
+ 'bFileType:s' => \$opts{bFileType},
+ 'aFileType:s' => \$opts{aFileType},
+ 'aChromColumn:s' => \$opts{aChromColumn},
+ 'aPosColumn:s' => \$opts{aPosColumn},
+ 'aEndColumn:s' => \$opts{aEndColumn},
+ 'aPosOffset:i' => \$opts{aPosOffset},
+ 'aEndOffset:i' => \$opts{aEndOffset},
+ 'aColNameLineStart:s' => \$opts{aColNameLineStart},
+ 'reportLevel:i' => \$opts{reportLevel},
+ 'reportMatchType!' => \$opts{reportMatchType},
+ 'reportBFeatCoord!' => \$opts{reportBFeatCoord},
+ 'reportOnlyMatches!' => \$opts{reportOnlyMatches},
+ 'maxNrOfMatches:i' => \$opts{maxNrOfMatches},
+ 'noMatchString:s' => \$opts{noMatchString},
+ 'breakPointMode!' => \$opts{breakPointMode},
+ 'chromXtr:s' => \$opts{chromXtr},
+ 'chromYtr:s' => \$opts{chromYtr},
+ 'emptyBFields' => \$opts{emptyBFields},
+ 'debug!' => \$opts{debug},
+ 'help' => \$opts{help},
+ ) || pod2usage({ -verbose => 1, -message => "$!", -exitval => 2, -output => \*STDERR });
+
+ if ($opts{help}) {
+ pod2usage({ -verbose => 2, -exitval => 1 });
+ }
+
+ ### Process user input and check validity
+ if (!$opts{afile} || !$opts{bfile} || !$opts{columnname}) {
+ pod2usage({ -verbose => 1, -message => "Required parameter missing (a, b and columnname are required)", -exitval => 2, -output => \*STDERR });
+ }
+
+ $opts{bFileType} = lc($opts{bFileType});
+
+ my %valid_bFileTypes = ('vcf' => 1,
+ 'bed' => 1,
+ 'gff3' => 1,
+ 'bed_refvar' => 1,
+ );
+ $valid_bFileTypes{$opts{bFileType}} || die "Unknown b-file type $opts{bFileType}. Only b-files of type " . join(" ", sort keys %valid_bFileTypes) . " are supported";
+
+ $opts{aFileType} = lc($opts{aFileType});
+ my %valid_aFileTypes = ('vcf' => 1,
+ 'custom' => 1,
+ );
+ $valid_aFileTypes{$opts{aFileType}} || die "Only a-files of type " . join(" ", sort keys %valid_aFileTypes) . " are supported";
+
+ if (!defined($opts{bReportColumn})) {
+ $opts{bReportColumn} = ($opts{bFileType} eq 'vcf') ? 7 : ($opts{bFileType} eq 'gff3') ? 8 : 3;
+ }
+
+ ### Now make constants
+ require constant;
+ for (keys %opts) {
+ constant->import(uc($_), $opts{$_});
+ }
+ if ($opts{aFileType} eq 'vcf') {
+ constant->import(CHROMCOL => 'CHROM');
+ constant->import(POSCOL => 'POS');
+ } elsif ($opts{aFileType} eq 'custom') {
+ constant->import(CHROMCOL => $opts{aChromColumn});
+ constant->import(POSCOL => $opts{aPosColumn});
+ constant->import(ENDCOL => $opts{aEndColumn});
+ }
}
if (BFILETYPE eq 'vcf') {
- @standardColnames = qw(CHROM POS ID REF ALT QUAL FILTER INFO);
+ @standardColnames = qw(CHROM POS ID REF ALT QUAL FILTER INFO);
} elsif (BFILETYPE eq 'gff3') {
- @standardColnames = qw(seqid source type start end score strand phase attributes);
+ @standardColnames = qw(seqid source type start end score strand phase attributes);
} elsif (BFILETYPE eq 'bed_refvar') {
- @standardColnames = (qw(chrom chromStart chromEnd ), 'ref>var');
+ @standardColnames = (qw(chrom chromStart chromEnd), 'ref>var');
} else {
- @standardColnames = qw(chrom chromStart chromEnd name);
+ @standardColnames = qw(chrom chromStart chromEnd name);
}
-
DEBUG && say "MINOVERLAPFRACTION: ", MINOVERLAPFRACTION(), "MAXBORDERDISTANCESUM: ", MAXBORDERDISTANCESUM();
#my $newcol = $opts{columnname};
#my $pad = $opts{padding};
-
+
open(A, "$opts{afile}") || die "Could not open a-file $opts{afile}\n";
-if (! -e $opts{bfile})
-{
- die "The b-file $opts{bfile} does not exist, exiting!\n";
+if (!-e $opts{bfile}) {
+ die "The b-file $opts{bfile} does not exist, exiting!\n";
}
-if (! -e "$opts{bfile}.tbi")
-{
- die "There is no .tbi index for b-file $opts{bfile}, exiting!\n";
+if (!-e "$opts{bfile}.tbi") {
+ die "There is no .tbi index for b-file $opts{bfile}, exiting!\n";
}
# guess b file chr format ## TODO: this does not work in every case
my ($b_chr_prefix, $b_chr_suffix);
open(GUESS, TABIX_BIN . " -l $opts{bfile} | ");
while () {
- chomp;
- if (/([^\d]*)\d+(.*)/) {
- $b_chr_prefix = $1;
- $b_chr_suffix = $2;
- last;
- }
+ chomp;
+ if (/([^\d]*)\d+(.*)/) {
+ $b_chr_prefix = $1;
+ $b_chr_suffix = $2;
+ last;
+ }
}
close GUESS;
-$b_chr_prefix = '' if (! defined($b_chr_prefix));
-$b_chr_suffix = '' if (! defined($b_chr_suffix));
+$b_chr_prefix = '' if (!defined($b_chr_prefix));
+$b_chr_suffix = '' if (!defined($b_chr_suffix));
# get b-file column names
my @b_colnames;
-if (BFILETYPE ne 'gff3') { # gff3 files have no column names in header
- my $b_header_cmd = TABIX_BIN() . " -h $opts{bfile} $b_chr_prefix" . '1' . $b_chr_suffix . ":0-0 |";
- open(HEAD, $b_header_cmd);
- my @b_header = ;
-
- #### if I have a multi-line header print out all lines but the last
- #for (my $i=0; $i < @b_header-1; $i++) {
- # print $b_header[$i];
- #}
-
- ### the last line should contain the column names
- if (@b_header) {
- my $b_header = $b_header[-1];
- chomp($b_header);
- $b_header =~ s/^\#//;
- @b_colnames = split(/\t/, $b_header);
- }
+if (BFILETYPE ne 'gff3') {
+ # gff3 files have no column names in header
+ my $b_header_cmd = TABIX_BIN() . " -h $opts{bfile} $b_chr_prefix" . '1' . $b_chr_suffix . ":0-0 |";
+ open(HEAD, $b_header_cmd);
+ my @b_header = ;
+
+ #### if I have a multi-line header print out all lines but the last
+ #for (my $i=0; $i < @b_header-1; $i++) {
+ # print $b_header[$i];
+ #}
+
+ ### the last line should contain the column names
+ if (@b_header) {
+ my $b_header = $b_header[-1];
+ chomp($b_header);
+ $b_header =~ s/^\#//;
+ @b_colnames = split(/\t/, $b_header);
+ }
}
-if (! @b_colnames) {
- @b_colnames = @standardColnames;
+if (!@b_colnames) {
+ @b_colnames = @standardColnames;
}
-
DEBUG && say "b_colnames: @b_colnames";
-
my $newcolidx;
my %a_fields;
my @b_fields;
my @a_columns;
my %a_columns;
-my @b_columns = split(',',$opts{bcolumns}) if (defined($opts{bcolumns}));
-my $chr='';
+my @b_columns = split(',', $opts{bcolumns}) if (defined($opts{bcolumns}));
+my $chr = '';
my $chr_raw = '';
my $mh;
my $rs;
my $header;
while ($header = ) {
- last if ($header =~ /^$opts{aColNameLineStart}/i); # that is the line with the column names
- print $header; # print out every preceeding line
- die "Invalid a-file header" if ($header =~ /^[^\#]/);
+ last if ($header =~ /^$opts{aColNameLineStart}/i); # that is the line with the column names
+ print $header; # print out every preceeding line
+ die "Invalid a-file header" if ($header =~ /^[^\#]/);
}
chomp $header;
@@ -209,33 +203,32 @@ BEGIN
@a_columns = split(/\t/, $header);
$newcolidx = 0;
foreach (@a_columns) {
- if ($_ eq COLUMNNAME()) {
- last;
- }
- $newcolidx++;
+ if ($_ eq COLUMNNAME()) {
+ last;
+ }
+ $newcolidx++;
}
if ($newcolidx == @a_columns) {
- say $poundkey_rm ? '#' : '', $header, "\t", COLUMNNAME();
- push @a_columns, COLUMNNAME();
+ say $poundkey_rm ? '#' : '', $header, "\t", COLUMNNAME();
+ push @a_columns, COLUMNNAME();
} else {
- say $poundkey_rm ? '#' : '', $header;
+ say $poundkey_rm ? '#' : '', $header;
}
-%a_columns = map { $_ => 1 } @a_columns;
+%a_columns = map {$_ => 1} @a_columns;
-if (! exists $a_columns{CHROMCOL()}) {
- die "CHROMCOL not found in a-file ";
+if (!exists $a_columns{CHROMCOL()}) {
+ die "CHROMCOL not found in a-file ";
}
-if (! exists $a_columns{POSCOL()}) {
- die "POSCOL not found in a-file ";
+if (!exists $a_columns{POSCOL()}) {
+ die "POSCOL not found in a-file ";
}
-if ($opts{aFileType} eq 'custom' && ! exists $a_columns{ENDCOL()}) {
- die "ENDCOL not found in a-file ";
+if ($opts{aFileType} eq 'custom' && !exists $a_columns{ENDCOL()}) {
+ die "ENDCOL not found in a-file ";
}
-
my @matches;
my $a_line;
my $b_line;
@@ -255,194 +248,202 @@ BEGIN
my $alt;
my %a_alts;
-AFILE_LOOP: while ($a_line=) {
- @matches = ();
- chomp($a_line);
- @a_fields{@a_columns} = split(/\t/, $a_line);
- # DEBUG && say join '; ', map {"$_ => $a_fields{$_}"} @a_columns;
- if ($a_fields{CHROMCOL()} ne $chr_raw) {
- $chr_changed = 1;
- $chr_raw = $a_fields{CHROMCOL()};
- $chr = $chr_raw =~ s/[^\dXY]*([\dXY]+).*/$1/r;
- $chr = $chrXb if (CHROMXTR() && $chr eq $chrXa);
- $chr = $chrYb if (CHROMYTR() && $chr eq $chrYa);
- } else {
- $chr_changed = 0;
- }
- #DEBUG && say "CHR: $chr CHR_RAW: $chr_raw ($a_fields{CHR})";
-
- $a_left = $a_fields{POSCOL()};
- if (AFILETYPE eq 'vcf' ) {
- # if (defined($a_fields{INFO}) && $a_fields{INFO} =~ /END=(\d+)/) {
- # $a_right = $1 - 1;
- # } else {
- $a_right = $a_left + length($a_fields{REF}) - 1;
- # }
- } else {
- $a_right = $a_fields{ENDCOL()};
- }
-
- if (APOSOFFSET()) {
- $a_left += APOSOFFSET();
- }
- if (AENDOFFSET()) {
- $a_right += AENDOFFSET();
- }
-
-
- if ($chr_changed) {
- @b_lines = ();
- $next_b_line = {};
- close $b_fh if (ref($b_fh));
- open $b_fh, TABIX_BIN . " $opts{bfile} ${b_chr_prefix}${chr}${b_chr_suffix} |" or die "opening b file $opts{bfile} with tabix failed";
- warn "Tabix returned no b-features for chromosome ${b_chr_prefix}${chr}${b_chr_suffix}" if (! ref($b_fh));
- # $b_linectr = 0;
- }
- if ((! defined($next_b_line->{left}) || $next_b_line->{left}-PADDING() <= $a_right && ref($b_fh))) {
- # read new b_lines until we have one where the left coordinate is higher than a_right + pad
- while ($b_line=<$b_fh>) {
- if (defined($next_b_line->{left})) {
- push(@b_lines,$next_b_line);
- $next_b_line = {};
- }
- chomp($b_line);
- DEBUG && say $b_line;
- # $b_linectr++;
- @b_fields = split(/\t/, $b_line);
- # ($b_fields[0] eq $b_chr_prefix.$chr.$b_chr_suffix) || die "Chromosomes between tumor and germline file do not match: $b_fields[0] ne ${b_chr_prefix}${chr}${b_chr_suffix}";
- if (BFILETYPE eq 'vcf') { ### b-file is vcf file
- $next_b_line->{left} = $b_fields[1];
- $next_b_line->{ref} = $b_fields[3];
- $next_b_line->{alt} = $b_fields[4];
- $next_b_line->{report} = 'POS='.$next_b_line->{left}.';' if (REPORTBFEATCOORD);
- $next_b_line->{right} = $next_b_line->{left} + length($next_b_line->{ref}) - 1;
- if ($b_fields[7] !~ /END=(\d+)/) {
- $next_b_line->{report} .= 'END='.$next_b_line->{right}.';' if (REPORTBFEATCOORD);
- }
- } elsif (BFILETYPE eq 'gff3') { ### b-file is gff3 file
- $next_b_line->{left} = $b_fields[3];
- $next_b_line->{right} = $b_fields[4]; # here we read END as the last included base
- } elsif (BFILETYPE eq 'bed_refvar') {
- $next_b_line->{left} = $b_fields[1] + 1;
- $next_b_line->{right} = $b_fields[2]; # here we read END as the last included base
- ($next_b_line->{ref},$next_b_line->{alt}) = split(/>/,$b_fields[3]);
- } else { ### b-file is bed file
- $next_b_line->{left} = $b_fields[1] + 1;
- $next_b_line->{right} = $b_fields[2]; # here we read END as the last included base
- }
-
- if ($next_b_line->{right}+PADDING() < $a_left) { # b-feature is left of a-feature; skip...
- $next_b_line = {};
- next;
- }
- if (REPORTBFEATCOORD && (BFILETYPE eq 'gff3' || BFILETYPE eq 'bed' || BFILETYPE eq 'bed_refvar')) {
- $next_b_line->{report} = 'POS='.$next_b_line->{left}.';END='.$next_b_line->{right}.';';
- }
- if (EMPTYBFIELDS()) {
- for (@b_columns) {
- $b_fields[$_] = '.' if (! defined($b_fields[$_]));
- }
- }
- if (BREPORTCOLUMN() >= 0 ) {
- $next_b_line->{report} .= join ';', $b_fields[BREPORTCOLUMN()], (map { $b_colnames[$_] . '=' .$b_fields[$_] } @b_columns);
- } else {
- $next_b_line->{report} .= join ';', (map { $b_colnames[$_] . '=' .$b_fields[$_] } @b_columns);
- }
- last if ($next_b_line->{left}-PADDING() > $a_right);
- }
- if (defined($next_b_line->{left}) && $next_b_line->{left}-PADDING() <= $a_right) { # the while loop terminated because eof at b_fh
- push(@b_lines, $next_b_line);
- $next_b_line = {};
+AFILE_LOOP:
+while ($a_line = ) {
+ @matches = ();
+ chomp($a_line);
+ @a_fields{@a_columns} = split(/\t/, $a_line);
+ # DEBUG && say join '; ', map {"$_ => $a_fields{$_}"} @a_columns;
+ if ($a_fields{CHROMCOL()} ne $chr_raw) {
+ $chr_changed = 1;
+ $chr_raw = $a_fields{CHROMCOL()};
+ $chr = $chr_raw =~ s/[^\dXY]*([\dXY]+).*/$1/r;
+ $chr = $chrXb if (CHROMXTR() && $chr eq $chrXa);
+ $chr = $chrYb if (CHROMYTR() && $chr eq $chrYa);
+ } else {
+ $chr_changed = 0;
}
- }
- # Now compare a_line to all b_lines in b_lines hash
- B_LINE_LOOP: foreach $b_line (@b_lines) {
- if ( $b_line->{right}+PADDING() < $a_left ) {
- $b_line = undef; # delete($b_lines{$b_linenr}); #####
- next;
+ #DEBUG && say "CHR: $chr CHR_RAW: $chr_raw ($a_fields{CHR})";
+
+ $a_left = $a_fields{POSCOL()};
+ if (AFILETYPE eq 'vcf') {
+ # if (defined($a_fields{INFO}) && $a_fields{INFO} =~ /END=(\d+)/) {
+ # $a_right = $1 - 1;
+ # } else {
+ $a_right = $a_left + length($a_fields{REF}) - 1;
+ # }
+ } else {
+ $a_right = $a_fields{ENDCOL()};
}
- if ( $b_line->{left}-PADDING() > $a_right ) {
- last;
+
+ if (APOSOFFSET()) {
+ $a_left += APOSOFFSET();
}
- if ((BFILETYPE eq 'vcf' || BFILETYPE eq 'bed_refvar') && AFILETYPE() eq 'vcf') {
- %a_alts = ();
- map { $a_alts{uc($_)} = 1 } split(',',$a_fields{ALT});
- foreach $alt (split(',', $b_line->{alt})) { # to check for exact match we must go through all alt alleles in b files
- if ( $b_line->{left} == $a_left && $b_line->{right} == $a_right && uc($b_line->{ref}) eq uc($a_fields{REF}) && $a_alts{uc($alt)} ) {
- $mh = {mt => 'exact',
- mr => $b_line->{report},
- ofsd => 3};
- push(@matches, $mh);
- next B_LINE_LOOP ;
- }
- }
+ if (AENDOFFSET()) {
+ $a_right += AENDOFFSET();
}
- next B_LINE_LOOP if (REPORTLEVEL == 4);
- if ( $b_line->{left} == $a_left && $b_line->{right} == $a_right ) {
- $mh = {mt => 'position',
- mr => $b_line->{report},
- ofsd => 2};
- push(@matches, $mh);
- next;
+
+ if ($chr_changed) {
+ @b_lines = ();
+ $next_b_line = {};
+ close $b_fh if (ref($b_fh));
+ open $b_fh, TABIX_BIN . " $opts{bfile} ${b_chr_prefix}${chr}${b_chr_suffix} |" or die "opening b file $opts{bfile} with tabix failed";
+ warn "Tabix returned no b-features for chromosome ${b_chr_prefix}${chr}${b_chr_suffix}" if (!ref($b_fh));
+ # $b_linectr = 0;
}
- if ($b_line->{left} <= $a_right && $b_line->{right} >= $a_left) {
- if (BREAKPOINTMODE()) {
- next if ($b_line->{left}-PADDING() > $a_left && $b_line->{right}+PADDING() < $a_right);
- }
- $ol_left = ($b_line->{left} > $a_left) ? $b_line->{left} : $a_left;
- $ol_right = ($b_line->{right} < $a_right) ? $b_line->{right} : $a_right;
- $ol_length = $ol_right - $ol_left + 1;
- $ol_frac = $ol_length / (($b_line->{right}-$b_line->{left} > $a_right-$a_left) ? $b_line->{right}-$b_line->{left} + 1 : $a_right-$a_left + 1);
- $ol_frac = sprintf("%.2f", $ol_frac);
- } else {
- $ol_frac = 0;
+ if ((!defined($next_b_line->{left}) || $next_b_line->{left} - PADDING() <= $a_right && ref($b_fh))) {
+ # read new b_lines until we have one where the left coordinate is higher than a_right + pad
+ while ($b_line = <$b_fh>) {
+ if (defined($next_b_line->{left})) {
+ push(@b_lines, $next_b_line);
+ $next_b_line = {};
+ }
+ chomp($b_line);
+ DEBUG && say $b_line;
+ # $b_linectr++;
+ @b_fields = split(/\t/, $b_line);
+ # ($b_fields[0] eq $b_chr_prefix.$chr.$b_chr_suffix) || die "Chromosomes between tumor and germline file do not match: $b_fields[0] ne ${b_chr_prefix}${chr}${b_chr_suffix}";
+ if (BFILETYPE eq 'vcf') {
+ ### b-file is vcf file
+ $next_b_line->{left} = $b_fields[1];
+ $next_b_line->{ref} = $b_fields[3];
+ $next_b_line->{alt} = $b_fields[4];
+ $next_b_line->{report} = 'POS=' . $next_b_line->{left} . ';' if (REPORTBFEATCOORD);
+ $next_b_line->{right} = $next_b_line->{left} + length($next_b_line->{ref}) - 1;
+ if ($b_fields[7] !~ /END=(\d+)/) {
+ $next_b_line->{report} .= 'END=' . $next_b_line->{right} . ';' if (REPORTBFEATCOORD);
+ }
+ } elsif (BFILETYPE eq 'gff3') {
+ ### b-file is gff3 file
+ $next_b_line->{left} = $b_fields[3];
+ $next_b_line->{right} = $b_fields[4]; # here we read END as the last included base
+ } elsif (BFILETYPE eq 'bed_refvar') {
+ $next_b_line->{left} = $b_fields[1] + 1;
+ $next_b_line->{right} = $b_fields[2]; # here we read END as the last included base
+ ($next_b_line->{ref}, $next_b_line->{alt}) = split(/>/, $b_fields[3]);
+ } else {
+ ### b-file is bed file
+ $next_b_line->{left} = $b_fields[1] + 1;
+ $next_b_line->{right} = $b_fields[2]; # here we read END as the last included base
+ }
+
+ if ($next_b_line->{right} + PADDING() < $a_left) {
+ # b-feature is left of a-feature; skip...
+ $next_b_line = {};
+ next;
+ }
+ if (REPORTBFEATCOORD && (BFILETYPE eq 'gff3' || BFILETYPE eq 'bed' || BFILETYPE eq 'bed_refvar')) {
+ $next_b_line->{report} = 'POS=' . $next_b_line->{left} . ';END=' . $next_b_line->{right} . ';';
+ }
+ if (EMPTYBFIELDS()) {
+ for (@b_columns) {
+ $b_fields[$_] = '.' if (!defined($b_fields[$_]));
+ }
+ }
+ if (BREPORTCOLUMN() >= 0) {
+ $next_b_line->{report} .= join ';', $b_fields[BREPORTCOLUMN()], (map {$b_colnames[$_] . '=' . $b_fields[$_]} @b_columns);
+ } else {
+ $next_b_line->{report} .= join ';', (map {$b_colnames[$_] . '=' . $b_fields[$_]} @b_columns);
+ }
+ last if ($next_b_line->{left} - PADDING() > $a_right);
+ }
+ if (defined($next_b_line->{left}) && $next_b_line->{left} - PADDING() <= $a_right) {
+ # the while loop terminated because eof at b_fh
+ push(@b_lines, $next_b_line);
+ $next_b_line = {};
+ }
}
- $dist_left = $b_line->{left} - $a_left;
- $dist_right = $b_line->{right} - $a_right;
-
- if (MINOVERLAPFRACTION() < 0 || $ol_frac >= MINOVERLAPFRACTION() || MAXBORDERDISTANCESUM() < 0 || abs($dist_left) + abs($dist_right) <= MAXBORDERDISTANCESUM()) {
- $mh = {mt => "inexact(OF=$ol_frac;DL=$dist_left;DR=$dist_right)",
- mr => $b_line->{report},
- ofsd => $ol_frac};
- push(@matches, $mh);
- next;
+ # Now compare a_line to all b_lines in b_lines hash
+ B_LINE_LOOP:
+ foreach $b_line (@b_lines) {
+ if ($b_line->{right} + PADDING() < $a_left) {
+ $b_line = undef; # delete($b_lines{$b_linenr}); #####
+ next;
+ }
+ if ($b_line->{left} - PADDING() > $a_right) {
+ last;
+ }
+ if ((BFILETYPE eq 'vcf' || BFILETYPE eq 'bed_refvar') && AFILETYPE() eq 'vcf') {
+ %a_alts = ();
+ map {$a_alts{uc($_)} = 1} split(',', $a_fields{ALT});
+ foreach $alt (split(',', $b_line->{alt})) { # to check for exact match we must go through all alt alleles in b files
+ if ($b_line->{left} == $a_left && $b_line->{right} == $a_right && uc($b_line->{ref}) eq uc($a_fields{REF}) && $a_alts{uc($alt)}) {
+ $mh = { mt => 'exact',
+ mr => $b_line->{report},
+ ofsd => 3 };
+ push(@matches, $mh);
+ next B_LINE_LOOP;
+ }
+ }
+ }
+ next B_LINE_LOOP if (REPORTLEVEL == 4);
+ if ($b_line->{left} == $a_left && $b_line->{right} == $a_right) {
+ $mh = { mt => 'position',
+ mr => $b_line->{report},
+ ofsd => 2 };
+ push(@matches, $mh);
+ next;
+ }
+ if ($b_line->{left} <= $a_right && $b_line->{right} >= $a_left) {
+ if (BREAKPOINTMODE()) {
+ next if ($b_line->{left} - PADDING() > $a_left && $b_line->{right} + PADDING() < $a_right);
+ }
+ $ol_left = ($b_line->{left} > $a_left) ? $b_line->{left} : $a_left;
+ $ol_right = ($b_line->{right} < $a_right) ? $b_line->{right} : $a_right;
+ $ol_length = $ol_right - $ol_left + 1;
+ $ol_frac = $ol_length / (($b_line->{right} - $b_line->{left} > $a_right - $a_left) ? $b_line->{right} - $b_line->{left} + 1 : $a_right - $a_left + 1);
+ $ol_frac = sprintf("%.2f", $ol_frac);
+ } else {
+ $ol_frac = 0;
+ }
+ $dist_left = $b_line->{left} - $a_left;
+ $dist_right = $b_line->{right} - $a_right;
+
+ if (MINOVERLAPFRACTION() < 0 || $ol_frac >= MINOVERLAPFRACTION() || MAXBORDERDISTANCESUM() < 0 || abs($dist_left) + abs($dist_right) <= MAXBORDERDISTANCESUM()) {
+ $mh = { mt => "inexact(OF=$ol_frac;DL=$dist_left;DR=$dist_right)",
+ mr => $b_line->{report},
+ ofsd => $ol_frac };
+ push(@matches, $mh);
+ next;
+ }
}
- }
-
- @b_lines = grep { defined } @b_lines; # this should be improved!
-
-
- $a_fields{COLUMNNAME()} = '';
- if (REPORTLEVEL < 3) {
- foreach (@matches) {
- if ($_->{mt} eq 'exact') { # if we have an exact match report only this
- $rs = REPORTMATCHTYPE() ? 'MATCH=' . $_->{mt} . ';' . $_->{mr} : $_->{mr};
- $a_fields{COLUMNNAME()} = ($a_fields{COLUMNNAME()}) ? $a_fields{COLUMNNAME()} . '&' . $rs : $rs;
- next;
- }
- if (REPORTLEVEL == 1 && $_->{mt} eq 'position') { # if we have a position match report only this (and exact matches if we have)
- $rs = REPORTMATCHTYPE() ? 'MATCH=' . $_->{mt} . ';' . $_->{mr} : $_->{mr};
- $a_fields{COLUMNNAME()} = ($a_fields{COLUMNNAME()}) ? $a_fields{COLUMNNAME()} . '&' . $rs : $rs;
- }
+
+ @b_lines = grep {defined} @b_lines; # this should be improved!
+
+
+ $a_fields{COLUMNNAME()} = '';
+ if (REPORTLEVEL < 3) {
+ foreach (@matches) {
+ if ($_->{mt} eq 'exact') {
+ # if we have an exact match report only this
+ $rs = REPORTMATCHTYPE() ? 'MATCH=' . $_->{mt} . ';' . $_->{mr} : $_->{mr};
+ $a_fields{COLUMNNAME()} = ($a_fields{COLUMNNAME()}) ? $a_fields{COLUMNNAME()} . '&' . $rs : $rs;
+ next;
+ }
+ if (REPORTLEVEL == 1 && $_->{mt} eq 'position') {
+ # if we have a position match report only this (and exact matches if we have)
+ $rs = REPORTMATCHTYPE() ? 'MATCH=' . $_->{mt} . ';' . $_->{mr} : $_->{mr};
+ $a_fields{COLUMNNAME()} = ($a_fields{COLUMNNAME()}) ? $a_fields{COLUMNNAME()} . '&' . $rs : $rs;
+ }
+ }
}
- }
- if (MAXNROFMATCHES() && @matches) {
- @matches = (sort {$b->{ofsd} <=> $a->{ofsd}} @matches)[0 .. ((MAXNROFMATCHES()-1 < $#matches) ? MAXNROFMATCHES()-1 : $#matches)];
- }
+ if (MAXNROFMATCHES() && @matches) {
+ @matches = (sort {$b->{ofsd} <=> $a->{ofsd}} @matches)[0 .. ((MAXNROFMATCHES() - 1 < $#matches) ? MAXNROFMATCHES() - 1 : $#matches)];
+ }
- if (! $a_fields{COLUMNNAME()}) { # else report all matches
- if (REPORTMATCHTYPE()) {
- $a_fields{COLUMNNAME()} = join('&', (map {'MATCH=' . $_->{mt} .';' . $_->{mr}} @matches));
- } else {
- $a_fields{COLUMNNAME()} = join('&', (map {$_->{mr}} @matches));
+ if (!$a_fields{COLUMNNAME()}) { # else report all matches
+ if (REPORTMATCHTYPE()) {
+ $a_fields{COLUMNNAME()} = join('&', (map {'MATCH=' . $_->{mt} . ';' . $_->{mr}} @matches));
+ } else {
+ $a_fields{COLUMNNAME()} = join('&', (map {$_->{mr}} @matches));
+ }
+ }
+ if (!@matches) {
+ next if (REPORTONLYMATCHES());
+ $a_fields{COLUMNNAME()} = NOMATCHSTRING();
}
- }
- if (! @matches) {
- next if (REPORTONLYMATCHES());
- $a_fields{COLUMNNAME()} = NOMATCHSTRING();
- }
- say join "\t", @a_fields{@a_columns};
+ say join "\t", @a_fields{@a_columns};
} # AFILE_LOOP
close A;
close $b_fh if (ref($b_fh));
From 8d32ae14118c0ac48789648ee8ffb28af0fab55c Mon Sep 17 00:00:00 2001
From: "Philip R. Kensche"
Date: Wed, 15 May 2019 14:12:29 +0200
Subject: [PATCH 2/4] Added executibility check (REFERENCE_GENOME set to
readable file on submission host). Fixe reported version.
---
IndelCallingWorkflow.iml | 8 ++++++
README.md | 23 +++++++++++++++++
buildinfo.txt | 4 +--
buildversion.txt | 2 +-
.../b080/co/IndelCallingWorkflowPlugin.java | 4 +--
.../IndelCallingWorkflow.java | 25 +++++++++++++++++--
6 files changed, 59 insertions(+), 7 deletions(-)
diff --git a/IndelCallingWorkflow.iml b/IndelCallingWorkflow.iml
index 649637c..4e6cedc 100644
--- a/IndelCallingWorkflow.iml
+++ b/IndelCallingWorkflow.iml
@@ -1,5 +1,10 @@
+
+
+
+
+
@@ -14,5 +19,8 @@
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
index e1d8ae9..b460ca7 100644
--- a/README.md
+++ b/README.md
@@ -52,12 +52,35 @@ There are quite extensive requirements in annotation etc. data required for the
| extractSamplesFromOutputFiles | true | |
| CHROMOSOME_INDICES | empty | Bash-array of chromosome names to which the analysis should be restricted |
+Since version 2.2.0 the workflow uses the [COWorkflowsBasePlugin](https://github.com/DKFZ-ODCF/COWorkflowsBasePlugin) 1.5.0+ with an alternative algorithm for extracting sample names from BAM files.
# Example call
TBD
# Changelist
+
+* Version update to 2.2.0
+
+ * Upgrade from [COWorkflowsBasePlugin](https://github.com/DKFZ-ODCF/COWorkflowsBasePlugin) 1.1.0 to 1.5.0
+ * Executability check for `REFERENCE_GENOME` variable (file accessible from submission host)
+
+* Version update to 2.1.0-1
+
+ * Added gnomAD exomes
+ * Check REF_genome for BAM files
+ * Check BAM file readability
+ * Added local controls
+ * Adding python script for parsing MNPs
+
+* Version update to 2.0.0-2
+
+ * Restricting screenshot generation to 100
+
+* Version update to 2.0.0-1
+
+ * README update
+
* Version update to 2.0.0
- TiNDA workflow was updated
diff --git a/buildinfo.txt b/buildinfo.txt
index 30c4947..9377d2b 100644
--- a/buildinfo.txt
+++ b/buildinfo.txt
@@ -1,2 +1,2 @@
-dependson=COWorkflowsBasePlugin:1.1.0
-RoddyAPIVersion=3.0
+dependson=COWorkflowsBasePlugin:1.5.0
+RoddyAPIVersion=3.5
diff --git a/buildversion.txt b/buildversion.txt
index 71cad19..e90fc70 100644
--- a/buildversion.txt
+++ b/buildversion.txt
@@ -1,2 +1,2 @@
-1.3
+2.2
0
diff --git a/src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java b/src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java
index b50a4d8..0f33cda 100644
--- a/src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java
+++ b/src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java
@@ -12,8 +12,8 @@
*/
public class IndelCallingWorkflowPlugin extends BasePlugin {
- public static final String CURRENT_VERSION_STRING = "1.3.0";
- public static final String CURRENT_VERSION_BUILD_DATE = "Tue Nov 13 12:17:53 CET 2018";
+ public static final String CURRENT_VERSION_STRING = "2.2.0";
+ public static final String CURRENT_VERSION_BUILD_DATE = "Wed May 15 14:12:04 CEST 2019";
@Override
public String getVersionInfo() {
diff --git a/src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java b/src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java
index f443598..8b882b8 100644
--- a/src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java
+++ b/src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java
@@ -8,13 +8,14 @@
import de.dkfz.b080.co.common.WorkflowUsingMergedBams;
import de.dkfz.b080.co.files.*;
-import de.dkfz.roddy.config.ConfigurationValue;
import de.dkfz.roddy.config.RecursiveOverridableMapContainerForConfigurationValues;
import de.dkfz.roddy.core.ExecutionContext;
import de.dkfz.roddy.knowledge.files.FileObject;
import de.dkfz.roddy.knowledge.files.Tuple2;
import de.dkfz.roddy.tools.LoggerWrapper;
+import java.io.File;
+
/**
* Indel calling based on the platypus pipeline.
*/
@@ -25,7 +26,7 @@ public class IndelCallingWorkflow extends WorkflowUsingMergedBams {
private static VCFFileForIndels extractVCF(FileObject runResult) {
return (VCFFileForIndels) ((Tuple2)runResult).value0;
}
-
+
@Override
public boolean execute(ExecutionContext context, BasicBamFile _bamControlMerged, BasicBamFile _bamTumorMerged) {
@@ -81,4 +82,24 @@ private boolean executeWithoutControl(TumorBamFile bamTumorMerged, boolean runTi
run("indelVcfFilterWithoutControl", bamTumorMerged, deepAnnotatedVCFFile);
return true;
}
+
+ private boolean checkFileCValue(ExecutionContext context, String cvalueName) {
+ String filenameString = context.getConfigurationValues().getString(cvalueName);
+ return !context.valueIsEmpty(filenameString) &&
+ context.fileIsAccessible(new File(filenameString), cvalueName);
+ }
+
+ private boolean checkConfiguration(ExecutionContext context) {
+ boolean returnValue;
+ returnValue = checkFileCValue(context, "REFERENCE_GENOME");
+ return returnValue;
+ }
+
+
+ @Override
+ public boolean checkExecutability(ExecutionContext context) {
+ boolean result = super.checkExecutability(context);
+ result &= checkConfiguration(context);
+ return result;
+ }
}
From 3378f57ab28460f18689c41f50fabbd9ba02829a Mon Sep 17 00:00:00 2001
From: "Philip R. Kensche"
Date: Wed, 15 May 2019 14:33:39 +0200
Subject: [PATCH 3/4] Fixed COWFBP dependency to 1.4.1.
---
README.md | 4 ++--
buildinfo.txt | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index b460ca7..56aa764 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ There are quite extensive requirements in annotation etc. data required for the
| extractSamplesFromOutputFiles | true | |
| CHROMOSOME_INDICES | empty | Bash-array of chromosome names to which the analysis should be restricted |
-Since version 2.2.0 the workflow uses the [COWorkflowsBasePlugin](https://github.com/DKFZ-ODCF/COWorkflowsBasePlugin) 1.5.0+ with an alternative algorithm for extracting sample names from BAM files.
+Since version 2.2.0 the workflow uses the [COWorkflowsBasePlugin](https://github.com/DKFZ-ODCF/COWorkflowsBasePlugin) 1.4.1+ with an alternative algorithm for extracting sample names from BAM files.
# Example call
@@ -62,7 +62,7 @@ TBD
* Version update to 2.2.0
- * Upgrade from [COWorkflowsBasePlugin](https://github.com/DKFZ-ODCF/COWorkflowsBasePlugin) 1.1.0 to 1.5.0
+ * Upgrade from [COWorkflowsBasePlugin](https://github.com/DKFZ-ODCF/COWorkflowsBasePlugin) 1.1.0 to 1.4.1
* Executability check for `REFERENCE_GENOME` variable (file accessible from submission host)
* Version update to 2.1.0-1
diff --git a/buildinfo.txt b/buildinfo.txt
index 9377d2b..a2e1726 100644
--- a/buildinfo.txt
+++ b/buildinfo.txt
@@ -1,2 +1,2 @@
-dependson=COWorkflowsBasePlugin:1.5.0
+dependson=COWorkflowsBasePlugin:1.4.1
RoddyAPIVersion=3.5
From 27f69f035502b9a312122adccf67aaa2eefc0c2e Mon Sep 17 00:00:00 2001
From: "Philip R. Kensche"
Date: Tue, 21 May 2019 13:28:18 +0200
Subject: [PATCH 4/4] Updated COWFBP dependency in Idea config.
---
IndelCallingWorkflow.iml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/IndelCallingWorkflow.iml b/IndelCallingWorkflow.iml
index 4e6cedc..5a2075b 100644
--- a/IndelCallingWorkflow.iml
+++ b/IndelCallingWorkflow.iml
@@ -18,9 +18,9 @@
-
+
\ No newline at end of file