-
Notifications
You must be signed in to change notification settings - Fork 117
/
Geno2MP.pm
218 lines (166 loc) · 6.07 KB
/
Geno2MP.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2024] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=head1 CONTACT
Ensembl <http://www.ensembl.org/info/about/contact/index.html>
=cut
=head1 NAME
Geno2MP
=head1 SYNOPSIS
cp Geno2MP.pm ${HOME}/.vep/Plugins
./vep -i variations.vcf --plugin Geno2MP,file=/path/to/Geno2MP/data.vcf.gz
# Return more columns from Geno2MP VCF file
./vep -i variations.vcf --plugin Geno2MP,file=/path/to/Geno2MP/data.vcf.gz,cols=HPO_CT:FXN:nhomalt_male_aff:nhomalt_male_unaff
# Build and return Geno2MP URL based on GRCh37 variant location
./vep -i variations.vcf --plugin Geno2MP,file=/path/to/Geno2MP/data.vcf.gz,url=1
=head1 DESCRIPTION
A VEP plugin that adds information from Geno2MP, a web-accessible database of
rare variant genotypes linked to phenotypic information.
Parameters can be set using a key=value system:
file : VCF file containing Geno2MP data
cols : colon-delimited list of Geno2MP columns to return from INFO fields
(by default it only returns the column HPO_CT)
url : build and return URL to Geno2MP variant page (boolean; 0 by default);
the variant location in Geno2MP website is based on GRCh37 coordinates
Please cite Geno2MP alongside the VEP if you use this resource:
Geno2MP, NHGRI/NHLBI University of Washington-Center for Mendelian Genomics (UW-CMG), Seattle, WA
(URL: http://geno2mp.gs.washington.edu [date (month, yr) accessed]).
=cut
package Geno2MP;
use strict;
use warnings;
use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles);
use Bio::EnsEMBL::Variation::Utils::VEP qw(parse_line);
use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin;
use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
sub prepare_header_info {
my $self = shift;
# Header information based on Geno2MP file (for user-selected columns only)
$self->{header_info} = {};
open IN, "tabix -f -h " . $self->{_files}[0] . " 1:1-1 |";
my @header = <IN>;
while (my $line = shift @header) {
next unless $line =~ /##INFO/;
# Get column name and description for user-selected columns
my ($col, $description) = $line =~ /.*ID=(.*?),.*,Description="(.*)".*/g;
next unless $self->{cols}->{$col};
if ($col =~ "HPO_CT") {
$col = $self->{hpo_ct};
$description = "Number of phenotypic profiles in Geno2MP";
}
my $key = $self->{label} . "_" . $col;
$self->{header_info}->{$key} = $description;
}
close IN;
# Geno2MP URL header information
if ($self->{url}) {
my $key = $self->{label} . "_URL";
$self->{header_info}->{$key} = "Link to Geno2MP variant page"
}
}
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
my $config = $self->{config};
$self->expand_left(0);
$self->expand_right(0);
my $param_hash = $self->params_to_hash();
# Add Geno2MP file
my $file = $param_hash->{file};
die "\nERROR: No Geno2MP file specified\nTry using 'file=path/to/Geno2MP/file.vcf.gz'\n"
unless defined($file);
$self->add_file($file);
# Build URL linking to Geno2MP variant page
$self->{url} = defined($param_hash->{url}) ? $param_hash->{url} : 0;
# Retrieve specific columns from VCF
my @cols = $param_hash->{cols} ? split(/\:/, $param_hash->{cols}) : "HPO_CT";
$self->{cols}->{$_} = 1 for @cols;
# Prepare header information from Geno2MP file header
$self->{label} = "Geno2MP";
$self->{hpo_ct} = "HPO_count";
$self->prepare_header_info();
return $self;
}
sub feature_types {
return ['Feature', 'Intergenic'];
}
sub get_header_info {
my $self = shift;
return $self->{header_info};
}
sub run {
my ($self, $tva) = @_;
my $vf = $tva->variation_feature;
# get allele
my $alt_alleles = $tva->base_variation_feature->alt_alleles;
my $ref_allele = $vf->ref_allele_string;
my @data = @{ $self->get_data($vf->{chr}, $vf->{start} - 2, $vf->{end}) };
return {} unless(@data);
foreach my $var (@data) {
my $matches = get_matched_variant_alleles(
{
ref => $ref_allele,
alts => $alt_alleles,
pos => $vf->{start},
strand => $vf->strand
},
{
ref => $var->{ref},
alts => [$var->{alt}],
pos => $var->{start},
}
);
return $var->{result} if (@$matches);
}
return {};
}
sub parse_data {
my ($self, $line) = @_;
# Parse VCF fields
my ($chrom, $pos, $id, $ref, $alt, $qual, $filter, $info) = split /\t/, $line;
# Parse INFO fields
my (%res, $grch37);
my @attribs = split/\;/, $info;
foreach my $att (@attribs){
my($col, $val) = split/\=/, $att;
# Save GRCh37 variant location if available
$grch37 = $val if $col eq "GRCh37";
if ($self->{cols}->{$col}) {
$col = $self->{hpo_ct} if $col =~ "HPO_CT";
my $key = $self->{label} . "_" . $col;
$res{$key} = $val;
}
}
# Add URL based on GRCh37 variant location (Geno2P webpage only supports GRCh37)
if ($self->{url}) {
# Guess between SNP or indels (other types not supported in Geno2MP)
my $type = (length($alt) == 1 && length($ref) == 1) ? "snp" : "indel";
# Get GRCh37 variant location from INFO if available
my ($c, $p) = $grch37 ? split(/_/, $grch37) : ($chrom, $pos);
my $key = $self->{label} . "_URL";
my $base = "https://geno2mp.gs.washington.edu/Geno2MP";
$res{$key} = "$base/#/variant/$c/$p/$ref%3E$alt/$type";
}
return {
ref => $ref,
alt => $alt,
start => $pos,
result => \%res
}
}
sub get_start {
return $_[1]->{start};
}
sub get_end {
return $_[1]->{end};
}
1;