forked from aleimba/bac-genomics-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcdd2cog.pl
414 lines (285 loc) · 13.1 KB
/
cdd2cog.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
#!/usr/bin/perl
#######
# POD #
#######
=pod
=head1 NAME
C<cdd2cog.pl> - assign COG categories to protein sequences
=head1 SYNOPSIS
C<perl cdd2cog.pl -r rps-blast.out -c cddid.tbl -f fun.txt -w whog>
=head1 DESCRIPTION
The script assigns COG (L<cluster of orthologous
groups|http://www.ncbi.nlm.nih.gov/COG/>) categories to proteins.
For this purpose, the query proteins need to be blasted with
RPS-BLAST+ (L<Reverse Position-Specific BLAST|http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download>)
against NCBI's Conserved Domain Database
(L<CDD|http://www.ncbi.nlm.nih.gov/cdd>). Use
L<C<cds_extractor.pl>|/cds_extractor> beforehand to extract multi-fasta
protein files from GENBANK or EMBL files.
Both tab-delimited RPS-BLAST+ outformats, B<-outfmt 6> and B<-outfmt
7>, can be processed by C<cdd2cog.pl>. By default, RPS-BLAST+ hits
for each query protein are filtered for the best hit (lowest
e-value). Use option B<-a|all_hits> to assign COGs to all BLAST hits
and e.g. do a downstream filtering in a spreadsheet application.
Results are written to tab-delimited files in the F<./results>
folder, overall assignment statistics are printed to C<STDOUT>.
Several files are needed from NCBI's FTP server to run the RPS-BLAST+
and C<cdd2cog.pl>:
=over
=item 1.) L<CDD|ftp://ftp.ncbi.nlm.nih.gov/pub/mmdb/cdd/>
More information about the files in the CDD FTP archive can be found
in the respective F<README> file.
=item 1.1.) F<cddid.tbl.gz>
The file needs to be unpacked:
C<gunzip cddid.tbl.gz>
Contains summary information about the CD models in a tab-delimited
format. The columns are: PSSM-Id, CD accession (e.g. COG#), CD short
name, CD description, and PSSM (position-specific scoring matrices)
length.
=item 1.2.) F<./little_endian/Cog_LE.tar.gz>
Unpack and untar via:
C<tar xvfz Cog_LE.tar.gz>
Preformatted RPS-BLAST+ database of the CDD COG distribution for
Intel CPUs and Unix/Windows architectures.
=item 2.) L<COG|ftp://ftp.ncbi.nlm.nih.gov/pub/COG/COG/>
Read F<readme> for more information about the respective files in
the COG FTP archive.
=item 2.1.) F<fun.txt>
One-letter functional classification used in the COG database.
=item 2.2.) F<whog>
Name, description, and corresponding functional classification of
each COG.
=back
=head1 OPTIONS
=head2 Mandatory options
=over 20
=item B<-r>=I<str>, B<-rps_report>=I<str>
Path to RPS-BLAST+ report/output, outfmt 6 or 7
=item B<-c>=I<str>, B<-cddid>=I<str>
Path to CDD's F<cddid.tbl> file
=item B<-f>=I<str>, B<-fun>=I<str>
Path to COG's F<fun.txt> file
=item B<-w>=I<str>, B<-whog>=I<str>
Path to COG's F<whog> file
=back
=head2 Optional options
=over 20
=item B<-h>, B<-help>
Help (perldoc POD)
=item B<-a>, B<-all_hits>
Don't filter RPS-BLAST+ output for the best hit, rather assign COGs
to all hits
=item B<-v>, B<-version>
Print version number to C<STDERR>
=back
=head1 OUTPUT
=over 20
=item C<STDOUT>
Overall assignment statistics
=item F<./results>
All tab-delimited output files are stored in this result folder
=item F<rps-blast_cog.txt>
COG assignments concatenated to the RPS-BLAST+ results for filtering
=item F<protein-id_cog.txt>
Slimmed down F<rps-blast_cog.txt> only including query id (first
BLAST report column), COGs, and functional categories
=item F<cog_stats.txt>
Assignment counts for each used COG
=item F<func_stats.txt>
Assignment counts for single-letter functional categories
=back
=head1 EXAMPLES
=head2 RPS-BLAST+
=over
=item C<rpsblast -query protein.fasta -db Cog -out rps-blast.out
-evalue 1e-2 -outfmt 6>
=item C<rpsblast -query protein.fasta -db Cog -out rps-blast.out
-evalue 1e-2 -outfmt '7 qseqid sseqid pident length mismatch gapopen
qstart qend sstart send evalue bitscore qcovs'>
=back
=head2 C<cdd2cog.pl>
=over
=item C<perl cdd2cog.pl -r rps-blast.out -c cddid.tbl -f fun.txt
-w whog -a>
=back
=head1 VERSION
0.2 update: 2017-02-16
0.1 2013-08-01
=head1 AUTHOR
Andreas Leimbach aleimba[at]gmx[dot]de
=head1 ACKNOWLEDGEMENTS
I got the idea for using NCBI's CDD PSSMs for COG assignment from JGI's L<IMG/ER annotation
system|http://img.jgi.doe.gov/>, which employes the same technique.
=head1 LICENSE
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 (GPLv3) of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see L<http://www.gnu.org/licenses/>.
=cut
########
# MAIN #
########
use strict;
use warnings;
use autodie;
use Getopt::Long;
use Pod::Usage;
### Get the options with Getopt::Long
my $Rps_Report; # path to the rps-blast report/output
my $CDDid_File; # path to the CDD 'cddid.tbl' file
my $Fun_File; # path to the COG 'fun' file
my $Whog_File; # path to the COG 'whog' file
my $Opt_All_Hits; # give all blast hits for a query, not just the best (lowest evalue)
my $VERSION = 0.1;
my ($Opt_Version, $Opt_Help);
GetOptions ('rps_report=s' => \$Rps_Report,
'cddid=s' => \$CDDid_File,
'fun=s' => \$Fun_File,
'whog=s' => \$Whog_File,
'all_hits' => \$Opt_All_Hits,
'version' => \$Opt_Version,
'help|?' => \$Opt_Help);
### Run perldoc on POD
pod2usage(-verbose => 2) if ($Opt_Help);
die "$0 $VERSION\n" if ($Opt_Version);
if (!$Rps_Report || !$CDDid_File || !$Fun_File || !$Whog_File) {
my $warning = "\n### Fatal error: Option(s) or arguments for '-r', '-c', '-f', or '-w' are missing!\n";
pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
}
### Parse the 'cddid.tbl', 'fun.txt' and 'whog' file contents and store info in hash structures
my (%CDDid, %Fun, %Whog); # global hashes
parse_cdd_cog(); # subroutine
### Create results directory for output files
my $Out_Dir = './results/';
if (-e $Out_Dir) {
print "###Directory '$Out_Dir' already exists! Replace the directory and all its contents [y|n]? ";
my $user_ask = <STDIN>;
if ($user_ask =~ /y/i) {
unlink glob "$Out_Dir*"; # remove all files in results directory
rmdir $Out_Dir; # remove the empty directory
} else {
die "Script abborted!\n";
}
}
mkdir $Out_Dir or die "Can't create directory \"$Out_Dir\": $!\n";
### Parse the rps-blast report/output file and assign COGs
my %Cog_Stats; # store the total number of query protein hits for each COG, written to '$Cogstats_Out' below
my $Blast_Out = 'rps-blast_cog.txt'; # output file for COG assignments appended to RPS-BLAST results
open (my $Blast_Out_Fh, ">", "$Out_Dir"."$Blast_Out");
print $Blast_Out_Fh "query id\tsubject id\t% identity\talignment length\tmismatches\tgap opens\tq. start\tq. end\ts. start\ts. end\tevalue\tbit score\tCOG#\tfunctional categories\t\t\t\t\tCOG protein description\n"; # header for $Blast_Out
my $Locus_Cog = "protein-id_cog.txt"; # slimmed down $Blast_Out only including locus_tags, COGs, and functional categories
open (my $Locus_Cog_Fh, ">", "$Out_Dir"."$Locus_Cog");
print "Parsing RPS-BLAST report ...\n"; # status message
my $Skip = ''; # only keep best blast hit per query (lowest e-value), except option 'all_hits' is given
open (my $Rps_Report_Fh, "<", "$Rps_Report");
while (<$Rps_Report_Fh>) {
if (/^#/) { # skip comment lines in blast report for BLAST+ "outfmt 7"
next;
}
chomp;
my @line = split(/\t/, $_); # split tab-separated RPS-BLAST report
if ($Skip eq $line[0] && !$Opt_All_Hits) {
# only keep best blast hit per query, only if option 'all_hits' is NOT set
# $line[0] is query id, and should be locus_tag or specific ID from multi-fasta protein query file
next;
}
$Skip = $line[0];
my $pssm_id = $1 if $line[1] =~ /^CDD\:(\d+)/; # get PSSM-Id from the subject hit
my $cog = $CDDid{$pssm_id}; # get the COG# according to the PSSM-Id as listed in 'cddid.tbl'
$Cog_Stats{$cog}++; # increment hit-number for specific COG
### Collect functional categories stats
my @functions = split('', $Whog{$cog}->{'function'}); # split the single-letter functional categories to count them and join them as tab-separated below
foreach (@functions) {
$Fun{$_}->{'count'}++; # increment hit-number for specific functional category
}
### Print to result files
my $functions = join("\t", @functions); # join functional categories tab-separated
print $Locus_Cog_Fh "$line[0]\t$cog\t$functions\n"; # locus_tag\tCOG\tfunctional categories
$functions .= "\t" x (5 - @functions); # add additional tabs for COGs with fewer than five functions (which is the maximum number)
print $Blast_Out_Fh "$_\t$cog\t$functions\t$Whog{$cog}->{'desc'}\n"; # $_ = RPS-BLAST line
}
close $Rps_Report_Fh;
close $Blast_Out_Fh;
close $Locus_Cog_Fh;
### Total COG and functional categories stats
print "Writing assignment statistic files in '$Out_Dir' folder ...\n"; # status message
my $Cogstats_Out = 'cog_stats.txt'; # output file for assignment numbers for each COG
open (my $Cog_Stats_Fh, ">", "$Out_Dir"."$Cogstats_Out");
my $prot_stats = 0; # store total number of query proteins, which have a COG assignment
foreach my $cog (sort keys %Cog_Stats) {
print $Cog_Stats_Fh "$cog\t$Whog{$cog}->{'desc'}\t$Cog_Stats{$cog}\n"; # COG protein descriptions stored in %Whog
$prot_stats += $Cog_Stats{$cog}; # sum up total COG assignments
}
close $Cog_Stats_Fh;
my $Funcstats_Out = 'func_stats.txt'; # output file for assignment numbers for each functional category
open (my $Func_Stats_Fh, ">", "$Out_Dir"."$Funcstats_Out");
my $func_cats = 0; # store total number of assigned functional categories
foreach my $func (sort keys %Fun) {
print $Func_Stats_Fh "$func\t$Fun{$func}->{'desc'}\t$Fun{$func}->{'count'}\n";
$func_cats += $Fun{$func}->{'count'}; # sum up total functional category assignments
}
close $Func_Stats_Fh;
### State which files were created and print overall statistics
print "\n############################################################################\n";
print "The following tab-delimited files were created in the '$Out_Dir' directory:\n";
print "- $Blast_Out: COG assignments concatenated to the RPS-BLAST results for filtering\n";
print "- $Locus_Cog: Slimmed down '$Blast_Out' only including query id (first BLAST report column), COGs, and functional categories\n";
print "- $Cogstats_Out: COG assignment counts\n";
print "- $Funcstats_Out: Functional category assignment counts\n";
print "##############################################################################\n";
print "Overall assignment statistics:\n";
print "~ Total query proteins categorized into COGs: $prot_stats\n";
print "~ Total COGs used for the query proteins [of ", scalar keys %CDDid, " overall]: ", scalar keys %Cog_Stats, "\n";
print "~ Total number of assigned functional categories: $func_cats\n";
print "~ Total functional categories used for the query proteins [of ", scalar keys %Fun, " overall]: ", scalar grep ($Fun{$_}->{'count'} > 0, keys %Fun), "\n\n"; # grep for functional categories with a count > 0 to get the ones with assigned query proteins
exit;
###############
# Subroutines #
###############
### Subroutine to parse the 'cddid.tbl', 'fun' and 'whog' file contents and store in hash structures
sub parse_cdd_cog {
### 'cddid.tbl'
open (my $cddid_fh, "<", "$CDDid_File");
print "\nParsing CDDs '$CDDid_File' file ...\n"; # status message
while (<$cddid_fh>) {
chomp;
my @line = split(/\t/, $_); # split line at the tabs
if ($line[1] =~ /^COG\d{4}$/) { # search for COG CD accessions in cddid
$CDDid{$line[0]} = $line[1]; # hash to store info; $line[0] = PSSM-Id
}
}
close $cddid_fh;
### 'fun.txt'
open (my $fun_fh, "<", "$Fun_File");
print "Parsing COGs '$Fun_File' file ...\n"; # status message
while (<$fun_fh>) {
chomp;
$_ =~ s/^\s*|\s+$//g; # get rid of all leading and trailing whitespaces
if (/^\[(\w)\]\s*(.+)$/) {
$Fun{$1} = {'desc' => $2, 'count' => 0}; # anonymous hash in hash
# $1 = single-letter functional category, $2 = description of functional category
# count used to find functional categories not present in the query proteins for final overall assignment statistics
}
}
close $fun_fh;
### 'whog'
open (my $whog_fh, "<", "$Whog_File");
print "Parsing COGs '$Whog_File' file ...\n"; # status message
while (<$whog_fh>) {
chomp;
$_ =~ s/^\s*|\s+$//g; # get rid of all leading and trailing whitespaces
if (/^\[(\w+)\]\s*(COG\d{4})\s+(.+)$/) {
$Whog{$2} = {'function' => $1, 'desc' => $3}; # anonymous hash in hash
# $1 = single-letter functional categories, maximal five per COG (only COG5032 with five)
# $2 = COG#, $3 = COG protein description
}
}
close $whog_fh;
return 1;
}