forked from Sandramses/PAst
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPAst.pl
224 lines (177 loc) · 9.37 KB
/
PAst.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/perl -w
use strict;
#the Pseudomonas aeruginosa serotyper (PAst) v3
if ($ARGV[0] eq "help" or $ARGV[0] eq "-h" or $ARGV[0] eq "man") {
print "\nProgram:\tPAst (Pseudomonas aeruginosa serotyper)\nVersion:\t1.0 (using NCBI blast+ 2.2.29)\n\nUsage:\t\t./PAst.pl <path/to/BLASTbin> <path/to/output/directory> <path/to/input/directory> <path/to/OSAdatabase>\n\nBLAST bin:\tPath to the bin containing blasts executables (ex. /usr/bin/ncbi-blast-2.2.29+/bin/)\nOutput dir:\tDirectory where blast reports, OSA multifasta files and result summary (serotyping.txt) are placed\nInput dir:\tDirectory containing assembled input genomes in multifasta format (and nothing else)\nOSA database:\tPath to OSA database file (downloadable from Github)\n\n";
} else {
#Configuration (user specified):
my $BLASTbin = $ARGV[0];
my $outdir = $ARGV[1];
my $inputdir = $ARGV[2];
my $db = $ARGV[3];
#####################################################
################# Input acquisition #################
#####################################################
my @IDs = [];
#input filenames are retrieved from $inputdir
opendir DIR, $inputdir or die "Cannot open input directory $inputdir: $!\n";
@IDs = readdir DIR;
closedir DIR;
#directory references are removed from list
my @sorted = sort @IDs;
@IDs = @sorted;
splice(@IDs, 0, 2);
#number of files localized is printed for user and pipeline initiated
print "There are ", scalar(@IDs), " files in the input directory, these are now processed by PAST.\n";
#####################################################
################# Serotype pipeline #################
#####################################################
my @output = ();
#each file in the input directory is processed and typed separately
foreach my $file (@IDs) {
#blastn analysis is performed in terminal, output saved in $outdir
system("cd $outdir");
system("$BLASTbin/blastn -query $inputdir/$file -subject $db -out $outdir/BLAST$file");
#information about OSA clusters (sequence and length) is extracted from blastn output
my %data = &extractBLAST_sub($file);
#the isolate is typed if data have been extracted from blast output, otherwise error messages are printed
if (exists $data{O1}) {
#in silico serotype is determined based on blastn data
my @serotype = &serotype_sub(\%data);
#result summary for file is saved in @output and individual file output is generated (sub)
push(@output, &output_sub(\$file, \%data, \@serotype));
} else {
#error messages if data was not extracted from blastn output
print "- PAst encountered problems with BLAST analysis of $file, please check manually.\n";
push(@output, "No data could be extracted from the BLAST output for $file, please check file manually.")
}
}
#####################################################
################# Output generation #################
#####################################################
#combined result summary for all input files is generated as serotyping.txt
open(OUT, '>', "$outdir/serotyping.txt") or die "Output file serotyping.txt could not be written: $!";
#header line is followed by result summaries for individual files (tab-delimited)
print OUT "filename\tserotype\t%coverage of OSA\tOSA fragments\n";
print OUT join("\n", @output);
close OUT;
#user is informed the analysis is finalized an directed to the output files
print "The analysis has finished. Results (serotyping.txt) and BLAST reports are placed in $outdir\n";
#####################################################
#################### Subroutines ####################
#####################################################
#subroutine for extracting data from blastn output
sub extractBLAST_sub {
#open blast output file
open(IN, '<', "$outdir/BLAST$_[0]") or print "ERROR blastn output file BLAST$_[0] could not be opened or was not generated: $!";
#variables
my %extract = ();
my ($Otype, $cov) = '';
#parse blastn output and extract informatin into %extract
while (defined(my $line = <IN>)) {
#extract names of OSA clusters with hits in query
$Otype = $1 if $line =~ m/^Subject=\s+([^\s]+)/;
#extract %identity of blast hit
$cov = $1 if $line =~ m/^\s+Identities\s+=\s+\d+[^\d]\d+[^\d]+(\d+)/;
#extract dna sequence and length if %identity exceeds 94%
if ($line =~ m/^Query\s+(\d+)\s+(\w+)\s+(\d+)/) {
$extract{$Otype}{sequence} .= uc($2) unless $cov < 95;
$extract{$Otype}{length} += length($2) unless $cov < 95;
}
#add ' ' after each type, to separate multiple hits
$extract{$Otype}{sequence} .= ' ' if $line =~ m/^\s+Score/;
}
close IN;
#return extracted data
return %extract;
}
#subroutine for determining in silico serotype
sub serotype_sub {
#variables
my @type = ();
my $besttype = '';
my $tmp = 0;
my $flag = 0;
#OSA ref sizes
my %OSAref = (O1 => 18368, O2 => 23303, O3 => 20210, O4 => 15279, O6 => 15649, O7 => 19617, O9 => 17263, O10 => 17635,
O11 => 13868, O12 => 25864, O13 => 14316, WzyB => 1140);
#detection of WzyB
foreach my $O (sort keys %{$_[0]}) {
if ($O eq "WzyB") {
#calculate coverage of wzyB
${$_[0]}{$O}{coverage} = 100 - (($OSAref{$O} - ${$_[0]}{$O}{length})/$OSAref{$O} * 100);
#flag is on if wzyB was detected
$flag = 1 if ${$_[0]}{$O}{coverage} > 95;
}
}
#assignment of in silico serotype
foreach my $O (sort keys %{$_[0]}) {
#calculate coverage of OSA gene clusters
${$_[0]}{$O}{coverage} = 100 - (($OSAref{$O} - ${$_[0]}{$O}{length})/$OSAref{$O} * 100);
#save serotypes with over 95% coverage in @type and best hit
if ($O ne "WzyB") {
if (${$_[0]}{$O}{coverage} > 95) {
#O2 + wzyB = O2/O16
if ($flag == 1 and $O eq "O2") {
push(@type, $O);
#O2 - wzyB = O5/O18/O20
} elsif ($flag == 0 and $O eq "O2") {
push(@type, "O5");
${$_[0]}{O5}{coverage} = ${$_[0]}{$O}{coverage};
#other types based only on OSA
} else {push(@type, $O);}
#no significant hit, best type is stored
} elsif ($tmp < ${$_[0]}{$O}{coverage}) {
$tmp = ${$_[0]}{$O}{coverage};
$besttype = $O;
}
}
}
#if no significant hit is identified the best is added to @type
if (!defined $type[0]) {
push(@type, $besttype);
}
#return identified serotype of isolates
return (@type);
}
#subroutine for generating output
sub output_sub {
#variable
my @results = ();
#for each assigned serotype of the isolate print message to user about analysis result (one type, indecisive or no type)
foreach my $type (@{$_[2]}) {
#print serotyping result for user
if (${$_[1]}{$type}{coverage} eq "NT") {
print "- No serotype could be detected for isolate ${$_[0]}.\n";
} elsif (scalar(@{$_[2]}) == 1 and ${$_[1]}{$type}{coverage} > 95) {
print "- The serotype of the isolate ${$_[0]} is ${$_[2]}[0].\n";
} elsif (scalar(@{$_[2]}) > 1) {
print "- The serotype of the isolate ${$_[0]} is indecisive.\n";
}
#generate fasta file with sequence of OSA cluster
open(OUT, '>', "$outdir/$type-OSA_${$_[0]}") or print "ERROR output fasta file $outdir/$type-OSA_${$_[0]} could not be generated: $!\n";
my @seqs = ();
#split sequences from different blastn hits
if ($type eq 'O5') {
@seqs = split(" ", ${$_[1]}{O2}{sequence});
} else {
@seqs = split(" ", ${$_[1]}{$type}{sequence});
}
my $i = 1;
#print header and sequence (60 chars per line) for each blastn hot OSA fragment
foreach my $seq (@seqs) {
print OUT "> $type-OSA.$i, size ", length($seq), "\n";
for (my $j = 0; $j < length($seq); $j += 60) {
print OUT substr($seq, $j, 60), "\n";
}
$i++;
}
close OUT;
#generate result summary for each assigned serotype
push(@results, "${$_[0]}\t$type\t${$_[1]}{$type}{coverage}\t".scalar(@seqs)."\n");
}
#return result summary
return @results;
}
}
__END__