-
Notifications
You must be signed in to change notification settings - Fork 6
/
xmapisec.pl
executable file
·370 lines (325 loc) · 9.33 KB
/
xmapisec.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
#!/usr/bin/perl -w
# xmapisec.pl (first version: 2016_06_21)
# create a molecule ID list from a BNX file
# parse two 'xmap' files obtained from the MQR alignment of this BNX data against two ref-cmaps
# identify molecules IDs present in either and both xmap files
# <optional> set a minimal confidence required to identify a match
# split the original BNX data in four new BNX subsets:
# 1) molecules aligning to ref-cmap#1
# 2) molecules aligning to ref-cmap#2
# 3) molecules aligning to both ref-cmaps
# 4) molecules that did not align to any ref-cmap
#
# Stephane Plaisance (VIB-NC+BITS) 2016/06/21; v1.0
#
# visit our Git: https://github.com/Nucleomics-VIB
use strict;
use warnings;
use File::Basename;
use Getopt::Std;
use List::Util qw( min max );
# disable buffering to get output during long process (loop)
$|=1;
# handle command parameters
getopts('i:a:b:n:c:zh');
our($opt_i, $opt_a, $opt_b, $opt_n, $opt_c, $opt_z, $opt_h);
our $version="1.0 (2016-06-21)";
my $usage = "Aim: Identify molecules specific to two ref-cmaps, ubiquitous, or not-aligning
# Usage: xmapisec.pl <-i bnx-file> <-a first-xmap-file> <-b 2nd-xmap-file>
# script version:".$version."
# Additional optional parameters are:
# -n <prefix for the output files> (default='isec_')>
# -c <minimal confidence score to be considered (default='undef')>
# -z zip the output to save space> (default OFF)>
# <-h to display this help>";
## xmap fields used in this script
# 0 => "XmapEntryID",
## 1 => "QryContigID",
# 2 => "RefContigID",
# 3 => "QryStartPos",
# 4 => "QryEndPos",
# 5 => "RefStartPos",
# 6 => "RefEndPos",
# 7 => "Orientation",
## 8 => "Confidence",
# 9 => "HitEnum",
# 10 => "QryLen",
# 11 => "RefLen",
# 12 => "LabelChannel",
# 13 => "Alignment"
defined($opt_h) && die $usage . "\n";
my $inputfile = $opt_i || die $usage;
my $mapfile1 = $opt_a || die $usage;
my $mapfile2 = $opt_b || die $usage;
my $confcut = $opt_c || undef;
my $zipit = defined($opt_z) || undef;
# name output
my @suffixlist=(".bnx", ".bnx.gz", ".bnx.zip");
my($outbase, $outpath, $suffix) = fileparse($inputfile, @suffixlist);
if (defined ($opt_n)) {
$outbase=$opt_n;
}
# create 4 output file names
my $align1 = $outbase."_align-1.bnx";
my $align2 = $outbase."_align-2.bnx";
my $align3 = $outbase."_align-both.bnx";
my $align0 = $outbase."_align-none.bnx";
# also create 5 list files
my $bnxlst = $outbase."_bnx-idlist.txt";
my $lst1 = $outbase."_idlist-1.txt";
my $lst2 = $outbase."_idlist-2.txt";
my $lst3 = $outbase."_idlist-both.txt";
my $lst0 = $outbase."_idlist-none.txt";
# parse first xmap file
my $MAP1 = OpenMAP($mapfile1) or die $!;
print STDERR "# loading XMAP#1 aligned molecule IDs into hash#1\n";
# variables
my $first = 1;
my $cntln = 0;
my $cntali = 0;
my $keptali = 0;
my @fields = ();
my %in1hash = ();
while ( my $line = <$MAP1> ) {
# count lines
$cntln++;
# check top lines for "# XMAP File Version: 0.2"
if ($first == 1) {
if ($line !~ /^#\ XMAP\ File\ Version:\t0.2$/) {
if ($cntln > 20) {
die "## This does not seem to be a xmap file";
} else {
next;
}
}
$first = 0;
next;
}
# ignore any other comment line
next if ($line =~ /^#/);
# test data consistency
$line =~ /^[0-9]+.*/ or die "## does not seem to be a xmap file";
$cntali++;
@fields = split( /\t/, $line);
# check confidence cutoff from $confcut
if ( (! defined($confcut)) || ($fields[8] >= $confcut) ) {
$in1hash{$fields[1]}++;
$keptali++;
}
}
print STDERR "## finished parsing XMAP#1 data for ".$cntali." alignments (kept: $keptali)\n";
undef $MAP1;
# parse second xmap file
my $MAP2 = OpenMAP($mapfile2) or die $!;
print STDERR "# loading XMAP#2 aligned molecule IDs into hash#2\n";
# reuse $first $cntln $cntali @fields
$first = 1;
$cntln = 0;
$cntali = 0;
$keptali = 0;
@fields = ();
my %in2hash = ();
while ( my $line = <$MAP2> ) {
# count lines
$cntln++;
# check top lines for "# XMAP File Version: 0.2"
if ($first == 1) {
if ($line !~ /^#\ XMAP\ File\ Version:\t0.2$/) {
if ($cntln > 100) {
die "## This does not seem to be a xmap file";
} else {
next;
}
}
$first = 0;
next;
}
# ignore any other comment line
next if ($line =~ /^#/);
# test data consistency
$line =~ /^[0-9]+.*/ or die "## does not seem to be a xmap file";
$cntali++;
@fields = split( /\t/, $line);
# check confidence cutoff from $confcut
if ( (! defined($confcut)) || ($fields[8] >= $confcut) ) {
$in2hash{$fields[1]}++;
$keptali++;
}
}
print STDERR "## finished parsing XMAP#2 data for ".$cntali." alignments (kept: $keptali)\n";
undef $MAP2;
# create file handles for saving data
if ( defined($zipit) ) {
open ALI1, " | gzip -c > " . $align1 . ".gz" || die $!;
open ALI2, " | gzip -c > " . $align2 . ".gz" || die $!;
open ALI3, " | gzip -c > " . $align3 . ".gz" || die $!;
open ALI0, " | gzip -c > " . $align0 . ".gz" || die $!;
} else {
open ALI1, "> $align1" || die $!;
open ALI2, "> $align2" || die $!;
open ALI3, "> $align3" || die $!;
open ALI0, "> $align0" || die $!;
}
# also create ID lists
open LST1, "> $lst1" || die $!;
open LST2, "> $lst2" || die $!;
open LST3, "> $lst3" || die $!;
open LST0, "> $lst0" || die $!;
# count matches
my ( $cnt1, $cnt2, $cntboth, $cntno ) = (0, 0, 0, 0);
# parse BNX data and split based on hashes
print STDERR "# parsing BNX data and saving to files ";
my $BNX = OpenBNX($inputfile) or die $!;
open BNXLST, "> $bnxlst" || die $!;
# variables
$first = 1;
$cntln = 0;
$cntali = 0;
@fields = ();
my $countmol = 0;
while ( my $line = <$BNX> ) {
# count lines
$cntln++;
# check top line for "# BNX File Version: 1.2"
if ($first == 1) {
if ($line !~ /#\ BNX\ File\ Version:/) {
if ($cntln > 10) {
die "$line\n This does not seem to be a bnx file";
} else {
next;
}
}
$first = 0;
}
# print header to all outputs
if ( $line =~ /^#/ ) {
print ALI1 $line;
print ALI2 $line;
print ALI3 $line;
print ALI0 $line;
next;
}
# test data consistency
$line =~ /^0\b/ or die "## does not seem to be a valid bnx format";
$countmol++;
$countmol =~ /00000$/ && print STDERR ".";
# this is the zero-line
my $molid = (split(/\t/, $line))[1];
print BNXLST $molid."\n";
# add $line to @data
my @data = ();
push @data, $line;
# read three more lines in to @data
for ( my $d = 1; $d < 4; $d++ ) {
$line = <$BNX> || die "premature end of file";
push @data, $line;
}
# split according to hashes
if ( ! exists($in1hash{$molid}) && ! exists($in2hash{$molid}) ) {
# no match
print ALI0 map { "$_" } @data;
print LST0 $molid."\n";
$cntno++;
} elsif ( exists($in1hash{$molid}) && exists($in2hash{$molid}) ) {
# matching both
print ALI3 map { "$_" } @data;
print LST3 $molid."\n";
$cntboth++;
} elsif ( exists($in1hash{$molid}) ) {
# matching first
print ALI1 map { "$_" } @data;
print LST1 $molid."\n";
$cnt1++;
} elsif ( exists($in2hash{$molid}) ) {
print ALI2 map { "$_" } @data;
print LST2 $molid."\n";
$cnt2++;
} else {
die "## What's up Dr: $molid?\n";
}
}
print STDERR "\n## finished saving $countmol molecules to BNX files\n\n";
close BNXLST;
close ALI1;
close ALI2;
close ALI3;
close ALI0;
# report results
print STDERR "# Results\tcounts\t%total\n";
print STDERR "# aligning to #map1 only:\t$cnt1\t".
(sprintf("%d%%",100*$cnt1/$countmol))."\n";
print STDERR "# aligning to #map2 only:\t$cnt2\t".
(sprintf("%d%%",100*$cnt2/$countmol))."\n";
print STDERR "# aligning to both maps:\t$cntboth\t".
(sprintf("%d%%",100*$cntboth/$countmol))."\n";
print STDERR "# aligning to none:\t$cntno\t".
(sprintf("%d%%",100*$cntno/$countmol))."\n";
##############
#### Subs ####
sub OpenBNX {
# $Filename passed in, handle to file passed out
my $File = shift; # filename
my $FH; # file handle
if ( $File =~ /\.bnx$/ ) {
open( $FH, "cat $File | " ) or die("$!: can't open file $File");
} elsif ( $File =~ /\.bnx\.zip$/ ) {
open( $FH, "unzip -p $File | " ) or die("$!: can't open file $File");
} elsif ( $File =~ /(\.bnx\.gzip|\.bnx\.gz)$/ ) {
open( $FH, "gzip -dc $File | " ) or die("$!: can't open file $File");
} else {
die("$!: the file $File does not seem to be a 'bnx' file");
}
return $FH;
}
sub OpenMAP {
# $Filename passed in, handle to file passed out
my $File = shift; # filename
my $FH; # file handle
if ( $File =~ /\.xmap$/ ) {
open( $FH, "cat $File | " ) or die("$!: can't open file $File");
} elsif ( $File =~ /\.xmap\.zip$/ ) {
open( $FH, "unzip -p $File | " ) or die("$!: can't open file $File");
} elsif ( $File =~ /(\.xmap\.gzip|\.xmap\.gz)$/ ) {
open( $FH, "gzip -dc $File | " ) or die("$!: can't open file $File");
} else {
die("$!: the file $File does not seem to be a 'xmap' file");
}
return $FH;
}
######### example MQR command
# RefAligner
# -f
# -ref .../reference.cmap
# -i .../Molecules.bnx
# -o .../MoleculeQualityReport
# -nosplit 2
# -BestRef 1
# -biaswt 0
# -Mfast 0
# -FP 1.5
# -sf 0.2
# -sd 0.0
# -A 5
# -outlier 1e-4
# -endoutlier 1e-3
# -S
# -1000
# -sr 0.04
# -resbias 5 64
# -maxmem 256
# -M 3
# -minlen 150
# -maxlen 2000
# -minsites 8
# -minSNR 3
# -MaxIntensity 0.6
# -T 1e-7
# -maxthreads 48
# -hashgen 5 3 2.4 1.5 0.05 5.0 1 1 2
# -hash
# -hashdelta 10
# -hashmaxmem 256
# -insertThreads 8
# -usecolor 1
# -stdout
# -stderr