-
Notifications
You must be signed in to change notification settings - Fork 18
/
removeGaps.pl
109 lines (100 loc) · 2.84 KB
/
removeGaps.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/perl
######################################################
# Written by Sanaa Ahmed
# Jan. 03, 2013
# Given a fasta file and a gap file
# pulls non-gap sequence regions.
# Gap file must consist of a tab-delimited file
# with the following template
# Fasta header, start coord, end coord
######################################################
use strict;
use FileHandle;
use File::Basename;
my $ref_file= $ARGV[0]; #fasta file
open (IN,$ARGV[1]); #gap file
my %gaps;
my $lastend=0;
my $laststart=0;
my ($start,$end);
my ($contig,$reference);
my ($header,@seq,$ref_sequence);
my %sequence;
my ($gap_start,$gap_end);
my $len_seq=0;
my $fh= FileHandle->new($ref_file)|| die "$!";
if ($fh->open("< $ref_file")){
$/=">";
while (<$fh>){
$_=~ s/\>//g;
unless($_){next;};
($header,@seq)=split /\n/,$_;
$ref_sequence= join "",@seq;
$sequence{$header}=$ref_sequence;
# print "$header\n";
$len_seq= length $ref_sequence;
}
$/="\n";
$fh->close;
}
my ($name,$path,$suffix)=fileparse("$ARGV[1]",qr/\.[^.]*/);
while (<IN>){
chomp;
my ($ref,$gstart,$gend,$temp)=split /\t/,$_,4;
# print "Start: $gstart\nEnd: $gend\n";
$reference= $ref;
$gaps{$gstart}=$gend;
}
open (OUT, ">$path/temp/$reference.fna");
my $first=1;
foreach my $gap(sort {$a<=>$b} keys %gaps){
# print "$gap\t$gaps{$gap}\n";
my $length= $gaps{$gap}-$gap;
$gap_start= $gap;
$gap_end= $gaps{$gap};
if ($length >50){
if ($gap_start>$lastend && $gap_end>$lastend){
# print "$gap\t$gaps{$gap}\n";
$lastend=$gap_end;
if ($first){
if ($gap_start==1){
# $start=$gaps{$gap};
if ($gap_end>100){$start=$gap_end-99;}
elsif($gap_end<100){$start=1;}
}
else{
$start=1;
# $end=$gap;
$end=$gap_start+99;
$contig=$reference.'_'.$start.'_'.$end;
# print "$start\t$end\n";
my $output= substr($sequence{$reference},$start-1,$end-$start+1);
my $length= length $output;
if ($length>=250){print OUT ">$contig\n$output\n";}
}
$first=0;
}
else{
# $end=$gap;
$end=$gap+99;
$contig=$reference.'_'.$start.'_'.$end;
my $output= substr($sequence{$reference},$start-1,$end-$start+1);
# print ">$contig\n$output\n";
my $length= length $output;
if ($length>=250){print OUT ">$contig\n$output\n";}
}
# $start=$gaps{$gap};
$start=$gaps{$gap}-99;
}
}
}
if ($len_seq>$gap_end){
$start= $gap_end;
$end= $len_seq;
$contig=$reference.'_'.$start.'_'.$end;
my $output= substr($sequence{$reference},$start-1,$end-$start+1);
print OUT ">$contig\n$output\n";
}
close IN;
close OUt;
exit;