-
Notifications
You must be signed in to change notification settings - Fork 0
/
crispr_spurious2_initial.pl
90 lines (78 loc) · 6.06 KB
/
crispr_spurious2_initial.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/perl
use strict;
## crispr_spurious2_initial scanner
## Created by Pablo MIER on November 2019
## Copyright © 2019 Pablo MIER. All rights reserved.
my $minimumTR = 7; my $maximumTR = 20; #Minimum length of TR #Maximum length of TR
my $minimumSP = 7; my $maximumSP = 20; #Minimum length of spacer #Maximum length of spacer
#my $minimumUN = 2; my $maximumUN = 4; #Minimum number of repeats #Maximum number of repeats
my $file = $ARGV[0]; #First argument -> Input file
my $output = $ARGV[1]; #Second argument -> Output file
open (OUT,">>$output"); print OUT "ID\tNrRepeatedUnits\tRepeatedUnit\tUnitLength\tNrDiffAA\n"; #Print headline in output file
#my $mul = Bio::SeqIO->new( -file => "<$file" , '-format' => 'fasta'); #Input into multifasta file
#my $ind = 0;
open (IN, "$file");
my %hash;
my $id;
my $seq;
while (my $line=<IN>){
chomp $line;
if ($line=~/^>(.+)/){
$id=$line;
$seq="";
} else {
$seq.=$line;
$hash{$id}=$seq;
}
}
foreach my $aa (keys %hash) { #Go over the multifasta file, each sequence at a time
my $prot_name = $aa; my $prot_seq = $hash{$aa}; my $length = length($prot_seq); #PROTEIN INFORMATION
#$ind++; print "$ind\n";
my $ya = ''; #To account for considered units in current protein
for (my $in = 0; $in <= $length; $in++) { #Go over the complete length of the sequence
my $extra = 0; my $cc = 0; #Auxiliary variables
while ($cc >= 0) { #Exit variable
my $substr = substr($prot_seq,$in,$minimumTR+$extra); #Get seq from current position (in) till position in + /$minimumTR/ (minimum length of TR) + /$extra/ (additional aa when TR is found)
if ((length($substr) < $minimumTR) || (length($substr) > $maximumTR)) { $in = $length+1; $cc = -1; } #If current substring is shorter or larger than allowed, finish the search
else { #The substring has an allowed length
my $count = () = $prot_seq =~ /$substr/g; #Check how many times the current substring is present in the full sequence
if ($count > 1) { #If it is part of a TR, more than one; extend the TR
$cc++; $extra++; #Number of times a TR has been extended
} else { $cc = -1; } #Not repeated if not part of a TR, or the TR already finished; exit the loop
}
my $exitloop = $in+$minimumTR+$extra+1; #End coordinate of next iteration of the loop
if ($exitloop>$length) { $cc = -1; } #Next iteration would not be possible because the sequence would be finished; end the loop
}
if ($extra > 0) { #There is at least one repetition
my $substr2 = substr($prot_seq,$in,$minimumTR+$extra-1); #TR from current position till the last coordinate minus 1
my $number = () = $prot_seq =~ /$substr2/g; #Check how many times the TR is present in the full sequence
$in = $in + $extra; #Advance the complete repeat and re-start the search in the end of the current repeat
next if ((length($substr2) < $minimumTR) || (length($substr2) > $maximumTR)); #Check that the TR has an allowed length
my $number2 = () = $ya =~ /$substr2/g; #Check if the repeated unit has already been found
if ($number2 == 0) { #If the repeated unit is new, check that it is interspaced correctly
my @check = split(/$substr2/,$prot_seq); #Cut the complete sequence by the repeated unit
my $num_check = $#check; #Number of spacers separated by the repeated unit, plus beginning (first) and end (last) of the sequence
my $incorrect = 0; #Variable to count spacers too short or too long
for (my $aux = 1; $aux < $num_check; $aux++) { #Go over all spacers (not the first and the last fragment, they are not spacers)
my $length_spacer = length($check[$aux]); #Current spacer length
if (($length_spacer < $minimumSP) || ($length_spacer > $maximumSP)) { $incorrect++; } #If spacer is too short or too long, count it
}
my $ll = length($substr2); #Unit length
if ($incorrect == 0) { #If there are no incorrect spacers, take the repeated unit as true result
#Calculate number of different amino acids in repeated unit
my $count = 0; my $diff = 0;
$count = () = $substr2 =~ /A/g; if ($count > 0) { $diff++;} $count = 0; $count = () = $substr2 =~ /C/g; if ($count > 0) { $diff++;} $count = 0;
$count = () = $substr2 =~ /D/g; if ($count > 0) { $diff++;} $count = 0; $count = () = $substr2 =~ /E/g; if ($count > 0) { $diff++;} $count = 0;
$count = () = $substr2 =~ /F/g; if ($count > 0) { $diff++;} $count = 0; $count = () = $substr2 =~ /G/g; if ($count > 0) { $diff++;} $count = 0;
$count = () = $substr2 =~ /H/g; if ($count > 0) { $diff++;} $count = 0; $count = () = $substr2 =~ /I/g; if ($count > 0) { $diff++;} $count = 0;
$count = () = $substr2 =~ /K/g; if ($count > 0) { $diff++;} $count = 0; $count = () = $substr2 =~ /L/g; if ($count > 0) { $diff++;} $count = 0;
$count = () = $substr2 =~ /M/g; if ($count > 0) { $diff++;} $count = 0; $count = () = $substr2 =~ /N/g; if ($count > 0) { $diff++;} $count = 0;
$count = () = $substr2 =~ /P/g; if ($count > 0) { $diff++;} $count = 0; $count = () = $substr2 =~ /Q/g; if ($count > 0) { $diff++;} $count = 0;
$count = () = $substr2 =~ /R/g; if ($count > 0) { $diff++;} $count = 0; $count = () = $substr2 =~ /S/g; if ($count > 0) { $diff++;} $count = 0;
$count = () = $substr2 =~ /T/g; if ($count > 0) { $diff++;} $count = 0; $count = () = $substr2 =~ /V/g; if ($count > 0) { $diff++;} $count = 0;
$count = () = $substr2 =~ /W/g; if ($count > 0) { $diff++;} $count = 0; $count = () = $substr2 =~ /Y/g; if ($count > 0) { $diff++;} $count = 0;
print OUT "$prot_name\t$number\t$substr2\t$ll\t$diff\n"; #AC NrRepeatedUnits RepeatedUnit UnitLength
} }
$ya = $ya."$substr2/"; #Add considered unit to the found ones for the current protein
} } }
exit;