forked from johnlees/which_tree
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalf_db_to_fasta_splice_intergenic.pl
executable file
·65 lines (52 loc) · 1.61 KB
/
alf_db_to_fasta_splice_intergenic.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/perl -w
# Converts ALF DB sequences into a fasta file for the organism
# Shuffles genes into order (by locus) and removes > lines
# Also includes intergenic regions evolved by dawg, and converted with dawg to
# alf
# Usage ./alf_db_to_fasta.pl SE0xx_dna.fa dawg/MSA > SE0xx.fa
use strict;
use warnings;
use Bio::SeqIO;
use Text::Wrap;
$Text::Wrap::columns = 70;
my $input_file = $ARGV[0];
my $intergenic_folder = $ARGV[1];
open(DB, $input_file) || die("Could not open input $input_file\n");
my @whole_seq;
while (my $db_line = <DB>)
{
next if($db_line !~ m/\w/);
chomp $db_line;
$db_line =~ m/^>G(\d+)_SE(\d+), sequence type: type\d+, locus: (\d+)$/;
my $gene_nr = $1;
my $org_nr = $2;
my $gene_pos = $3;
my $sequence = <DB>;
chomp $sequence;
if (-e "$intergenic_folder/MSA_i$gene_nr\_dna.fa")
{
my $int_in = Bio::SeqIO->new(-file=>"$intergenic_folder/MSA_i$gene_nr\_dna.fa", -format=>'Fasta');
while (my $int_org_seq = $int_in->next_seq())
{
if ($int_org_seq->primary_id() eq "SE$org_nr")
{
my $int_region = $int_org_seq->seq();
$int_region =~ s/-//g;
$whole_seq[$gene_pos-1] = $int_region;
last;
}
}
}
$whole_seq[$gene_pos-1] .= $sequence;
# my $blank = <DB>;
}
#my $seq_out = Bio::SeqIO->new(-fh=>\*STDOUT,
# -format=>'Fasta');
#
#$seq_out->write_seq(Bio::Seq->new(-display_id=>($input_file),
# -seq=>join("",@whole_seq)));
print "\>$input_file\n";
print wrap('', '', join("",@whole_seq));
print "\n";
close DB;
exit(0);