-
Notifications
You must be signed in to change notification settings - Fork 1
/
combine_data.pl
executable file
·114 lines (104 loc) · 3.67 KB
/
combine_data.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/perl -w
use strict;
use URI::Escape;
use Data::Dumper;
# ./get_intron.pl MSU_r7.all.gff3 > introns.gff
# ./get_intergenic.pl MSU_r7.all.gff3 > intergenic.gff
# cat MSU_r7.all.gff3 introns.gff intergenic.gff > all_plus_intergenic_introns.gff
# perl -pi -e 's/^\s+$//' all_plus_intergenic_introns.gff
# ./combine_data.pl Chr1.mPing.te_insertion_sites.table.txt all_plus_intergenic_introns.gff all.GOSlim_assignment > out.txt
my $insert = shift; #table ouput file from find_mping_insertions.pl
my $gff = shift; #from MSU of all genes, exons, and UTR
my $go = shift; #from MSU of all GOterms for genes
my %inserts;
my %annot;
my %go;
my %tds;
open (INTABLE, "<", $insert) or die "Can't open $insert $!\n";
<INTABLE>;
while (my $line = <INTABLE>){
chomp $line;
# my ($te, $chr, $start, $end, $four, $left_flanking_seq, ) = split /\t/ , $line;
my ($chr, $start, $end, $four, $left_flanking_seq, ) = split /\t/ , $line;
$tds{$chr}{$start}= substr($left_flanking_seq, -3);
push @{$inserts{$chr}} , $start;
}
close INTABLE;
open (GO, "<" , $go) or die "Can't open $go $! \n";
<GO>;
while (my $line = <GO>){
chomp $line;
my ($loc, $goid, $goterm, $gopart)= split /\t/, $line;
$loc =~ s/.\d+$//;
$go{$loc}{$goid}{'part'}=$gopart;
$go{$loc}{$goid}{'term'}=$goterm;
}
close GO;
open (INGFF, "<", $gff) or die "Can't open $gff $!\n";
<INGFF>;
while (my $line = <INGFF>){
chomp $line;
my @GFF = split /\t/ , $line;
my ($chr, $type, $start, $end, $strand, $nine) = ($GFF[0], $GFF[2] , $GFF[3], $GFF[4], $GFF[6], $GFF[8]);
next if !exists $inserts{$chr};
next if $type eq 'CDS' or $type eq 'mRNA';
$nine =~ /ID=(.+?);/;
my $ID = $1;
foreach my $insert (sort {$a <=> $b} @{$inserts{$chr}}){
if ($start <= $insert and $insert <= $end){
#insert is in this feature
$annot{$chr}{"$start..$end"}{'start'}=$start;
$annot{$chr}{"$start..$end"}{'end'}=$end;
$annot{$chr}{"$start..$end"}{'type'}=$type;
$annot{$chr}{"$start..$end"}{'ID'}=$ID;
$annot{$chr}{"$start..$end"}{'insert'}=$insert;
$annot{$chr}{"$start..$end"}{'strand'}=$strand;
if ($type eq 'gene'){
$nine =~ /Note=(.+)$/;
my $desc = $1;
my $str = uri_unescape($desc);
$annot{$chr}{"$start..$end"}{'desc'}=$str;
}elsif ($type eq 'intergenic'){
$nine =~ /downstream_gene=(.+?);/;
$annot{$chr}{"$start..$end"}{'downstream'} = $1;
$nine =~ /upstream_gene=(.+?)$/;
$annot{$chr}{"$start..$end"}{'upstream'} = $1;
}
}
}
}
close INGFF;
print "chr\tinsert_pos\tref TDS\tfeature\tstrand\tf.start\tf.end\tf.ID\tf.Desc\tintergenic.down.gene\tintergenic.up.gene\tgene.go.id\tgene.go.part\tgene.go.term\n";
foreach my $chr (keys %annot){
foreach my $range ( sort { (split /\.\./, $a)[0] <=> (split /\.\./, $b)[0]} keys %{$annot{$chr}}){
my $start = $annot{$chr}{$range}{'start'};
my $end = $annot{$chr}{$range}{'end'};
my $type = $annot{$chr}{$range}{'type'};
my $insert = $annot{$chr}{$range}{'insert'};
my $ID = $annot{$chr}{$range}{'ID'};
my $strand = $annot{$chr}{$range}{'strand'};
if ($strand eq '-'){
($start , $end) = ($end , $start);
}
my $note = 'N/A';
my $downstream = 'N/A';
my $upstream = 'N/A';
my $go = '';
if ($type eq 'gene'){
$note = $annot{$chr}{$range}{'desc'};
if (exists $go{$ID}){
foreach my $go_id (keys %{$go{$ID}}){
my $goterm = $go{$ID}{$go_id}{'term'};
my $part = $go{$ID}{$go_id}{'part'};
$go .= "$go_id\t$part\t$goterm\t";
}
}
}elsif($type eq 'intergenic'){
$downstream = $annot{$chr}{$range}{'downstream'};
$upstream = $annot{$chr}{$range}{'upstream'};
$ID = 'N/A';
}
my $tds = $tds{$chr}{$insert};
print "$chr\t$insert\t$tds\t$type\t$strand\t$start\t$end\t$ID\t$note\t$downstream\t$upstream\t$go\n";
}
}