-
Notifications
You must be signed in to change notification settings - Fork 61
/
Copy path6.1_Get_Expression_Kallisto.pl
99 lines (89 loc) · 2.64 KB
/
6.1_Get_Expression_Kallisto.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
use strict;
use warnings;
if (@ARGV != 4) {die "Usage: perl 6.1_Get_Expression_Kallisto.pl /path/outprefix /path/genome.gtf /path/kallisto_output_directory [0/1 - aggregate by gene?].\n";}
my $outprefix = $ARGV[0];
my $gtf_file = $ARGV[1];
my $abundance_string = $ARGV[2]."*.abundance*.tsv";
my $agg_by_gene = $ARGV[3];
my $suffix = "kallisto_trans";
if ($agg_by_gene) {
$suffix = "kallisto_gene";
}
my %Gene2ID2FragCount = ();
my %Gene2ID2TPM = ();
my @IDs = ();
my %transcript2gene = ();
#open (my $ifh, "/lustre/scratch108/compgen/team218/TA/genomebuilding/Mus_musculus.GRCm38.79.gtf") or die $!;
open (my $ifh, "$gtf_file") or die $!;
while (<$ifh>) {
if ($_ =~ /^#/) {next;}
my $geneid = "ERROR";
if ($_ =~ /gene_id "(.+?)"/) {
$geneid=$1;
} else {
die "No gene id!\n";
}
my @record = split(/\t/);
if ($record[2] =~ /gene/i) {
#gene
} elsif ($_ =~ /exon_id "(.+?)"/) {
#exon
} elsif ($_ =~ /transcript_id "(.+?)"/) {
#transcript
$transcript2gene{$1}=$geneid;
}
} close ($ifh);
print STDERR "Done reading Annotations\n";
#foreach my $file (glob("/lustre/scratch108/compgen/team218/TA/RNASeqFilesQuantified/Buettner_Kallisto/*.abundance*.tsv")) {
foreach my $file (glob("$abundance_string")) {
# $file =~ /([ATCG]{5,})/;
my $ID = "ERROR";
# if ($file =~ /([^\/]+_Cell\d\d)/) {
if ($file =~ /([^\/]+)\.abundance/) {
$ID = $1;
} else {die "$file did not match!\n";}
push(@IDs,$ID);
open(my $ifh, $file) or die $!;
while (<$ifh>) {
chomp;
if ($_ =~ /^#/ || $_ =~ /^target_id/) {next;} #skip header & comments
my @record=split(/\t/);
my $gene = $record[0]; $gene =~ s/\s+//g;
if (exists($transcript2gene{$gene}) && $agg_by_gene) {
$gene = $transcript2gene{$gene};
}
if (exists($Gene2ID2FragCount{$gene}->{$ID})) {
$Gene2ID2FragCount{$gene}->{$ID} += $record[3];
$Gene2ID2TPM{$gene}->{$ID} += $record[4];
} else {
$Gene2ID2FragCount{$gene}->{$ID} = $record[3];
$Gene2ID2TPM{$gene}->{$ID} = $record[4];
}
} close ($ifh);
}
open(my $ofh1, ">", "$outprefix\_$suffix\_counts.txt") or die $!;
open(my $ofh2, ">", "$outprefix\_$suffix\_tpm.txt") or die $!;
print $ofh1 "Gene\t".join("\t",@IDs)."\n";
print $ofh2 "Gene\t".join("\t",@IDs)."\n";
foreach my $gene (keys(%Gene2ID2FragCount)) {
print $ofh1 "$gene";
print $ofh2 "$gene";
foreach my $ID (@IDs) {
my $count = "NA";
my $tpm = "NA";
if (exists($Gene2ID2FragCount{$gene}->{$ID})) {
$count = $Gene2ID2FragCount{$gene}->{$ID};
} else {
$count = "NA";
}
if (exists($Gene2ID2TPM{$gene}->{$ID})) {
$tpm = $Gene2ID2TPM{$gene}->{$ID};
} else {
$tpm = "NA";
}
print $ofh1 "\t".$count;
print $ofh2 "\t".$tpm;
}
print $ofh1 "\n";
print $ofh2 "\n";
}