-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmpca_rna
180 lines (162 loc) · 6.13 KB
/
mpca_rna
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long 'HelpMessage';
use List::MoreUtils qw(firstidx);
my $usage=<<_EOH_;;
------------------------------------------------------------------------------------------
This script is used to output the PCA plot from multiple reads number matrixs
print command to a Rscript and then run it to output the plot
Usage:
mpca_rna --matrix Blenny_control_read_nb.xls Blue_eyed_control_read_nb.xls Common_control_read_nb.xls Yaldwyn_control_read_nb.xls \
--samples coldata_Blenny_control.txt coldata_Blue_eyed_control.txt coldata_Common_control.txt coldata_Yaldwyn_control.txt \
--column Site_1 Site_1 Site_1 Site_1 \
--title Blenny Blue_eyed Common Yaldwyn \
--prefix total_pca
Example:
1. --matrix: Blenny_read_nb.xls
B61 B62 B63 B64 B65 B66 B67 B68 B69 B71 B72 B73 B74 B75 B76 B77 B78 B79
OG0038649 430 218 222 486 402 612 266 159 278 334 365 190 614 464 346 543 477 490
OG0039547 0 1 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0
2. --samples: coldata_Blenny.txt
Site_1 Site_2 Species
B61 Vn Vent Blenny
B62 Vn Vent Blenny
Kang 2021-8-20
------------------------------------------------------------------------------------------
_EOH_
;
GetOptions(
'matrix:s{1,}', \my @matrix, # the overall reads number matrix
'samples:s{1,}', \ my @sample_file, # the sample information
'column:s{1,}', \my @col, # the column selected from the sample information
'title:s{1,}', \my @title, # the title of each plot
'prefix:s', \my $prefix, # the name of output pdf
'help', \ my $help
);
if ($help || (! @matrix) && (! @sample_file) || (!@col) || (!@title) || (!$prefix) ) {
die $usage; # all of these options are mandatory requirements
}
main: {
&creat_R_header();
my $nb=@matrix;
for (my $i = 0; $i < $nb; $i++) {
my $j=$i+1;
my $p="p".$j;
my $matrix=$matrix[$i];
my $sample_file=$sample_file[$i];
my $title=$title[$i];
my $col=$col[$i];
&creat_R_common($matrix, $sample_file, $col);
&ggplot_pca($sample_file, $col, $title, $p);
}
&plot_arrange();
exit(0);
}
###################################################################################
sub creat_R_header {
open PCA, ">pca.R" or die "can not creat pca.R\n";
print PCA "library(DESeq2)\n";
print PCA "library(ggplot2)\n";
print PCA "library(ggpubr)\n";
close PCA;
}
sub creat_R_common {
my ($matrix, $sample_file, $col)=@_;
open MATRIX, "$matrix" or die "can not open $matrix\n";
chomp(my $header = <MATRIX>);
close MATRIX;
my @a=split /\s+/, $header;
my $ind_nb=@a;
my $least_nb=$ind_nb*10;
open PCA, ">>pca.R" or die "can not creat pca.R\n";
print PCA "cts <- read.table(file=\"$matrix\",header=TRUE,row.names = 1)\n";
print PCA "coldata <- read.table(file=\"$sample_file\")\n";
my $dds="dds <- DESeqDataSetFromMatrix(countData = cts,";
$dds.="colData = coldata,design= ~$col)";
print PCA "$dds\n";
print PCA "keep <- rowSums(counts(dds)) >= $least_nb\n";
print PCA "dds <- dds[keep,]\ndds <- DESeq(dds)\nvsd <- vst(dds, blind=FALSE)\n";
print PCA "pcaData <- plotPCA(vsd, intgroup=c(\"$col\"), returnData=TRUE)\n";
print PCA "percentVar <- round(100 * attr(pcaData, \"percentVar\"))\n";
close PCA;
}
sub ggplot_pca {
my ($sample_file, $col, $title, $i)=@_;
my $ggplot2="ggplot(pcaData, aes(PC1, PC2,color=$col)) + \n ";
$ggplot2.="geom_point(size=6) +\n ";
my $plot_basic=&basic_plot_attr($title, $col);
$ggplot2.=$plot_basic;
$ggplot2.="xlab(paste0(\"PC1: \",percentVar[1],\"% variance\")) +\n ylab(paste0(\"PC2: \",percentVar[2],\"% variance\")) +\n ";
my $cols=&select_colour($sample_file, $col);
$ggplot2.="scale_color_manual(values=c($cols))";
open PCA, ">>pca.R" or die "can not creat pca.R\n";
print PCA "$i <- $ggplot2\n";
close PCA;
}
sub plot_arrange {
open PCA, ">>pca.R" or die "can not creat PCA.R\n";
my $output=$prefix.".pdf";
print PCA "pdf(file = \"$output\", width = 11.69, height = 8.27, onefile=FALSE)\n";
my $plot_nb=@matrix;
my ($nrow, $ncol);
if ($plot_nb==1) {
$nrow=1; $ncol=1;
} elsif ($plot_nb==2) {
$nrow=1; $ncol=2;
} elsif ($plot_nb==3 || $plot_nb==4) {
$nrow=2; $ncol=2;
} elsif ($plot_nb==5 || $plot_nb==6) {
$nrow=2; $ncol=3;
}
my $plot_names;
for (my $i = 1; $i <= $plot_nb; $i++) {
$plot_names.="p".$i.", ";
}
$plot_names=~s/\, $//;
my $ggarrange="ggarrange($plot_names,nrow=$nrow, ncol=$ncol,common.legend = T,align = \"h\")";
print PCA "$ggarrange\n";
print PCA "dev.off()";
close PCA;
`R --no-save --no-restore --no-site-file --no-init-file -q < pca.R 1>&2`;
`rm pca.R`;
}
sub basic_plot_attr {
my ($title, $col)=@_;
my $ggplot2="theme_bw() + ggtitle(\"$title\") +\n ";
$ggplot2.="geom_text(aes(label=\"\"), hjust=1, vjust=2, show.legend = FALSE, family=\"Times\", colour=\"black\",size=4) +\n ";
$ggplot2.="geom_text(aes(label=name, colour=$col), hjust=1, vjust=2, show.legend = FALSE, family=\"Times\", colour=\"black\",size=4) +\n ";
$ggplot2.="theme(legend.position = \"right\", legend.title = element_blank(),\n ";
$ggplot2.="panel.grid=element_blank(),axis.text.x=element_text(colour = \"black\", family=\"Times\",size=20),\n ";
$ggplot2.="axis.text.y=element_text(family=\"Times\",size=20,colour = \"black\"),\n ";
$ggplot2.="axis.title.x=element_text(family=\"Times\",size = 25,face=\"bold\"),\n ";
$ggplot2.="axis.title.y=element_text(family=\"Times\",size = 25,face=\"bold\"),\n ";
$ggplot2.="legend.text=element_text(face=\"bold\", family=\"Times\", colour=\"black\",size=18),\n ";
$ggplot2.="plot.title=element_text(colour = \"black\", family=\"Times\",size=20, face=\"bold\", hjust = 0.5)) + \n ";
$ggplot2.="xlab(paste0(\"PC1: \",percentVar[1],\"% variance\")) +\n ylab(paste0(\"PC2: \",percentVar[2],\"% variance\")) +\n ";
return($ggplot2);
}
sub select_colour {
my ($sample_file, $col)=@_;
my @colors=qw(DC0000FF F39B7F99 7E6148FF 00A087FF 4DBBD5FF 8491B4FF); # the maximum number of colour is 6
open SAMP, "$sample_file" or die "can not open $sample_file\n";
my $idx;
my @level;
my %hash;
while (<SAMP>) {
chomp;
my @a=split;
if (/^\s+/) {
$idx = firstidx { $_ eq $col } @a;
} else {
$hash{$a[$idx+1]}++;
}
}
my $level=keys %hash;
my $cols;
for (my $i = 0; $i < $level; $i++) {
$cols.="\'#$colors[$i]\',";
}
$cols=~s/\,$//;
return($cols);
}