-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzebra_GetFqList.pl
executable file
·206 lines (173 loc) · 6.33 KB
/
zebra_GetFqList.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/perl
=head1 Name
zebra_GetFqList.pl -- Get fastq list of zebra for analysis pipeline.
=head1 Description
This script aims at finding relationship of these three files and
printing input file for WES or RNAseq pipeline.
sample - library - index (pool.info)
|| ||
library(+/-\d) - chip-lane (seq.info)
|| ||
chip-lane - index (fqfile.list)
=head1 Version
Author: Song Bin, [email protected]
Version: 1.0 Date: 2016-9-20
Version: 1.1 Date: 2017-2-7
Version: 1.2 Date: 2017-2-20
Version: 1.3 Date: 2017-3-5
changing key of hash %lib2lane from $rawlib-$library to $rawlib-$id.
Version: 1.4 Date: 2017-5-4
Make sure that fqfile.list contains all samples/librarys' chip in pool.info and seq.info file.
Version: 1.5 Date: 2017-8-8
Fix this line: if($library =~ /(.*-N)[+-]\d/){
=head1 Usage
perl $0 <pool.info with header> <seq.info with header> <fqfile.list> <SeqType:new/old> <output>
Result examples:
for Zebra RNA-seq SE50+10 pipeline input (new)
BC008-A1 RHUMfddNAAAARAAZebra-N CL100006572_L01_6 /lfs1/zebra03/F14ZF1QSSY1656_Temp/CL100006572_L01/CL100006572_L01_6.fq.gz 50 200 1580787850
for Zebra WES PE50+10 pipeline input (new)
BC008-N RHUMbmaXDAAZRAAZebra-N CL100006996_L01_17 /szhwfs1/zebra_bgiseq500/zebra01/F14ZF1QSSY1656_Temp/CL100006996_L01/clean_data/CL100006996_L01_17_1.fq.gz.clean.gz,/szhwfs1/zebra_bgiseq500/zebra01/F14ZF1QSSY1656_Temp/CL100006996_L01/clean_data/CL100006996_L01_17_2.fq.gz.clean.gz 50;50 150 4890829790
for Zebra WES PE50 pipeline input (new)
BC044-A RHUMbmaXJALARAAZebra-N CL100010118_L02 /szhwfs1/zebra_bgiseq500/zebra01/F14ZF1QSSY1656_Temp/CL100010118_L02/CL100010118_L02_read_1.fq.gz,/szhwfs1/zebra_bgiseq500/zebra01/F14ZF1QSSY1656_Temp/CL100010118_L02/CL100010118_L02_read_2.fq.gz 50;50 150 68002894800
for Zebra WES PE50+10 pipeline input (old)
BC018-N 33 zebra /lfs1/zebra03/F14ZF1QSSY1656_Temp/CL300002657_L02/CL300002657_L02_45 library
=cut
use strict;
use warnings;
use File::Basename qw/dirname basename/;
use Data::Dumper;
die "perl $0 <pool.info> <seq.info> <fqfile.list> <SeqType:new/old> <output>\n" if(@ARGV != 5);
my ($poolfile,$seqfile,$fqlist,$seqtype,$output)=@ARGV;
die "SeqType should be 'new' or 'old' !" if($seqtype ne "new" and $seqtype ne "old");
my $report_backup_dir="/ifs3/yjy_data/Zebra_HTML_report_backup";
my (%pool,%libCount,%lib2lane,%lane2lib,%fqfiles,%fqStats);
open IN1,"$seqfile" || die $!;
<IN1>;
while(<IN1>){
chomp;
my ($id,$chip,$lane,$library,$seqType,$startTime,$endTime,$info)=(split /\t/)[0,1,2,3,4,6,8,9];
if(defined $info && $info =~ /作废/){
next;
}
my $rawlib=$library;
# Library sequenced for several times, eg. RHUMbmaXFACTRAAZebra-N+3
if($library =~ /(.*-N)[+-]\d/){
$rawlib=$1;
}
if(exists $lib2lane{$rawlib}){
$libCount{$rawlib}++;
}else{
$libCount{$rawlib}=1;
}
$lib2lane{$rawlib}{$libCount{$rawlib}}="$chip\_$lane";
$lane2lib{"$chip\_$lane"}=$library;
}
close IN1;
#print Dumper(\%lib2lane);
open IN2,"$poolfile" || die $!;
<IN2>;
while(<IN2>){
chomp;
my @F = split /\t/;
my ($taskID,$sample,$sampleID,$poolNum,$indexRange,$rawlib)=@F[0,1,2,3,4,5];
#Fix sample name if needed.
#$sample=~s/(.*)-(\d+)-(.*)A/$1-$3/ if $sample=~/(.*)-(\d+)-(.*)/;
if(!exists $lib2lane{$rawlib}){
die "Can't find the chip_lane of library $rawlib !\n";
}
foreach my $id (sort keys %{$lib2lane{$rawlib}}){
my $chipLaneID=$lib2lane{$rawlib}{$id};
#Index could be "41-44" or "41"
my @indexs;
if($indexRange=~/-/){
my ($from,$to)=(split /-/,$indexRange)[0,1];
for my $index ($from..$to){
push @indexs,$index;
}
}
else{push @indexs,$indexRange;}
for my $index (@indexs){
my $indexID="$chipLaneID\_$index";
$pool{$indexID}=$sample;
}
my $indexID1="$chipLaneID\_read";
$pool{$indexID1}=$sample;
}
}
close IN2;
open IN3,"$fqlist" || die $!;
while(<IN3>){
chomp;
my ($id,$file)=(split /\t/)[0,1];
# id could be "CL100004346_L01_41_1","CL100004346_L01_41_2"(PE with Index)
# or "CL100011896_L01_read_1","CL100011896_L01_read_2"(PE without Index)
# or "CL100006572_L01_10"(SE with Index)
my @ids=split /_/,$id;
my ($chip,$lane,$index)=@ids[0,1,2];
my $fqid=$#ids==3?$ids[3]:1;
my $indexID = "$chip\_$lane\_$index";
my $sample;
if(exists $pool{$indexID}){
$sample=$pool{$indexID};
$fqfiles{$sample}{$indexID}{$fqid}=$file;
}
else{
warn "Can't find sample name of the Chip_lane_index $indexID !\n";
next;
}
next if ($seqtype eq "old");
my $dir = dirname($file);
my $name = basename($file);
$name =~ s/(.*).fq.gz$/$1/;
#get fqStat file
my $fqStatFile = "$dir/$name.fq.fqStat.txt";
if(!-e $fqStatFile){
$fqStatFile = "$dir/$chip\_$lane\_Report/FqReport/$name.fq.fqStat.txt";
if(!-e $fqStatFile){
$fqStatFile = "$report_backup_dir/$chip\_$lane/FqReport/$name.fq.fqStat.txt";
}
}
if(-e $fqStatFile){
my $info = `cat $fqStatFile | head -13`;
map { my ($key,$value) = (split /\t/,$_)[0,1];
$key =~ s/^#(.*)/$1/;
$fqStats{$sample}{$indexID}{$fqid}{$key} = $value
} split (/\n/,$info);
}
else{
warn "Can't find fqStat file of $name !\n";
}
}
close IN3;
open OUT,">$output" || die $!;
foreach my $sample (sort keys %fqfiles){
foreach my $indexID (sort keys %{$fqfiles{$sample}}){
my ($chip,$lane,$index)=(split /_/,$indexID)[0,1,2];
my $library = $lane2lib{"$chip\_$lane"};
my $fqfile = $fqfiles{$sample}{$indexID}{1};
my $lanePath = $fqfile;
$lanePath=~s/(.*).fq.gz$/$1/;
$lanePath=~s/(.*)_read_1$/$1/;
if($seqtype eq "old"){
print OUT "$sample\t33\tzebra\t$lanePath\t$library\n";
}
elsif($seqtype eq "new"){
my $fqPrefix=(split /\//,$lanePath)[-1];
if(exists $fqfiles{$sample}{$indexID}{2}){
$fqfile.=",$fqfiles{$sample}{$indexID}{2}";
}
my $insertSize = "150";
my $readLength = "NA";
my $BaseNum = "NA";
if(exists $fqStats{$sample}{$indexID}{1}{"BaseNum"}){
$BaseNum=$fqStats{$sample}{$indexID}{1}{"BaseNum"};
$readLength = $fqStats{$sample}{$indexID}{1}{"row_readLen"};
if(exists $fqStats{$sample}{$indexID}{2}{"BaseNum"}){
$BaseNum+=$fqStats{$sample}{$indexID}{2}{"BaseNum"};
$readLength.=";$fqStats{$sample}{$indexID}{2}{row_readLen}";
}
}
print OUT "$sample\t$library\t$fqPrefix\t$fqfile\t$readLength\t$insertSize\t$BaseNum\n";
}
}
}