forked from qizongtai/LINE1_gemomic_alignment_and_clustering
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster_merge_uniq_bar_bedV2.pl
198 lines (190 loc) · 10 KB
/
cluster_merge_uniq_bar_bedV2.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/usr/bin/perl
use strict;
use warnings;
use List::Util qw(min max);
use Data::Dumper;
$| = 1;
if (@ARGV != 3) { die "input files: 'bed.uniqID.uniqPOS.stat' or 'bed.uniqID.uniqPOS.stat.merge', length(bp) to cluster, minimal read number to call an unique barcode (insertion)"; }
my $cluster_bed = $ARGV[0] . "." . $ARGV[1] . "cluster";
my $cluster_overview = $ARGV[0] . "." . $ARGV[1] . "cluster.overview";
my $raw_gnashy = $ARGV[0] . "." . $ARGV[1] . "cluster.rawgnashy";
my $gnashy = $ARGV[0] . "." . $ARGV[1] . "cluster.gnashy";
my $more_1barcode = $ARGV[0] . "." . $ARGV[1] . "cluster.more1bar";
my $more_1barcode_more_reads = $ARGV[0] . "." . $ARGV[1] . "cluster.more1bar" . "." . "$ARGV[2]reads";
unlink ($cluster_bed) if (-e $cluster_bed);
unlink ($cluster_overview) if (-e $cluster_overview);
unlink ($raw_gnashy) if (-e $raw_gnashy);
unlink ($gnashy) if (-e $gnashy);
unlink ($more_1barcode) if (-e $more_1barcode);
unlink ($more_1barcode_more_reads) if (-e $more_1barcode_more_reads);
my $length = $ARGV[1];
my $read_cutoff = $ARGV[2];
my $total_read_counter = 0;
my $cluster_counter = 0;
my $more_1barcode_counter = 0;
my $more_1barcode_more_reads_counter = 0;
open (IN, "<", "$ARGV[0]") or die "input bed.uniquePOS.stat.merge file error\n";
open (OVERVIEW, ">", "$cluster_overview") or die "output cluster.overview file error\n";
my ($chr, $start, $end, $strand, $barcode, $count, @dump) = ();
my (@array_start, @array_end, @array_strand, @array_barcode, @array_count) = ();
my $last_chr; # undef this variable
while (<IN>) {
chomp;
$total_read_counter ++;
($chr, $start, $end, $strand, $barcode, $count, @dump) = split(/\s+/,$_);
if (!$last_chr) { $last_chr = $chr; } # this argument is used only for the first time
elsif ($chr ne $last_chr) { # cluster chr by chr; one chr at each time
&cluster_bed_unique ($length);
(@array_start, @array_end, @array_strand, @array_barcode, @array_count) = ();
$last_chr = $chr;
}
push(@array_start,$start);
push(@array_end, $end);
push(@array_strand, $strand);
push(@array_barcode, $barcode);
push(@array_count, $count);
}
&cluster_bed_unique ($length); #run "&cluster_by_id" again for the last chr
close IN;
my $percent_morebar = sprintf ("%.2f", 100*($more_1barcode_counter/$cluster_counter));
my $percent_onebar = 100 - $percent_morebar;
my $onebar_counter = $cluster_counter - $more_1barcode_counter;
my $percent_morebar_cutoff = sprintf ("%.2f", 100*($more_1barcode_more_reads_counter/$cluster_counter));
my $percent_onebar_cutoff = 100 - $percent_morebar_cutoff;
my $onebar_cutoff_counter = $cluster_counter - $more_1barcode_more_reads_counter;
print OVERVIEW "The total number of reads in the initial merged unique bedfile is $total_read_counter.\n";
print OVERVIEW "Clustered by $length bp at both ends, $cluster_counter clusters were identified.\n";
print OVERVIEW "Out of these clusters, $percent_onebar % ($onebar_counter) has one barcode and $percent_morebar % ($more_1barcode_counter) has more than one barcode.\n";
print OVERVIEW "If requiring at least $read_cutoff reads to call an unique barcode, ";
print OVERVIEW "$percent_onebar_cutoff % ($onebar_cutoff_counter) has one barcode and $percent_morebar_cutoff % ($more_1barcode_more_reads_counter) has more than one barcode.\n";
print OVERVIEW "These clusters were shown as below:\n";
system (" wc -l $cluster_bed >> $cluster_overview ");
system (" cut -f 1 $cluster_bed | sort | uniq -c | sort -nrk1 >> $cluster_overview "); # "cut|sort|uniq"; "cut -f1" -> select for the first column; "-c" -> counts for each unique value.
#system ("wc -l $bedfile_name > chr_list.txt"); #this outputs total counts of unique insertions.
#system ( "sort -u -k1,1 $bedfile > chr_list.txt" ); #"sort" outputs all columns although "-k1,1" sorts on first column. Need "cut -f1" to filter for just first column.
system (" cut -c 4- $raw_gnashy | grep '^[0-9XY]' | grep -v '[0-9]_g' > $gnashy ");
system (" sed -i 's/X/23/g' $gnashy ");
system (" sed -i 's/Y/24/g' $gnashy ");
print "$cluster_counter\t$more_1barcode_counter\t$more_1barcode_more_reads_counter\n";
<>;
######################################################################################################################################
sub cluster_bed_unique {
my ($len) = @_;
open (OUT, ">>", "$cluster_bed") or die "output cluster file error\n";
open (GNASHY, ">>", "$raw_gnashy") or die "output raw.gnashy file error\n";
open (MORE1BAR, ">>", "$more_1barcode") or die "output more_1barcode file error\n";
open (MORE1BAR_MOREREADS, ">>", "$more_1barcode_more_reads") or die "output more_1barcode_more_reads file error\n";
if (@array_start == 1 ) { #if there is only one read in chr, no comparison is needed.
$cluster_counter ++;
print OUT "$last_chr\t$start\t$end\t$strand\t1\t$count\n";
print GNASHY "$last_chr\t$start\t1\n";
} elsif (@array_start > 1 ) {
my $start_print = 0;
my $end_print = 0;
my $uniq_barcode_counter = 0;
my %barcode_read = ();
my $cluster_read_counter = 0; #count total reads for each cluster
#the first value ($index=0) was assigned
push (my @start_list_temp, $array_start[0]);
push (my @end_list_temp, $array_end[0]);
$barcode_read {$array_barcode[0]} = $array_count[0];
$cluster_read_counter = $array_count[0];
for (my $index=1; $index<@array_start; $index++) { # loop from the second value;
if ((abs($array_start[$index] - $array_start[$index-1]) <= $len) and
(abs($array_end[$index] - $array_end[$index-1]) <= $len) ) {
push (@start_list_temp, $array_start[$index]);
push (@end_list_temp, $array_end[$index]);
$barcode_read{$array_barcode[$index]} += $array_count[$index];
$cluster_read_counter += $array_count[$index];
} else {
$cluster_counter ++;
$start_print = min(@start_list_temp);
$end_print = max(@end_list_temp);
$uniq_barcode_counter = keys %barcode_read;
#output all clusters
print OUT "$last_chr\t$start_print\t$end_print\t$array_strand[$index-1]\t$uniq_barcode_counter\t$cluster_read_counter\n"; #print the previous one "$index-1"
print GNASHY "$last_chr\t$start_print\t$uniq_barcode_counter\n";
#output clusters with more than one barcode
if ($uniq_barcode_counter > 1) {
$more_1barcode_counter ++;
print MORE1BAR "$last_chr\t$start_print\t$end_print\t$array_strand[$index-1]\t$uniq_barcode_counter\t$cluster_read_counter";
foreach my $uniq_barcode (sort {$a cmp $b} keys %barcode_read) {
print MORE1BAR "\t$uniq_barcode\t$barcode_read{$uniq_barcode}"
}
print MORE1BAR "\n";
}
#output clusters with more than one barcode; the barcodes was counted when at least read_cutoff reads exist.
my %barcode_morereads = (); #set up a new hash: {barcode} -> numbe of reads (barcode has more than read_cutoff reads)
foreach my $uniq_barcode (sort {$a cmp $b} keys %barcode_read) {
if ($barcode_read{$uniq_barcode} > $read_cutoff) {
$barcode_morereads{$uniq_barcode} = $barcode_read{$uniq_barcode};
}
}
my $uniq_barcode_morereads_counter = keys %barcode_morereads; #the same as line105-113
if ($uniq_barcode_morereads_counter > 1) {
$more_1barcode_more_reads_counter ++;
print MORE1BAR_MOREREADS "$last_chr\t$start_print\t$end_print\t$array_strand[$index-1]\t$uniq_barcode_morereads_counter\t$cluster_read_counter";
foreach my $uniq_barcode (sort {$a cmp $b} keys %barcode_morereads) {
print MORE1BAR_MOREREADS "\t$uniq_barcode\t$barcode_morereads{$uniq_barcode}"
}
print MORE1BAR_MOREREADS "\n";
}
#initalize the the values for the next comparison as line
$start_print = 0;
$end_print = 0;
$uniq_barcode_counter = 0;
$uniq_barcode_morereads_counter = 0;
$cluster_read_counter = 0;
@start_list_temp = ();
@end_list_temp = ();
%barcode_read = ();
%barcode_morereads = ();
#process the "current index" value for the next comparison
push (@start_list_temp, $array_start[$index]);
push (@end_list_temp, $array_end[$index]);
$barcode_read{$array_barcode[$index]} = $array_count[$index];
$cluster_read_counter = $array_count[$index];
}
if ($index == $#array_start) { # deal with the last element in the loop
$cluster_counter ++; # same as line109-129
$start_print = min(@start_list_temp);
$end_print = max(@end_list_temp);
$uniq_barcode_counter = keys %barcode_read;
#output all clusters
print OUT "$last_chr\t$start_print\t$end_print\t$array_strand[$index]\t$uniq_barcode_counter\t$cluster_read_counter\n"; #need to print current index;
print GNASHY "$last_chr\t$start_print\t$uniq_barcode_counter\n";
#output clusters with more than one barcode
if ($uniq_barcode_counter > 1) {
$more_1barcode_counter ++;
print MORE1BAR "$last_chr\t$start_print\t$end_print\t$array_strand[$index]\t$uniq_barcode_counter\t$cluster_read_counter";
foreach my $uniq_barcode (sort {$a cmp $b} keys %barcode_read) {
print MORE1BAR "\t$uniq_barcode\t$barcode_read{$uniq_barcode}"
}
print MORE1BAR "\n";
}
#output clusters with more than one barcode; the barcodes was counted when at least read_cutoff reads exist.
my %barcode_morereads = (); #set up a new hash: {barcode} -> numbe of reads (barcode has more than read_cutoff reads)
foreach my $uniq_barcode (sort {$a cmp $b} keys %barcode_read) {
if ($barcode_read{$uniq_barcode} > $read_cutoff) {
$barcode_morereads{$uniq_barcode} = $barcode_read{$uniq_barcode};
}
}
my $uniq_barcode_morereads_counter = keys %barcode_morereads; #the same as line105-113
if ($uniq_barcode_morereads_counter > 1) {
$more_1barcode_more_reads_counter ++;
print MORE1BAR_MOREREADS "$last_chr\t$start_print\t$end_print\t$array_strand[$index]\t$uniq_barcode_morereads_counter\t$cluster_read_counter";
foreach my $uniq_barcode (sort {$a cmp $b} keys %barcode_morereads) {
print MORE1BAR_MOREREADS "\t$uniq_barcode\t$barcode_morereads{$uniq_barcode}"
}
print MORE1BAR_MOREREADS "\n";
}
}
}
} else {
die "\@array_start error.\n";
}
close OUT;
close GNASHY;
close MORE1BAR;
close MORE1BAR_MOREREADS;
}