From 240297b1974d05ea285890e51f9c084f34f58aa9 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Tue, 30 Apr 2019 12:52:29 +0000 Subject: [PATCH 1/4] Script to remove samples from a normal panel. Will not work on files passed through . --- perl/bin/pindel_np_remsample.pl | 80 +++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100755 perl/bin/pindel_np_remsample.pl diff --git a/perl/bin/pindel_np_remsample.pl b/perl/bin/pindel_np_remsample.pl new file mode 100755 index 0000000..feb39b6 --- /dev/null +++ b/perl/bin/pindel_np_remsample.pl @@ -0,0 +1,80 @@ +#!/usr/bin/perl + +use strict; +use warnings FATAL=>'all'; +use autodie; + +use IO::Uncompress::Gunzip qw(gunzip $GunzipError); + +if(@ARGV < 2) { + warn "\nUSAGE: $0 normal_panel.gff3[.gz] sampleToRemove_1 sampleToRemove_2 [...]\n\n"; + warn "Output is to stdout, recommended usage:\n"; + warn "\t$0 normal_panel.gff3[.gz] sampleToRemove_1 sampleToRemove_2 [...] | bgzip -c > new_panel.gff3.gz\n"; + exit(1); +} + +my $input = shift @ARGV; +my @samples = @ARGV; + +my $z = IO::Uncompress::Gunzip->new($input, MultiStream => 1) or die "gunzip failed: $GunzipError\n"; +while(my $line = <$z>) { + if($. < 3) { + print $line; + next; + } + if($. == 3) { + print clean_header($line, \@samples); + next; + } + print clean_record($line, \@samples); +} +close $z; + +sub clean_record { + my ($line, $samples) = @_; + chomp $line; + my @columns = split /\t/, $line; + my ($summary, @sample_stat) = split ',', pop @columns; + my $found = 0; + my @cleaned; + + for my $ss(@sample_stat) { + my $matched = 0; + for my $samp (@{$samples}) { + if($ss =~ m/^$samp=/) { + $matched++; + next; + } + } + if($matched > 0){ + $found++; + } + else { + push @cleaned, $ss; + } + } + if($found == 0) { + return $line."\n"; + } + my ($orig_count) = $summary =~ m/(\d+)$/; + my $fixed_count = $orig_count - $found; + return q{} if($fixed_count == 0); # removes line completely + my $corrected = join "\t", @columns, join ',', 'SAMPLE_COUNT='.$fixed_count, @cleaned; + return $corrected."\n"; +} + +sub clean_header { + my ($line, $samples) = @_; + chomp $line; + my @elements = split /\s/, $line; + my @new; + OUTER: for my $e (@elements) { + for my $samp (@{$samples}) { + next OUTER if $e =~ m/$samp/; + } + push @new, $e; + } + my $final = join q{ }, @new; + $final .= "\n"; + return $final; +} From 31089fcd2f26faff470cd8330a0c11d41b50bae0 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Fri, 3 May 2019 14:03:22 +0100 Subject: [PATCH 2/4] Checks for overlaps using In memory IntervalTree instead of calling out to tabix, ~20% speedup --- perl/Makefile.PL | 3 +- perl/lib/Sanger/CGP/Pindel/InputGen.pm | 36 +++++++++++++++------ perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm | 13 ++------ 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/perl/Makefile.PL b/perl/Makefile.PL index 131cab7..ae59474 100755 --- a/perl/Makefile.PL +++ b/perl/Makefile.PL @@ -1,7 +1,7 @@ #!/usr/bin/perl ########## LICENCE ########## -# Copyright (c) 2014-2018 Genome Research Ltd. +# Copyright (c) 2014-2019 Genome Research Ltd. # # Author: CASM/Cancer IT # @@ -46,5 +46,6 @@ WriteMakefile( 'Devel::Cover' => 1.09, 'Pod::Coverage' => 0.23, 'PerlIO::gzip' => 0.20, + 'Set::IntervalTree' => 0.12, } ); diff --git a/perl/lib/Sanger/CGP/Pindel/InputGen.pm b/perl/lib/Sanger/CGP/Pindel/InputGen.pm index bcd28be..d9a0fba 100644 --- a/perl/lib/Sanger/CGP/Pindel/InputGen.pm +++ b/perl/lib/Sanger/CGP/Pindel/InputGen.pm @@ -32,10 +32,10 @@ use Carp qw( croak ); use File::Which qw(which); use Try::Tiny qw(try catch finally); use File::Spec; +use Set::IntervalTree; +use IO::Uncompress::Gunzip qw(gunzip $GunzipError); use Data::Dumper; -use Bio::DB::HTS::Tabix; - use Sanger::CGP::Pindel; # add signal handler for any interrupt so that cleanup of temp is handled correctly. @@ -202,6 +202,22 @@ sub _process_set { } } +sub _tabix_to_interval_tree { + my $bed = shift; + my %tree; + my $z = IO::Uncompress::Gunzip->new($bed, MultiStream => 1) or die "gunzip failed: $GunzipError\n"; + my $value = 1; + while(my $line = <$z>) { + next if ($line =~ m/^#/); + chomp $line; + my ($chr, $s, $e) = split /\t/, $line; + $tree{$chr} = Set::IntervalTree->new() unless(exists $tree{$chr}); + $tree{$chr}->insert(\$value, $s, $e); + } + close $z; + return \%tree; +} + sub _completed_threads { my $self = shift; my @output_objects; @@ -223,6 +239,13 @@ sub reads_to_pindel { warn "Thread Worker $tid: started\n"; my $convert_start = time; + + my $tabix; + if(defined $bed) { + # was tabix, keeping name for consistency + $tabix = _tabix_to_interval_tree($bed); + } + @reads = @{$reads[0]} if(ref $reads[0] eq 'ARRAY'); my $total_reads = scalar @reads; @@ -230,11 +253,6 @@ sub reads_to_pindel { my $to_pindel = Sanger::CGP::Pindel::InputGen::PairToPindel->new($sample_name, $rg_pis); my $total_pairs = $total_reads / 2; - my $tabix; - if(defined $bed) { - $tabix = new Bio::DB::HTS::Tabix(filename => $bed); - } - for(1..$total_pairs) { my $pair = Sanger::CGP::Pindel::InputGen::Pair->new(\shift @reads, \shift @reads, $tabix); next unless($pair->keep_pair); @@ -242,9 +260,9 @@ sub reads_to_pindel { } my $retained = scalar @records; - my $excluded = $total_pairs - $retained; + my $excluded = $total_reads - $retained; my $convert_took = time - $convert_start; - warn "Thread Worker $tid: Excluded $excluded/$total_pairs (${convert_took}s)\n"; + warn "Thread Worker $tid: Excluded $excluded/$total_reads (${convert_took}s)\n"; warn "Thread Worker $tid: Generated $retained records\n"; return \@records if($thread == -1); return @records; diff --git a/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm b/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm index 42881a2..65c8036 100644 --- a/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm +++ b/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm @@ -32,8 +32,6 @@ use List::MoreUtils qw(first_index last_index); use Sanger::CGP::Pindel; -# remove before release -use Carp qw(croak); use Data::Dumper; const my $PROPER_MAPPED => 2; @@ -153,21 +151,16 @@ sub _good_anchor { my $self = shift; return 0 if($self->unmapped); return 0 if($self->{'mapq'} <= $MIN_ANCHOR_MAPQ); + return 0 if(exists $self->{'tabix'} && $self->_interval_hit); return 0 if($self->mapped_seq <= $MIN_ANCHOR_MAPPED); return 0 if((scalar @{$self->_cigar_operations}) > $MAX_CIGAR_OPS_FOR_ANCHOR); return 0 if($self->frac_pbq_poor > $MAX_POOR_PBQ_FRAC); - return 0 if(exists $self->{'tabix'} && $self->_tabix_hit); 1; } -sub _tabix_hit { +sub _interval_hit { my $self = shift; - my $iter = $self->{'tabix'}->query_full($self->{'rname'}, $self->{'pos'}, $self->{'pos'}); - return 0 unless(defined $iter); - while(my $ret = $iter->next){ - return 1; - } - return 0; + return @{$self->{'tabix'}->{$self->{'rname'}}->fetch($self->{'pos'} - 1, $self->{'pos'})}; } sub frac_pbq_poor { From 2f87401f14fe882fa675bd5098261258d7154248 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Fri, 3 May 2019 16:53:06 +0100 Subject: [PATCH 3/4] The essential bit for missing enties --- perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm b/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm index 65c8036..41365cc 100644 --- a/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm +++ b/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm @@ -160,7 +160,8 @@ sub _good_anchor { sub _interval_hit { my $self = shift; - return @{$self->{'tabix'}->{$self->{'rname'}}->fetch($self->{'pos'} - 1, $self->{'pos'})}; + return 0 unless (exists $self->{'tabix'}->{$self->{'rname'}}); + return scalar @{$self->{'tabix'}->{$self->{'rname'}}->fetch($self->{'pos'} - 1, $self->{'pos'})}; } sub frac_pbq_poor { From 4aa53c79a47e6c5240cd093807403a956d5cc29a Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Tue, 7 May 2019 11:44:36 +0100 Subject: [PATCH 4/4] Version updates and details of change --- CHANGES.md | 7 +++++++ perl/lib/Sanger/CGP/Pindel.pm | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 121bece..8347e96 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,12 @@ # CHANGES +## 3.2.0 + +* Tabix search for high depth/excluded regions now performed in memory using IntervalTrees + * Reduces runtime of input step by ~50% + * Improved disk access profile + * Zero impact on results + ## 3.1.2 * 3.0.5 introduced species parsing bug causing single word species names to be invalid. diff --git a/perl/lib/Sanger/CGP/Pindel.pm b/perl/lib/Sanger/CGP/Pindel.pm index c15616f..2cee5f0 100644 --- a/perl/lib/Sanger/CGP/Pindel.pm +++ b/perl/lib/Sanger/CGP/Pindel.pm @@ -26,7 +26,7 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '3.1.2'; +our $VERSION = '3.2.0'; our @EXPORT = qw($VERSION); 1;