Commit

Merge branch 'release/v3.2.0'
keiranmraine committed May 7, 2019
2 parents 52e402f + 4aa53c7 commit e750aba
Showing 6 changed files with 121 additions and 21 deletions.
CHANGES.md: 7 additions & 0 deletions
@@ -1,5 +1,12 @@
# CHANGES

## 3.2.0

* Tabix search for high-depth/excluded regions is now performed in memory using IntervalTrees (see the sketch after this file's diff)
* Reduces runtime of the input step by ~50%
* Improved disk access profile
* Zero impact on results

## 3.1.2

* 3.0.5 introduced a species parsing bug causing single-word species names to be invalid.
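The in-memory lookup described in the 3.2.0 notes amounts to building one Set::IntervalTree per chromosome from the high-depth/excluded-region BED and querying it with zero-based, half-open coordinates. The real implementation is in the InputGen.pm and Read.pm hunks below; this is only a minimal, self-contained sketch with hypothetical coordinates:

    use strict;
    use warnings;
    use Set::IntervalTree;

    # One tree per chromosome, populated from 0-based, half-open BED intervals.
    my %tree;
    my $flag = 1;
    $tree{'chr1'} = Set::IntervalTree->new();
    $tree{'chr1'}->insert(\$flag, 100, 200);   # BED record: chr1  100  200

    # A read anchored at 1-based SAM position 150 covers 0-based base 149;
    # a non-empty fetch() result means the position falls inside a stored region.
    my $pos = 150;
    my $hits = $tree{'chr1'}->fetch($pos - 1, $pos);
    print @{$hits} ? "excluded\n" : "kept\n";   # prints "excluded"
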
perl/Makefile.PL: 2 additions & 1 deletion
@@ -1,7 +1,7 @@
#!/usr/bin/perl

########## LICENCE ##########
# Copyright (c) 2014-2018 Genome Research Ltd.
# Copyright (c) 2014-2019 Genome Research Ltd.
#
# Author: CASM/Cancer IT <[email protected]>
#
@@ -46,5 +46,6 @@ WriteMakefile(
'Devel::Cover' => 1.09,
'Pod::Coverage' => 0.23,
'PerlIO::gzip' => 0.20,
'Set::IntervalTree' => 0.12,
}
);
perl/bin/pindel_np_remsample.pl: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
#!/usr/bin/perl

use strict;
use warnings FATAL=>'all';
use autodie;

use IO::Uncompress::Gunzip qw(gunzip $GunzipError);

if(@ARGV < 2) {
  warn "\nUSAGE: $0 normal_panel.gff3[.gz] sampleToRemove_1 sampleToRemove_2 [...]\n\n";
  warn "Output is to stdout, recommended usage:\n";
  warn "\t$0 normal_panel.gff3[.gz] sampleToRemove_1 sampleToRemove_2 [...] | bgzip -c > new_panel.gff3.gz\n";
  exit(1);
}

my $input = shift @ARGV;
my @samples = @ARGV;

my $z = IO::Uncompress::Gunzip->new($input, MultiStream => 1) or die "gunzip failed: $GunzipError\n";
while(my $line = <$z>) {
  if($. < 3) {
    print $line;
    next;
  }
  if($. == 3) {
    print clean_header($line, \@samples);
    next;
  }
  print clean_record($line, \@samples);
}
close $z;

sub clean_record {
  my ($line, $samples) = @_;
  chomp $line;
  my @columns = split /\t/, $line;
  my ($summary, @sample_stat) = split ',', pop @columns;
  my $found = 0;
  my @cleaned;

  for my $ss(@sample_stat) {
    my $matched = 0;
    for my $samp (@{$samples}) {
      if($ss =~ m/^$samp=/) {
        $matched++;
        next;
      }
    }
    if($matched > 0){
      $found++;
    }
    else {
      push @cleaned, $ss;
    }
  }
  if($found == 0) {
    return $line."\n";
  }
  my ($orig_count) = $summary =~ m/(\d+)$/;
  my $fixed_count = $orig_count - $found;
  return q{} if($fixed_count == 0); # removes line completely
  my $corrected = join "\t", @columns, join ',', 'SAMPLE_COUNT='.$fixed_count, @cleaned;
  return $corrected."\n";
}

sub clean_header {
  my ($line, $samples) = @_;
  chomp $line;
  my @elements = split /\s/, $line;
  my @new;
  OUTER: for my $e (@elements) {
    for my $samp (@{$samples}) {
      next OUTER if $e =~ m/$samp/;
    }
    push @new, $e;
  }
  my $final = join q{ }, @new;
  $final .= "\n";
  return $final;
}
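As a worked illustration of clean_record (sample identifiers and the <stats> payloads are hypothetical), removing sample PD5678b from the panel rewrites the final column of a record from

    SAMPLE_COUNT=3,PD1234a=<stats>,PD5678b=<stats>,PD9999c=<stats>

to

    SAMPLE_COUNT=2,PD1234a=<stats>,PD9999c=<stats>

while the preceding GFF3 columns are untouched. A record whose SAMPLE_COUNT would drop to zero is omitted from the output entirely (the q{} return above).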
perl/lib/Sanger/CGP/Pindel.pm: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ use strict;
use Const::Fast qw(const);

use base 'Exporter';
our $VERSION = '3.1.2';
our $VERSION = '3.2.0';
our @EXPORT = qw($VERSION);

1;
perl/lib/Sanger/CGP/Pindel/InputGen.pm: 27 additions & 9 deletions
@@ -32,10 +32,10 @@ use Carp qw( croak );
use File::Which qw(which);
use Try::Tiny qw(try catch finally);
use File::Spec;
use Set::IntervalTree;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use Data::Dumper;

use Bio::DB::HTS::Tabix;

use Sanger::CGP::Pindel;

# add signal handler for any interrupt so that cleanup of temp is handled correctly.
@@ -202,6 +202,22 @@ sub _process_set {
}
}

sub _tabix_to_interval_tree {
  my $bed = shift;
  my %tree;
  my $z = IO::Uncompress::Gunzip->new($bed, MultiStream => 1) or die "gunzip failed: $GunzipError\n";
  my $value = 1;
  while(my $line = <$z>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chr, $s, $e) = split /\t/, $line;
    $tree{$chr} = Set::IntervalTree->new() unless(exists $tree{$chr});
    $tree{$chr}->insert(\$value, $s, $e);
  }
  close $z;
  return \%tree;
}

sub _completed_threads {
my $self = shift;
my @output_objects;
@@ -223,28 +239,30 @@ sub reads_to_pindel {
warn "Thread Worker $tid: started\n";

my $convert_start = time;

my $tabix;
if(defined $bed) {
# was tabix, keeping name for consistency
$tabix = _tabix_to_interval_tree($bed);
}

@reads = @{$reads[0]} if(ref $reads[0] eq 'ARRAY');

my $total_reads = scalar @reads;
my @records;
my $to_pindel = Sanger::CGP::Pindel::InputGen::PairToPindel->new($sample_name, $rg_pis);
my $total_pairs = $total_reads / 2;

my $tabix;
if(defined $bed) {
$tabix = new Bio::DB::HTS::Tabix(filename => $bed);
}

for(1..$total_pairs) {
my $pair = Sanger::CGP::Pindel::InputGen::Pair->new(\shift @reads, \shift @reads, $tabix);
next unless($pair->keep_pair);
push @records, @{$to_pindel->pair_to_pindel($pair)};

}
my $retained = scalar @records;
my $excluded = $total_pairs - $retained;
my $excluded = $total_reads - $retained;
my $convert_took = time - $convert_start;
warn "Thread Worker $tid: Excluded $excluded/$total_pairs (${convert_took}s)\n";
warn "Thread Worker $tid: Excluded $excluded/$total_reads (${convert_took}s)\n";
warn "Thread Worker $tid: Generated $retained records\n";
return \@records if($thread == -1);
return @records;
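The hashref returned by _tabix_to_interval_tree holds one Set::IntervalTree per chromosome and is passed to Pair->new under the existing $tabix name, as the in-line comment notes. Downstream code only needs an overlap test against it; a minimal sketch of that test follows, with a hypothetical helper name (the actual check is _interval_hit in Read.pm below):

    # $trees is the hashref returned by _tabix_to_interval_tree($bed)
    sub position_is_excluded {
      my ($trees, $chr, $pos) = @_;            # $pos is 1-based, as in SAM
      return 0 unless exists $trees->{$chr};   # chromosome absent from the BED
      return scalar @{ $trees->{$chr}->fetch($pos - 1, $pos) };   # overlap count
    }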
perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm: 4 additions & 10 deletions
@@ -32,8 +32,6 @@ use List::MoreUtils qw(first_index last_index);

use Sanger::CGP::Pindel;

# remove before release
use Carp qw(croak);
use Data::Dumper;

const my $PROPER_MAPPED => 2;
@@ -153,21 +151,17 @@ sub _good_anchor {
my $self = shift;
return 0 if($self->unmapped);
return 0 if($self->{'mapq'} <= $MIN_ANCHOR_MAPQ);
return 0 if(exists $self->{'tabix'} && $self->_interval_hit);
return 0 if($self->mapped_seq <= $MIN_ANCHOR_MAPPED);
return 0 if((scalar @{$self->_cigar_operations}) > $MAX_CIGAR_OPS_FOR_ANCHOR);
return 0 if($self->frac_pbq_poor > $MAX_POOR_PBQ_FRAC);
return 0 if(exists $self->{'tabix'} && $self->_tabix_hit);
1;
}

sub _tabix_hit {
sub _interval_hit {
my $self = shift;
my $iter = $self->{'tabix'}->query_full($self->{'rname'}, $self->{'pos'}, $self->{'pos'});
return 0 unless(defined $iter);
while(my $ret = $iter->next){
return 1;
}
return 0;
return 0 unless (exists $self->{'tabix'}->{$self->{'rname'}});
return scalar @{$self->{'tabix'}->{$self->{'rname'}}->fetch($self->{'pos'} - 1, $self->{'pos'})};
}

sub frac_pbq_poor {
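A detail worth noting in _interval_hit: the BED intervals were stored with their native 0-based, half-open coordinates, so the 1-based read position is shifted with fetch($pos - 1, $pos). As a worked example with a hypothetical BED record chr5 1000 1010 (1-based positions 1001-1010): a read anchored at position 1005 queries fetch(1004, 1005), overlaps the stored interval, and the anchor is rejected; a read at position 1500 queries fetch(1499, 1500), finds nothing, and the remaining _good_anchor checks proceed. The value returned is the overlap count, which _good_anchor uses only as a boolean.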
