From aba694ec9160cffcf3f738399e86920a70f093ad Mon Sep 17 00:00:00 2001 From: Jun Yin Date: Fri, 31 Aug 2012 16:10:29 -0400 Subject: [PATCH 1/8] Replace SimpleAlign.pm --- Bio/SimpleAlign.pm | 3014 +++++++++++++++++++++++++------------------- 1 file changed, 1736 insertions(+), 1278 deletions(-) diff --git a/Bio/SimpleAlign.pm b/Bio/SimpleAlign.pm index 6114a82630..8dc666e02d 100644 --- a/Bio/SimpleAlign.pm +++ b/Bio/SimpleAlign.pm @@ -13,6 +13,7 @@ # History: # 11/3/00 Added threshold feature to consensus and consensus_aa - PS # May 2001 major rewrite - Heikki Lehvaslaiho +# August 2010 refactoring - Jun Yin =head1 NAME @@ -37,7 +38,7 @@ Bio::SimpleAlign - Multiple alignments held as a set of sequences $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6; # Extract sequences and check values for the alignment column $pos - foreach $seq ($aln->each_seq) { + foreach $seq ($aln->next_Seq) { $res = $seq->subseq($pos, $pos); $count{$res}++; } @@ -46,11 +47,13 @@ Bio::SimpleAlign - Multiple alignments held as a set of sequences } # Manipulate - $aln->remove_seq($seq); - $mini_aln = $aln->slice(20,30); # get a block of columns - $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences - $new_aln = $aln->remove_columns([20,30]); # remove by position - $new_aln = $aln->remove_columns(['mismatch']); # remove by property + $aln->remove_LocatableSeq($seq); + $mini_aln = $aln->select_columns([20..30,45..48]) # get a block of columns + #more parameters + $mini_aln = $aln->select_columns(-selection=>[20..30],-toggle=>0,-keepgaponly=>1) + $mini_aln = $aln->select_Seqs([1,5..10,15]) # select certain sequences + $aln->remove_columns([20..30]); # remove by position + $aln->remove_columns(['mismatch']); # remove by property # Analyze $str = $aln->consensus_string($threshold_percent); @@ -83,6 +86,8 @@ C, and generally is what is used to print out the alignment. They default to name/start-end. The SimpleAlign Module is derived from the Align module by Ewan Birney. +This package is refactored by Jun Yin as part of the Google Summer of +Code project in 2010. =head1 FEEDBACK @@ -133,6 +138,7 @@ Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU Hongyu Zhang, forward at hongyu.org Jay Hannah, jay at jays.net Alexandr Bezginov, albezg at gmail.com +Jun Yin, jun.yin at ucd.ie =head1 SEE ALSO @@ -200,10 +206,13 @@ use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI Args : -source => string representing the source program where this alignment came from -annotation => Bio::AnnotationCollectionI - -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set) - -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta + -seq_annotation => Bio::AnnotationCollectionI for sequences + (requires -annotation also be set) + -seqs => array ref containing Bio::LocatableSeq + or Bio::Seq::Meta -consensus => consensus string - -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge) + -consensus_meta => Bio::Seq::Meta object containing + consensus met information (kludge) =cut @@ -257,29 +266,29 @@ sub new { # assumes these are in correct alignment order if ($seqs && ref($seqs) eq 'ARRAY') { for my $seq (@$seqs) { - $self->add_seq($seq); + $self->add_Seq($seq); } } return $self; # success - we hope! } -=head1 Modifier methods +=head1 Alignment modifier methods -These methods modify the MSA by adding, removing or shuffling complete -sequences. +These methods modify the original MSA by adding, removing or shuffling +complete sequences. -=head2 add_seq +=head2 add_Seq - Title : add_seq - Usage : $myalign->add_seq($newseq); - $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5); + Title : add_Seq + Usage : $aln->add_Seq($newseq); + $aln->add_Seq(-SEQ=>$newseq, -ORDER=>5); Function : Adds another sequence to the alignment. *Does not* align it - just adds it to the hashes. If -ORDER is specified, the sequence is inserted at the the position spec'd by -ORDER, and existing sequences are pushed down the storage array. - Returns : nothing + Returns : 1 Args : A Bio::LocatableSeq object Positive integer for the sequence position (optional) @@ -289,11 +298,17 @@ See L for more information sub addSeq { my $self = shift; - $self->deprecated("addSeq - deprecated method. Use add_seq() instead."); - $self->add_seq(@_); + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"addSeq - deprecated method. Use add_Seq() instead."); + $self->add_Seq(@_); } sub add_seq { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"add_seq - deprecated method. Use add_Seq() instead."); + $self->add_Seq(@_); +} + +sub add_Seq { my $self = shift; my @args = @_; my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args); @@ -350,27 +365,36 @@ sub add_seq { } $self->{'_seq'}->{$name} = $seq; + + 1; } -=head2 remove_seq +=head2 remove_LocatableSeq - Title : remove_seq - Usage : $aln->remove_seq($seq); + Title : remove_LocatableSeq + Usage : $aln->remove_LocatableSeq($seq); Function : Removes a single sequence from an alignment - Returns : + Returns : 1 Argument : a Bio::LocatableSeq object =cut sub removeSeq { my $self = shift; - $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead."); - $self->remove_seq(@_); + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"removeSeq - deprecated method. Use remove_seq() instead."); + $self->remove_LocatableSeq(@_); } sub remove_seq { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"remove_seq - deprecated method. Use remove_LocatableSeq() instead."); + $self->remove_LocatableSeq(@_); +} + + +sub remove_LocatableSeq { my $self = shift; my $seq = shift; my ($name,$id); @@ -379,7 +403,14 @@ sub remove_seq { unless ref $seq && $seq->isa( 'Bio::LocatableSeq'); $id = $seq->id(); +<<<<<<< HEAD + $start = $seq->start(); + $end = $seq->end(); + $name = sprintf("%s/%d-%d",$id,$start,$end); + +======= $name = $seq->get_nse; +>>>>>>> upstream/master if( !exists $self->{'_seq'}->{$name} ) { $self->throw("Sequence $name does not exist in the alignment to remove!"); @@ -420,11 +451,50 @@ sub remove_seq { return 1; } +=head2 remove_Seqs + + Title : remove_Seqs + Usage : $aln->remove_Seqs([1,3,5..7]); + $aln->remove_Seqs(-selection=>[1],-toggle=>1); + Function : Removes specified sequences from the alignment + Returns : 1 + Argument : An reference list of positive integers for the selected + sequences. An optional parameter can be defined to toggle + the coordinate selection. + See also select_Seqs + +=cut + +sub remove_Seqs { + my $self=shift; + my ($sel, $toggle) = $self->_rearrange([qw(SELECTION TOGGLE)], @_); + @{$sel}=sort {$a<=>$b} @{$sel}; + + $self->throw("Select start has to be a positive integer, not [".$sel->[0]."]") + unless $sel->[0] =~ /^\d+$/ and $sel->[0] > 0; + $self->throw("Select end has to be a positive integer, not [".$sel->[$#$sel]."]") + unless $sel->[$#$sel] =~ /^\d+$/ and $sel->[$#$sel] > 0; + + #use select_Seqs to select removed Seq objects, then use remove_LocatableSeq to remove the selected Seqs + my $removed_aln; + if($toggle) { + $removed_aln=$self->select_Seqs(_toggle_selection($sel,$self->num_sequences)); + } + else { + $removed_aln=$self->select_Seqs($sel); + } + + foreach my $seq ($removed_aln->next_Seq) { + $self->remove_LocatableSeq($seq); + } + + return 1; +} -=head2 purge +=head2 remove_redundant_Seqs - Title : purge - Usage : $aln->purge(0.7); + Title : remove_redundant_Seqs + Usage : $aln->remove_redundant_Seqs(0.7); Function: Removes sequences above given sequence similarity This function will grind on large alignments. Beware! Example : @@ -434,10 +504,16 @@ sub remove_seq { =cut sub purge { + my ($self,@args) = @_; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"purge - deprecated method. Use remove_redundant_Seqs() instead."); + $self->remove_redundant_Seq(@args); +} + +sub remove_redundant_Seqs { my ($self,$perc) = @_; my (%duplicate, @dups); - my @seqs = $self->each_seq(); + my @seqs = $self->next_Seq(); for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment my $seq = $seqs[$i]; @@ -483,138 +559,19 @@ sub purge { } } foreach my $seq (@dups) { - $self->remove_seq($seq); + $self->remove_LocatableSeq($seq); } return @dups; } -=head2 sort_alphabetically - - Title : sort_alphabetically - Usage : $ali->sort_alphabetically - Function : Changes the order of the alignment to alphabetical on name - followed by numerical by number. - Returns : - Argument : - -=cut - -sub sort_alphabetically { - my $self = shift; - my ($seq,$nse,@arr,%hash,$count); - - foreach $seq ( $self->each_seq() ) { - $nse = $seq->get_nse; - $hash{$nse} = $seq; - } - - $count = 0; - - %{$self->{'_order'}} = (); # reset the hash; - - foreach $nse ( sort _alpha_startend keys %hash) { - $self->{'_order'}->{$count} = $nse; - - $count++; - } - 1; -} - -=head2 sort_by_list - - Title : sort_by_list - Usage : $aln_ordered=$aln->sort_by_list($list_file) - Function : Arbitrarily order sequences in an alignment - Returns : A new Bio::SimpleAlign object - Argument : a file listing sequence names in intended order (one name per line) - -=cut - -sub sort_by_list { - my ($self, $list) = @_; - my (@seq, @ids, %order); - - foreach my $seq ( $self->each_seq() ) { - push @seq, $seq; - push @ids, $seq->display_id; - } - - my $ct=1; - open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list"); - while (<$listfh>) { - chomp; - my $name=$_; - $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids); - $order{$name}=$ct++; - } - close($listfh); - - # use the map-sort-map idiom: - my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq; - my $aln = $self->new; - foreach (@sorted) { $aln->add_seq($_) } - return $aln; -} - -=head2 set_new_reference - - Title : set_new_reference - Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or - the sequence whoes name is "B31" (full, exact, and case-sensitive), - as the reference (1st) sequence - Function : Change/Set a new reference (i.e., the first) sequence - Returns : a new Bio::SimpleAlign object. - Throws an exception if designated sequence not found - Argument : a positive integer of sequence order, or a sequence name - in the original alignment - -=cut - -sub set_new_reference { - my ($self, $seqid) = @_; - my $aln = $self->new; - my (@seq, @ids, @new_seq); - my $is_num=0; - foreach my $seq ( $self->each_seq() ) { - push @seq, $seq; - push @ids, $seq->display_id; - } - - if ($seqid =~ /^\d+$/) { # argument is seq position - $is_num=1; - $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences); - } else { # argument is a seq name - $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids); - } - - for (my $i=0; $i<=$#seq; $i++) { - my $pos=$i+1; - if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) { - unshift @new_seq, $seq[$i]; - } else { - push @new_seq, $seq[$i]; - } - } - foreach (@new_seq) { $aln->add_seq($_); } - return $aln; -} - -sub _in_aln { # check if input name exists in the alignment - my ($str, $ref) = @_; - foreach (@$ref) { - return 1 if $str eq $_; - } - return 0; -} - +=head2 uniq_Seq -=head2 uniq_seq - - Title : uniq_seq - Usage : $aln->uniq_seq(): Remove identical sequences in + Title : uniq_Seq + Usage : $aln->uniq_Seq(): Remove identical sequences in in the alignment. Ambiguous base ("N", "n") and leading and ending gaps ("-") are NOT counted as differences. + $aln->uniq_Seq is a variation of $aln->remove_redundant_Seqs(1) Function : Make a new alignment of unique sequence types (STs) Returns : 1a. if called in a scalar context, a new Bio::SimpleAlign object (all sequences renamed as "ST") @@ -629,13 +586,19 @@ sub _in_aln { # check if input name exists in the alignment =cut sub uniq_seq { + my ($self,@args) = @_; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"uniq_seq - deprecated method. Use uniq_Seq() instead."); + $self->uniq_Seq(@args); +} + +sub uniq_Seq { my ($self, $seqid) = @_; my $aln = $self->new; my (%member, %order, @seq, @uniq_str, $st); my $order=0; my $len = $self->length(); $st = {}; - foreach my $seq ( $self->each_seq() ) { + foreach my $seq ( $self->next_Seq() ) { my $str = $seq->seq(); # it's necessary to ignore "n", "N", leading gaps and ending gaps in @@ -687,7 +650,7 @@ sub uniq_seq { -start=>1, -end =>$end ); - $aln->add_seq($new); + $aln->add_Seq($new); foreach (@{$member{$str}}) { push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n"); @@ -734,118 +697,366 @@ sub _convert_leading_ending_gaps { return $s_new; } -=head1 Sequence selection methods -Methods returning one or more sequences objects. -=head2 each_seq +=head2 sort_alphabetically - Title : each_seq - Usage : foreach $seq ( $align->each_seq() ) - Function : Gets a Seq object from the alignment - Returns : Seq object - Argument : + Title : sort_alphabetically + Usage : $ali->sort_alphabetically + Function : Changes the order of the alignment to alphabetical on name + followed by numerical by number. + Returns : 1 + Argument : None =cut -sub eachSeq { +sub sort_alphabetically { my $self = shift; - $self->deprecated("eachSeq - deprecated method. Use each_seq() instead."); - $self->each_seq(); -} - -sub each_seq { - my $self = shift; - my (@arr,$order); + my ($seq,$nse,@arr,%hash,$count); - foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) { - if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) { - push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}}); - } - } - return @arr; -} + foreach $seq ( $self->next_Seq() ) { + $nse = $seq->get_nse; + $hash{$nse} = $seq; + } + $count = 0; -=head2 each_alphabetically + %{$self->{'_order'}} = (); # reset the hash; - Title : each_alphabetically - Usage : foreach $seq ( $ali->each_alphabetically() ) - Function : Returns a sequence object, but the objects are returned - in alphabetically sorted order. - Does not change the order of the alignment. - Returns : Seq object - Argument : + foreach $nse ( sort _alpha_startend keys %hash) { + $self->{'_order'}->{$count} = $nse; -=cut + $count++; + } + 1; +} -sub each_alphabetically { - my $self = shift; - my ($seq,$nse,@arr,%hash,$count); +=head2 sort_by_list - foreach $seq ( $self->each_seq() ) { - $nse = $seq->get_nse; - $hash{$nse} = $seq; - } + Title : sort_by_list + Usage : $aln_ordered=$aln->sort_by_list($list_file) + Function : Arbitrarily order sequences in an alignment + Returns : A new Bio::SimpleAlign object + Argument : a file listing sequence names in intended order + (one name per line) - foreach $nse ( sort _alpha_startend keys %hash) { - push(@arr,$hash{$nse}); - } - return @arr; -} +=cut -sub _alpha_startend { - my ($aname,$astart,$bname,$bstart); - ($aname,$astart) = split (/-/,$a); - ($bname,$bstart) = split (/-/,$b); +sub sort_by_list { + my ($self, $list) = @_; + my (@seq, @ids, %order); - if( $aname eq $bname ) { - return $astart <=> $bstart; + foreach my $seq ( $self->next_Seq() ) { + push @seq, $seq; + push @ids, $seq->display_id; } - else { - return $aname cmp $bname; + + my $ct=1; + open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list"); + while (<$listfh>) { + chomp; + my $name=$_; + $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids); + $order{$name}=$ct++; } + close($listfh); + + # use the map-sort-map idiom: + my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq; + my $aln = $self->new; + foreach (@sorted) { $aln->add_Seq($_) } + return $aln; } -=head2 each_seq_with_id - Title : each_seq_with_id - Usage : foreach $seq ( $align->each_seq_with_id() ) - Function : Gets a Seq objects from the alignment, the contents - being those sequences with the given name (there may be - more than one) - Returns : Seq object - Argument : a seq name +=head2 sort_by_pairwise_identity -=cut + Title : sort_by_pairwise_identity() + Usage : $ali->sort_by_pairwise_identity(2) + Function : Changes the order of the alignment by the pairwise percentage + identity of the reference sequence + Returns : 1 + Argument : Optional, the position or id of reference sequences to be compared + with. Default is the first sequence + See also set_new_reference -sub eachSeqWithId { - my $self = shift; - $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead."); - $self->each_seq_with_id(@_); -} +=cut -sub each_seq_with_id { - my $self = shift; - my $id = shift; +sub sort_by_pairwise_identity { + my ($self,$seqid) = @_; + my ($seq,$nse,@arr,%hash,$count); + $seqid=defined($seqid)?$seqid:1; + my @pairwise_iden=$self->pairwise_percentage_identity($seqid); + + my $seqcount = 0; + foreach $seq ( $self->next_Seq() ) { + $nse = $seq->get_nse; + $hash{$nse} = $pairwise_iden[$seqcount++]; + } - $self->throw("Method each_seq_with_id needs a sequence name argument") - unless defined $id; + $count=0; - my (@arr, $seq); + %{$self->{'_order'}} = (); # reset the hash; - if (exists($self->{'_start_end_lists'}->{$id})) { - @arr = @{$self->{'_start_end_lists'}->{$id}}; - } - return @arr; + foreach $nse ( sort {$hash{$b}<=>$hash{$a}} keys %hash) { + $self->{'_order'}->{$count} = $nse; + $count++; + } + 1; } -=head2 get_seq_by_pos +=head2 sort_by_length - Title : get_seq_by_pos - Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment - Function : Gets a sequence based on its position in the alignment. - Numbering starts from 1. Sequence positions larger than + Title : sort_by_length() + Usage : $ali->sort_by_length() + Function : Changes the order of the alignment by the ungapped length of + the sequences + Returns : 1 + Argument : None + +=cut + +sub sort_by_length { + my $self = shift; + my %seq2length; + + foreach my $order (keys %{$self->{'_order'}}) { + my $nse=$self->{'_order'}->{$order}; + my $seq=$self->{'_seq'}->{$nse}; + $seq2length{$seq->get_nse}=$seq->_ungapped_len; + } + + %{$self->{'_order'}} = (); # reset the hash; + + my $count=0; + foreach my $nse (sort {$seq2length{$a}<=>$seq2length{$b}} keys %seq2length) { + $self->{'_order'}->{$count} = $nse; + $count++; + } + + 1; +} + +=head2 sort_by_start + + Title : sort_by_start + Usage : $ali->sort_by_start + Function : Changes the order of the alignment to the start position of + each subalignment + Returns : 1 + Argument : None + +=cut + +sub sort_by_start { + my $self = shift; + my ($seq,$nse,@arr,%hash,$count); + foreach $seq ( $self->next_Seq() ) { + $nse = $seq->get_nse; + $hash{$nse} = $seq; + } + $count = 0; + %{$self->{'_order'}} = (); # reset the hash; + foreach $nse ( sort _startend keys %hash) { + $self->{'_order'}->{$count} = $nse; + $count++; + } + 1; +} + +sub _startend +{ + my ($aname,$arange) = split (/[\/]/,$a); + my ($bname,$brange) = split (/[\/]/,$b); + my ($astart,$aend) = split(/\-/,$arange); + my ($bstart,$bend) = split(/\-/,$brange); + return $astart <=> $bstart; +} +=head2 set_new_reference + + Title : set_new_reference + Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, + or the sequence whoes name is "B31" (full, exact, and + case-sensitive), as the reference (1st) sequence + Function : Change/Set a new reference (i.e., the first) sequence + Returns : a new Bio::SimpleAlign object. + Throws an exception if designated sequence not found + Argument : a positive integer of sequence order, or a sequence name + in the original alignment + +=cut + +sub set_new_reference { + my ($self, $seqid) = @_; + my $aln = $self->new; + my (@seq, @ids, @new_seq); + my $is_num=0; + foreach my $seq ( $self->next_Seq() ) { + push @seq, $seq; + push @ids, $seq->display_id; + } + + if ($seqid =~ /^\d+$/) { # argument is seq position + $is_num=1; + $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences); + } else { # argument is a seq name + $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids); + } + + for (my $i=0; $i<=$#seq; $i++) { + my $pos=$i+1; + if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) { + unshift @new_seq, $seq[$i]; + } else { + push @new_seq, $seq[$i]; + } + } + foreach (@new_seq) { $aln->add_Seq($_); } + return $aln; +} + +sub _in_aln { # check if input name exists in the alignment + my ($str, $ref) = @_; + foreach (@$ref) { + return 1 if $str eq $_; + } + return 0; +} + + + +=head1 Alignment selection methods + +These methods are used to select the sequences or horizontal/vertical +subsets of the current MSA. + +=head2 next_Seq + + Title : next_Seq + Usage : foreach $seq ( $aln->next_Seq() ) + Function : Gets a Seq object from the alignment + Returns : Seq object + Argument : + +=cut + +sub eachSeq { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"eachSeq - deprecated method. Use next_Seq() instead."); + $self->next_Seq(); +} + +sub each_seq { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"each_seq - deprecated method. Use next_Seq() instead."); + $self->next_Seq(); +} + +sub next_Seq { + my $self = shift; + my (@arr,$order); + + foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) { + if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) { + push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}}); + } + } + return @arr; +} + + +=head2 next_alphabetically + + Title : next_alphabetically + Usage : foreach $seq ( $aln->next_alphabetically() ) + Function : Returns a sequence object, but the objects are returned + in alphabetically sorted order. + Does not change the order of the alignment. + Returns : Seq object + Argument : None + +=cut + +sub each_alphabetically { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"each_alphabetically - deprecated method. Use next_alphabetically() instead."); + $self->next_Seq(); +} + + +sub next_alphabetically { + my $self = shift; + my ($seq,$nse,@arr,%hash,$count); + + foreach $seq ( $self->next_Seq() ) { + $nse = $seq->get_nse; + $hash{$nse} = $seq; + } + + foreach $nse ( sort _alpha_startend keys %hash) { + push(@arr,$hash{$nse}); + } + return @arr; +} + +sub _alpha_startend { + my ($aname,$astart,$bname,$bstart); + ($aname,$astart) = split (/-/,$a); + ($bname,$bstart) = split (/-/,$b); + + if( $aname eq $bname ) { + return $astart <=> $bstart; + } + else { + return $aname cmp $bname; + } +} + +=head2 next_Seq_with_id + + Title : next_Seq_with_id + Usage : foreach $seq ( $aln->next_Seq_with_id() ) + Function : Gets a Seq objects from the alignment, the contents + being those sequences with the given name (there may be + more than one) + Returns : Seq object + Argument : a seq name + +=cut + +sub eachSeqWithId { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"eachSeqWithId - deprecated method. Use next_Seq_with_id() instead."); + $self->next_Seq_with_id(@_); +} + +sub each_seq_with_id { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"each_seq_with_id - deprecated method. Use next_Seq_with_id() instead."); + $self->next_Seq_with_id(@_); +} + +sub next_Seq_with_id { + my $self = shift; + my $id = shift; + + $self->throw("Method next_Seq_with_id needs a sequence name argument") + unless defined $id; + + my (@arr, $seq); + + if (exists($self->{'_start_end_lists'}->{$id})) { + @arr = @{$self->{'_start_end_lists'}->{$id}}; + } + return @arr; +} + +=head2 get_Seq_by_pos + + Title : get_Seq_by_pos + Usage : $seq = $aln->get_Seq_by_pos(3) # third sequence from the alignment + Function : Gets a sequence based on its position in the alignment. + Numbering starts from 1. Sequence positions larger than num_sequences() will throw an error. Returns : a Bio::LocatableSeq object Args : positive integer for the sequence position @@ -853,6 +1064,12 @@ sub each_seq_with_id { =cut sub get_seq_by_pos { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"get_seq_by_pos - deprecated method. Use get_Seq_by_pos() instead."); + $self->get_Seq_by_pos(@_); +} + +sub get_Seq_by_pos { my $self = shift; my ($pos) = @_; @@ -866,10 +1083,10 @@ sub get_seq_by_pos { return $self->{'_seq'}->{$nse}; } -=head2 get_seq_by_id +=head2 get_Seq_by_id - Title : get_seq_by_id - Usage : $seq = $aln->get_seq_by_id($name) # seq named $name + Title : get_Seq_by_id + Usage : $seq = $aln->get_Seq_by_id($name) # seq named $name Function : Gets a sequence based on its name. Sequences that do not exist will warn and return undef Returns : a Bio::LocatableSeq object @@ -878,6 +1095,13 @@ sub get_seq_by_pos { =cut sub get_seq_by_id { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"get_seq_by_id - deprecated method. Use get_Seq_by_id() instead."); + $self->get_Seq_by_id(@_); +} + + +sub get_Seq_by_id { my ($self,$name) = @_; unless( defined $name ) { $self->warn("Must provide a sequence name"); @@ -891,84 +1115,95 @@ sub get_seq_by_id { return; } -=head2 seq_with_features - Title : seq_with_features - Usage : $seq = $aln->seq_with_features(-pos => 1, - -consensus => 60 - -mask => - sub { my $consensus = shift; +=head2 select_Seqs - for my $i (1..5){ - my $n = 'N' x $i; - my $q = '\?' x $i; - while($consensus =~ /[^?]$q[^?]/){ - $consensus =~ s/([^?])$q([^?])/$1$n$2/; - } - } - return $consensus; - } - ); - Function: produces a Bio::Seq object by first splicing gaps from -pos - (by means of a splice_by_seq_pos() call), then creating - features using non-? chars (by means of a consensus_string() - call with stringency -consensus). - Returns : a Bio::Seq object - Args : -pos : required. sequence from which to build the Bio::Seq - object - -consensus : optional, defaults to consensus_string()'s - default cutoff value - -mask : optional, a coderef to apply to consensus_string()'s - output before building features. this may be useful for - closing gaps of 1 bp by masking over them with N, for - instance + Title : select_Seqs + Usage : $aln2 = $aln->select_Seqs([1,5..10,15]) # three first sequences + $aln2 = $aln->select_Seqs(-selection=>[2..4,11..14],-toggle=>1) # toggle selection + Function : Creates a new alignment from a subset of + sequences. Numbering starts from 1. Sequence positions + larger than num_sequences() will thow an error. + Returns : a Bio::SimpleAlign object + Args : An reference list of positive integers for the selected + sequences. An optional parameter can be defined to toggle the + coordinate selection. + =cut -sub seq_with_features{ - my ($self,%arg) = @_; - - #first do the preparatory splice - $self->throw("must provide a -pos argument") unless $arg{-pos}; - $self->splice_by_seq_pos($arg{-pos}); +sub select_Seqs { + my $self=shift; + my ($sel, $toggle) = $self->_rearrange([qw(SELECTION TOGGLE)], @_); + @{$sel}=sort {$a<=>$b} @{$sel}; - my $consensus_string = $self->consensus_string($arg{-consensus}); - $consensus_string = $arg{-mask}->($consensus_string) - if defined($arg{-mask}); + $self->throw("Select start has to be a positive integer, not [".$sel->[0]."]") + unless $sel->[0] =~ /^\d+$/ and $sel->[0] > 0; + $self->throw("Select end has to be a positive integer, not [".$sel->[$#$sel]."]") + unless $sel->[$#$sel] =~ /^\d+$/ and $sel->[$#$sel] > 0; + + my @positions; + if($toggle) { + @positions=@{_toggle_selection($sel,$self->num_sequences)}; + } + else { + @positions=@{$sel}; + } + + my $aln = $self->new; + foreach my $pos (@positions) { + $aln->add_Seq($self->get_Seq_by_pos($pos)); + } + $aln->id($self->id); + # fix for meta, sf, ann + return $aln; +} - my(@bs,@es); - push @bs, 1 if $consensus_string =~ /^[^?]/; +sub select { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"select - deprecated method. Use select_Seqs() instead."); + $self->select_Seqs([$_[0]..$_[1]]); +} - while($consensus_string =~ /\?[^?]/g){ - push @bs, pos($consensus_string); - } - while($consensus_string =~ /[^?]\?/g){ - push @es, pos($consensus_string); +sub select_noncont { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"select_noncont - deprecated method. Use select_Seqs() instead."); + if($_[0] eq 'nosort') { + shift; } + $self->select_Seqs([@_]); +} - push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/; +=head2 select_columns - my $seq = Bio::Seq->new(); + Title : select_columns + Usage : $newaln = $aln->select_columns([20..30,45..48]); + $newaln = $aln->select_columns(['mismatch']); + $newaln = $aln->select_columns(-selection=>['mismatch'],-toggle=>1); + Function : Creates a slice from the alignment from the selected columns. + The first column in the alignment is denoted 1. + Sequences with no residues in the slice are excluded from the + new alignment and a warning is printed. Slice beyond the length + of the sequence does not do padding. + Returns : A Bio::SimpleAlign object + Args : Positive integers for the selected colums, or specified type + ('match'|'weak'|'strong'|'mismatch') + First optional boolean can be defined to toggle the coordinate + selection. Second optional boolean which if true will keep gap-only + columns in the newly created slice. Example: -# my $rootfeature = Bio::SeqFeature::Generic->new( -# -source_tag => 'location', -# -start => $self->get_seq_by_pos($arg{-pos})->start, -# -end => $self->get_seq_by_pos($arg{-pos})->end, -# ); -# $seq->add_SeqFeature($rootfeature); + $aln2 = $aln->select_columns([20..30],0,1) + or $aln2 = $aln->select_columns(-selection=>[20..30],-toggle=>0,-keepgaponly=>1) - while(my $b = shift @bs){ - my $e = shift @es; - $seq->add_SeqFeature( - Bio::SeqFeature::Generic->new( - -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start, - -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start, - -source_tag => $self->source || 'MSA', - ) - ); - } +=cut +<<<<<<< HEAD +sub slice { + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"slice - deprecated method. Use select_columns() instead."); + $self->select_columns([$_[0]..$_[1]],0,defined($_[2])?$_[2]:0); +======= return $seq; } @@ -1009,52 +1244,21 @@ sub select { $aln->id($self->id); # fix for meta, sf, ann return $aln; +>>>>>>> upstream/master } -=head2 select_noncont - - Title : select_noncont - Usage : # 1st and 3rd sequences, sorted - $aln2 = $aln->select_noncont(1, 3) - - # 1st and 3rd sequences, sorted (same as first) - $aln2 = $aln->select_noncont(3, 1) - - # 1st and 3rd sequences, unsorted - $aln2 = $aln->select_noncont('nosort',3, 1) - - Function : Creates a new alignment from a subset of sequences. Numbering - starts from 1. Sequence positions larger than num_sequences() will - throw an error. Sorts the order added to new alignment by default, - to prevent sorting pass 'nosort' as the first argument in the list. - Returns : a Bio::SimpleAlign object - Args : array of integers for the sequences. If the string 'nosort' is - passed as the first argument, the sequences will not be sorted - in the new alignment but will appear in the order listed. - -=cut -sub select_noncont { +sub select_columns { my $self = shift; - my $nosort = 0; - my (@pos) = @_; - if ($pos[0] !~ m{^\d+$}) { - my $sortcmd = shift @pos; - if ($sortcmd eq 'nosort') { - $nosort = 1; - } else { - $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time."); - } - } - - my $end = $self->num_sequences; - foreach ( @pos ) { - $self->throw("position must be a positive integer, > 0 and <= $end not [$_]") - unless( /^\d+$/ && $_ > 0 && $_ <= $end ); + #my ($start, $end, $keep_gap_only) = @_; + my ($sel, $toggle,$keep_gap_only) = $self->_rearrange([qw(SELECTION TOGGLE KEEPGAPONLY)], @_); + if($sel->[0]=~/^[a-z]/i) { + $sel=$self->_select_columns_by_type($sel); } - - @pos = sort {$a <=> $b} @pos unless $nosort; +<<<<<<< HEAD + @{$sel}=sort {$a<=$b} @{$sel}; +======= my $aln = $self->new; foreach my $p (@pos) { $aln->add_seq($self->get_seq_by_pos($p)); @@ -1100,27 +1304,29 @@ sub select_noncont_by_name { Args : Positive integer for start column, positive integer for end column, optional boolean which if true will keep gap-only columns in the newly created slice. Example: +>>>>>>> upstream/master - $aln2 = $aln->slice(20,30,1) + #warnings + $self->throw("select_columns start has to be a positive integer, not [".$sel->[0]."]") + unless $sel->[0] =~ /^\d+$/ and $sel->[0] > 0; + $self->throw("select_columns end has to be a positive integer, not [".$sel->[$#$sel]."]") + unless $sel->[$#$sel] =~ /^\d+$/ and $sel->[$#$sel] > 0; + $self->throw("This alignment has only ". $self->length . " residues. select_columns start " . + "[".$sel->[0]."] is too big.") if $sel->[0] > $self->length; -=cut + my $newcoords; + + if($toggle) { + $sel=_toggle_selection($sel,$self->length); + } + $newcoords=_cont_coords($sel); -sub slice { - my $self = shift; - my ($start, $end, $keep_gap_only) = @_; - - $self->throw("Slice start has to be a positive integer, not [$start]") - unless $start =~ /^\d+$/ and $start > 0; - $self->throw("Slice end has to be a positive integer, not [$end]") - unless $end =~ /^\d+$/ and $end > 0; - $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]") - unless $start <= $end; - $self->throw("This alignment has only ". $self->length . " residues. Slice start " . - "[$start] is too big.") if $start > $self->length; my $cons_meta = $self->consensus_meta; my $aln = $self->new; $aln->id($self->id); - foreach my $seq ( $self->each_seq() ) { + my ($start,$end)=($newcoords->[0],$newcoords->[$#$newcoords]); + + foreach my $seq ( $self->next_Seq() ) { my $new_seq = $seq->isa('Bio::Seq::MetaI') ? Bio::Seq::Meta->new (-id => $seq->id, @@ -1134,43 +1340,103 @@ sub slice { -verbose => $self->verbose); # seq + my $seq_end = $end; $seq_end = $seq->length if( $end > $seq->length ); + # the actual selected seq - my $slice_seq = $seq->subseq($start, $seq_end); + my $slice_seq; + for(my $num=0;$num<$#$newcoords;) { + $slice_seq.= $seq->subseq($newcoords->[$num], $newcoords->[$num+1]); + $num+=2; + } + $new_seq->seq( $slice_seq ); - $slice_seq =~ s/\W//g; + #all the non residue chars + my $nonres = join("",$self->gap_char, $self->match_char,$self->missing_char); + + #need to convert to non-gapped start and end + my ($ng_start,$ng_end); #ungapped start and ungapped end + + if($slice_seq=~/^[\Q$nonres\E]+$/) { + ($ng_start,$ng_end)=($start,$end); + } + else { + if($slice_seq=~/^[\Q$nonres\E]+/) { + $ng_start=$sel->[length($&)]; + } + else { + $ng_start=$start; + } + + if($slice_seq=~/[\Q$nonres\E]+$/) { + $ng_end=$sel->[$#$sel-length($&)-1]; + } + else { + $ng_end=$end; + } + } + + my $slice_length=($slice_seq =~s/([^\Q$nonres\E])/$1/g); + my ($newstart, $newend, $fakestart, $oldend); + my ($pre_start_seq,$pre_start_length,$after_end_seq,$after_end_length); + + + if($ng_end>=CORE::length($seq->seq())) { + $after_end_length=0; + } + else { + $after_end_seq = $seq->subseq($ng_end+1,CORE::length($seq->seq())); + $after_end_length=($after_end_seq=~s/([^\Q$nonres\E])/$1/g); + } + + #calculate the new start and new end + #new_start=old_start+pre_start-1 + #new_end=old_end-after_end+1 + #in order to pass the $seq->end test, we should assign a fake start first + if($ng_start>1) { + + my $pre_start_seq = $seq->subseq(1, $ng_start - 1); + my $pre_start_length=($pre_start_seq=~s/([^\Q$nonres\E])/$1/g); #need to check + if (defined($seq->strand) && $seq->strand < 0){ + $newend=$seq->end-$pre_start_length; + $fakestart=$newend-$slice_length+1; + $newstart=$seq->start+$after_end_length; + } + else { + $newend=$seq->end-$after_end_length; + $fakestart=$newend-$slice_length+1; + $newstart=$seq->start+$pre_start_length; + } + } + else { + if (defined($seq->strand) && $seq->strand < 0){ + $newend=$seq->end; + $fakestart=$newend-$slice_length+1; + $newstart=$seq->start+$after_end_length; + } + else { + $newend=$seq->end-$after_end_length; + $fakestart=$newend-$slice_length+1; + $newstart=$seq->start; + } + } + $new_seq->start($fakestart); + $new_seq->end($newend); + $new_seq->start($newstart); - if ($start > 1) { - my $pre_start_seq = $seq->subseq(1, $start - 1); - $pre_start_seq =~ s/\W//g; - if (!defined($seq->strand)) { - $new_seq->start( $seq->start + CORE::length($pre_start_seq) ); - } elsif ($seq->strand < 0){ - $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1); - } else { - $new_seq->start( $seq->start + CORE::length($pre_start_seq) ); - } - } else { - if ((defined $seq->strand)&&($seq->strand < 0)){ - $new_seq->start( $seq->end - CORE::length($slice_seq) + 1); - } else { - $new_seq->start( $seq->start); - } - } if ($new_seq->isa('Bio::Seq::MetaI')) { for my $meta_name ($seq->meta_names) { - $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end)); + $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $ng_start, $end)); } } - $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 ); if ($new_seq->start and $new_seq->end >= $new_seq->start) { - $aln->add_seq($new_seq); + $aln->add_Seq($new_seq); } else { if( $keep_gap_only ) { - $aln->add_seq($new_seq); + $aln->add_Seq($new_seq); } else { my $nse = $seq->get_nse(); $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.". @@ -1190,69 +1456,161 @@ sub slice { return $aln; } + +sub _select_columns_by_type { + #code copied from _remove_columns_by_type to enable selecting columns by type + my ($self,$type) = @_; + my @selects; + + my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type}); + my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type}); + my %matchchars = ( 'match' => '\*', + 'weak' => '\.', + 'strong' => ':', + 'mismatch' => ' ', + ); + # get the characters to delete against + my $del_char; + foreach my $type (@{$type}){ + $del_char.= $matchchars{$type}; + } + + my $length = 0; + my $match_line = $self->match_line; + # do the matching to get the segments to remove + if($del_char){ + while($match_line =~ m/[$del_char]/g ){ + my $start = pos($match_line); + $match_line=~/\G[$del_char]+/gc; + my $end = pos($match_line); + + #have to offset the start and end for subsequent removes + push @selects, $start..$end; + } + } + + # remove the segments + return \@selects; +} =head2 remove_columns Title : remove_columns - Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or - $aln2 = $aln->remove_columns([0,0],[6,8]) - Function : Creates an aligment with columns removed corresponding to + Usage : my $newaln=$aln->remove_columns(['mismatch','weak']) or + my $newaln=$aln->remove_columns([3,6..8]) or + my $newaln=$aln->remove_columns(-selection=>[3,6..8],-toggle=>1,-keepgaponly=>0) + Function : Modify the aligment with columns removed corresponding to the specified type or by specifying the columns by number. - Returns : Bio::SimpleAlign object - Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'| - 'all_gaps_columns') or array ref where the referenced array + remove_columns is a variance of select_columns(-toogle=>1) + Returns : 1 + Args : Array ref of types ('match'|'weak'|'strong'|'mismatch') + or array ref where the referenced array contains a pair of integers that specify a range. - The first column is 0 + use remove_gaps to remove columns containing gaps + The first column is 1 + See also select_columns =cut sub remove_columns { - my ($self,@args) = @_; - @args || $self->throw("Must supply column ranges or column types"); - my $aln; + my ($self,@args) = @_; + @args || $self->throw("Must supply column ranges or column types"); + my ($sel, $toggle,$gap) = $self->_rearrange([qw(SELECTION TOGGLE KEEPGAPONLY)], @args); + my $newaln=$self->select_columns(-selection=>$sel,-toggle=>$toggle?0:1,-keepgaponly=>$gap); + return $newaln; +} - if ($args[0][0] =~ /^[a-z_]+$/i) { - $aln = $self->_remove_columns_by_type($args[0]); - } elsif ($args[0][0] =~ /^\d+$/) { - $aln = $self->_remove_columns_by_num(\@args); - } else { - $self->throw("You must pass array references to remove_columns(), not @args"); - } - # fix for meta, sf, ann - $aln; +sub _cont_coords { + #This function is used to merge the coordinates from select and remove functions in order to reduce the number of calculations in select and #remove. For exmaple, if the input of remove_columns is remove_columns([2,5,7..10]), this function will transform ([2,5,7..10]) to + # ([2,2,5,5,7,10]). + + my ($old_coords)=@_; + @{$old_coords}=sort {$a<=>$b} @{$old_coords}; + + my $cont_coords; + + push @{$cont_coords},$old_coords->[0]; + for(my $num=0;$num<@{$old_coords};) { + if($num==@{$old_coords}-1) { + #for single selection + push @{$cont_coords},$old_coords->[$num]; + last; + } + if($old_coords->[$num+1]-$old_coords->[$num]>1) { + if($num+2==@{$old_coords}) { + push @{$cont_coords},$old_coords->[$num],$old_coords->[$num+1],$old_coords->[$num+1]; + last; + } + else { + push @{$cont_coords},$old_coords->[$num],$old_coords->[$num+1]; + } + } + else { + if ($num+2==@{$old_coords}) { + push @{$cont_coords},$old_coords->[$num+1]; + last; + } + } + $num++; + } + return $cont_coords; +} + + +sub _toggle_selection { + #This function is used to toggle the selection of sequences or columns + my ($old_coords,$length)=@_; + my %hash=map {$_=>1} @{$old_coords}; + my $new_coords; + for(my $num=1;$num<=$length;$num++) { + unless(defined($hash{$num})) { + push @{$new_coords},$num; + } + } + return $new_coords; } =head2 remove_gaps Title : remove_gaps - Usage : $aln2 = $aln->remove_gaps + Usage : $aln2 = $aln->remove_gaps(-reference=>5) Function : Creates an aligment with gaps removed Returns : a Bio::SimpleAlign object - Args : a gap character(optional) if none specified taken + Args : -GAPCHAR a gap character(optional) if none specified taken from $self->gap_char, - [optional] $all_gaps_columns flag (1 or 0, default is 0) - indicates that only all-gaps columns should be deleted - -Used from method L in most cases. Set gap character -using L. + -ALLGAPCOL $all_gaps_columns flag (1 or 0, default is 0) + indicates that only all-gaps columns should be deleted + -REFERENCE splices all aligned sequences where the specified + sequence has gaps. =cut sub remove_gaps { - my ($self,$gapchar,$all_gaps_columns) = @_; - my $gap_line; - if ($all_gaps_columns) { - $gap_line = $self->all_gap_line($gapchar); - } else { - $gap_line = $self->gap_line($gapchar); - } + my $self=shift @_; + my ($gapchar,$all_gaps_columns,$reference) = $self->_rearrange([qw(GAPCHAR ALLGAPCOL REFERENCE)], @_); + my $gap_line; + if($reference) { + my $refseq=$self->get_Seq_by_pos($reference); + $gap_line=$refseq->seq(); + } + else { + if ($all_gaps_columns) { + $gap_line = $self->all_gap_line($gapchar); + } else { + $gap_line = $self->gap_line($gapchar); + } + } + my $aln = $self->new; - my @remove; my $length = 0; my $del_char = $gapchar || $self->gap_char; # Do the matching to get the segments to remove + my $removed_cols; while ($gap_line =~ m/[$del_char]/g) { +<<<<<<< HEAD + push @{$removed_cols}, pos($gap_line); +======= my $start = pos($gap_line)-1; $gap_line =~ m/\G[$del_char]+/gc; my $end = pos($gap_line)-1; @@ -1262,251 +1620,790 @@ sub remove_gaps { $end -=$length; $length += ($end-$start+1); push @remove, [$start,$end]; +>>>>>>> upstream/master } - #remove the segments - $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self; + $aln = $#$removed_cols >= 0 ? $self->select_columns($removed_cols,1) : $self; # fix for meta, sf, ann return $aln; } - -sub _remove_col { - my ($self,$aln,$remove) = @_; - my @new; - - my $gap = $self->gap_char; - - # splice out the segments and create new seq - foreach my $seq($self->each_seq){ - my $new_seq = Bio::LocatableSeq->new( - -id => $seq->id, - -alphabet=> $seq->alphabet, - -strand => $seq->strand, - -verbose => $self->verbose); - my $sequence = $seq->seq; - foreach my $pair(@{$remove}){ - my $start = $pair->[0]; - my $end = $pair->[1]; - $sequence = $seq->seq unless $sequence; - my $orig = $sequence; - my $head = $start > 0 ? substr($sequence, 0, $start) : ''; - my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1); - $sequence = $head.$tail; - # start - unless (defined $new_seq->start) { - if ($start == 0) { - my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g; - $new_seq->start($seq->start + $end + 1 - $start_adjust); - } - else { - my $start_adjust = $orig =~ /^$gap+/; - if ($start_adjust) { - $start_adjust = $+[0] == $start; - } - $new_seq->start($seq->start + $start_adjust); - } - } - # end - if (($end + 1) >= CORE::length($orig)) { - my $end_adjust = () = substr($orig, $start) =~ /$gap/g; - $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust); - } - else { - $new_seq->end($seq->end); - } - } - - if ($new_seq->end < $new_seq->start) { - # we removed all columns except for gaps: set to 0 to indicate no - # sequence - $new_seq->start(0); - $new_seq->end(0); - } - - $new_seq->seq($sequence) if $sequence; - push @new, $new_seq; - } - # add the new seqs to the alignment - foreach my $new(@new){ - $aln->add_seq($new); - } - # fix for meta, sf, ann - return $aln; +sub splice_by_seq_pos{ + my $self = shift; + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"splice_by_seq_pos - deprecated method. Use remove_gaps() instead."); + $self->remove_gaps(-reference=>$_[0]); } -sub _remove_columns_by_type { - my ($self,$type) = @_; - my $aln = $self->new; - my @remove; +=head2 mask_columns - my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type}); - my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type}); - my %matchchars = ( 'match' => '\*', - 'weak' => '\.', - 'strong' => ':', - 'mismatch' => ' ', - 'gaps' => '', - 'all_gaps_columns' => '' - ); - # get the characters to delete against - my $del_char; - foreach my $type (@{$type}){ - $del_char.= $matchchars{$type}; - } + Title : mask_columns + Usage : $aln2 = $aln->mask_columns([3..8,11..22]) + $aln2 = $aln->mask_columns(-selection=>[2,5,7..10],-toggle=>1) + Function : Masks slices of the alignment inclusive of the defined + columns, and the first column in the alignment is denoted 1. + Mask beyond the length of the sequence does not do padding. + Returns : A Bio::SimpleAlign object + Args : Positive integers should be used to defined the column numbers + The mask character should be defined by $aln->mask_char() or "?" as + default. An optional parameter can be defined to toggle the + coordinate selection. + Note : - my $length = 0; - my $match_line = $self->match_line; - # do the matching to get the segments to remove - if($del_char){ - while($match_line =~ m/[$del_char]/g ){ - my $start = pos($match_line)-1; - $match_line=~/\G[$del_char]+/gc; - my $end = pos($match_line)-1; +=cut - #have to offset the start and end for subsequent removes - $start-=$length; - $end -=$length; - $length += ($end-$start+1); - push @remove, [$start,$end]; - } +sub mask_columns { + #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing + my $self=shift; + my ($sel, $toggle) = $self->_rearrange([qw(SELECTION TOGGLE)], @_); + @{$sel}=sort {$a<=>$b} @{$sel}; + + my $nonres = join("",$self->gap_char, $self->match_char,$self->missing_char); + my $mask_char=$self->mask_char; + + #Exceptions + $self->throw("Mask start has to be a positive integer and less than ". + "alignment length, not [".$sel->[0]."]") + unless $sel->[0] =~ /^\d+$/ && $sel->[0] > 0 && $sel->[0] <= $self->length; + + $self->throw("Mask end has to be a positive integer and less than ". + "alignment length, not [".$sel->[$#$sel]."]") + unless $sel->[$#$sel] =~ /^\d+$/ && $sel->[$#$sel] > 0 && $sel->[$#$sel] <= $self->length; + + #calculate the coords, and make it in a continous way + my @newcoords; + if($toggle) { + @newcoords=@{_cont_coords(_toggle_selection($sel,$self->length))}; } - - # remove the segments - $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self; - $aln = $aln->remove_gaps() if $gap; - $aln = $aln->remove_gaps('', 1) if $all_gaps_columns; - # fix for meta, sf, ann - $aln; -} - - -sub _remove_columns_by_num { - my ($self,$positions) = @_; + else { + @newcoords=@{_cont_coords($sel)}; + } + my $aln = $self->new; + $aln->id($self->id); + - # sort the positions - @$positions = sort { $a->[0] <=> $b->[0] } @$positions; - - my @remove; - my $length = 0; - foreach my $pos (@{$positions}) { - my ($start, $end) = @{$pos}; - - #have to offset the start and end for subsequent removes - $start-=$length; - $end -=$length; - $length += ($end-$start+1); - push @remove, [$start,$end]; - } - - #remove the segments - $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self; - # fix for meta, sf, ann - $aln; + foreach my $seq ( $self->next_Seq() ) { + my $new_seq = Bio::LocatableSeq->new(-id => $seq->id, + -alphabet => $seq->alphabet, + -strand => $seq->strand, + -verbose => $self->verbose); + my $dna_string=$seq->seq(); + my $masked_string; + for(my $num=0;$num<@newcoords;) { + my ($start,$end)=@newcoords[$num,$num+1]; + $masked_string = substr($dna_string, $start - 1, $end - $start + 1); + $masked_string =~ s{[^$nonres]}{$mask_char}g; + $dna_string = substr($dna_string,0,$start-1) . $masked_string . substr($dna_string,$end); + $num+=2; + } + $new_seq->seq($dna_string); + $aln->add_Seq($new_seq); + } + return $aln; } +=head2 seq_with_features + Title : seq_with_features + Usage : $seq = $aln->seq_with_features(-pos => 1, + -consensus => 60 + -mask => + sub { my $consensus = shift; -=head1 Change sequences within the MSA + for my $i (1..5){ + my $n = 'N' x $i; + my $q = '\?' x $i; + while($consensus =~ /[^?]$q[^?]/){ + $consensus =~ s/([^?])$q([^?])/$1$n$2/; + } + } + return $consensus; + } + ); + Function: produces a Bio::Seq object by first splicing gaps from -pos + (by means of a remove_gaps(-reference=>1) call), then creating + features using non-? chars (by means of a consensus_string() + call with stringency -consensus). + Returns : a Bio::Seq object + Args : -pos : required. sequence from which to build the Bio::Seq + object + -consensus : optional, defaults to consensus_string()'s + default cutoff value + -mask : optional, a coderef to apply to consensus_string()'s + output before building features. this may be useful for + closing gaps of 1 bp by masking over them with N, for + instance -These methods affect characters in all sequences without changing the -alignment. +=cut -=head2 splice_by_seq_pos +sub seq_with_features{ + my ($self,%arg) = @_; - Title : splice_by_seq_pos - Usage : $status = splice_by_seq_pos(1); - Function: splices all aligned sequences where the specified sequence - has gaps. - Example : - Returns : 1 on success - Args : position of sequence to splice by + #first do the preparatory splice + $self->throw("must provide a -pos argument") unless $arg{-pos}; + $self->remove_gaps(-reference=>$arg{-pos}); + my $consensus_string = $self->consensus_string($arg{-consensus}); + $consensus_string = $arg{-mask}->($consensus_string) + if defined($arg{-mask}); -=cut + my(@bs,@es); -sub splice_by_seq_pos{ - my ($self,$pos) = @_; + push @bs, 1 if $consensus_string =~ /^[^?]/; - my $guide = $self->get_seq_by_pos($pos); - my $guide_seq = $guide->seq; + while($consensus_string =~ /\?[^?]/g){ + push @bs, pos($consensus_string); + } + while($consensus_string =~ /[^?]\?/g){ + push @es, pos($consensus_string); + } - $guide_seq =~ s/\./\-/g; + push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/; - my @gaps = (); - $pos = -1; - while(($pos = index($guide_seq, '-', $pos)) > -1 ){ - unshift @gaps, $pos; - $pos++; - } + my $seq = Bio::Seq->new(); + +# my $rootfeature = Bio::SeqFeature::Generic->new( +# -source_tag => 'location', +# -start => $self->get_seq_by_pos($arg{-pos})->start, +# -end => $self->get_seq_by_pos($arg{-pos})->end, +# ); +# $seq->add_SeqFeature($rootfeature); - foreach my $seq ($self->each_seq){ - my @bases = split '', $seq->seq; + while(my $b = shift @bs){ + my $e = shift @es; + $seq->add_SeqFeature( + Bio::SeqFeature::Generic->new( + -start => $b - 1 + $self->get_Seq_by_pos($arg{-pos})->start, + -end => $e - 1 + $self->get_Seq_by_pos($arg{-pos})->start, + -source_tag => $self->source || 'MSA', + ) + ); + } + + return $seq; +} + + +=head1 Change sequences within the MSA + +These methods affect characters in all sequences without changing the +alignment. + + +=head2 map_chars + + Title : map_chars + Usage : $ali->map_chars('\.','-') + Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap + characters. + + Note that the first argument is interpreted as a regexp + so be careful and escape any wild card characters (e.g. + do $ali->map_chars('\.','-') to replace periods with dashes. + Returns : + Argument : A regexp and a string + +=cut + +sub map_chars { + my $self = shift; + my $from = shift; + my $to = shift; + my ( $seq, $temp ); + + $self->throw("Need two arguments: a regexp and a string") + unless defined $from and defined $to; + +<<<<<<< HEAD + foreach $seq ( $self->next_Seq() ) { + $temp = $seq->seq(); + $temp =~ s/$from/$to/g; + $seq->seq($temp); +======= + foreach $seq ( $self->each_seq() ) { + $temp = $seq->seq(); + $temp =~ s/$from/$to/g; + $seq->seq($temp); +>>>>>>> upstream/master + } + return 1; +} + + +=head2 uppercase + + Title : uppercase() + Usage : $ali->uppercase() + Function : Sets all the sequences to uppercase + Returns : 1 + Argument : None + +=cut + +sub uppercase { + my $self = shift; + my $seq; + my $temp; + + foreach $seq ( $self->next_Seq() ) { + $temp = $seq->seq(); + $temp =~ tr/[a-z]/[A-Z]/; + + $seq->seq($temp); + } + return 1; +} + +=head2 lowercase + + Title : lowercase() + Usage : $ali->lowercase() + Function : Sets all the sequences to lowercase + Returns : 1 + Argument : None + +=cut + +sub lowercase { + my $self = shift; + my $seq; + my $temp; + + foreach $seq ( $self->next_Seq() ) { + $temp = $seq->seq(); + $temp =~ tr/[A-Z]/[a-z]/; + + $seq->seq($temp); + } + return 1; +} + + +=head2 togglecase + + Title : togglecase() + Usage : $ali->togglecase() + Function : Sets all the sequences to opposite case + Returns : 1 + Argument : None + +=cut + +<<<<<<< HEAD +sub togglecase { + my $self = shift; + my $seq; + my $temp; +======= +sub gap_line { + my ($self,$gapchar) = @_; + $gapchar = $gapchar || $self->gap_char; + my %gap_hsh; # column gaps vector + foreach my $seq ( $self->each_seq ) { + my $i = 0; + map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/} + map {[$i++, $_]} split(//, uc ($seq->seq)); + } + my $gap_line; + foreach my $pos ( 0..$self->length-1 ) { + $gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.'; + } + return $gap_line; +} + +=head2 all_gap_line + + Title : all_gap_line() + Usage : $line = $align->all_gap_line() + Function : Generates a gap line - much like consensus string + except that a line where '-' represents all-gap column + Args : (optional) gap line characters ('-' by default) + Returns : string +>>>>>>> upstream/master + + foreach $seq ( $self->next_Seq() ) { + $temp = $seq->seq(); + $temp =~ tr/[A-Za-z]/[a-zA-Z]/; + +<<<<<<< HEAD + $seq->seq($temp); +======= +sub all_gap_line { + my ($self,$gapchar) = @_; + $gapchar = $gapchar || $self->gap_char; + my %gap_hsh; # column gaps counter hash + my @seqs = $self->each_seq; + foreach my $seq ( @seqs ) { + my $i = 0; + map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/} + map {[$i++, $_]} split(//, uc ($seq->seq)); + } + my $gap_line; + foreach my $pos ( 0..$self->length-1 ) { + if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) { + # gaps column + $gap_line .= $self->gap_char; + } else { + $gap_line .= '.'; + } +>>>>>>> upstream/master + } + return 1; +} + +<<<<<<< HEAD +======= +=head2 gap_col_matrix + + Title : gap_col_matrix() + Usage : my $cols = $align->gap_col_matrix() + Function : Generates an array where each element in the array is a + hash reference with a key of the sequence name and a + value of 1 if the sequence has a gap at that column + Returns : Reference to an array + Args : Optional: gap line character ($aln->gap_char or '-' by default) + +=cut + +sub gap_col_matrix { + my ( $self, $gapchar ) = @_; + $gapchar = $gapchar || $self->gap_char; + my %gap_hsh; # column gaps vector + my @cols; + foreach my $seq ( $self->each_seq ) { + my $i = 0; + my $str = $seq->seq; + my $len = $seq->length; + my $ch; + my $id = $seq->display_id; + while ( $i < $len ) { + $ch = substr( $str, $i, 1 ); + $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ ); + } + } + return \@cols; +} +>>>>>>> upstream/master + +=head2 match + + Title : match() + Usage : $aln->match() + Function : Goes through all columns and changes residues that are + identical to residue in first sequence to match '.' + character. Sets match_char. + + USE WITH CARE: Most MSA formats do not support match + characters in sequences, so this is mostly for output + only. NEXUS format (Bio::AlignIO::nexus) can handle + it. + Returns : 1 + Argument : a match character, optional, defaults to '.' + If the character is defined, it will reset $aln->match_char + +=cut + +sub match { + my ($self, $match) = @_; + + if(defined $match) { + $self->match_char($match); + } + else { + $match=$self->match_char(); + } + + my ($matching_char) = $match; + $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #'; + $self->map_chars($matching_char, '-'); + + my @seqs = $self->next_Seq(); + return 1 unless scalar @seqs > 1; + + my $refseq = shift @seqs ; + my @refseq = split //, $refseq->seq; + my $gapchar = $self->gap_char; + + foreach my $seq ( @seqs ) { + my @varseq = split //, $seq->seq(); + for ( my $i=0; $i < scalar @varseq; $i++) { + $varseq[$i] = $match if defined $refseq[$i] && + ( $refseq[$i] =~ /[A-Za-z\*]/ || + $refseq[$i] =~ /$gapchar/ ) + && $refseq[$i] eq $varseq[$i]; + } + $seq->seq(join '', @varseq); + } + + return 1; +} + + +=head2 unmatch + + Title : unmatch() + Usage : $ali->unmatch() + Function : Undoes the effect of method match. Unsets match_char. + Returns : 1 + Argument : a match character, optional, defaults to '.' + +See L and L + +=cut + +sub unmatch { + my ($self, $match) = @_; + + if(defined $match) { + $self->match_char($match); + } + else { + $match=$self->match_char(); + } + + my @seqs = $self->next_Seq(); + return 1 unless scalar @seqs > 1; + + my $refseq = shift @seqs ; + my @refseq = split //, $refseq->seq; + my $gapchar = $self->gap_char; + foreach my $seq ( @seqs ) { + my @varseq = split //, $seq->seq(); + for ( my $i=0; $i < scalar @varseq; $i++) { + $varseq[$i] = $refseq[$i] if defined $refseq[$i] && + ( $refseq[$i] =~ /[A-Za-z\*]/ || + $refseq[$i] =~ /$gapchar/ ) && + $varseq[$i] eq $match; + } + $seq->seq(join '', @varseq); + } + $self->match_char(''); + return 1; +} + + +=head1 Consensus sequences + +Methods to calculate consensus sequences for the MSA + +=head2 consensus_string + + Title : consensus_string + Usage : $str = $ali->consensus_string($threshold_percent) + Function : Makes a strict consensus + Returns : Consensus string + Argument : Optional treshold ranging from 0 to 100. + The consensus residue has to appear at least threshold % + of the sequences at a given location, otherwise a '?' + character will be placed at that location. + (Default value = 0%) + +=cut + +sub consensus_string { + my $self = shift; + my $threshold = shift; + + my $out = ""; + my $len = $self->length - 1; + + foreach ( 0 .. $len ) { + $out .= $self->_consensus_aa($_,$threshold); + } + return $out; +} + +sub _consensus_aa { + my $self = shift; + my $point = shift; + my $threshold_percent = shift || -1 ; + my ($seq,%hash,$count,$letter,$key); + my $gapchar = $self->gap_char; + foreach $seq ( $self->next_Seq() ) { + $letter = substr($seq->seq,$point,1); + $self->throw("--$point-----------") if $letter eq ''; + ($letter eq $gapchar || $letter =~ /\./) && next; + # print "Looking at $letter\n"; + $hash{$letter}++; + } + my $number_of_sequences = $self->num_sequences(); + my $threshold = $number_of_sequences * $threshold_percent / 100. ; + $count = -1; + $letter = '?'; + + foreach $key ( sort keys %hash ) { + # print "Now at $key $hash{$key}\n"; + if( $hash{$key} > $count && $hash{$key} >= $threshold) { + $letter = $key; + $count = $hash{$key}; + } + } + return $letter; +} + +=head2 consensus_conservation + + Title : consensus_conservation + Usage : @conservation = $ali->consensus_conservation(); + Function : Conservation (as a percent) of each position of alignment + Returns : Array of percentages [0-100]. Gap columns are 0% conserved. + Argument : + +=cut + +sub consensus_conservation { + my $self = shift; + my @cons; + my $num_sequences = $self->num_sequences; + foreach my $point (0..$self->length-1) { + my %hash = $self->_consensus_counts($point); + # max frequency of a non-gap letter + my $max = (sort {$b<=>$a} values %hash )[0]; + push @cons, 100 * $max / $num_sequences; + } + return @cons; +} + +# Frequency of each letter in one column +sub _consensus_counts { + my $self = shift; + my $point = shift; + my %hash; + my $gapchar = $self->gap_char; + foreach my $seq ( $self->next_Seq() ) { + my $letter = substr($seq->seq,$point,1); + $self->throw("--$point-----------") if $letter eq ''; + ($letter eq $gapchar || $letter =~ /\./) && next; + $hash{$letter}++; + } + return %hash; +} + +=head2 consensus_iupac + + Title : consensus_iupac + Usage : $str = $ali->consensus_iupac() + Function : Makes a consensus using IUPAC ambiguity codes from DNA + and RNA. The output is in upper case except when gaps in + a column force output to be in lower case. + + Note that if your alignment sequences contain a lot of + IUPAC ambiquity codes you often have to manually set + alphabet. Bio::PrimarySeq::_guess_type thinks they + indicate a protein sequence. + Returns : consensus string + Argument : none + Throws : on protein sequences + +=cut + +sub consensus_iupac { + my $self = shift; + my $out = ""; + my $len = $self->length-1; + + # only DNA and RNA sequences are valid + foreach my $seq ( $self->next_Seq() ) { + $self->throw("Seq [". $seq->get_nse. "] is a protein") + if $seq->alphabet eq 'protein'; + } + # loop over the alignment columns + foreach my $count ( 0 .. $len ) { + $out .= $self->_consensus_iupac($count); + } + return $out; +} + +sub _consensus_iupac { + my ($self, $column) = @_; + my ($string, $char, $rna); + + #determine all residues in a column + foreach my $seq ( $self->next_Seq() ) { + $string .= substr($seq->seq, $column, 1); + } + $string = uc $string; + + # quick exit if there's an N in the string + if ($string =~ /N/) { + $string =~ /\W/ ? return 'n' : return 'N'; + } + # ... or if there are only gap characters + return '-' if $string =~ /^\W+$/; + + # treat RNA as DNA in regexps + if ($string =~ /U/) { + $string =~ s/U/T/; + $rna = 1; + } + + # the following s///'s only need to be done to the _first_ ambiguity code + # as we only need to see the _range_ of characters in $string + + if ($string =~ /[VDHB]/) { + $string =~ s/V/AGC/; + $string =~ s/D/AGT/; + $string =~ s/H/ACT/; + $string =~ s/B/CTG/; + } + + if ($string =~ /[SKYRWM]/) { + $string =~ s/S/GC/; + $string =~ s/K/GT/; + $string =~ s/Y/CT/; + $string =~ s/R/AG/; + $string =~ s/W/AT/; + $string =~ s/M/AC/; + } + + # and now the guts of the thing + + if ($string =~ /A/) { + $char = 'A'; # A A + if ($string =~ /G/) { + $char = 'R'; # A and G (purines) R + if ($string =~ /C/) { + $char = 'V'; # A and G and C V + if ($string =~ /T/) { + $char = 'N'; # A and G and C and T N + } + } elsif ($string =~ /T/) { + $char = 'D'; # A and G and T D + } + } elsif ($string =~ /C/) { + $char = 'M'; # A and C M + if ($string =~ /T/) { + $char = 'H'; # A and C and T H + } + } elsif ($string =~ /T/) { + $char = 'W'; # A and T W + } + } elsif ($string =~ /C/) { + $char = 'C'; # C C + if ($string =~ /T/) { + $char = 'Y'; # C and T (pyrimidines) Y + if ($string =~ /G/) { + $char = 'B'; # C and T and G B + } + } elsif ($string =~ /G/) { + $char = 'S'; # C and G S + } + } elsif ($string =~ /G/) { + $char = 'G'; # G G + if ($string =~ /C/) { + $char = 'S'; # G and C S + } elsif ($string =~ /T/) { + $char = 'K'; # G and T K + } + } elsif ($string =~ /T/) { + $char = 'T'; # T T + } - splice(@bases, $_, 1) foreach @gaps; - $seq->seq(join('', @bases)); - } + $char = 'U' if $rna and $char eq 'T'; + $char = lc $char if $string =~ /\W/; - 1; + return $char; } -=head2 map_chars - Title : map_chars - Usage : $ali->map_chars('\.','-') - Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap - characters. +=head2 consensus_meta - Note that the first argument is interpreted as a regexp - so be careful and escape any wild card characters (e.g. - do $ali->map_chars('\.','-') to replace periods with dashes. - Returns : - Argument : A regexp and a string + Title : consensus_meta + Usage : $seqmeta = $ali->consensus_meta() + Function : Returns a Bio::Seq::Meta object containing the consensus + strings derived from meta data analysis. + Returns : Bio::Seq::Meta + Argument : Bio::Seq::Meta + Throws : non-MetaI object =cut -sub map_chars { - my $self = shift; - my $from = shift; - my $to = shift; - my ( $seq, $temp ); - - $self->throw("Need two arguments: a regexp and a string") - unless defined $from and defined $to; - - foreach $seq ( $self->each_seq() ) { - $temp = $seq->seq(); - $temp =~ s/$from/$to/g; - $seq->seq($temp); +sub consensus_meta { + my ($self, $meta) = @_; + if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) { + $self->throw('Not a Bio::Seq::MetaI object'); } - return 1; + return $self->{'_aln_meta'} = $meta if $meta; + return $self->{'_aln_meta'} } -=head2 uppercase +=head2 bracket_string - Title : uppercase() - Usage : $ali->uppercase() - Function : Sets all the sequences to uppercase - Returns : - Argument : + Title : bracket_string + Usage : my @params = (-refseq => 'testseq', + -allele1 => 'allele1', + -allele2 => 'allele2', + -delimiters => '{}', + -separator => '/'); + $str = $aln->bracket_string(@params) -=cut + Function : When supplied with a list of parameters (see below), returns a + string in BIC format. This is used for allelic comparisons. + Briefly, if either allele contains a base change when compared to + the refseq, the base or gap for each allele is represented in + brackets in the order present in the 'alleles' parameter. -sub uppercase { - my $self = shift; - my $seq; - my $temp; + For the following data: - foreach $seq ( $self->each_seq() ) { - $temp = $seq->seq(); - $temp =~ tr/[a-z]/[A-Z]/; + >testseq + GGATCCATTGCTACT + >allele1 + GGATCCATTCCTACT + >allele2 + GGAT--ATTCCTCCT - $seq->seq($temp); + the returned string with parameters 'refseq => testseq' and + 'alleles => [qw(allele1 allele2)]' would be: + + GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT + Returns : BIC-formatted string + Argument : Required args + refseq : string (ID) of the reference sequence used + as basis for comparison + allele1 : string (ID) of the first allele + allele2 : string (ID) of the second allele + Optional args + delimiters: two symbol string of left and right delimiters. + Only the first two symbols are used + default = '[]' + separator : string used as a separator. Only the first + symbol is used + default = '/' + Throws : On no refseq/alleles, or invalid refseq/alleles. + +=cut + +sub bracket_string { + my ($self, @args) = @_; + my ($ref, $a1, $a2, $delim, $sep) = + $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args); + $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref); + my ($ld, $rd); + ($ld, $rd) = split('', $delim, 2) if $delim; + $ld ||= '['; + $rd ||= ']'; + $sep ||= '/'; + my ($refseq, $allele1, $allele2) = + map {( $self->next_Seq_with_id($_) )} ($ref, $a1, $a2); + if (!$refseq || !$allele1 || !$allele2) { + $self->throw("One of your refseq/allele IDs is invalid!"); } - return 1; + my $len = $self->length-1; + my $bic = ''; + # loop over the alignment columns + for my $column ( 0 .. $len ) { + my $string; + my ($compres, $res1, $res2) = + map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2); + # are any of the allele symbols different from the refseq? + $string = ($compres eq $res1 && $compres eq $res2) ? $compres : + $ld.$res1.$sep.$res2.$rd; + $bic .= $string; + } + return $bic; } + =head2 cigar_line Title : cigar_line() @@ -1532,7 +2429,7 @@ sub cigar_line { # create a precursor, something like (1,4,5,6,7,33,45), # where each number corresponds to a conserved position - foreach my $seq ( $self->each_seq ) { + foreach my $seq ( $self->next_Seq ) { my @seq = split "", uc ($seq->seq); my $pos = 1; for (my $x = 0 ; $x < $len ; $x++ ) { @@ -1587,7 +2484,7 @@ sub cigar_line { =head2 match_line Title : match_line() - Usage : $line = $align->match_line() + Usage : $line = $aln->match_line() Function : Generates a match line - much like consensus string except that a line indicating the '*' for a match. Args : (optional) Match line characters ('*' by default) @@ -1607,7 +2504,7 @@ sub match_line { my @seqchars; my $alphabet; - foreach my $seq ( $self->each_seq ) { + foreach my $seq ( $self->next_Seq ) { push @seqchars, [ split(//, uc ($seq->seq)) ]; $alphabet = $seq->alphabet unless defined $alphabet; } @@ -1676,7 +2573,7 @@ sub match_line { =head2 gap_line Title : gap_line() - Usage : $line = $align->gap_line() + Usage : $line = $aln->gap_line() Function : Generates a gap line - much like consensus string except that a line where '-' represents gap Args : (optional) gap line characters ('-' by default) @@ -1688,14 +2585,14 @@ sub gap_line { my ($self,$gapchar) = @_; $gapchar = $gapchar || $self->gap_char; my %gap_hsh; # column gaps vector - foreach my $seq ( $self->each_seq ) { + foreach my $seq ( $self->next_Seq ) { my $i = 0; - map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/} + map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar} map {[$i++, $_]} split(//, uc ($seq->seq)); } my $gap_line; foreach my $pos ( 0..$self->length-1 ) { - $gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.'; + $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.'; } return $gap_line; } @@ -1703,7 +2600,7 @@ sub gap_line { =head2 all_gap_line Title : all_gap_line() - Usage : $line = $align->all_gap_line() + Usage : $line = $aln->all_gap_line() Function : Generates a gap line - much like consensus string except that a line where '-' represents all-gap column Args : (optional) gap line characters ('-' by default) @@ -1715,17 +2612,17 @@ sub all_gap_line { my ($self,$gapchar) = @_; $gapchar = $gapchar || $self->gap_char; my %gap_hsh; # column gaps counter hash - my @seqs = $self->each_seq; + my @seqs = $self->next_Seq; foreach my $seq ( @seqs ) { my $i = 0; - map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/} + map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar} map {[$i++, $_]} split(//, uc ($seq->seq)); } my $gap_line; foreach my $pos ( 0..$self->length-1 ) { if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) { # gaps column - $gap_line .= $self->gap_char; + $gap_line .= $gapchar; } else { $gap_line .= '.'; } @@ -1736,118 +2633,35 @@ sub all_gap_line { =head2 gap_col_matrix Title : gap_col_matrix() - Usage : my $cols = $align->gap_col_matrix() - Function : Generates an array where each element in the array is a - hash reference with a key of the sequence name and a - value of 1 if the sequence has a gap at that column - Returns : Reference to an array - Args : Optional: gap line character ($aln->gap_char or '-' by default) + Usage : my $cols = $aln->gap_col_matrix() + Function : Generates an array of hashes where + each entry in the array is a hash reference + with keys of all the sequence names and + and value of 1 or 0 if the sequence has a gap at that column + Args : (optional) gap line characters ($aln->gap_char or '-' by default) =cut sub gap_col_matrix { - my ( $self, $gapchar ) = @_; + my ($self,$gapchar) = @_; $gapchar = $gapchar || $self->gap_char; - my %gap_hsh; # column gaps vector + my %gap_hsh; # column gaps vector my @cols; - foreach my $seq ( $self->each_seq ) { - my $i = 0; - my $str = $seq->seq; - my $len = $seq->length; - my $ch; - my $id = $seq->display_id; - while ( $i < $len ) { - $ch = substr( $str, $i, 1 ); - $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ ); - } - } - return \@cols; -} - -=head2 match - - Title : match() - Usage : $ali->match() - Function : Goes through all columns and changes residues that are - identical to residue in first sequence to match '.' - character. Sets match_char. - - USE WITH CARE: Most MSA formats do not support match - characters in sequences, so this is mostly for output - only. NEXUS format (Bio::AlignIO::nexus) can handle - it. - Returns : 1 - Argument : a match character, optional, defaults to '.' - -=cut - -sub match { - my ($self, $match) = @_; - - $match ||= '.'; - my ($matching_char) = $match; - $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #'; - $self->map_chars($matching_char, '-'); - - my @seqs = $self->each_seq(); - return 1 unless scalar @seqs > 1; - - my $refseq = shift @seqs ; - my @refseq = split //, $refseq->seq; - my $gapchar = $self->gap_char; - - foreach my $seq ( @seqs ) { - my @varseq = split //, $seq->seq(); - for ( my $i=0; $i < scalar @varseq; $i++) { - $varseq[$i] = $match if defined $refseq[$i] && - ( $refseq[$i] =~ /[A-Za-z\*]/ || - $refseq[$i] =~ /$gapchar/ ) - && $refseq[$i] eq $varseq[$i]; + foreach my $seq ( $self->next_Seq ) { + my $i = 0; + my $str = $seq->seq; + my $len = $seq->length; + my $ch; + my $id = $seq->display_id; + while( $i < $len ) { + $ch = substr($str, $i, 1); + $cols[$i++]->{$id} = ($ch eq $gapchar); } - $seq->seq(join '', @varseq); } - $self->match_char($match); - return 1; + return \@cols; } -=head2 unmatch - - Title : unmatch() - Usage : $ali->unmatch() - Function : Undoes the effect of method match. Unsets match_char. - Returns : 1 - Argument : a match character, optional, defaults to '.' - -See L and L - -=cut - -sub unmatch { - my ($self, $match) = @_; - - $match ||= '.'; - - my @seqs = $self->each_seq(); - return 1 unless scalar @seqs > 1; - - my $refseq = shift @seqs ; - my @refseq = split //, $refseq->seq; - my $gapchar = $self->gap_char; - foreach my $seq ( @seqs ) { - my @varseq = split //, $seq->seq(); - for ( my $i=0; $i < scalar @varseq; $i++) { - $varseq[$i] = $refseq[$i] if defined $refseq[$i] && - ( $refseq[$i] =~ /[A-Za-z\*]/ || - $refseq[$i] =~ /$gapchar/ ) && - $varseq[$i] eq $match; - } - $seq->seq(join '', @varseq); - } - $self->match_char(''); - return 1; -} - =head1 MSA attributes Methods for setting and reading the MSA attributes. @@ -1904,37 +2718,58 @@ sub accession { Returns : An description string Argument : An description string (optional) -=cut +=cut + +sub description { + my ($self, $name) = @_; + + if (defined( $name )) { + $self->{'_description'} = $name; + } + + return $self->{'_description'}; +} + +=head2 source + + Title : source + Usage : $aln->source($newval) + Function: sets the Alignment source program + Example : + Returns : value of source + Args : newvalue (optional) + -sub description { - my ($self, $name) = @_; +=cut - if (defined( $name )) { - $self->{'_description'} = $name; +sub source{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_source'} = $value; } - - return $self->{'_description'}; + return $self->{'_source'}; } =head2 missing_char Title : missing_char - Usage : $myalign->missing_char("?") + Usage : $myalign->missing_char("&") Function : Gets/sets the missing_char attribute of the alignment It is generally recommended to set it to 'n' or 'N' for nucleotides and to 'X' for protein. Returns : An missing_char string, - Argument : An missing_char string (optional) + Argument : An missing_char string (optional), default as '&' =cut sub missing_char { my ($self, $char) = @_; - - if (defined $char ) { - $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1; - $self->{'_missing_char'} = $char; - } + + if (defined $char || ! defined $self->{'_missing_char'} ) { + $char= '&' unless defined $char; + $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1; + $self->{'_missing_char'} = $char; + } return $self->{'_missing_char'}; } @@ -1945,17 +2780,18 @@ sub missing_char { Usage : $myalign->match_char('.') Function : Gets/sets the match_char attribute of the alignment Returns : An match_char string, - Argument : An match_char string (optional) + Argument : An match_char string (optional), default as '.' =cut sub match_char { my ($self, $char) = @_; - if (defined $char ) { - $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1; - $self->{'_match_char'} = $char; - } + if (defined $char || ! defined $self->{'_match_char'} ) { + $char= '.' unless defined $char; + $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1; + $self->{'_match_char'} = $char; + } return $self->{'_match_char'}; } @@ -1966,23 +2802,48 @@ sub match_char { Usage : $myalign->gap_char('-') Function : Gets/sets the gap_char attribute of the alignment Returns : An gap_char string, defaults to '-' - Argument : An gap_char string (optional) + Argument : An gap_char string (optional), default as '-' =cut sub gap_char { my ($self, $char) = @_; - if (defined $char || ! defined $self->{'_gap_char'} ) { - $char= '-' unless defined $char; - $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1; - $self->{'_gap_char'} = $char; - } - return $self->{'_gap_char'}; + if (defined $char || ! defined $self->{'_gap_char'} ) { + $char= '-' unless defined $char; + $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1; + $self->{'_gap_char'} = $char; + } + return $self->{'_gap_char'}; } -=head2 symbol_chars +=head2 mask_char + + Title : mask_char + Usage : $aln->mask_char('?') + Function : Gets/sets the mask_char attribute of the alignment + Returns : An mask_char string, + Argument : An mask_char string (optional), default as '?' + +=cut + +sub mask_char { + my ($self, $char) = @_; + + #may need to check whether $char is the same with _gap_char, _match_char and _missing_char + if (defined $char || ! defined $self->{'_mask_char'} ) { + $char= '?' unless defined $char; + $self->throw("Single mask character, not [$char]!") if CORE::length($char) > 1; + $self->{'_mask_char'} = $char; + } + return $self->{'_mask_char'}; +} + + + + +=head2 symbol_chars Title : symbol_chars Usage : my @symbolchars = $aln->symbol_chars; Function: Returns all the seen symbols (other than gaps) @@ -1995,277 +2856,45 @@ sub symbol_chars{ my ($self,$includeextra) = @_; unless ($self->{'_symbols'}) { - foreach my $seq ($self->each_seq) { + foreach my $seq ($self->next_Seq) { map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq); } } my %copy = %{$self->{'_symbols'}}; if( ! $includeextra ) { foreach my $char ( $self->gap_char, $self->match_char, - $self->missing_char) { + $self->missing_char, $self->mask_char) { delete $copy{$char} if( defined $char ); } } return keys %copy; } -=head1 Alignment descriptors - -These read only methods describe the MSA in various ways. - - -=head2 score - - Title : score - Usage : $str = $ali->score() - Function : get/set a score of the alignment - Returns : a score for the alignment - Argument : an optional score to set - -=cut - -sub score { - my $self = shift; - $self->{score} = shift if @_; - return $self->{score}; -} - -=head2 consensus_string - - Title : consensus_string - Usage : $str = $ali->consensus_string($threshold_percent) - Function : Makes a strict consensus - Returns : Consensus string - Argument : Optional threshold ranging from 0 to 100. - The consensus residue has to appear at least threshold % - of the sequences at a given location, otherwise a '?' - character will be placed at that location. - (Default value = 0%) - -=cut - -sub consensus_string { - my $self = shift; - my $threshold = shift; - - my $out = ""; - my $len = $self->length - 1; - - foreach ( 0 .. $len ) { - $out .= $self->_consensus_aa($_,$threshold); - } - return $out; -} - -=head2 consensus_conservation - - Title : consensus_conservation - Usage : @conservation = $ali->consensus_conservation(); - Function : Conservation (as a percent) of each position of alignment - Returns : Array of percentages [0-100]. Gap columns are 0% conserved. - Argument : - -=cut - -sub consensus_conservation { - my $self = shift; - my @cons; - my $num_sequences = $self->num_sequences; - foreach my $point (0..$self->length-1) { - my %hash = $self->_consensus_counts($point); - # max frequency of a non-gap letter - my $max = (sort {$b<=>$a} values %hash )[0]; - push @cons, 100 * $max / $num_sequences; - } - return @cons; -} - -sub _consensus_aa { - my $self = shift; - my $point = shift; - my $threshold_percent = shift || -1 ; - my ($seq,%hash,$count,$letter,$key); - my $gapchar = $self->gap_char; - %hash = $self->_consensus_counts($point); - my $number_of_sequences = $self->num_sequences(); - my $threshold = $number_of_sequences * $threshold_percent / 100. ; - $count = -1; - $letter = '?'; - - foreach $key ( sort keys %hash ) { - # print "Now at $key $hash{$key}\n"; - if( $hash{$key} > $count && $hash{$key} >= $threshold) { - $letter = $key; - $count = $hash{$key}; - } - } - return $letter; -} - -# Frequency of each letter in one column -sub _consensus_counts { - my $self = shift; - my $point = shift; - my %hash; - my $gapchar = $self->gap_char; - foreach my $seq ( $self->each_seq() ) { - my $letter = substr($seq->seq,$point,1); - $self->throw("--$point-----------") if $letter eq ''; - ($letter eq $gapchar || $letter =~ /\./) && next; - $hash{$letter}++; - } - return %hash; -} - - -=head2 consensus_iupac - - Title : consensus_iupac - Usage : $str = $ali->consensus_iupac() - Function : Makes a consensus using IUPAC ambiguity codes from DNA - and RNA. The output is in upper case except when gaps in - a column force output to be in lower case. - - Note that if your alignment sequences contain a lot of - IUPAC ambiquity codes you often have to manually set - alphabet. Bio::PrimarySeq::_guess_type thinks they - indicate a protein sequence. - Returns : consensus string - Argument : none - Throws : on protein sequences - -=cut - -sub consensus_iupac { - my $self = shift; - my $out = ""; - my $len = $self->length-1; - - # only DNA and RNA sequences are valid - foreach my $seq ( $self->each_seq() ) { - $self->throw("Seq [". $seq->get_nse. "] is a protein") - if $seq->alphabet eq 'protein'; - } - # loop over the alignment columns - foreach my $count ( 0 .. $len ) { - $out .= $self->_consensus_iupac($count); - } - return $out; -} - -sub _consensus_iupac { - my ($self, $column) = @_; - my ($string, $char, $rna); - - #determine all residues in a column - foreach my $seq ( $self->each_seq() ) { - $string .= substr($seq->seq, $column, 1); - } - $string = uc $string; - - # quick exit if there's an N in the string - if ($string =~ /N/) { - $string =~ /\W/ ? return 'n' : return 'N'; - } - # ... or if there are only gap characters - return '-' if $string =~ /^\W+$/; - - # treat RNA as DNA in regexps - if ($string =~ /U/) { - $string =~ s/U/T/; - $rna = 1; - } - - # the following s///'s only need to be done to the _first_ ambiguity code - # as we only need to see the _range_ of characters in $string - - if ($string =~ /[VDHB]/) { - $string =~ s/V/AGC/; - $string =~ s/D/AGT/; - $string =~ s/H/ACT/; - $string =~ s/B/CTG/; - } - - if ($string =~ /[SKYRWM]/) { - $string =~ s/S/GC/; - $string =~ s/K/GT/; - $string =~ s/Y/CT/; - $string =~ s/R/AG/; - $string =~ s/W/AT/; - $string =~ s/M/AC/; - } - - # and now the guts of the thing - if ($string =~ /A/) { - $char = 'A'; # A A - if ($string =~ /G/) { - $char = 'R'; # A and G (purines) R - if ($string =~ /C/) { - $char = 'V'; # A and G and C V - if ($string =~ /T/) { - $char = 'N'; # A and G and C and T N - } - } elsif ($string =~ /T/) { - $char = 'D'; # A and G and T D - } - } elsif ($string =~ /C/) { - $char = 'M'; # A and C M - if ($string =~ /T/) { - $char = 'H'; # A and C and T H - } - } elsif ($string =~ /T/) { - $char = 'W'; # A and T W - } - } elsif ($string =~ /C/) { - $char = 'C'; # C C - if ($string =~ /T/) { - $char = 'Y'; # C and T (pyrimidines) Y - if ($string =~ /G/) { - $char = 'B'; # C and T and G B - } - } elsif ($string =~ /G/) { - $char = 'S'; # C and G S - } - } elsif ($string =~ /G/) { - $char = 'G'; # G G - if ($string =~ /C/) { - $char = 'S'; # G and C S - } elsif ($string =~ /T/) { - $char = 'K'; # G and T K - } - } elsif ($string =~ /T/) { - $char = 'T'; # T T - } - $char = 'U' if $rna and $char eq 'T'; - $char = lc $char if $string =~ /\W/; +=head1 Alignment descriptors - return $char; -} +These read only methods describe the MSA in various ways. -=head2 consensus_meta +=head2 score - Title : consensus_meta - Usage : $seqmeta = $ali->consensus_meta() - Function : Returns a Bio::Seq::Meta object containing the consensus - strings derived from meta data analysis. - Returns : Bio::Seq::Meta - Argument : Bio::Seq::Meta - Throws : non-MetaI object + Title : score + Usage : $str = $ali->score() + Function : get/set a score of the alignment + Returns : a score for the alignment + Argument : an optional score to set =cut -sub consensus_meta { - my ($self, $meta) = @_; - if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) { - $self->throw('Not a Bio::Seq::MetaI object'); - } - return $self->{'_aln_meta'} = $meta if $meta; - return $self->{'_aln_meta'} +sub score { + my $self = shift; + $self->{score} = shift if @_; + return $self->{score}; } + + =head2 is_flush Title : is_flush @@ -2273,7 +2902,7 @@ sub consensus_meta { Function : Tells you whether the alignment : is flush, i.e. all of the same length Returns : 1 or 0 - Argument : + Argument : None =cut @@ -2283,7 +2912,7 @@ sub is_flush { my $length = (-1); my $temp; - foreach $seq ( $self->each_seq() ) { + foreach $seq ( $self->next_Seq() ) { if( $length == (-1) ) { $length = CORE::length($seq->seq()); next; @@ -2311,13 +2940,13 @@ sub is_flush { Function : Returns the maximum length of the alignment. To be sure the alignment is a block, use is_flush Returns : Integer - Argument : + Argument : None =cut sub length_aln { my $self = shift; - $self->deprecated("length_aln - deprecated method. Use length() instead."); + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"length_aln - deprecated method. Use length() instead."); $self->length(@_); } @@ -2327,7 +2956,7 @@ sub length { my $length = -1; my $temp; - foreach $seq ( $self->each_seq() ) { + foreach $seq ( $self->next_Seq() ) { $temp = $seq->length(); if( $temp > $length ) { $length = $temp; @@ -2345,20 +2974,20 @@ sub length { Function : Gets the maximum length of the displayname in the alignment. Used in writing out various MSA formats. Returns : integer - Argument : + Argument : None =cut sub maxname_length { my $self = shift; - $self->deprecated("maxname_length - deprecated method.". + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"maxname_length - deprecated method.". " Use maxdisplayname_length() instead."); $self->maxdisplayname_length(); } sub maxnse_length { my $self = shift; - $self->deprecated("maxnse_length - deprecated method.". + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"maxnse_length - deprecated method.". " Use maxnse_length() instead."); $self->maxdisplayname_length(); } @@ -2368,7 +2997,7 @@ sub maxdisplayname_length { my $maxname = (-1); my ($seq,$len); - foreach $seq ( $self->each_seq() ) { + foreach $seq ( $self->next_Seq() ) { $len = CORE::length $self->displayname($seq->get_nse()); if( $len > $maxname ) { @@ -2397,7 +3026,7 @@ sub max_metaname_length { my ($seq,$len); # check seq meta first - for $seq ( $self->each_seq() ) { + for $seq ( $self->next_Seq() ) { next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names); for my $mtag ($seq->meta_names) { $len = CORE::length $mtag; @@ -2427,7 +3056,7 @@ sub max_metaname_length { Usage : $no = $ali->num_residues Function : number of residues in total in the alignment Returns : integer - Argument : + Argument : None Note : replaces no_residues() =cut @@ -2436,7 +3065,7 @@ sub num_residues { my $self = shift; my $count = 0; - foreach my $seq ($self->each_seq) { + foreach my $seq ($self->next_Seq) { my $str = $seq->seq(); $count += ($str =~ s/[A-Za-z]//g); @@ -2458,7 +3087,7 @@ sub num_residues { sub num_sequences { my $self = shift; - return scalar($self->each_seq); + return scalar($self->next_Seq); } @@ -2470,41 +3099,35 @@ sub num_sequences { percentage identity of the alignment Returns : The average percentage identity of the alignment Args : None - Notes : This method implemented by Kevin Howe calculates a figure that is - designed to be similar to the average pairwise identity of the - alignment (identical in the absence of gaps), without having to - explicitly calculate pairwise identities proposed by Richard Durbin. - Validated by Ewan Birney ad Alex Bateman. + Notes : This method implemented by Kevin Howe calculates a figure that + is designed to be similar to the average pairwise identity of + the alignment (identical in the absence of gaps), without having + to explicitly calculate pairwise identities proposed by Richard + Durbin. Validated by Ewan Birney ad Alex Bateman. =cut sub average_percentage_identity{ my ($self,@args) = @_; - my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M', - 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z'); - + my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z); + my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes); if (! $self->is_flush()) { $self->throw("All sequences in the alignment must be the same length"); } - @seqs = $self->each_seq(); + @seqs = $self->next_Seq(); $len = $self->length(); # load the each hash with correct keys for existence checks - for( my $index=0; $index < $len; $index++) { - foreach my $letter (@alphabet) { - $countHashes[$index]->{$letter} = 0; - } - } foreach my $seq (@seqs) { my @seqChars = split //, $seq->seq(); for( my $column=0; $column < @seqChars; $column++ ) { my $char = uc($seqChars[$column]); - if (exists $countHashes[$column]->{$char}) { + if (exists $alphabet{$char}) { $countHashes[$column]->{$char}++; } } @@ -2575,7 +3198,7 @@ sub overall_percentage_identity{ my $len; my $total = 0; # number of positions with identical residues my @countHashes; - my @seqs = $self->each_seq; + my @seqs = $self->next_Seq; my $nof_seqs = scalar @seqs; my $aln_len = $self->length(); for my $seq (@seqs) { @@ -2624,19 +3247,82 @@ sub overall_percentage_identity{ return ($total / $len ) * 100.0; } +=head2 pairwise_percentage_identity + + Title : pairwise_percentage_identity() + Usage : @pairwiseiden=$ali->pairwise_percentage_identity(3) + Function : Returns pairwise percentage identity of each sequence to the + reference sequence(first sequence as default), or selected sequence + See set_new_reference for information on reference sequence + Returns : A list of percentage identity to the reference sequence + Argument : A number for the position of the reference sequence or the + sequence name of the reference sequence + +=cut + +sub pairwise_percentage_identity { + my ($self,$seqid)=@_; + my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z); -=head1 Alignment positions + my ($refseq, @seqs,@ids,@pairwise_iden); + + if (! $self->is_flush()) { + $self->throw("All sequences in the alignment must be the same length"); + } + + foreach my $seq ( $self->next_Seq() ) { + push @seqs, $seq; + push @ids, $seq->display_id; + } + + # load the each hash with correct keys for existence checks + if(defined($seqid)) { + my $is_num=0; + if ($seqid =~ /^\d+$/) { # argument is seq position + $is_num=1; + $self->throw("The new reference sequence number has to be a positive integer >=1 and <= num_sequences ") if ($seqid < 1 || $seqid > $self->num_sequences); + } + else { # argument is a seq name + $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids); + } + #return the requested seq as the reference seq in the pairwise comparison + for (my $i=0; $i<=$#seqs; $i++) { + my $pos=$i+1; + if ( ($is_num && $pos == $seqid) || ($seqid eq $seqs[$i]->display_id) ) { + $refseq= $seqs[$i]; + last; + } + } + } + else { + $refseq=$seqs[0]; + } + + #calculate the length of the reference sequence + my @refseqchars=split //, uc($refseq->seq()); + my $reflength=$refseq->seq()=~tr/A-Za-z//;; -Methods to map a sequence position into an alignment column and back. -column_from_residue_number() does the former. The latter is really a -property of the sequence object and can done using -L: + if($reflength==0) { + $self->throw("The reference sequence should be non-zero length "); + } + + #calculate the pairwise identity + foreach my $seq (@seqs) { + my $idcount=0; + my @seqChars = split //, $seq->seq(); + for( my $column=0; $column < @seqChars; $column++ ) { + my $char = uc($seqChars[$column]); + if(exists $alphabet{$char} && exists $alphabet{$refseqchars[$column]} && $char eq $refseqchars[$column]) { + $idcount++; + } + } + push @pairwise_iden,$idcount/$reflength; + } + + return @pairwise_iden; +} - # select somehow a sequence from the alignment, e.g. - my $seq = $aln->get_seq_by_pos(1); - #$loc is undef or Bio::LocationI object - my $loc = $seq->location_from_column(5); =head2 column_from_residue_number @@ -2677,7 +3363,7 @@ sub column_from_residue_number { $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name}; $self->throw("Second argument residue number missing") unless $resnumber; - foreach my $seq ($self->each_seq_with_id($name)) { + foreach my $seq ($self->next_Seq_with_id($name)) { my $col; eval { $col = $seq->column_from_residue_number($resnumber); @@ -2691,6 +3377,7 @@ sub column_from_residue_number { } + =head1 Sequence names Methods to manipulate the display name. The default name based on the @@ -2710,13 +3397,13 @@ ways. sub get_displayname { my $self = shift; - $self->deprecated("get_displayname - deprecated method. Use displayname() instead."); + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"get_displayname - deprecated method. Use displayname() instead."); $self->displayname(@_); } sub set_displayname { my $self = shift; - $self->deprecated("set_displayname - deprecated method. Use displayname() instead."); + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"set_displayname - deprecated method. Use displayname() instead."); $self->displayname(@_); } @@ -2744,7 +3431,7 @@ sub displayname { Function : Sets the names to be name_# where # is the number of times this name has been used. Returns : 1, on success - Argument : + Argument : None =cut @@ -2780,7 +3467,7 @@ sub set_displayname_count { Function : Makes all the sequences be displayed as just their name, not name/start-end Returns : 1 - Argument : + Argument : None =cut @@ -2788,7 +3475,7 @@ sub set_displayname_flat { my $self = shift; my ($nse,$seq); - foreach $seq ( $self->each_seq() ) { + foreach $seq ( $self->next_Seq() ) { $nse = $seq->get_nse(); $self->displayname($nse,$seq->id()); } @@ -2801,7 +3488,7 @@ sub set_displayname_flat { Usage : $ali->set_displayname_normal() Function : Makes all the sequences be displayed as name/start-end Returns : 1, on success - Argument : + Argument : None =cut @@ -2809,33 +3496,13 @@ sub set_displayname_normal { my $self = shift; my ($nse,$seq); - foreach $seq ( $self->each_seq() ) { + foreach $seq ( $self->next_Seq() ) { $nse = $seq->get_nse(); $self->displayname($nse,$nse); } return 1; } -=head2 source - - Title : source - Usage : $obj->source($newval) - Function: sets the Alignment source program - Example : - Returns : value of source - Args : newvalue (optional) - - -=cut - -sub source{ - my ($self,$value) = @_; - if( defined $value) { - $self->{'_source'} = $value; - } - return $self->{'_source'}; -} - =head2 set_displayname_safe Title : set_displayname_safe @@ -2856,7 +3523,7 @@ sub set_displayname_safe { my ($seq, %phylip_name); my $ct=0; my $new=Bio::SimpleAlign->new(); - foreach $seq ( $self->each_seq() ) { + foreach $seq ( $self->next_Seq() ) { $ct++; my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct; $phylip_name{$pname}=$seq->id(); @@ -2866,7 +3533,7 @@ sub set_displayname_safe { -start => $seq->{_start}, -end => $seq->{_end} ); - $new->add_seq($new_seq); + $new->add_Seq($new_seq); } $self->debug("$ct seq names changed. Restore names by using restore_displayname."); @@ -2889,7 +3556,7 @@ sub restore_displayname { my $ref=shift; my %name=%$ref; my $new=Bio::SimpleAlign->new(); - foreach my $seq ( $self->each_seq() ) { + foreach my $seq ( $self->next_Seq() ) { $self->throw("No sequence with name") unless defined $name{$seq->id()}; my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()}, -seq => $seq->seq(), @@ -2897,123 +3564,11 @@ sub restore_displayname { -start => $seq->{_start}, -end => $seq->{_end} ); - $new->add_seq($new_seq); + $new->add_Seq($new_seq); } return $new; } -=head2 sort_by_start - - Title : sort_by_start - Usage : $ali->sort_by_start - Function : Changes the order of the alignment to the start position of each - subalignment - Returns : - Argument : - -=cut - -sub sort_by_start { - my $self = shift; - my ($seq,$nse,@arr,%hash,$count); - foreach $seq ( $self->each_seq() ) { - $nse = $seq->get_nse; - $hash{$nse} = $seq; - } - $count = 0; - %{$self->{'_order'}} = (); # reset the hash; - foreach $nse ( sort _startend keys %hash) { - $self->{'_order'}->{$count} = $nse; - $count++; - } - 1; -} - -sub _startend -{ - my ($aname,$arange) = split (/[\/]/,$a); - my ($bname,$brange) = split (/[\/]/,$b); - my ($astart,$aend) = split(/\-/,$arange); - my ($bstart,$bend) = split(/\-/,$brange); - return $astart <=> $bstart; -} - -=head2 bracket_string - - Title : bracket_string - Usage : my @params = (-refseq => 'testseq', - -allele1 => 'allele1', - -allele2 => 'allele2', - -delimiters => '{}', - -separator => '/'); - $str = $aln->bracket_string(@params) - - Function : When supplied with a list of parameters (see below), returns a - string in BIC format. This is used for allelic comparisons. - Briefly, if either allele contains a base change when compared to - the refseq, the base or gap for each allele is represented in - brackets in the order present in the 'alleles' parameter. - - For the following data: - - >testseq - GGATCCATTGCTACT - >allele1 - GGATCCATTCCTACT - >allele2 - GGAT--ATTCCTCCT - - the returned string with parameters 'refseq => testseq' and - 'alleles => [qw(allele1 allele2)]' would be: - - GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT - Returns : BIC-formatted string - Argument : Required args - refseq : string (ID) of the reference sequence used - as basis for comparison - allele1 : string (ID) of the first allele - allele2 : string (ID) of the second allele - Optional args - delimiters: two symbol string of left and right delimiters. - Only the first two symbols are used - default = '[]' - separator : string used as a separator. Only the first - symbol is used - default = '/' - Throws : On no refseq/alleles, or invalid refseq/alleles. - -=cut - -sub bracket_string { - my ($self, @args) = @_; - my ($ref, $a1, $a2, $delim, $sep) = - $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args); - $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref); - my ($ld, $rd); - ($ld, $rd) = split('', $delim, 2) if $delim; - $ld ||= '['; - $rd ||= ']'; - $sep ||= '/'; - my ($refseq, $allele1, $allele2) = - map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2); - if (!$refseq || !$allele1 || !$allele2) { - $self->throw("One of your refseq/allele IDs is invalid!"); - } - my $len = $self->length-1; - my $bic = ''; - # loop over the alignment columns - for my $column ( 0 .. $len ) { - my $string; - my ($compres, $res1, $res2) = - map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2); - # are any of the allele symbols different from the refseq? - $string = ($compres eq $res1 && $compres eq $res2) ? $compres : - $ld.$res1.$sep.$res2.$rd; - $bic .= $string; - } - return $bic; -} - =head2 methods implementing Bio::FeatureHolderI @@ -3127,18 +3682,6 @@ sub feature_count { } } -=head2 get_all_SeqFeatures - - Title : get_all_SeqFeatures - Usage : - Function: Get all SeqFeatures. - Example : - Returns : an array of Bio::SeqFeatureI implementing objects - Args : none - Note : Falls through to Bio::FeatureHolderI implementation. - -=cut - =head2 methods for Bio::AnnotatableI AnnotatableI implementation to support sequence alignments which @@ -3171,20 +3714,6 @@ sub annotation { return $obj->{'_annotation'}; } -=head1 Deprecated methods - -=cut - -=head2 no_residues - - Title : no_residues - Usage : $no = $ali->no_residues - Function : number of residues in total in the alignment - Returns : integer - Argument : - Note : deprecated in favor of num_residues() - -=cut sub no_residues { my $self = shift; @@ -3194,17 +3723,6 @@ sub no_residues { $self->num_residues(@_); } -=head2 no_sequences - - Title : no_sequences - Usage : $depth = $ali->no_sequences - Function : number of sequence in the sequence alignment - Returns : integer - Argument : - Note : deprecated in favor of num_sequences() - -=cut - sub no_sequences { my $self = shift; $self->deprecated(-warn_version => 1.0069, @@ -3213,66 +3731,6 @@ sub no_sequences { $self->num_sequences(@_); } -=head2 mask_columns - - Title : mask_columns - Usage : $aln2 = $aln->mask_columns(20,30) - Function : Masks a slice of the alignment inclusive of start and - end columns, and the first column in the alignment is denoted 1. - Mask beyond the length of the sequence does not do padding. - Returns : A Bio::SimpleAlign object - Args : Positive integer for start column, positive integer for end column, - optional string value use for the mask. Example: - - $aln2 = $aln->mask_columns(20,30,'?') - Note : Masking must use a character that is not used for gaps or - frameshifts. These can be adjusted using the relevant global - variables, but be aware these may be (uncontrollably) modified - elsewhere within BioPerl (see bug 2715) - -=cut - -sub mask_columns { - #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing - my $self = shift; - - my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS. - $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS; - - # coordinates are alignment-based, not sequence-based - my ($start, $end, $mask_char) = @_; - unless (defined $mask_char) { $mask_char = 'N' } - - $self->throw("Mask start has to be a positive integer and less than ". - "alignment length, not [$start]") - unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length; - $self->throw("Mask end has to be a positive integer and less than ". - "alignment length, not [$end]") - unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length; - $self->throw("Mask start [$start] has to be smaller than or equal to ". - "end [$end]") unless $start <= $end; - $self->throw("Mask character $mask_char has to be a single character ". - "and not a gap or frameshift symbol") - unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres}; - - my $aln = $self->new; - $aln->id($self->id); - foreach my $seq ( $self->each_seq() ) { - my $new_seq = Bio::LocatableSeq->new(-id => $seq->id, - -alphabet => $seq->alphabet, - -strand => $seq->strand, - -verbose => $self->verbose); - - # convert from 1-based alignment coords! - my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1); - $masked_string =~ s{[^$nonres]}{$mask_char}g; - my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end); - $new_seq->seq($new_dna_string); - $aln->add_seq($new_seq); - } - return $aln; -} - 1; From 09545cc6f7fda052ccbd5bba60ce5add6ff1b642 Mon Sep 17 00:00:00 2001 From: Jun Yin Date: Tue, 4 Sep 2012 11:12:43 -0400 Subject: [PATCH 2/8] get_nse --- Bio/SimpleAlign.pm | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Bio/SimpleAlign.pm b/Bio/SimpleAlign.pm index 8dc666e02d..9572c92f5f 100644 --- a/Bio/SimpleAlign.pm +++ b/Bio/SimpleAlign.pm @@ -403,14 +403,7 @@ sub remove_LocatableSeq { unless ref $seq && $seq->isa( 'Bio::LocatableSeq'); $id = $seq->id(); -<<<<<<< HEAD - $start = $seq->start(); - $end = $seq->end(); - $name = sprintf("%s/%d-%d",$id,$start,$end); - -======= $name = $seq->get_nse; ->>>>>>> upstream/master if( !exists $self->{'_seq'}->{$name} ) { $self->throw("Sequence $name does not exist in the alignment to remove!"); From 8738981518882e3260171b24f23f5300ba4a4618 Mon Sep 17 00:00:00 2001 From: Jun Yin Date: Wed, 12 Sep 2012 16:51:25 -0400 Subject: [PATCH 3/8] All contradicts fixed --- Bio/SimpleAlign.pm | 311 ++++++++++++++------------------------------- 1 file changed, 94 insertions(+), 217 deletions(-) diff --git a/Bio/SimpleAlign.pm b/Bio/SimpleAlign.pm index 9572c92f5f..0cb49c3f43 100644 --- a/Bio/SimpleAlign.pm +++ b/Bio/SimpleAlign.pm @@ -499,7 +499,7 @@ sub remove_Seqs { sub purge { my ($self,@args) = @_; $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"purge - deprecated method. Use remove_redundant_Seqs() instead."); - $self->remove_redundant_Seq(@args); + $self->remove_redundant_Seqs(@args); } sub remove_redundant_Seqs { @@ -1114,21 +1114,26 @@ sub get_Seq_by_id { Title : select_Seqs Usage : $aln2 = $aln->select_Seqs([1,5..10,15]) # three first sequences $aln2 = $aln->select_Seqs(-selection=>[2..4,11..14],-toggle=>1) # toggle selection + $aln2 = $aln->select_Seqs(-selection=>[11..14,2..4],-nosort=>1) #no sort Function : Creates a new alignment from a subset of sequences. Numbering starts from 1. Sequence positions larger than num_sequences() will thow an error. Returns : a Bio::SimpleAlign object Args : An reference list of positive integers for the selected - sequences. An optional parameter can be defined to toggle the - coordinate selection. + sequences. + An optional parameter can be defined to toggle the coordinate selection. + If nosort is defined, sequences in the alignment will be returned following + the given order. =cut sub select_Seqs { my $self=shift; - my ($sel, $toggle) = $self->_rearrange([qw(SELECTION TOGGLE)], @_); - @{$sel}=sort {$a<=>$b} @{$sel}; + my ($sel, $toggle,$nosort) = $self->_rearrange([qw(SELECTION TOGGLE NOSORT)], @_); + unless($nosort) { + @{$sel}=sort {$a<=>$b} @{$sel}; + } $self->throw("Select start has to be a positive integer, not [".$sel->[0]."]") unless $sel->[0] =~ /^\d+$/ and $sel->[0] > 0; @@ -1163,9 +1168,36 @@ sub select_noncont { my $self = shift; $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"select_noncont - deprecated method. Use select_Seqs() instead."); if($_[0] eq 'nosort') { - shift; + shift @_; + $self->select_Seqs(-selection=>[@_],-nosort=>1); } - $self->select_Seqs([@_]); + else { + $self->select_Seqs([@_]); + } +} + + +=head2 select_noncont_by_name + + Title : select_noncont_by_name + Usage : my $aln2 = $aln->select_noncont_by_name('A123', 'B456'); + Function : Creates a new alignment from a subset of sequences which are + selected by name (sequence ID). + Returns : a Bio::SimpleAlign object + Args : array of names (i.e., identifiers) for the sequences. + +=cut + +sub select_noncont_by_name { + my ($self, @names) = @_; + + my $aln = $self->new; + foreach my $name (@names) { + $aln->add_seq($self->get_seq_by_id($name)); + } + $aln->id($self->id); + + return $aln; } =head2 select_columns @@ -1191,53 +1223,11 @@ sub select_noncont { =cut -<<<<<<< HEAD + sub slice { my $self = shift; $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"slice - deprecated method. Use select_columns() instead."); $self->select_columns([$_[0]..$_[1]],0,defined($_[2])?$_[2]:0); -======= - return $seq; -} - - -=head1 Create new alignments - -The result of these methods are horizontal or vertical subsets of the -current MSA. - -=head2 select - - Title : select - Usage : $aln2 = $aln->select(1, 3) # three first sequences - Function : Creates a new alignment from a continuous subset of - sequences. Numbering starts from 1. Sequence positions - larger than num_sequences() will throw an error. - Returns : a Bio::SimpleAlign object - Args : positive integer for the first sequence - positive integer for the last sequence to include (optional) - -=cut - -sub select { - my $self = shift; - my ($start, $end) = @_; - - $self->throw("Select start has to be a positive integer, not [$start]") - unless $start =~ /^\d+$/ and $start > 0; - $self->throw("Select end has to be a positive integer, not [$end]") - unless $end =~ /^\d+$/ and $end > 0; - $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]") - unless $start <= $end; - - my $aln = $self->new; - foreach my $pos ($start .. $end) { - $aln->add_seq($self->get_seq_by_pos($pos)); - } - $aln->id($self->id); - # fix for meta, sf, ann - return $aln; ->>>>>>> upstream/master } @@ -1249,66 +1239,37 @@ sub select_columns { $sel=$self->_select_columns_by_type($sel); } -<<<<<<< HEAD @{$sel}=sort {$a<=$b} @{$sel}; -======= - my $aln = $self->new; - foreach my $p (@pos) { - $aln->add_seq($self->get_seq_by_pos($p)); - } - $aln->id($self->id); - # fix for meta, sf, ann - return $aln; -} - -=head2 select_noncont_by_name - - Title : select_noncont_by_name - Usage : my $aln2 = $aln->select_noncont_by_name('A123', 'B456'); - Function : Creates a new alignment from a subset of sequences which are - selected by name (sequence ID). - Returns : a Bio::SimpleAlign object - Args : array of names (i.e., identifiers) for the sequences. - -=cut - -sub select_noncont_by_name { - my ($self, @names) = @_; - - my $aln = $self->new; - foreach my $name (@names) { - $aln->add_seq($self->get_seq_by_id($name)); - } - $aln->id($self->id); - - return $aln; -} - -=head2 slice - - Title : slice - Usage : $aln2 = $aln->slice(20,30) - Function : Creates a slice from the alignment inclusive of start and - end columns, and the first column in the alignment is denoted 1. - Sequences with no residues in the slice are excluded from the - new alignment and a warning is printed. Slice beyond the length of - the sequence does not do padding. - Returns : A Bio::SimpleAlign object - Args : Positive integer for start column, positive integer for end column, - optional boolean which if true will keep gap-only columns in the newly - created slice. Example: ->>>>>>> upstream/master #warnings - $self->throw("select_columns start has to be a positive integer, not [".$sel->[0]."]") - unless $sel->[0] =~ /^\d+$/ and $sel->[0] > 0; - $self->throw("select_columns end has to be a positive integer, not [".$sel->[$#$sel]."]") - unless $sel->[$#$sel] =~ /^\d+$/ and $sel->[$#$sel] > 0; + #the old remove_columns used a 0-based coordinates, so if the first coordinate is 0, the first two coordinates will be added by 1. + unless($sel->[0] =~ /^\d+$/ and $sel->[0] > 0) { + if($sel->[0] == 0) { + $self->deprecated(-warn_version => 1.0060, -throw_version => 1.0080, -message =>"select_columns start is 1 based. It has to be a positive integer, not [".$sel->[0]."]"); + $sel->[0]++; + $sel->[1]++; + } + else { + $self->throw("select_columns start has to be a positive integer, not [".$sel->[0]."]"); + } + } + + unless ($sel->[$#$sel] =~ /^\d+$/ and $sel->[$#$sel] > 0) { + if($sel->[$#$sel] == 0) { + $self->deprecated(-warn_version => 1.0060, -throw_version => 1.0080, -message =>"select_columns end is 1 based. It has to be a positive integer, not [".$sel->[$#$sel]."]"); + $sel->[0]++; + $sel->[1]++; + } + else { + $self->throw("select_columns end has to be a positive integer, not [".$sel->[$#$sel]."]"); + } + } $self->throw("This alignment has only ". $self->length . " residues. select_columns start " . "[".$sel->[0]."] is too big.") if $sel->[0] > $self->length; my $newcoords; + #toggle the coordinate selection if($toggle) { $sel=_toggle_selection($sel,$self->length); } @@ -1601,19 +1562,7 @@ sub remove_gaps { # Do the matching to get the segments to remove my $removed_cols; while ($gap_line =~ m/[$del_char]/g) { -<<<<<<< HEAD push @{$removed_cols}, pos($gap_line); -======= - my $start = pos($gap_line)-1; - $gap_line =~ m/\G[$del_char]+/gc; - my $end = pos($gap_line)-1; - - #have to offset the start and end for subsequent removes - $start-=$length; - $end -=$length; - $length += ($end-$start+1); - push @remove, [$start,$end]; ->>>>>>> upstream/master } #remove the segments $aln = $#$removed_cols >= 0 ? $self->select_columns($removed_cols,1) : $self; @@ -1647,21 +1596,39 @@ sub splice_by_seq_pos{ sub mask_columns { #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing my $self=shift; - my ($sel, $toggle) = $self->_rearrange([qw(SELECTION TOGGLE)], @_); - @{$sel}=sort {$a<=>$b} @{$sel}; + my @args=@_; + my ($sel, $toggle); + + #Exceptions + unless(ref($args[0])) { + $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"mask_columns - column range must be in refrence."); + my ($start, $end, $mask_char) = @args; + $sel=[($start, $end)]; + if(defined $mask_char) { + $self->mask_char($mask_char); + } + } + else { + ($sel, $toggle) = $self->_rearrange([qw(SELECTION TOGGLE)], @_); + } - my $nonres = join("",$self->gap_char, $self->match_char,$self->missing_char); - my $mask_char=$self->mask_char; - + @{$sel}=sort {$a<=>$b} @{$sel}; + #Exceptions + $self->throw("Mask start has to be a positive integer and less than ". "alignment length, not [".$sel->[0]."]") unless $sel->[0] =~ /^\d+$/ && $sel->[0] > 0 && $sel->[0] <= $self->length; $self->throw("Mask end has to be a positive integer and less than ". "alignment length, not [".$sel->[$#$sel]."]") - unless $sel->[$#$sel] =~ /^\d+$/ && $sel->[$#$sel] > 0 && $sel->[$#$sel] <= $self->length; + unless $sel->[$#$sel] =~ /^\d+$/ && $sel->[$#$sel] > 0 && $sel->[$#$sel] <= $self->length; + + my $nonres = join("",$self->gap_char, $self->match_char,$self->missing_char); + my $mask_char=$self->mask_char; + + #calculate the coords, and make it in a continous way my @newcoords; if($toggle) { @@ -1806,17 +1773,10 @@ sub map_chars { $self->throw("Need two arguments: a regexp and a string") unless defined $from and defined $to; -<<<<<<< HEAD foreach $seq ( $self->next_Seq() ) { - $temp = $seq->seq(); - $temp =~ s/$from/$to/g; - $seq->seq($temp); -======= - foreach $seq ( $self->each_seq() ) { - $temp = $seq->seq(); - $temp =~ s/$from/$to/g; - $seq->seq($temp); ->>>>>>> upstream/master + $temp = $seq->seq(); + $temp =~ s/$from/$to/g; + $seq->seq($temp); } return 1; } @@ -1881,102 +1841,19 @@ sub lowercase { =cut -<<<<<<< HEAD sub togglecase { my $self = shift; my $seq; my $temp; -======= -sub gap_line { - my ($self,$gapchar) = @_; - $gapchar = $gapchar || $self->gap_char; - my %gap_hsh; # column gaps vector - foreach my $seq ( $self->each_seq ) { - my $i = 0; - map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/} - map {[$i++, $_]} split(//, uc ($seq->seq)); - } - my $gap_line; - foreach my $pos ( 0..$self->length-1 ) { - $gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.'; - } - return $gap_line; -} - -=head2 all_gap_line - - Title : all_gap_line() - Usage : $line = $align->all_gap_line() - Function : Generates a gap line - much like consensus string - except that a line where '-' represents all-gap column - Args : (optional) gap line characters ('-' by default) - Returns : string ->>>>>>> upstream/master - + foreach $seq ( $self->next_Seq() ) { - $temp = $seq->seq(); - $temp =~ tr/[A-Za-z]/[a-zA-Z]/; - -<<<<<<< HEAD - $seq->seq($temp); -======= -sub all_gap_line { - my ($self,$gapchar) = @_; - $gapchar = $gapchar || $self->gap_char; - my %gap_hsh; # column gaps counter hash - my @seqs = $self->each_seq; - foreach my $seq ( @seqs ) { - my $i = 0; - map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/} - map {[$i++, $_]} split(//, uc ($seq->seq)); - } - my $gap_line; - foreach my $pos ( 0..$self->length-1 ) { - if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) { - # gaps column - $gap_line .= $self->gap_char; - } else { - $gap_line .= '.'; - } ->>>>>>> upstream/master + $temp = $seq->seq(); + $temp =~ tr/[A-Za-z]/[a-zA-Z]/; + $seq->seq($temp); } return 1; } -<<<<<<< HEAD -======= -=head2 gap_col_matrix - - Title : gap_col_matrix() - Usage : my $cols = $align->gap_col_matrix() - Function : Generates an array where each element in the array is a - hash reference with a key of the sequence name and a - value of 1 if the sequence has a gap at that column - Returns : Reference to an array - Args : Optional: gap line character ($aln->gap_char or '-' by default) - -=cut - -sub gap_col_matrix { - my ( $self, $gapchar ) = @_; - $gapchar = $gapchar || $self->gap_char; - my %gap_hsh; # column gaps vector - my @cols; - foreach my $seq ( $self->each_seq ) { - my $i = 0; - my $str = $seq->seq; - my $len = $seq->length; - my $ch; - my $id = $seq->display_id; - while ( $i < $len ) { - $ch = substr( $str, $i, 1 ); - $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ ); - } - } - return \@cols; -} ->>>>>>> upstream/master - =head2 match Title : match() From 9a29065cc28c6e97690cfc7363d61bad7c956f81 Mon Sep 17 00:00:00 2001 From: Jun Yin Date: Wed, 12 Sep 2012 17:18:45 -0400 Subject: [PATCH 4/8] Reformat SimpleAlign2.t --- t/Align/SimpleAlign2.t | 879 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 879 insertions(+) create mode 100644 t/Align/SimpleAlign2.t diff --git a/t/Align/SimpleAlign2.t b/t/Align/SimpleAlign2.t new file mode 100644 index 0000000000..e1ca10b0f0 --- /dev/null +++ b/t/Align/SimpleAlign2.t @@ -0,0 +1,879 @@ +# -*-Perl-*- Test Harness script for Bioperl +# $Id$ + +use strict; + +BEGIN { + use lib '.'; + use Bio::Root::Test; + + test_begin( -tests => 214 ); + + use_ok('Bio::SimpleAlign'); + use_ok('Bio::AlignIO'); + use_ok('Bio::SeqFeature::Generic'); + use_ok('Bio::Location::Simple'); + use_ok('Bio::Location::Split'); +} + +my $DEBUG = test_debug(); + +my ( $str, $aln, @seqs, $seq ); + +$str = Bio::AlignIO->new( -file => test_input_file('testaln.pfam') ); +isa_ok( $str, 'Bio::AlignIO' ); +$aln = $str->next_aln(); +is $aln->get_Seq_by_pos(1)->get_nse, '1433_LYCES/9-246', "pfam input test"; + +my $aln1 = $aln->remove_columns(['mismatch']); +is( + $aln1->match_line, + '::*::::*:**:*:*:***:**.***::*.*::**::**:***..**:' + . '*:*.::::*:.:*.*.**:***.**:*.:.**::**.*:***********:::*:.:*:**.*::*:' + . '.*.:*:**:****************::', + 'match_line' +); + +my $aln2 = $aln->select_Seqs([1]); +isa_ok( $aln2, 'Bio::Align::AlignI' ); +is( $aln2->num_sequences, 1, 'select_Seqs single selection' ); + +# test select sequences +$aln2 = $aln->select_Seqs([1..6,8,9,10]); +is( $aln2->num_sequences, 9, 'select_Seqs multiple selection' ); + +# test select sequences with parameter +$aln2 = $aln->select_Seqs(-selection=>[1,2,3,4,5,6,7,8,9,10],-toggle=>0); +is( $aln2->num_sequences, 10, 'select_Seqs parameter check' ); + +# test remove_Seqs reverse selection +$aln2->remove_Seqs([2..$aln2->num_sequences],1); +is( $aln2->num_sequences, 9, 'remove_Seqs' ); + + +# test select non continuous-nosort option +$aln2->remove_Seqs(-selection=>[1,3,5,7..9],-toggle=>0); +is( $aln2->num_sequences, 3, 'remove_Seqs' ); + + +@seqs = $aln->next_Seq(); +is scalar @seqs, 16, 'each_seq'; +is $seqs[0]->get_nse, '1433_LYCES/9-246', 'get_nse'; +is $seqs[0]->id, '1433_LYCES', 'id'; +is $seqs[0]->num_gaps, 3, 'num_gaps'; +@seqs = $aln->next_alphabetically(); +is scalar @seqs, 16, 'each_alphabetically'; + +is $aln->column_from_residue_number( '1433_LYCES', 10 ), 2, + 'column_from_residue_number'; +is $aln->displayname( '1433_LYCES/9-246', 'my_seq' ), 'my_seq', + 'display_name get/set'; +is $aln->displayname('1433_LYCES/9-246'), 'my_seq', 'display_name get'; +is substr( $aln->consensus_string(50), 0, 60 ), + "RE??VY?AKLAEQAERYEEMV??MK?VAE??????ELSVEERNLLSVAYKNVIGARRASW", + 'consensus_string'; +is substr( $aln->consensus_string(100), 0, 60 ), + "?????????L????E????M???M????????????L??E?RNL?SV?YKN??G??R??W", + 'consensus_string'; +is substr( $aln->consensus_string(0), 0, 60 ), + "REDLVYLAKLAEQAERYEEMVEFMKKVAELGAPAEELSVEERNLLSVAYKNVIGARRASW", + 'consensus_string'; + +ok( @seqs = $aln->next_Seq_with_id('143T_HUMAN') ); +is scalar @seqs, 1, 'each_seq_with_id'; + +is $aln->is_flush, 1, 'is_flush'; +ok( $aln->id('x') && $aln->id eq 'x', 'id get/set' ); + +is $aln->length, 242, 'length'; +is $aln->num_residues, 3769, 'num_residues'; +is $aln->num_sequences, 16, 'num_sequences'; +is( sprintf( "%.2f", $aln->overall_percentage_identity() ), + 33.06, 'overall_percentage_identity' ); +is( sprintf( "%.2f", $aln->overall_percentage_identity('align') ), + 33.06, 'overall_percentage_identity (align)' ); +is( sprintf( "%.2f", $aln->overall_percentage_identity('short') ), + 35.24, 'overall_percentage_identity (short)' ); +is( sprintf( "%.2f", $aln->overall_percentage_identity('long') ), + 33.47, 'overall_percentage_identity (long)' ); +is( sprintf( "%.2f", $aln->average_percentage_identity() ), + 66.91, 'average_percentage_identity' ); + +ok $aln->set_displayname_count; +is $aln->displayname('1433_LYCES/9-246'), '1433_LYCES_1', + 'set_displayname_count'; +ok $aln->set_displayname_flat; +is $aln->displayname('1433_LYCES/9-246'), '1433_LYCES', 'set_displayname_flat'; +ok $aln->set_displayname_normal; +is $aln->displayname('1433_LYCES/9-246'), '1433_LYCES/9-246', + 'set_displayname_normal'; +ok $aln->uppercase; +ok $aln->map_chars( '\.', '-' ); +@seqs = $aln->next_Seq_with_id('143T_HUMAN'); +is substr( $seqs[0]->seq, 0, 60 ), + 'KTELIQKAKLAEQAERYDDMATCMKAVTEQGA---ELSNEERNLLSVAYKNVVGGRRSAW', + 'uppercase, map_chars'; + +is( + $aln->match_line, + ' ::*::::* : * *: *: *:***:**.***::*.' + . ' *::**::**:*** . . ** :* :* . :: :: *: . :* .*. **:' + . '***.** :*. : .* * : : **.*:***********:::* : .: * :** .' + . '*::*: .*. : *: **:****************:: ', + 'match_line' +); +ok $aln->remove_LocatableSeq( $seqs[0] ), 'remove_seqs'; +is $aln->num_sequences, 15, 'remove_seqs'; +ok $aln->add_Seq( $seqs[0] ), 'add_seq'; +is $aln->num_sequences, 16, 'add_seq'; +ok $seq = $aln->get_Seq_by_pos(1), 'get_seq_by_pos'; +is( $seq->id, '1433_LYCES', 'get_seq_by_pos' ); +ok( ( $aln->missing_char() eq '&' ) and ( $aln->missing_char('X') eq 'X' ) ); +ok( ( $aln->match_char() eq '.' ) and ( $aln->match_char('-') eq '-' ) ); +ok( ( $aln->gap_char() eq '-' ) and ( $aln->gap_char('.') eq '.' ) ); +ok( ( $aln->mask_char() eq '?' ) and ( $aln->gap_char('N') eq 'N' ) ); + +is $aln->remove_redundant_Seqs(0.7), 12, 'purge'; +is $aln->num_sequences, 4, 'purge'; + +SKIP: { + test_skip( -tests => 24, -requires_module => 'IO::String' ); + + my $string; + my $out = IO::String->new($string); + + my $s1 = Bio::LocatableSeq->new( + -id => 'AAA', + -seq => 'aawtat-tn-', + -start => 1, + -end => 8, + -alphabet => 'dna' + ); + my $s2 = Bio::LocatableSeq->new( + -id => 'BBB', + -seq => '-aaaat-tt-', + -start => 1, + -end => 7, + -alphabet => 'dna' + ); + $a = Bio::SimpleAlign->new(); + $a->add_Seq($s1); + $a->add_Seq($s2); + + my $a_removed=$a->remove_gaps(-reference=>2); + is($a_removed->length, 7, "remove_gaps reference check"); + + + is( $a->consensus_iupac, "aAWWAT-TN-", 'IO::String consensus_iupac' ); + $s1->seq('aaaaattttt'); + $s1->alphabet('dna'); + $s1->end(10); + $s2->seq('-aaaatttt-'); + $s2->end(8); + $a = Bio::SimpleAlign->new(); + $a->add_Seq($s1); + $a->add_Seq($s2); + + my $strout = Bio::AlignIO->new( -fh => $out, -format => 'pfam' ); + $strout->write_aln($a); + is( + $string, + "AAA/1-10 aaaaattttt\n" . "BBB/1-8 -aaaatttt-\n", + 'IO::String write_aln normal' + ); + + $out->setpos(0); + $string = ''; + my $b = $a->select_columns([2..9]); + $strout->write_aln($b); + is $string, + "AAA/2-9 aaaatttt\n" . "BBB/1-8 aaaatttt\n", + 'IO::String write_aln slice'; + + $out->setpos(0); + $string = ''; + $b = $a->select_columns([9..10]); + $strout->write_aln($b); + is $string, + "AAA/9-10 tt\n" . "BBB/8-8 t-\n", + 'IO::String write_aln slice'; + + $a->verbose(-1); + $out->setpos(0); + $string = ''; + $b = $a->select_columns([1..2]); + $strout->write_aln($b); + is $string, + "AAA/1-2 aa\n" . "BBB/1-1 -a\n", + 'IO::String write_aln slice'; + + $a->verbose(-1); + $out->setpos(0); + $string = ''; + my $c = $a->select_columns(-selection=>[3..$a->length],-toggle=>1); + $strout->write_aln($c); + is $string, + "AAA/1-2 aa\n" . "BBB/1-1 -a\n", + 'IO::String write_aln slice'; + + # not sure what coordinates this should return... + $a->verbose(-1); + $out->setpos(0); + $string = ''; + $b = $a->select_columns(-selection=>[1..1],-toggle=>0,-keepgaponly=>1); + $strout->write_aln($b); + is $string, + "AAA/1-1 a\n" . "BBB/1-0 -\n", + 'IO::String write_aln slice'; + + + $a->verbose(-1); + $out->setpos(0); + $string = ''; + $b = $a->select_columns(-selection=>[2]); + $strout->write_aln($b); + is $string, + "AAA/2-2 a\n" . "BBB/1-1 a\n", + 'IO::String write_aln slice'; + + eval { $b = $a->select_columns([11..13]); }; + + like( $@, qr/EX/ ); + + # remove_columns by position + $out->setpos(0); + $string = ''; + $str = Bio::AlignIO->new( -file => test_input_file('mini-align.aln') ); + $aln1 = $str->next_aln; + $aln2 = $aln1->remove_columns([1]); + $strout->write_aln($aln2); + is $string, + "P84139/2-33 NEGEHQIKLDELFEKLLRARLIFKNKDVLRRC\n" + . "P814153/2-33 NEGMHQIKLDVLFEKLLRARLIFKNKDVLRRC\n" + . "BAB68554/1-14 ------------------AMLIFKDKQLLQQC\n" + . "gb|443893|124775/1-32 MRFRFQIKVPPAVEGARPALLIFKSRPELGGC\n", + 'remove_columns by position'; + + + + # and when arguments are entered in "wrong order"? + $out->setpos(0); + $string = ''; + my $aln3 = $aln1->remove_columns(-selection=>[2,6,7,31]); + $strout->write_aln($aln3); + is $string, + "P84139/1-33 MEGEIKLDELFEKLLRARLIFKNKDVLRC\n" + . "P814153/1-33 MEGMIKLDVLFEKLLRARLIFKNKDVLRC\n" + . "BAB68554/1-14 ----------------AMLIFKDKQLLQC\n" + . "gb|443893|124775/2-32 -RFRIKVPPAVEGARPALLIFKSRPELGC\n", + 'remove_columns by position (wrong order)'; + + my %cigars = $aln1->cigar_line; + is $cigars{'gb|443893|124775/1-32'}, '19,19:21,24:29,29:32,32', + 'cigar_line'; + is $cigars{'P814153/1-33'}, '20,20:22,25:30,30:33,33', 'cigar_line'; + is $cigars{'BAB68554/1-14'}, '1,1:3,6:11,11:14,14', 'cigar_line'; + is $cigars{'P84139/1-33'}, '20,20:22,25:30,30:33,33', 'cigar_line'; + + my @idens=$aln1->pairwise_percentage_identity(); + is join(";",map {sprintf("%0.4f",$_)} @idens),"1.0000;0.9394;0.2424;0.3333","pairwise_percentage_identity"; + + $aln1->sort_by_pairwise_identity; + is join(";",map {$_->display_id} $aln1->next_Seq),"P84139;P814153;gb|443893|124775;BAB68554","sort_by_pairwise_identity"; + + # sort_alphabetically + my $s3 = Bio::LocatableSeq->new( + -id => 'ABB', + -seq => '-attat-tt-', + -start => 1, + -end => 7, + -alphabet => 'dna' + ); + $a->add_Seq($s3); + + is $a->get_Seq_by_pos(2)->id, "BBB", 'sort_alphabetically - before'; + ok $a->sort_alphabetically; + is $a->get_Seq_by_pos(2)->id, "ABB", 'sort_alphabetically - after'; + + $b = $a->remove_gaps(); + is $b->consensus_string, "aaaattt", 'remove_gaps'; + + $s1->seq('aaaaattt--'); + + $b = $a->remove_gaps( undef, 'all_gaps_only' ); + is $b->consensus_string, "aaaaatttt", 'remove_gaps all_gaps_only'; + + # test set_new_reference: + $str = Bio::AlignIO->new( -file => test_input_file('testaln.aln') ); + $aln = $str->next_aln(); + my $new_aln = $aln->set_new_reference(3); + $a = $new_aln->get_Seq_by_pos(1)->display_id; + $new_aln = $aln->set_new_reference('P851414'); + $b = $new_aln->get_Seq_by_pos(1)->display_id; + is $a, 'P851414', 'set_new_reference'; + is $b, 'P851414', 'set_new_reference'; + + # test uniq_seq: + $str = Bio::AlignIO->new( + -verbose => $DEBUG, + -file => test_input_file('testaln2.fasta') + ); + $aln = $str->next_aln(); + $new_aln = $aln->uniq_Seq(); + $a = $new_aln->num_sequences; + is $a, 11, 'uniq_seq'; + + # check if slice works well with a LocateableSeq in its negative strand + my $seq1 = Bio::LocatableSeq->new( + -SEQ => "ATGCTG-ATG", + -START => 1, + -END => 9, + -ID => "test1", + -STRAND => -1 + ); + + my $seq2 = Bio::LocatableSeq->new( + -SEQ => "A-GCTGCATG", + -START => 1, + -END => 9, + -ID => "test2", + -STRAND => 1 + ); + + $string = ''; + my $aln_negative = Bio::SimpleAlign->new(); + $aln_negative->add_Seq($seq1); + $aln_negative->add_Seq($seq2); + my $start_column = + $aln_negative->column_from_residue_number( + $aln_negative->get_Seq_by_pos(1)->display_id, 2 ); + my $end_column = + $aln_negative->column_from_residue_number( + $aln_negative->get_Seq_by_pos(1)->display_id, 5 ); + $aln_negative = $aln_negative->select_columns([$end_column..$start_column]); + my $seq_negative = $aln_negative->get_Seq_by_pos(1); + is( $seq_negative->start, 2, "bug 2099" ); + is( $seq_negative->end, 5, "bug 2099" ); + + # bug 2793 + my $s11 = Bio::LocatableSeq->new( -id => 'testseq1', -seq => 'AAA' ); + my $s21 = Bio::LocatableSeq->new( -id => 'testseq2', -seq => 'CCC' ); + $a = Bio::SimpleAlign->new(); + ok( $a->add_Seq( $s11, 1 ), "bug 2793" ); + is( $a->get_Seq_by_pos(1)->seq, 'AAA', "bug 2793" ); + ok( $a->add_Seq( $s21, 2 ), "bug 2793" ); + is( $a->get_Seq_by_pos(2)->seq, 'CCC', "bug 2793" ); + throws_ok { $a->add_Seq( $s21, 0 ) } qr/must be >= 1/, 'Bad sequence, bad!'; +} + +# test for Bio::SimpleAlign annotation method and +# Bio::FeatureHolder stuff + +$aln = Bio::SimpleAlign->new; +for my $seqset ( [qw(one AGAGGAT)], [qw(two AGACGAT)], [qw(three AGAGGTT)] ) { + $aln->add_Seq( + Bio::LocatableSeq->new( + -id => $seqset->[0], + -seq => $seqset->[1] + ) + ); +} + +is $aln->num_sequences, 3, 'added 3 seqs'; + +$aln->add_SeqFeature( + Bio::SeqFeature::Generic->new( + -start => 1, + -end => 1, + -primary_tag => 'charLabel', + ) +); +$aln->add_SeqFeature( + Bio::SeqFeature::Generic->new( + -start => 3, + -end => 3, + -primary_tag => 'charLabel', + + ) +); +is( $aln->feature_count, 2, 'first 2 features added' ); + +my $splitloc = Bio::Location::Split->new; +$splitloc->add_sub_Location( + Bio::Location::Simple->new( + -start => 2, + -end => 3 + ) +); + +$splitloc->add_sub_Location( + Bio::Location::Simple->new( + -start => 5, + -end => 6 + ) +); + +$aln->add_SeqFeature( + Bio::SeqFeature::Generic->new( + -location => $splitloc, + -primary_tag => 'charLabel', + ) +); + +is( $aln->feature_count, 3, '3rd feature added' ); + +#do slices and masks as defined by the feature +my $i = 0; +my @slice_lens = qw(1 1 2 2); +for my $feature ( $aln->get_SeqFeatures ) { + for my $loc ( $feature->location->each_Location ) { + my $masked = $aln->mask_columns([$loc->start..$loc->end]); + #"This should pass but dies; see bug 2842"; + lives_ok {my $fslice = $masked->select_columns([$loc->start..$loc->end])}; + $masked->verbose(-1); + my $fslice = $masked->select_columns([$loc->start..$loc->end]); + is( $fslice->length, $slice_lens[ $i++ ], "slice $i len" ); + for my $s ( $fslice->next_Seq ) { + like( $s->seq, qr/^\?+$/, 'correct masked seq' ); + } + } +} + +# test set_displayname_safe & restore_displayname: +$str = Bio::AlignIO->new( -file => test_input_file('pep-266.aln') ); +$aln = $str->next_aln(); +is $aln->get_Seq_by_pos(3)->display_id, 'Smik_Contig1103.1', + 'initial display id ok'; +my ( $new_aln, $ref ) = $aln->set_displayname_safe(); +is $new_aln->get_Seq_by_pos(3)->display_id, 'S000000003', 'safe display id ok'; +my $restored_aln = $new_aln->restore_displayname($ref); +is $restored_aln->get_Seq_by_pos(3)->display_id, 'Smik_Contig1103.1', + 'restored display id ok'; + +# test sort_by_list: +$str = Bio::AlignIO->new( -file => test_input_file('testaln.aln') ); +my $list_file = test_input_file('testaln.list'); +$aln = $str->next_aln(); +$new_aln = $aln->sort_by_list($list_file); +$a = $new_aln->get_Seq_by_pos(1)->display_id; +is $a, 'BAB68554', 'sort by list ok'; + +# test for Binary/Morphological/Mixed data + +# sort_by_start + +# test sort_by_list: + +my $s1 = Bio::LocatableSeq->new( + -id => 'AAA', + -seq => 'aawtat-tn-', + -start => 12, + -end => 19, + -alphabet => 'dna' +); +my $s2 = Bio::LocatableSeq->new( + -id => 'BBB', + -seq => '-aaaat-tt-', + -start => 1, + -end => 7, + -alphabet => 'dna' +); +my $s3 = Bio::LocatableSeq->new( + -id => 'BBB', + -seq => '-aaaat-t--', + -start => 31, + -end => 36, + -alphabet => 'dna' +); + + +$a = Bio::SimpleAlign->new(); +$a->add_Seq($s1); +$a->add_Seq($s2); +$a->add_Seq($s3); + +$a->uppercase(); +is ($s1->seq,'AAWTAT-TN-', "uppercase"); +$a->lowercase(); +is ($s1->seq,'aawtat-tn-', "lowercase"); +$a->togglecase(); +is ($s1->seq,'AAWTAT-TN-', "togglecase"); +$a->lowercase(); + +@seqs = $a->next_Seq; +is( $seqs[0]->start, 12 ); +is( $seqs[1]->start, 1 ); +is( $seqs[2]->start, 31 ); + +$a->sort_by_start; +@seqs = $a->next_Seq; + +is( $seqs[0]->start, 1 , "sort_by_start"); +is( $seqs[1]->start, 12 ); +is( $seqs[2]->start, 31 ); + + +$a->sort_by_length; +@seqs = $a->next_Seq; + +is( $seqs[0]->start, 31 , "sort_by_length"); +is( $seqs[1]->start, 1 ); +is( $seqs[2]->start, 12 ); + +$a->sort_by_pairwise_identity(3); +@seqs = $a->next_Seq; + +is( $seqs[0]->start, 12 , "sort_by_pairwise_identity"); +is( $seqs[1]->start, 1 ); +is( $seqs[2]->start, 31 ); + +my %testdata = ( + 'allele1' => 'GGATCCATT[C/C]CTACT', + 'allele2' => 'GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT', + 'allele3' => 'G[G/C]ATCCATT[C/G]CTACT', + 'allele4' => 'GGATCCATT[C/G]CTACT', + 'allele5' => 'GGATCCATT[C/G]CTAC[T/A]', + 'allele6' => 'GGATCCATT[C/G]CTA[C/G][T/A]', + 'testseq' => 'GGATCCATT[C/G]CTACT' +); + +my $alnin = Bio::AlignIO->new( + -format => 'fasta', + -file => test_input_file('alleles.fas') +); + +$aln = $alnin->next_aln; + +my $ct = 0; + +# compare all to test seq + +for my $ls ( sort keys %testdata ) { + $ct++; + my $str = $aln->bracket_string( + -refseq => 'testseq', + -allele1 => 'allele1', + -allele2 => $ls, + ); + is( $str, $testdata{$ls}, "BIC:$str" ); +} + +%testdata = ( + 'allele1' => 'GGATCCATT{C.C}CTACT', + 'allele2' => 'GGAT{C.-}{C.-}ATT{C.C}CT{A.C}CT', + 'allele3' => 'G{G.C}ATCCATT{C.G}CTACT', + 'allele4' => 'GGATCCATT{C.G}CTACT', + 'allele5' => 'GGATCCATT{C.G}CTAC{T.A}', + 'allele6' => 'GGATCCATT{C.G}CTA{C.G}{T.A}', + 'testseq' => 'GGATCCATT{C.G}CTACT' +); + +for my $ls ( sort keys %testdata ) { + $ct++; + my $str = $aln->bracket_string( + -refseq => 'testseq', + -allele1 => 'allele1', + -allele2 => $ls, + -delimiters => '{}', + -separator => '.' + ); + is( $str, $testdata{$ls}, "BIC:$str" ); +} + +# is _remove_col really working correctly? +my $a = Bio::LocatableSeq->new( + -id => 'a', + -strand => 1, + -seq => 'atcgatcgatcgatcg', + -start => 5, + -end => 20 +); +my $b = Bio::LocatableSeq->new( + -id => 'b', + -strand => 1, + -seq => '-tcgatc-atcgatcg', + -start => 30, + -end => 43 +); +my $c = Bio::LocatableSeq->new( + -id => 'c', + -strand => -1, + -seq => 'atcgatcgatc-atc-', + -start => 50, + -end => 63 +); +my $d = Bio::LocatableSeq->new( + -id => 'd', + -strand => -1, + -seq => '--cgatcgatcgat--', + -start => 80, + -end => 91 +); +my $e = Bio::LocatableSeq->new( + -id => 'e', + -strand => 1, + -seq => '-t-gatcgatcga-c-', + -start => 100, + -end => 111 +); +$aln = Bio::SimpleAlign->new(); +$aln->add_Seq($a); +$aln->add_Seq($b); +$aln->add_Seq($c); + + +my $gapless = $aln->remove_gaps(); +foreach my $seq ( $gapless->next_Seq ) { + if ( $seq->id eq 'a' ) { + is $seq->start, 6; + is $seq->end, 19; + is $seq->seq, 'tcgatcatcatc','remove_gaps ok'; + } + elsif ( $seq->id eq 'b' ) { + is $seq->start, 30; + is $seq->end, 42; + is $seq->seq, 'tcgatcatcatc'; + } + elsif ( $seq->id eq 'c' ) { + is $seq->start, 50; + is $seq->end, 62; + is $seq->seq, 'tcgatcatcatc'; + } +} + +$aln->add_Seq($d); +$aln->add_Seq($e); + + +$gapless = $aln->remove_gaps(); +foreach my $seq ( $gapless->next_Seq ) { + if ( $seq->id eq 'a' ) { + is $seq->start, 8; + is $seq->end, 17; + is $seq->seq, 'gatcatca'; + } + elsif ( $seq->id eq 'b' ) { + is $seq->start, 32; + is $seq->end, 40; + is $seq->seq, 'gatcatca'; + } + elsif ( $seq->id eq 'c' ) { + is $seq->start, 52; + is $seq->end, 60; + is $seq->seq, 'gatcatca'; + } + elsif ( $seq->id eq 'd' ) { + is $seq->start, 81; + is $seq->end, 90; + is $seq->seq, 'gatcatca'; + } + elsif ( $seq->id eq 'e' ) { + is $seq->start, 101; + is $seq->end, 110; + is $seq->seq, 'gatcatca'; + } +} + +# bug 3077 + +my $slice = $aln->select_columns([1..4]); + +for my $seq ($slice->next_Seq) { + if ( $seq->id eq 'a' ) { + is $seq->start, 5; + is $seq->end, 8; + is $seq->strand, 1; + is $seq->seq, 'atcg'; + } + elsif ( $seq->id eq 'b' ) { + is $seq->start, 30; + is $seq->end, 32; + is $seq->strand, 1; + is $seq->seq, '-tcg'; + } + elsif ( $seq->id eq 'c' ) { + is $seq->start, 60; + is $seq->end, 63; + is $seq->strand, -1; + is $seq->seq, 'atcg'; + } + elsif ( $seq->id eq 'd' ) { + is $seq->start, 90; + is $seq->end, 91; + is $seq->strand, -1; + is $seq->seq, '--cg'; + } + elsif ( $seq->id eq 'e' ) { + is $seq->start, 100; + is $seq->end, 101; + is $seq->strand, 1; + is $seq->seq, '-t-g'; + } +} + +my $f = Bio::LocatableSeq->new( + -id => 'f', + -seq => 'a-cgatcgatcgat-g', + -start => 30, + -end => 43 +); +$aln = Bio::SimpleAlign->new(); +$aln->add_Seq($a); +$aln->add_Seq($f); + +$gapless = $aln->remove_gaps(); +foreach my $seq ( $gapless->next_Seq ) { + if ( $seq->id eq 'a' ) { + is $seq->start, 5; + is $seq->end, 20; + is $seq->seq, 'acgatcgatcgatg'; + } + elsif ( $seq->id eq 'f' ) { + is $seq->start, 30; + is $seq->end, 43; + is $seq->seq, 'acgatcgatcgatg'; + } +} + +my $g = + Bio::LocatableSeq->new( -id => 'g', -seq => 'atgc', -start => 5, -end => 8 ); +my $h = Bio::LocatableSeq->new( + -id => 'h', + -seq => 't-cg', + -start => 30, + -end => 32 +); +$aln = Bio::SimpleAlign->new(); +$aln->add_Seq($g); +$aln->add_Seq($h); + +# test for new method in API get_seq_by_id +my $retrieved = $aln->get_Seq_by_id('g'); +is( defined $retrieved, 1 ); +my $removed = $aln->remove_columns(-selection=>[1],-toggle=>1); +foreach my $seq ( $removed->next_Seq ) { + if ( $seq->id eq 'g' ) { + is $seq->start, 5, "remove_columns toggle selection"; + is $seq->end, 5; + is $seq->seq, 'a'; + } + elsif ( $seq->id eq 'h' ) { + is $seq->start, 30; + is $seq->end, 30; + is $seq->seq, 't'; + } +} + +# work out mask_columns(), see bug 2842 +SKIP: { + test_skip(-tests => 6, -requires_module => 'IO::String'); + my $io = Bio::AlignIO->new( -file => test_input_file("testaln.aln") ); + my $aln = $io->next_aln(); + isa_ok( $aln, 'Bio::SimpleAlign' ); + my $consensus = <consensus_conservation; + # 422 positions, mostly two of six sequences conserved, set as default + my @cons_expect = (100 * 2/6) x 422; + # Exceptionally columns as a mask, manually determined (1-based columns) + $cons_expect[$_-1] = 100 * 1/6 for (5,12,41,70,82,310,390); + $cons_expect[$_-1] = 100 * 3/6 for (27,30,32,36,47,49,61,66,69,71,77,79, + 81,91,96,97,105,114,115,117,118,121,122,129,140,146,156,159,160,162, + 183,197,217,221,229,242,247,248,261,266,282,287,295,316,323,329,335,337,344,); + $cons_expect[$_-1] = 100 * 4/6 for (84,93,99,100,102,107,108,112,113,119,150,); + $cons_expect[$_-1] = 100 * 5/6 for (81,110); + # Format for string comparison + @cons_expect = map { sprintf "%4.1f", $_ } @cons_expect; + @cons_got = map { sprintf "%4.1f", $_ } @cons_got; + is(length($aln->consensus_string), scalar(@cons_got),"conservation length"); + is_deeply(\@cons_got, \@cons_expect, "conservation scores"); + + is( $aln->consensus_string, $consensus, 'consensus string looks ok' ); + + is( aln2str( $aln => 'pfam' ), <mask_columns([12..20]); + is( aln2str( $newaln, 'pfam' ), <mask_char('N'); + + my $newaln2 = $aln->mask_columns([12..20]); + is( aln2str( $newaln2, 'pfam' ), <mask_char('?'); + + +###### test with phylip + + my $phy_fh = IO::String->new( <new( -fh => $phy_fh, -format => 'phylip' ); + + $aln = $in->next_aln(); + is( aln2str( $aln, 'phylip' ), <mask_columns([15..20]); + is( aln2str( $newaln,'phylip' ), <new( $out ); + my $alignio_out = Bio::AlignIO->new(-fh => $out_fh, -format => $fmt); + $alignio_out->write_aln( $aln ); + return $out; +} + From 00075252edc8a0afa85f7cc9cb066ba3b3fddb2d Mon Sep 17 00:00:00 2001 From: Jun Yin Date: Tue, 2 Oct 2012 14:37:59 -0400 Subject: [PATCH 5/8] up-to-date Oct 2,2012 --- Bio/SimpleAlign.pm | 2 ++ t/Align/SimpleAlign2.t | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Bio/SimpleAlign.pm b/Bio/SimpleAlign.pm index 0cb49c3f43..959644ef7a 100644 --- a/Bio/SimpleAlign.pm +++ b/Bio/SimpleAlign.pm @@ -2714,12 +2714,14 @@ sub mask_char { =head2 symbol_chars + Title : symbol_chars Usage : my @symbolchars = $aln->symbol_chars; Function: Returns all the seen symbols (other than gaps) Returns : array of characters that are the seen symbols Args : boolean to include the gap/missing/match characters + =cut sub symbol_chars{ diff --git a/t/Align/SimpleAlign2.t b/t/Align/SimpleAlign2.t index e1ca10b0f0..35cf00f688 100644 --- a/t/Align/SimpleAlign2.t +++ b/t/Align/SimpleAlign2.t @@ -850,18 +850,18 @@ EOF $aln = $in->next_aln(); is( aln2str( $aln, 'phylip' ), <mask_columns([15..20]); is( aln2str( $newaln,'phylip' ), < Date: Tue, 2 Oct 2012 16:02:46 -0400 Subject: [PATCH 6/8] Backward support for mask_columsn --- Bio/SimpleAlign.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Bio/SimpleAlign.pm b/Bio/SimpleAlign.pm index 959644ef7a..da6a41f65c 100644 --- a/Bio/SimpleAlign.pm +++ b/Bio/SimpleAlign.pm @@ -1603,7 +1603,7 @@ sub mask_columns { unless(ref($args[0])) { $self->deprecated(-warn_version => 1.0080, -throw_version => 1.0090, -message =>"mask_columns - column range must be in refrence."); my ($start, $end, $mask_char) = @args; - $sel=[($start, $end)]; + $sel=[$start..$end]; if(defined $mask_char) { $self->mask_char($mask_char); } From 72b46db98fab5e9a8c92dbb1cb4c8edf4f68e283 Mon Sep 17 00:00:00 2001 From: Jun Yin Date: Tue, 2 Oct 2012 17:26:13 -0400 Subject: [PATCH 7/8] Improved backward support for remove_columns --- Bio/SimpleAlign.pm | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/Bio/SimpleAlign.pm b/Bio/SimpleAlign.pm index da6a41f65c..1845b14215 100644 --- a/Bio/SimpleAlign.pm +++ b/Bio/SimpleAlign.pm @@ -83,7 +83,8 @@ However, in some cases people do not want the name/start-end to be displayed: either multiple names in an alignment or names specific to the alignment (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called C, and generally is what is used to print out the -alignment. They default to name/start-end. +alignment. They default to name/start-end. Please be cautious that, the +sequence coordinates are 1 based for all the methods in Bio::SimpleAlign. The SimpleAlign Module is derived from the Align module by Ewan Birney. This package is refactored by Jun Yin as part of the Google Summer of @@ -1468,7 +1469,32 @@ sub _select_columns_by_type { sub remove_columns { my ($self,@args) = @_; @args || $self->throw("Must supply column ranges or column types"); - my ($sel, $toggle,$gap) = $self->_rearrange([qw(SELECTION TOGGLE KEEPGAPONLY)], @args); + + #provide backward support + #backward support for remove columns using multiple anonymous array reference + if(defined $args[1] && ref($args[0]) && ref($args[1])) { + $self->deprecated(-warn_version => 1.0060, -throw_version => 1.0080, -message =>"select_columns start is 1 based and in a single array. Please check the method description."); + #the old method use multiple arrays and the coordiantes are 0 based + my @newargs; + foreach my $arg (@args) { + push @{$newargs[0]},($arg->[0]+1)..($arg->[1]+1); + } + @args=@newargs; + } + + my ($sel, $toggle,$gap) = $self->_rearrange([qw(SELECTION TOGGLE KEEPGAPONLY)], @args); + + #backward support for remove columns using 0-based coordinate + if($sel->[0] =~ /^\d+$/ && $sel->[0]==0) { + $self->deprecated(-warn_version => 1.0060, -throw_version => 1.0080, -message =>"select_columns start is 1 based and in a single array. Please check the method description."); + #the old method use multiple arrays and the coordiantes are 0 based + my @newargs; + foreach my $arg (@{$sel}) { + push @{$newargs[0]},$arg+1; + } + @{$sel}=@newargs; + } + my $newaln=$self->select_columns(-selection=>$sel,-toggle=>$toggle?0:1,-keepgaponly=>$gap); return $newaln; } From 3dd4aeeaba3ab2f8837cecb1797d27d3db540f79 Mon Sep 17 00:00:00 2001 From: Jun Yin Date: Tue, 2 Oct 2012 17:55:44 -0400 Subject: [PATCH 8/8] remove_columns updated --- Bio/SimpleAlign.pm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Bio/SimpleAlign.pm b/Bio/SimpleAlign.pm index 1845b14215..38a8bbcb02 100644 --- a/Bio/SimpleAlign.pm +++ b/Bio/SimpleAlign.pm @@ -1484,15 +1484,15 @@ sub remove_columns { my ($sel, $toggle,$gap) = $self->_rearrange([qw(SELECTION TOGGLE KEEPGAPONLY)], @args); - #backward support for remove columns using 0-based coordinate + #if there is a 0 based coordinate, all coordinates will be added by 1 if($sel->[0] =~ /^\d+$/ && $sel->[0]==0) { $self->deprecated(-warn_version => 1.0060, -throw_version => 1.0080, -message =>"select_columns start is 1 based and in a single array. Please check the method description."); #the old method use multiple arrays and the coordiantes are 0 based - my @newargs; + my @newsel; foreach my $arg (@{$sel}) { - push @{$newargs[0]},$arg+1; + push @newsel,$arg+1; } - @{$sel}=@newargs; + $sel=[@newsel]; } my $newaln=$self->select_columns(-selection=>$sel,-toggle=>$toggle?0:1,-keepgaponly=>$gap);