BaseAlignFeature.pm 24.7 KB
Newer Older
1 2
=head1 LICENSE

3
  Copyright (c) 1999-2010 The European Bioinformatics Institute and
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <ensembl-dev@ebi.ac.uk>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut
20 21 22

=head1 NAME

23 24
Bio::EnsEMBL::BaseAlignFeature - Baseclass providing a common abstract
implmentation for alignment features
25 26 27

=head1 SYNOPSIS

28 29 30 31 32 33 34 35 36 37 38
  my $feat = new Bio::EnsEMBL::DnaPepAlignFeature(
    -slice        => $slice,
    -start        => 100,
    -end          => 120,
    -strand       => 1,
    -hseqname     => 'SP:RF1231',
    -hstart       => 200,
    -hend         => 220,
    -analysis     => $analysis,
    -cigar_string => '10M3D5M2I'
  );
39

40
  Alternatively if you have an array of ungapped features
41

42 43
  my $feat =
    new Bio::EnsEMBL::DnaPepAlignFeature( -features => \@features );
44

45
  Where @features is an array of Bio::EnsEMBL::FeaturePair
46

47
  There is a method to manipulate the cigar_string into ungapped features
48

49
  my @ungapped_features = $feat->ungapped_features();
50

51
  This converts the cigar string into an array of Bio::EnsEMBL::FeaturePair
52

53
  $analysis is a Bio::EnsEMBL::Analysis object
54 55

  Bio::EnsEMBL::Feature methods can be used
56
  Bio::EnsEMBL::FeaturePair methods can be used
57

58 59
  The cigar_string contains the ungapped pieces that make up the gapped 
  alignment
60

61 62 63
  It looks like: n Matches [ x Deletes or Inserts m Matches ]*
  but a bit more condensed like "23M4I12M2D1M"
  and evenmore condensed as you can ommit 1s "23M4I12M2DM"
64 65


66
  To make things clearer this is how a blast HSP would be parsed
67

68 69
  >AK014066
         Length = 146
70

71
    Minus Strand HSPs:
72

73 74
    Score = 76 (26.8 bits), Expect = 1.4, P = 0.74
    Identities = 20/71 (28%), Positives = 29/71 (40%), Frame = -1
75

76 77 78
  Query:   479 GLQAPPPTPQGCRLIPPPPLGLQAPLPTLRAVGSSHHHP*GRQGSSLSSFRSSLASKASA 300
               G  APPP PQG R   P P G + P   L             + + ++  R  +A   +
  Sbjct:     7 GALAPPPAPQG-RWAFPRPTG-KRPATPLHGTARQDRQVRRSEAAKVTGCRGRVAPHVAP 64
79

80 81 82
  Query:   299 SSPHNPSPLPS 267
                  H P+P P+
  Sbjct:    65 PLTHTPTPTPT 75
83

84 85
  The alignment goes from 267 to 479 in sequence 1 and 7 to 75 in sequence 2 
  and the strand is -1.
86

87
  The alignment is made up of the following ungapped pieces :
88

89 90 91
  sequence 1 start 447 , sequence 2 start 7  , match length 33 , strand -1
  sequence 1 start 417 , sequence 2 start 18 , match length 27 , strand -1
  sequence 1 start 267 , sequence 2 start 27 , match length 137 , strand -1
92

93 94 95
  These ungapped pieces are made up into the following string (called a cigar 
  string) "33M3I27M3I137M" with start 267 end 479 strand -1 hstart 7 hend 75 
  hstrand 1 and feature type would be DnaPepAlignFeature
96 97

=cut
98

99

100 101 102
package Bio::EnsEMBL::BaseAlignFeature;

use Bio::EnsEMBL::FeaturePair;
103 104
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
105 106 107 108

use vars qw(@ISA);
use strict;

109
@ISA = qw(Bio::EnsEMBL::FeaturePair);
110

111 112 113 114 115

=head2 new

  Arg [..]   : List of named arguments. (-cigar_string , -features) defined
               in this constructor, others defined in FeaturePair and 
Graham McVicker's avatar
Graham McVicker committed
116 117 118
               SeqFeature superclasses.  Either cigar_string or a list
               of ungapped features should be provided - not both.
  Example    : $baf = new BaseAlignFeatureSubclass(-cigar_string => '3M3I12M');
119
  Description: Creates a new BaseAlignFeature using either a cigarstring or
Graham McVicker's avatar
Graham McVicker committed
120 121 122 123 124 125 126
               a list of ungapped features.  BaseAlignFeature is an abstract
               baseclass and should not actually be instantiated - rather its
               subclasses should be.
  Returntype : Bio::EnsEMBL::BaseAlignFeature
  Exceptions : thrown if both feature and cigar string args are provided
               thrown if neither feature nor cigar string args are provided
  Caller     : general
127
  Status     : Stable
128 129 130

=cut

131
sub new {
132
  
133 134 135 136 137 138 139 140 141 142 143 144
  my $caller = shift;

  my $class = ref($caller) || $caller;

  my $self = $class->SUPER::new(@_);

  my ($cigar_string,$features) = rearrange([qw(CIGAR_STRING FEATURES)], @_);

  if (defined($cigar_string) && defined($features)) {
    throw("CIGAR_STRING or FEATURES argument is required - not both.");
  } elsif (defined($features)) {
    $self->_parse_features($features);
Laura Clarke's avatar
 
Laura Clarke committed
145
    
146 147 148 149 150
  } elsif (defined($cigar_string)) {
    $self->{'cigar_string'} = $cigar_string;
  } else {
    throw("CIGAR_STRING or FEATURES argument is required");
  }
Laura Clarke's avatar
 
Laura Clarke committed
151
  
152
  return $self;
153 154 155
}


156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
=head2 new_fast

  Arg [1]    : hashref $hashref
               A hashref which will be blessed into a PepDnaAlignFeature.
  Example    : none
  Description: This allows for very fast object creation when a large number
               of PepDnaAlignFeatures needs to be created.  This is a bit of
               a hack but necessary when thousands of features need to be
               generated within a couple of seconds for web display. It is
               not recommended that this method be called unless you know what
               you are doing.  It requires knowledge of the internals of this
               class and its superclasses.
  Returntype : Bio::EnsEMBL::BaseAlignFeature
  Exceptions : none
  Caller     : none currently
171
  Status     : Stable
172 173 174 175 176 177 178 179

=cut

sub new_fast {
  my ($class, $hashref) = @_;

  return bless $hashref, $class;
}
Graham McVicker's avatar
Graham McVicker committed
180

181

Arne Stabenau's avatar
Arne Stabenau committed
182
=head2 cigar_string
183

Arne Stabenau's avatar
Arne Stabenau committed
184
  Arg [1]    : string $cigar_string
185
  Example    : ( "12MI3M" )
Arne Stabenau's avatar
Arne Stabenau committed
186 187 188
  Description: get/set for attribute cigar_string
               cigar_string describes the alignment. "xM" stands for 
               x matches (mismatches), "xI" for inserts into query sequence 
Graham McVicker's avatar
Graham McVicker committed
189 190
               (thats the ensembl sequence), "xD" for deletions 
               (inserts in the subject). an "x" that is 1 can be omitted.
Arne Stabenau's avatar
Arne Stabenau committed
191
  Returntype : string
192
  Exceptions : none
Arne Stabenau's avatar
Arne Stabenau committed
193
  Caller     : general
194
  Status     : Stable
195 196 197 198

=cut

sub cigar_string {
199 200 201
  my $self = shift;
  $self->{'cigar_string'} = shift if(@_);
  return $self->{'cigar_string'};
202 203 204
}


205 206 207 208 209 210 211 212
=head2 alignment_length

  Arg [1]    : None
  Example    : 
  Description: return the alignment length (including indels) based on the cigar_string
  Returntype : int
  Exceptions : 
  Caller     : 
213
  Status     : Stable
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237

=cut

sub alignment_length {
  my $self = shift;

  if (! defined $self->{'_alignment_length'} && defined $self->cigar_string) {
    
    my @pieces = ( $self->cigar_string =~ /(\d*[MDI])/g );
    unless (@pieces) {
      print STDERR "Error parsing cigar_string\n";
    }
    my $alignment_length = 0;
    foreach my $piece (@pieces) {
      my ($length) = ( $piece =~ /^(\d*)/ );
      if (! defined $length || $length eq "") {
        $length = 1;
      }
      $alignment_length += $length;
    }
    $self->{'_alignment_length'} = $alignment_length;
  }
  return $self->{'_alignment_length'};
}
Graham McVicker's avatar
Graham McVicker committed
238

239 240
=head2 ungapped_features

Arne Stabenau's avatar
Arne Stabenau committed
241
  Args       : none
Graham McVicker's avatar
Graham McVicker committed
242
  Example    : @ungapped_features = $align_feature->get_feature
Arne Stabenau's avatar
Arne Stabenau committed
243 244 245 246 247
  Description: converts the internal cigar_string into an array of
               ungapped feature pairs
  Returntype : list of Bio::EnsEMBL::FeaturePair
  Exceptions : cigar_string not set internally
  Caller     : general
248
  Status     : Stable
249

Arne Stabenau's avatar
Arne Stabenau committed
250
=cut
251 252 253 254

sub ungapped_features {
  my ($self) = @_;

255 256
  if (!defined($self->{'cigar_string'})) {
    throw("No cigar_string defined.  Can't return ungapped features");
257 258
  }

259
  return @{$self->_parse_cigar()};
260 261
}

262 263 264 265 266 267 268 269 270 271 272 273 274 275
=head2 strands_reversed
 
  Arg [1]    : int $strands_reversed
  Example    : none
  Description: get/set for attribute strands_reversed
               0 means that strand and hstrand are the original strands obtained
                 from the alignment program used
               1 means that strand and hstrand have been flipped as compared to
                 the original result provided by the alignment program used.
                 You may want to use the reverse_complement method to restore the
                 original strandness.
  Returntype : int
  Exceptions : none
  Caller     : general
276
  Status     : Stable
277 278
 
=cut
Graham McVicker's avatar
Graham McVicker committed
279

280 281 282 283 284 285
sub strands_reversed {
   my ($self, $arg) = @_;
 
   if ( defined $arg ) {
      $self->{'strands_reversed'} = $arg ;
   }
286 287 288

   $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'});

289 290
   return $self->{'strands_reversed'};
}
Graham McVicker's avatar
Graham McVicker committed
291

292 293 294 295 296 297 298 299 300
=head2 reverse_complement

  Args       : none
  Example    : none
  Description: reverse complement the FeaturePair,
               modifing strand, hstrand and cigar_string in consequence
  Returntype : none
  Exceptions : none
  Caller     : general
301
  Status     : Stable
302 303 304 305 306 307 308 309 310

=cut

sub reverse_complement {
  my ($self) = @_;

  # reverse strand in both sequences
  $self->strand($self->strand * -1);
  $self->hstrand($self->hstrand * -1);
311

312 313 314 315 316 317 318 319
  # reverse cigar_string as consequence
  my $cigar_string = $self->cigar_string;
  $cigar_string =~ s/(D|I|M)/$1 /g;
  my @cigar_pieces = split / /,$cigar_string;
  $cigar_string = "";
  while (my $piece = pop @cigar_pieces) {
    $cigar_string .= $piece;
  }
320 321 322 323 324 325 326 327
  
  $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'});

  if ($self->strands_reversed) {
    $self->strands_reversed(0)
  } else {
    $self->strands_reversed(1);
  }
328 329 330

  $self->cigar_string($cigar_string);
}
331

Graham McVicker's avatar
Graham McVicker committed
332 333


334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362
=head2 transform

  Arg  1     : String $coordinate_system_name
  Arg [2]    : String $coordinate_system_version
  Example    : $feature = $feature->transform('contig');
               $feature = $feature->transform('chromosome', 'NCBI33');
  Description: Moves this AlignFeature to the given coordinate system.
               If the feature cannot be transformed to the destination 
               coordinate system undef is returned instead.
  Returntype : Bio::EnsEMBL::BaseAlignFeature;
  Exceptions : wrong parameters
  Caller     : general
  Status     : Medium Risk
             : deprecation needs to be removed at some time

=cut

sub transform {
  my $self = shift;

  # catch for old style transform calls
  if( ref $_[0] eq 'HASH') {
    deprecate("Calling transform with a hashref is deprecate.\n" .
              'Use $feat->transfer($slice) or ' .
              '$feat->transform("coordsysname") instead.');
    my (undef, $new_feat) = each(%{$_[0]});
    return $self->transfer($new_feat->slice);
  }

363 364 365 366 367
  my $new_feature = $self->SUPER::transform(@_);
  if ( !defined($new_feature)
    || $new_feature->length() != $self->length() )
  {
    my @segments = @{ $self->project(@_) };
368

369 370 371
    if ( !@segments ) {
      return undef;
    }
372 373

    my @ungapped;
374 375 376 377
    foreach my $f ( $self->ungapped_features() ) {
      $f = $f->transform(@_);
      if ( defined($f) ) {
        push( @ungapped, $f );
378
      } else {
379 380
        warning( "Failed to transform alignment feature; "
            . "ungapped component could not be transformed" );
381 382 383 384
        return undef;
      }
    }

385 386
    eval { $new_feature = $self->new( -features => \@ungapped ); };

387 388 389 390
    if ($@) {
      warning($@);
      return undef;
    }
391
  } ## end if ( !defined($new_feature...))
392 393 394 395 396

  return $new_feature;
}


397 398 399 400
=head2 _parse_cigar

  Args       : none
  Example    : none
Graham McVicker's avatar
Graham McVicker committed
401 402
  Description: PRIVATE (internal) method - creates ungapped features from 
               internally stored cigar line
403 404 405
  Returntype : list of Bio::EnsEMBL::FeaturePair
  Exceptions : none
  Caller     : ungapped_features
406
  Status     : Stable
407 408 409

=cut

410 411 412 413 414
sub _parse_cigar {
  my ( $self ) = @_;

  my $query_unit = $self->_query_unit();
  my $hit_unit = $self->_hit_unit();
415

416 417 418
  my $string = $self->{'cigar_string'};

  throw("No cigar string defined in object") if(!defined($string));
419 420

  my @pieces = ( $string =~ /(\d*[MDI])/g );
Laura Clarke's avatar
 
Laura Clarke committed
421
  #print "cigar: ",join ( ",", @pieces ),"\n";
422

423
  my @features;
424 425
  my $strand1 = $self->{'strand'} || 1;
  my $strand2 = $self->{'hstrand'}|| 1;
426 427

  my ( $start1, $start2 );
428

429
  if( $strand1 == 1 ) {
430
    $start1 = $self->{'start'};
431
  } else {
432
    $start1 = $self->{'end'};
433 434 435
  }

  if( $strand2 == 1 ) {
436
    $start2 = $self->{'hstart'};
437
  } else {
438
    $start2 = $self->{'hend'};
439 440
  }

441 442 443
  #
  # Construct ungapped blocks as FeaturePairs objects for each MATCH
  #
444 445 446 447 448 449 450 451 452 453 454 455 456 457 458
  foreach my $piece (@pieces) {

    my ($length) = ( $piece =~ /^(\d*)/ );
    if( $length eq "" ) { $length = 1 }
    my $mapped_length;

    # explicit if statements to avoid rounding problems
    # and make sure we have sane coordinate systems
    if( $query_unit == 1 && $hit_unit == 3 ) {
      $mapped_length = $length*3;
    } elsif( $query_unit == 3 && $hit_unit == 1 ) {
      $mapped_length = $length / 3;
    } elsif ( $query_unit == 1 && $hit_unit == 1 ) {
      $mapped_length = $length;
    } else {
459 460
      throw("Internal error $query_unit $hit_unit, currently only " .
            "allowing 1 or 3 ");
461
    }
462

463 464
    if( int($mapped_length) != $mapped_length and
        ($piece =~ /M$/ or $piece =~ /D$/)) {
465 466
      throw("Internal error with mismapped length of hit, query " .
            "$query_unit, hit $hit_unit, length $length");
467 468 469
    }

    if( $piece =~ /M$/ ) {
470 471 472 473
      #
      # MATCH
      #
      my ( $qstart, $qend);
474
      if( $strand1 == 1 ) {
475 476 477
        $qstart = $start1;
        $qend = $start1 + $length - 1;
        $start1 = $qend + 1;
478
      } else {
479 480 481
        $qend = $start1;
        $qstart = $start1 - $length + 1;
        $start1 = $qstart - 1;
482
      }
483

484
      my ($hstart, $hend);
485
      if( $strand2 == 1 ) {
486 487 488
        $hstart = $start2;
        $hend = $start2 + $mapped_length - 1;
        $start2 = $hend + 1;
489
      } else {
490 491 492
        $hend = $start2;
        $hstart = $start2 - $mapped_length + 1;
        $start2 = $hstart - 1;
493 494
      }

495

496 497
      push @features, Bio::EnsEMBL::FeaturePair->new
        (-SLICE      => $self->{'slice'},
498
         -SEQNAME   => $self->{'seqname'},
499 500 501
         -START      => $qstart,
         -END        => $qend,
         -STRAND     => $strand1,
Abel Ureta-Vidal's avatar
Abel Ureta-Vidal committed
502
         -HSLICE      => $self->{'hslice'},
503 504 505 506 507 508 509
         -HSEQNAME   => $self->{'hseqname'},
         -HSTART     => $hstart,
         -HEND       => $hend,
         -HSTRAND    => $strand2,
         -SCORE      => $self->{'score'},
         -PERCENT_ID => $self->{'percent_id'},
         -ANALYSIS   => $self->{'analysis'},
510
         -P_VALUE    => $self->{'p_value'},
511 512
         -EXTERNAL_DB_ID => $self->{'external_db_id'}, 
         -HCOVERAGE   => $self->{'hcoverage'},
513 514 515
         -GROUP_ID    => $self->{'group_id'},
         -LEVEL_ID    => $self->{'level_id'});
      
516 517 518

      # end M cigar bits 
    } elsif( $piece =~ /I$/ ) {
519 520 521
      #
      # INSERT
      #
522
      if( $strand1 == 1 ) {
523
        $start1 += $length;
524
      } else {
525
        $start1 -= $length;
526 527
      }
    } elsif( $piece =~ /D$/ ) {
528 529 530
      #
      # DELETION
      #
531
      if( $strand2 == 1 ) {
532
        $start2 += $mapped_length;
533
      } else {
534
        $start2 -= $mapped_length;
535 536
      }
    } else {
537 538
      throw( "Illegal cigar line $string!" );
    }
539
  }
540 541

  return \@features;
542 543 544 545 546
}




547
=head2 _parse_features
548

549 550 551 552 553
  Arg  1     : listref Bio::EnsEMBL::FeaturePair $ungapped_features
  Example    : none
  Description: creates internal cigarstring and start,end hstart,hend
               entries.
  Returntype : none, fills in values of self
Graham McVicker's avatar
Graham McVicker committed
554
  Exceptions : argument list undergoes many sanity checks - throws under many
555
               invalid conditions
556
  Caller     : new
557
  Status     : Stable
558

559
=cut
560

561 562
my $message_only_once = 1;

563 564
sub _parse_features {
  my ($self,$features ) = @_;
565

566 567
  my $query_unit = $self->_query_unit();
  my $hit_unit = $self->_hit_unit();
568

569
  if (ref($features) ne "ARRAY") {
570
    throw("features must be an array reference not a [".ref($features)."]");
571 572
  }

573 574 575 576
  my $strand  = $features->[0]->strand;

  throw ('FeaturePair needs to have strand == 1 or strand == -1') if(!$strand);

577
  my @f;
578

579 580 581 582 583 584 585 586 587
  #
  # Sort the features on their start position
  # Ascending order on positive strand, descending on negative strand
  #
  if( $strand == 1 ) {
    @f = sort {$a->start <=> $b->start} @$features;
  } else {
    @f = sort { $b->start <=> $a->start} @$features;
  }
588

589
  my $hstrand     = $f[0]->hstrand;
590
  my $slice       = $f[0]->slice();
Abel Ureta-Vidal's avatar
Abel Ureta-Vidal committed
591
  my $hslice       = $f[0]->hslice();
592
  my $name        = $slice->name() if($slice);
593 594 595 596
  my $hname       = $f[0]->hseqname;
  my $score       = $f[0]->score;
  my $percent     = $f[0]->percent_id;
  my $analysis    = $f[0]->analysis;
597
  my $pvalue      = $f[0]->p_value();
598 599
  my $external_db_id = $f[0]->external_db_id;
  my $hcoverage   = $f[0]->hcoverage;
600 601 602
  my $group_id    = $f[0]->group_id;
  my $level_id    = $f[0]->level_id;

Laura Clarke's avatar
 
Laura Clarke committed
603
  my $seqname = $f[0]->seqname;
604
  # implicit strand 1 for peptide sequences
605 606
  $strand  ||= 1;
  $hstrand ||= 1;
607
  my $ori = $strand * $hstrand;
608 609

  throw("No features in the array to parse") if(scalar(@f) == 0);
610

611 612
  my $prev1; # where last feature q part ended
  my $prev2; # where last feature s part ended
613

614 615
  my $string;

616 617
  # Use strandedness info of query and hit to make sure both sets of 
  # start and end  coordinates are oriented the right way around.
618 619
  my $f1start;
  my $f1end;
620 621
  my $f2end;
  my $f2start;
622

623
  if ($strand == 1) {
624 625
    $f1start = $f[0]->start;
    $f1end = $f[-1]->end;
626
  } else {
627 628
    $f1start = $f[-1]->start;
    $f1end = $f[0]->end;
629
  }
630

631 632 633 634 635 636 637
  if ($hstrand == 1) {
    $f2start = $f[0]->hstart;
    $f2end = $f[-1]->hend;
  } else {
    $f2start = $f[-1]->hstart;
    $f2end = $f[0]->hend;
  }
638

639 640 641
  #
  # Loop through each portion of alignment and construct cigar string
  #
642

643 644 645 646
  foreach my $f (@f) {
    #
    # Sanity checks
    #
647

648
    if (!$f->isa("Bio::EnsEMBL::FeaturePair")) {
649
      throw("Array element [$f] is not a Bio::EnsEMBL::FeaturePair");
650
    }
651 652
    if( defined($f->hstrand()) && $f->hstrand() != $hstrand ) {
      throw("Inconsistent hstrands in feature array");
653
    }
654 655
    if( defined($f->strand()) && ($f->strand != $strand)) {
      throw("Inconsistent strands in feature array");
656
    }
657 658 659
    if ( defined($name) && $name ne $f->slice->name()) {
      throw("Inconsistent names in feature array [$name - ".
            $f->slice->name()."]");
660
    }
661 662 663
    if ( defined($hname) && $hname ne $f->hseqname) {
      throw("Inconsistent hit names in feature array [$hname - ".
            $f->hseqname . "]");
664
    }
665 666 667
    if ( defined($score) && $score ne $f->score) {
      throw("Inconsisent scores in feature array [$score - " .
            $f->score . "]");
668
    }
669
    if (defined($f->percent_id) && $percent ne $f->percent_id) {
670 671 672 673 674 675
      throw("Inconsistent pids in feature array [$percent - " .
            $f->percent_id . "]");
    }
    if(defined($pvalue) && $pvalue != $f->p_value()) {
      throw("Inconsistant p_values in feature arraw [$pvalue " .
            $f->p_value() . "]");
676
    }
Laura Clarke's avatar
 
Laura Clarke committed
677 678 679 680
    if($seqname && $seqname ne $f->seqname){
      throw("Inconsistent seqname in feature array [$seqname - ".
            $f->seqname . "]");
    }
681 682
    my $start1 = $f->start;      #source sequence alignment start
    my $start2 = $f->hstart();   #hit sequence alignment start
683

684 685 686 687 688
    #
    # More sanity checking
    #
    if (defined($prev1)) {
      if ( $strand == 1 ) {
689 690 691 692 693
        if ($f->start < $prev1) {
          throw("Inconsistent coords in feature array (forward strand).\n" .
		       "Start [".$f->start()."] in current feature should be greater " .
           "than previous feature end [$prev1].");
        }
694
      } else {
695 696 697 698 699
        if ($f->end > $prev1) {
          throw("Inconsistent coords in feature array (reverse strand).\n" .
                "End [".$f->end() ."] should be less than previous feature " .
                "start [$prev1].");
        }
700
      }
701
    }
702

703
    my $length = ($f->end - $f->start + 1);    #length of source seq alignment
704
    my $hlength = ($f->hend - $f->hstart + 1); #length of hit seq alignment
705

706 707
    # using multiplication to avoid rounding errors, hence the
    # switch from query to hit for the ratios
708

709 710 711 712 713 714
    #
    # Yet more sanity checking
    #
    if($query_unit > $hit_unit){
      # I am going to make the assumption here that this situation will 
      # only occur with DnaPepAlignFeatures, this may not be true
Laura Clarke's avatar
Laura Clarke committed
715 716 717
      my $query_p_length = sprintf "%.0f", ($length/$query_unit);
      my $hit_p_length = sprintf "%.0f", ($hlength * $hit_unit);
      if( $query_p_length != $hit_p_length) {
718
        print "$query_p_length (query_p_length) != $hit_p_length (hit_p_length)\n";
719 720 721
        throw( "Feature lengths not comparable Lengths:" .$length .
               " " . $hlength . " Ratios:" . $query_unit . " " .
               $hit_unit );
Arne Stabenau's avatar
Arne Stabenau committed
722
      }
723
    } else{
Laura Clarke's avatar
Laura Clarke committed
724 725
      my $query_d_length = sprintf "%.0f", ($length*$hit_unit);
      my $hit_d_length = sprintf "%.0f", ($hlength * $query_unit);
726
      if( $length * $hit_unit != $hlength * $query_unit ) {
727 728 729
        throw( "Feature lengths not comparable Lengths:" . $length .
               " " . $hlength . " Ratios:" . $query_unit . " " .
               $hit_unit );
Arne Stabenau's avatar
Arne Stabenau committed
730
      }
731 732 733
    }

    my $hlengthfactor = ($query_unit/$hit_unit);
734

735 736 737 738 739 740 741 742 743
    #
    # Check to see if there is an I type (insertion) gap:
    #   If there is a space between the end of the last source sequence 
    #   alignment and the start of this one, then this is an insertion
    #

    my $insertion_flag = 0;
    if( $strand == 1 ) {
      if( ( defined $prev1 ) && ( $f->start > $prev1 + 1  )) {
744

745 746 747 748 749 750 751
        #there is an insertion
        $insertion_flag = 1;
        my $gap = $f->start - $prev1 - 1;
        if( $gap == 1 ) {
          $gap = ""; # no need for a number if gap length is 1
        }
        $string .= "$gap"."I";
752

Arne Stabenau's avatar
Arne Stabenau committed
753
      }
754 755 756 757

      #shift our position in the source seq alignment
      $prev1 = $f->end();
    } else {
758

759 760
      if(( defined $prev1 ) && ($f->end + 1 < $prev1 )) {

761 762 763 764 765 766 767
        #there is an insertion
        $insertion_flag = 1;
        my $gap = $prev1 - $f->end() - 1;
        if( $gap == 1 ) {
          $gap = ""; # no need for a number if gap length is 1
        }
        $string .= "$gap"."I";
Arne Stabenau's avatar
Arne Stabenau committed
768
      }
769

770 771 772
      #shift our position in the source seq alignment
      $prev1 = $f->start();
    }
773

774 775 776 777 778 779 780
    #
    # Check to see if there is a D type (deletion) gap
    #   There is a deletion gap if there is a space between the end of the
    #   last portion of the hit sequence alignment and this one
    #
    if( $hstrand == 1 ) {
      if((  defined $prev2 ) && ( $f->hstart() > $prev2 + 1 )) {
781

782 783 784
        #there is a deletion
        my $gap = $f->hstart - $prev2 - 1;
        my $gap2 = int( $gap * $hlengthfactor + 0.5 );
785
	
786 787 788 789 790 791 792
        if( $gap2 == 1 ) {
          $gap2 = "";  # no need for a number if gap length is 1
        }
        $string .= "$gap2"."D";

        #sanity check,  Should not be an insertion and deletion
        if($insertion_flag) {
793 794 795 796 797
          if ($message_only_once) {
            warning("Should not be an deletion and insertion on the " .
                    "same alignment region. cigar_line=$string\n");
            $message_only_once = 0;
          }
798 799
        }
      }
800 801 802 803 804
      #shift our position in the hit seq alignment
      $prev2 = $f->hend();

     } else {
      if( ( defined $prev2 ) && ( $f->hend() + 1 < $prev2 )) {
805

806 807 808
        #there is a deletion
        my $gap = $prev2 - $f->hend - 1;
        my $gap2 = int( $gap * $hlengthfactor + 0.5 );
809
	
810 811 812 813 814 815 816
        if( $gap2 == 1 ) {
          $gap2 = "";  # no need for a number if gap length is 1
        }
        $string .= "$gap2"."D";

        #sanity check,  Should not be an insertion and deletion
        if($insertion_flag) {
817 818 819 820 821 822
          if ($message_only_once) {
            warning("Should not be an deletion and insertion on the " .
                    "same alignment region. prev2 = $prev2; f->hend() = " .
                    $f->hend() . "; cigar_line = $string;\n");
            $message_only_once = 0;
          }
823
        }
824
      }
825 826 827
      #shift our position in the hit seq alignment
      $prev2 = $f->hstart();
    }
828

829 830 831
    my $matchlength = $f->end() - $f->start() + 1;
    if( $matchlength == 1 ) {
      $matchlength = "";
832
    }
833 834
    $string .= $matchlength."M";
  }
835

836 837
  $self->{'start'}      = $f1start;
  $self->{'end'}        = $f1end;
Laura Clarke's avatar
 
Laura Clarke committed
838
  $self->{'seqname'}    = $seqname;
839 840 841 842 843
  $self->{'strand'}     = $strand;
  $self->{'score'}      = $score;
  $self->{'percent_id'} = $percent;
  $self->{'analysis'}   = $analysis;
  $self->{'slice'}      = $slice;
Abel Ureta-Vidal's avatar
Abel Ureta-Vidal committed
844
  $self->{'hslice'}     = $hslice;
845 846 847 848 849 850
  $self->{'hstart'}     = $f2start;
  $self->{'hend'}       = $f2end;
  $self->{'hstrand'}    = $hstrand;
  $self->{'hseqname'}   = $hname;
  $self->{'cigar_string'} = $string;
  $self->{'p_value'}      = $pvalue;
851 852
  $self->{'external_db_id'} = $external_db_id; 
  $self->{'hcoverage'}    = $hcoverage; 
853 854
  $self->{'group_id'}     = $group_id;
  $self->{'level_id'}     = $level_id;
855
}
856

Laura Clarke's avatar
 
Laura Clarke committed
857

Arne Stabenau's avatar
Arne Stabenau committed
858

859

Laura Clarke's avatar
 
Laura Clarke committed
860 861


862 863 864
=head2 _hit_unit

  Args       : none
Arne Stabenau's avatar
Arne Stabenau committed
865
  Example    : none
866 867 868
  Description: abstract method, overwrite with something that returns
               one or three
  Returntype : int 1,3
Arne Stabenau's avatar
Arne Stabenau committed
869
  Exceptions : none
870
  Caller     : internal
871
  Status     : Stable
Laura Clarke's avatar
 
Laura Clarke committed
872 873 874

=cut

875 876
sub _hit_unit {
  my $self = shift;
877
  throw( "Abstract method call!" );
Laura Clarke's avatar
 
Laura Clarke committed
878 879
}

Graham McVicker's avatar
Graham McVicker committed
880 881


882
=head2 _query_unit
Laura Clarke's avatar
 
Laura Clarke committed
883

884
  Args       : none
Arne Stabenau's avatar
Arne Stabenau committed
885
  Example    : none
886 887 888
  Description: abstract method, overwrite with something that returns
               one or three
  Returntype : int 1,3
Arne Stabenau's avatar
Arne Stabenau committed
889
  Exceptions : none
890
  Caller     : internal
891
  Status     : Stable
Laura Clarke's avatar
 
Laura Clarke committed
892 893 894

=cut

895
sub _query_unit {
Laura Clarke's avatar
 
Laura Clarke committed
896
  my $self = shift;
897
  throw( "Abstract method call!" );
Laura Clarke's avatar
 
Laura Clarke committed
898