BaseAlignFeature.pm 26.3 KB
Newer Older
1 2 3 4 5 6
# EnsEMBL module for storing dna-protein pairwise alignments
#
# Cared for by Michele Clamp <michele@sanger.ac.uk>
#
# You may distribute this module under the same terms as perl itself
#
7
=pod
8 9 10

=head1 NAME

11 12
Bio::EnsEMBL::DnaPepAlignFeature - Ensembl specific dna-protein 
                                   pairwise alignment feature
13 14 15 16

=head1 SYNOPSIS

  my $feat = new Bio::EnsEMBL::DnaPepAlignFeature(-seqname => 'myseq',
17 18 19 20 21 22
                                                  -start   => 100,
                                                  -end     => 120,
                                                  -strand  => 1,
                                                  -hstart  => 200,
                                                  -hend    => 220,
                                                  -analysis    => $analysis,
Laura Clarke's avatar
 
Laura Clarke committed
23
                                                  -cigar_string => '');
24

25
  Alternatively if you have an array of ungapped features
26

27
  my $feat = new Bio::EnsEMBL::DnaPepAlignFeature(-features => \@features);
28

29
  Where @features is an array of Bio::EnsEMBL::FeaturePair
30

31
  There is a method to manipulate the cigar_string into ungapped features
32

33
  my @ungapped_features = $feat->ungapped_features;
34

35
  This converts the cigar string into an array of Bio::EnsEMBL::FeaturePair
36

37
  $analysis is a Bio::EnsEMBL::Analysis object
38
  
39 40
  Bio::EnsEMBL::SeqFeature methods can be used
  Bio::EnsEMBL::FeaturePair methods can be used
41

42 43
  The cigar_string contains the ungapped pieces that make up the gapped 
  alignment
44 45 46 47
  
  It looks like: n Matches [ x Deletes or Inserts m Matches ]*
  but a bit more condensed like "23M4I12M2D1M"
  and evenmore condensed as you can ommit 1s "23M4I12M2DM"
48 49


50
  To make things clearer this is how a blast HSP would be parsed
51

52 53
  >AK014066
         Length = 146
54

55
    Minus Strand HSPs:
56

57 58
    Score = 76 (26.8 bits), Expect = 1.4, P = 0.74
    Identities = 20/71 (28%), Positives = 29/71 (40%), Frame = -1
59

60 61 62
  Query:   479 GLQAPPPTPQGCRLIPPPPLGLQAPLPTLRAVGSSHHHP*GRQGSSLSSFRSSLASKASA 300
               G  APPP PQG R   P P G + P   L             + + ++  R  +A   +
  Sbjct:     7 GALAPPPAPQG-RWAFPRPTG-KRPATPLHGTARQDRQVRRSEAAKVTGCRGRVAPHVAP 64
63

64 65 66
  Query:   299 SSPHNPSPLPS 267
                  H P+P P+
  Sbjct:    65 PLTHTPTPTPT 75
67

68 69
  The alignment goes from 267 to 479 in sequence 1 and 7 to 75 in sequence 2 
  and the strand is -1.
70

71
  The alignment is made up of the following ungapped pieces :
72

73 74 75
  sequence 1 start 447 , sequence 2 start 7  , match length 33 , strand -1
  sequence 1 start 417 , sequence 2 start 18 , match length 27 , strand -1
  sequence 1 start 267 , sequence 2 start 27 , match length 137 , strand -1
76

77 78 79
  These ungapped pieces are made up into the following string (called a cigar 
  string) "33M3I27M3I137M" with start 267 end 479 strand -1 hstart 7 hend 75 
  hstrand 1 and feature type would be DnaPepAlignFeature
80
  
81 82
=cut 

83

84 85 86 87 88 89 90 91
package Bio::EnsEMBL::BaseAlignFeature;

use Bio::EnsEMBL::FeaturePair;
use Bio::EnsEMBL::SeqFeature;

use vars qw(@ISA);
use strict;

92
@ISA = qw(Bio::EnsEMBL::FeaturePair);
93

94 95 96 97 98 99 100 101 102 103 104 105 106 107 108

=head2 new

  Arg [..]   : List of named arguments. (-cigar_string , -features) defined
               in this constructor, others defined in FeaturePair and 
               SeqFeature superclasses.  
  Example    : 
  Description: Creates a new BaseAlignFeature using either a cigarstring or
               a list of ungapped features.
  Returntype : 
  Exceptions : 
  Caller     : 

=cut

109 110 111 112
sub new {
    my ($class,@args) = @_;

    my $self = $class->SUPER::new(@args);
Laura Clarke's avatar
 
Laura Clarke committed
113
    #print "calling new with @args\n";
114 115
    my ($cigar_string,$features) 
      = $self->_rearrange([qw(CIGAR_STRING FEATURES)], @args);
Laura Clarke's avatar
 
Laura Clarke committed
116

117 118 119 120 121 122
    if (defined($cigar_string) && defined($features)) {
      $self->throw("Can't input cigar_string and an array of features.");
    } elsif (defined($features)) {
      $self->_parse_features($features);
    } elsif (!defined($cigar_string)) {
      $self->throw("Must have a cigar string defining the alignment");
123
    } 
124
    
125
    $self->cigar_string($cigar_string);
126
    
127 128 129 130
    return $self;
}


Arne Stabenau's avatar
Arne Stabenau committed
131
=head2 cigar_string
132

Arne Stabenau's avatar
Arne Stabenau committed
133
  Arg [1]    : string $cigar_string
134
  Example    : ( "12MI3M" )
Arne Stabenau's avatar
Arne Stabenau committed
135 136 137 138 139 140 141 142
  Description: get/set for attribute cigar_string
               cigar_string describes the alignment. "xM" stands for 
               x matches (mismatches), "xI" for inserts into query sequence 
               (thats the ensembl sequence), "xD" for deletions (inserts in the 
               subject). an "x" that is 1 can be omitted.
  Returntype : string
  Exceptions : throws on a get without previous set
  Caller     : general
143 144 145 146 147 148 149

=cut



sub cigar_string {
  my ($self,$arg) = @_;
150
  
151
  if (defined($arg)) {
152
    
153 154 155 156 157 158 159 160 161
    # Do some checks to see whether this is valid
    my $tmp = $arg;
    
    $self->{_cigar_string} = $arg;
  }

  if (!defined($self->{_cigar_string})) {
    $self->throw("No cigar string defined - can't return one");
  }
162
  
163 164 165 166 167 168
  return $self->{_cigar_string};
}


=head2 ungapped_features

Arne Stabenau's avatar
Arne Stabenau committed
169 170 171 172 173 174 175
  Args       : none
  Example    : none
  Description: converts the internal cigar_string into an array of
               ungapped feature pairs
  Returntype : list of Bio::EnsEMBL::FeaturePair
  Exceptions : cigar_string not set internally
  Caller     : general
176

Arne Stabenau's avatar
Arne Stabenau committed
177
=cut
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192



sub ungapped_features {
  my ($self) = @_;

  if (defined($self->cigar_string)) {
    my @features = $self->_parse_cigar;
    return @features;
  } else {
    $self->throw("No cigar_string defined.  Can't return ungapped features");
  }

}

193

194

195
=head2 dbID
196

197 198 199 200 201 202 203 204 205 206 207 208 209
  Arg [1]    : int $dbID
  Example    : none
  Description: get/set for the database internal id
  Returntype : int
  Exceptions : none
  Caller     : general, set from adaptor on store

=cut


sub dbID{
  my ($self, $arg) = @_;

210
  if(defined $arg){
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
    $self->{_database_id} = $arg;
  }

  return $self->{_database_id}; 

}

=head2 adaptor

  Arg [1]    : Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor $adaptor
  Example    : none
  Description: get/set for this objects Adaptor
  Returntype : Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor
  Exceptions : none
  Caller     : general, set from adaptor on store

=cut

sub adaptor {
   my $self = shift;
   if( @_ ) {
      my $value = shift;
      $self->{'adaptor'} = $value;
234
    }
235 236
    return $self->{'adaptor'};

237 238
}

239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
=head2 reverse_complement

  Args       : none
  Example    : none
  Description: reverse complement the FeaturePair,
               modifing strand, hstrand and cigar_string in consequence
  Returntype : none
  Exceptions : none
  Caller     : general

=cut

sub reverse_complement {
  my ($self) = @_;

  # reverse strand in both sequences
  $self->strand($self->strand * -1);
  $self->hstrand($self->hstrand * -1);
  
  # reverse cigar_string as consequence
  my $cigar_string = $self->cigar_string;
  $cigar_string =~ s/(D|I|M)/$1 /g;
  my @cigar_pieces = split / /,$cigar_string;
  $cigar_string = "";
  while (my $piece = pop @cigar_pieces) {
    $cigar_string .= $piece;
  }

  $self->cigar_string($cigar_string);
}
269

270 271 272 273 274 275 276 277
=head2 _parse_cigar

  Args       : none
  Example    : none
  Description: creates ungapped features from internally stored cigar line
  Returntype : list of Bio::EnsEMBL::FeaturePair
  Exceptions : none
  Caller     : ungapped_features
278 279 280

=cut

281 282 283 284 285
sub _parse_cigar {
  my ( $self ) = @_;

  my $query_unit = $self->_query_unit();
  my $hit_unit = $self->_hit_unit();
286

287
  
288
  my $string = $self->cigar_string;
289
 
290 291 292 293 294 295
  if (!defined($string)) {
    $self->throw("No cigar string defined in object.  This should be caught by the cigar_string method and never happen");
  }

  
  my @pieces = ( $string =~ /(\d*[MDI])/g );
Laura Clarke's avatar
 
Laura Clarke committed
296 297
  #print "cigar: ",join ( ",", @pieces ),"\n";
  
298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343
  my @features;
  my $strand1 = $self->strand() || 1;
  my $strand2 = $self->hstrand() || 1;

  my ( $start1, $start2 );
  
  if( $strand1 == 1 ) {
    $start1 = $self->start();
  } else {
    $start1 = $self->end();
  }

  if( $strand2 == 1 ) {
    $start2 = $self->hstart();
  } else {
    $start2 = $self->hend();
  }


  foreach my $piece (@pieces) {

    my ($length) = ( $piece =~ /^(\d*)/ );
    if( $length eq "" ) { $length = 1 }
    my $mapped_length;

    # explicit if statements to avoid rounding problems
    # and make sure we have sane coordinate systems
    if( $query_unit == 1 && $hit_unit == 3 ) {
      $mapped_length = $length*3;
    } elsif( $query_unit == 3 && $hit_unit == 1 ) {
      $mapped_length = $length / 3;
    } elsif ( $query_unit == 1 && $hit_unit == 1 ) {
      $mapped_length = $length;
    } else {
      $self->throw("Internal error $query_unit $hit_unit, currently only allowing 1 or 3 ");
    }
    
    if( int($mapped_length) != $mapped_length ) {
      $self->throw("Internal error with mismapped length of hit, query $query_unit, hit $hit_unit, length $length");
    }

    if( $piece =~ /M$/ ) {
      my $feature1 = new Bio::EnsEMBL::SeqFeature();
      
      my ( $a, $b );
      if( $strand1 == 1 ) {
344 345 346
        $a = $start1;
        $b = $start1 + $length - 1;
        $start1 = $b + 1;
347
      } else {
348 349 350
        $b = $start1;
        $a = $start1 - $length + 1;
        $start1 = $a - 1;
351 352 353 354 355 356 357 358 359 360 361 362 363
      }
      
      $feature1->start($a);
      $feature1->end  ($b);
      $feature1->strand($self->strand() );
      $feature1->score($self->score);
      $feature1->seqname($self->seqname);
      $feature1->phase($self->phase);
      $feature1->p_value($self->p_value);
      $feature1->percent_id($self->percent_id);

      my $feature2 = new Bio::EnsEMBL::SeqFeature();
      if( $strand2 == 1 ) {
364 365 366
        $a = $start2;
        $b = $start2 + $mapped_length - 1;
        $start2 = $b + 1;
367
      } else {
368 369 370
        $b = $start2;
        $a = $start2 - $mapped_length + 1;
        $start2 = $a - 1;
371 372 373 374 375 376 377 378 379 380 381 382
      }

      $feature2->start($a);
      $feature2->end($b);
      $feature2->strand($self->hstrand());
      $feature2->score($self->score);
      $feature2->seqname($self->hseqname);
      $feature2->phase($self->phase);
      $feature2->p_value($self->p_value);
      $feature2->percent_id($self->percent_id);
    
      my $fp = new Bio::EnsEMBL::FeaturePair(-feature1 => $feature1,
383
                                             -feature2 => $feature2);
384 385 386 387 388 389 390 391

 
      $fp->analysis($self->analysis);

      push(@features,$fp);
      # end M cigar bits 
    } elsif( $piece =~ /I$/ ) {
      if( $strand1 == 1 ) {
392
        $start1 += $length;
393
      } else {
394
        $start1 -= $length;
395 396 397
      }
    } elsif( $piece =~ /D$/ ) {
      if( $strand2 == 1 ) {
398
        $start2 += $mapped_length;
399
      } else {
400
        $start2 -= $mapped_length;
401 402 403 404 405 406 407 408 409 410 411 412 413 414
      }
    } else {
      $self->throw( "Illegal cigar line $string!" );
    } 
      
  }
  # should the features be sorted ?
  # 
  return @features;
}




415
=head2 _parse_features
416

417 418 419 420 421 422 423
  Arg  1     : listref Bio::EnsEMBL::FeaturePair $ungapped_features
  Example    : none
  Description: creates internal cigarstring and start,end hstart,hend
               entries.
  Returntype : none, fills in values of self
  Exceptions : argument list is sanity checked
  Caller     : new
424

425
=cut
426

427 428
sub _parse_features {
  my ($self,$features ) = @_;
429

430 431
  my $query_unit = $self->_query_unit();
  my $hit_unit = $self->_hit_unit();
432

433
  if (ref($features) ne "ARRAY") {
434 435
    $self->throw("features must be an array reference not a [" . 
		 ref($features) . "]");
436 437
  }

438 439 440 441 442 443 444 445 446 447 448 449
  my $strand     = $features->[0]->strand;
  my @f;
  
  #
  # Sort the features on their start position
  # Ascending order on positive strand, descending on negative strand
  #
  if( $strand == 1 ) {
    @f = sort {$a->start <=> $b->start} @$features;
  } else {
    @f = sort { $b->start <=> $a->start} @$features;
  }
450

451
  #print STDERR $f[0]->gffstring."\n";  
452 453 454 455 456 457 458 459
  my $hstrand     = $f[0]->hstrand;
  my $name        = $f[0]->seqname;
  my $hname       = $f[0]->hseqname;
  my $score       = $f[0]->score;
  my $percent     = $f[0]->percent_id;
  my $pvalue      = $f[0]->p_value;
  my $analysis    = $f[0]->analysis;
  my $phase       = $f[0]->phase;
460
 
461 462 463 464
  # implicit strand 1 for peptide sequences
  ( defined $strand ) || ( $strand = 1 );
  ( defined $hstrand ) || ( $hstrand = 1 );
  my $ori = $strand * $hstrand;
Arne Stabenau's avatar
Arne Stabenau committed
465 466
    
    
467 468 469
  if (scalar(@f) == 0) {
    $self->throw("No features in the array to parse");
  }
470

471 472
  my $prev1; # where last feature q part ended
  my $prev2; # where last feature s part ended
Arne Stabenau's avatar
Arne Stabenau committed
473
    
474 475
  my $string;

476 477
  # Use strandedness info of query and hit to make sure both sets of start and end 
  # coordinates are oriented the right way around.
478 479
  my $f1start;
  my $f1end;
480 481
  my $f2end;
  my $f2start;
482
  #print STDERR "STRAND = ".$strand."\n";
483
  if ($strand == 1) {
484 485
    $f1start = $f[0]->start;
    $f1end = $f[-1]->end;
486
  } else {
487 488
    $f1start = $f[-1]->start;
    $f1end = $f[0]->end;
489
  }
490
  #print STDERR "HSTRAND = ".$hstrand."\n";
491 492 493 494 495 496 497
  if ($hstrand == 1) {
    $f2start = $f[0]->hstart;
    $f2end = $f[-1]->hend;
  } else {
    $f2start = $f[-1]->hstart;
    $f2end = $f[0]->hend;
  }
498
  #print STDERR "f1start ".$f1start." f1end ".$f1end." f2start ".$f2start." f2end ".$f2end." \n";
499 500 501
  #
  # Loop through each portion of alignment and construct cigar string
  #
502

503 504 505 506 507 508
  foreach my $f (@f) {
    #
    # Sanity checks
    #
    if (!$f->isa("Bio::EnsEMBL::FeaturePair")) {
      $self->throw("Array element [$f] is not a Bio::EnsEMBL::FeaturePair");
509
    }
510 511 512 513
    if( defined $f->hstrand() ) {
      if ($f->hstrand != $hstrand) {
	$self->throw("Inconsistent hstrands in feature array");
      }
514
    }
515 516 517
    if( defined $f->strand() ) {
      if ($f->strand != $strand) {
	$self->throw("Inconsistent strands in feature array");
Arne Stabenau's avatar
Arne Stabenau committed
518
      }
519 520 521 522 523 524 525 526 527 528 529 530 531
    }
    if ($name ne $f->seqname) {
      $self->throw("Inconsistent names in feature array [$name - " . 
		   $f->seqname . "]");
    }
    if ($hname ne $f->hseqname) {
      $self->throw("Inconsistent names in feature array [$hname - " . 
		   $f->hseqname . "]");
    }
    if ($score ne $f->score) {
      $self->throw("Inconsisent scores in feature array [$score - " . 
		   $f->score . "]");
    }
532
    if (defined($f->percent_id) && $percent ne $f->percent_id) {
533 534 535 536 537 538
      $self->throw("Inconsistent pids in feature array [$percent - " . 
		   $f->percent_id . "]");
    }
    
    my $start1 = $f->start;      #source sequence alignment start
    my $start2 = $f->hstart();   #hit sequence alignment start
539

540 541 542 543 544 545 546 547 548 549
    #
    # More sanity checking
    #
    if (defined($prev1)) {
      if ( $strand == 1 ) {
	if ($f->start < $prev1) {
	  $self->throw("Inconsistent coordinates feature is forward strand " .
		       "hstart in current feature should be greater than " .
		       "hend in previous feature " . $f->start . " < " .
		       $prev1."\n");
Arne Stabenau's avatar
Arne Stabenau committed
550
	}
551 552 553 554 555
      } else {
	if ($f->end > $prev1) {
	  $self->throw("Inconsistent coordinates in feature array feature " .
		       "is reverse strand hend should be less than previous " .
		       "hstart " . $f->end . " > $prev1");
Arne Stabenau's avatar
Arne Stabenau committed
556
	}
557
      }
558
    }
559

560 561
    my $length = ($f->end - $f->start + 1); #length of source seq alignment
    my $hlength = ($f->hend - $f->hstart + 1); #length of hit seq alignment
562
 
563 564 565 566 567 568 569 570 571
    # using multiplication to avoid rounding errors, hence the
    # switch from query to hit for the ratios
    
    #
    # Yet more sanity checking
    #
    if($query_unit > $hit_unit){
      # I am going to make the assumption here that this situation will 
      # only occur with DnaPepAlignFeatures, this may not be true
Laura Clarke's avatar
Laura Clarke committed
572 573 574
      my $query_p_length = sprintf "%.0f", ($length/$query_unit);
      my $hit_p_length = sprintf "%.0f", ($hlength * $hit_unit);
      if( $query_p_length != $hit_p_length) {
575
	#print STDERR $length."/".$query_unit." ".$hlength."*".$hit_unit."\n";
576 577 578
	$self->throw( "Feature lengths not comparable Lengths:" .$length . 
		      " " . $hlength . " Ratios:" . $query_unit . " " . 
		      $hit_unit );
Arne Stabenau's avatar
Arne Stabenau committed
579
      }
580
    } else{
Laura Clarke's avatar
Laura Clarke committed
581 582
      my $query_d_length = sprintf "%.0f", ($length*$hit_unit);
      my $hit_d_length = sprintf "%.0f", ($hlength * $query_unit);
583
      if( $length * $hit_unit != $hlength * $query_unit ) {
584
	#print STDERR $length."*".$hit_unit." ".$hlength."*".$query_unit."\n";
585 586 587
	$self->throw( "Feature lengths not comparable Lengths:" . $length . 
		      " " . $hlength . " Ratios:" . $query_unit . " " . 
		      $hit_unit );
Arne Stabenau's avatar
Arne Stabenau committed
588
      }
589 590 591
    }

    my $hlengthfactor = ($query_unit/$hit_unit);
592

593 594 595 596 597 598
    # if( $query_unit == 1 && $hit_unit == 3 ) {
    #	$hlengthfactor = (1/3);
    #     }
    #     if( $query_unit == 3 && $hit_unit == 1 ) {
    #	$hlengthfactor = 3;
    #      }
Arne Stabenau's avatar
Arne Stabenau committed
599
      
600 601 602 603 604 605 606 607 608 609 610 611 612 613 614

    #
    # Check to see if there is an I type (insertion) gap:
    #   If there is a space between the end of the last source sequence 
    #   alignment and the start of this one, then this is an insertion
    #

    my $insertion_flag = 0;
    if( $strand == 1 ) {
      if( ( defined $prev1 ) && ( $f->start > $prev1 + 1  )) {
	#there is an insertion
	$insertion_flag = 1;
	my $gap = $f->start - $prev1 - 1;
	if( $gap == 1 ) {
	  $gap = ""; # no need for a number if gap length is 1
Arne Stabenau's avatar
Arne Stabenau committed
615
	}
616
	$string .= "$gap"."I";
Arne Stabenau's avatar
Arne Stabenau committed
617
      }
618 619 620 621

      #shift our position in the source seq alignment
      $prev1 = $f->end();
    } else {
622
      #print STDERR "there is a insertion\n";
623 624 625 626 627 628 629
      if(( defined $prev1 ) && ($f->end + 1 < $prev1 )) {

	#there is an insertion
	$insertion_flag = 1;
	my $gap = $prev1 - $f->end() - 1;
	if( $gap == 1 ) {
	  $gap = ""; # no need for a number if gap length is 1
Laura Clarke's avatar
 
Laura Clarke committed
630
	}
631
	$string .= "$gap"."I";
Arne Stabenau's avatar
Arne Stabenau committed
632
      }
633

634 635 636
      #shift our position in the source seq alignment
      $prev1 = $f->start();
    }
Laura Clarke's avatar
 
Laura Clarke committed
637
      
638 639 640 641 642 643 644 645 646 647 648 649 650
    #
    # Check to see if there is a D type (deletion) gap
    #   There is a deletion gap if there is a space between the end of the
    #   last portion of the hit sequence alignment and this one
    #
    if( $hstrand == 1 ) {
      if((  defined $prev2 ) && ( $f->hstart() > $prev2 + 1 )) {
	#there is a deletion
	my $gap = $f->hstart - $prev2 - 1;
	my $gap2 = int( $gap * $hlengthfactor + 0.05 );
	
	if( $gap2 == 1 ) {
	  $gap2 = "";  # no need for a number if gap length is 1
Arne Stabenau's avatar
Arne Stabenau committed
651
	}
652
	$string .= "$gap2"."D";
653

654 655
	#sanity check,  Should not be an insertion and deletion
	if($insertion_flag) {
656
	  $self->warn("Should not be an deletion and insertion on the " .
657
		       "same alignment region. cigar_line=$string\n");
Arne Stabenau's avatar
Arne Stabenau committed
658
	} 
659 660 661 662 663 664 665 666 667 668 669 670
      } 
      #shift our position in the hit seq alignment
      $prev2 = $f->hend();

     } else {
      if( ( defined $prev2 ) && ( $f->hend() + 1 < $prev2 )) {
	#there is a deletion
	my $gap = $prev2 - $f->hend - 1;
	my $gap2 = int( $gap * $hlengthfactor + 0.05 );
	
	if( $gap2 == 1 ) {
	  $gap2 = "";  # no need for a number if gap length is 1
Arne Stabenau's avatar
Arne Stabenau committed
671
	}
672 673 674 675 676 677 678
	$string .= "$gap2"."D";
	#sanity check,  Should not be an insertion and deletion
	if($insertion_flag) {
	  $self->throw("Should not be an deletion and insertion on the " .
		       "same alignment region. prev2 = $prev2; f->hend() = " .
		       $f->hend() . "; cigar_line = $string;\n");
	} 
679
      }
680
      #shift our position in the hit seq alignment
681
     
682 683
      $prev2 = $f->hstart();
    }
684
      
685 686 687
    my $matchlength = $f->end() - $f->start() + 1;
    if( $matchlength == 1 ) {
      $matchlength = "";
688
    }
689
    $string .= $matchlength."M";
690

691
    #print STDERR "finished with this feature\n\n";
692
  }
693 694
#  print STDERR "creating align feature start ".$f1start." end ".$f1end." strand ".$strand." score ".$score." percent id ".$percent." pvalue ".$pvalue." seqname ".$name." phase ".$phase." analysis ".$analysis." hstart ".$f2start," hend ".$f2end." hstrand ".$hstrand." hid ".$hname."\n"; 
  if(!$score){
695
    #$self->warn("score is not set assume its 1");
696 697
    $score = 1;
  } 
698
  my $feature1 = new Bio::EnsEMBL::SeqFeature();
699
  
700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726
  $feature1->start($f1start);
  $feature1->end  ($f1end);
  $feature1->strand($strand);
  $feature1->score($score);
  $feature1->percent_id($percent);
  $feature1->p_value($pvalue);
  $feature1->seqname($name);
  $feature1->phase($phase);
  $feature1->analysis($analysis);
  #print STDERR "checking feature1 ".$feature1->gffstring."\n";
  $feature1->validate;
  
  my $feature2 = new Bio::EnsEMBL::SeqFeature();
  
  $feature2->start($f2start);
  $feature2->end  ($f2end);
  $feature2->strand($hstrand);
  $feature2->score($score);
  $feature2->seqname($hname);
  $feature2->percent_id($percent);
  $feature2->p_value($pvalue);
  $feature2->phase($phase);
  $feature2->analysis($analysis);
  $feature2->validate;
  $self->feature1($feature1);
  $self->feature2($feature2);
  $self->cigar_string($string);
727
}
728

Laura Clarke's avatar
 
Laura Clarke committed
729

Arne Stabenau's avatar
Arne Stabenau committed
730

731 732 733



Graham McVicker's avatar
Graham McVicker committed
734
sub _transform_to_RawContig{
Simon Potter's avatar
Simon Potter committed
735
  my ($self) = @_;
736

Laura Clarke's avatar
 
Laura Clarke committed
737
  if(!$self->contig){
738 739
    $self->throw("can't transform coordinates of " . $self . 
		 " without some sort of contig defined");
Laura Clarke's avatar
 
Laura Clarke committed
740
  }
741

742
  my $rcAdaptor = $self->contig->adaptor()->db()->get_RawContigAdaptor();
Laura Clarke's avatar
 
Laura Clarke committed
743
  my @out;
Laura Clarke's avatar
 
Laura Clarke committed
744

Laura Clarke's avatar
 
Laura Clarke committed
745 746 747
  my @features = $self->_parse_cigar();
  my @mapped_features;
  my %rc_features;
Laura Clarke's avatar
 
Laura Clarke committed
748

Laura Clarke's avatar
 
Laura Clarke committed
749
  foreach my $f(@features){
Graham McVicker's avatar
Graham McVicker committed
750
    my @new_features = $self->_transform_feature_to_RawContig($f);
Laura Clarke's avatar
 
Laura Clarke committed
751 752
    push(@mapped_features, @new_features);
  }
Arne Stabenau's avatar
Arne Stabenau committed
753

Laura Clarke's avatar
 
Laura Clarke committed
754 755 756 757
  foreach my $mf(@mapped_features){
    my $contig_id = $mf->seqname;
    if(!$rc_features{$contig_id}){
      $rc_features{$contig_id} = [];
758
    }
759 760

    push(@{$rc_features{$contig_id}}, $mf);
Laura Clarke's avatar
 
Laura Clarke committed
761
  }
Arne Stabenau's avatar
Arne Stabenau committed
762

Laura Clarke's avatar
 
Laura Clarke committed
763
  foreach my $contig_id(keys(%rc_features)){
764
    my $outputf = $self->new( -features => $rc_features{$contig_id} );
Laura Clarke's avatar
 
Laura Clarke committed
765 766 767
    $outputf->analysis( $self->analysis() );
    $outputf->score( $self->score() );
    $outputf->percent_id( $self->percent_id() );
768
    $outputf->p_value( $self->p_value());
Laura Clarke's avatar
 
Laura Clarke committed
769 770
    my $contig = $rcAdaptor->fetch_by_dbID($contig_id);
    $outputf->contig($contig);
Laura Clarke's avatar
 
Laura Clarke committed
771
    push(@out, $outputf);
Laura Clarke's avatar
 
Laura Clarke committed
772
  }
Laura Clarke's avatar
 
Laura Clarke committed
773
  #print STDERR "returning ".@out." feature from basealignfeature\n";
Laura Clarke's avatar
 
Laura Clarke committed
774 775
  return @out;

776 777
}

Laura Clarke's avatar
 
Laura Clarke committed
778 779


780 781 782
=head2 _hit_unit

  Args       : none
Arne Stabenau's avatar
Arne Stabenau committed
783
  Example    : none
784 785 786
  Description: abstract method, overwrite with something that returns
               one or three
  Returntype : int 1,3
Arne Stabenau's avatar
Arne Stabenau committed
787
  Exceptions : none
788
  Caller     : internal
Laura Clarke's avatar
 
Laura Clarke committed
789 790 791

=cut

792 793 794
sub _hit_unit {
  my $self = shift;
  $self->throw( "Abstract method call!" );
Laura Clarke's avatar
 
Laura Clarke committed
795 796
}

797
=head2 _query_unit
Laura Clarke's avatar
 
Laura Clarke committed
798

799
  Args       : none
Arne Stabenau's avatar
Arne Stabenau committed
800
  Example    : none
801 802 803
  Description: abstract method, overwrite with something that returns
               one or three
  Returntype : int 1,3
Arne Stabenau's avatar
Arne Stabenau committed
804
  Exceptions : none
805
  Caller     : internal
Laura Clarke's avatar
 
Laura Clarke committed
806 807 808

=cut

809
sub _query_unit {
Laura Clarke's avatar
 
Laura Clarke committed
810
  my $self = shift;
811
  $self->throw( "Abstract method call!" );
Laura Clarke's avatar
 
Laura Clarke committed
812 813
}

Arne Stabenau's avatar
Arne Stabenau committed
814

Graham McVicker's avatar
Graham McVicker committed
815
sub _transform_feature_to_RawContig{
Laura Clarke's avatar
 
Laura Clarke committed
816 817
  my($self, $feature) =  @_;

818
  my $slice = $self->contig;
819

820 821 822
  unless($slice){
    $self->throw("can't transform coordinates of $self without some " .
		 "sort of contig defined");
Laura Clarke's avatar
 
Laura Clarke committed
823
  }
824 825 826

  my $asma = $slice->adaptor->db->get_AssemblyMapperAdaptor;
  my $mapper = $asma->fetch_by_type( $slice->assembly_type );
Laura Clarke's avatar
 
Laura Clarke committed
827
  
828 829
  my $rcAdaptor = $slice->adaptor()->db()->get_RawContigAdaptor();

Laura Clarke's avatar
 
Laura Clarke committed
830
  my @out;
831 832 833 834 835 836 837 838 839 840 841 842 843 844

  #first convert the feature's slice coords to assembly coords
  my ($start, $end, $strand);
  if($slice->strand == 1) {
    $start  = $slice->chr_start + $feature->start - 1;
    $end    = $slice->chr_start + $feature->end   - 1;
    $strand = $feature->strand;
  } else {
    $start  = $slice->chr_end - $feature->end    + 1;
    $end    = $slice->chr_end - $feature->start  + 1;
    $strand = $feature->strand * -1;
  }

  #now convert the feature's assembly coords to raw contig coords
Laura Clarke's avatar
 
Laura Clarke committed
845 846
  my @mapped = $mapper->map_coordinates_to_rawcontig
    (
847 848 849 850
     $slice->chr_name(),
     $start,
     $end,
     $strand
Laura Clarke's avatar
 
Laura Clarke committed
851 852
    );

853
  unless( @mapped ) {
Laura Clarke's avatar
 
Laura Clarke committed
854 855 856
    $self->throw( "couldn't map ".$self."\n" );
    return $self;
  }
857 858
  my ( $hit_start, $hit_end );

Laura Clarke's avatar
 
Laura Clarke committed
859
  if( scalar( @mapped ) > 1 ) {
860
    #print STDERr "splitting evidence\n";
861 862 863 864 865
    if( $feature->hstrand == 1 ) {
      $hit_start = $feature->hstart();
    } else {
      $hit_end = $feature->hend();
    }
Laura Clarke's avatar
 
Laura Clarke committed
866 867
    #print STDERR " feature is being mapped across multiple contigs ".$feature->gffstring."\n";
  SPLIT: for( my $i=0; $i <= $#mapped; $i++ ) {
868 869 870 871 872
      if($mapped[$i]->isa("Bio::EnsEMBL::Mapper::Gap")){
	$self->warn("piece of evidence lies on gap\n");
	next SPLIT;
      }
      #print STDERR "query coords ".$mapped[$i]->end." ".$mapped[$i]->start."\n";
Laura Clarke's avatar
 
Laura Clarke committed
873
      my $query_length = ($mapped[$i]->end - $mapped[$i]->start + 1);
874
      #print STDERR " query length = ".$query_length."\n";
Laura Clarke's avatar
 
Laura Clarke committed
875 876 877 878
      my $hit_length;
      if($self->_query_unit == $self->_hit_unit){
	$hit_length = $query_length;
      }elsif($self->_query_unit > $self->_hit_unit){
879 880 881
	my $tmp =  ($query_length/$self->_query_unit);
	#print STDERR "tmp = ".$tmp."\n";
	#$hit_length = int($tmp);
882
	$hit_length = sprintf "%.0f", $tmp;
Laura Clarke's avatar
 
Laura Clarke committed
883
      }elsif($self->_hit_unit > $self->_query_unit){
884 885
	my $tmp = ($query_length*$self->_hit_unit);
	#print STDERR "tmp = ".$tmp."\n";
886
	$hit_length = sprintf "%.0f", $tmp;
Laura Clarke's avatar
 
Laura Clarke committed
887
      }
888
      #print STDERR "hit length ".$hit_length."\n";
889 890 891
      if($hit_length == 0){
	next SPLIT;
      }
892 893 894 895 896
      if( $feature->hstrand() == 1 ) {
	$hit_end = ($hit_start + $hit_length) - 1;
      } else {
	$hit_start = ( $hit_end - $hit_length + 1 );
      } 
897
      #print "hit start ".$hit_start." hit end ".$hit_end."\n";
Laura Clarke's avatar
 
Laura Clarke committed
898
      my $rawContig = $rcAdaptor->fetch_by_dbID( $mapped[$i]->id() );
899 900

      my $new_feature = Bio::EnsEMBL::FeaturePair->new();
901
     
Laura Clarke's avatar
 
Laura Clarke committed
902 903 904 905 906 907 908
      $new_feature->start($mapped[$i]->start);
      $new_feature->end($mapped[$i]->end);
      $new_feature->strand($mapped[$i]->strand);
      $new_feature->seqname($mapped[$i]->id);
      $new_feature->score($feature->score);
      $new_feature->percent_id($feature->percent_id);
      $new_feature->p_value($feature->p_value);
909

Laura Clarke's avatar
 
Laura Clarke committed
910 911 912 913
      $new_feature->hstart($hit_start);
      $new_feature->hend($hit_end);
      $new_feature->hstrand($feature->hstrand);
      $new_feature->hseqname($feature->hseqname);
914

Laura Clarke's avatar
 
Laura Clarke committed
915
      $new_feature->analysis($feature->analysis);
916
      $new_feature->contig($rawContig);
917 918
      #print STDERR "FEATURE: ",join( " ", ( $new_feature->start(), $new_feature->end(), $new_feature->seqname,
	#			$new_feature->contig(), $new_feature->hstart(), $new_feature->hend() )),"\n";
Laura Clarke's avatar
 
Laura Clarke committed
919 920
      #print STDERR "split feature ".$new_feature->gffstring."\n";
      push(@out, $new_feature);
921 922 923 924 925
      if( $feature->hstrand() == 1 ) {
	$hit_start = ($hit_end + 1);
      } else {
	$hit_end = $hit_start -1;
      }
Laura Clarke's avatar
 
Laura Clarke committed
926 927 928 929 930 931 932
    }
  }else{
    if($mapped[0]->isa("Bio::EnsEMBL::Mapper::Gap")){
      $self->warn("piece of evidence lies on gap\n");
      return;
    }
    my $rawContig = $rcAdaptor->fetch_by_dbID( $mapped[0]->id() );
933 934

    my $new_feature = Bio::EnsEMBL::FeaturePair->new();
Laura Clarke's avatar
 
Laura Clarke committed
935 936 937 938 939 940 941
    $new_feature->start($mapped[0]->start);
    $new_feature->end($mapped[0]->end);
    $new_feature->strand($mapped[0]->strand);
    $new_feature->seqname($mapped[0]->id);
    $new_feature->score($feature->score);
    $new_feature->percent_id($feature->percent_id);
    $new_feature->p_value($feature->p_value);
942

Laura Clarke's avatar
 
Laura Clarke committed
943 944 945 946
    $new_feature->hstart($feature->hstart);
    $new_feature->hend($feature->hend);
    $new_feature->hstrand($feature->hstrand);
    $new_feature->hseqname($feature->hseqname);
947

Laura Clarke's avatar
 
Laura Clarke committed
948
    $new_feature->analysis($feature->analysis);
949
    $new_feature->contig($rawContig);
950
    
Laura Clarke's avatar
 
Laura Clarke committed
951 952 953
    push(@out, $new_feature);
  }

Laura Clarke's avatar
 
Laura Clarke committed
954
  #foreach my $sf(@out){
955
    #print STDERR "returning gff ".$sf->gffstring."\n";
Laura Clarke's avatar
 
Laura Clarke committed
956
  #}
957

Laura Clarke's avatar
 
Laura Clarke committed
958
  return @out;
959