BaseAlignFeature.pm 25.8 KB
Newer Older
1 2 3 4 5 6
# EnsEMBL module for storing dna-protein pairwise alignments
#
# Cared for by Michele Clamp <michele@sanger.ac.uk>
#
# You may distribute this module under the same terms as perl itself
#
7
=pod
8 9 10

=head1 NAME

11 12
Bio::EnsEMBL::DnaPepAlignFeature - Ensembl specific dna-protein 
                                   pairwise alignment feature
13 14 15 16

=head1 SYNOPSIS

  my $feat = new Bio::EnsEMBL::DnaPepAlignFeature(-seqname => 'myseq',
17 18 19 20 21 22
                                                  -start   => 100,
                                                  -end     => 120,
                                                  -strand  => 1,
                                                  -hstart  => 200,
                                                  -hend    => 220,
                                                  -analysis    => $analysis,
Laura Clarke's avatar
 
Laura Clarke committed
23
                                                  -cigar_string => '');
24

25
  Alternatively if you have an array of ungapped features
26

27
  my $feat = new Bio::EnsEMBL::DnaPepAlignFeature(-features => \@features);
28

29
  Where @features is an array of Bio::EnsEMBL::FeaturePair
30

31
  There is a method to manipulate the cigar_string into ungapped features
32

33
  my @ungapped_features = $feat->ungapped_features;
34

35
  This converts the cigar string into an array of Bio::EnsEMBL::FeaturePair
36

37
  $analysis is a Bio::EnsEMBL::Analysis object
38
  
39 40
  Bio::EnsEMBL::SeqFeature methods can be used
  Bio::EnsEMBL::FeaturePair methods can be used
41

42 43
  The cigar_string contains the ungapped pieces that make up the gapped 
  alignment
44 45 46 47
  
  It looks like: n Matches [ x Deletes or Inserts m Matches ]*
  but a bit more condensed like "23M4I12M2D1M"
  and evenmore condensed as you can ommit 1s "23M4I12M2DM"
48 49


50
  To make things clearer this is how a blast HSP would be parsed
51

52 53
  >AK014066
         Length = 146
54

55
    Minus Strand HSPs:
56

57 58
    Score = 76 (26.8 bits), Expect = 1.4, P = 0.74
    Identities = 20/71 (28%), Positives = 29/71 (40%), Frame = -1
59

60 61 62
  Query:   479 GLQAPPPTPQGCRLIPPPPLGLQAPLPTLRAVGSSHHHP*GRQGSSLSSFRSSLASKASA 300
               G  APPP PQG R   P P G + P   L             + + ++  R  +A   +
  Sbjct:     7 GALAPPPAPQG-RWAFPRPTG-KRPATPLHGTARQDRQVRRSEAAKVTGCRGRVAPHVAP 64
63

64 65 66
  Query:   299 SSPHNPSPLPS 267
                  H P+P P+
  Sbjct:    65 PLTHTPTPTPT 75
67

68 69
  The alignment goes from 267 to 479 in sequence 1 and 7 to 75 in sequence 2 
  and the strand is -1.
70

71
  The alignment is made up of the following ungapped pieces :
72

73 74 75
  sequence 1 start 447 , sequence 2 start 7  , match length 33 , strand -1
  sequence 1 start 417 , sequence 2 start 18 , match length 27 , strand -1
  sequence 1 start 267 , sequence 2 start 27 , match length 137 , strand -1
76

77 78 79
  These ungapped pieces are made up into the following string (called a cigar 
  string) "33M3I27M3I137M" with start 267 end 479 strand -1 hstart 7 hend 75 
  hstrand 1 and feature type would be DnaPepAlignFeature
80
  
81 82
=cut 

83

84 85 86 87 88 89 90 91
package Bio::EnsEMBL::BaseAlignFeature;

use Bio::EnsEMBL::FeaturePair;
use Bio::EnsEMBL::SeqFeature;

use vars qw(@ISA);
use strict;

92
@ISA = qw(Bio::EnsEMBL::FeaturePair);
93

94 95 96 97 98 99 100 101 102 103 104 105 106 107 108

=head2 new

  Arg [..]   : List of named arguments. (-cigar_string , -features) defined
               in this constructor, others defined in FeaturePair and 
               SeqFeature superclasses.  
  Example    : 
  Description: Creates a new BaseAlignFeature using either a cigarstring or
               a list of ungapped features.
  Returntype : 
  Exceptions : 
  Caller     : 

=cut

109 110 111 112
sub new {
    my ($class,@args) = @_;

    my $self = $class->SUPER::new(@args);
Laura Clarke's avatar
 
Laura Clarke committed
113
    #print "calling new with @args\n";
114 115
    my ($cigar_string,$features) 
      = $self->_rearrange([qw(CIGAR_STRING FEATURES)], @args);
Laura Clarke's avatar
 
Laura Clarke committed
116

117 118 119 120 121 122
    if (defined($cigar_string) && defined($features)) {
      $self->throw("Can't input cigar_string and an array of features.");
    } elsif (defined($features)) {
      $self->_parse_features($features);
    } elsif (!defined($cigar_string)) {
      $self->throw("Must have a cigar string defining the alignment");
123
    } 
124
    
125
    $self->cigar_string($cigar_string);
126
    
127 128 129 130
    return $self;
}


Arne Stabenau's avatar
Arne Stabenau committed
131
=head2 cigar_string
132

Arne Stabenau's avatar
Arne Stabenau committed
133
  Arg [1]    : string $cigar_string
134
  Example    : ( "12MI3M" )
Arne Stabenau's avatar
Arne Stabenau committed
135 136 137 138 139 140 141 142
  Description: get/set for attribute cigar_string
               cigar_string describes the alignment. "xM" stands for 
               x matches (mismatches), "xI" for inserts into query sequence 
               (thats the ensembl sequence), "xD" for deletions (inserts in the 
               subject). an "x" that is 1 can be omitted.
  Returntype : string
  Exceptions : throws on a get without previous set
  Caller     : general
143 144 145 146 147 148 149

=cut



sub cigar_string {
  my ($self,$arg) = @_;
150
  
151
  if (defined($arg)) {
152
    
153 154 155 156 157 158 159 160 161
    # Do some checks to see whether this is valid
    my $tmp = $arg;
    
    $self->{_cigar_string} = $arg;
  }

  if (!defined($self->{_cigar_string})) {
    $self->throw("No cigar string defined - can't return one");
  }
162
  
163 164 165 166 167 168
  return $self->{_cigar_string};
}


=head2 ungapped_features

Arne Stabenau's avatar
Arne Stabenau committed
169 170 171 172 173 174 175
  Args       : none
  Example    : none
  Description: converts the internal cigar_string into an array of
               ungapped feature pairs
  Returntype : list of Bio::EnsEMBL::FeaturePair
  Exceptions : cigar_string not set internally
  Caller     : general
176

Arne Stabenau's avatar
Arne Stabenau committed
177
=cut
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192



sub ungapped_features {
  my ($self) = @_;

  if (defined($self->cigar_string)) {
    my @features = $self->_parse_cigar;
    return @features;
  } else {
    $self->throw("No cigar_string defined.  Can't return ungapped features");
  }

}

193

194

195
=head2 dbID
196

197 198 199 200 201 202 203 204 205 206 207 208 209
  Arg [1]    : int $dbID
  Example    : none
  Description: get/set for the database internal id
  Returntype : int
  Exceptions : none
  Caller     : general, set from adaptor on store

=cut


sub dbID{
  my ($self, $arg) = @_;

210
  if(defined $arg){
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
    $self->{_database_id} = $arg;
  }

  return $self->{_database_id}; 

}

=head2 adaptor

  Arg [1]    : Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor $adaptor
  Example    : none
  Description: get/set for this objects Adaptor
  Returntype : Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor
  Exceptions : none
  Caller     : general, set from adaptor on store

=cut

sub adaptor {
   my $self = shift;
   if( @_ ) {
      my $value = shift;
      $self->{'adaptor'} = $value;
234
    }
235 236
    return $self->{'adaptor'};

237 238
}

239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
=head2 reverse_complement

  Args       : none
  Example    : none
  Description: reverse complement the FeaturePair,
               modifing strand, hstrand and cigar_string in consequence
  Returntype : none
  Exceptions : none
  Caller     : general

=cut

sub reverse_complement {
  my ($self) = @_;

  # reverse strand in both sequences
  $self->strand($self->strand * -1);
  $self->hstrand($self->hstrand * -1);
  
  # reverse cigar_string as consequence
  my $cigar_string = $self->cigar_string;
  $cigar_string =~ s/(D|I|M)/$1 /g;
  my @cigar_pieces = split / /,$cigar_string;
  $cigar_string = "";
  while (my $piece = pop @cigar_pieces) {
    $cigar_string .= $piece;
  }

  $self->cigar_string($cigar_string);
}
269

270 271 272 273 274 275 276 277
=head2 _parse_cigar

  Args       : none
  Example    : none
  Description: creates ungapped features from internally stored cigar line
  Returntype : list of Bio::EnsEMBL::FeaturePair
  Exceptions : none
  Caller     : ungapped_features
278 279 280

=cut

281 282 283 284 285
sub _parse_cigar {
  my ( $self ) = @_;

  my $query_unit = $self->_query_unit();
  my $hit_unit = $self->_hit_unit();
286

287
  
288
  my $string = $self->cigar_string;
289
 
290 291 292 293 294 295
  if (!defined($string)) {
    $self->throw("No cigar string defined in object.  This should be caught by the cigar_string method and never happen");
  }

  
  my @pieces = ( $string =~ /(\d*[MDI])/g );
Laura Clarke's avatar
 
Laura Clarke committed
296 297
  #print "cigar: ",join ( ",", @pieces ),"\n";
  
298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
  my @features;
  my $strand1 = $self->strand() || 1;
  my $strand2 = $self->hstrand() || 1;

  my ( $start1, $start2 );
  
  if( $strand1 == 1 ) {
    $start1 = $self->start();
  } else {
    $start1 = $self->end();
  }

  if( $strand2 == 1 ) {
    $start2 = $self->hstart();
  } else {
    $start2 = $self->hend();
  }


  foreach my $piece (@pieces) {

    my ($length) = ( $piece =~ /^(\d*)/ );
    if( $length eq "" ) { $length = 1 }
    my $mapped_length;

    # explicit if statements to avoid rounding problems
    # and make sure we have sane coordinate systems
    if( $query_unit == 1 && $hit_unit == 3 ) {
      $mapped_length = $length*3;
    } elsif( $query_unit == 3 && $hit_unit == 1 ) {
      $mapped_length = $length / 3;
    } elsif ( $query_unit == 1 && $hit_unit == 1 ) {
      $mapped_length = $length;
    } else {
      $self->throw("Internal error $query_unit $hit_unit, currently only allowing 1 or 3 ");
    }
    
    if( int($mapped_length) != $mapped_length ) {
      $self->throw("Internal error with mismapped length of hit, query $query_unit, hit $hit_unit, length $length");
    }

    if( $piece =~ /M$/ ) {
340
      my $fp = new Bio::EnsEMBL::FeaturePair;
341 342 343
      
      my ( $a, $b );
      if( $strand1 == 1 ) {
344 345 346
        $a = $start1;
        $b = $start1 + $length - 1;
        $start1 = $b + 1;
347
      } else {
348 349 350
        $b = $start1;
        $a = $start1 - $length + 1;
        $start1 = $a - 1;
351 352
      }
      
353 354 355 356 357 358 359 360 361
      $fp->start($a);
      $fp->end  ($b);
      $fp->strand($self->strand() );
      $fp->score($self->score);
      $fp->seqname($self->seqname);
      $fp->phase($self->phase);
      $fp->p_value($self->p_value);
      $fp->percent_id($self->percent_id);

362
      if( $strand2 == 1 ) {
363 364 365
        $a = $start2;
        $b = $start2 + $mapped_length - 1;
        $start2 = $b + 1;
366
      } else {
367 368 369
        $b = $start2;
        $a = $start2 - $mapped_length + 1;
        $start2 = $a - 1;
370 371
      }

372 373 374 375 376 377
      $fp->hstart($a);
      $fp->hend($b);
      $fp->hstrand($self->hstrand());
      $fp->hseqname($self->hseqname);
         
      $fp->contig($self->contig);
378 379 380 381 382 383
      $fp->analysis($self->analysis);

      push(@features,$fp);
      # end M cigar bits 
    } elsif( $piece =~ /I$/ ) {
      if( $strand1 == 1 ) {
384
        $start1 += $length;
385
      } else {
386
        $start1 -= $length;
387 388 389
      }
    } elsif( $piece =~ /D$/ ) {
      if( $strand2 == 1 ) {
390
        $start2 += $mapped_length;
391
      } else {
392
        $start2 -= $mapped_length;
393 394 395 396 397 398 399 400 401 402 403 404 405 406
      }
    } else {
      $self->throw( "Illegal cigar line $string!" );
    } 
      
  }
  # should the features be sorted ?
  # 
  return @features;
}




407
=head2 _parse_features
408

409 410 411 412 413 414 415
  Arg  1     : listref Bio::EnsEMBL::FeaturePair $ungapped_features
  Example    : none
  Description: creates internal cigarstring and start,end hstart,hend
               entries.
  Returntype : none, fills in values of self
  Exceptions : argument list is sanity checked
  Caller     : new
416

417
=cut
418

419 420
sub _parse_features {
  my ($self,$features ) = @_;
421

422 423
  my $query_unit = $self->_query_unit();
  my $hit_unit = $self->_hit_unit();
424

425
  if (ref($features) ne "ARRAY") {
426 427
    $self->throw("features must be an array reference not a [" . 
		 ref($features) . "]");
428 429
  }

430 431 432 433 434 435 436 437 438 439 440 441
  my $strand     = $features->[0]->strand;
  my @f;
  
  #
  # Sort the features on their start position
  # Ascending order on positive strand, descending on negative strand
  #
  if( $strand == 1 ) {
    @f = sort {$a->start <=> $b->start} @$features;
  } else {
    @f = sort { $b->start <=> $a->start} @$features;
  }
442

443

444 445 446 447 448 449 450 451
  my $hstrand     = $f[0]->hstrand;
  my $name        = $f[0]->seqname;
  my $hname       = $f[0]->hseqname;
  my $score       = $f[0]->score;
  my $percent     = $f[0]->percent_id;
  my $pvalue      = $f[0]->p_value;
  my $analysis    = $f[0]->analysis;
  my $phase       = $f[0]->phase;
452
 
453 454 455 456
  # implicit strand 1 for peptide sequences
  ( defined $strand ) || ( $strand = 1 );
  ( defined $hstrand ) || ( $hstrand = 1 );
  my $ori = $strand * $hstrand;
Arne Stabenau's avatar
Arne Stabenau committed
457 458
    
    
459 460 461
  if (scalar(@f) == 0) {
    $self->throw("No features in the array to parse");
  }
462

463 464
  my $prev1; # where last feature q part ended
  my $prev2; # where last feature s part ended
Arne Stabenau's avatar
Arne Stabenau committed
465
    
466 467
  my $string;

468 469
  # Use strandedness info of query and hit to make sure both sets of start and end 
  # coordinates are oriented the right way around.
470 471
  my $f1start;
  my $f1end;
472 473
  my $f2end;
  my $f2start;
474
  #print STDERR "STRAND = ".$strand."\n";
475
  if ($strand == 1) {
476 477
    $f1start = $f[0]->start;
    $f1end = $f[-1]->end;
478
  } else {
479 480
    $f1start = $f[-1]->start;
    $f1end = $f[0]->end;
481
  }
482

483
  #print STDERR "HSTRAND = ".$hstrand."\n";
484 485 486 487 488 489 490
  if ($hstrand == 1) {
    $f2start = $f[0]->hstart;
    $f2end = $f[-1]->hend;
  } else {
    $f2start = $f[-1]->hstart;
    $f2end = $f[0]->hend;
  }
491

492 493 494
  #
  # Loop through each portion of alignment and construct cigar string
  #
495

496 497 498 499
  foreach my $f (@f) {
    #
    # Sanity checks
    #
500

501 502
    if (!$f->isa("Bio::EnsEMBL::FeaturePair")) {
      $self->throw("Array element [$f] is not a Bio::EnsEMBL::FeaturePair");
503
    }
504 505 506 507
    if( defined $f->hstrand() ) {
      if ($f->hstrand != $hstrand) {
	$self->throw("Inconsistent hstrands in feature array");
      }
508
    }
509 510 511
    if( defined $f->strand() ) {
      if ($f->strand != $strand) {
	$self->throw("Inconsistent strands in feature array");
Arne Stabenau's avatar
Arne Stabenau committed
512
      }
513 514 515 516 517 518 519 520 521 522 523 524 525
    }
    if ($name ne $f->seqname) {
      $self->throw("Inconsistent names in feature array [$name - " . 
		   $f->seqname . "]");
    }
    if ($hname ne $f->hseqname) {
      $self->throw("Inconsistent names in feature array [$hname - " . 
		   $f->hseqname . "]");
    }
    if ($score ne $f->score) {
      $self->throw("Inconsisent scores in feature array [$score - " . 
		   $f->score . "]");
    }
526
    if (defined($f->percent_id) && $percent ne $f->percent_id) {
527 528 529 530 531 532
      $self->throw("Inconsistent pids in feature array [$percent - " . 
		   $f->percent_id . "]");
    }
    
    my $start1 = $f->start;      #source sequence alignment start
    my $start2 = $f->hstart();   #hit sequence alignment start
533

534 535 536 537 538 539 540 541 542 543
    #
    # More sanity checking
    #
    if (defined($prev1)) {
      if ( $strand == 1 ) {
	if ($f->start < $prev1) {
	  $self->throw("Inconsistent coordinates feature is forward strand " .
		       "hstart in current feature should be greater than " .
		       "hend in previous feature " . $f->start . " < " .
		       $prev1."\n");
Arne Stabenau's avatar
Arne Stabenau committed
544
	}
545 546 547 548 549
      } else {
	if ($f->end > $prev1) {
	  $self->throw("Inconsistent coordinates in feature array feature " .
		       "is reverse strand hend should be less than previous " .
		       "hstart " . $f->end . " > $prev1");
Arne Stabenau's avatar
Arne Stabenau committed
550
	}
551
      }
552
    }
553

554 555
    my $length = ($f->end - $f->start + 1); #length of source seq alignment
    my $hlength = ($f->hend - $f->hstart + 1); #length of hit seq alignment
556

557 558 559 560 561 562 563 564 565
    # using multiplication to avoid rounding errors, hence the
    # switch from query to hit for the ratios
    
    #
    # Yet more sanity checking
    #
    if($query_unit > $hit_unit){
      # I am going to make the assumption here that this situation will 
      # only occur with DnaPepAlignFeatures, this may not be true
Laura Clarke's avatar
Laura Clarke committed
566 567 568
      my $query_p_length = sprintf "%.0f", ($length/$query_unit);
      my $hit_p_length = sprintf "%.0f", ($hlength * $hit_unit);
      if( $query_p_length != $hit_p_length) {
569
	#print STDERR $length."/".$query_unit." ".$hlength."*".$hit_unit."\n";
570 571 572
	$self->throw( "Feature lengths not comparable Lengths:" .$length . 
		      " " . $hlength . " Ratios:" . $query_unit . " " . 
		      $hit_unit );
Arne Stabenau's avatar
Arne Stabenau committed
573
      }
574
    } else{
Laura Clarke's avatar
Laura Clarke committed
575 576
      my $query_d_length = sprintf "%.0f", ($length*$hit_unit);
      my $hit_d_length = sprintf "%.0f", ($hlength * $query_unit);
577
      if( $length * $hit_unit != $hlength * $query_unit ) {
578

579 580 581
	$self->throw( "Feature lengths not comparable Lengths:" . $length . 
		      " " . $hlength . " Ratios:" . $query_unit . " " . 
		      $hit_unit );
Arne Stabenau's avatar
Arne Stabenau committed
582
      }
583 584 585
    }

    my $hlengthfactor = ($query_unit/$hit_unit);
586

587 588 589 590 591 592
    # if( $query_unit == 1 && $hit_unit == 3 ) {
    #	$hlengthfactor = (1/3);
    #     }
    #     if( $query_unit == 3 && $hit_unit == 1 ) {
    #	$hlengthfactor = 3;
    #      }
Arne Stabenau's avatar
Arne Stabenau committed
593
      
594 595 596 597 598 599 600 601 602 603

    #
    # Check to see if there is an I type (insertion) gap:
    #   If there is a space between the end of the last source sequence 
    #   alignment and the start of this one, then this is an insertion
    #

    my $insertion_flag = 0;
    if( $strand == 1 ) {
      if( ( defined $prev1 ) && ( $f->start > $prev1 + 1  )) {
604

605 606 607 608 609
	#there is an insertion
	$insertion_flag = 1;
	my $gap = $f->start - $prev1 - 1;
	if( $gap == 1 ) {
	  $gap = ""; # no need for a number if gap length is 1
Arne Stabenau's avatar
Arne Stabenau committed
610
	}
611
	$string .= "$gap"."I";
612

Arne Stabenau's avatar
Arne Stabenau committed
613
      }
614 615 616 617

      #shift our position in the source seq alignment
      $prev1 = $f->end();
    } else {
618

619 620 621 622 623 624 625
      if(( defined $prev1 ) && ($f->end + 1 < $prev1 )) {

	#there is an insertion
	$insertion_flag = 1;
	my $gap = $prev1 - $f->end() - 1;
	if( $gap == 1 ) {
	  $gap = ""; # no need for a number if gap length is 1
Laura Clarke's avatar
 
Laura Clarke committed
626
	}
627
	$string .= "$gap"."I";
628
	#print STDERR "cigar stands at ".$string."\n" if($verbose);
Arne Stabenau's avatar
Arne Stabenau committed
629
      }
630

631 632 633
      #shift our position in the source seq alignment
      $prev1 = $f->start();
    }
Laura Clarke's avatar
 
Laura Clarke committed
634
      
635 636 637 638 639 640 641
    #
    # Check to see if there is a D type (deletion) gap
    #   There is a deletion gap if there is a space between the end of the
    #   last portion of the hit sequence alignment and this one
    #
    if( $hstrand == 1 ) {
      if((  defined $prev2 ) && ( $f->hstart() > $prev2 + 1 )) {
642

643 644 645 646 647 648
	#there is a deletion
	my $gap = $f->hstart - $prev2 - 1;
	my $gap2 = int( $gap * $hlengthfactor + 0.05 );
	
	if( $gap2 == 1 ) {
	  $gap2 = "";  # no need for a number if gap length is 1
Arne Stabenau's avatar
Arne Stabenau committed
649
	}
650
	$string .= "$gap2"."D";
651

652 653
	#sanity check,  Should not be an insertion and deletion
	if($insertion_flag) {
654
	  $self->warn("Should not be an deletion and insertion on the " .
655
		       "same alignment region. cigar_line=$string\n");
Arne Stabenau's avatar
Arne Stabenau committed
656
	} 
657 658 659 660 661 662
      } 
      #shift our position in the hit seq alignment
      $prev2 = $f->hend();

     } else {
      if( ( defined $prev2 ) && ( $f->hend() + 1 < $prev2 )) {
663

664 665 666 667 668 669
	#there is a deletion
	my $gap = $prev2 - $f->hend - 1;
	my $gap2 = int( $gap * $hlengthfactor + 0.05 );
	
	if( $gap2 == 1 ) {
	  $gap2 = "";  # no need for a number if gap length is 1
Arne Stabenau's avatar
Arne Stabenau committed
670
	}
671
	$string .= "$gap2"."D";
672

673 674 675 676 677 678
	#sanity check,  Should not be an insertion and deletion
	if($insertion_flag) {
	  $self->throw("Should not be an deletion and insertion on the " .
		       "same alignment region. prev2 = $prev2; f->hend() = " .
		       $f->hend() . "; cigar_line = $string;\n");
	} 
679
      }
680
      #shift our position in the hit seq alignment
681
     
682 683
      $prev2 = $f->hstart();
    }
684
      
685 686 687
    my $matchlength = $f->end() - $f->start() + 1;
    if( $matchlength == 1 ) {
      $matchlength = "";
688
    }
689
    $string .= $matchlength."M";
690

691
    #print STDERR "finished with this feature\n\n";
692
  }
693

694
  if(!$score){
695
    #$self->warn("score is not set assume its 1");
696 697
    $score = 1;
  } 
698
  my $feature1 = new Bio::EnsEMBL::SeqFeature();
699
  
700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723
  $feature1->start($f1start);
  $feature1->end  ($f1end);
  $feature1->strand($strand);
  $feature1->score($score);
  $feature1->percent_id($percent);
  $feature1->p_value($pvalue);
  $feature1->seqname($name);
  $feature1->phase($phase);
  $feature1->analysis($analysis);
  #print STDERR "checking feature1 ".$feature1->gffstring."\n";
  $feature1->validate;
  
  my $feature2 = new Bio::EnsEMBL::SeqFeature();
  
  $feature2->start($f2start);
  $feature2->end  ($f2end);
  $feature2->strand($hstrand);
  $feature2->score($score);
  $feature2->seqname($hname);
  $feature2->percent_id($percent);
  $feature2->p_value($pvalue);
  $feature2->phase($phase);
  $feature2->analysis($analysis);
  $feature2->validate;
724 725


726 727 728
  $self->feature1($feature1);
  $self->feature2($feature2);
  $self->cigar_string($string);
729

730
}
731

Laura Clarke's avatar
 
Laura Clarke committed
732

Arne Stabenau's avatar
Arne Stabenau committed
733

734 735 736



Graham McVicker's avatar
Graham McVicker committed
737
sub _transform_to_RawContig{
Simon Potter's avatar
Simon Potter committed
738
  my ($self) = @_;
739

Laura Clarke's avatar
 
Laura Clarke committed
740
  if(!$self->contig){
741 742
    $self->throw("can't transform coordinates of ". $self
		 . " without some sort of contig defined");
Laura Clarke's avatar
 
Laura Clarke committed
743
  }
744

745
  my $rcAdaptor = $self->contig->adaptor()->db()->get_RawContigAdaptor();
746
  #my $global_start = $self->contig->chr_start();
Laura Clarke's avatar
 
Laura Clarke committed
747
  my @out;
Laura Clarke's avatar
 
Laura Clarke committed
748

Laura Clarke's avatar
 
Laura Clarke committed
749 750 751
  my @features = $self->_parse_cigar();
  my @mapped_features;
  my %rc_features;
Laura Clarke's avatar
 
Laura Clarke committed
752

Laura Clarke's avatar
 
Laura Clarke committed
753
  foreach my $f(@features){
754

Graham McVicker's avatar
Graham McVicker committed
755
    my @new_features = $self->_transform_feature_to_RawContig($f);
Laura Clarke's avatar
 
Laura Clarke committed
756 757
    push(@mapped_features, @new_features);
  }
Arne Stabenau's avatar
Arne Stabenau committed
758

Laura Clarke's avatar
 
Laura Clarke committed
759
  foreach my $mf(@mapped_features){
760

761
    my $contig_id = $mf->contig->dbID;
Laura Clarke's avatar
 
Laura Clarke committed
762 763
    if(!$rc_features{$contig_id}){
      $rc_features{$contig_id} = [];
764
    }
765 766

    push(@{$rc_features{$contig_id}}, $mf);
Laura Clarke's avatar
 
Laura Clarke committed
767
  }
Arne Stabenau's avatar
Arne Stabenau committed
768

Laura Clarke's avatar
 
Laura Clarke committed
769
  foreach my $contig_id(keys(%rc_features)){
770

771
    my $outputf = $self->new( -features => $rc_features{$contig_id} );
Laura Clarke's avatar
 
Laura Clarke committed
772 773 774
    $outputf->analysis( $self->analysis() );
    $outputf->score( $self->score() );
    $outputf->percent_id( $self->percent_id() );
775
    $outputf->p_value( $self->p_value());
Laura Clarke's avatar
 
Laura Clarke committed
776 777
    my $contig = $rcAdaptor->fetch_by_dbID($contig_id);
    $outputf->contig($contig);
Laura Clarke's avatar
 
Laura Clarke committed
778
    push(@out, $outputf);
Laura Clarke's avatar
 
Laura Clarke committed
779
  }
Laura Clarke's avatar
 
Laura Clarke committed
780
  #print STDERR "returning ".@out." feature from basealignfeature\n";
Laura Clarke's avatar
 
Laura Clarke committed
781 782
  return @out;

783 784
}

Laura Clarke's avatar
 
Laura Clarke committed
785 786


787 788 789
=head2 _hit_unit

  Args       : none
Arne Stabenau's avatar
Arne Stabenau committed
790
  Example    : none
791 792 793
  Description: abstract method, overwrite with something that returns
               one or three
  Returntype : int 1,3
Arne Stabenau's avatar
Arne Stabenau committed
794
  Exceptions : none
795
  Caller     : internal
Laura Clarke's avatar
 
Laura Clarke committed
796 797 798

=cut

799 800 801
sub _hit_unit {
  my $self = shift;
  $self->throw( "Abstract method call!" );
Laura Clarke's avatar
 
Laura Clarke committed
802 803
}

804
=head2 _query_unit
Laura Clarke's avatar
 
Laura Clarke committed
805

806
  Args       : none
Arne Stabenau's avatar
Arne Stabenau committed
807
  Example    : none
808 809 810
  Description: abstract method, overwrite with something that returns
               one or three
  Returntype : int 1,3
Arne Stabenau's avatar
Arne Stabenau committed
811
  Exceptions : none
812
  Caller     : internal
Laura Clarke's avatar
 
Laura Clarke committed
813 814 815

=cut

816
sub _query_unit {
Laura Clarke's avatar
 
Laura Clarke committed
817
  my $self = shift;
818
  $self->throw( "Abstract method call!" );
Laura Clarke's avatar
 
Laura Clarke committed
819 820
}

Arne Stabenau's avatar
Arne Stabenau committed
821

Graham McVicker's avatar
Graham McVicker committed
822
sub _transform_feature_to_RawContig{
Laura Clarke's avatar
 
Laura Clarke committed
823 824
  my($self, $feature) =  @_;

825 826
  my $verbose = 0;
  #$verbose = 1 if($feature->hseqname eq 'CE25688');
827

828
  #print STDERR "transforming ".$feature->gffstring."\n" if($verbose);
829

830 831 832 833
  if(!$self->contig){
    $self->throw("can't transform coordinates of ".$self." without some sort of contig defined");
  }
  my $mapper = $self->contig->adaptor->db->get_AssemblyMapperAdaptor->fetch_by_type( $self->contig()->assembly_type() );
Laura Clarke's avatar
 
Laura Clarke committed
834
  
835
  my $rcAdaptor = $self->contig->adaptor()->db()->get_RawContigAdaptor();
836 837


838 839 840 841 842 843 844 845 846 847 848 849
  my $slice = $feature->contig;
  my $slice_start  = $slice->chr_start;
  my $slice_end    = $slice->chr_end;
  my $slice_strand = $slice->strand;

  my ($global_start, $global_end, $global_strand);

  #change feature coords from slice coordinates to assembly
  if($slice_strand == 1) {
    $global_start  = $feature->start + $slice_start - 1;
    $global_end    = $feature->end   + $slice_start - 1;
    $global_strand = $feature->strand;
850
  } else {
851 852 853
    $global_start  = $slice_end - $feature->end   + 1;
    $global_end    = $slice_end - $feature->start + 1;
    $global_strand = $feature->strand * -1;
854 855
  }

856 857
  my @out;
  #convert assembly coords to raw contig coords
Laura Clarke's avatar
 
Laura Clarke committed
858 859
  my @mapped = $mapper->map_coordinates_to_rawcontig
    (
860
     $slice->chr_name(),
861 862 863
     $global_start,
     $global_end,
     $global_strand
Laura Clarke's avatar
 
Laura Clarke committed
864 865
    );

866
  if( ! @mapped ) {
Laura Clarke's avatar
 
Laura Clarke committed
867 868 869
    $self->throw( "couldn't map ".$self."\n" );
    return $self;
  }
870 871
  my ( $hit_start, $hit_end );

Laura Clarke's avatar
 
Laura Clarke committed
872
  if( scalar( @mapped ) > 1 ) {
873
    #print STDERr "splitting evidence\n";
874 875 876 877 878
    if( $feature->hstrand == 1 ) {
      $hit_start = $feature->hstart();
    } else {
      $hit_end = $feature->hend();
    }
Laura Clarke's avatar
 
Laura Clarke committed
879 880
    #print STDERR " feature is being mapped across multiple contigs ".$feature->gffstring."\n";
  SPLIT: for( my $i=0; $i <= $#mapped; $i++ ) {
881 882 883 884 885
      if($mapped[$i]->isa("Bio::EnsEMBL::Mapper::Gap")){
	$self->warn("piece of evidence lies on gap\n");
	next SPLIT;
      }
      #print STDERR "query coords ".$mapped[$i]->end." ".$mapped[$i]->start."\n";
Laura Clarke's avatar
 
Laura Clarke committed
886
      my $query_length = ($mapped[$i]->end - $mapped[$i]->start + 1);
887
      #print STDERR " query length = ".$query_length."\n";
Laura Clarke's avatar
 
Laura Clarke committed
888 889 890 891
      my $hit_length;
      if($self->_query_unit == $self->_hit_unit){
	$hit_length = $query_length;
      }elsif($self->_query_unit > $self->_hit_unit){
892 893 894
	my $tmp =  ($query_length/$self->_query_unit);
	#print STDERR "tmp = ".$tmp."\n";
	#$hit_length = int($tmp);
895
	$hit_length = sprintf "%.0f", $tmp;
Laura Clarke's avatar
 
Laura Clarke committed
896
      }elsif($self->_hit_unit > $self->_query_unit){
897 898
	my $tmp = ($query_length*$self->_hit_unit);
	#print STDERR "tmp = ".$tmp."\n";
899
	$hit_length = sprintf "%.0f", $tmp;
Laura Clarke's avatar
 
Laura Clarke committed
900
      }
901
      #print STDERR "hit length ".$hit_length."\n";
902 903 904
      if($hit_length == 0){
	next SPLIT;
      }
905 906 907 908 909
      if( $feature->hstrand() == 1 ) {
	$hit_end = ($hit_start + $hit_length) - 1;
      } else {
	$hit_start = ( $hit_end - $hit_length + 1 );
      } 
910
      #print "hit start ".$hit_start." hit end ".$hit_end."\n";
Laura Clarke's avatar
 
Laura Clarke committed
911
      my $rawContig = $rcAdaptor->fetch_by_dbID( $mapped[$i]->id() );
912 913 914 915
      my $f1 = new Bio::EnsEMBL::SeqFeature();
      my $f2 = new Bio::EnsEMBL::SeqFeature();
      my $new_feature = Bio::EnsEMBL::FeaturePair->new(-feature1=>$f1,
						       -feature2=>$f2);
916
     
Laura Clarke's avatar
 
Laura Clarke committed
917 918 919 920 921 922 923 924 925 926 927
      $new_feature->start($mapped[$i]->start);
      $new_feature->end($mapped[$i]->end);
      $new_feature->strand($mapped[$i]->strand);
      $new_feature->seqname($mapped[$i]->id);
      $new_feature->score($feature->score);
      $new_feature->percent_id($feature->percent_id);
      $new_feature->p_value($feature->p_value);
      $new_feature->hstart($hit_start);
      $new_feature->hend($hit_end);
      $new_feature->hstrand($feature->hstrand);
      $new_feature->hseqname($feature->hseqname);
928

Laura Clarke's avatar