=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

=head1 NAME

Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper - base class
providing score-based mapping of sources to targets

=head1 SYNOPSIS

=head1 DESCRIPTION

=head1 METHODS

=cut

package Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;

use strict;
use warnings;

# undefined values are deliberately tolerated in comparisons and hash
# lookups throughout this module
no warnings 'uninitialized';

use Bio::EnsEMBL::IdMapping::BaseObject;
our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);

use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
use Bio::EnsEMBL::IdMapping::MappingList;

# two scores s1 and s2 are considered the same if
# (2.0 * abs(s1 - s2)) / (s1 + s2) is smaller than this ratio
use constant SIMILAR_SCORE_RATIO => 0.01;

#
# find the highest unambiguous score for all sources and targets in a scoring
# matrix
#
# Arguments:
#   $matrix       - a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#   $mapping_name - name used to build the cache file name
#                   ("<mapping_name>.ser")
#
# Returns a Bio::EnsEMBL::IdMapping::MappingList. If a serialised list was
# loaded from the cache file it is returned unchanged; otherwise the list is
# filled from the matrix and written to file before being returned.
#
# Throws if $matrix is not a ScoredMappingMatrix or $mapping_name is empty.
#
sub basic_mapping {
  my ( $self, $matrix, $mapping_name ) = @_;

  # argument checks; the eval makes an unblessed reference fail with the
  # descriptive message below instead of a "Can't call method" error
  unless ( $matrix
       and eval { $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') } )
  {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  throw('Need a name for serialising the mapping.')
    unless ($mapping_name);

  # Create a new MappingList object. Specify AUTO_LOAD to load
  # serialised existing mappings if found
  my $dump_path =
    path_append( $self->conf->param('basedir'), 'mapping' );

  my $mappings =
    Bio::EnsEMBL::IdMapping::MappingList->new(
                                   -DUMP_PATH  => $dump_path,
                                   -CACHE_FILE => "${mapping_name}.ser",
                                   -AUTO_LOAD  => 1, );

  # checkpoint test: return a previously stored MappingList
  if ( $mappings->loaded ) {
    $self->logger->info(
                  "Read existing mappings from ${mapping_name}.ser.\n");
    return $mappings;
  }

  # sources and targets that already received a mapping
  my $sources_done = {};
  my $targets_done = {};

  # sort scoring matrix entries by descending score
  my @sorted_entries =
    sort { $b->score <=> $a->score } @{ $matrix->get_all_Entries };

ENTRY:
  foreach my $entry (@sorted_entries) {
    my $source = $entry->source;
    my $target = $entry->target;

    # we already found a mapping for either source or target
    next ENTRY
      if ( $sources_done->{$source} or $targets_done->{$target} );

    # there's a better mapping for either source or target
    next ENTRY
      if ( $self->higher_score_exists(
                           $entry, $matrix, $sources_done, $targets_done
           ) );

    # check for ambiguous mappings; competitors collected here are only
    # relevant while they are still unmapped
    my $other_sources = [];
    my $other_targets = [];

    if ( $self->ambiguous_mapping( $entry,         $matrix,
                                   $other_sources, $other_targets ) )
    {
      $other_sources =
        $self->filter_sources( $other_sources, $sources_done );
      $other_targets =
        $self->filter_targets( $other_targets, $targets_done );

      # an unmapped competitor remains, so this entry is not added here
      next ENTRY
        if ( scalar(@$other_sources) or scalar(@$other_targets) );
    }

    # this is the best mapping, add it
    $mappings->add_Entry($entry);

    $sources_done->{$source} = 1;
    $targets_done->{$target} = 1;
  } ## end foreach my $entry (@sorted_entries)

  # create checkpoint
  $mappings->write_to_file;

  return $mappings;
} ## end sub basic_mapping

#
# check whether a not yet mapped competitor scores strictly higher than the
# given entry, either another source for the entry's target or another
# target for the entry's source
#
# Arguments:
#   $entry        - object providing source(), target() and score()
#   $matrix       - object providing get_sources_for_target(),
#                   get_targets_for_source() and get_score()
#   $sources_done - hashref keyed by sources that already have a mapping
#   $targets_done - hashref keyed by targets that already have a mapping
#
# Returns 1 if such a competitor exists, 0 otherwise. Sources and targets
# are compared numerically.
#
sub higher_score_exists {
  my ( $self, $entry, $matrix, $sources_done, $targets_done ) = @_;

  my $source = $entry->source;
  my $target = $entry->target;
  my $score  = $entry->score;

  foreach
    my $other_source ( @{ $matrix->get_sources_for_target($target) } )
  {
    # ignore the entry itself and sources which are already mapped
    next if ( $other_source == $source );
    next if ( $sources_done->{$other_source} );

    return 1
      if ( $score < $matrix->get_score( $other_source, $target ) );
  }

  foreach
    my $other_target ( @{ $matrix->get_targets_for_source($source) } )
  {
    # ignore the entry itself and targets which are already mapped
    next if ( $other_target == $target );
    next if ( $targets_done->{$other_target} );

    return 1
      if ( $score < $matrix->get_score( $source, $other_target ) );
  }

  return 0;
} ## end sub higher_score_exists

#
# find ambiguous mappings (see scores_similar() for definition)
#
# Arguments:
#   $entry         - object providing source(), target() and score()
#   $matrix        - object providing get_sources_for_target(),
#                    get_targets_for_source() and get_score()
#   $other_sources - arrayref; competing sources for the entry's target are
#                    appended to it
#   $other_targets - arrayref; competing targets for the entry's source are
#                    appended to it
#
# A competitor is any other source/target whose score is similar to or
# higher than the entry's score. Returns 1 if at least one competitor was
# found, 0 otherwise. Sources and targets are compared numerically.
#
sub ambiguous_mapping {
  my ( $self, $entry, $matrix, $other_sources, $other_targets ) = @_;

  my $source = $entry->source;
  my $target = $entry->target;
  my $score  = $entry->score;

  # true if $other_score is similar to or better than the entry's score
  my $competes = sub {
    my ($other_score) = @_;
    return (    $self->scores_similar( $score, $other_score )
             or $score < $other_score );
  };

  my $retval = 0;

  foreach
    my $other_source ( @{ $matrix->get_sources_for_target($target) } )
  {
    # the entry does not compete with itself
    next if ( $other_source == $source );

    if ( $competes->( $matrix->get_score( $other_source, $target ) ) ) {
      $retval = 1;
      push @{$other_sources}, $other_source;
    }
  }

  foreach
    my $other_target ( @{ $matrix->get_targets_for_source($source) } )
  {
    # the entry does not compete with itself
    next if ( $other_target == $target );

    if ( $competes->( $matrix->get_score( $source, $other_target ) ) ) {
      $retval = 1;
      push @{$other_targets}, $other_target;
    }
  }

  return $retval;
} ## end sub ambiguous_mapping

#
# rule for similarity taken from java code...
#
# Arguments:
#   $s1, $s2 - the two scores to compare
#
# Returns true if the scores are considered the same, i.e. if
# (2 * abs(s1 - s2)) / (s1 + s2) is smaller than SIMILAR_SCORE_RATIO.
# The check is not symmetric: a first score of exactly 1 is never similar
# to a second score below 1.
#
sub scores_similar {
  my ( $self, $s1, $s2 ) = @_;

  # always give priority to exact matches over very similar ones
  return 0 if ( $s1 == 1 and $s2 < 1 );

  my $diff = abs( $s1 - $s2 );

  # identical scores are always similar; answering here also avoids a
  # division by zero when both scores are 0
  return 1 if ( $diff == 0 );

  my $sum = $s1 + $s2;

  # different scores cancelling out to 0 cannot be put into relation
  return 0 if ( $sum == 0 );

  return ( ( 2*$diff/$sum ) < SIMILAR_SCORE_RATIO );
}

#
# remove sources which already have a mapping from a list of sources
#
# Arguments:
#   $other_sources - arrayref of sources
#   $sources_done  - hashref keyed by sources that already have a mapping
#
# Returns an arrayref of the sources without a true value in $sources_done.
# If either argument is empty, $other_sources itself is returned.
#
sub filter_sources {
  my ( $self, $other_sources, $sources_done ) = @_;

  # nothing to filter, hand back the caller's list untouched
  return $other_sources
    unless (     scalar( @{$other_sources} )
             and scalar( keys %{$sources_done} ) );

  return [ grep { !$sources_done->{$_} } @{$other_sources} ];
}

#
# remove targets which already have a mapping from a list of targets
#
# Arguments:
#   $other_targets - arrayref of targets
#   $targets_done  - hashref keyed by targets that already have a mapping
#
# Returns an arrayref of the targets without a true value in $targets_done.
# If either argument is empty, $other_targets itself is returned.
#
sub filter_targets {
  my ( $self, $other_targets, $targets_done ) = @_;

  # nothing to filter, hand back the caller's list untouched
  return $other_targets
    unless (     scalar( @{$other_targets} )
             and scalar( keys %{$targets_done} ) );

  return [ grep { !$targets_done->{$_} } @{$other_targets} ];
}

1;