=head1 LICENSE

  Copyright (c) 1999-2013 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

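=head1 NAME

Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper

=head1 SYNOPSIS

=head1 DESCRIPTION

Provides basic_mapping(), which finds the highest unambiguous score for
all sources and targets in a scoring matrix, together with the helper
methods it relies on.

=head1 METHODS

  basic_mapping
  higher_score_exists
  ambiguous_mapping
  scores_similar
  filter_sources
  filter_targets

=cut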

package Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;

use strict;
use warnings;
no warnings 'uninitialized';

use Bio::EnsEMBL::IdMapping::BaseObject;
our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);

use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
use Bio::EnsEMBL::IdMapping::MappingList;

# two scores s1 and s2 are considered the same if
# (2.0 * |s1 - s2|)/(s1 + s2) < this
use constant SIMILAR_SCORE_RATIO => 0.01;

#
# Find the highest unambiguous score for all sources and targets in a scoring
# matrix.
#
# Arguments:
#   $matrix       - the Bio::EnsEMBL::IdMapping::ScoredMappingMatrix to pick
#                   mappings from
#   $mapping_name - name of the mapping; the MappingList is cached in
#                   "<mapping_name>.ser"
#
# Returns the Bio::EnsEMBL::IdMapping::MappingList holding the accepted
# entries. If the MappingList reports that it was loaded from an existing
# cache file, it is returned as is and the matrix is not processed.
#
# Throws if $matrix is missing or not a ScoredMappingMatrix, or if
# $mapping_name is empty.
#
sub basic_mapping {
  my ( $self, $matrix, $mapping_name ) = @_;

  # argument checks
  unless ($matrix
      and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  throw('Need a name for serialising the mapping.')
    unless ($mapping_name);

  # Create a new MappingList object. Specify AUTO_LOAD to load
  # serialised existing mappings if found
  my $dump_path =
    path_append( $self->conf->param('basedir'), 'mapping' );

  my $mappings =
    Bio::EnsEMBL::IdMapping::MappingList->new(
                                   -DUMP_PATH  => $dump_path,
                                   -CACHE_FILE => "${mapping_name}.ser",
                                   -AUTO_LOAD  => 1, );

  # checkpoint test: return a previously stored MappingList
  if ( $mappings->loaded ) {
    $self->logger->info(
                  "Read existing mappings from ${mapping_name}.ser.\n");
    return $mappings;
  }

  # sources and targets that already received a mapping
  my $sources_done = {};
  my $targets_done = {};

  # sort scoring matrix entries by descending score
  my @sorted_entries =
    sort { $b->score <=> $a->score } @{ $matrix->get_all_Entries };

  # debug: last character of the mapping name tags each trace line
  my $idx = substr( $mapping_name, -1 );

  # The single-letter debug messages below ('d', 'h', 'a', 'A') trace how
  # far each entry got through the checks.
  while ( my $entry = shift(@sorted_entries) ) {

    $self->logger->debug("\nxxx$idx ".$entry->to_string." ");

    # we already found a mapping for either source or target
    next
      if (    $sources_done->{ $entry->source }
           or $targets_done->{ $entry->target } );

    $self->logger->debug('d');

    # there's a better mapping for either source or target
    next
      if ( $self->higher_score_exists(
                           $entry, $matrix, $sources_done, $targets_done
           ) );

    $self->logger->debug('h');

    # check for ambiguous mappings; they are dealt with later
    # unless all the other ambiguous mappings have already been dealt with
    my $other_sources = [];
    my $other_targets = [];

    if ( $self->ambiguous_mapping( $entry,         $matrix,
                                   $other_sources, $other_targets ) )
    {
      $self->logger->debug('a');

      # drop competitors that have been mapped already
      $other_sources =
        $self->filter_sources( $other_sources, $sources_done );
      $other_targets =
        $self->filter_targets( $other_targets, $targets_done );

      # still ambiguous: at least one unmapped competitor remains
      next if ( scalar(@$other_sources) or scalar(@$other_targets) );
    }

    $self->logger->debug('A');

    # this is the best mapping, add it
    $mappings->add_Entry($entry);

    $sources_done->{ $entry->source } = 1;
    $targets_done->{ $entry->target } = 1;
  } ## end while ( my $entry = shift...)

  # create checkpoint
  $mappings->write_to_file;

  return $mappings;
} ## end sub basic_mapping

#
# Check whether the source or the target of $entry has a strictly higher
# score with some other partner that has not been mapped yet.
#
# Arguments:
#   $entry        - entry providing source(), target() and score()
#   $matrix       - scoring matrix providing get_sources_for_target(),
#                   get_targets_for_source() and get_score()
#   $sources_done - hashref; sources with a true value are ignored
#   $targets_done - hashref; targets with a true value are ignored
#
# Returns 1 if a higher score exists, 0 otherwise.
#
sub higher_score_exists {
  my ( $self, $entry, $matrix, $sources_done, $targets_done ) = @_;

  my $source = $entry->source;
  my $target = $entry->target;
  my $score  = $entry->score;

  # other sources competing for the same target
  foreach
    my $other_source ( @{ $matrix->get_sources_for_target($target) } )
  {
    next if ( $other_source == $source );
    next if ( $sources_done->{$other_source} );

    return 1
      if ( $score < $matrix->get_score( $other_source, $target ) );
  }

  # other targets competing for the same source
  foreach
    my $other_target ( @{ $matrix->get_targets_for_source($source) } )
  {
    next if ( $other_target == $target );
    next if ( $targets_done->{$other_target} );

    return 1
      if ( $score < $matrix->get_score( $source, $other_target ) );
  }

  return 0;
} ## end sub higher_score_exists

#
# Find ambiguous mappings (see scores_similar() for definition).
#
# A competitor is any other source for the entry's target, or any other
# target for the entry's source, whose score is similar to or higher than
# the entry's own score.
#
# Arguments:
#   $entry         - entry providing source(), target() and score()
#   $matrix        - scoring matrix providing get_sources_for_target(),
#                    get_targets_for_source() and get_score()
#   $other_sources - arrayref; competing sources are appended to it
#   $other_targets - arrayref; competing targets are appended to it
#
# Returns 1 if at least one competitor was found, 0 otherwise.
#
sub ambiguous_mapping {
  my ( $self, $entry, $matrix, $other_sources, $other_targets ) = @_;

  my $source = $entry->source;
  my $target = $entry->target;
  my $score  = $entry->score;

  my $retval = 0;

  # other sources competing for the same target
  foreach
    my $other_source ( @{ $matrix->get_sources_for_target($target) } )
  {
    next if ( $other_source == $source );

    my $other_score = $matrix->get_score( $other_source, $target );

    if (    $self->scores_similar( $score, $other_score )
         or $score < $other_score )
    {
      $retval = 1;
      push @{$other_sources}, $other_source;
    }
  }

  # other targets competing for the same source
  foreach
    my $other_target ( @{ $matrix->get_targets_for_source($source) } )
  {
    next if ( $other_target == $target );

    my $other_score = $matrix->get_score( $source, $other_target );

    if (    $self->scores_similar( $score, $other_score )
         or $score < $other_score )
    {
      $retval = 1;
      push @{$other_targets}, $other_target;
    }
  }

  return $retval;
} ## end sub ambiguous_mapping

#
# Decide whether two scores are close enough to be treated as the same.
# Rule for similarity taken from java code...
#
# Arguments:
#   $s1, $s2 - the two scores to compare
#
# Returns a true value if the difference between the scores, relative to
# their mean, is below SIMILAR_SCORE_RATIO; a false value otherwise.
#
sub scores_similar {
  my ( $self, $s1, $s2 ) = @_;

  # always give priority to exact matches over very similar ones
  return 0 if ( $s1 == 1 and $s2 < 1 );

  my $diff = abs( $s1 - $s2 );
  my $sum  = $s1 + $s2;

  # The ratio is undefined when the scores sum to zero; avoid the division
  # and call the scores similar only if they are actually identical.
  return ( $diff == 0 ) if ( $sum == 0 );

  my $pc = 2*$diff/$sum;

  return ( $pc < SIMILAR_SCORE_RATIO );
}

#
# Remove sources that have already been mapped from a list of sources.
#
# Arguments:
#   $other_sources - arrayref of sources to filter
#   $sources_done  - hashref; sources with a true value are dropped
#
# Returns an arrayref of the remaining sources in their original order.
# If either argument is empty there is nothing to filter and
# $other_sources itself is returned; otherwise a new arrayref is returned.
#
sub filter_sources {
  my ( $self, $other_sources, $sources_done ) = @_;

  return $other_sources
    unless (     scalar( @{$other_sources} )
             and scalar( keys %{$sources_done} ) );

  return [ grep { !$sources_done->{$_} } @{$other_sources} ];
}

#
# Remove targets that have already been mapped from a list of targets.
#
# Arguments:
#   $other_targets - arrayref of targets to filter
#   $targets_done  - hashref; targets with a true value are dropped
#
# Returns an arrayref of the remaining targets in their original order.
# If either argument is empty there is nothing to filter and
# $other_targets itself is returned; otherwise a new arrayref is returned.
#
sub filter_targets {
  my ( $self, $other_targets, $targets_done ) = @_;

  return $other_targets
    unless (     scalar( @{$other_targets} )
             and scalar( keys %{$targets_done} ) );

  return [ grep { !$targets_done->{$_} } @{$other_targets} ];
}

1;