Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Open sidebar
ensembl-gh-mirror
ensembl
Commits
ce19e680
Commit
ce19e680
authored
Apr 28, 2008
by
Patrick Meidl
Browse files
plugin architecture for InternalIdMapper
parent
e28ad271
Changes
8
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
971 additions
and
483 deletions
+971
-483
misc-scripts/id_mapping/default.conf
misc-scripts/id_mapping/default.conf
+21
-2
misc-scripts/id_mapping/id_mapping.pl
misc-scripts/id_mapping/id_mapping.pl
+3
-0
misc-scripts/id_mapping/run.pl
misc-scripts/id_mapping/run.pl
+3
-0
modules/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm
modules/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm
+140
-481
modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm
modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm
+250
-0
modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblExonGeneric.pm
.../EnsEMBL/IdMapping/InternalIdMapper/EnsemblExonGeneric.pm
+91
-0
modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblGeneGeneric.pm
.../EnsEMBL/IdMapping/InternalIdMapper/EnsemblGeneGeneric.pm
+187
-0
modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm
...BL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm
+276
-0
No files found.
misc-scripts/id_mapping/default.conf
View file @
ce19e680
...
...
@@ -5,7 +5,7 @@ dry_run = 0
loglevel
=
DEBUG
;
paths
basedir
= /
lustre
/
work1
/
ensembl
/
pm2
/
idmapping
/
perl
/
2008
-
04
-
2
2
c
basedir
= /
lustre
/
work1
/
ensembl
/
pm2
/
idmapping
/
perl
/
2008
-
04
-
2
8
;
prepend
this
path
to
your
'log'
parameter
;
will
default
to
"$basedir/log"
if
not
set
...
...
@@ -27,7 +27,7 @@ targetdbname = pm2_pan_troglodytes_core_41_21
;
caching
;
cache_method
=
build_cache_all
build_cache_auto_threshold
=
100
build_cache_concurrent_jobs
=
200
build_cache_concurrent_jobs
=
200
;
limit
;
region
=
chromosome
:
CHIMP1A
:
1
:
1
:
2000000
:
1
...
...
@@ -50,6 +50,25 @@ transcript_score_threshold = 0
synteny_rescore_jobs
=
20
;
lsf_opt_synteny_rescore
=
;
InternalIdMapper
;
plugin_internal_id_mappers_gene
= \
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblGeneGeneric
::
init_basic
,\
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblGeneGeneric
::
synteny
,\
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblGeneGeneric
::
best_transcript
,\
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblGeneGeneric
::
biotype
,\
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblGeneGeneric
::
internal_id
;
plugin_internal_id_mappers_transcript
= \
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblTranscriptGeneric
::
init_basic
,\
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblTranscriptGeneric
::
non_exact_translation
,\
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblTranscriptGeneric
::
mapped_gene
,\
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblTranscriptGeneric
::
internal_id
,\
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblTranscriptGeneric
::
single_gene
;
plugin_internal_id_mappers_exon
= \
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblExonGeneric
::
init_basic
,\
;
Bio
::
EnsEMBL
::
IdMapping
::
InternalIdMapper
::
EnsemblExonGeneric
::
mapped_transcript
;
StableIdMapper
mapping_types
=
gene
,
transcript
,
translation
,
exon
;
plugin_stable_id_generator
=
Bio
::
EnsEMBL
::
IdMapping
::
StableIdGenerator
::
EnsemblGeneric
...
...
misc-scripts/id_mapping/id_mapping.pl
View file @
ce19e680
...
...
@@ -87,6 +87,9 @@ $conf->parse_options(
'
exonerate_jobs|exoneratejobs=i
'
=>
0
,
'
exonerate_bytes_per_job|exoneratebytesperjob=f
'
=>
0
,
'
exonerate_extra_params|exonerateextraparams=s
'
=>
0
,
'
plugin_internal_id_mappers_gene=s@
'
=>
0
,
'
plugin_internal_id_mappers_transcript=s@
'
=>
0
,
'
plugin_internal_id_mappers_exon=s@
'
=>
0
,
'
mapping_types=s@
'
=>
1
,
'
plugin_stable_id_generator=s
'
=>
0
,
'
upload_events|uploadevents=s
'
=>
0
,
...
...
misc-scripts/id_mapping/run.pl
View file @
ce19e680
...
...
@@ -96,6 +96,9 @@ $conf->parse_options(
'
exonerate_jobs|exoneratejobs=i
'
=>
0
,
'
exonerate_bytes_per_job|exoneratebytesperjob=f
'
=>
0
,
'
exonerate_extra_params|exonerateextraparams=s
'
=>
0
,
'
plugin_internal_id_mappers_gene=s@
'
=>
0
,
'
plugin_internal_id_mappers_transcript=s@
'
=>
0
,
'
plugin_internal_id_mappers_exon=s@
'
=>
0
,
'
mapping_types=s@
'
=>
1
,
'
plugin_stable_id_generator=s
'
=>
0
,
'
upload_events|uploadevents=s
'
=>
0
,
...
...
modules/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm
View file @
ce19e680
This diff is collapsed.
Click to expand it.
modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm
0 → 100644
View file @
ce19e680
package
Bio::EnsEMBL::IdMapping::InternalIdMapper::
BaseMapper
;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use
strict
;
use
warnings
;
no
warnings
'
uninitialized
';
use
Bio::EnsEMBL::IdMapping::
BaseObject
;
our
@ISA
=
qw(Bio::EnsEMBL::IdMapping::BaseObject)
;
use
Bio::EnsEMBL::Utils::
Exception
qw(throw warning)
;
use
Bio::EnsEMBL::Utils::
ScriptUtils
qw(path_append)
;
use
Bio::EnsEMBL::IdMapping::
MappingList
;
# Two scores s1 and s2 are considered similar if their relative difference,
# 2 * |s1 - s2| / (s1 + s2), is below this ratio (see scores_similar()).
use
constant
SIMILAR_SCORE_RATIO
=>
0.01
;
#
# Find the highest unambiguous score for all sources and targets in a scoring
# matrix.
#
# Entries are visited from highest to lowest score. An entry is accepted when
# neither its source nor its target is mapped yet, no higher score is still
# available for either of them, and every competitor with a similar or better
# score has already been mapped elsewhere. The result is written to file as a
# checkpoint.
#
# Arguments: $matrix       - a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#            $mapping_name - name for serialising the mapping; the cache file
#                            is "<mapping_name>.ser"
# Returns:   a Bio::EnsEMBL::IdMapping::MappingList; if a serialised list
#            could be loaded it is returned without recomputing anything
# Throws:    if $matrix is missing or of the wrong class, or if no
#            $mapping_name was given
#
sub basic_mapping {
  my ($self, $matrix, $mapping_name) = @_;

  # argument checks
  if (!$matrix or !$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }
  if (!$mapping_name) {
    throw('Need a name for serialising the mapping.');
  }

  # Create a new MappingList; AUTO_LOAD asks it to load an existing
  # serialised mapping if one is found.
  my $dump_dir = path_append($self->conf->param('basedir'), 'mapping');

  my $mapping_list = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH  => $dump_dir,
    -CACHE_FILE => "${mapping_name}.ser",
    -AUTO_LOAD  => 1,
  );

  # checkpoint test: reuse a previously stored MappingList
  if ($mapping_list->loaded) {
    $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n");
    return $mapping_list;
  }

  my $mapped_sources = {};
  my $mapped_targets = {};

  # work through the scoring matrix entries, best score first
  my @queue = sort { $b->score <=> $a->score } @{ $matrix->get_all_Entries };

  while (my $entry = shift @queue) {

    # we already found a mapping for either source or target
    next if $mapped_sources->{$entry->source};
    next if $mapped_targets->{$entry->target};

    # there's a better mapping for either source or target
    next if $self->higher_score_exists($entry, $matrix, $mapped_sources,
      $mapped_targets);

    # An ambiguous entry is only skipped while at least one of its
    # competitors is still unmapped; skipped entries are dealt with later.
    my $rival_sources = [];
    my $rival_targets = [];

    if ($self->ambiguous_mapping($entry, $matrix, $rival_sources,
      $rival_targets)) {

      $rival_sources = $self->filter_sources($rival_sources, $mapped_sources);
      $rival_targets = $self->filter_targets($rival_targets, $mapped_targets);

      next if (@$rival_sources or @$rival_targets);
    }

    # this is the best mapping, add it
    $mapping_list->add_Entry($entry);
    $mapped_sources->{$entry->source} = 1;
    $mapped_targets->{$entry->target} = 1;
  }

  # create checkpoint
  $mapping_list->write_to_file;

  return $mapping_list;
}
#
# Check whether the source or the target of an entry still has a strictly
# better-scoring partner available.
#
# Arguments: $entry        - the matrix entry under consideration
#            $matrix       - the scoring matrix the entry comes from
#            $sources_done - hashref of sources which are already mapped
#            $targets_done - hashref of targets which are already mapped
# Returns:   1 if an unmapped alternative with a higher score exists for the
#            entry's target or source, 0 otherwise
#
sub higher_score_exists {
  my ($self, $entry, $matrix, $sources_done, $targets_done) = @_;

  my $source = $entry->source;
  my $target = $entry->target;
  my $score  = $entry->score;

  # other sources competing for this entry's target
  for my $rival (@{ $matrix->get_sources_for_target($target) }) {
    next if $rival == $source;
    next if $sources_done->{$rival};
    return 1 if $score < $matrix->get_score($rival, $target);
  }

  # other targets competing for this entry's source
  for my $rival (@{ $matrix->get_targets_for_source($source) }) {
    next if $rival == $target;
    next if $targets_done->{$rival};
    return 1 if $score < $matrix->get_score($source, $rival);
  }

  return 0;
}
#
# Find ambiguous mappings (see scores_similar() for the definition).
#
# A competitor is any other source for the entry's target, or any other
# target for the entry's source, whose score is similar to or higher than the
# entry's own score. Competitors are appended to the caller's arrays.
#
# Arguments: $entry         - the matrix entry under consideration
#            $matrix        - the scoring matrix the entry comes from
#            $other_sources - arrayref; competing sources are pushed onto it
#            $other_targets - arrayref; competing targets are pushed onto it
# Returns:   1 if at least one competitor was found, 0 otherwise
#
sub ambiguous_mapping {
  my ($self, $entry, $matrix, $other_sources, $other_targets) = @_;

  my $source = $entry->source;
  my $target = $entry->target;
  my $score  = $entry->score;

  my $ambiguous = 0;

  # competing sources for the same target
  for my $rival (@{ $matrix->get_sources_for_target($target) }) {
    my $rival_score = $matrix->get_score($rival, $target);

    next if $rival == $source;
    next unless ($self->scores_similar($score, $rival_score)
      or $score < $rival_score);

    push @$other_sources, $rival;
    $ambiguous = 1;
  }

  # competing targets for the same source
  for my $rival (@{ $matrix->get_targets_for_source($source) }) {
    my $rival_score = $matrix->get_score($source, $rival);

    next if $rival == $target;
    next unless ($self->scores_similar($score, $rival_score)
      or $score < $rival_score);

    push @$other_targets, $rival;
    $ambiguous = 1;
  }

  return $ambiguous;
}
#
# Decide whether two scores are similar (rule taken from the java code).
#
# Two scores are similar if their relative difference,
# 2 * |s1 - s2| / (s1 + s2), is below SIMILAR_SCORE_RATIO. An exact match
# (s1 == 1) is never considered similar to a lower score.
#
# Arguments: $s1, $s2 - the scores to compare
# Returns:   true if the scores are similar, false otherwise
#
sub scores_similar {
  my ($self, $s1, $s2) = @_;

  # always give priority to exact matches over very similar ones
  return 0 if ($s1 == 1 and $s2 < 1);

  my $diff = abs($s1 - $s2);
  my $sum  = $s1 + $s2;

  # The relative difference is undefined when the scores add up to zero, e.g.
  # when both are 0 (undefined scores count as 0 in numeric context);
  # dividing would die with "Illegal division by zero". Identical scores are
  # similar, anything else is not.
  return ($diff == 0) if ($sum == 0);

  return (2 * $diff / $sum < SIMILAR_SCORE_RATIO);
}
#
# Remove sources which are already mapped from a list of sources.
#
# Arguments: $other_sources - arrayref of sources
#            $sources_done  - hashref of sources which are already mapped
# Returns:   $other_sources itself if it is empty or nothing is mapped yet,
#            otherwise a new arrayref holding only the unmapped sources
#
sub filter_sources {
  my ($self, $other_sources, $sources_done) = @_;

  # nothing to filter, or nothing to filter by
  if (!@$other_sources or !keys %$sources_done) {
    return $other_sources;
  }

  return [ grep { !$sources_done->{$_} } @$other_sources ];
}
#
# Remove targets which are already mapped from a list of targets.
#
# Arguments: $other_targets - arrayref of targets
#            $targets_done  - hashref of targets which are already mapped
# Returns:   $other_targets itself if it is empty or nothing is mapped yet,
#            otherwise a new arrayref holding only the unmapped targets
#
sub filter_targets {
  my ($self, $other_targets, $targets_done) = @_;

  # nothing to filter, or nothing to filter by
  if (!@$other_targets or !keys %$targets_done) {
    return $other_targets;
  }

  return [ grep { !$targets_done->{$_} } @$other_targets ];
}
1
;
modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblExonGeneric.pm
0 → 100644
View file @
ce19e680
package
Bio::EnsEMBL::IdMapping::InternalIdMapper::
EnsemblExonGeneric
;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use
strict
;
use
warnings
;
no
warnings
'
uninitialized
';
use
Bio::EnsEMBL::IdMapping::InternalIdMapper::
BaseMapper
;
our
@ISA
=
qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper)
;
use
Bio::EnsEMBL::Utils::
Exception
qw(throw warning)
;
#
# Basic exon mapping.
#
# Runs basic_mapping() on the exon scoring matrix and then asks $esb for a
# shrinked matrix based on the resulting mappings.
#
# Arguments: $num         - suffix for the serialisation names; mappings are
#                           stored as "exon_mappings$num", the new matrix as
#                           "exon_matrix" plus the incremented $num
#            $esb         - object providing create_shrinked_matrix()
#                           (presumably the exon score builder - confirm
#                           against the caller)
#            $mappings    - not read; replaced by the new mappings
#            $exon_scores - the exon scoring matrix
# Returns:   list of (new scoring matrix, new mappings)
#
sub init_basic {
  my ($self, $num, $esb, $mappings, $exon_scores) = @_;

  $self->logger->info("Basic exon mapping...\n", 0, 'stamped');

  my $mapped = $self->basic_mapping($exon_scores, 'exon_mappings' . $num);

  my $shrunk = $esb->create_shrinked_matrix($exon_scores, $mapped,
    'exon_matrix' . ++$num);

  return ($shrunk, $mapped);
}
#
# Reduce the score for mappings of exons which do not belong to mapped
# transcripts, then map again.
#
# The rescoring and the write to file are skipped when the scoring matrix
# reports that it was loaded.
#
# Arguments: $num         - suffix for the serialisation names; mappings are
#                           stored as "exon_mappings$num", the new matrix as
#                           "exon_matrix" plus the incremented $num
#            $esb         - object providing non_mapped_transcript_rescore()
#                           and create_shrinked_matrix() (presumably the exon
#                           score builder - confirm against the caller)
#            $mappings    - mappings handed to the rescoring step
#            $exon_scores - the exon scoring matrix
# Returns:   list of (new scoring matrix, new mappings)
#
sub mapped_transcript {
  my ($self, $num, $esb, $mappings, $exon_scores) = @_;

  $self->logger->info("Exons in mapped transcript...\n", 0, 'stamped');

  if (!$exon_scores->loaded) {
    $esb->non_mapped_transcript_rescore($exon_scores, $mappings);
    $exon_scores->write_to_file;
  }

  my $mapped = $self->basic_mapping($exon_scores, 'exon_mappings' . $num);

  my $shrunk = $esb->create_shrinked_matrix($exon_scores, $mapped,
    'exon_matrix' . ++$num);

  return ($shrunk, $mapped);
}
1
;
modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblGeneGeneric.pm
0 → 100644
View file @
ce19e680
package
Bio::EnsEMBL::IdMapping::InternalIdMapper::
EnsemblGeneGeneric
;
=head1 NAME
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric - default Ensembl
InternalIdMapper implementation for genes
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use
strict
;
use
warnings
;
no
warnings
'
uninitialized
';
use
Bio::EnsEMBL::IdMapping::InternalIdMapper::
BaseMapper
;
our
@ISA
=
qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper)
;
use
Bio::EnsEMBL::Utils::
Exception
qw(throw warning)
;
use
Bio::EnsEMBL::Utils::
ScriptUtils
qw(path_append)
;
#
# Basic gene mapping.
#
# Runs basic_mapping() on the gene scoring matrix and then asks $gsb for a
# shrinked matrix based on the resulting mappings.
#
# Arguments: $num         - suffix for the serialisation names; mappings are
#                           stored as "gene_mappings$num", the new matrix as
#                           "gene_matrix" plus the incremented $num
#            $gsb         - object providing create_shrinked_matrix()
#                           (presumably the gene score builder - confirm
#                           against the caller)
#            $mappings    - not read; replaced by the new mappings
#            $gene_scores - the gene scoring matrix
# Returns:   list of (new scoring matrix, new mappings)
#
sub init_basic {
  my ($self, $num, $gsb, $mappings, $gene_scores) = @_;

  $self->logger->info("Basic gene mapping...\n", 0, 'stamped');

  my $mapped = $self->basic_mapping($gene_scores, 'gene_mappings' . $num);

  my $shrunk = $gsb->create_shrinked_matrix($gene_scores, $mapped,
    'gene_matrix' . ++$num);

  return ($shrunk, $mapped);
}
#
# Build the synteny from unambiguous mappings, use it to rescore the genes
# and map again.
#
# The synteny framework is only built, and the scoring matrix only rescored
# and written to file, when the matrix does not report that it was loaded.
#
# Arguments: $num         - suffix for the serialisation names; mappings are
#                           stored as "gene_mappings$num", the new matrix as
#                           "gene_matrix" plus the incremented $num
#            $gsb         - object providing create_shrinked_matrix()
#                           (presumably the gene score builder - confirm
#                           against the caller)
#            $mappings    - mappings used to build the synteny framework
#            $gene_scores - the gene scoring matrix
# Returns:   list of (new scoring matrix, new mappings)
#
sub synteny {
  my ($self, $num, $gsb, $mappings, $gene_scores) = @_;

  unless ($gene_scores->loaded) {
    $self->logger->info("Synteny Framework building...\n", 0, 'stamped');

    # This module does not load SyntenyFramework at file level, so load it
    # here instead of relying on the caller having done so. require is a
    # no-op if the class is already loaded.
    require Bio::EnsEMBL::IdMapping::SyntenyFramework;

    my $dump_path = path_append($self->conf->param('basedir'), 'mapping');

    my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new(
      -DUMP_PATH  => $dump_path,
      -CACHE_FILE => 'synteny_framework.ser',
      -LOGGER     => $self->logger,
      -CONF       => $self->conf,
      -CACHE      => $self->cache,
    );

    $sf->build_synteny($mappings);

    # use it to rescore the genes
    $self->logger->info("\nSynteny assisted mapping...\n", 0, 'stamped');
    $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores);

    # checkpoint
    $gene_scores->write_to_file;
  }

  my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num");
  $num++;
  my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $new_mappings,
    "gene_matrix$num");

  return ($new_scores, $new_mappings);
}
#
# Rescore with the simple scoring function and try again.
#
# The rescoring and the write to file are skipped when the scoring matrix
# reports that it was loaded.
#
# Arguments: $num               - suffix for the serialisation names; mappings
#                                 are stored as "gene_mappings$num", the new
#                                 matrix as "gene_matrix" plus the
#                                 incremented $num
#            $gsb               - object providing simple_gene_rescore() and
#                                 create_shrinked_matrix() (presumably the
#                                 gene score builder - confirm against the
#                                 caller)
#            $mappings          - not used by this method
#            $gene_scores       - the gene scoring matrix
#            $transcript_scores - transcript scoring matrix handed to the
#                                 rescoring step
# Returns:   list of (new scoring matrix, new mappings)
#
sub best_transcript {
  my ($self, $num, $gsb, $mappings, $gene_scores, $transcript_scores) = @_;

  $self->logger->info("Retry with simple best transcript score...\n", 0,
    'stamped');

  if (!$gene_scores->loaded) {
    $gsb->simple_gene_rescore($gene_scores, $transcript_scores);
    $gene_scores->write_to_file;
  }

  my $mapped = $self->basic_mapping($gene_scores, 'gene_mappings' . $num);

  my $shrunk = $gsb->create_shrinked_matrix($gene_scores, $mapped,
    'gene_matrix' . ++$num);

  return ($shrunk, $mapped);
}
#
# Rescore by penalising scores between genes with different biotypes, then
# map again.
#
# The rescoring and the write to file are skipped when the scoring matrix
# reports that it was loaded.
#
# Arguments: $num         - suffix for the serialisation names; mappings are
#                           stored as "gene_mappings$num", the new matrix as
#                           "gene_matrix" plus the incremented $num
#            $gsb         - object providing biotype_gene_rescore() and
#                           create_shrinked_matrix() (presumably the gene
#                           score builder - confirm against the caller)
#            $mappings    - not used by this method
#            $gene_scores - the gene scoring matrix
# Returns:   list of (new scoring matrix, new mappings)
#
sub biotype {
  my ($self, $num, $gsb, $mappings, $gene_scores) = @_;

  $self->logger->info("Retry with biotype disambiguation...\n", 0, 'stamped');

  if (!$gene_scores->loaded) {
    $gsb->biotype_gene_rescore($gene_scores);
    $gene_scores->write_to_file;
  }

  my $mapped = $self->basic_mapping($gene_scores, 'gene_mappings' . $num);

  my $shrunk = $gsb->create_shrinked_matrix($gene_scores, $mapped,
    'gene_matrix' . ++$num);

  return ($shrunk, $mapped);
}
#
# Selectively rescore by penalising scores between genes with different
# internalIDs, then map again.
#
# The rescoring and the write to file are skipped when the scoring matrix
# reports that it was loaded.
#
# Arguments: $num         - suffix for the serialisation names; mappings are
#                           stored as "gene_mappings$num", the new matrix as
#                           "gene_matrix" plus the incremented $num
#            $gsb         - object providing internal_id_rescore() and
#                           create_shrinked_matrix() (presumably the gene
#                           score builder - confirm against the caller)
#            $mappings    - not used by this method
#            $gene_scores - the gene scoring matrix
# Returns:   list of (new scoring matrix, new mappings)
#
sub internal_id {
  my ($self, $num, $gsb, $mappings, $gene_scores) = @_;

  $self->logger->info("Retry with internalID disambiguation...\n", 0,
    'stamped');

  if (!$gene_scores->loaded) {
    $gsb->internal_id_rescore($gene_scores);
    $gene_scores->write_to_file;
  }

  my $mapped = $self->basic_mapping($gene_scores, 'gene_mappings' . $num);

  my $shrunk = $gsb->create_shrinked_matrix($gene_scores, $mapped,
    'gene_matrix' . ++$num);

  return ($shrunk, $mapped);
}