Skip to content
Snippets Groups Projects
Commit f098d487 authored by Patrick Meidl's avatar Patrick Meidl
Browse files

first (draft) version of new modules for ID mapping

parent b300d2ed
No related branches found
No related tags found
No related merge requests found
package Bio::EnsEMBL::IdMapping::Entry;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
# Constructor: returns a new, empty entry backed by an array reference.
# May be called on a class name or on an existing instance, in which case
# the instance's class is reused.
sub new {
  my ($caller) = @_;
  my $class = ref($caller) || $caller;
  return bless([], $class);
}
# Fast constructor: blesses the given array reference into the class as-is
# (no copy, no validation) and returns it.
sub new_fast {
  my ($class, $array_ref) = @_;
  return bless($array_ref, $class);
}
# Getter/setter for the source value (array slot 0).
# With an argument the slot is set first; the current value is returned.
sub source {
  my ($self, @value) = @_;
  if (@value) {
    $self->[0] = $value[0];
  }
  return $self->[0];
}
# Getter/setter for the target value (array slot 1).
# With an argument the slot is set first; the current value is returned.
sub target {
  my ($self, @value) = @_;
  if (@value) {
    $self->[1] = $value[0];
  }
  return $self->[1];
}
# Getter/setter for the score value (array slot 2).
# With an argument the slot is set first; the current value is returned.
sub score {
  my ($self, @value) = @_;
  if (@value) {
    $self->[2] = $value[0];
  }
  return $self->[2];
}
1;
package Bio::EnsEMBL::IdMapping::ExonScoreBuilder;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use Bio::EnsEMBL::IdMapping::ScoreBuilder;
our @ISA = qw(Bio::EnsEMBL::IdMapping::ScoreBuilder);
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes);
use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;
# Build the exon scoring matrix, or load it from its cache file.
#
# If a non-empty cache file exists at the matrix's cache_file location, the
# matrix is read from it. Otherwise the matrix is built (overlap scores when
# the cache reports a highest common coord_system, plus the still unfinished
# exonerate step for the dumped exons) and written to the cache file.
#
# Returns the Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.
sub score_exons {
  my $self = shift;

  # conf() is a plain getter/setter: calling conf('dumppath') would replace
  # the stored configuration object with the string 'dumppath'. The
  # parameter therefore has to be looked up on the configuration object.
  my $matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
    -DUMP_PATH => $self->conf->param('dumppath')
  );

  my $cache_file = $matrix->cache_file;

  if (-s $cache_file) {

    # read from file
    $self->logger->log_stamped("Reading exon scoring matrix from file...\n");
    $self->logger->log("Cache file $cache_file.\n", 1);

    # read_from_file returns the loaded matrix; keep that object rather
    # than the empty one created above.
    $matrix = $matrix->read_from_file;

    $self->logger->log_stamped("Done.\n");

  } else {

    #
    # build scoring matrix
    #

    # direct mapping (by overlap, if common coord_system exists)
    if ($self->cache->highest_common_cs) {
      $matrix = $self->build_overlap_scores($matrix);
    }

    # map the remaining exons using exonerate
    # dump exons to fasta files
    my $dump_size = $self->cache->dump_filtered_exons;

    if ($dump_size) {
      # TODO: run exonerate and assign its result to $exonerate_matrix
      my $exonerate_matrix;

      # merge matrices; merge() throws on an undefined argument, so only
      # merge once the exonerate step actually produced a matrix
      $matrix->merge($exonerate_matrix) if ($exonerate_matrix);
    }

    #
    # write scoring matrix to file
    #
    $matrix->write_to_file;
  }

  return $matrix;
}
# Add overlap-based scores to a scoring matrix (unfinished draft).
#
# Arg[1]     : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix
# Returns    : the matrix that was passed in (callers assign the result)
# Exceptions : thrown if $matrix is missing or of the wrong type
sub build_overlap_scores {
  my $self = shift;
  my $matrix = shift;

  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  # sorted location lists for source and target exons; sort_exons expects
  # an array reference, so the list of hash values is wrapped in [ ... ]
  my @s_exons = $self->sort_exons(
    [values %{ $self->cache->get_by_name('exons_by_id', 'source') }]
  );
  my @t_exons = $self->sort_exons(
    [values %{ $self->cache->get_by_name('exons_by_id', 'target') }]
  );

  # TODO: compare @s_exons and @t_exons and add the scores to $matrix

  # hand the matrix back explicitly; without this the caller would receive
  # the value of the last assignment instead of a matrix
  return $matrix;
}
#
# Return a list of exons, sorted by seq_region_name, then location (where
# location is either start-1 or end, so each exon is in the list twice).
#
# Arg[1]  : Arrayref of exon objects providing common_name, common_start
#           and common_end
# Returns : List of exons (not a reference)
#
# TODO: this implementation isn't good enough, since you'll need location later
# to compare source and target exons again. best add this as an instance
# variable to the TinyExon returned
#
sub sort_exons {
  my $self = shift;
  my $exons = shift;

  # Each map is parenthesised on its own. Written as
  # "map {...} @$exons, map {...} @$exons" the first map would take the
  # output of the second map as part of its input list.
  my @keyed = (
    (map { [$_->common_name, $_->common_start - 1, $_] } @{ $exons || [] }),
    (map { [$_->common_name, $_->common_end, $_] } @{ $exons || [] }),
  );

  return
    map { $_->[2] }
    sort { $a->[0] cmp $b->[0] || $a->[1] <=> $b->[1] }
    @keyed;
}
# Compare two exons (not implemented yet).
#
# NOTE(review): the draft broke off in the middle of the return expression
# ("return ( $e1->"), a syntax error that kept the whole package from
# compiling. The intended comparison cannot be recovered from this file, so
# the method throws until it is written.
sub compare_exons {
  my $self = shift;
  my $e1 = shift;
  my $e2 = shift;

  throw('compare_exons() is not implemented yet.');
}
package Bio::EnsEMBL::IdMapping::ScoreBuilder;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
=head2 new

  Arg [LOGGER] : Bio::EnsEMBL::Utils::Logger $logger
  Arg [CONF]   : Bio::EnsEMBL::Utils::ConfParser $conf
  Arg [CACHE]  : Bio::EnsEMBL::IdMapping::Cache $cache
  Example      : my $builder = Bio::EnsEMBL::IdMapping::ScoreBuilder->new(
                   -LOGGER => $logger,
                   -CONF   => $conf,
                   -CACHE  => $cache,
                 );
  Description  : constructor
  Return type  : object of the invoking class
  Exceptions   : thrown if any argument is missing or of the wrong type
  Caller       : general

=cut

sub new {
  my $caller = shift;
  my $class = ref($caller) || $caller;

  # This package does not import rearrange(), so load the module on demand
  # and call the function by its full name.
  require Bio::EnsEMBL::Utils::Argument;
  my ($logger, $conf, $cache) = Bio::EnsEMBL::Utils::Argument::rearrange(
    ['LOGGER', 'CONF', 'CACHE'], @_);

  # The isa() calls are wrapped in eval so that a missing or unblessed
  # argument produces the intended exception instead of "Can't call method".
  unless (eval { $logger->isa('Bio::EnsEMBL::Utils::Logger') }) {
    throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
  }

  unless (eval { $conf->isa('Bio::EnsEMBL::Utils::ConfParser') }) {
    throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
  }

  unless (eval { $cache->isa('Bio::EnsEMBL::IdMapping::Cache') }) {
    throw("You must provide a cache as a Bio::EnsEMBL::IdMapping::Cache object.");
  }

  my $self = {};
  bless ($self, $class);

  # initialise
  $self->logger($logger);
  $self->conf($conf);
  $self->cache($cache);

  return $self;
}
# Getter/setter for the logger stored under the '_logger' key.
# With an argument the value is stored first; the current value is returned.
sub logger {
  my ($self, @value) = @_;
  if (@value) {
    $self->{'_logger'} = $value[0];
  }
  return $self->{'_logger'};
}
# Getter/setter for the configuration stored under the '_conf' key.
# Note that any argument is treated as a new value to store, not as the
# name of a parameter to look up.
sub conf {
  my ($self, @value) = @_;
  if (@value) {
    $self->{'_conf'} = $value[0];
  }
  return $self->{'_conf'};
}
# Getter/setter for the cache stored under the '_cache' key.
# With an argument the value is stored first; the current value is returned.
sub cache {
  my ($self, @value) = @_;
  if (@value) {
    $self->{'_cache'} = $value[0];
  }
  return $self->{'_cache'};
}
1;
package Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes);
use Bio::EnsEMBL::IdMapping::Entry;
# Constructor.
#
# Arg [DUMP_PATH] : path used when reading or writing the matrix cache file
# Returns         : new matrix object with empty 'matrix', 'source_list' and
#                   'target_list' hashes
# Exceptions      : thrown if no dump path is given
sub new {
  my ($caller, @args) = @_;
  my $class = ref($caller) || $caller;

  my ($dump_path) = rearrange(['DUMP_PATH'], @args);
  if (!$dump_path) {
    throw("You must provide a dump path");
  }

  # internal datastructure
  my $self = bless({}, $class);
  $self->{'matrix'} = {};
  $self->{'source_list'} = {};
  $self->{'target_list'} = {};
  $self->{'dump_path'} = $dump_path;

  return $self;
}
# Add a Bio::EnsEMBL::IdMapping::Entry to the matrix.
# Delegates to add_score() with the entry's source, target and score and
# returns whatever add_score() returns. Throws if the argument is missing
# or is not an Entry.
sub add_Entry {
  my ($self, $entry) = @_;

  if (!$entry or !$entry->isa('Bio::EnsEMBL::IdMapping::Entry')) {
    throw("Need a Bio::EnsEMBL::IdMapping::Entry");
  }

  my @triple = ($entry->source, $entry->target, $entry->score);
  return $self->add_score(@triple);
}
# Placeholder: removing entries is not implemented yet.
# Does nothing and returns nothing (undef in scalar context).
sub remove_Entry {
  return;
}
# Store a score for a source/target pair.
#
# The score is kept in the 'matrix' hash under the key "source:target". The
# first time a pair is seen, the target is appended to the source's list in
# 'source_list' and the source to the target's list in 'target_list'.
# Re-adding a known pair only overwrites its score.
#
# Returns the stored score.
sub add_score {
  my ($self, $source, $target, $score) = @_;

  my $key = "$source:$target";

  # only new pairs go onto the lists, so they stay free of duplicates
  if (!exists($self->{'matrix'}->{$key})) {
    push @{ $self->{'source_list'}->{$source} }, $target;
    push @{ $self->{'target_list'}->{$target} }, $source;
  }

  return $self->{'matrix'}->{$key} = $score;
}
# Fetch the entry for a source/target pair.
#
# Arg[1]  : source id
# Arg[2]  : target id
# Returns : a Bio::EnsEMBL::IdMapping::Entry built from [source, target,
#           score], or undef if the pair is not in the matrix
sub get_Entry {
  my $self = shift;
  my $source = shift;
  my $target = shift;

  my $key = "$source:$target";

  # exists() on the leaf key only; the closing parenthesis of the condition
  # was missing in the draft, which was a syntax error
  if (exists($self->{'matrix'}->{$key})) {
    return Bio::EnsEMBL::IdMapping::Entry->new_fast(
      [$source, $target, $self->{'matrix'}->{$key}]
    );
  } else {
    return undef;
  }
}
# Fetch the score for a source/target pair.
#
# Arg[1]  : source id
# Arg[2]  : target id
# Returns : the stored score, or undef if the pair is not in the matrix
sub get_score {
  my $self = shift;
  my $source = shift;
  my $target = shift;

  my $key = "$source:$target";

  # the closing parenthesis of the condition was missing in the draft,
  # which was a syntax error
  if (exists($self->{'matrix'}->{$key})) {
    return $self->{'matrix'}->{$key};
  } else {
    return undef;
  }
}
# Return the arrayref of targets recorded for a source in 'source_list',
# or a new empty arrayref if the source is unknown.
sub get_targets_for_source {
  my ($self, $source) = @_;
  my $targets = $self->{'source_list'}->{$source};
  return $targets ? $targets : [];
}
# Return the arrayref of sources recorded for a target in 'target_list',
# or a new empty arrayref if the target is unknown.
sub get_sources_for_target {
  my ($self, $target) = @_;
  my $sources = $self->{'target_list'}->{$target};
  return $sources ? $sources : [];
}
# Return an arrayref holding one Bio::EnsEMBL::IdMapping::Entry per pair in
# the matrix. Each "source:target" key is split on ':' to recover the ids.
# The order follows the hash's key order.
sub get_all_Entries {
  my ($self) = @_;

  my @entries = map {
    my ($source, $target) = split(/:/, $_);
    Bio::EnsEMBL::IdMapping::Entry->new_fast(
      [$source, $target, $self->{'matrix'}->{$_}]
    );
  } keys %{ $self->{'matrix'} };

  return \@entries;
}
# Return an arrayref of all source ids (the keys of 'source_list').
sub get_all_sources {
  # the draft read from $_ instead of the invocant, so it never saw the
  # object's data
  my $self = shift;
  return [keys %{ $self->{'source_list'} }];
}
# Return an arrayref of all target ids (the keys of 'target_list').
sub get_all_targets {
  # the draft read from $_ instead of the invocant, so it never saw the
  # object's data
  my $self = shift;
  return [keys %{ $self->{'target_list'} }];
}
# Return the number of source/target pairs stored in the matrix.
sub get_entry_count {
  # the draft read from $_ instead of the invocant
  my $self = shift;
  return scalar(keys %{ $self->{'matrix'} });
}
# Return the number of distinct sources (keys of 'source_list').
sub get_source_count {
  # the draft read from $_ instead of the invocant
  my $self = shift;
  return scalar(keys %{ $self->{'source_list'} });
}
# Return the number of distinct targets (keys of 'target_list').
sub get_target_count {
  # the draft read from $_ instead of the invocant
  my $self = shift;
  return scalar(keys %{ $self->{'target_list'} });
}
# Return the smallest and largest score in the matrix as [min, max].
# For an empty matrix [undef, undef] is returned.
sub get_min_scores {
  my ($self) = @_;

  my @scores = values %{ $self->{'matrix'} };
  if (!@scores) {
    return [undef, undef];
  }

  # seed both extremes with the first score, then scan all of them
  my ($min, $max) = ($scores[0], $scores[0]);
  for my $score (@scores) {
    if ($min > $score) {
      $min = $score;
    }
    if ($max < $score) {
      $max = $score;
    }
  }

  return [$min, $max];
}
# Return the arithmetic mean of all scores in the matrix, or undef if the
# matrix is empty.
sub get_average_score {
  my ($self) = @_;

  my @scores = values %{ $self->{'matrix'} };
  if (!@scores) {
    return undef;
  }

  my $sum = 0;
  $sum += $_ for @scores;

  return $sum / scalar(@scores);
}
# (An empty write_to_file() placeholder used to stand here. It was removed
# because the working implementation is defined further down in this package
# and two definitions of one sub trigger a "Subroutine redefined" warning.)
# Merge another scoring matrix into this one.
#
# For every pair in the other matrix the score is copied if this matrix has
# no defined score for the pair or a lower one. Pairs that are new to this
# matrix are also recorded in 'source_list' and 'target_list', so the
# per-source and per-target lookups stay in sync with the scores.
#
# Arg[1]     : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix
# Returns    : number of scores added or raised
# Exceptions : thrown if $matrix is missing or of the wrong type
sub merge {
  my $self = shift;
  my $matrix = shift;

  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('You must provide a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix');
  }

  my $c = 0;

  foreach my $key (keys %{ $matrix->{'matrix'} }) {
    my $new_score = $matrix->{'matrix'}->{$key};

    # pair unknown so far: register it on the lookup lists (same
    # bookkeeping that add_score does for new pairs)
    unless (exists($self->{'matrix'}->{$key})) {
      my ($source, $target) = split(/:/, $key, 2);
      push @{ $self->{'source_list'}->{$source} }, $target;
      push @{ $self->{'target_list'}->{$target} }, $source;
    }

    if (!defined($self->{'matrix'}->{$key}) or
        $self->{'matrix'}->{$key} < $new_score) {
      $self->{'matrix'}->{$key} = $new_score;
      $c++;
    }
  }

  return $c;
}
# Serialise this matrix to its cache file.
#
# The dump directory is created first if it does not exist. The object is
# then stored with Storable's nstore at the location given by cache_file.
#
# Returns    : the size of the written file, formatted by parse_bytes
# Exceptions : thrown if the directory cannot be created or the object
#              cannot be stored
sub write_to_file {
  my $self = shift;

  # create dump directory if it doesn't exist
  if (my $dump_path = $self->dump_path) {
    unless (-d $dump_path) {
      # File::Path instead of a shell "mkdir -p": no shell is involved, so
      # spaces or metacharacters in the path are harmless
      require File::Path;
      eval { File::Path::mkpath($dump_path) };
      if ($@ or !-d $dump_path) {
        throw("Unable to create directory $dump_path.\n");
      }
    }
  }

  my $cache_file = $self->cache_file;

  # this package does not import nstore(), so load Storable on demand and
  # call the function by its full name
  require Storable;
  eval { Storable::nstore($self, $cache_file) };
  if ($@) {
    throw("Unable to store $cache_file: $@\n");
  }

  my $size = -s $cache_file;
  return parse_bytes($size);
}
# Load the matrix from its cache file into this object.
#
# The stored object is retrieved with Storable and its 'matrix',
# 'source_list' and 'target_list' are copied into the invocant, so the call
# works whether or not the caller uses the return value. The invocant's own
# dump_path is kept.
#
# Returns    : the invocant, now holding the loaded data
# Exceptions : thrown if the cache file is missing or empty, or cannot be
#              retrieved
sub read_from_file {
  my $self = shift;

  my $cache_file = $self->cache_file;

  unless (-s $cache_file) {
    throw("No valid cache file found at $cache_file.");
  }

  # this package does not import retrieve(), so load Storable on demand and
  # call the function by its full name
  require Storable;
  my $stored = eval { Storable::retrieve($cache_file) };
  if ($@) {
    throw("Unable to retrieve cache: $@");
  }

  # retrieve() can also signal failure by returning undef
  unless (ref($stored) and UNIVERSAL::isa($stored, 'HASH')) {
    throw("Unable to retrieve cache: no matrix data in $cache_file.");
  }

  # Assigning the retrieved object to $self would only rebind the local
  # variable; copy the data into the invocant instead.
  foreach my $field ('matrix', 'source_list', 'target_list') {
    $self->{$field} = $stored->{$field} || {};
  }

  return $self;
}
# Return the path of the cache file: 'exon_scoring_matrix.ser' inside the
# dump path, or inside the current directory ('.') when no dump path is set.
sub cache_file {
  my ($self) = @_;
  my $dir = $self->dump_path || '.';
  return $dir.'/exon_scoring_matrix.ser';
}
#
# getter/setters
#
# Getter/setter for the dump path stored under the 'dump_path' key.
# With an argument the value is stored first; the current value is returned.
sub dump_path {
  my ($self, @value) = @_;
  if (@value) {
    $self->{'dump_path'} = $value[0];
  }
  return $self->{'dump_path'};
}
1;
package Bio::EnsEMBL::IdMapping::TinyGene;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
# internal data structure (array indices):
#
# 0 dbID
# 1 stable_id
# 2 start
# 3 end
# 4 strand
# 5 seq_region_name
# 6 coord_system_name
# 7 coord_system_version
# 8 seq
# 9 need_project
# 10 common_start
# 11 common_end
# 12 common_strand
# 13 common_sr_name
use strict;
use warnings;
no warnings 'uninitialized';
# The namespace is spelled "IdMapping" everywhere else in this code base;
# "IDMapping" names a module and class that do not exist.
# NOTE(review): this package itself defines the feature accessors (start,
# end, common_start, ...), so its declared name and this inheritance look
# copy-pasted - confirm which package this is meant to be.
use Bio::EnsEMBL::IdMapping::TinyFeature;
our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
# Getter/setter for the start coordinate (array slot 2).
sub start {
  my ($self, @value) = @_;
  $self->[2] = $value[0] if (@value);
  return $self->[2];
}
# Getter/setter for the end coordinate (array slot 3).
sub end {
  my ($self, @value) = @_;
  $self->[3] = $value[0] if (@value);
  return $self->[3];
}
# Getter/setter for the strand (array slot 4).
sub strand {
  my ($self, @value) = @_;
  $self->[4] = $value[0] if (@value);
  return $self->[4];
}
# Getter/setter for the seq_region name (array slot 5).
sub seq_region_name {
  my ($self, @value) = @_;
  $self->[5] = $value[0] if (@value);
  return $self->[5];
}
# Getter/setter for the coord_system name (array slot 6).
sub coord_system_name {
  my ($self, @value) = @_;
  $self->[6] = $value[0] if (@value);
  return $self->[6];
}
# Getter/setter for the coord_system version (array slot 7).
sub coord_system_version {
  my ($self, @value) = @_;
  $self->[7] = $value[0] if (@value);
  return $self->[7];
}
# Getter/setter for the sequence (array slot 8).
sub seq {
  my ($self, @value) = @_;
  $self->[8] = $value[0] if (@value);
  return $self->[8];
}
# Getter/setter for the need_project flag (array slot 9).
sub need_project {
  my ($self, @value) = @_;
  $self->[9] = $value[0] if (@value);
  return $self->[9];
}
# Getter/setter for the start in the common coord_system (array slot 10).
#
# As a setter the value is always stored. As a getter:
#   - if the array has more than 9 elements, slot 10 is returned;
#   - otherwise undef is returned when need_project is true (a common value
#     is expected but missing);
#   - otherwise the native start is returned.
#
# NOTE(review): the length test (> 9) is already true once slot 9
# (need_project) exists, even if slot 10 was never set - confirm that this
# threshold is intended.
sub common_start {
  my $self = shift;

  # when used as a setter, always set a value
  $self->[10] = shift if (@_);

  # when used as a getter (the draft lacked the closing parenthesis of
  # scalar(), which was a syntax error)
  if (scalar(@$self) > 9) {
    # return value for common coord_system if available (but avoid
    # autovivification gotcha!)
    return $self->[10];
  } elsif ($self->need_project) {
    # return undef if common value expected but not there (e.g. no
    # projection found)
    return undef;
  } else {
    # return native value
    return $self->start;
  }
}
# Getter/setter for the end in the common coord_system (array slot 11).
#
# As a setter the value is always stored. As a getter:
#   - if the array has more than 9 elements, slot 11 is returned;
#   - otherwise undef is returned when need_project is true (a common value
#     is expected but missing);
#   - otherwise the native end is returned.
#
# NOTE(review): the length test (> 9) is already true once slot 9
# (need_project) exists, even if slot 11 was never set - confirm that this
# threshold is intended.
sub common_end {
  my $self = shift;

  # when used as a setter, always set a value
  $self->[11] = shift if (@_);

  # when used as a getter (the draft lacked the closing parenthesis of
  # scalar(), which was a syntax error)
  if (scalar(@$self) > 9) {
    # return value for common coord_system if available (but avoid
    # autovivification gotcha!)
    return $self->[11];
  } elsif ($self->need_project) {
    # return undef if common value expected but not there (e.g. no
    # projection found)
    return undef;
  } else {
    # return native value
    return $self->end;
  }
}
# Getter/setter for the strand in the common coord_system (array slot 12).
#
# As a setter the value is always stored. As a getter:
#   - if the array has more than 9 elements, slot 12 is returned;
#   - otherwise undef is returned when need_project is true (a common value
#     is expected but missing);
#   - otherwise the native strand is returned.
#
# NOTE(review): the length test (> 9) is already true once slot 9
# (need_project) exists, even if slot 12 was never set - confirm that this
# threshold is intended.
sub common_strand {
  my $self = shift;

  # when used as a setter, always set a value
  $self->[12] = shift if (@_);

  # when used as a getter (the draft lacked the closing parenthesis of
  # scalar(), which was a syntax error)
  if (scalar(@$self) > 9) {
    # return value for common coord_system if available (but avoid
    # autovivification gotcha!)
    return $self->[12];
  } elsif ($self->need_project) {
    # return undef if common value expected but not there (e.g. no
    # projection found)
    return undef;
  } else {
    # return native value
    return $self->strand;
  }
}
# Getter/setter for the seq_region name in the common coord_system (array
# slot 13).
#
# As a setter the value is always stored. As a getter:
#   - if the array has more than 9 elements, slot 13 is returned;
#   - otherwise undef is returned when need_project is true (a common value
#     is expected but missing);
#   - otherwise the native seq_region_name is returned.
#
# NOTE(review): the length test (> 9) is already true once slot 9
# (need_project) exists, even if slot 13 was never set - confirm that this
# threshold is intended.
sub common_sr_name {
  my $self = shift;

  # when used as a setter, always set a value
  $self->[13] = shift if (@_);

  # when used as a getter (the draft lacked the closing parenthesis of
  # scalar(), which was a syntax error)
  if (scalar(@$self) > 9) {
    # return value for common coord_system if available (but avoid
    # autovivification gotcha!)
    return $self->[13];
  } elsif ($self->need_project) {
    # return undef if common value expected but not there (e.g. no
    # projection found)
    return undef;
  } else {
    # return native value
    return $self->seq_region_name;
  }
}
1;
package Bio::EnsEMBL::IdMapping::TinyGene;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
# The namespace is spelled "IdMapping" everywhere else in this code base;
# "IDMapping" names a module and class that do not exist.
use Bio::EnsEMBL::IdMapping::TinyFeature;
our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
# Append a transcript to this gene's transcript list (array slot 9).
# Throws unless the argument is a Bio::EnsEMBL::IdMapping::TinyTranscript.
# Returns the result of push, i.e. the new length of the list.
#
# NOTE(review): slot 9 is listed as need_project in the feature array layout
# documented elsewhere in this file - confirm the slots do not collide.
sub add_Transcript {
  my ($self, $tr) = @_;

  if (!$tr || !$tr->isa('Bio::EnsEMBL::IdMapping::TinyTranscript')) {
    throw('Need a Bio::EnsEMBL::IdMapping::TinyTranscript.');
  }

  return push(@{ $self->[9] }, $tr);
}
# Return the arrayref of transcripts stored in slot 9, or a new empty
# arrayref if none were added.
sub get_all_Transcripts {
  # the draft read from $_ instead of the invocant, so it never saw the
  # object's data
  my $self = shift;
  return $self->[9] || [];
}
1;
package Bio::EnsEMBL::IdMapping::TinyTranscript;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
# The namespace is spelled "IdMapping" everywhere else in this code base;
# "IDMapping" names a module and class that do not exist.
use Bio::EnsEMBL::IdMapping::TinyFeature;
our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
# Attach a translation to this transcript (array slot 10).
# Throws unless slot 0 of the invocant is 'tr' and slot 0 of the argument
# is 'tl'. Returns the stored translation.
#
# NOTE(review): slot 10 is listed as common_start in the feature array
# layout documented elsewhere in this file - confirm the slots do not
# collide.
sub add_Translation {
  my ($self, $tl) = @_;

  if ($self->[0] ne 'tr' or $tl->[0] ne 'tl') {
    throw('You can only add a translation to a transcript.');
  }

  return $self->[10] = $tl;
}
# Append an exon to this transcript's exon list (array slot 11).
# Throws unless the argument is a Bio::EnsEMBL::IdMapping::TinyExon.
# Returns the result of push, i.e. the new length of the list.
#
# NOTE(review): slot 11 is listed as common_end in the feature array layout
# documented elsewhere in this file - confirm the slots do not collide.
sub add_Exon {
  my ($self, $exon) = @_;

  if (!$exon || !$exon->isa('Bio::EnsEMBL::IdMapping::TinyExon')) {
    throw('Need a Bio::EnsEMBL::IdMapping::TinyExon.');
  }

  return push(@{ $self->[11] }, $exon);
}
# Return the arrayref of exons stored in slot 11, or a new empty arrayref
# if none were added.
sub get_all_Exons {
  # the draft read from $_ instead of the invocant, so it never saw the
  # object's data
  my $self = shift;
  return $self->[11] || [];
}
1;
package Bio::EnsEMBL::IdMapping::TinyGene;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
# The namespace is spelled "IdMapping" everywhere else in this code base;
# "IDMapping" names a module and class that do not exist.
use Bio::EnsEMBL::IdMapping::TinyFeature;
our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment