diff --git a/modules/Bio/EnsEMBL/IdMapping/Archiver.pm b/modules/Bio/EnsEMBL/IdMapping/Archiver.pm index 8cdf7e46098b9eb6ec7461f733049311f0219256..0cb6cc79ddaddcf1a90fd3c3d715922621a3467a 100644 --- a/modules/Bio/EnsEMBL/IdMapping/Archiver.pm +++ b/modules/Bio/EnsEMBL/IdMapping/Archiver.pm @@ -2,15 +2,41 @@ package Bio::EnsEMBL::IdMapping::Archiver; =head1 NAME +Bio::EnsEMBL::IdMapping::Archiver - create gene_archive and peptide_archive =head1 SYNOPSIS +my $archiver = Bio::EnsEMBL::IdMapping::Archiver->new( + -LOGGER => $logger, + -CONF => $conf, + -CACHE => $cache +); + +# create gene and peptide archive +$archiver->create_archive($mapping_session_id); + +# dump existing archive tables to file +my $num_entries = $archiver->dump_table_to_file('source', 'gene_archive', + 'gene_archive_existing.txt', 1); =head1 DESCRIPTION +This module creates the gene_archive and peptide_archive tables. Data is written +to a file as tab-delimited text for loading into a MySQL database (this can be +done manually, or using StableIdmapper->upload_file_into_table()). + +An archive entry for a given source gene is created if no target gene exists, or +if any of its transcripts or their translations changed. Non-coding transcripts +only have an entry in gene_archive (i.e. without a corresponding peptide_archive +entry). =head1 METHODS +create_archive +dump_gene +dump_tuple +dump_nc_row +mapping_session_id =head1 LICENCE @@ -45,6 +71,22 @@ use Digest::MD5 qw(md5_hex); my $pa_id; +=head2 create_archive + + Arg[1] : Int $mapping_session_id - the mapping_session_id for this run + Example : $archiver->create_archive($stable_id_mapper->mapping_session_id); + Description : Creates the gene_archive and peptide_archive tables and writes + the data to a tab-delimited file. The decision as to what to + archive is deferred to dump_gene(), see documentation there for + details. + Return type : none + Exceptions : Thrown on missing argument. 
+ Caller : id_mapping.pl + Status : At Risk + : under development + +=cut + sub create_archive { my $self = shift; my $mapping_session_id = shift; @@ -92,6 +134,25 @@ sub create_archive { } +=head2 dump_gene + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $s_gene - source gene + Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $t_gene - target gene + Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data + Arg[4] : Filehandle $pa_fh - filehandle for writing peptide_archive data + Example : my $target_gene = $gene_mappings{$source_gene->stable_id}; + $archiver->dump_gene($source_gene, $target_gene, $ga_fh, $pa_fh); + Description : Given a source gene, it will write a gene_archive and + peptide_archive entry for it if no target gene exists, or if any + of its transcripts or their translation changed. + Return type : none + Exceptions : none + Caller : create_archive() + Status : At Risk + : under development + +=cut + sub dump_gene { my ($self, $s_gene, $t_gene, $ga_fh, $pa_fh) = @_; @@ -113,7 +174,7 @@ sub dump_gene { if (! $t_gene) { $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh); - # otherwise, only dump if translation of this transcript changed + # otherwise, only dump if any transcript or its translation changed } else { my $changed_flag = 1; @@ -160,6 +221,23 @@ sub dump_gene { } +=head2 dump_tuple + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive + Arg[2] : Bio::EnsEMBL::IdMapping::TinyTranscript $tr - its transcript + Arg[3] : Bio::EnsEMBL::IdMapping::TinyTranslation $tl - its translation + Arg[4] : Filehandle $ga_fh - filehandle for writing gene_archive data + Arg[5] : Filehandle $pa_fh - filehandle for writing peptide_archive data + Example : $archiver->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh); + Description : Writes entry lines for gene_archive and peptide_archive. 
+ Return type : none + Exceptions : none + Caller : dump_gene() + Status : At Risk + : under development + +=cut + sub dump_tuple { my ($self, $gene, $tr, $tl, $ga_fh, $pa_fh) = @_; @@ -188,6 +266,22 @@ sub dump_tuple { } +=head2 dump_nc_row + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive + Arg[2] : Bio::EnsEMBL::IdMapping::TinyTranscript $tr - its transcript + Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data + Example : $archiver->dump_nc_row($s_gene, $s_tr, $ga_fh); + Description : Writes an entry line for gene_archive for non-coding + transcripts. + Return type : none + Exceptions : none + Caller : dump_gene() + Status : At Risk + : under development + +=cut + sub dump_nc_row { my ($self, $gene, $tr, $ga_fh) = @_; @@ -208,6 +302,19 @@ sub dump_nc_row { } +=head2 mapping_session_id + + Arg[1] : (optional) Int - mapping_session_id to set + Example : my $msi = $archiver->mapping_session_id; + Description : Getter/setter for mapping_session_id. + Return type : Int + Exceptions : none + Caller : create_archive() + Status : At Risk + : under development + +=cut + sub mapping_session_id { my $self = shift; $self->{'_mapping_session_id'} = shift if (@_); diff --git a/modules/Bio/EnsEMBL/IdMapping/Cache.pm b/modules/Bio/EnsEMBL/IdMapping/Cache.pm index 8d298aeff5e5e7d67f9c9df9d7086c9e72010346..ba60460e6c94d7ef2650c2f20b93fd897d708306 100644 --- a/modules/Bio/EnsEMBL/IdMapping/Cache.pm +++ b/modules/Bio/EnsEMBL/IdMapping/Cache.pm @@ -1173,11 +1173,7 @@ sub slice_names { } -# -# getters/setters -# - -=head2 +=head2 logger Arg[1] : Example : @@ -1196,7 +1192,7 @@ sub logger { } -=head2 +=head2 conf Arg[1] : Example : diff --git a/modules/Bio/EnsEMBL/IdMapping/Entry.pm b/modules/Bio/EnsEMBL/IdMapping/Entry.pm index b56112314e39f9c6c6f031f2188288961b73f96a..a78be390f699efce13132fb065bc3080b8649a40 100644 --- a/modules/Bio/EnsEMBL/IdMapping/Entry.pm +++ b/modules/Bio/EnsEMBL/IdMapping/Entry.pm @@ -2,15 +2,24 @@ package 
Bio::EnsEMBL::IdMapping::Entry; =head1 NAME +Bio::EnsEMBL::IdMapping::Entry - object representing a ScoredMappingMatrix entry =head1 SYNOPSIS =head1 DESCRIPTION +This object represents a ScoredMappingMatrix entry. It is defined by a pair of a +source and target object's internal Id and a score for this mapping. =head1 METHODS +new +new_fast +source +target +score +to_string =head1 LICENCE @@ -35,6 +44,20 @@ no warnings 'uninitialized'; use Bio::EnsEMBL::Utils::Exception qw(throw warning); +=head2 new + + Example : my $entry = Bio::EnsEMBL::IdMapping::Entry->new(); + Description : Constructor. This is a no-argument constructor, so you need to + populate the object manually. Rarely used since in most cases + new_fast() is preferred. + Return type : a Bio::EnsEMBL::IdMapping::Entry object + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub new { my $caller = shift; my $class = ref($caller) || $caller; @@ -46,6 +69,21 @@ sub new { } +=head2 new_fast + + Arg[1] : Arrayref $array_ref - the arrayref to bless into the Entry + object + Example : my $entry = Bio::EnsEMBL::IdMapping::Entry->new_fast([ + $source_gene->id, $target_gene->id, 0.9]); + Description : Fast constructor. + Return type : a Bio::EnsEMBL::IdMapping::Entry object + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub new_fast { my $class = shift; my $array_ref = shift; @@ -53,6 +91,18 @@ sub new_fast { } +=head2 source + + Arg[1] : (optional) Int - source object's internal Id + Description : Getter/setter for source object's internal Id. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub source { my $self = shift; $self->[0] = shift if (@_); @@ -60,6 +110,18 @@ sub source { } +=head2 target + + Arg[1] : (optional) Int - target object's internal Id + Description : Getter/setter for target object's internal Id. 
+ Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub target { my $self = shift; $self->[1] = shift if (@_); @@ -67,6 +129,19 @@ sub target { } +=head2 score + + Arg[1] : (optional) Float - a score + Description : Getter/setter for score for the mapping between source and + target object. + Return type : Float + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub score { my $self = shift; $self->[2] = shift if (@_); @@ -74,15 +149,22 @@ sub score { } -sub to_string { - my $self = shift; - return sprintf('%-10s%-10s%-5.6f', $self->source, $self->target, $self->score); -} +=head2 to_string + Example : print LOG $entry->to_string, "\n"; + Description : Returns a string representation of the Entry object. Useful for + debugging and logging. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development -sub to_string_compat { +=cut + +sub to_string { my $self = shift; - return join(" ", $self->source, $self->target, $self->score); + return sprintf('%-10s%-10s%-5.6f', $self->source, $self->target, $self->score); } diff --git a/modules/Bio/EnsEMBL/IdMapping/MappingList.pm b/modules/Bio/EnsEMBL/IdMapping/MappingList.pm index 4f9d66cedb2f8bb3ef78a515b5d0a420060d43e4..300a107cd9fd22d37cc0baf18bed466096428c41 100644 --- a/modules/Bio/EnsEMBL/IdMapping/MappingList.pm +++ b/modules/Bio/EnsEMBL/IdMapping/MappingList.pm @@ -2,15 +2,45 @@ package Bio::EnsEMBL::IdMapping::MappingList; =head1 NAME +Bio::EnsEMBL::IdMapping::MappingList - object holding a list of Entries =head1 SYNOPSIS +# create a new MappingList +my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_mappings.ser', +); + +# add entries +$mappings->add_Entry($entry1); +$mappings->add_all($entry2, $entry3); + +# serialise to file +$mappings->write_to_file; + +# later, read these mappings from file +my $mappings1 = 
Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_mappings.ser', +); +$mappings1->read_from_file; =head1 DESCRIPTION +This object represents a list of Bio::EnsEMBL::IdMapping::Entry objects. It's +essentially an OO wrapper for an array with some type checking and convenience +methods. =head1 METHODS +new +add_Entry +get_all_Entries +add_all +get_entry_count +log +to_string =head1 LICENCE @@ -40,6 +70,22 @@ use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); +=head2 new + + Arg[1-N] : see superclass + Example : my $gene_mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_mappings.ser', + ); + Description : Constructor. + Return type : Bio::EnsEMBL::IdMapping::MappingList + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub new { my $caller = shift; my $class = ref($caller) || $caller; @@ -54,6 +100,19 @@ sub new { } +=head2 add_Entry + + Arg[1] : Bio::EnsEMBL::IdMapping::Entry - Entry to add + Example : $mappings->add_Entry($entry); + Description : Adds an Entry to the MappingList. + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + sub add_Entry { my $self = shift; my $entry = shift; @@ -66,12 +125,40 @@ sub add_Entry { } +=head2 get_all_Entries + + Example : foreach my $entry (@{ $mappings->get_all_Entries }) { + # do something with the entry + } + Description : Gets all Entries in the MappingList. 
+ Return type : Arrayref of Bio::EnsEMBL::IdMapping::Entry + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_all_Entries { my $self = shift; return $self->{'cache'}->{'entries'}; } +=head2 add_all + + Arg[1] : List of Bio::EnsEMBL::IdMapping::Entry objects + Example : my @entries = ($entry1, $entry2); + $mappings->add_all(@entries); + Description : Adds a list of Entries to the MappingList. + Return type : none + Exceptions : thrown on wrong argument + Caller : general + Status : At Risk + : under development + +=cut + sub add_all { my $self = shift; my @mappings = @_; @@ -87,12 +174,39 @@ sub add_all { } +=head2 get_entry_count + + Example : my $num_entries = $mappings->get_entry_count; + Description : Returns the number of Entries in the MappingList. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_entry_count { my $self = shift; return scalar(@{ $self->{'cache'}->{'entries'} }); } +=head2 log + + Arg[1] : String $type - object type (e.g. 'gene') + Arg[2] : String $dump_path - path for writing output + Example : $mappings->log('gene', $conf->param('basedir')); + Description : Logs all Entries in the MappingList to a file. Used for + debugging. + Return type : none + Exceptions : thrown on I/O error + Caller : general + Status : At Risk + : under development + +=cut + sub log { my $self = shift; my $type = shift; @@ -112,6 +226,21 @@ sub log { } +=head2 to_string + + Example : print LOG $mappings->to_string, "\n"; + Description : Returns a string representation of the MappingList. This is + simply a multi-line string, where each line is a stringified + Entry. + Useful for debugging and logging. 
+ Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub to_string { my $self = shift; diff --git a/modules/Bio/EnsEMBL/IdMapping/ResultAnalyser.pm b/modules/Bio/EnsEMBL/IdMapping/ResultAnalyser.pm index db83cf8a1793aa01518d1d00bb7ea157e22b6886..b96542d91c6a1e8966c4d66078175c560c9ba628 100644 --- a/modules/Bio/EnsEMBL/IdMapping/ResultAnalyser.pm +++ b/modules/Bio/EnsEMBL/IdMapping/ResultAnalyser.pm @@ -2,15 +2,55 @@ package Bio::EnsEMBL::IdMapping::ResultAnalyser; =head1 NAME +Bio::EnsEMBL::IdMapping::ResultAnalyser - analyse stable Id mapping results =head1 SYNOPSIS +# get a result analyser +my $analyser = Bio::EnsEMBL::IdMapping::ResultAnalyser->new( + -LOGGER => $logger, + -CONF => $conf, + -CACHE => $cache +); + +# analyse results +$analyser->analyse($gene_mappings, + $stable_id_mapper->get_all_stable_id_events('similarity')); + +# write results to file +$analyser->write_results_to_file; + +# create click lists +$analyser->create_clicklist; + +# summary email +$analyser->create_summary_email; =head1 DESCRIPTION +This is a utility module which analyses the stable Id mapping results by +providing various sorts of mapping statistics. It also creates clicklists and a +summary email. 
=head1 METHODS +analyse +analyse_db +classify_source_genes_by_type +classify_genes_by_mapping_simple +classify_genes_by_mapping +add +get +get_all_by_subclass +get_all_by_class +get_count_by_subclass +get_count_by_class +get_all_classes +class_key +write_results_to_file +create_clicklist +create_summary_email +read_from_file =head1 LICENCE @@ -40,6 +80,22 @@ use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); +=head2 analyse + + Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $gene_mappings - the gene + mappings to analyse + Arg[2] : Arrayref of Strings - similarity events + Example : $analyser->analyse($gene_mappings, + $stable_id_mapper->get_all_stable_id_events('similarity')); + Description : Analyses the results of a stable Id mapping run. + Return type : none + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + sub analyse { my $self = shift; my $gene_mappings = shift; @@ -62,14 +118,20 @@ sub analyse { $self->classify_genes_by_mapping($gene_mappings, $similarity_events); } -# -# Analyse stable ID data from two existing dbs. -# This is for potential stand-alone use of this module. -# -# [todo] -sub analyse_db { -} +=head2 classify_source_genes_by_type + + Example : $analyser->classify_source_genes_by_type; + Description : Classifies source genes by type and adds them to the internal + datastructure. For the format of the classification string see + class_key(). 
+ Return type : none + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut sub classify_source_genes_by_type { my $self = shift; @@ -80,7 +142,21 @@ sub classify_source_genes_by_type { } -sub classify_target_genes_by_type { +=head2 classify_genes_by_mapping_simple + + Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $gene_mappings - gene + mappings to classify + Example : $analyser->classify_genes_by_mapping_simple($gene_mappings); + Description : Classifies target genes by mapping ('mapped' or 'unmapped'). + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : This method is not in use at the moment. + Status : At Risk + : under development + +=cut + +sub classify_genes_by_mapping_simple { my $self = shift; my $gene_mappings = shift; @@ -122,13 +198,25 @@ sub classify_target_genes_by_type { } -# -# genes will be classified as: -# - mapped -# - deleted -# - lost_similar -# - lost_definite -# +=head2 classify_genes_by_mapping + + Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $gene_mappings - gene + mappings to classify + Arg[2] : Arrayref of Strings - similarity events + Example : $analyser->classify_genes_by_mapping($gene_mappings, $similarity_events); + Description : Classifies genes by mapping. Status is + 'mapped' => stable Id was mapped + 'lost_similar' => stable Id not mapped, but there is a + similarity entry for the source Id + 'lost_definite' => not mapped and no similarity + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : analyse() + Status : At Risk + : under development + +=cut + sub classify_genes_by_mapping { my $self = shift; my $gene_mappings = shift; @@ -179,13 +267,28 @@ sub classify_genes_by_mapping { } -# -# Add a stable ID / property pair to a name/dbtype lookup hash. -# -# This datastructure is a bloat for some applications, but is general enough to -# be used as a lookup hash and to generate statistics (counts by type) and -# debug lists (dump by type). 
-# +=head2 add + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost') + Arg[4] : String $stable_id - gene stable Id + Arg[5] : String $val - value (usually 0 or 1) + Example : $analyser->add('source', 'KNOWN-ensembl-protein_coding', + 'mapped', 'ENSG00002342', 1); + Description : Add a stable Id / property pair to a name/dbtype lookup hash. + + The datastructure is a bit of a bloat, but is general enough to + be used as a lookup hash and to generate statistics (counts by + type) and debug lists (dump by type). + Return type : String - the added value + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + sub add { my ($self, $dbtype, $class, $subclass, $stable_id, $val) = @_; @@ -198,6 +301,23 @@ sub add { } +=head2 get + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost') + Arg[4] : String $stable_id - gene stable Id + Example : my $mapping_status = $analyser->get('source', + 'KNOWN-ensembl-protein_coding', 'mapped', 'ENSG00002342'); + Description : Gets a stable Id mapping status from the internal datastructure. + Return type : String + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + sub get { my ($self, $dbtype, $class, $subclass, $stable_id) = @_; @@ -207,6 +327,22 @@ sub get { } +=head2 get_all_by_subclass + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 
'mapped', 'lost') + Example : my @mapped_stable_ids = $analyser->get_all_by_subclass('source', + 'KNOWN-ensembl-protein_coding', 'mapped'); + Description : Gets a list of stable Id for a given subclass. + Return type : Arrayref of String (stable Ids) + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + sub get_all_by_subclass { my ($self, $dbtype, $class, $subclass) = @_; @@ -219,6 +355,21 @@ sub get_all_by_subclass { } +=head2 get_all_by_class + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Example : my @stable_ids = $analyser->get_all_by_class('source', + 'KNOWN-ensembl-protein_coding'); + Description : Gets a list of stable Id for a given class. + Return type : Arrayref of String (stable Ids) + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + sub get_all_by_class { my ($self, $dbtype, $class) = @_; @@ -238,6 +389,22 @@ sub get_all_by_class { } +=head2 get_count_by_subclass + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost') + Example : my $num_mapped = $analyser->get_count_by_subclass('source', + 'KNOWN-ensembl-protein_coding', 'mapped'); + Description : Gets the number of stable Ids for a given subclass. 
+ Return type : Int + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + sub get_count_by_subclass { my ($self, $dbtype, $class, $subclass) = @_; @@ -250,6 +417,21 @@ sub get_count_by_subclass { } +=head2 get_count_by_class + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Example : my $num_mapped = $analyser->get_count_by_class('source', + 'KNOWN-ensembl-protein_coding'); + Description : Gets the number of stable Ids for a given class. + Return type : Int + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + sub get_count_by_class { my ($self, $dbtype, $class) = @_; @@ -261,6 +443,21 @@ sub get_count_by_class { } +=head2 get_all_classes + + Arg[1] : String $dbtype - db type ('source' or 'target') + Example : foreach my $class (@{ $analyser->get_all_classes('source') }) { + print "$class\n"; + } + Description : Gets a list of classes in the ResultAnalyser. + Return type : Arrayref of String + Exceptions : thrown on missing argument + Caller : internal + Status : At Risk + : under development + +=cut + sub get_all_classes { my ($self, $dbtype) = @_; @@ -271,12 +468,39 @@ sub get_all_classes { } +=head2 class_key + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - a gene object + Example : my $class = $analyser->class_key($gene); + Description : Generates a key identifying a gene class. This identifier is + composed from the gene's status, logic name, and biotype. + Return type : String + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + sub class_key { my ($self, $gene) = @_; return join('-', map { $gene->$_ } qw(status logic_name biotype)); } +=head2 write_results_to_file + + Example : $analyser->write_results_to_file; + Description : Writes the results of the result analysis to a file. 
This is a + human-readable text detailing the mapping statistics. + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub write_results_to_file { my $self = shift; @@ -320,6 +544,20 @@ sub write_results_to_file { } +=head2 create_clicklist + + Example : $analyser->create_clicklist; + Description : Writes an html file which contains a list of all lost genes, + with hyperlinks to the appropriate archive website. This is to + manually check lost genes. + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub create_clicklist { my $self = shift; @@ -377,6 +615,21 @@ sub create_clicklist { } +=head2 create_summary_email + + Example : $analyser->create_summary_email; + Description : Writes a text file containing a summary of the mapping stats. + This will be emailed to the genebuilder for evaluation (you will + have to manually send the email, using the text in + summary_email.txt as the template). + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub create_summary_email { my $self = shift; @@ -496,6 +749,21 @@ sub create_summary_email { } +=head2 read_from_file + + Arg[1] : String $filename - name of file to read + Arg[2] : (optional) String $append - directory name to append to basedir + Example : my $stats_text = $analyser->read_from_file('gene_mapping_stats', + 'stats'); + Description : Reads mapping stats from a file. 
+ Return type : String + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + sub read_from_file { my $self = shift; my $filename = shift; diff --git a/modules/Bio/EnsEMBL/IdMapping/ScoreBuilder.pm b/modules/Bio/EnsEMBL/IdMapping/ScoreBuilder.pm index e884d99c6cb4be52b646e3893c5e5840116d64d8..015db1a63051f7fbf8c584103c2f4cbfe6555749 100644 --- a/modules/Bio/EnsEMBL/IdMapping/ScoreBuilder.pm +++ b/modules/Bio/EnsEMBL/IdMapping/ScoreBuilder.pm @@ -2,15 +2,23 @@ package Bio::EnsEMBL::IdMapping::ScoreBuilder; =head1 NAME +Bio::EnsEMBL::IdMapping::ScoreBuilder - score builder base class =head1 SYNOPSIS +This class is not instantiated. Please see subclasses for usage examples (e.g. +GeneScoreBuilder). =head1 DESCRIPTION +This is the base class for the score builders used in the stable Id mapping +application. It contains methods which are used by more than one ScoreBuilder. =head1 METHODS +create_shrinked_matrix +internal_id_rescore +log_matrix_stats =head1 LICENCE @@ -41,9 +49,26 @@ use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; -# -# create a shrinked matrix which doesn't contain entries which were already -# mapped +=head2 create_shrinked_matrix + + Arg[1] : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - a scoring + matrix + Arg[2] : Bio::EnsEMBL::IdMapping::MappingList $mappings - mappings + Arg[3] : String $cache_file - base name of a cache file (extension '.ser' + will be added automatically) for the returned matrix + Example : my $new_scores = $score_builder->create_shrinked_matrix( + $gene_scores, $mappings, "gene_matrix1"); + Description : Create a shrinked scoring matrix which doesn't contain entries + which were already mapped. It also logs how many new mappings + were added in this process. 
+ Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix + Exceptions : thrown on wrong or missing arguments + Caller : InternalIdMapper plugin + Status : At Risk + : under development + +=cut + # sub create_shrinked_matrix { my $self = shift; @@ -111,6 +136,27 @@ sub create_shrinked_matrix { } +=head2 internal_id_rescore + + Arg[1] : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - a scoring + matrix + Example : $score_builder->internal_id_rescore($gene_scores); + Description : Rescore ambiguous mappings based on internal Ids. This is the + last disambiguation step and is only useful if objects with the + same internal Id were used in source and target dbs (e.g. in + patch builds or if objects were copied from source to target). + + If a source and target gene have the same internal Id and there + are mappings to other target genes then these *other* mappings + are penalised. + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : InternalIdMapper plugins + Status : At Risk + : under development + +=cut + sub internal_id_rescore { my $self = shift; my $matrix = shift; @@ -158,6 +204,21 @@ sub internal_id_rescore { } +=head2 log_matrix_stats + + Arg[1] : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - a scoring + matrix + Example : $score_builder->log_matrix_stats($gene_scores); + Description : Logs scoring matrix statistics (number of entries, min/max/avg + scores). 
+ Return type : none + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + sub log_matrix_stats { my $self = shift; my $matrix = shift; diff --git a/modules/Bio/EnsEMBL/IdMapping/ScoredMappingMatrix.pm b/modules/Bio/EnsEMBL/IdMapping/ScoredMappingMatrix.pm index fb85effd88191f3f13f6d94016f35c298b44b555..a9f3ba1c6cc32a7a17847f6d46026b8bdc0272d4 100644 --- a/modules/Bio/EnsEMBL/IdMapping/ScoredMappingMatrix.pm +++ b/modules/Bio/EnsEMBL/IdMapping/ScoredMappingMatrix.pm @@ -2,15 +2,68 @@ package Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; =head1 NAME +Bio::EnsEMBL::IdMapping::ScoredMappingMatrix - object holding a list of scored +Entries =head1 SYNOPSIS +# create a new ScoredMappingMatrix +my $gene_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_scores.ser', +); + +# add entries +$gene_scores->add_Entry($entry1); + +# serialise to file +$gene_scores->write_to_file; + +# later, read these gene_scores from file +my $gene_scores1 = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_scores.ser', +); +$gene_scores1->read_from_file; =head1 DESCRIPTION +This object represents a collection of scores between source and target objects. +It holds a list of Bio::EnsEMBL::IdMapping::Entry objects and has methods to +retrieve individual or all Entries, as well as derived data like number of unique +sources or targets, or various counts and averages. + +It is the main collection for dealing with scored relationships in the stable Id +mapping application. 
=head1 METHODS +new +flush +sub_matrix +add_Entry +update_Entry +remove_Entry +add_score +set_score +get_Entry +get_score +get_targets_for_source +get_Entries_for_source +get_sources_for_target +get_Entries_for_target +get_all_Entries +get_all_sources +get_all_targets +get_entry_count +size +get_source_count +get_target_count +get_min_max_scores +get_average_score +merge +log +to_string =head1 LICENCE @@ -42,6 +95,22 @@ use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); use Bio::EnsEMBL::IdMapping::Entry; +=head2 new + + Arg[1-N] : see superclass + Example : my $gene_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_scores.ser', + ); + Description : Constructor. + Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub new { my $caller = shift; my $class = ref($caller) || $caller; @@ -58,6 +127,18 @@ sub new { } +=head2 flush + + Example : $gene_scores->flush; + Description : Flushes (empties) the scoring matrix. + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub flush { my $self = shift; @@ -68,6 +149,24 @@ sub flush { } +=head2 sub_matrix + + Arg[1] : Int $start - start index (inclusive) + Arg[2] : Int $end - end index (inclusive) + Example : # get the first 1000 elements in the matrix + my $sub_matrix = $gene_scores->sub_matrix(1, 1000); + Description : Returns a sub-matrix of the ScoredMappingMatrix. 
The arguments + ($start and $end) specify the position of the first and last + element to return (inclusive, counting starts with element 1, + not 0) + Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub sub_matrix { my $self = shift; my $start = shift; @@ -97,6 +196,19 @@ sub sub_matrix { } +=head2 add_Entry + + Arg[1] : Bio::EnsEMBL::IdMapping::Entry $entry - Entry to add + Example : $gene_scores->add_Entry($entry); + Description : Adds an Entry to the scoring matrix. + Return type : Float - the Entry's score + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + sub add_Entry { my $self = shift; my $entry = shift; @@ -109,16 +221,50 @@ sub add_Entry { } +=head2 update_Entry + + Arg[1] : Bio::EnsEMBL::IdMapping::Entry $entry - Entry to update + Example : $gene_scores->update_Entry($entry); + Description : Updates an Entry (or rather its score) in the scoring matrix. + Actually delegates to add_Entry(), only there as an intuitively + named wrapper. + Return type : Float - the Entry's score + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + sub update_Entry { return $_[0]->add_Entry($_[1]); } +# +# not needed in the current application, so not implemented +# sub remove_Entry { warning('Method ScoredMappingMatrix->remove_Entry not implemented (yet).'); } +=head2 add_score + + Arg[1] : Int $source - source object's internal Id ("dbID") + Arg[2] : Int $target - target object's internal Id ("dbID") + Arg[3] : Float $score - score for source/target pair + Example : $gene_scores->add_score(1234, 5678, 0.997); + Description : Adds a score for a source/target pair to the scoring matrix. + This is a low-level version of add_Entry(). 
+ Return type : Float - the score + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub add_score { my $self = shift; my $source = shift; @@ -135,6 +281,24 @@ sub add_score { } +=head2 set_score + + Arg[1] : Int $source - source object's internal Id ("dbID") + Arg[2] : Int $target - target object's internal Id ("dbID") + Arg[3] : Float $score - score for source/target pair + Example : $gene_scores->set_score(1234, 5678, 0.997); + Description : Sets the score for a source/target pair in the scoring matrix. + This method is similar to add_score, but assumes that the Entry + has been added before, so won't update the sources and target + lists. + Return type : Float - the score + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub set_score { my $self = shift; my $source = shift; @@ -145,6 +309,22 @@ sub set_score { } +=head2 get_Entry + + Arg[1] : Int $source - source object's internal Id ("dbID") + Arg[2] : Int $target - target object's internal Id ("dbID") + Example : my $entry = $gene_scores->get_Entry($source_gene->id, + $target_gene->id); + Description : Gets an Entry from the scoring matrix for a given source and + target object. + Return type : Bio::EnsEMBL::IdMapping::Entry or undef + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_Entry { my $self = shift; my $source = shift; @@ -160,6 +340,22 @@ sub get_Entry { } +=head2 get_score + + Arg[1] : Int $source - source object's internal Id ("dbID") + Arg[2] : Int $target - target object's internal Id ("dbID") + Example : my $score = $gene_scores->get_score($source_gene->id, + $target_gene->id); + Description : Gets the score from the scoring matrix for a given source and + target object. 
+ Return type : Float or undef + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_score { my $self = shift; my $source = shift; @@ -174,6 +370,20 @@ sub get_score { } +=head2 get_targets_for_source + + Arg[1] : Int $source - source object's internal Id ("dbID") + Example : my @targets = @{ $gene_scores->get_targets_for_source(1234) }; + Description : Returns a list of all targets which have a score against a given + source object. + Return type : Arrayref of Int (target objects' internal Ids) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_targets_for_source { my $self = shift; my $source = shift; @@ -182,6 +392,20 @@ sub get_targets_for_source { } +=head2 get_Entries_for_source + + Arg[1] : Int $source - source object's internal Id ("dbID") + Example : my @entries = @{ $gene_scores->get_Entries_for_source(1234) }; + Description : Returns a list of all Entries in the scoring matrix for a given + source object. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::Entry objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_Entries_for_source { my $self = shift; my $source = shift; @@ -191,6 +415,20 @@ sub get_Entries_for_source { } +=head2 get_sources_for_target + + Arg[1] : Int $target - target object's internal Id ("dbID") + Example : my @sources = @{ $gene_scores->get_sources_for_target(5678) }; + Description : Returns a list of all sources which have a score against a given + target object.
+ Return type : Arrayref of Int (source objects' internal Ids) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_sources_for_target { my $self = shift; my $target = shift; @@ -199,6 +437,20 @@ sub get_sources_for_target { } +=head2 get_Entries_for_target + + Arg[1] : Int $target - target object's internal Id ("dbID") + Example : my @entries = @{ $gene_scores->get_Entries_for_target(5678) }; + Description : Returns a list of all Entries in the scoring matrix for a given + target object. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::Entry objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_Entries_for_target { my $self = shift; my $target = shift; @@ -208,6 +460,20 @@ sub get_Entries_for_target { } +=head2 get_all_Entries + + Example : foreach my $entry (@{ $gene_scores->get_all_Entries }) { + # do something with the entry + } + Description : Returns a list of all Entries in the scoring matrix. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::Entry objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_all_Entries { my $self = shift; @@ -224,41 +490,127 @@ sub get_all_Entries { } +=head2 get_all_sources + + Example : my @sources = @{ $gene_scores->get_all_sources }; + Description : Returns a list of all sources in the scoring matrix. + Return type : Arrayref of Int (source objects' internal Ids) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_all_sources { my $self = shift; return [keys %{ $self->{'cache'}->{'source_list'} }]; } +=head2 get_all_targets + + Example : my @targets = @{ $gene_scores->get_all_targets }; + Description : Returns a list of all targets in the scoring matrix.
+ Return type : Arrayref of Int (target objects' internal Ids) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_all_targets { my $self = shift; return [keys %{ $self->{'cache'}->{'target_list'} }]; } +=head2 get_entry_count + + Example : my $num_entries = $gene_scores->get_entry_count; + Description : Returns the number of Entries in the scoring matrix. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_entry_count { my $self = shift; return scalar(keys %{ $self->{'cache'}->{'matrix'} }); } +=head2 size + + Example : my $size = $gene_scores->size; + Description : Returns the size of the scoring matrix. Same value as returned + by get_entry_count(). + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub size { return $_[0]->get_entry_count; } +=head2 get_source_count + + Example : my $num_sources = $gene_scores->get_source_count; + Description : Returns the number of distinct sources in the scoring matrix. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_source_count { my $self = shift; return scalar(keys %{ $self->{'cache'}->{'source_list'} }); } +=head2 get_target_count + + Example : my $num_targets = $gene_scores->get_target_count; + Description : Returns the number of distinct targets in the scoring matrix. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_target_count { my $self = shift; return scalar(keys %{ $self->{'cache'}->{'target_list'} }); } +=head2 get_min_max_scores + + Example : my ($min_score, $max_score) = + @{ $gene_scores->get_min_max_scores }; + Description : Returns the minimum and maximum score in the scoring matrix.
+ Return type : Arrayref of Float [min_score, max_score] + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_min_max_scores { my $self = shift; @@ -279,6 +631,18 @@ sub get_min_max_scores { } +=head2 get_average_score + + Example : my $avg_score = $gene_scores->get_average_score; + Description : Returns the average (mean) score in the matrix. + Return type : Float + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_average_score { my $self = shift; @@ -296,6 +660,22 @@ sub get_average_score { } +=head2 merge + + Arg[1] : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - another + matrix to merge with + Example : my $update_count = $gene_scores->merge($more_gene_scores); + Description : Merges two scoring matrices. If there's an Entry for a + source/target pair in both matrices, the higher score will be + retained. + Return type : Int - number of Entries added or updated + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + sub merge { my $self = shift; my $matrix = shift; @@ -345,6 +725,21 @@ sub merge { } +=head2 log + + Arg[1] : String $type - object type (e.g. 'gene') + Arg[2] : String $dump_path - path for writing output + Example : $gene_scores->log('gene', $conf->param('basedir')); + Description : Logs all Entries in the scoring matrix to a file. Used for + debugging. + Return type : none + Exceptions : thrown on I/O error + Caller : general + Status : At Risk + : under development + +=cut + sub log { my $self = shift; my $type = shift; @@ -364,6 +759,21 @@ sub log { } +=head2 to_string + + Example : print LOG $gene_scores->to_string, "\n"; + Description : Returns a string representation of the scoring matrix. This is + simply a multi-line string, where each line is a stringified + Entry. + Useful for debugging and logging.
+ Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub to_string { my $self = shift; diff --git a/modules/Bio/EnsEMBL/IdMapping/Serialisable.pm b/modules/Bio/EnsEMBL/IdMapping/Serialisable.pm index 221db38ffa35c3e3c205828ec758dba5beaa8c46..f0949db626864cc9dab47f57c03d8bb162400348 100644 --- a/modules/Bio/EnsEMBL/IdMapping/Serialisable.pm +++ b/modules/Bio/EnsEMBL/IdMapping/Serialisable.pm @@ -2,15 +2,49 @@ package Bio::EnsEMBL::IdMapping::Serialisable; =head1 NAME +Bio::EnsEMBL::IdMapping::Serialisable - base class for serialisable objects =head1 SYNOPSIS +# instantiate an object which extends Serialisable +my $object = YourObject->new( + -DUMP_PATH => '/tmp', + -CACHE_FILE => 'object_cache.ser', +); + +# serialise object to file +my $filesize = $object->write_to_file; +print LOG "Serialised object to file of size $filesize.\n"; + +# later, create another object defining the same serialisation location. +# specifying -AUTO_LOAD will automatically load it from the serialisation file. +my $object1 = YourObject->new( + -DUMP_PATH => '/tmp', + -CACHE_FILE => 'object_cache.ser', + -AUTO_LOAD => 1, +); + +# alternatively, manually load the object from file +$object1->read_from_file; =head1 DESCRIPTION +This is the base class for serialisable objects used by the stable Id mapping. +It's essentially an OO wrapper for Storable, providing a method to store +(write_to_file()) and one to retrieve (read_from_file()) serialised objects. + +This class is not instantiated itself, but rather extended by implementing +classes.
=head1 METHODS +new +write_to_file +read_from_file +dump_path +cache_file_name +cache_file +loaded =head1 LICENCE @@ -39,6 +73,21 @@ use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes); use Storable qw(nstore retrieve); +=head2 new + + Arg [DUMP_PATH] : String - path for object serialisation + Arg [CACHE_FILE] : String - filename of serialised object + Arg [AUTO_LOAD] : Boolean - determines whether object should be automatically + loaded on instantiation + Description : Constructor. + Return type : Bio::EnsEMBL::IdMapping::Serialisable implementing object + Exceptions : thrown on missing argument + Caller : implementing subclass + Status : At Risk + : under development + +=cut + sub new { my $caller = shift; my $class = ref($caller) || $caller; @@ -67,6 +116,19 @@ sub new { } +=head2 write_to_file + + Example : my $filesize = $object->write_to_file; + Description : Serialises an object to a file (determined by + $self->cache_file). + Return type : String - size of serialisation file + Exceptions : thrown on I/O errors + Caller : general + Status : At Risk + : under development + +=cut + sub write_to_file { my $self = shift; @@ -90,6 +152,19 @@ sub write_to_file { } +=head2 read_from_file + + Example : $object->read_from_file; + Description : Reads a serialised object from file (determined by + $self->cache_file). + Return type : Bio::EnsEMBL::IdMapping::Serialisable implementing object + Exceptions : thrown on I/O errors + Caller : general + Status : At Risk + : under development + +=cut + sub read_from_file { my $self = shift; @@ -108,9 +183,18 @@ sub read_from_file { } -# -# getter/setters -# +=head2 dump_path + + Arg[1] : String - dump path for serialisation + Example : $object->dump_path('/tmp'); + Description : Getter/setter for the dump path for serialisation. 
+ Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut sub dump_path { my $self = shift; @@ -119,6 +203,19 @@ sub dump_path { } +=head2 cache_file_name + + Arg[1] : String - file name for serialisation + Example : $object->cache_file_name('object_cache.ser'); + Description : Getter/setter for the file name for serialisation. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub cache_file_name { my $self = shift; $self->{'cache_file_name'} = shift if (@_); @@ -126,12 +223,42 @@ sub cache_file_name { } +=head2 cache_file + + Example : my $cache_file = $object->cache_file; + Description : Returns the path and name of the serialised object file. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub cache_file { my $self = shift; return $self->dump_path.'/'.$self->cache_file_name; } +=head2 loaded + + Arg[1] : Boolean - "loaded" status + Example : if ($object->loaded) { + # do something with the object that was loaded from a file + } else { + # the object wasn't loaded but is new, so fill it + } + Description : Indicates whether a given object was loaded from its serialised + state on disk. 
+ Return type : Boolean - TRUE if loaded from disk, FALSE otherwise + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub loaded { my $self = shift; $self->{'loaded'} = shift if (@_); diff --git a/modules/Bio/EnsEMBL/IdMapping/SyntenyFramework.pm b/modules/Bio/EnsEMBL/IdMapping/SyntenyFramework.pm index 7ed7f90777168f82ce3dc04e1144819407f67db9..df58482f0940c9910f3157b49d1ffd4f4a4eb449 100644 --- a/modules/Bio/EnsEMBL/IdMapping/SyntenyFramework.pm +++ b/modules/Bio/EnsEMBL/IdMapping/SyntenyFramework.pm @@ -2,15 +2,48 @@ package Bio::EnsEMBL::IdMapping::SyntenyFramework; =head1 NAME +Bio::EnsEMBL::IdMapping::SyntenyFramework - framework representing syntenic +regions across the genome =head1 SYNOPSIS +# build the SyntenyFramework from unambiguous gene mappings +my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'synteny_framework.ser', + -LOGGER => $self->logger, + -CONF => $self->conf, + -CACHE => $self->cache, +); +$sf->build_synteny($gene_mappings); + +# use it to rescore the genes +$gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores); =head1 DESCRIPTION +The SyntenyFramework is a set of SyntenyRegions. These are pairs of locations +very analogous to the information in the assembly table (the locations don't +have to be the same length though). They are built from genes that map uniquely +between source and target. + +Once built, the SyntenyFramework is used to score source and target gene pairs +to determine whether they are similar. This process is slow (it involves testing +all gene pairs against all SyntenyRegions); this module therefore has built-in +support to run the process in parallel via LSF.
=head1 METHODS +new +build_synteny +_by_overlap +add_SyntenyRegion +get_all_SyntenyRegions +rescore_gene_matrix_lsf +rescore_gene_matrix +logger +conf +cache =head1 REALTED MODULES @@ -48,6 +81,30 @@ use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; use FindBin qw($Bin); FindBin->again; + +=head2 new + + Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object + Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object + Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object + Arg [DUMP_PATH] : String - path for object serialisation + Arg [CACHE_FILE] : String - filename of serialised object + Example : my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'synteny_framework.ser', + -LOGGER => $self->logger, + -CONF => $self->conf, + -CACHE => $self->cache, + ); + Description : Constructor. + Return type : Bio::EnsEMBL::IdMapping::SyntenyFramework + Exceptions : thrown on wrong or missing arguments + Caller : InternalIdMapper plugins + Status : At Risk + : under development + +=cut + sub new { my $caller = shift; my $class = ref($caller) || $caller; @@ -80,6 +137,23 @@ sub new { } +=head2 build_synteny + + Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $mappings - gene mappings + to build the SyntenyFramework from + Example : $synteny_framework->build_synteny($gene_mappings); + Description : Builds the SyntenyFramework from unambiguous gene mappings. + SyntenyRegions are allowed to overlap. At most two overlapping + SyntenyRegions are merged (otherwise we'd get too large + SyntenyRegions with little information content). 
+ Return type : none + Exceptions : thrown on wrong or missing argument + Caller : InternalIdMapper plugins + Status : At Risk + : under development + +=cut + sub build_synteny { my $self = shift; my $mappings = shift; @@ -169,6 +243,9 @@ sub build_synteny { } +# +# sort SyntenyRegions by overlap +# sub _by_overlap { # first sort by seq_region my $retval = ($b->source_seq_region_name cmp $a->source_seq_region_name); @@ -182,16 +259,65 @@ sub _by_overlap { } +=head2 add_SyntenyRegion + + Arg[1] : Bio::EnsEMBL::IdMapping::SyntenyRegion - SyntenyRegion to add + Example : $synteny_framework->add_SyntenyRegion($synteny_region); + Description : Adds a SyntenyRegion to the framework. For speed reasons (and + since this is an internal method), no argument check is done. + Return type : none + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + sub add_SyntenyRegion { push @{ $_[0]->{'cache'} }, $_[1]; } +=head2 get_all_SyntenyRegions + + Example : foreach my $sr (@{ $sf->get_all_SyntenyRegions }) { + # do something with the SyntenyRegion + } + Description : Get a list of all SyntenyRegions in the framework. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::SyntenyRegion + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_all_SyntenyRegions { return $_[0]->{'cache'}; } +=head2 rescore_gene_matrix_lsf + + Arg[1] : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - gene + scores to rescore + Example : my $new_scores = $sf->rescore_gene_matrix_lsf($gene_scores); + Description : This method runs rescore_gene_matrix() (via the + synteny_rescore.pl script) in parallel with lsf, then combines + the results to return a single rescored scoring matrix. + Parallelisation is done by chunking the scoring matrix into + several pieces (determined by the --synteny_rescore_jobs + configuration option).
+ Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix + Exceptions : thrown on wrong or missing argument + thrown on filesystem I/O error + thrown on failure of one or more lsf jobs + Caller : InternalIdMapper plugins + Status : At Risk + : under development + +=cut + sub rescore_gene_matrix_lsf { my $self = shift; my $matrix = shift; @@ -314,8 +440,22 @@ sub rescore_gene_matrix_lsf { # -# retain 70% of old score and build other 30% from synteny match # +=head2 rescore_gene_matrix + + Arg[1] : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - gene + scores to rescore + Example : my $new_scores = $sf->rescore_gene_matrix($gene_scores); + Description : Rescores a gene matrix. Retains 70% of old score and builds + other 30% from the synteny match. + Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix + Exceptions : thrown on wrong or missing argument + Caller : InternalIdMapper plugins + Status : At Risk + : under development + +=cut + sub rescore_gene_matrix { my $self = shift; my $matrix = shift; diff --git a/modules/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm b/modules/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm index 6adb55329267a577dd460d476b0a9230f0d1491b..59cccf4775f2e92364e4a8f6ed28c36f63391718 100644 --- a/modules/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm +++ b/modules/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm @@ -2,15 +2,51 @@ package Bio::EnsEMBL::IdMapping::SyntenyRegion; =head1 NAME +Bio::EnsEMBL::IdMapping::SyntenyRegion - object representing syntenic regions =head1 SYNOPSIS +# create a new SyntenyRegion from a source and a target gene +my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([ + $source_gene->start, + $source_gene->end, + $source_gene->strand, + $source_gene->seq_region_name, + $target_gene->start, + $target_gene->end, + $target_gene->strand, + $target_gene->seq_region_name, + $entry->score, +]); + +# merge with another SyntenyRegion +my $merged_sr = $sr->merge($sr1); + +# score a gene pair against this SyntenyRegion +my $score =
$sr->score_location_relationship($source_gene1, $target_gene1); =head1 DESCRIPTION +This object represents a synteny between a source and a target location. +SyntenyRegions are built from mapped genes, and their score is defined as +the score of the gene mapping. For merged SyntenyRegions, scores are combined. =head1 METHODS +new_fast +source_start +source_end +source_strand +source_seq_region_name +target_start +target_end +target_strand +target_seq_region_name +score +merge +stretch +score_location_relationship +to_string =head1 LICENCE @@ -36,6 +72,22 @@ no warnings 'uninitialized'; use Bio::EnsEMBL::Utils::Exception qw(throw warning); +=head2 new_fast + + Arg[1] : Arrayref $array_ref - the arrayref to bless into the + SyntenyRegion object + Example : my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([ + ]); + Description : Constructor. On instantiation, source and target regions are + reverse complemented so that source is always on forward strand. + Return type : a Bio::EnsEMBL::IdMapping::SyntenyRegion object + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub new_fast { my $class = shift; my $array_ref = shift; @@ -52,6 +104,18 @@ sub new_fast { } +=head2 source_start + + Arg[1] : (optional) Int - source location start coordinate + Description : Getter/setter for source location start coordinate. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub source_start { my $self = shift; $self->[0] = shift if (@_); @@ -59,6 +123,18 @@ sub source_start { } +=head2 source_end + + Arg[1] : (optional) Int - source location end coordinate + Description : Getter/setter for source location end coordinate.
+ Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub source_end { my $self = shift; $self->[1] = shift if (@_); @@ -66,6 +142,18 @@ sub source_end { } +=head2 source_strand + + Arg[1] : (optional) Int - source location strand + Description : Getter/setter for source location strand. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub source_strand { my $self = shift; $self->[2] = shift if (@_); @@ -73,6 +161,18 @@ sub source_strand { } +=head2 source_seq_region_name + + Arg[1] : (optional) String - source location seq_region name + Description : Getter/setter for source location seq_region name. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub source_seq_region_name { my $self = shift; $self->[3] = shift if (@_); @@ -80,6 +180,18 @@ sub source_seq_region_name { } +=head2 target_start + + Arg[1] : (optional) Int - target location start coordinate + Description : Getter/setter for target location start coordinate. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub target_start { my $self = shift; $self->[4] = shift if (@_); @@ -87,6 +199,18 @@ sub target_start { } +=head2 target_end + + Arg[1] : (optional) Int - target location end coordinate + Description : Getter/setter for target location end coordinate. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub target_end { my $self = shift; $self->[5] = shift if (@_); @@ -94,6 +218,18 @@ sub target_end { } +=head2 target_strand + + Arg[1] : (optional) Int - target location strand + Description : Getter/setter for target location strand. 
+ Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub target_strand { my $self = shift; $self->[6] = shift if (@_); @@ -101,6 +237,18 @@ sub target_strand { } +=head2 target_seq_region_name + + Arg[1] : (optional) String - target location seq_region name + Description : Getter/setter for target location seq_region name. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub target_seq_region_name { my $self = shift; $self->[7] = shift if (@_); @@ -108,6 +256,18 @@ sub target_seq_region_name { } +=head2 score + + Arg[1] : (optional) Float - score + Description : Getter/setter for the score between source and target location. + Return type : Float + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub score { my $self = shift; $self->[8] = shift if (@_); @@ -115,6 +275,23 @@ sub score { } +=head2 merge + + Arg[1] : Bio::EnsEMBL::IdMapping::SyntenyRegion $sr - another + SyntenyRegion + Example : $merged_sr = $sr->merge($other_sr); + Description : Merges two overlapping SyntenyRegions if they meet certain + criteria (see documentation in the code for details). Score is + calculated as a combined distance score. If the two + SyntenyRegions aren't mergeable, this method returns undef.
+ Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion or undef + Exceptions : warns on bad scores + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub merge { my ($self, $sr) = @_; @@ -181,9 +358,20 @@ sub merge { return $self; } -# -# extend this SyntenyRegion to span a $factor * $score more area -# + +=head2 stretch + + Arg[1] : Float $factor - stretching factor + Example : $stretched_sr = $sr->stretch(2); + Description : Extends this SyntenyRegion to span a $factor * $score more area. + Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub stretch { my ($self, $factor) = @_; @@ -203,6 +391,41 @@ sub stretch { } +=head2 score_location_relationship + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $source_gene - source gene + Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $target_gene - target gene + Example : my $score = $sr->score_location_relationship($source_gene, + $target_gene); + Description : This function calculates how well the given source location + interpolates on given target location inside this SyntenyRegion. + + Scoring is done the following way: Source and target location + are normalized with respect to this Region's source and target. + Source range will then be somewhere close to 0.0-1.0 and target + range anything around that. + + The extent of the covered area between source and target range + is a measurement of how well they agree (smaller extent is + better). The extent (actually 2*extent) is reduced by the size + of the regions. This will result in 0.0 if they overlap + perfectly and bigger values if they don't. + + This is subtracted from 1.0 to give the score. The score is + likely to be below zero, but is cut off at 0.0f. + + Finally, the score is multiplied with the score of the synteny + itself.
+ Return type : Float + Exceptions : warns if score out of range + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + + + sub score_location_relationship { my ($self, $source_gene, $target_gene) = @_; @@ -269,6 +492,19 @@ sub score_location_relationship { } +=head2 to_string + + Example : print LOG $sr->to_string, "\n"; + Description : Returns a string representation of the SyntenyRegion object. + Useful for debugging and logging. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + sub to_string { my $self = shift; return sprintf("%s:%s-%s:%s %s:%s-%s:%s %.6f", diff --git a/modules/Bio/EnsEMBL/IdMapping/TinyExon.pm b/modules/Bio/EnsEMBL/IdMapping/TinyExon.pm index 443867f97e5529022eecb9c9cecdb4d6e54c80bc..7fc69887267965e0b0aed3981eb84525e26927e4 100644 --- a/modules/Bio/EnsEMBL/IdMapping/TinyExon.pm +++ b/modules/Bio/EnsEMBL/IdMapping/TinyExon.pm @@ -2,15 +2,51 @@ package Bio::EnsEMBL::IdMapping::TinyExon; =head1 NAME +Bio::EnsEMBL::IdMapping::TinyExon - lightweight exon object =head1 SYNOPSIS +# fetch an exon from the db and create a lightweight exon object from it +my $exon = $exon_adaptor->fetch_by_stable_id('ENSE000345437'); +my $lightweight_exon = Bio::EnsEMBL::IdMapping::TinyExon->new_fast([ + $exon->dbID, + $exon->stable_id, + $exon->version, + $exon->created_date, + $exon->modified_date, + $exon->start, + $exon->end, + $exon->strand, + $exon->slice->seq_region_name, + $exon->slice->coord_system_name, + $exon->slice->coord_system->version, + $exon->slice->subseq($exon->start, $exon->end, $exon->strand), + $exon->phase, + $need_project, +]); =head1 DESCRIPTION +This is a lightweight exon object for the stable Id mapping. See the +documentation in TinyFeature for general considerations about its design. 
=head1 METHODS +start +end +strand +seq_region_name +coord_system_name +coord_system_version +seq +phase +need_project +common_start +common_end +common_strand +common_sr_name +length +is_known =head1 LICENCE @@ -31,21 +67,20 @@ Please post comments/questions to the Ensembl development list # internal data structure (array indices): # -# 0 dbID -# 1 stable_id -# 2 start -# 3 end -# 4 strand -# 5 seq_region_name -# 6 coord_system_name -# 7 coord_system_version -# 8 seq -# 9 phase -# 10 need_project -# 11 common_start -# 12 common_end -# 13 common_strand -# 14 common_sr_name +# 0-4 see TinyFeature +# 5 start +# 6 end +# 7 strand +# 8 seq_region_name +# 9 coord_system_name +# 10 coord_system_version +# 11 seq +# 12 phase +# 13 need_project +# 14 common_start +# 15 common_end +# 16 common_strand +# 17 common_sr_name use strict; @@ -58,6 +93,18 @@ our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature); use Bio::EnsEMBL::Utils::Exception qw(throw warning); +=head2 start + + Arg[1] : (optional) Int - the exon's start coordinate + Description : Getter/setter for the exon's start coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub start { my $self = shift; $self->[5] = shift if (@_); @@ -65,6 +112,18 @@ sub start { } +=head2 end + + Arg[1] : (optional) Int - the exon's end coordinate + Description : Getter/setter for the exon's end coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub end { my $self = shift; $self->[6] = shift if (@_); @@ -72,6 +131,18 @@ sub end { } +=head2 strand + + Arg[1] : (optional) Int - the exon's strand + Description : Getter/setter for the exon's strand. 
+ Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub strand { my $self = shift; $self->[7] = shift if (@_); @@ -79,6 +150,19 @@ sub strand { } +=head2 seq_region_name + + Arg[1] : (optional) String - seq_region name + Description : Getter/setter for the seq_region name of the slice the exon is + on. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub seq_region_name { my $self = shift; $self->[8] = shift if (@_); @@ -86,6 +170,19 @@ sub seq_region_name { } +=head2 coord_system_name + + Arg[1] : (optional) String - coord_system name + Description : Getter/setter for the coord_system name of the slice the exon is + on. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub coord_system_name { my $self = shift; $self->[9] = shift if (@_); @@ -93,6 +190,19 @@ sub coord_system_name { } +=head2 coord_system_version + + Arg[1] : (optional) String - coord_system version + Description : Getter/setter for the coord_system version of the slice the + exon is on. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub coord_system_version { my $self = shift; $self->[10] = shift if (@_); @@ -100,6 +210,18 @@ sub coord_system_version { } +=head2 seq + + Arg[1] : (optional) String - the exon's sequence + Description : Getter/setter for the exon's sequence. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub seq { my $self = shift; $self->[11] = shift if (@_); @@ -107,6 +229,18 @@ sub seq { } +=head2 phase + + Arg[1] : (optional) Int - the exon's phase + Description : Getter/setter for the exon's phase. 
+ Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub phase { my $self = shift; $self->[12] = shift if (@_); @@ -114,6 +248,22 @@ sub phase { } +=head2 need_project + + Arg[1] : (optional) Boolean - attribute to set + Description : Getter/setter for the attribute determining whether an exon + needs to be projected onto a common coord_system. You don't need + to do so if the native coord_system is common to the source and + target assemblies, or if no common coord_system is found (the + Cache object has methods to determine this). + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub need_project { my $self = shift; $self->[13] = shift if (@_); @@ -121,6 +271,21 @@ sub need_project { } +=head2 common_start + + Arg[1] : (optional) Int - the exon's start in common coord_system + coordinates + Description : Getter/setter for the exon's start in common coord_system + coordinates. Will return $self->start if no projection to a + common coord_system is required. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub common_start { my $self = shift; @@ -143,6 +308,21 @@ sub common_start { } +=head2 common_end + + Arg[1] : (optional) Int - the exon's end in common coord_system + coordinates + Description : Getter/setter for the exon's end in common coord_system + coordinates. Will return $self->end if no projection to a + common coord_system is required. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub common_end { my $self = shift; @@ -165,6 +345,21 @@ sub common_end { } +=head2 common_strand + + Arg[1] : (optional) Int - the exon's strand in common coord_system + coordinates + Description : Getter/setter for the exon's strand in common coord_system + coordinates. 
Will return $self->strand if no projection to a + common coord_system is required. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub common_strand { my $self = shift; @@ -187,6 +382,22 @@ sub common_strand { } +=head2 common_sr_name + + Arg[1] : (optional) String - seq_region name of the exon's slice on the + common coord_system + Description : Getter/setter for the seq_region name of the exon's slice on the + common coord_system coordinates. Will return + $self->seq_region_name if no projection to a common coord_system + is required. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub common_sr_name { my $self = shift; @@ -209,12 +420,35 @@ sub common_sr_name { } +=head2 length + + Description : Returns the exon length (distance between start and end). + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub length { my $self = shift; return ($self->end - $self->start + 1); } +=head2 is_known + + Description : Determine whether an exon is known. In the context of stable Id + mapping, this is true for all exons. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub is_known { return 1; } diff --git a/modules/Bio/EnsEMBL/IdMapping/TinyFeature.pm b/modules/Bio/EnsEMBL/IdMapping/TinyFeature.pm index 8b083f43591e873dae8a8f19635fed1631fe8a2a..9811cf6656ec3f7dea79f0aefc0337cb39ce61b3 100644 --- a/modules/Bio/EnsEMBL/IdMapping/TinyFeature.pm +++ b/modules/Bio/EnsEMBL/IdMapping/TinyFeature.pm @@ -2,15 +2,37 @@ package Bio::EnsEMBL::IdMapping::TinyFeature; =head1 NAME +Bio::EnsEMBL::IdMapping::TinyFeature - lightweight feature object =head1 SYNOPSIS +This object isn't instantiated. See objects which inherit from it (TinyGene, +TinyTranscript, etc.) for examples. 
=head1 DESCRIPTION +This is the base class for the lightweight feature objects used by the stable Id +mapping application. For performance reasons, these objects are instantiated +using a new_fast() method. The internal implementation is an arrayref (rather +than the more common hashref), which optimises memory usage. + +There are no adaptors to fetch TinyFeatures from the database. You rather use +the normal feature adaptors and then create the TinyFeatures from the heavy +objects you get. The memory saving will therefore mainly take effect when +serialising and reloading these objects. + +Also note that TinyFeatures don't have a slice attached to them - all location +information (where required) is stored on the feature object directly. =head1 METHODS +new_fast +id +stable_id +version +created_date +modified_date +to_string =head1 LICENCE @@ -33,6 +55,9 @@ Please post comments/questions to the Ensembl development list # # 0 dbID # 1 stable_id +# 2 version +# 3 created_date +# 4 modified_date # # other instance variables differ by subclass implementation, so look there. @@ -44,6 +69,18 @@ no warnings 'uninitialized'; use Bio::EnsEMBL::Utils::Exception qw(throw warning); +=head2 new_fast + + Arg[1] : Arrayref $array_ref - the arrayref to bless into the new object + Description : Constructor. + Return type : Bio::EnsEMBL::IdMapping::TinyFeature implementing class + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + sub new_fast { my $class = shift; my $array_ref = shift; @@ -51,6 +88,18 @@ sub new_fast { } +=head2 id + + Arg[1] : (optional) Int - the feature's internal Id ("dbID") + Description : Getter/setter for the feature's internal Id.
+ Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + sub id { my $self = shift; $self->[0] = shift if (@_); @@ -58,6 +107,18 @@ sub id { } +=head2 stable_id + + Arg[1] : (optional) String - the feature's stable Id + Description : Getter/setter for the feature's stable Id. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + sub stable_id { my $self = shift; $self->[1] = shift if (@_); @@ -65,6 +126,18 @@ sub stable_id { } +=head2 version + + Arg[1] : (optional) Int - the feature's stable Id version + Description : Getter/setter for the feature's stable Id version. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + sub version { my $self = shift; $self->[2] = shift if (@_); @@ -72,6 +145,18 @@ sub version { } +=head2 created_date + + Arg[1] : (optional) String - the feature's stable Id creation date + Description : Getter/setter for the feature's stable Id creation date. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + sub created_date { my $self = shift; $self->[3] = shift if (@_); @@ -79,6 +164,18 @@ sub created_date { } +=head2 modified_date + + Arg[1] : (optional) String - the feature's stable Id modification date + Description : Getter/setter for the feature's stable Id modification date. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + sub modified_date { my $self = shift; $self->[4] = shift if (@_); @@ -86,6 +183,19 @@ sub modified_date { } +=head2 to_string + + Example : print LOG "Created ", $f->to_string, "\n"; + Description : Returns a string representation of the feature for debug + purposes.
+ Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub to_string { my $self = shift; return $self->id.':'.$self->stable_id.'.'.$self->version; diff --git a/modules/Bio/EnsEMBL/IdMapping/TinyGene.pm b/modules/Bio/EnsEMBL/IdMapping/TinyGene.pm index 8a91fea1a140011a770b6a5142660982d9e8e336..5e69ffa15925bd30c0d600ca674b69967fd94734 100644 --- a/modules/Bio/EnsEMBL/IdMapping/TinyGene.pm +++ b/modules/Bio/EnsEMBL/IdMapping/TinyGene.pm @@ -2,15 +2,46 @@ package Bio::EnsEMBL::IdMapping::TinyGene; =head1 NAME +Bio::EnsEMBL::IdMapping::TinyGene - lightweight gene object =head1 SYNOPSIS +# fetch a gene from the db and create a lightweight gene object from it +my $gene = $gene_adaptor->fetch_by_stable_id('ENSG000345437'); +my $lightweight_gene = Bio::EnsEMBL::IdMapping::TinyGene->new_fast([ + $gene->dbID, + $gene->stable_id, + $gene->version, + $gene->created_date, + $gene->modified_date, + $gene->start, + $gene->end, + $gene->strand, + $gene->slice->seq_region_name, + $gene->biotype, + $gene->status, + $gene->analysis->logic_name, + ($gene->is_known ? 1 : 0), +]); =head1 DESCRIPTION +This is a lightweight gene object for the stable Id mapping. See the +documentation in TinyFeature for general considerations about its design. 
=head1 METHODS +start +end +strand +seq_region_name +biotype +status +logic_name +is_known +add_Transcript +get_all_Transcripts +length =head1 LICENCE @@ -29,6 +60,20 @@ Please post comments/questions to the Ensembl development list =cut +# internal data structure (array indices): +# +# 0-4 see TinyFeature +# 5 start +# 6 end +# 7 strand +# 8 seq_region_name +# 9 biotype +# 10 status +# 11 logic_name +# 12 is_known +# 13 [transcripts] + + use strict; use warnings; no warnings 'uninitialized'; @@ -39,6 +84,18 @@ our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature); use Bio::EnsEMBL::Utils::Exception qw(throw warning); +=head2 start + + Arg[1] : (optional) Int - the gene's start coordinate + Description : Getter/setter for the gene's start coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub start { my $self = shift; $self->[5] = shift if (@_); @@ -46,6 +103,18 @@ sub start { } +=head2 end + + Arg[1] : (optional) Int - the gene's end coordinate + Description : Getter/setter for the gene's end coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub end { my $self = shift; $self->[6] = shift if (@_); @@ -53,6 +122,18 @@ sub end { } +=head2 strand + + Arg[1] : (optional) Int - the gene's strand + Description : Getter/setter for the gene's strand. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub strand { my $self = shift; $self->[7] = shift if (@_); @@ -60,6 +141,19 @@ sub strand { } +=head2 seq_region_name + + Arg[1] : (optional) String - seq_region name + Description : Getter/setter for the seq_region name of the slice the gene is + on. 
+ Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub seq_region_name { my $self = shift; $self->[8] = shift if (@_); @@ -67,6 +161,18 @@ sub seq_region_name { } +=head2 biotype + + Arg[1] : (optional) String - the gene's biotype + Description : Getter/setter for the gene's biotype. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub biotype { my $self = shift; $self->[9] = shift if (@_); @@ -74,6 +180,18 @@ sub biotype { } +=head2 status + + Arg[1] : (optional) String - the gene's status + Description : Getter/setter for the gene's status. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub status { my $self = shift; $self->[10] = shift if (@_); @@ -81,6 +199,18 @@ sub status { } +=head2 logic_name + + Arg[1] : (optional) String - the gene's analysis' logic_name + Description : Getter/setter for the gene's analysis' logic_name. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub logic_name { my $self = shift; $self->[11] = shift if (@_); @@ -88,6 +218,18 @@ sub logic_name { } +=head2 is_known + + Arg[1] : (optional) Boolean - the gene's "known" status + Description : Getter/setter for the gene's "known" status. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub is_known { my $self = shift; $self->[12] = shift if (@_); @@ -95,6 +237,20 @@ sub is_known { } +=head2 add_Transcript + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyTranscript $tr - the transcript to + add + Example : $tiny_gene->add_Transcript($tiny_transcript); + Description : Adds a transcript to a gene.
+ Return type : none + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + sub add_Transcript { my $self = shift; my $tr = shift; @@ -107,11 +263,36 @@ sub add_Transcript { } +=head2 get_all_Transcripts + + Example : foreach my $tr (@{ $tiny_gene->get_all_Transcripts }) { + # do something with transcript + } + Description : Returns all transcripts attached to that gene. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::TinyTranscript objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_all_Transcripts { return $_[0]->[13] || []; } +=head2 length + + Description : Returns the gene length (distance between start and end). + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub length { my $self = shift; return ($self->end - $self->start + 1); diff --git a/modules/Bio/EnsEMBL/IdMapping/TinyTranscript.pm b/modules/Bio/EnsEMBL/IdMapping/TinyTranscript.pm index f9c1870880b1d4a06ef9847035675f694ec44cbb..a62a2effe28e7918a19e706af9839606e968264b 100644 --- a/modules/Bio/EnsEMBL/IdMapping/TinyTranscript.pm +++ b/modules/Bio/EnsEMBL/IdMapping/TinyTranscript.pm @@ -2,15 +2,44 @@ package Bio::EnsEMBL::IdMapping::TinyTranscript; =head1 NAME +Bio::EnsEMBL::IdMapping::TinyTranscript - lightweight transcript object =head1 SYNOPSIS +# fetch a transcript from the db and create a lightweight transcript object from +# it +my $tr = $transcript_adaptor->fetch_by_stable_id('ENST000345437'); +my $lightweight_tr = Bio::EnsEMBL::IdMapping::TinyTranscript->new_fast([ + $tr->dbID, + $tr->stable_id, + $tr->version, + $tr->created_date, + $tr->modified_date, + $tr->start, + $tr->end, + $tr->strand, + $tr->length, + md5_hex($tr->spliced_seq), + ($tr->is_known ? 1 : 0), +]); =head1 DESCRIPTION +This is a lightweight transcript object for the stable Id mapping. 
See the +documentation in TinyFeature for general considerations about its design. =head1 METHODS +start +end +strand +length +seq_md5_sum +is_known +add_Translation +translation +add_Exon +get_all_Exons =head1 LICENCE @@ -29,6 +58,19 @@ Please post comments/questions to the Ensembl development list =cut +# internal data structure (array indices): +# +# 0-4 see TinyFeature +# 5 start +# 6 end +# 7 strand +# 8 length +# 9 seq_md5_sum +# 10 is_known +# 11 translation +# 12 [exons] + + use strict; use warnings; no warnings 'uninitialized'; @@ -39,6 +81,18 @@ our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature); use Bio::EnsEMBL::Utils::Exception qw(throw warning); +=head2 start + + Arg[1] : (optional) Int - the transcript's start coordinate + Description : Getter/setter for the transcript's start coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub start { my $self = shift; $self->[5] = shift if (@_); @@ -46,6 +100,18 @@ sub start { } +=head2 end + + Arg[1] : (optional) Int - the transcript's end coordinate + Description : Getter/setter for the transcript's end coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub end { my $self = shift; $self->[6] = shift if (@_); @@ -53,6 +119,18 @@ sub end { } +=head2 strand + + Arg[1] : (optional) Int - the transcript's strand + Description : Getter/setter for the transcript's strand. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub strand { my $self = shift; $self->[7] = shift if (@_); @@ -60,6 +138,20 @@ sub strand { } +=head2 length + + Arg[1] : (optional) Int - the transcript's length + Description : Getter/setter for the transcript's length. Note that this is + *not* the distance between start and end, but rather the sum of + the lengths of all exons. 
+ Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub length { my $self = shift; $self->[8] = shift if (@_); @@ -67,6 +159,21 @@ sub length { } +=head2 seq_md5_sum + + Arg[1] : (optional) String - the md5 digest of the transcript's sequence + Description : Getter/setter for the md5 digest of the transcript's sequence. + Note that when used as a setter, you are expected to pass a + digest, not the raw sequence (i.e. the digest is not created for + you). + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub seq_md5_sum { my $self = shift; $self->[9] = shift if (@_); @@ -74,6 +181,18 @@ sub seq_md5_sum { } +=head2 is_known + + Arg[1] : (optional) Boolean - the transcript's "known" status + Description : Getter/setter for the transcript's "known" status. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub is_known { my $self = shift; $self->[10] = shift if (@_); @@ -81,6 +200,20 @@ sub is_known { } +=head2 add_Translation + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyTranslation $tl - the translation + to add + Example : $tiny_transcript->add_Translation($tiny_translation); + Description : Adds a translation to this transcript. + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + sub add_Translation { my $self = shift; my $tl = shift; @@ -93,11 +226,35 @@ sub add_Translation { } +=head2 translation + + Description : Getter for the transcript's translation. 
+ Return type : Bio::EnsEMBL::IdMapping::TinyTranslation + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub translation { return $_[0]->[11]; } +=head2 add_Exon + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyExon $exon - the exon to add + Example : $tiny_transcript->add_Exon($tiny_exon); + Description : Adds an exon to this transcript. + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + sub add_Exon { my $self = shift; my $exon = shift; @@ -110,6 +267,20 @@ sub add_Exon { } +=head2 get_all_Exons + + Example : foreach my $exon (@{ $tiny_transcript->get_all_Exons }) { + # do something with exon + } + Description : Returns all exons attached to that transcript. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::TinyExon objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub get_all_Exons { return $_[0]->[12] || []; } diff --git a/modules/Bio/EnsEMBL/IdMapping/TinyTranslation.pm b/modules/Bio/EnsEMBL/IdMapping/TinyTranslation.pm index 6b47038af118c379185a366127c42a686251d561..365242a904ec31563b04620efb43df66e1ed9e80 100644 --- a/modules/Bio/EnsEMBL/IdMapping/TinyTranslation.pm +++ b/modules/Bio/EnsEMBL/IdMapping/TinyTranslation.pm @@ -2,15 +2,33 @@ package Bio::EnsEMBL::IdMapping::TinyTranslation; =head1 NAME +Bio::EnsEMBL::IdMapping::TinyTranslation - lightweight translation object =head1 SYNOPSIS +if (my $tl = $tr->translation) { + my $lightweight_tl = Bio::EnsEMBL::IdMapping::TinyTranslation->new_fast([ + $tl->dbID, + $tl->stable_id, + $tl->version, + $tl->created_date, + $tl->modified_date, + $tr->dbID, + $tr->translate->seq, + ($tr->is_known ? 1 : 0), + ]); +} =head1 DESCRIPTION +This is a lightweight translation object for the stable Id mapping. See the +documentation in TinyFeature for general considerations about its design. 
=head1 METHODS +transcript_id +seq +is_known =head1 LICENCE @@ -29,6 +47,14 @@ Please post comments/questions to the Ensembl development list =cut +# internal data structure (array indices): +# +# 0-4 see TinyFeature +# 5 transcript_id +# 6 seq +# 7 is_known + + use strict; use warnings; no warnings 'uninitialized'; @@ -39,6 +65,19 @@ our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature); use Bio::EnsEMBL::Utils::Exception qw(throw warning); +=head2 transcript_id + + Arg[1] : (optional) Int - the transcript internal Id ("dbID") + Description : Getter/setter for the transcript internal Id this translation is + attached to. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub transcript_id { my $self = shift; $self->[5] = shift if (@_); @@ -46,6 +85,18 @@ sub transcript_id { } +=head2 seq + + Arg[1] : (optional) String - the translation's sequence + Description : Getter/setter for the translation's sequence. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub seq { my $self = shift; $self->[6] = shift if (@_); @@ -53,6 +104,18 @@ sub seq { } +=head2 is_known + + Arg[1] : (optional) Boolean - the translation's "known" status + Description : Getter/setter for the translation's "known" status. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + sub is_known { my $self = shift; $self->[7] = shift if (@_);