From f4036606f8428a96319da736392cee7bdfe27826 Mon Sep 17 00:00:00 2001 From: Graham McVicker <mcvicker@sanger.ac.uk> Date: Wed, 11 Feb 2004 21:23:00 +0000 Subject: [PATCH] modifications so that toplevel is now a pseudo coordinate system representing the highest real coordinate system in a given region --- modules/Bio/EnsEMBL/AssemblyMapper.pm | 8 +- modules/Bio/EnsEMBL/ChainedAssemblyMapper.pm | 6 +- modules/Bio/EnsEMBL/Chromosome.pm | 5 +- modules/Bio/EnsEMBL/CoordSystem.pm | 95 +++++- .../EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm | 8 + .../Bio/EnsEMBL/DBSQL/CoordSystemAdaptor.pm | 204 ++++++------ modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm | 7 +- modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm | 7 +- modules/Bio/EnsEMBL/Feature.pm | 9 +- modules/Bio/EnsEMBL/Mapper.pm | 66 ++-- modules/Bio/EnsEMBL/Mapper/Coordinate.pm | 53 ++- modules/Bio/EnsEMBL/Slice.pm | 31 +- modules/Bio/EnsEMBL/TopLevelAssemblyMapper.pm | 308 ++++++++++++++++++ 13 files changed, 626 insertions(+), 181 deletions(-) create mode 100644 modules/Bio/EnsEMBL/TopLevelAssemblyMapper.pm diff --git a/modules/Bio/EnsEMBL/AssemblyMapper.pm b/modules/Bio/EnsEMBL/AssemblyMapper.pm index 683dcd3a31..cbaf833b28 100644 --- a/modules/Bio/EnsEMBL/AssemblyMapper.pm +++ b/modules/Bio/EnsEMBL/AssemblyMapper.pm @@ -102,7 +102,9 @@ sub new { #we load the mapper calling the 'ASSEMBLED' the 'from' coord system #and the 'COMPONENT' the 'to' coord system - $self->{'mapper'} = Bio::EnsEMBL::Mapper->new($ASSEMBLED, $COMPONENT); + $self->{'mapper'} = Bio::EnsEMBL::Mapper->new($ASSEMBLED, $COMPONENT, + $coord_systems[0], + $coord_systems[1]); return $self; } @@ -186,8 +188,6 @@ sub map { =cut - - sub flush { my $self = shift; @@ -258,7 +258,7 @@ sub fastmap { Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs The coordinate system to obtain overlapping ids of Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$ctg_cs)) {...} - Description: Retrieves a list of overlapping seq_region internal identifiers + Description: Retrieves a list of overlapping seq_region names of another coordinate system. This is the same as the list_ids method but uses seq_region names rather internal ids Returntype : List of strings diff --git a/modules/Bio/EnsEMBL/ChainedAssemblyMapper.pm b/modules/Bio/EnsEMBL/ChainedAssemblyMapper.pm index cdfd072fa8..1cfd7d439a 100644 --- a/modules/Bio/EnsEMBL/ChainedAssemblyMapper.pm +++ b/modules/Bio/EnsEMBL/ChainedAssemblyMapper.pm @@ -119,7 +119,9 @@ sub new { #mapper that is actually used and is loaded by the mappings generated #by the other two mappers - $self->{'first_last_mapper'} = Bio::EnsEMBL::Mapper->new($FIRST, $LAST); + $self->{'first_last_mapper'} = Bio::EnsEMBL::Mapper->new($FIRST, $LAST, + $coord_systems[0], + $coord_systems[2]); #need registries to keep track of what regions are registered in source #and destination coordinate systems @@ -246,7 +248,7 @@ sub map { sub fastmap { my $self = shift; - $self->map(@_,1); + return $self->map(@_,1); } diff --git a/modules/Bio/EnsEMBL/Chromosome.pm b/modules/Bio/EnsEMBL/Chromosome.pm index 2f7649d1ac..8bd68df438 100755 --- a/modules/Bio/EnsEMBL/Chromosome.pm +++ b/modules/Bio/EnsEMBL/Chromosome.pm @@ -53,7 +53,10 @@ sub new { if($chr_name) { if($adaptor) { - my $chr = $adaptor->fetch_by_region('toplevel',$chr_name); + my $csa = $adaptor->db->get_CoordSystemAdaptor(); + my ($top_cs) = @{$csa->fetch_all()}; + my $chr = $adaptor->fetch_by_region($top_cs->name(),$chr_name, + undef,undef,undef,$top_cs->version()); bless $chr, 'Bio::EnsEMBL::Chromosome'; return $chr; } else { diff --git a/modules/Bio/EnsEMBL/CoordSystem.pm b/modules/Bio/EnsEMBL/CoordSystem.pm index 73dfa98d76..03d8146d6e 100644 --- a/modules/Bio/EnsEMBL/CoordSystem.pm +++ b/modules/Bio/EnsEMBL/CoordSystem.pm @@ -55,7 +55,8 @@ package Bio::EnsEMBL::CoordSystem; use Bio::EnsEMBL::Storable; -use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); use vars qw(@ISA); @@ -67,24 +68,32 @@ use vars qw(@ISA); Arg [..] : List of named arguments: -NAME - The name of the coordinate system -VERSION - (optional) The version of the coordinate system + -RANK - The rank of the coordinate system. The highest + level coordinate system should have rank 1, the + second highest rank 2 and so on. An example of + a high level coordinate system is 'chromosome' an + example of a lower level coordinate system is + 'clone'. -TOP_LEVEL - (optional) Sets whether this is a top-level coord - system. Default = 0 + system. Default = 0. This should only be set to + true if you are creating an artificial toplevel + coordsystem by the name of 'toplevel' -SEQUENCE_LEVEL - (optional) Sets whether this is a sequence level coordinate system. Default = 0 - -DEFAULT - (optional) + -DEFAULT - (optional) Whether this is the default version of the coordinate systems of this name. Default = 0 -DBID - (optional) The internal identifier of this coordinate system -ADAPTOR - (optional) The adaptor which provides database interaction for this object - Example : $cs = Bio::EnsEMBL::CoordSystem->new(-NAME => 'chromosome', + Example : $cs = Bio::EnsEMBL::CoordSystem->new(-NAME => 'chromosome', -VERSION => 'NCBI33', + -RANK => 1, -DBID => 1, -ADAPTOR => adaptor, -DEFAULT => 1, - -SEQUENCE_LEVEL => 0, - -TOP_LEVEL => 1); + -SEQUENCE_LEVEL => 0); Description: Creates a new CoordSystem object representing a coordinate system. Returntype : Bio::EnsEMBL::CoordSystem @@ -99,23 +108,57 @@ sub new { my $self = $class->SUPER::new(@_); - my ($name,$version, $top_level, $sequence_level, $default) = + my ($name,$version, $top_level, $sequence_level, $default, $rank) = rearrange(['NAME','VERSION','TOP_LEVEL', 'SEQUENCE_LEVEL', - 'DEFAULT'], @_); + 'DEFAULT', 'RANK'], @_); throw('The NAME argument is required') if(!$name); $version = '' if(!defined($version)); - $top_level ||= 0; - $sequence_level ||= 0; - $default ||= 0; + $top_level = ($top_level) ? 1 : 0; + $sequence_level = ($sequence_level) ? 1 : 0; + $default = ($default) ? 1 : 0; + $rank ||= 0; + + if($top_level) { + if($rank) { + throw('RANK argument must be 0 if TOP_LEVEL is 1'); + } + + if($name) { + if($name ne 'toplevel') { + throw('The NAME argument must be "toplevel" if TOP_LEVEL is 1') + } + } else { + $name = 'toplevel'; + } + + if($sequence_level) { + throw("SEQUENCE_LEVEL argument must be 0 if TOP_LEVEL is 1"); + } + + $default = 0; + + } else { + if(!$rank) { + throw("RANK argument must be non-zero if not toplevel CoordSystem"); + } + if($name eq 'toplevel') { + throw("Cannot name coord system 'toplevel' unless TOP_LEVEL is 1"); + } + } + + if($rank !~ /^\d+$/) { + throw('The RANK argument must be a positive integer'); + } $self->{'version'} = $version; $self->{'name'} = $name; $self->{'top_level'} = $top_level; $self->{'sequence_level'} = $sequence_level; $self->{'default'} = $default; + $self->{'rank'} = $rank; return $self; } @@ -196,7 +239,11 @@ sub equals { Arg [1] : none Example : if($coord_sys->is_top_level()) { ... } - Description: Returns true if this is a top level coordinate system + Description: Returns true if this is the toplevel pseudo coordinate system. + The toplevel coordinate system is not a real coordinate system + which is stored in the database, but it is a placeholder that + can be used to request transformations or retrievals to/from + the highest defined coordinate system in a given region. Returntype : 0 or 1 Exceptions : none Caller : general @@ -244,4 +291,28 @@ sub is_default { } + + +=head2 rank + + Arg [1] : none + Example : if($cs1->rank() < $cs2->rank()) { + print $cs1->name(), " is a higher level coord system than", + $cs2->name(), "\n"; + } + Description: Returns the rank of this coordinate system. A lower number + is a higher coordinate system. The highest level coordinate + system has a rank of 1 (e.g. 'chromosome'). The toplevel + pseudo coordinate system has a rank of 0. + Returntype : int + Exceptions : none + Caller : general + +=cut + +sub rank { + my $self = shift; + return $self->{'rank'}; +} + 1; diff --git a/modules/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm index f3fb19925c..bf09e81ec7 100644 --- a/modules/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm +++ b/modules/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm @@ -65,6 +65,7 @@ use strict; use Bio::EnsEMBL::DBSQL::BaseAdaptor; use Bio::EnsEMBL::AssemblyMapper; use Bio::EnsEMBL::ChainedAssemblyMapper; +use Bio::EnsEMBL::TopLevelAssemblyMapper; use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache use Bio::EnsEMBL::Utils::Exception qw(deprecate throw); @@ -146,6 +147,13 @@ sub fetch_by_CoordSystems { $cs1->name . " " . $cs1->version); } + if($cs1->is_top_level()) { + return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs1, $cs2); + } + if($cs2->is_top_level()) { + return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs2, $cs1); + } + my $csa = $self->db->get_CoordSystemAdaptor(); #retrieve the shortest possible mapping path between these systems diff --git a/modules/Bio/EnsEMBL/DBSQL/CoordSystemAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/CoordSystemAdaptor.pm index 2542b6cef7..a9518dc1bd 100644 --- a/modules/Bio/EnsEMBL/DBSQL/CoordSystemAdaptor.pm +++ b/modules/Bio/EnsEMBL/DBSQL/CoordSystemAdaptor.pm @@ -36,23 +36,23 @@ Bio::EnsEMBL::DBSQL::CoordSystemAdaptor } # - # Fetching by top level: + # Fetching by rank: + # + $cs = $csa->fetch_by_rank(2); + + # + # Fetching the pseudo coord system 'toplevel' # #Get the default top_level coord system: $cs = $csa->fetch_top_level(); - #Get a particular version of a top_level coord system: - $cs = $csa->fetch_top_level('NCBI34'); - - #Get all top level coord systems: - foreach $cs (@{$csa->fetch_all_top_level()}) { - print $cs->name(), ' ', $cs->version, "\n"; - } - #can also use an alias in fetch_by_name: $cs = $csa->fetch_by_name('toplevel'); + #can also request toplevel using rank=0 + $cs = $csa->fetch_by_rank(0); + # # Fetching by sequence level: # @@ -87,10 +87,7 @@ not have a version an empty string ('') is used instead. Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk -=head1 APPENDIX - -The rest of the documentation details each of the object -methods. Internal methods are usually preceded with a _ +=head1 METHODS =cut @@ -102,7 +99,6 @@ package Bio::EnsEMBL::DBSQL::CoordSystemAdaptor; use Bio::EnsEMBL::DBSQL::BaseAdaptor; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::CoordSystem; -use DBI; use vars qw(@ISA); @@ -140,30 +136,29 @@ sub new { #keyed on id, coord_system value $self->{'_dbID_cache'} = {}; + #keyed on rank + $self->{'_rank_cache'} = {}; + #keyed on id, 1/undef values $self->{'_is_sequence_level'} = {}; - $self->{'_is_top_level'} = {}; $self->{'_is_default_version'} = {}; - my $sth = $self->prepare('SELECT coord_system_id, name, version, attrib ' . - 'FROM coord_system'); - + my $sth = $self->prepare( + 'SELECT coord_system_id, name, rank, version, attrib ' . + 'FROM coord_system'); $sth->execute(); - my ($dbID, $name, $version, $attrib); - $sth->bind_columns(\$dbID, \$name, \$version, \$attrib); + my ($dbID, $name, $rank, $version, $attrib); + $sth->bind_columns(\$dbID, \$name, \$rank, \$version, \$attrib); while($sth->fetch()) { my $seq_lvl = 0; - my $top_lvl = 0; my $default = 0; if($attrib) { foreach my $attrib (split(',', $attrib)) { $self->{"_is_$attrib"}->{$dbID} = 1; if($attrib eq 'sequence_level') { $seq_lvl = 1; - } elsif($attrib eq 'top_level') { - $top_lvl = 1; } elsif($attrib eq 'default_version') { $default = 1; } @@ -175,13 +170,14 @@ sub new { -ADAPTOR => $self, -NAME => $name, -VERSION => $version, - -TOP_LEVEL => $top_lvl, + -RANK => $rank, -SEQUENCE_LEVEL => $seq_lvl, -DEFAULT => $default); $self->{'_dbID_cache'}->{$dbID} = $cs; $self->{'_name_cache'}->{lc($name)} ||= []; + $self->{'_rank_cache'}->{$rank} = $cs; push @{$self->{'_name_cache'}->{lc($name)}}, $cs; } $sth->finish(); @@ -225,6 +221,15 @@ sub new { $mappings{$asm_cs->dbID}->{$cmp_cs->dbID} = 1; } + # + # Create the pseudo coord system 'toplevel' and cache it so that + # only one of these is created for each db... + # + my $toplevel = Bio::EnsEMBL::CoordSystem->new(-TOP_LEVEL => 1, + -NAME => 'toplevel', + -ADAPTOR => $self); + $self->{'_top_level'} = $toplevel; + $self->{'_mapping_paths'} = \%mappings; return $self; @@ -238,7 +243,10 @@ sub new { Example : foreach my $cs (@{$csa->fetch_all()}) { print $cs->name(), ' ', $cs->version(), "\n"; } - Description: Retrieves every coordinate system defined in the DB + Description: Retrieves every coordinate system defined in the DB. + These will be returned in ascending order of rank. I.e. + The highest coordinate system with rank=1 would be first in the + array. Returntype : listref of Bio::EnsEMBL::CoordSystems Exceptions : none Caller : general @@ -248,13 +256,47 @@ sub new { sub fetch_all { my $self = shift; - my @coord_systems = values %{$self->{'_dbID_cache'}}; + my @coord_systems; + + #order the array by rank in ascending order + foreach my $rank (sort {$a <=> $b} keys %{$self->{'_rank_cache'}}) { + push @coord_systems, $self->{'_rank_cache'}->{$rank}; + } return \@coord_systems; } +=head2 fetch_by_rank + + Arg [1] : int $rank + Example : my $cs = $coord_sys_adaptor->fetch_by_rank(1); + Description: Retrieves a CoordinateSystem via its rank. 0 is a special + rank reserved for the pseudo coordinate system 'toplevel'. + undef is returned if no coordinate system of the specified rank + exists. + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : general + +=cut + +sub fetch_by_rank { + my $self = shift; + my $rank = shift; + + throw("Rank argument must be defined.") if(!defined($rank)); + throw("Rank argument must be a non-negative integer.") if($rank !~ /^\d+$/); + + if($rank == 0) { + return $self->fetch_top_level(); + } + + return $self->{'_rank_cache'}->{$rank}; +} + + =head2 fetch_by_name Arg [1] : string $name @@ -266,17 +308,16 @@ sub fetch_all { specified the default version will be used. Example : $coord_sys = $csa->fetch_by_name('clone'); $coord_sys = $csa->fetch_by_name('chromosome', 'NCBI33'); - #toplevel is an alias for the highest coordinate system - #such as the chromosome coordinate system + # toplevel is an pseudo coord system representing the highest + # coord system in a given region + # such as the chromosome coordinate system $coord_sys = $csa->fetch_by_name('toplevel'); - $coord_sys = $csa->fetch_by_name('toplevel', 'NCBI31'); #seqlevel is an alias for the sequence level coordinate system #such as the clone or contig coordinate system $coord_sys = $csa->fetch_by_name('seqlevel'); Description: Retrieves a coordinate system by its name Returntype : Bio::EnsEMBL::CoordSystem - Exceptions : throw if the requested coordinate system does not exist - throw if no name argument provided + Exceptions : throw if no name argument provided warning if no version provided and default does not exist Caller : general @@ -299,19 +340,16 @@ sub fetch_by_name { } if(!exists($self->{'_name_cache'}->{$name})) { - my $guess = ''; if($name =~ /top/) { - $guess = "\nDid you mean 'toplevel' instead of '$name'?"; + warning("Did you mean 'toplevel' coord system instead of '$name'?"); } elsif($name =~ /seq/) { - $guess = "\nDid you mean 'seqlevel' instead of '$name'?"; + warning("Did you mean 'seqlevel' coord system instead of '$name'?"); } - throw("Coord_system with name [$name] does not exist.$guess"); + return undef; } my @coord_systems = @{$self->{'_name_cache'}->{$name}}; - throw("Coord_system with name [$name] does not exist.") if(!@coord_systems); - foreach my $cs (@coord_systems) { if($version) { return $cs if(lc($cs->version()) eq $version); @@ -321,7 +359,8 @@ sub fetch_by_name { } if($version) { - throw("Coord_system [$name] does not exist with version [$version]"); + #the specific version we were looking for was not found + return undef; } #didn't find a default, just take first one @@ -359,7 +398,7 @@ sub fetch_all_by_name { if($name eq 'seqlevel') { return [$self->fetch_sequence_level()]; } elsif($name eq 'toplevel') { - return $self->fetch_all_top_level(); + return [$self->fetch_top_level()]; } return $self->{'_name_cache'}->{$name} || []; @@ -469,8 +508,9 @@ sub add_feature_table { Arg [1] : int dbID Example : $cs = $csa->fetch_by_dbID(4); Description: Retrieves a coord_system via its internal - identifier. - Returntype : Bio::EnsEMBL::CoordSystem + identifier, or undef if no coordinate system with the provided + id exists. + Returntype : Bio::EnsEMBL::CoordSystem or undef Exceptions : thrown if no coord_system exists for specified dbID Caller : general @@ -484,7 +524,7 @@ sub fetch_by_dbID { my $cs = $self->{'_dbID_cache'}->{$dbID}; - throw("Coord_system with dbID [$dbID] does not exist") if(!$cs); + return undef if(!$cs); return $cs; } @@ -493,63 +533,22 @@ sub fetch_by_dbID { =head2 fetch_top_level - Arg [1] : string $version (optional) - The version of the top-level to obtain. If not provided - the default version top-level will be returned. + Arg [1] : none Example : $cs = $csa->fetch_top_level(); - $cs = $csa->fetch_top_level('NCBI34'); - Description: Retrieves the top level coordinate system. It is possible - to have multiple top-level coordinate systems if there is - more than one version of the top-level system. For - example the top level of a homo_sapiens database could - be 'chromosome' but this might have 'NCBI33', 'NCBI31', - and 'NCBI34' versions. One of these will be defined as - the default and will be returned if no specific version - is requested. If a specific version is requested then it - will be returned. + Description: Retrieves the toplevel pseudo coordinate system. Returntype : a Bio::EnsEMBL::CoordSystem object - Exceptions : throw if no top-level coord_system exists for specified - version - throw if no top-level coord_system exists at all - warning if no version specified and no default version - exists - Caller : general - -=cut - -sub fetch_top_level { - my $self = shift; - - return $self->_fetch_by_attrib('top_level', @_); -} - - -=head2 fetch_all_top_level - - Arg [1] : none - Example : foreach my $cs (@{$csa->fetch_all_top_level()}) { - my ($id, $name, $version) = @$cs; - } - Description: Retrieves all top level coordinate systems defined in - the DB. It is possible to have multiple top-level - coordinate systems if there is more than one version - of the top-level system. For example the top level of a - homo_sapiens database could be 'chromosome' but this - might have 'NCBI33', 'NCBI31', and 'NCBI34' versions. - Returntype : listref of Bio::EnsEMBL::CoordSystem objects Exceptions : none Caller : general =cut -sub fetch_all_top_level { +sub fetch_top_level { my $self = shift; - return $self->_fetch_all_by_attrib('top_level', @_); + return $self->{'_top_level'}; } - =head2 fetch_sequence_level Arg [1] : none @@ -797,6 +796,7 @@ sub deleteObj { delete $self->{'_dbID_cache'}; delete $self->{'_mapping_paths'}; delete $self->{'_shortest_paths'}; + delete $self->{'_top_level'}; } @@ -822,10 +822,17 @@ sub store { my $db = $self->db(); my $name = $cs->name(); my $version = $cs->version(); - my $toplevel = $cs->is_top_level(); + my $rank = $cs->rank(); + my $seqlevel = $cs->is_sequence_level(); my $default = $cs->is_default(); - + + my $toplevel = $cs->is_top_level(); + + if($toplevel) { + throw("The toplevel CoordSystem cannot be stored"); + } + # # Do lots of sanity checking to prevent bad data from being entered # @@ -856,9 +863,19 @@ sub store { } } + if($rank !~ /^\d+$/) { + throw("Rank attribute must be a positive integer not [$rank]"); + } + if($rank == 0) { + throw("Only toplevel CoordSystem may have rank of 0."); + } + + if(defined($self->{'_rank_cache'}->{$rank})) { + throw("CoordSystem with rank [$rank] already exists."); + } + my @attrib; - push @attrib, 'top_level' if($toplevel); push @attrib, 'default_version' if($default); push @attrib, 'sequence_level' if($seqlevel); @@ -871,9 +888,10 @@ sub store { my $sth = $db->prepare('INSERT INTO coord_system ' . 'SET name = ?, ' . 'version = ?, ' . - 'attrib = ?'); + 'attrib = ?,' . + 'rank = ?'); - $sth->execute($name, $version, $attrib_str); + $sth->execute($name, $version, $attrib_str, $rank); my $dbID = $sth->{'mysql_insertid'}; $sth->finish(); @@ -889,12 +907,12 @@ sub store { # $self->{'_is_default_version'}->{$dbID} = 1 if($default); $self->{'_is_sequence_level'}->{$dbID} = 1 if($seqlevel); - $self->{'_is_top_level'}->{$dbID} = 1 if($toplevel); $self->{'_name_cache'}->{lc($name)} ||= []; push @{$self->{'_name_cache'}->{lc($name)}}, $cs; $self->{'_dbID_cache'}->{$dbID} = $cs; + $self->{'_rank_cache'}->{$rank} = $cs; return $cs; } diff --git a/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm index 6af66ae180..94fd5ef64b 100644 --- a/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm +++ b/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm @@ -267,10 +267,13 @@ sub fetch_by_assembly_location { deprecate('Use fetch_by_Slice_start_end_strand() instead'); + my $csa = $self->db->get_CoordSystem(); + my $top_cs = @{$csa->fetch_all}; + my $slice_adaptor = $self->db->get_SliceAdaptor(); - my $slice = $slice_adaptor->fetch_by_region('toplevel', $chrName, + my $slice = $slice_adaptor->fetch_by_region($top_cs->name(), $chrName, $chrStart, $chrEnd, - $strand); + $strand, $top_cs->version); return $self->fetch_by_Slice_start_end_strand($slice,1, $slice->length,1); } diff --git a/modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm index e6aca82fc3..9ef007b681 100644 --- a/modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm +++ b/modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm @@ -1446,7 +1446,12 @@ sub fetch_by_chr_name{ my ($self,$chr_name) = @_; deprecate('Use fetch_by_region() instead.'); - return $self->fetch_by_region('toplevel',$chr_name); + my $csa = $self->db->get_CoordSystemAdaptor(); + + my $top_cs = @{$csa->fetch_all()}; + + return $self->fetch_by_region($top_cs->name(),$chr_name, + undef,undef,undef,$top_cs->version); } diff --git a/modules/Bio/EnsEMBL/Feature.pm b/modules/Bio/EnsEMBL/Feature.pm index eb1b7618d0..82ba9552b0 100644 --- a/modules/Bio/EnsEMBL/Feature.pm +++ b/modules/Bio/EnsEMBL/Feature.pm @@ -392,8 +392,10 @@ sub transform { my $cs = $db->get_CoordSystemAdaptor->fetch_by_name($cs_name, $cs_version); my $current_cs = $slice->coord_system(); - # self is already in the requested coordinate system, so we can just return a copy - if( $cs->equals( $current_cs ) && $slice->start() == 1 && $slice->strand() == 1 ) { + # if feature is already in the requested coordinate system, we can just + # return a copy + if( $cs->equals( $current_cs ) && $slice->start() == 1 && + $slice->strand() == 1 ) { my $new_feature; %$new_feature = %$self; bless $new_feature, ref $self; @@ -418,7 +420,8 @@ sub transform { bless $new_feature, ref $self; $new_feature->{'start'} = $p_slice->start(); $new_feature->{'end'} = $p_slice->end(); - $new_feature->{'strand'} = ($self->{'strand'} == 0) ? 0 : $p_slice->strand(); + $new_feature->{'strand'} = + ($self->{'strand'} == 0) ? 0 : $p_slice->strand(); $new_feature->{'slice'} = $slice; return $new_feature; } diff --git a/modules/Bio/EnsEMBL/Mapper.pm b/modules/Bio/EnsEMBL/Mapper.pm index cea829dba5..d599e52e66 100644 --- a/modules/Bio/EnsEMBL/Mapper.pm +++ b/modules/Bio/EnsEMBL/Mapper.pm @@ -56,11 +56,30 @@ use Bio::EnsEMBL::Mapper::Gap; use Bio::EnsEMBL::Utils::Exception qw(throw); -sub new { - my($class,@args) = @_; - my $from = shift @args; - my $to = shift @args; +=head2 new + + Arg [1] : string $from + The name of the 'from' coordinate system + Arg [2] : string $to + The name of the 'to' coordinate system + Arg [3] : (optional) Bio::EnsEMBL::CoordSystem $from_cs + The 'from' coordinate system + Arg [4] : (optional) Bio::EnsEMBL::CoordSystem $to_cs + Example : my $mapper = Bio::EnsEMBL::Mapper->new('FROM', 'TO'); + Description: Constructor. Creates a new Bio::EnsEMBL::Mapper object. + Returntype : Bio::EnsEMBL::Mapper + Exceptions : none + Caller : general + +=cut + +sub new { + my $class = shift; + my $from = shift; + my $to = shift; + my $from_cs = shift; + my $to_cs = shift; my $self = {}; bless $self,$class; @@ -76,7 +95,10 @@ sub new { $self->from($from); $self->{'pair_count'} = 0; -# set stuff in self from @args + + $self->{'from_cs'} = $from_cs; + $self->{'to_cs'} = $to_cs; + return $self; } @@ -130,8 +152,6 @@ sub flush { sub map_coordinates{ my ($self, $id, $start, $end, $strand, $type) = @_; - #&eprof_start('map_coordinates'); - unless(defined($id) && defined($start) && defined($end) && defined($strand) && defined($type) ) { throw("Must start,end,strand,id,type as coordinates"); @@ -142,14 +162,16 @@ sub map_coordinates{ my $hash = $self->{"_pair_$type"}; - my ($from, $to); + my ($from, $to, $cs); if($type eq $self->{'to'}) { $from = 'to'; $to = 'from'; + $cs = $self->{'from_cs'}; } else { $from = 'from'; $to = 'to'; + $cs = $self->{'to_cs'}; } unless(defined $hash) { @@ -244,7 +266,8 @@ sub map_coordinates{ my $res = Bio::EnsEMBL::Mapper::Coordinate->new($target_coord->{'id'}, $target_start, $target_end, - $pair->{'ori'} * $strand); + $pair->{'ori'} * $strand, + $cs); push(@result,$res); $last_used_pair = $pair; @@ -267,8 +290,6 @@ sub map_coordinates{ @result = reverse ( @result); } - #&eprof_end('map_coordinates'); - return @result; } @@ -296,16 +317,18 @@ sub map_coordinates{ sub fastmap { my ($self, $id, $start, $end, $strand, $type) = @_; - my ($from, $to); + my ($from, $to, $cs); if( ! $self->{'_is_sorted'} ) { $self->_sort() } if($type eq $self->{'to'}) { $from = 'to'; $to = 'from'; + $cs = $self->{'from_cs'}; } else { $from = 'from'; $to = 'to'; + $cs = $self->{'to_cs'}; } my $hash = $self->{"_pair_$type"} or @@ -317,25 +340,24 @@ sub fastmap { foreach my $pair (@$pairs) { my $self_coord = $pair->{$from}; my $target_coord = $pair->{$to}; - + # only super easy mapping is done if( $start < $self_coord->{'start'} || - $end > $self_coord->{'end'} ) { + $end > $self_coord->{'end'} ) { next; } - + if( $pair->{'ori'} == 1 ) { return ( $target_coord->{'id'}, - $target_coord->{'start'}+$start-$self_coord->{'start'}, - $target_coord->{'start'}+$end-$self_coord->{'start'}, - $strand ); + $target_coord->{'start'}+$start-$self_coord->{'start'}, + $target_coord->{'start'}+$end-$self_coord->{'start'}, + $strand, $cs ); } else { return ( $target_coord->{'id'}, - $target_coord->{'end'} - ($end - $self_coord->{'start'}), - $target_coord->{'end'} - ($start - $self_coord->{'start'}), - -$strand ); + $target_coord->{'end'} - ($end - $self_coord->{'start'}), + $target_coord->{'end'} - ($start - $self_coord->{'start'}), + -$strand, $cs ); } - } return (); diff --git a/modules/Bio/EnsEMBL/Mapper/Coordinate.pm b/modules/Bio/EnsEMBL/Mapper/Coordinate.pm index 8cbeb0ab13..818424a29d 100644 --- a/modules/Bio/EnsEMBL/Mapper/Coordinate.pm +++ b/modules/Bio/EnsEMBL/Mapper/Coordinate.pm @@ -36,12 +36,13 @@ use vars qw(@ISA); use strict; sub new { - my($class, $id, $start, $end, $strand) = @_; + my($class, $id, $start, $end, $strand, $coord_system) = @_; return bless { 'id' => $id, - 'start' => $start, - 'end' => $end, - 'strand' => $strand }, $class; + 'start' => $start, + 'end' => $end, + 'strand' => $strand, + 'coord_system' => $coord_system}, $class; } @@ -57,13 +58,9 @@ sub new { =cut sub start{ - my $obj = shift; - if( @_ ) { - my $value = shift; - $obj->{'start'} = $value; - } - return $obj->{'start'}; - + my $self = shift; + $self->{'start'} = shift if(@_); + return $self->{'start'}; } @@ -79,13 +76,9 @@ sub start{ =cut sub end{ - my $obj = shift; - if( @_ ) { - my $value = shift; - $obj->{'end'} = $value; - } - return $obj->{'end'}; - + my $self = shift; + $self->{'end'} = shift if(@_); + return $self->{'end'}; } @@ -101,13 +94,9 @@ sub end{ =cut sub strand{ - my $obj = shift; - if( @_ ) { - my $value = shift; - $obj->{'strand'} = $value; - } - return $obj->{'strand'}; - + my $self = shift; + $self->{'strand'} = shift if(@_); + return $self->{'strand'}; } @@ -124,12 +113,16 @@ sub strand{ =cut sub id{ - my ($self,$value) = @_; - if( defined $value) { - $self->{'id'} = $value; - } - return $self->{'id'}; + my $self = shift; + $self->{'id'} = shift if(@_); + return $self->{'id'}; +} + +sub coord_system { + my $self = shift; + $self->{'coord_system'} = shift if(@_); + return $self->{'coord_system'}; } sub length { diff --git a/modules/Bio/EnsEMBL/Slice.pm b/modules/Bio/EnsEMBL/Slice.pm index e0c92050f9..b7ad77f9e1 100644 --- a/modules/Bio/EnsEMBL/Slice.pm +++ b/modules/Bio/EnsEMBL/Slice.pm @@ -148,7 +148,9 @@ sub new { } if(!$coord_system || !ref($coord_system) || !$coord_system->isa('Bio::EnsEMBL::CoordSystem')) { - throw('COORD_SYSTEM argment must be a Bio::EnsEMBL::CoordSystem'); + throw('COORD_SYSTEM argument must be a Bio::EnsEMBL::CoordSystem'); + } elsif($coord_system->is_top_level()) { + throw('Cannot create slice on toplevel CoordSystem.'); } #strand defaults to 1 if not defined @@ -615,7 +617,7 @@ sub project { #if the slice has negative coordinates or coordinates exceeding the #exceeding length of the sequence region we want to shrink the slice to #the defined region - + if($self->{'start'} > $entire_len || $self->{'end'} < 1) { #none of this slice is in a defined region return []; @@ -642,8 +644,8 @@ sub project { my @projection; my $current_start = 1; - - #decompose this slice into its symlinked components. + + #decompose this slice into its symlinked components. #this allows us to handle haplotypes and PARs my $normal_slice_proj = $slice_adaptor->fetch_normalized_slice_projection($self); @@ -651,12 +653,12 @@ sub project { my $normal_slice = $segment->[2]; $slice_cs = $normal_slice->coord_system(); - + my $asma = $db->get_AssemblyMapperAdaptor(); my $asm_mapper = $asma->fetch_by_CoordSystems($slice_cs, $cs); # perform the mapping between this slice and the requested system - my @coords = $asm_mapper->map($normal_slice->seq_region_name(), + my @coords = $asm_mapper->map($normal_slice->seq_region_name(), $normal_slice->start(), $normal_slice->end(), $normal_slice->strand(), @@ -667,17 +669,19 @@ sub project { foreach my $coord (@coords) { my $coord_start = $coord->start(); my $coord_end = $coord->end(); - my $length = $coord_end - $coord_start + 1; + my $length = $coord_end - $coord_start + 1; #skip gaps if($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) { + my $coord_cs = $coord->coord_system(); + #create slices for the mapped-to coord system - my $slice = $slice_adaptor->fetch_by_region($cs->name(), + my $slice = $slice_adaptor->fetch_by_region($coord_cs->name(), $coord->id(), $coord_start, $coord_end, $coord->strand(), - $cs->version()); + $coord_cs->version()); my $current_end = $current_start + $length - 1; @@ -1775,8 +1779,13 @@ sub get_Chromosome { deprecate("Use SliceAdaptor::fetch_by_region('chromosome'," . '$slice->seq_region_name) instead'); - my $chr = $self->adaptor->fetch_by_region('toplevel', - $self->seq_region_name()); + my $csa = $self->adaptor->db->get_CoordSystemAdaptor(); + my ($top_cs) = @{$csa->fetch_all()}; + + my $chr = $self->adaptor->fetch_by_region($top_cs->name(), + $self->seq_region_name(), + undef,undef,undef, + $top_cs->version()); return bless($chr, 'Bio::EnsEMBL::Chromosome'); } diff --git a/modules/Bio/EnsEMBL/TopLevelAssemblyMapper.pm b/modules/Bio/EnsEMBL/TopLevelAssemblyMapper.pm new file mode 100644 index 0000000000..b95b2a9fb2 --- /dev/null +++ b/modules/Bio/EnsEMBL/TopLevelAssemblyMapper.pm @@ -0,0 +1,308 @@ +# +# Ensembl module for Bio::EnsEMBL::AssemblyMapper +# +# Written by Graham McVicker +# +# Copyright GRL and EBI +# +# You may distribute this module under the same terms as perl itself + +=head1 NAME + +Bio::EnsEMBL::TopLevelMapper - +Handles mapping between a given coordinate system and the toplevel pseudo +coordinate system. + +=head1 SYNOPSIS + $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); + $asma = $db->get_AssemblyMapperAdaptor(); + $csa = $db->get_CoordSystemAdaptor(); + + my $toplevel = $cs_adaptor->fetch_by_name('toplevel'); + my $ctg_cs = $cs_adaptor->fetch_by_name('contig'); + + $asm_mapper = $map_adaptor->fetch_by_CoordSystems($cs1, $cs2); + + #map to toplevel coord system for this region + @chr_coords = $asm_mapper->map('AL30421.1.200.92341',100,10000,-1,$ctg_cs); + + #list toplevel seq_region_ids for this region + @chr_ids = $asm_mapper->list_ids('AL30421.1.200.92341',1,1000,-1,$ctg_cs); + +=head1 DESCRIPTION + +The TopLevelAssemblyMapper performs mapping between a provided coordinate +system and the toplevel pseudo cooordinate system. The toplevel coordinate +system is not a real coordinate system, but represents the highest coordinate +system that can be mapped to in a given region. It is only possible to +perform unidirectional mapping using this mapper, because it does not make +sense to map from the toplevel coordinate system to another coordinate system. + +=head1 CONTACT + +Post general queries to B<ensembl-dev@ebi.ac.uk> + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::TopLevelAssemblyMapper; + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::CoordSystem; + +sub new { + my ($caller, $adaptor, $toplevel_cs, $other_cs) = @_; + + my $class = ref($caller) || $caller; + + if(!ref($toplevel_cs)) { + throw('Toplevel CoordSystem argument expected.'); + } + if(!ref($other_cs)) { + throw('Other CoordSystem argument expected.'); + } + + if(!$toplevel_cs->is_top_level()) { + throw($toplevel_cs->name() . " is not the toplevel CoordSystem."); + } + if($other_cs->is_top_level()) { + throw("Other CoordSystem argument should not be toplevel CoordSystem."); + } + + my $cs_adaptor = $adaptor->db()->get_CoordSystemAdaptor(); + my $coord_systems = $cs_adaptor->fetch_all(); + + return bless {'coord_systems' => $coord_systems, + 'adaptor' => $adaptor, + 'toplevel_cs' => $toplevel_cs, + 'other_cs' => $other_cs}, $class; +} + +sub adaptor { + my $self = shift; + + $self->{'adaptor'} = shift if(@_); + + return $self->{'adaptor'}; +} + + +sub map { + throw('Incorrect number of arguments.') if(@_ != 6); + + my($self, $frm_seq_region, $frm_start, $frm_end, $frm_strand, $frm_cs, + $fastmap) = @_; + + my $mapper = $self->{'mapper'}; + my $toplevel_cs = $self->{'toplevel_cs'}; + my $other_cs = $self->{'other_cs'}; + my $adaptor = $self->{'adaptor'}; + + if($frm_cs->is_top_level()) { + throw("The toplevel CoordSystem can only be mapped TO, not FROM."); + } + if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { + throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . + " is neither the assembled nor the component coordinate system " . + " of this AssemblyMapper"); + } + + my $coord_systems = $self->{'coord_systems'}; + + my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); + + # + # TBD try to make this more efficient + # + my $from_rank = $other_cs->rank(); + foreach my $cs (@$coord_systems) { + last if($cs->rank >= $from_rank); + + #check if a mapping path even exists to this coordinate system + my @mapping_path = $csa->get_mapping_path($cs,$other_cs); + + if(@mapping_path) { + + # Try to map to this coord system. If we get back any coordinates then + # it is our 'toplevel' that we were looking for + my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); + + if($fastmap) { + my @result = $mapper->fastmap($frm_seq_region, $frm_start, $frm_end, + $frm_strand, $frm_cs); + return @result if(@result); + } else { + my @coords = $mapper->map($frm_seq_region, $frm_start, $frm_end, + $frm_strand, $frm_cs); + + if(@coords > 1 || !$coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { + return @coords; + } + } + } + } + + # the toplevel coordinate system for the region requested *is* the + # requested region. + if($fastmap) { + return ($frm_seq_region,$frm_start, $frm_end, $frm_strand, $other_cs); + } + return Bio::EnsEMBL::Mapper::Coordinate->new + ($frm_seq_region,$frm_start,$frm_end, $frm_strand, $other_cs); +} + +# +# for polymorphism with AssemblyMapper +# +sub flush {} + +sub fastmap { + my $self = shift; + return $self->map(@_,1); +} + +sub assembled_CoordSystem { + my $self = shift; + return $self->{'toplevel_cs'}; +} + +sub component_CoordSystem { + my $self = shift; + return $self->{'other_cs'}; +} + + +sub _list { + my($self, $frm_seq_region, $frm_start, $frm_end, $frm_cs, $seq_regions) = @_; + + my $mapper = $self->{'mapper'}; + my $toplevel_cs = $self->{'toplevel_cs'}; + my $other_cs = $self->{'other_cs'}; + my $adaptor = $self->{'adaptor'}; + + if($frm_cs->is_top_level()) { + throw("The toplevel CoordSystem can only be mapped TO, not FROM."); + } + if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { + throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . + " is neither the assembled nor the component coordinate system " . + " of this AssemblyMapper"); + } + + my $coord_systems = $self->{'coord_systems'}; + my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); + + # + # TBD try to make this more efficient + # + my $from_rank = $other_cs->rank(); + foreach my $cs (@$coord_systems) { + last if($cs->rank >= $from_rank); + + #check if a mapping path even exists to this coordinate system + my @mapping_path = $csa->get_mapping_path($cs, $other_cs); + + if(@mapping_path) { + + # Try to map to this coord system. If we get back any coordinates then + # it is our 'toplevel' that we were looking for + my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); + + my @result; + + if($seq_regions) { + @result = $mapper->list_seq_regions($frm_seq_region, $frm_start, + $frm_end, $frm_cs); + } else { + @result = $mapper->list_ids($frm_seq_region, $frm_start, + $frm_end, $frm_cs); + } + + return @result if(@result); + } + } + + # the toplevel coordinate system for the region requested *is* the + return ($frm_seq_region); + + + # requested region. + if($seq_regions) { + return ($frm_seq_region); + } + + #this seems a bit silly and inefficient, but it is probably never + #called anyway. + my $slice_adaptor = $adaptor->db()->get_SliceAdaptor(); + my $slice = $slice_adaptor->fetch_by_region($other_cs->name(), + $frm_seq_region, + undef,undef,undef,$other_cs); + return ($slice_adaptor->get_seq_region_id($slice)); +} + + + +=head2 list_seq_regions + + Arg [1] : string $frm_seq_region + The name of the sequence region of interest + Arg [2] : int $frm_start + The start of the region of interest + Arg [3] : int $frm_end + The end of the region to transform of interest + Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs + The coordinate system to obtain overlapping ids of + Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$ctg_cs)) {...} + Description: Retrieves a list of overlapping seq_region names + of another coordinate system. This is the same as the + list_ids method but uses seq_region names rather internal ids + Returntype : List of strings + Exceptions : none + Caller : general + +=cut + +sub list_seq_regions { + throw('Incorrect number of arguments.') if(@_ != 5); + return _list(@_,1); +} + + +=head2 list_ids + + Arg [1] : string $frm_seq_region + The name of the sequence region of interest. + Arg [2] : int $frm_start + The start of the region of interest + Arg [3] : int $frm_end + The end of the region to transform of interest + Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs + The coordinate system to obtain overlapping ids of + Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$chr_cs)) {...} + Description: Retrieves a list of overlapping seq_region internal identifiers + of another coordinate system. This is the same as the + list_seq_regions method but uses internal identfiers rather + than seq_region strings + Returntype : List of ints + Exceptions : thrown if the from CoordSystem is the toplevel coord system + thrown if the from CoordSystem is not the one used in the mapper + Caller : general + +=cut + +sub list_ids { + throw('Incorrect number of arguments.') if(@_ != 5); + return _list(@_,0); +} + + + + + +1; -- GitLab