Merge branch 'master' of https://github.com/Ensembl/ensembl

61334766 · Magali Ruffier · 26de24c9 · fd51dd09 · 61334766 · 26de24c9
Commit 61334766 authored 11 years ago by Magali Ruffier
--- a/misc-scripts/assembly/exon_conservation_check.pl
+++ b/misc-scripts/assembly/exon_conservation_check.pl
@@ -28,8 +28,6 @@ use lib "$Bin/../../../ensembl-analysis/modules";
 require AssemblyMapper::Support;
 use Bio::EnsEMBL::Utils::Exception qw( throw );
 use Pod::Usage;
-use Bio::EnsEMBL::DBSQL::OntologyDBAdaptor;
-use Bio::EnsEMBL::Utils::BiotypeMapper;

 #Genebuilder utils
 require Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils;
@@ -51,11 +49,9 @@ unless ($support->parse_arguments(@_)) {
 }
 $support->connect_dbs;

-my $onto_db_adaptor = Bio::EnsEMBL::DBSQL::OntologyDBAdaptor->new(
-  -DBNAME => $support->ref_dba->dbc->dbname,
-  -DBCONN => $support->ref_dba->dbc,
-);
-my $biotype_mapper = new Bio::EnsEMBL::Utils::BiotypeMapper($onto_db_adaptor);
+my $biotype_manager = 
+  Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'production')->get_biotype_manager();
+  

 $support->log_stamped("Beginning analysis.\n");
 $support->log("EXON KEY       : !! = Very bad (pc mismatch), %% = Somewhat bad (mismatch), ?? = No mapping, might be bad, && = eval error\n");
@@ -149,18 +145,11 @@ sub exon {
      # Determine severity of the problem
      $difference = diff(\$old_seq, \$projected_seq);

-      my $group_list = $biotype_mapper->belongs_to_groups($parent_gene->biotype);
+      my $group_list = $biotype_manager->belongs_to_groups($parent_gene);
      foreach my $group (@$group_list) {
-        if ($group eq 'protein_coding') {
-          $state = '!!';
-          last;
-        }
-      }
-      if (!$state) {
-
-        # Middle badness.
-        $state = '%%';
+	$state = '!!' and last if $group eq 'protein_coding'
      }
+      $state = '%%' unless $state; # Middle badness
    }

    $location = sprintf('%d : %d', $projected_exon->start(), $projected_exon->end());
@@ -220,13 +209,10 @@ sub transcript {
  if ($projected_transcript) {

    #Check if it was protein coding
-    my $group_list = $biotype_mapper->belongs_to_groups($projected_transcript->biotype);
+    my $group_list = $biotype_manager->belongs_to_groups($projected_transcript);
    my $is_pc      = 0;
    foreach my $group (@$group_list) {
-      if ($group eq 'protein_coding') {
-        $is_pc = 1;
-        last;
-      }
+      $is_pc = 1 and last if $group eq 'protein_coding';
    }

    #Now check for protein sequence mis-match

--- a/modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm
+++ b/modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm
-=head1 LICENSE
-
-Copyright [1999-2014] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
-=cut
-
-=pod
-
-
-=head1 NAME
-
-BiotypeMapper - Translates EnsEMBL biotypes into Sequence Ontology terms and back
-
-=head1 AUTHOR
-
-Kieron Taylor, 2011 - ktaylor@ebi.ac.uk
-
-=head1 SYNOPSIS
-
-use Bio::EnsEMBL::Utils::BiotypeMapper
-
-my $ontology_adaptor = $registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' );
-my $biotype_mapper = new BiotypeMapper($ontology_adaptor);
-
-print $biotype_mapper->translate_feature_to_SO_term($feature);
-
-my $list_of_biotypes = $biotype_mapper->biotypes_belonging_to_group('protein-coding');
-
-=head1 DESCRIPTION
-
-BiotypeMapper provides a series of nearest matches between EnsEMBL biotypes and
-the Sequence Ontology (http://www.sequenceontology.org). In addition, biotypes
-are members of groupings, such as "short non-coding". This allows one to
-conveniently select all the biotypes of a certain kind. 
-
-SO Mappings are imperfect due to the inexact correspondance of biotypes to 
-several SO terms. The a best guess has been chosen in each case.
-
-Reverse mappings from SO to biotype are vague, due to many-to-one relationships.
-In this case a list of possible terms is given. 
-
-=cut
-
-package Bio::EnsEMBL::Utils::BiotypeMapper;
-
-use strict;
-use warnings;
-
-use Bio::EnsEMBL::Utils::Exception;
-use Bio::EnsEMBL::Utils::Cache;
-
-my %gene_so_mapping = (
-    'protein_coding' 		=> 'SO:0001217', # protein_coding_gene
-    'pseudogene' 			=> 'SO:0000336', # pseudogene
-    'processed_transcript' 	=> 'SO:0001503', # processed_transcript
-    'lincRNA' 				=> 'SO:0001641', # lincRNA_gene
-    'polymorphic_pseudogene'=> 'SO:0000336', # pseudogene
-    'Mt_tRNA' 				=> 'SO:0000088', # mt_gene
-    'IG_D_gene' 			=> 'SO:0000510', # D_gene
-    'snoRNA' 				=> 'SO:0001267', #snoRNA_gene
-    'misc_RNA' 				=> 'SO:0000356', #RNA
-    'miRNA' 				=> 'SO:0001265', #miRNA_gene
-    'rRNA' 					=> 'SO:0001637', #rRNA_gene
-    'snRNA'					=> 'SO:0001268', #snRNA_gene
-    'snRNA_pseudogene'		=> 'SO:0000336', # pseudogene
-    'tRNA_pseudogene'		=> 'SO:0000778', # pseudogenic_tRNA
-    'rRNA_pseudogene'		=> 'SO:0000777', # pseudogenic_rRNA
-    'TR_J_gene'				=> 'SO:0000470', # J_gene
-    'TR_V_gene'				=> 'SO:0000466', # V_gene
-    'TR_C_gene'				=> 'SO:0000478', # C_gene
-    'ncRNA'					=> 'SO:0001263', # ncRNA_gene
-    'tRNA'					=> 'SO:0001272', # tRNA_gene
-    'retrotransposed'		=> 'SO:0000569', # retrotransposed
-## heavily abbreviated
-     );
-
-my %transcript_so_mapping = (
-    'processed_transcript' 				=> 'SO:0001503', # processed_transcript
-    'nonsense_mediated_decay' 			=> 'SO:0001621', # NMD_transcript_variant
-    'retained_intron' 					=> 'SO:0000681', # aberrant_processed_transcript
-    'transcribed_unprocessed_pseudogene'=> 'SO:0000516', # pseudogenic_transcript
-    'processed_pseudogene' 				=> 'SO:0000043', # processed_pseudogene
-    'unprocessed_pseudogene' 			=> 'SO:0000336', # pseudogene
-    'unitary_pseudogene'				=> 'SO:0000336',
-    'pseudogene' 						=> 'SO:0000336', # pseudogene
-    'transcribed_processed_pseudogene'	=> 'SO:0000043', 
-    'retrotransposed' 					=> 'SO:0000569', #retrotransposed
-    'ncrna_host' 						=> 'SO:0000483',
-    'polymorphic_pseudogene'			=> 'SO:0000336',
-    'lincRNA'							=> 'SO:0001463',
-    'ncrna_host'						=> 'SO:0000483',
-    '3prime_overlapping_ncrna'			=> 'SO:0000483',
-    'TR_V_gene'							=> 'SO:0000466',
-    'TR_V_pseudogene'					=> 'SO:0000336',
-
-    'TR_J_gene'							=> 'SO:0000470',
-    'IG_C_gene'							=> 'SO:0000478',
-    'IG_C_pseudogene'					=> 'SO:0000336',
-    'TR_C_gene'							=> 'SO:0000478',
-    'IG_J_pseudogene'					=> 'SO:0000336',
-    'miRNA'								=> 'SO:0000276', #miRNA
-    'miRNA_pseudogene'					=> 'SO:0000336',
-    'disrupted_domain' 					=> 'SO:0000681', # aberrant_processed_transcript
-    'rRNA' 								=> 'SO:0000252', #rRNA
-    'rRNA_pseudogene'					=> 'SO:0000777', 
-    'scRNA_pseudogene'					=> 'SO:0000336',
-    'snoRNA' 							=> 'SO:0000275', # snoRNA
-    'snoRNA_pseudogene'					=> 'SO:0000336',
-    'snRNA'								=> 'SO:0000274', # snRNA
-    'snRNA_pseudogene'					=> 'SO:0000336', 
-    );
-
-my %feature_so_mapping = (
-    'Bio::EnsEMBL::Feature' => 'SO:0000001', # region
-    'Bio::EnsEMBL::Gene' => 'SO:0000704', # gene
-    'Bio::EnsEMBL::Transcript' => 'SO:0000673', # transcript
-    'Bio::EnsEMBL::Exon' => 'SO:0000147', # exon
-    'Bio::EnsEMBL::Slice' => 'SO:0000001', # region
-    'Bio::EnsEMBL::SimpleFeature' => 'SO:0001411', # biological_region
-    'Bio::EnsEMBL::MiscFeature' => 'SO:0001411', # biological_region
-    'Bio::EnsEMBL::RepeatFeature' => 'SO:0000657', # repeat region
-    'Bio::EnsEMBL::Variation::VariationFeature' => 'SO:0001060', # sequence variant
-    'Bio::EnsEMBL::Variation::StructuralVariationFeature' => 'SO:0001537', # structural variant
-    'Bio::EnsEMBL::Compara::ConstrainedElement' => 'SO:0001009', #DNA_constraint_sequence ????
-    'Bio::EnsEMBL::Funcgen::RegulatoryFeature' => 'SO:0005836', # regulatory_region
-);
-
-my %grouping_of_biotypes = (
-    # Genebuilder/Havana categorisation
-    'protein_coding' => [qw( protein_coding polymorphic_pseudogene   )],
-    'pseudogene'     => [qw( pseudogene retrotransposed snRNA_pseudogene tRNA_pseudogene
-                            miRNA_pseudogene Mt_tRNA_pseudogene rRNA_pseudogene
-                            scRNA_pseudogene misc_RNA_pseudogene snoRNA_pseudogene
-                            processed_pseudogene
-                        )],
-    'long_noncoding' => [qw( 3prime_overlapping_ncrna antisense lincRNA ncrna_host non_coding 
-                            processed_transcript sense_intronic sense_overlapping
-                        )],
-    'short_noncoding'=> [qw( miRNA misc_RNA  Mt_tRNA 
-                             rRNA snoRNA  snRNA 
-                        )],
-
-    # practical Ensembl core categories for fasta dumping
-    # protein coding/pseudogene
-    'cdna'              => [qw( protein_coding polymorphic_pseudogene IG_V_gene TR_V_gene 
-                                IG_J_gene TR_J_gene IG_D_gene IG_C_gene TR_C_gene pseudogene
-                                retrotransposed IG_V_pseudogene TR_V_pseudogene 
-                                IG_J_pseudogene IG_C_pseudogene processed_transcript
-                                antisense ambiguous_orf transcribed_processed_pseudogene
-                                disrupted_domain processed_pseudogene
-                           )],
-
-    # protein coding transcript
-    # protein coding
-    'peptide_producing' => [qw( protein_coding polymorphic_pseudogene IG_V_gene TR_V_gene 
-                                IG_J_gene TR_J_gene IG_D_gene IG_C_gene TR_C_gene IG_LV_gene
-                                nonsense_mediated_decay
-                           )],
-
-    # short/long noncoding
-    'ncrna'             => [qw( ncRNA miRNA miRNA_pseudogene misc_RNA misc_RNA_pseudogene Mt_tRNA 
-                            Mt_tRNA_pseudogene Mt_rRNA rRNA rRNA_pseudogene scRNA_pseudogene 
-                            snoRNA snoRNA_pseudogene snRNA snRNA_pseudogene tRNA_pseudogene
-                            3prime_overlapping_ncrna lincRNA ncrna_host non_coding 
-                             sense_intronic sense_overlapping tRNA
-                            )],
-);
-
-=head2 new
-
-    Constructor
-    Arg [1]    : OntologyAdaptor from the EnsEMBL registry
-    Returntype : Bio::EnsEMBL::BiotypeMapper
-
-=cut
-
-sub new {
-    my $class = shift;
-    my $self = { 	
-        ontology_adaptor => shift,
-    };
-    
-    tie my %cache, 'Bio::EnsEMBL::Utils::Cache', 100;
-    $self->{cache} = \%cache;
-
-    bless $self, $class;
-    return $self;
-}
-
-=head2 translate_feature_to_SO_term
-
-    Arg [0]    : Bio::EnsEMBL::Feature, subclass or related Storable
-    Description: Translates a Feature type into an SO term. If the Feature is a
-                 Gene or Transcript, then a further refinement of the type is made
-                 via Biotype
-    Returntype : String
-
-=cut
-
-sub translate_feature_to_SO_term {
-    my $self = shift;
-    my $feature = shift;
-    my $so_accession;
-    my $so_name;
-    my $ref = ref($feature);
-    if ($feature->isa('Bio::EnsEMBL::Gene') && exists $gene_so_mapping{$feature->biotype}) {
-        $so_accession = $gene_so_mapping{$feature->biotype};
-    }
-    elsif ($feature->isa('Bio::EnsEMBL::Transcript') && exists $transcript_so_mapping{$feature->biotype}) {
-	   $so_accession = $transcript_so_mapping{$feature->biotype};
-    }
-    elsif ($feature->isa('Bio::EnsEMBL::Variation::BaseVariationFeature')) {
-	   $so_name = $feature->class_SO_term();
-    }
-    
-    if (! $so_accession && ! $so_name && exists $feature_so_mapping{$ref}) {
-      $so_accession = $feature_so_mapping{$ref};
-    }
-    else {
-        if($feature->can('SO_term')) {
-		    $so_accession = $feature->SO_term();
-        }
-    }
-
-    if ($so_accession) {
-       $so_name = $self->fetch_SO_name_by_accession($so_accession);
-    }
-    
-    throw ("Ontology mapping not found for ".ref($feature)) unless $so_name;
-    
-    return $so_name;
-}
-
-=head2 fetch_SO_name_by_accession
-
-  Arg [0]    : Sequence Ontology accession
-  Description: Returns the name linked to the given accession. These are
-               internally cached for speed.
-  Returntype : The name of the given accession.
-
-=cut
-
-sub fetch_SO_name_by_accession {
-  my ($self, $so_accession) = @_;
-  my $so_name = $self->{cache}->{$so_accession};
-  if(!$so_name) {
-    my $so_term = $self->{'ontology_adaptor'}->fetch_by_accession($so_accession);
-    $so_name = $so_term->name();
-    $self->{cache}->{$so_accession} = $so_name;
-  }
-  return $so_name;
-}
-
-
-=head2 translate_SO_to_biotype
-
-    Arg [0]    : Sequence Ontology term, either in name or URI format
-    Description: Returns the closest corresponding Ensembl biotypes to a given SO term
-    Returntype : Listref Array of Strings containing possible biotypes
-=cut
-
-sub translate_SO_to_biotype {
-	my $self = shift;
-    my $translate_me = shift;
-
-    my @so_names;
-# look up text in ontology database
-    if ($translate_me !~ /^SO:/) {
-        my $so_terms = $self->{'ontology_adaptor'}->fetch_all_by_name($translate_me);
-        @so_names = [];
-        foreach my $term (@{$so_terms}) {
-            push @so_names,$term->accession();
-        }
-    } else {
-        push @so_names,$translate_me;
-    }
-# convert list of accessions into biotypes
-    my @biotypes;
-    foreach my $accession (@so_names) {
-        foreach	my $key (keys %gene_so_mapping) {
-            if ($gene_so_mapping{$key} eq $accession) {
-                push @biotypes,$key;
-            }
-        }
-        foreach my $key (keys %transcript_so_mapping) {
-            if ($transcript_so_mapping{$key} eq $accession) {
-                push @biotypes,$key;
-            }
-        }
-        foreach my $key (keys %feature_so_mapping) {
-            if ($feature_so_mapping{$key} eq $accession) {
-                push @biotypes,$key;
-            }
-        }
-    }
-
-    return \@biotypes;
-}
-
-=head2 belongs_to_groups
-
-    Arg [0]    : Biotype (string)
-    Description: Returns the group names that include the given biotype
-    Returntype : Listref of strings
-=cut
-
-sub belongs_to_groups {
-    my $self = shift;
-    my $member = shift;
-    my @belongs_to;
-    foreach my $group (keys %grouping_of_biotypes) {
-        my $lc_group = lc($group);
-        foreach my $biotype ( @{ $grouping_of_biotypes{$group} }) {
-            if ($biotype eq $member) {
-                push(@belongs_to, $lc_group);
-            }
-        }
-    }
-    return \@belongs_to;
-}
-
-=head2 group_members
-
-    Arg [0]    : Biotype group name (string)
-    Description: Returns a list of biotypes that belong in the group.
-    Returntype : Listref of strings
-=cut
-
-sub group_members {
-    my $self = shift;
-    my $group = lc(shift);
-    if (exists($grouping_of_biotypes{$group})) {
-        my @biotypes = @{ $grouping_of_biotypes{$group} };
-        return \@biotypes;
-    }
-    else {
-        throw ("$group is not a valid group name for biotypes");
-    }
-}
-
-=head2 member_of_group
-
-    Arg [0]    : Biotype (string)
-    Arg [1]    : Group to check (string) 
-    Description: Returns true if a biotype is present in a group
-    Returntype : Boolean
-=cut
-
-sub member_of_group {
-    my $self = shift;
-    my $biotype = shift;
-    my $query_group = lc(shift);
-    my @groups = @{ $self->belongs_to_groups($biotype) };
-    while (my $group = shift @groups) {
-        my $lc_group = lc($group);
-        if ($lc_group eq $query_group) {
-            return 1;   
-        }
-    }
-    return 0;
-}
-
-1;
--- a/modules/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm
+++ b/modules/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm
@@ -77,7 +77,7 @@ sub new {
    };
    bless $self, $class;
    if ( ! check_ref($self->{'ontology_adaptor'}, "Bio::EnsEMBL::DBSQL::OntologyTermAdaptor" )) {
-        throw("GFF format requires an instance of Bio::EnsEMBL::DBSQL::OntologyTermAdaptor to function. See also Bio::EnsEMBL::Utils::BiotypeMapper");        
+        throw("GFF format requires an instance of Bio::EnsEMBL::DBSQL::OntologyTermAdaptor to function.");        
    }
    $self->{'mapper'} = Bio::EnsEMBL::Utils::SequenceOntologyMapper->new($self->{'ontology_adaptor'});