From 5967f6708a3ffa0f09eac14e19e6d34f841b6b03 Mon Sep 17 00:00:00 2001
From: Kieron Taylor <ktaylor@ebi.ac.uk>
Date: Fri, 24 Feb 2012 16:21:39 +0000
Subject: [PATCH] Added more groupings for use in FASTA dumping.

---
 modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm | 31 +++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm b/modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm
index 0333e1dc80..a91c1b234b 100644
--- a/modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm
+++ b/modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm
@@ -118,8 +118,37 @@ my %feature_so_mapping = (
 	'Bio::EnsEMBL::Funcgen::RegulatoryFeature' => 'SO:0001679', # transcription_regulatory_region
 );
 
+my %grouping_of_biotypes = (   # group name => array ref of the biotype names in that group
+    # Genebuilder/Havana categorisation: protein_coding, pseudogene, long and short non-coding.
+    'protein_coding' => [qw( protein_coding polymorphic_pseudogene   )],
+    'pseudogene'     => [qw( pseudogene retrotransposed )],
+    'long_noncoding' => [qw( 3prime_overlapping_ncrna antisense lincRNA ncrna_host non_coding 
+                            processed_transcript sense_intronic sense_overlapping
+                        )],
+    'short_noncoding'=> [qw( miRNA miRNA_pseudogene misc_RNA misc_RNA_pseudogene Mt_tRNA 
+                            Mt_tRNA_pseudogene rRNA rRNA_pseudogene scRNA_pseudogene snoRNA
+                            snoRNA_pseudogene snRNA snRNA_pseudogene tRNA_pseudogene
+                        )], # NOTE(review): no Mt_rRNA or tRNA here, though 'ncRNA' below lists both - confirm intentional
+    # Practical Ensembl core categories for FASTA dumping; groups overlap, so one biotype may appear in several.
+    'cDNA'              => [qw( protein_coding polymorphic_pseudogene IG_V_gene TR_V_gene 
+                                IG_J_gene TR_J_gene IG_D_gene IG_C_gene TR_C_gene pseudogene
+                                retrotransposed IG_V_pseudogene TR_V_pseudogene 
+                                IG_J_pseudogene IG_C_pseudogene
+                                
+                           )],
+    'peptide_producing' => [qw( protein_coding polymorphic_pseudogene IG_V_gene TR_V_gene 
+                                IG_J_gene TR_J_gene IG_D_gene IG_C_gene TR_C_gene  
+                           )], # same as 'cDNA' minus pseudogene, retrotransposed and the IG_*/TR_* pseudogene entries
+    'ncRNA'             => [qw( ncRNA miRNA miRNA_pseudogene misc_RNA misc_RNA_pseudogene Mt_tRNA 
+                            Mt_tRNA_pseudogene Mt_rRNA rRNA rRNA_pseudogene scRNA_pseudogene 
+                            snoRNA snoRNA_pseudogene snRNA snRNA_pseudogene tRNA_pseudogene
+                            3prime_overlapping_ncrna antisense lincRNA ncrna_host non_coding 
+                            processed_transcript sense_intronic sense_overlapping tRNA
+                            )], # every 'short_noncoding' and 'long_noncoding' entry, plus ncRNA, Mt_rRNA and tRNA
+);
+
 my %biotype_grouping = (
-    'protein-coding' => 'protein_coding',
+    'protein_coding' => 'protein_coding',
     'polymorphic_pseudogene' => 'protein_coding',
     'pseudogene' => 'pseudogene',
     'retrotransposed' => 'pseudogene',
-- 
GitLab