From c4c3556a5315f3ee394af831c0719dd240a54ede Mon Sep 17 00:00:00 2001
From: James Allen <jallen@ebi.ac.uk>
Date: Thu, 1 Nov 2012 16:04:30 +0000
Subject: [PATCH] Strip newlines from gene descriptions

---
 .../xref_mapping/XrefParser/FlybaseParser.pm  | 60 ++++++++++---------
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/misc-scripts/xref_mapping/XrefParser/FlybaseParser.pm b/misc-scripts/xref_mapping/XrefParser/FlybaseParser.pm
index 650ead8a75..e7959a70c5 100644
--- a/misc-scripts/xref_mapping/XrefParser/FlybaseParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/FlybaseParser.pm
@@ -13,14 +13,14 @@ my $verbose;
 # The object types we'd like to parse.
 our %object_types = ( gene       => 1,
                       mRNA       => 1,
-											pre_miRNA  => 1,
-                      miRNA      => 1,
-                      ncRNA      => 1,
                       protein    => 1,
                       pseudogene => 1,
+                      miRNA      => 1,
+                      ncRNA      => 1,
+                      pre_miRNA  => 1,
                       rRNA       => 1,
-                      snRNA      => 1,
                       snoRNA     => 1,
+                      snRNA      => 1,
                       tRNA       => 1 );
 
 # This is some statistics from the 5.4 file 'dmel-all-r5.4.gff.gz',
@@ -67,37 +67,36 @@ our %object_types = ( gene       => 1,
 
 # This hash will translate the Dbxref names in the data file into source
 # names known by the Xref system.
-our %source_name_map = ( 'FlyBase'    => 'flybase_annotation_id',
-                         'GB'         => 'EMBL',
-                         'GB_protein' => 'protein_id',
-                         'INTERPRO'   => 'Interpro',
-#                         'UniProt/Swiss-Prot' => 'Uniprot/SWISSPROT',
-#                         'UniProt/TrEMBL'     => 'Uniprot/SPTREMBL',
-                         'bdgpinsituexpr'     => 'BDGP_insitu_expr',
-                         'dedb'               => 'DEDb',
-                         'flygrid'            => 'FlyGrid',
-												 'TF'                 => 'TransFac',
+# It's important to _not_ import the FlyBase UniProt annotations; they're
+# done at the gene level, which means that when we do our UniProt analysis
+# at the translation level, results are shifted to the gene level, which
+# messes up the web display.
+our %source_name_map = ( 'FlyBase'            => 'flybase_annotation_id',
+                         'BIOGRID'            => 'BioGRID',
                          'EPD'                => 'EPD',
-												 'MIR'                => 'miRBase',
-												 'MEROPS'             => 'MEROPS',
-												 'BIOGRID'            => 'BioGRID',
-												 'FlyReactome'        => 'FlyReactome',
-												 'GenomeRNAi_gene'    => 'GenomeRNAi',
-												 'INTERACTIVEFLY'     => 'InteractiveFly',
-												 'MITODROME'          => 'MitoDrome',
-												 'flyexpress'         => 'FlyExpress',
-												 'Rfam'               => 'RFAM',
-			 #'FlyAtlas'           => 'FlyAtlas',
-			 #'GCR'                => 'GPCR',
-			 #'GLEANR'             => 'GLEAN-R',
+                         'flyexpress'         => 'FlyExpress',
+                         'FlyReactome'        => 'FlyReactome',
+                         'GB'                 => 'EMBL',
+                         'GB_protein'         => 'protein_id',
+                         'GenomeRNAi'         => 'GenomeRNAi',
+                         'INTERACTIVEFLY'     => 'InteractiveFly',
+                         'MEROPS'             => 'MEROPS',
+                         'MIR'                => 'miRBase',
+                         'MITODROME'          => 'MitoDrome',
+                         'Rfam'               => 'Rfam',
+                         'TF'                 => 'TransFac',
+                         'INTERPRO'           => 'Interpro',
+                         #'FlyAtlas'           => 'FlyAtlas',
+                         #'UniProt/Swiss-Prot' => 'Uniprot/SWISSPROT',
+                         #'UniProt/TrEMBL'     => 'Uniprot/SPTREMBL',
 		       );
 
 # This is for source_ids that depend on the type of 'ID' of the line.
 our %special_source_name_map = (
                                 'gene' => {
-                                         'Dbxref' => 'FlyBaseCGID_gene',
-                                         'Name'   => 'FlyBaseName_gene',
-                                         'ID'     => 'flybase_gene_id'
+                                   'Dbxref' => 'FlyBaseCGID_gene',
+                                   'Name'   => 'FlyBaseName_gene',
+                                   'ID'     => 'flybase_gene_id'
                                 },
                                 'transcript' => {
                                    'Dbxref' => 'FlyBaseCGID_transcript',
@@ -405,6 +404,9 @@ sub run {
 			# we have to put it back
 			$description =~ s/%2C/,/g; 
 
+			# Embedded newlines wreak havoc further down the line
+			$description =~ s/[\n\r]//gm;
+
       my $xref_id;
 
       if ( exists( $xref_ids{$source_name}{$accession} ) ) {
-- 
GitLab