From b61e8e624e01c799e330be05716ccde47b7a8429 Mon Sep 17 00:00:00 2001
From: Magali Ruffier <mr6@ebi.ac.uk>
Date: Tue, 23 Dec 2014 08:02:04 +0000
Subject: [PATCH] ENSCORESW-1191: regex updated to catch Name and Synonyms
 across multiple lines, with possible evidence codes

---
 .../xref_mapping/XrefParser/UniProtParser.pm  | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm
index e19825cd98..6cd96ca187 100644
--- a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm
@@ -372,20 +372,20 @@ sub create_xrefs {
     #print "Adding " . $xref->{ACCESSION} . " " . $xref->{LABEL} ."\n";
 
     
-    my ($gns) = $_ =~ /(GN\s+Name.+)/; # /s allows . to match newline
+    my ($gns) = $_ =~ /(GN\s+.+)/s;
     my @gn_lines = ();
-    if ( defined $gns ) { @gn_lines = split /\n/, $gns }
+    if ( defined $gns ) { @gn_lines = split /;/, $gns }
   
     # Do not allow the addition of UniProt Gene Name dependent Xrefs
     # if the protein was imported from Ensembl. Otherwise we will
     # re-import previously set symbols
     if(! $ensembl_derived_protein) {
+      my %depe;
       foreach my $gn (@gn_lines){
         my $gene_name = undef;
-        my %depe;
 
-        if($gn =~ /Name=((.*?))[;\s]/){
-          $depe{LABEL} = uc($1);
+        if($gn =~ /Name=((.*?))[;\s]/s){ # /s for multi-line entries ; is the delimiter
+          $depe{LABEL} = $1; # leave name as is, upper/lower case is relevant in gene names
           $depe{ACCESSION} = $self->get_name($xref->{ACCESSION},$depe{LABEL});
           $gene_name = $depe{ACCESSION};
 
@@ -394,13 +394,15 @@ sub create_xrefs {
           $depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
           push @{$xref->{DEPENDENT_XREFS}}, \%depe;
           $dependent_xrefs{"Uniprot_gn"}++;
-          my @syn;
-          if($gn =~ /Synonyms=([^;]+);/){
-            my $syn = $1;
-            $syn =~ s/\s+//g;
-            @syn= split(/,/,$syn);
-            push (@{$depe{"SYNONYMS"}}, @syn);
-          }
+        }
+        my @syn;
+        if($gn =~ /Synonyms=(.*)/s){ # use of /s as synonyms can be across more than one line
+          my $syn = $1;
+          $syn =~ s/{.*}//g;
+          $syn =~ s/\n//g;
+          $syn =~ s/\s+//g;
+          @syn = split(/,/,$syn);
+          push (@{$depe{"SYNONYMS"}}, @syn);
         }
       }
     }
-- 
GitLab