diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm index eba95aabb5fda0e9387ff03302b4cab64ed4c164..a2cb721ac92ae3b6cc165aee583efa5b730c9044 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm @@ -301,17 +301,109 @@ sub create_xrefs { @all_lines = split /\n/, $description_and_rest; # extract ^DE lines only & build cumulative description string - my $description; + my $description = " "; + my $name = ""; + my $flags = " "; + + my $mode = ""; + foreach my $line (@all_lines) { - my ($description_only) = $line =~ /^DE\s+(.+)/; - $description .= $description_only if ($description_only); - $description .= " "; + + next if(!($line =~ /^DE/)); + + + # Set up the mode first + if($line =~ /^DE RecName:/){ + if($mode eq "RecName"){ + $description .= ";"; + } + $mode = "RecName"; + } + elsif($line =~ /^DE SubName:/){ + if($mode eq "RecName"){ + $description .= ";"; + } + $mode = "RecName"; + } + elsif($line =~ /^DE AltName:/){ + $mode = "AltName"; + } + elsif($line =~ /^DE Contains:/){ + if($mode eq "Contains"){ + $description .= ";"; + } + elsif($mode eq "Includes"){ + $description .= "][Contains "; + } + else{ + $description .= " [Contains "; + } + $mode = "Contains"; + next; + } + elsif($line =~ /^DE Includes:/){ + if($mode eq "Includes"){ + $description .= ";"; + } + elsif($mode eq "Contains"){ + $description .= "][Includess"; + } + else{ + $description .= " [Includes "; + } + $mode = "Includes"; + next; + } + elsif($line =~ /^DE Flags: (.*);/){ + $flags .= "$1 "; + next; + } + + + # now get the data + if($line =~ /^DE RecName: Full=(.*);/){ + $name .= $1; + } + elsif($line =~ /RecName: Full=(.*);/){ + $description .= $1; + } + elsif($line =~ /SubName: Full=(.*);/){ + $name .= $1; + } + elsif($line =~ /AltName: Full=(.*);/){ + $description .= "(".$1.")"; + } + elsif($line =~ /Short=(.*);/){ + $description .= "(".$1.")"; + } + elsif($line =~ /EC=(.*);/){ + $description .= "(EC ".$1.")"; + } + elsif($line =~ /Allergen=(.*);/){ + $description .= "(Allergen ".$1.")"; + } + elsif($line =~ /INN=(.*);/){ + $description .= "(".$1.")"; + } + elsif($line =~ /Biotech=(.*);/){ + $description .= "(".$1.")"; + } + elsif($line =~ /CD_antigen=(.*);/){ + $description .= "(".$1." antigen)"; + } + else{ + print STDERR "unable to process *$line* for $acc\n"; + } + + } + if($mode eq "Contains" or $mode eq "Includes"){ + $description .= "]"; } $description =~ s/^\s*//g; $description =~ s/\s*$//g; - $xref->{DESCRIPTION} = $description; + $xref->{DESCRIPTION} = $name.$flags.$description; # extract sequence my ($seq) = $_ =~ /SQ\s+(.+)/s; # /s allows . to match newline diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtParser_descriptions_only.pm b/misc-scripts/xref_mapping/XrefParser/UniProtParser_descriptions_only.pm index fe5e8cd13d561392cc9bbeda2ba8040250db230a..cf3ef2680feef2a47ac9ada98b4779916093ae0d 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniProtParser_descriptions_only.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniProtParser_descriptions_only.pm @@ -280,18 +280,105 @@ sub create_xrefs { @all_lines = split /\n/, $description_and_rest; # extract ^DE lines only & build cumulative description string - my $description; + my $description = " "; + my $name = ""; + my $flags = " "; + + my $mode = ""; foreach my $line (@all_lines) { - my ($description_only) = $line =~ /^DE\s+(.+)/; - $description .= $description_only if ($description_only); - $description .= " "; + + next if(!($line =~ /^DE/)); + + # Set up the mode first + if($line =~ /^DE RecName:/){ + if($mode eq "RecName"){ + $description .= "; "; + } + $mode = "RecName"; + } + elsif($line =~ /^DE SubName:/){ + if($mode eq "RecName"){ + $description .= "; "; + } + $mode = "RecName"; + } + elsif($line =~ /^DE AltName:/){ + $mode = "AltName"; + } + elsif($line =~ /^DE Contains:/){ + if($mode eq "Contains"){ + $description .= "; "; + } + elsif($mode eq "Includes"){ + $description .= "][Contains "; + } + else{ + $description .= " [Contains "; + } + $mode = "Contains"; + next; + } + elsif($line =~ /^DE Includes:/){ + if($mode eq "Includes"){ + $description .= "; "; + } + elsif($mode eq "Contains"){ + $description .= "][Includess"; + } + else{ + $description .= " [Includes "; + } + $mode = "Includes"; + next; + } + elsif($line =~ /^DE Flags: (.*);/){ + $flags .= "$1 "; + next; + } + + # now get the data + if($line =~ /^DE RecName: Full=(.*);/){ + $name .= $1; + } + elsif($line =~ /RecName: Full=(.*);/){ + $description .= $1; + } + elsif($line =~ /SubName: Full=(.*);/){ + $name .= $1; + } + elsif($line =~ /AltName: Full=(.*);/){ + $description .= "(".$1.")"; + } + elsif($line =~ /Short=(.*);/){ + $description .= "(".$1.")"; + } + elsif($line =~ /EC=(.*);/){ + $description .= "(EC ".$1.")"; + } + elsif($line =~ /Allergen=(.*);/){ + $description .= "(Allergen ".$1.")"; + } + elsif($line =~ /INN=(.*);/){ + $description .= "(".$1.")"; + } + elsif($line =~ /Biotech=(.*);/){ + $description .= "(".$1.")"; + } + elsif($line =~ /CD_antigen=(.*);/){ + $description .= "(".$1." antigen)"; + } + else{ + print STDERR "unable to process *$line* for $acc\n"; + } + } + if($mode eq "Contains" or $mode eq "Includes"){ + $description .= "]"; } $description =~ s/^\s*//g; $description =~ s/\s*$//g; - $xref->{DESCRIPTION} = $description; - + $xref->{DESCRIPTION} = $name.$flags.$description; push @xrefs, $xref;