diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm index 12ee73c2d295b7288dcac4105dec7b5fcb4264d9..b53c7eb717712b89ece45934678ffe20e659ab64 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm @@ -35,6 +35,7 @@ if (!defined(caller())) { # -------------------------------------------------------------------------------- + sub run { my $self = shift if (defined(caller(1))); @@ -167,11 +168,7 @@ sub create_xrefs { my $num_sptr_pred = 0; my %dependent_sources = $self->get_dependent_xref_sources(); # name-id hash - -# get from HGNC file -# if(defined($dependent_sources{'HGNC'})){ -# $dependent_sources{'HGNC'} = XrefParser::BaseParser->get_source_id_for_source_name("HGNC","uniprot"); -# } + my %GeneNameSynonym; if(defined($dependent_sources{'MGI'})){ $dependent_sources{'MGI'} = XrefParser::BaseParser->get_source_id_for_source_name("MGI","uniprot"); @@ -293,8 +290,11 @@ sub create_xrefs { foreach my $line (@all_lines) { my ($accessions_only) = $line =~ /^AC\s+(.+)/; push(@accessions, (split /;\s*/, $accessions_only)) if ($accessions_only); + } + + $xref->{INFO_TYPE} = "SEQUENCE_MATCH"; $xref->{ACCESSION} = $accessions[0]; for (my $a=1; $a <= $#accessions; $a++) { @@ -393,6 +393,40 @@ sub create_xrefs { $xref->{SEQUENCE} = $parsed_seq; #print "Adding " . $xref->{ACCESSION} . " " . $xref->{LABEL} ."\n"; + + my ($gns) = $_ =~ /(GN\s+Name.+)/; # /s allows . to match newline + my @gn_lines = (); + if ( defined $gns ) { @gn_lines = split /\n/, $gns } + + foreach my $gn (@gn_lines){ + my $gene_name = undef; + my %depe; + + if($gn =~ /Name=(\S+);/){ + $depe{ACCESSION} = uc($1); + $gene_name = $depe{ACCESSION}; + $depe{SOURCE_NAME} = "Uniprot_genename"; + $depe{SOURCE_ID} = $dependent_sources{"Uniprot_genename"}; + $depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID}; + push @{$xref->{DEPENDENT_XREFS}}, \%depe; + $dependent_xrefs{"Uniprot_genename"}++; + my @syn; + if($gn =~ /Synonyms=([^;]+);/){ + my $syn = $1; + $syn =~ s/\s+//g; + @syn= split(/,/,$syn); + foreach my $ent (@syn){ + $GeneNameSynonym{$gene_name}{uc($ent)} = 1; +# print "$gene_name\t$ent\n"; + } + } + } + } + + my ($deps) = $_ =~ /(DR\s+.+)/s; # /s allows . to match newline + my @dep_lines = (); + if ( defined $deps ) { @dep_lines = split /\n/, $deps } + # dependent xrefs - only store those that are from sources listed in the source table my ($deps) = $_ =~ /(DR\s+.+)/s; # /s allows . to match newline @@ -535,11 +569,31 @@ sub create_xrefs { print "Read $num_sp SwissProt xrefs and $num_sptr SPTrEMBL xrefs from $file\n" if($verbose); print "Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if (($num_sp_pred > 0 || $num_sptr_pred > 0) and $verbose); + +# my $kount=0; + my $genename_source_id = $dependent_sources{"Uniprot_genename"}; + foreach my $namekey (keys %GeneNameSynonym){ + #add xref + my $xref_id = $self->add_xref($namekey,"",$namekey, "", $genename_source_id, $species_id,"DEPENDENT"); +# $kount++; +# print $namekey."\t"; + foreach my $synkey (keys %{$GeneNameSynonym{$namekey}}){ + #add synonyms for xref + $self->add_synonym($xref_id, $synkey); +# print "$synkey, "; + } +# print "\n"; + } + + +# print "$kount gene anmes added\n"; + print "Added the following dependent xrefs:-\n" if($verbose); foreach my $key (keys %dependent_xrefs){ print $key."\t".$dependent_xrefs{$key}."\n" if($verbose); } + return \@xrefs; #TODO - currently include records from other species - filter on OX line?? diff --git a/misc-scripts/xref_mapping/xref_config.ini b/misc-scripts/xref_mapping/xref_config.ini index 46beb344c92fa7c6ef5490715ddeda158355880f..e26091e220b505f8e2b53298b885d984bfd16e04 100644 --- a/misc-scripts/xref_mapping/xref_config.ini +++ b/misc-scripts/xref_mapping/xref_config.ini @@ -2373,6 +2373,17 @@ prio_descr = parser = UniProtParser release_uri = +[source Uniprot_genename] +# Special source used in UniProtParser foir gene names.. +name = Uniprot_genename +download = N +order = 20 +priority = 1 +prio_descr = +parser = UniProtParser +release_uri = +data_uri = + [source Uniprot/SWISSPROT::drosophila_melanogaster] # Used by drosophila_melanogaster name = Uniprot/SWISSPROT