From e27dabf9cae2fbfb5824a4f93c42c68a3753cc24 Mon Sep 17 00:00:00 2001 From: Ian Longden <ianl@sanger.ac.uk> Date: Thu, 28 Jan 2010 09:50:50 +0000 Subject: [PATCH] Parser to add direct mapping fro swissprot entrys --- .../XrefParser/UniProtDirectParser.pm | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 misc-scripts/xref_mapping/XrefParser/UniProtDirectParser.pm diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtDirectParser.pm b/misc-scripts/xref_mapping/XrefParser/UniProtDirectParser.pm new file mode 100644 index 0000000000..1dbd78a69f --- /dev/null +++ b/misc-scripts/xref_mapping/XrefParser/UniProtDirectParser.pm @@ -0,0 +1,141 @@ +package XrefParser::UniProtDirectParser; + +use strict; + +use DBI; + +use base qw( XrefParser::BaseParser ); + +# Parse file of Uniprot records and assign direct xrefs +# All assumed to be linked to translation + +my $verbose; +# -------------------------------------------------------------------------------- +# Parse command line and run if being run directly + +if (!defined(caller())) { + + if (scalar(@ARGV) != 1) { + print "\nUsage: RefSeqParser.pm file.SPC <source_id> <species_id>\n\n"; + exit(1); + } + + run($ARGV[0], -1); + +} + +# -------------------------------------------------------------------------------- + +sub run { + + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $rel_file = shift; + $verbose = shift; + + my %prefix = (9606 => "ENSP0", 10090 => "ENSMUSP0", 10116 => "ENSRNOP0"); + + if(!defined($prefix{$species_id})){ + print "No prefix known for this species $species_id???\n"; + return 1; + } + + my $filename = @{$files}[0]; + + my $file_io = $self->get_filehandle($filename); + if ( !defined($file_io) ) { + return 1; + } + + my $parsed_count = 0; + + + my %prot2ensembl; + + my $count = 0; + while ( defined( my $line = $file_io->getline() ) ) { + my ($prot, $ens) = split /\s+/,$line; + if($ens =~ /$prefix{$species_id}/){ + push @{$prot2ensembl{$prot}}, $ens; + } + } + my $dbi = XrefParser::BaseParser->dbi(); + + my $sw_source_id = XrefParser::BaseParser->get_source_id_for_source_name("uniprot/swissprot","sequence_mapped"); + if($sw_source_id < 1){ + die "Could not find source id for uniprot/swissprot ???\n"; + } + else{ + print "Source_id = $sw_source_id\n"; + } + my $get_desc_sth = $dbi->prepare("select xref_id, version, label, description from xref where source_id = $sw_source_id and accession = ?"); + + + my $get_dependents_sth = $dbi->prepare("select dependent_xref_id, linkage_annotation, linkage_source_id from dependent_xref where master_xref_id = ?"); + + my $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref (master_xref_id,dependent_xref_id,linkage_annotation, linkage_source_id) VALUES (?,?,?,?)"); + + my $err_count; + foreach my $key (keys %prot2ensembl){ + + # + # get the descrptions etc for the uniprot entry + # + $get_desc_sth->execute($key); + my ($old_xref_id, $version, $label, $description); + $get_desc_sth->bind_columns(\$old_xref_id, \$version, \$label, \$description); + $get_desc_sth->fetch; + if(!defined($old_xref_id)){ + print STDERR "Could not find $key in the database\n" if ($err_count <10); + $err_count++; + next; + } + $count++; + + # + # get the dependents + # + my %linkage_anotation=(); + my %linkage_source_id=(); + my ($dependent_xref_id, $linkage_annotation, $linkage_source_id); + $get_dependents_sth->execute($old_xref_id); + $get_dependents_sth->bind_columns(\$dependent_xref_id, \$linkage_annotation, \$linkage_source_id); + while($get_dependents_sth->fetch){ + $linkage_anotation{$dependent_xref_id} = $linkage_annotation; + $linkage_source_id{$dependent_xref_id} = $linkage_source_id; + } + + +# print $key."\t"; + # + # Add the new xref + # + + my $xref_id = XrefParser::BaseParser->add_xref($key, $version, $label, $description, $source_id, $species_id, "DIRECT"); + foreach my $trans (@{$prot2ensembl{$key}}){ + # + #add the direct xref entry + # + + XrefParser::BaseParser->add_direct_xref( $xref_id, $trans, "Translation", ''); +# print ":".$trans; + + # + #add the dependents + # + foreach my $dep (keys %linkage_anotation){ + $add_dependent_xref_sth->execute($xref_id, $dep, $linkage_anotation{$dep}, $linkage_source_id{$dep}); + } + } + } + + + print $count." entrys added\n".$err_count." not found\n"; + return 0; +} + + +1; -- GitLab