UniProtDirectParser.pm 4.94 KB
Newer Older
1 2 3
package XrefParser::UniProtDirectParser;

use strict;
4 5
use warnings;
use Carp;
6 7 8
use DBI;

use base qw( XrefParser::BaseParser );
9
use XrefParser::Database;
10 11 12 13 14 15 16

# Fetch the UniProt-to-Ensembl mapping and assign direct xrefs for the
# UniProt records it lists. All xrefs are assumed to be linked to translations.


# --------------------------------------------------------------------------------

17
# Fetch a UniProt-accession-to-Ensembl-stable-id mapping over HTTP and store
# each mapped UniProt record as a DIRECT xref on the listed translations.
#
# Arguments (single hash reference):
#   source_id  - source id given to the newly created xrefs (mandatory)
#   species_id - species id of the new xrefs; also used as the taxon when the
#                species prefix is looked up in the production database
#                (mandatory)
#   file       - settings string made of comma-terminated "name=>value,"
#                pairs; the names read here are host, port, dbname and wget
#                (mandatory)
#   verbose    - accepted for interface compatibility; not used here
#
# Returns 0 on success and 1 when the mapping cannot be downloaded, the
# production database handle is unavailable, or no species prefix is known.
# Croaks when a mandatory argument is missing, when the uniprot/swissprot
# source id cannot be found, or when a synonym cannot be inserted.
sub run_script {
  my ($self, $ref_arg) = @_;

  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $file       = $ref_arg->{file};

  if ( !defined $source_id or !defined $species_id or !defined $file ) {
    croak "Need to pass source_id, species_id and file as pairs";
  }

  #
  # Pull the connection details and download URL out of the settings string.
  # Every value has to be terminated by a comma to be picked up.
  #
  my $user    = "ensro";
  my %setting = ( wget => q{} );
  foreach my $name (qw(host port dbname wget)) {
    if ( $file =~ /$name[=][>](\S+?)[,]/ ) {
      $setting{$name} = $1;
    }
  }

  # Without a URL there is nothing to download; say so explicitly instead of
  # letting the HTTP client fail on an empty request.
  if ( $setting{wget} eq q{} ) {
    warn "No wget URL found in '$file'\n";
    return 1;
  }

  # LWP::UserAgent is not loaded at file level, so load it before first use.
  require LWP::UserAgent;

  my $ua = LWP::UserAgent->new();
  $ua->timeout(10);
  $ua->env_proxy();

  my $response = $ua->get( $setting{wget} );

  if ( !$response->is_success() ) {
    warn( $response->status_line );
    return 1;
  }

  my $production_db = XrefParser::Database->new({ host   => $setting{host},
                                                  port   => $setting{port},
                                                  user   => $user,
                                                  dbname => $setting{dbname},
                                                  pass   => "" });
  my $prod_dbi = $production_db->dbi();

  if ( !defined $prod_dbi ) {
    return 1;
  }

  # Bind the species id rather than interpolating it into the statement.
  my ($prefix) = $prod_dbi->selectrow_array(
    "SELECT species_prefix FROM species WHERE taxon = ?",
    undef, $species_id );

  if ( !defined $prefix ) {
    print "No prefix known for this species $species_id???\n";
    return 1;
  }

  #
  # Build UniProt accession -> list of Ensembl ids. Each mapping line is
  # split on whitespace into accession and Ensembl id; only ids containing
  # the species prefix are kept. The prefix is matched literally.
  #
  my $prefix_re = qr/\Q$prefix\E/;
  my %prot2ensembl;

  foreach my $line ( split /\n/, $response->content ) {
    my ($prot, $ens) = split /\s+/, $line;
    next if !defined $ens;    # blank line or line with a single column
    if ( $ens =~ $prefix_re ) {
      push @{ $prot2ensembl{$prot} }, $ens;
    }
  }

  my $dbi = $self->dbi();

  my $sw_source_id = $self->get_source_id_for_source_name("uniprot/swissprot","sequence_mapped");
  if ( !defined $sw_source_id or $sw_source_id < 1 ) {
    croak "Could not find source id for uniprot/swissprot ???\n";
  }
  print "Source_id = $sw_source_id\n";

  my $get_desc_sth = $dbi->prepare("select xref_id, version, label, description from xref where source_id = $sw_source_id and accession = ?");

  my $get_dependents_sth = $dbi->prepare("select dependent_xref_id, linkage_annotation, linkage_source_id  from dependent_xref where master_xref_id = ?");

  my $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref (master_xref_id,dependent_xref_id,linkage_annotation, linkage_source_id) VALUES (?,?,?,?)");

  my $get_aliases_sth = $dbi->prepare("select synonym from synonym where xref_id = ?");
  my $add_alias_sth   = $dbi->prepare("INSERT INTO synonym (xref_id, synonym) VALUES (?, ?)");

  my $count     = 0;    # accessions found in the xref table and copied
  my $err_count = 0;    # accessions with no sequence_mapped swissprot xref

  foreach my $key ( keys %prot2ensembl ) {

    #
    # Get the description etc. of the existing uniprot entry.
    # Only the first matching row is used.
    #
    my ($old_xref_id, $version, $label, $description);
    $get_desc_sth->execute($key);
    $get_desc_sth->bind_columns(\$old_xref_id, \$version, \$label, \$description);
    $get_desc_sth->fetch;
    $get_desc_sth->finish;    # any further rows are not needed

    if ( !defined $old_xref_id ) {
      # Only the first ten misses are reported; all of them are counted.
      print "Could not find $key in the database\n" if ($err_count <10);
      $err_count++;
      next;
    }
    $count++;

    #
    # Collect the dependents of the existing xref.
    #
    my %linkage_annotation_for;
    my %linkage_source_id_for;
    my ($dependent_xref_id, $linkage_annotation, $linkage_source_id);
    $get_dependents_sth->execute($old_xref_id);
    $get_dependents_sth->bind_columns(\$dependent_xref_id, \$linkage_annotation, \$linkage_source_id);
    while ( $get_dependents_sth->fetch ) {
      $linkage_annotation_for{$dependent_xref_id} = $linkage_annotation;
      $linkage_source_id_for{$dependent_xref_id}  = $linkage_source_id;
    }

    #
    # Add the new xref under the requested source.
    #
    my $xref_id = $self->add_xref({ acc        => $key,
                                    version    => $version,
                                    label      => $label,
                                    desc       => $description,
                                    source_id  => $source_id,
                                    species_id => $species_id,
                                    info_type  => "DIRECT" });

    #
    # Copy the synonyms of the existing xref onto the new one.
    #
    my $synonym;
    $get_aliases_sth->execute($old_xref_id);
    $get_aliases_sth->bind_columns(\$synonym);
    while ( $get_aliases_sth->fetch() ) {
      $add_alias_sth->execute($xref_id, $synonym) || croak "Could not add synonym for $xref_id, $synonym";
    }

    foreach my $trans ( @{ $prot2ensembl{$key} } ) {
      #
      # Add the direct xref entry.
      #
      $self->add_direct_xref( $xref_id, $trans, "Translation", '');

      #
      # Add the dependents.
      # NOTE(review): this runs once per translation, so an accession mapped
      # to several translations inserts the same dependent rows repeatedly;
      # confirm whether that is intended before changing it.
      #
      foreach my $dep ( keys %linkage_annotation_for ) {
        $add_dependent_xref_sth->execute($xref_id, $dep, $linkage_annotation_for{$dep}, $linkage_source_id_for{$dep});
      }
    }
  }

  print $count." entrys added\n".$err_count." not found\n";
  return 0;
}


1;