Parser to use existing InterPro data from a core db

cd571715 · James Allen · df82ed70 · cd571715
Commit cd571715 authored 11 years ago by James Allen
--- a/misc-scripts/xref_mapping/XrefParser/InterproFromCoreParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/InterproFromCoreParser.pm
+package XrefParser::InterproFromCoreParser;
+use strict;
+use warnings;
+use Carp;
+use vars qw(@ISA);
+@ISA = qw(XrefParser::BaseParser);
+use Bio::EnsEMBL::Registry;
+sub run_script {
+  my ($self, $ref_arg) = @_;
+  my $source_id    = $ref_arg->{source_id};
+  my $species_id   = $ref_arg->{species_id};
+  my $file         = $ref_arg->{file};
+  my $verbose      = $ref_arg->{verbose};
+  if((!defined $source_id) or (!defined $species_id) or (!defined $file) ){
+    croak "Need to pass source_id, species_id and file as pairs";
+  }
+  $verbose |=0;
+  my $project;
+  if ($file =~ /project[=][>](\S+?)[,]/) {
+    $project = $1;
+  }
+  my $registry = "Bio::EnsEMBL::Registry";
+  if ($project eq 'ensembl') {
+    $registry->load_registry_from_multiple_dbs(
+      {
+        '-host' => 'ens-staging1',
+        '-user' => 'ensro',
+      },
+      {
+        '-host' => 'ens-staging2',
+        '-user' => 'ensro',
+      },
+    );
+  } elsif ($project eq 'ensemblgenomes') {
+    $registry->load_registry_from_multiple_dbs(
+      {
+        '-host' => 'mysql-eg-staging-1.ebi.ac.uk',
+        '-port' => 4160,
+        '-user' => 'ensro',
+      },
+      {
+        '-host' => 'mysql-eg-staging-2.ebi.ac.uk',
+        '-port' => 4275,
+        '-user' => 'ensro',
+      },
+    );
+  } else {
+    die("Missing or unsupported project value. Supported values: ensembl, ensemblgenomes");
+  }
+  my %id2name = $self->species_id2name;
+  my $species_name = $id2name{$species_id}[0];
+  my %interpro = $self->get_core_interpro($registry, $species_name);
+  my $add_interpro_sth = $self->dbi()->prepare(
+    "INSERT IGNORE INTO interpro (interpro, pfam, dbtype) VALUES(?,?,?)"
+  );
+  my $add_xref_sth = $self->dbi()->prepare(
+    "INSERT IGNORE INTO xref ".
+    "(accession, label, description, source_id, species_id, info_type) ".
+    "VALUES(?,?,?,?,?,?)"
+  );
+  # The InterproScan pipeline uses additional sources for Interpro
+  # links (e.g. Gene3D, Panther), so in order to replicate that in the xref
+  # database, remove the restriction on the column contents.
+  $self->dbi()->do("ALTER TABLE interpro MODIFY COLUMN dbtype VARCHAR(25);");
+  foreach my $interpro_id (sort keys %interpro) {
+    foreach my $db_type (sort keys %{$interpro{$interpro_id}}) {
+      foreach my $id (sort keys %{$interpro{$interpro_id}{$db_type}}) {
+        my $added = 
+          $add_xref_sth->execute(
+            $interpro_id,
+            $interpro{$interpro_id}{$db_type}{$id}{'short_name'},
+            $interpro{$interpro_id}{$db_type}{$id}{'name'},
+            $source_id,
+            $species_id,
+            $interpro{$interpro_id}{$db_type}{$id}{'info_type'},
+          );
+        if ( !$added ) {
+          print STDERR "Problem adding '$interpro_id'\n";
+          return 1;    # 1 is an error
+        }
+        $added =
+          $add_interpro_sth->execute(
+            $interpro_id,
+            $id,
+            $db_type
+          );
+        if ( !$added ) {
+          print STDERR "Problem adding '$interpro_id'/".$interpro{$interpro_id}{$db_type}{'id'}."\n";
+          return 1;    # 1 is an error
+        }
+      }
+    }
+  }
+  return 0;
+}
+sub get_core_interpro {
+  my ($self, $registry, $species_name) = @_;
+  my $dba = $registry->get_DBAdaptor($species_name, "core");
+  # Get interpro terms and related information
+  my %interpro;
+  my $sql =
+    'select distinct '.
+    'i.interpro_ac, i.id, '.
+    'x.display_label, x.description, x.info_type, '.
+    'a.logic_name '.
+    'from xref x '.
+    'inner join interpro i on x.dbprimary_acc = i.interpro_ac '.
+    'inner join protein_feature pf on i.id = pf.hit_name '.
+    'inner join analysis a on pf.analysis_id = a.analysis_id;';
+  my $sth = $dba->dbc()->prepare($sql);
+  $sth->execute();
+  # Ensembl analysis logic names don't match with the terms that
+  # Interpro uses, but mapping is easy enough.
+  my %dbtypes = (
+    'gene3d'      => 'GENE3D',
+    'hmmpanther'  => 'PANTHER',
+    'pfam'        => 'PFAM',
+    'pfscan'      => 'PROFILE',
+    'pirsf'       => 'PIRSF',
+    'prints'      => 'PRINTS',
+    'scanprosite' => 'PROSITE',
+    'smart'       => 'SMART',
+    'superfamily' => 'SSF',
+    'tigrfam'     => 'TIGRFAMs',
+  );
+  while (my @row = $sth->fetchrow_array()) {
+    my $interpro_id = $row[0];
+    my $db_type = $dbtypes{$row[5]};
+    my $id = $row[1];
+    if (defined $db_type) {
+      $interpro{$interpro_id}{$db_type}{$id}{'short_name'} = $row[2];
+      $interpro{$interpro_id}{$db_type}{$id}{'name'} = $row[3];
+      $interpro{$interpro_id}{$db_type}{$id}{'info_type'} = $row[4];
+    }
+  }
+  print "Retrieved ".scalar(keys %interpro)." interpro ids.\n";
+  return %interpro;
+}
+1;