Skip to content
Snippets Groups Projects
Commit cd571715 authored by James Allen's avatar James Allen
Browse files

Parser to use existing InterPro data from a core db

parent df82ed70
No related branches found
No related tags found
No related merge requests found
package XrefParser::InterproFromCoreParser;
use strict;
use warnings;
use Carp;
use vars qw(@ISA);
@ISA = qw(XrefParser::BaseParser);
use Bio::EnsEMBL::Registry;
# Copy InterPro accessions and their member-database signatures from an
# existing core database into the xref database.
#
# Arguments (single hash reference):
#   source_id  - xref source ID stored with every inserted xref row (required)
#   species_id - xref species ID; also used to look up the species name
#                via $self->species_id2name (required)
#   file       - parameter string; must contain "project=>NAME," where NAME
#                is 'ensembl' or 'ensemblgenomes' (required)
#   verbose    - accepted for interface compatibility; not used by this parser
#
# Returns 0 on success and 1 if an insert statement reports failure.
# Croaks when a required argument is missing, the project is missing or
# unsupported, or no species name is known for species_id.
sub run_script {
    my ($self, $ref_arg) = @_;

    my $source_id  = $ref_arg->{source_id};
    my $species_id = $ref_arg->{species_id};
    my $file       = $ref_arg->{file};

    if ((!defined $source_id) or (!defined $species_id) or (!defined $file)) {
        croak "Need to pass source_id, species_id and file as pairs";
    }

    # Default to an empty string so that a file string without a project
    # token reaches the "unsupported project" error below instead of
    # triggering "uninitialized value" warnings on comparison.
    my $project = '';
    if ($file =~ /project[=][>](\S+?)[,]/) {
        $project = $1;
    }

    # Staging servers to load into the registry, per supported project.
    my %servers_for = (
        'ensembl' => [
            {
                '-host' => 'ens-staging1',
                '-user' => 'ensro',
            },
            {
                '-host' => 'ens-staging2',
                '-user' => 'ensro',
            },
        ],
        'ensemblgenomes' => [
            {
                '-host' => 'mysql-eg-staging-1.ebi.ac.uk',
                '-port' => 4160,
                '-user' => 'ensro',
            },
            {
                '-host' => 'mysql-eg-staging-2.ebi.ac.uk',
                '-port' => 4275,
                '-user' => 'ensro',
            },
        ],
    );

    if (!exists $servers_for{$project}) {
        croak "Missing or unsupported project value. Supported values: ensembl, ensemblgenomes";
    }

    my $registry = "Bio::EnsEMBL::Registry";
    $registry->load_registry_from_multiple_dbs(@{ $servers_for{$project} });

    # species_id2name maps a species ID to a list of names; the first
    # entry is the one used to find the core database.
    my %id2name = $self->species_id2name;
    my $names   = $id2name{$species_id};
    my $species_name = defined $names ? $names->[0] : undef;
    if (!defined $species_name) {
        croak "No species name found for species_id '$species_id'";
    }

    my %interpro = $self->get_core_interpro($registry, $species_name);

    my $add_interpro_sth = $self->dbi()->prepare(
        "INSERT IGNORE INTO interpro (interpro, pfam, dbtype) VALUES(?,?,?)"
    );

    my $add_xref_sth = $self->dbi()->prepare(
        "INSERT IGNORE INTO xref ".
        "(accession, label, description, source_id, species_id, info_type) ".
        "VALUES(?,?,?,?,?,?)"
    );

    # The InterproScan pipeline uses additional sources for Interpro
    # links (e.g. Gene3D, Panther), so in order to replicate that in the xref
    # database, remove the restriction on the column contents.
    $self->dbi()->do("ALTER TABLE interpro MODIFY COLUMN dbtype VARCHAR(25);");

    # Keys are sorted at every level so rows are inserted in a stable order.
    foreach my $interpro_id (sort keys %interpro) {
        my $ids_for_db_type = $interpro{$interpro_id};

        foreach my $db_type (sort keys %{$ids_for_db_type}) {
            my $entry_for_id = $ids_for_db_type->{$db_type};

            foreach my $id (sort keys %{$entry_for_id}) {
                my $entry = $entry_for_id->{$id};

                # INSERT IGNORE: repeated accessions are skipped by the
                # database rather than reported as errors.
                my $added = $add_xref_sth->execute(
                    $interpro_id,
                    $entry->{'short_name'},
                    $entry->{'name'},
                    $source_id,
                    $species_id,
                    $entry->{'info_type'},
                );
                if (!$added) {
                    print STDERR "Problem adding '$interpro_id'\n";
                    return 1; # 1 is an error
                }

                $added = $add_interpro_sth->execute(
                    $interpro_id,
                    $id,
                    $db_type,
                );
                if (!$added) {
                    # Report the member database ID that failed to insert.
                    print STDERR "Problem adding '$interpro_id'/$id\n";
                    return 1; # 1 is an error
                }
            }
        }
    }

    return 0;
}
# Read InterPro accessions and their member-database signatures from a
# species' core database.
#
# Arguments:
#   $registry     - registry object or class name providing get_DBAdaptor
#   $species_name - species name used to look up the "core" database adaptor
#
# Returns a hash (as a list) of the form
#   interpro accession => db type => member id =>
#     { short_name => ..., name => ..., info_type => ... }
# Rows whose analysis logic name has no InterPro db type mapping are skipped.
# Croaks if no core database adaptor is available for the species.
sub get_core_interpro {
    my ($self, $registry, $species_name) = @_;

    my $dba = $registry->get_DBAdaptor($species_name, "core");
    if (!defined $dba) {
        # Fail with a clear message rather than "Can't call method dbc
        # on an undefined value" further down.
        my $shown_name = defined $species_name ? $species_name : '';
        croak "Could not get a core database adaptor for species '$shown_name'";
    }

    # Get interpro terms and related information: each xref that matches an
    # interpro accession, joined through protein_feature to the analysis
    # that produced the hit.
    my $sql =
        'select distinct '.
        'i.interpro_ac, i.id, '.
        'x.display_label, x.description, x.info_type, '.
        'a.logic_name '.
        'from xref x '.
        'inner join interpro i on x.dbprimary_acc = i.interpro_ac '.
        'inner join protein_feature pf on i.id = pf.hit_name '.
        'inner join analysis a on pf.analysis_id = a.analysis_id;';
    my $sth = $dba->dbc()->prepare($sql);
    $sth->execute();

    # Ensembl analysis logic names don't match with the terms that
    # Interpro uses, but mapping is easy enough.
    my %dbtypes = (
        'gene3d'      => 'GENE3D',
        'hmmpanther'  => 'PANTHER',
        'pfam'        => 'PFAM',
        'pfscan'      => 'PROFILE',
        'pirsf'       => 'PIRSF',
        'prints'      => 'PRINTS',
        'scanprosite' => 'PROSITE',
        'smart'       => 'SMART',
        'superfamily' => 'SSF',
        'tigrfam'     => 'TIGRFAMs',
    );

    my %interpro;
    while (my @row = $sth->fetchrow_array()) {
        # Column order follows the select list above.
        my ($interpro_id, $id, $short_name, $name, $info_type, $logic_name) = @row;

        my $db_type = $dbtypes{$logic_name};
        next if !defined $db_type;

        $interpro{$interpro_id}{$db_type}{$id}{'short_name'} = $short_name;
        $interpro{$interpro_id}{$db_type}{$id}{'name'}       = $name;
        $interpro{$interpro_id}{$db_type}{$id}{'info_type'}  = $info_type;
    }

    print "Retrieved ".scalar(keys %interpro)." interpro ids.\n";

    return %interpro;
}
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment