Skip to content
Snippets Groups Projects
Commit 9d80b598 authored by Andy Yates's avatar Andy Yates
Browse files

Merging of UniParc mapping code into the Xref system. This relies on being...

Merging of UniParc mapping code into the Xref system. This relies on being able to contact the EBI based UniParc readonly database.
parent 868d6f3c
No related branches found
No related tags found
No related merge requests found
...@@ -4,10 +4,12 @@ use strict; ...@@ -4,10 +4,12 @@ use strict;
use warnings; use warnings;
use Carp; use Carp;
use Cwd; use Cwd;
use DBI;
use File::Basename; use File::Basename;
use IPC::Open3; use IPC::Open3;
use XrefMapper::db;
use XrefMapper::uniparc;
=head2 new =head2 new
...@@ -47,6 +49,23 @@ sub xref{ ...@@ -47,6 +49,23 @@ sub xref{
return $self->{_xref}; return $self->{_xref};
} }
=head2 uniparc
Arg [1] : (optional)
Example : $mapper->uniparc($new_uniparc);
Description: Getter / Setter for the uniparc.
info for the uniparc database.
Returntype : XrefMapper::uniparc
Exceptions : none
=cut
sub uniparc {
my ($self, $uniparc) = @_;
$self->{uniparc} = $uniparc if defined $uniparc;
return $self->{uniparc};
}
=head2 farm_queue =head2 farm_queue
Arg [1] : (optional) Arg [1] : (optional)
...@@ -192,11 +211,13 @@ sub process_file { ...@@ -192,11 +211,13 @@ sub process_file {
my $xref=undef; my $xref=undef;
my $ensembl=undef; my $ensembl=undef;
my $uniparc;
my $type; my $type;
my %xref_hash=(); my %xref_hash=();
my %species_hash=(); my %species_hash=();
my %farm_hash=(); my %farm_hash=();
my %uniparc_hash;
open my $fh, "<", $file or croak ("\nCannot open input file '$file':\n $!\n"); open my $fh, "<", $file or croak ("\nCannot open input file '$file':\n $!\n");
while( my $line = <$fh> ) { while( my $line = <$fh> ) {
...@@ -225,6 +246,9 @@ sub process_file { ...@@ -225,6 +246,9 @@ sub process_file {
elsif($type eq "farm"){ elsif($type eq "farm"){
$farm_hash{lc($key)} = $value; $farm_hash{lc($key)} = $value;
} }
elsif($type eq 'uniparc') { # Processing uniparc settings
$uniparc_hash{lc($key)} = $value;
}
} }
close $fh or croak "Can't close file"; close $fh or croak "Can't close file";
...@@ -335,6 +359,19 @@ sub process_file { ...@@ -335,6 +359,19 @@ sub process_file {
print "No xref database is too be used\n" if ($verbose) print "No xref database is too be used\n" if ($verbose)
} }
#If we had UniParc information then create the settings
if($uniparc_hash{dbname}) {
my %args = (
-DBNAME => $uniparc_hash{dbname},
-USER => $uniparc_hash{user}
);
$args{-PASS} = $uniparc_hash{password} if $uniparc_hash{password};
$uniparc = XrefMapper::uniparc->new(%args);
$uniparc->method($uniparc_hash{method});
$mapper->uniparc($uniparc);
}
if(defined($species_hash{'species'})){ if(defined($species_hash{'species'})){
......
package XrefMapper::Methods::ChecksumBasic;
use strict;
use warnings;
use Bio::SeqIO;
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw);
use Digest::MD5;
my $DEFAULT_BATCH_SIZE = 10;
sub new {
my ($class, @args) = @_;
my $self = bless({}, $class);
my ($mapper, $batch_size) = rearrange([qw(mapper batch_size)], @args);
throw 'No -MAPPER given' unless $mapper;
$batch_size = $DEFAULT_BATCH_SIZE unless $batch_size;
$self->mapper($mapper);
$self->batch_size($batch_size);
return $self;
}
sub mapper {
my ($self, $_mapper) = @_;
$self->{mapper} = $_mapper if defined $_mapper;
return $self->{mapper};
}
sub batch_size {
my ($self, $batch_size) = @_;
$self->{batch_size} = $batch_size if defined $batch_size;
return $self->{batch_size};
}
sub run {
my ($self, $target) = @_;
if(! defined $target) {
$target = $self->mapper()->core()->protein_file();
}
my $reader = $self->_get_sequence_parser($target);
my @results;
my @tmp_list;
my $batch_size = $self->batch_size();
my $count = 0;
while ( my $sequence = $reader->next_seq() ) {
push(@tmp_list, $sequence);
$count++;
if( ($count % $batch_size) == 0) {
my $res = $self->perform_mapping(\@tmp_list);
push(@results, @{$res});
$self->mapper()->log_progress("Finished batch mapping of %d peptides\n", $batch_size);
$count = 0;
@tmp_list = ();
}
}
#Final mapping if there were some left over
if(@tmp_list) {
$self->mapper()->log_progress("Finishing progess\n");
my $res = $self->perform_mapping(\@tmp_list);
push(@results, @{$res});
@tmp_list = ();
}
$reader->close();
return \@results;
}
sub perform_mapping {
my ($self, $sequences) = @_;
throw('Override to perform the mapping you require');
}
sub _get_sequence_parser {
my ($self, $target) = @_;
throw "Cannot find the file '${target}'" unless -f $target;
my $reader = Bio::SeqIO->new(-FILE => $target, -FORMAT => 'fasta');
return $reader;
}
sub md5_checksum {
my ($self, $sequence) = @_;
my $digest = Digest::MD5->new();
$digest->add($sequence->seq());
return $digest->hexdigest();
}
1;
\ No newline at end of file
package XrefMapper::Methods::OracleUniParc;
use strict;
use warnings;
use base qw/XrefMapper::Methods::ChecksumBasic/;
use Bio::EnsEMBL::DBSQL::DBConnection;
use Bio::EnsEMBL::Utils::Exception qw(throw);
use Bio::EnsEMBL::Utils::SqlHelper;
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use List::Util qw(max);
my $DEFAULT_BATCH_SIZE = 10000;
my $UNIPARC_SQL = <<'SQL';
SELECT p.UPI
FROM UNIPARC.PROTEIN p
WHERE p.md5 = ?
SQL
sub new {
my ($class, @args) = @_;
my $self = $class->SUPER::new(@args);
my ($batch_size) = rearrange([qw(batch_size)], @args);
if(! $batch_size) {
$self->batch_size($DEFAULT_BATCH_SIZE);
}
return $self;
}
sub checksum {
my ($self, $sequence) = @_;
return uc($self->md5_checksum($sequence));
}
sub perform_mapping {
my ($self, $sequences) = @_;
my @final_results;
$self->oracle_dbc()->batch(-SQL => $UNIPARC_SQL, -CALLBACK => sub {
my ($sth) = @_;
foreach my $sequence (@{$sequences}) {
my $checksum = $self->checksum($sequence);
$sth->execute($checksum);
my $upi;
while(my $row = $sth->fetchrow_arrayref()) {
my ($local_upi) = @{$row};
if(defined $upi) {
throw sprintf('The sequence %s had a checksum of %s but this resulted in more than one UPI: [%s, %s]', $sequence->id(), $checksum, $upi, $local_upi);
}
$upi = $local_upi;
}
if(defined $upi){
push(@final_results, { id => $sequence->id(), upi => $upi, object_type => 'Translation' });
}
}
return;
});
return \@final_results;
}
sub oracle_dbc {
my ($self) = @_;
if(! exists $self->{oracle_dbc}) {
my $dbc = $self->mapper()->uniparc()->dbc();
$dbc->disconnect_when_inactive(0);
$dbc->driver('Oracle');
$self->{oracle_dbc} = $dbc;
}
return $self->{oracle_dbc};
}
sub DESTROY {
my ($self) = @_;
$self->oracle_dbc()->disconnect_if_idle() if $self->oracle_dbc();
return;
}
1;
\ No newline at end of file
package XrefMapper::UniParcMapper;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Exception qw(throw);
use base qw(XrefMapper::BasicMapper);
my $DEFAULT_METHOD = 'XrefMapper::Methods::OracleUniParc';
sub new {
my($class, $mapper) = @_;
my $self = bless {}, $class;
$self->core($mapper->core);
$self->xref($mapper->xref);
$self->uniparc($mapper->uniparc());
$self->mapper($mapper);
$self->method($self->uniparc()->method() || $DEFAULT_METHOD);
return $self;
}
sub _xref_helper {
my ($self) = @_;
return $self->xref()->dbc()->sql_helper();
}
sub logic_name {
my ($self) = @_;
return 'XrefChecksum';
}
sub external_db_name {
my ($self) = @_;
return 'UniParc';
}
sub mapper {
my ($self, $mapper) = @_;
$self->{mapper} = $mapper if defined $mapper;
return $self->{mapper};
}
sub method {
my ($self, $method) = @_;
$self->{method} = $method if defined $method;
return $self->{method};
}
sub verbose {
my ($self) = @_;
return $self->mapper()->verbose();
}
sub process {
my ($self, $do_upload) = @_;
$self->_update_status('checksum_xrefs_started');
my $method = $self->get_method();
my $results = $method->run();
if($do_upload) {
$self->log_progress('Starting upload');
$self->upload($results);
}
$self->_update_status('checksum_xrefs_finished');
return;
}
sub upload {
my ($self, $results) = @_;
#The elements come in as an array looking like
# [ { id => 1, upi => 'UPI00000A', object_type => 'Translation' } ]
my $insert_xref = <<'SQL';
INSERT INTO xref (source_id, accession, label, version, species_id, info_type)
values (?,?,?,?,?,?)
SQL
my $insert_object_xref = <<'SQL';
INSERT INTO object_xref (ensembl_id, ensembl_object_type, xref_id, linkage_type, ox_status)
values (?,?,?,?,?)
SQL
my $h = $self->_xref_helper();
my $source_id = $self->source_id();
my $species_id = $self->species_id();
$h->transaction(-CALLBACK => sub {
$self->log_progress('Starting xref insertion');
#Record UPIs to make sure we do not insert a UPI in more than once
my %upi_xref_id;
$h->batch(-SQL => $insert_xref, -CALLBACK => sub {
my ($sth) = @_;
foreach my $e (@{$results}) {
my $upi = $e->{upi};
if(exists $upi_xref_id{$upi}) {
$e->{xref_id} = $upi_xref_id{$upi};
}
else {
$sth->execute($source_id, $e->{upi}, $e->{upi}, 1, $species_id, 'CHECKSUM');
my $id = $sth->{'mysql_insertid'};
$e->{xref_id} = $id;
$upi_xref_id{$upi} = $id;
}
}
return;
});
$self->log_progress('Starting object_xref insertion');
$h->batch(-SQL => $insert_object_xref, -CALLBACK => sub {
my ($sth) = @_;
foreach my $e (@{$results}) {
$sth->execute($e->{id}, $e->{object_type}, $e->{xref_id}, 'CHECKSUM', 'DUMP_OUT');
}
return;
});
});
$self->log_progress('Finished insertions');
return;
}
sub source_id {
my ($self) = @_;
return $self->_xref_helper()->execute_single_result(
-SQL => 'select source_id from source where name=?',
-PARAMS => [$self->external_db_name()]
);
}
sub species_id {
my ($self) = @_;
my $species_id = $self->SUPER::species_id();
if(! defined $species_id) {
$species_id = $self->get_id_from_species_name($self->core()->species());
$self->SUPER::species_id($species_id);
}
return $species_id;
}
sub get_method {
my ($self) = @_;
my $method_class = $self->method();
eval "require ${method_class};";
if($@) {
throw "Cannot require the class ${method_class}. Make sure your PERL5LIB is correct: $@";
}
return $method_class->new( -MAPPER => $self );
}
############# INTERNAL METHODS
sub _update_status {
my ($self, $status) = @_;
if($self->xref()) {
my $h = $self->_xref_helper();
my $sql = q{insert into process_status (status, date) values(?,now())};
$h->execute_update(-SQL => $sql, -PARAMS => [$status]);
}
else {
my $time = localtime();
$self->log_progress(q{Status Update '%s' @ %s}."\n", $status, $time);
}
return;
}
sub log_progress {
my ( $self, $fmt, @params ) = @_;
return if (!$self->verbose);
printf( STDERR "CHKSM==> %s", sprintf( $fmt, @params ) );
}
1;
\ No newline at end of file
...@@ -188,7 +188,8 @@ sub update{ ...@@ -188,7 +188,8 @@ sub update{
# Get analysis id's # Get analysis id's
#################### ####################
my %analysis_ids = $self->get_analysis(); # my %analysis_ids = $self->get_analysis();
my $checksum_analysis_id = $self->get_single_analysis('XrefChecksum');
print "xref offset is $xref_offset, object_xref offset is $object_xref_offset\n" if ($verbose); print "xref offset is $xref_offset, object_xref offset is $object_xref_offset\n" if ($verbose);
...@@ -290,6 +291,25 @@ GSQL ...@@ -290,6 +291,25 @@ GSQL
} }
print "DIRECT $count\n" if ($verbose); print "DIRECT $count\n" if ($verbose);
} }
### IF CHECKSUM, xref, object_xref
# 1:m mapping between object & xref
elsif($type eq 'CHECKSUM') {
my $count = 0;
$direct_sth->execute($source_id, $type);
my $last_xref = 0;
while(my $row = $direct_sth->fetchrow_arrayref()) {
my ($xref_id, $acc, $label, $version, $desc, $info, $object_xref_id, $ensembl_id, $ensembl_type) = @{$row};
if($last_xref != $xref_id) {
push @xref_list, $xref_id;
$count++;
$add_xref_sth->execute(($xref_id+$xref_offset), $ex_id, $acc, $label, $version, $desc, $type, $info || $where_from);
$last_xref = $xref_id;
}
$add_object_xref_sth->execute(($object_xref_id+$object_xref_offset), $ensembl_id, $ensembl_type, ($xref_id+$xref_offset), $checksum_analysis_id);
}
print "CHECKSUM $count\n" if ($verbose);
}
### If DEPENDENT, xref, object_xref , dependent_xref (order by xref_id) # maybe linked to more than one? ### If DEPENDENT, xref, object_xref , dependent_xref (order by xref_id) # maybe linked to more than one?
...@@ -779,43 +799,47 @@ WEL ...@@ -779,43 +799,47 @@ WEL
sub get_analysis{ sub get_analysis{
my $self = shift; my $self = shift;
my %typeToLogicName = ( 'Gene' => 'XrefExonerateDNA', my %typeToLogicName = ( 'Gene' => 'XrefExonerateDNA',
'Transcript' => 'XrefExonerateDNA', 'Transcript' => 'XrefExonerateDNA',
'Translation' => 'XrefExonerateProtein' ); 'Translation' => 'XrefExonerateProtein');
my %analysis_id; my %analysis_id;
foreach my $key (qw(Gene Transcript Translation)){ foreach my $key (qw(Gene Transcript Translation)){
my $logic_name = $typeToLogicName{$key}; my $logic_name = $typeToLogicName{$key};
$analysis_id{$key} = $self->get_single_analysis($logic_name);
my $sth = $self->core->dbc->prepare("SELECT analysis_id FROM analysis WHERE logic_name='" . $logic_name ."'");
$sth->execute();
my $analysis_id;
if (my @row = $sth->fetchrow_array()) {
$analysis_id{$key} = $row[0];
} else {
print "No analysis with logic_name $logic_name found, creating ...\n" if ($self->verbose);
$sth = $self->core->dbc->prepare("INSERT INTO analysis (logic_name, created) VALUES ('" . $logic_name. "',NOW())");
# TODO - other fields in analysis table
$sth->execute();
$analysis_id{$key} = $sth->{'mysql_insertid'};
}
$sth->finish();
} }
return %analysis_id; return %analysis_id;
}
sub get_single_analysis {
my ($self, $logic_name) = @_;
my $h = $self->core->dbc()->sql_helper();
my $analysis_ids = $h->execute_simple(
-SQL => 'SELECT analysis_id FROM analysis WHERE logic_name=?',
-PARAMS => [$logic_name]
);
my $analysis_id;
if(@{$analysis_ids}) {
$analysis_id = $analysis_ids->[0];
}
else {
print "No analysis with logic_name $logic_name found, creating ...\n" if ($self->verbose);
# TODO - other fields in analysis table
$self->core()->dbc()->sql_helper()->execute_update(
-SQL => 'INSERT INTO analysis (logic_name, created) VALUES (?,NOW())',
-PARAMS => [$logic_name],
-CALLBACK => sub {
my ($sth) = @_;
$analysis_id = $sth->{'mysql_insertid'};
return;
}
);
}
return $analysis_id;
} }
1; 1;
package XrefMapper::uniparc;
use base qw(XrefMapper::db);
sub method {
my ($self, $method) = @_;
$self->{method} = $method if defined $method;
return $self->{method};
}
1;
\ No newline at end of file
...@@ -289,7 +289,8 @@ CREATE TABLE process_status ( ...@@ -289,7 +289,8 @@ CREATE TABLE process_status (
'prioritys_flagged','processed_pairs','biomart_test_finished', 'prioritys_flagged','processed_pairs','biomart_test_finished',
'source_level_move_finished','alt_alleles_processed', 'source_level_move_finished','alt_alleles_processed',
'official_naming_done', 'official_naming_done',
'coordinate_xrefs_started','coordinate_xref_finished', 'checksum_xrefs_started', 'checksum_xrefs_finished',
'coordinate_xrefs_started','coordinate_xref_finished',
'tests_started','tests_failed','tests_finished', 'tests_started','tests_failed','tests_finished',
'core_loaded','display_xref_done','gene_description_done'), 'core_loaded','display_xref_done','gene_description_done'),
date DATETIME NOT NULL, date DATETIME NOT NULL,
...@@ -365,7 +366,8 @@ CREATE TABLE object_xref ( ...@@ -365,7 +366,8 @@ CREATE TABLE object_xref (
linkage_type ENUM( 'PROJECTION', 'MISC', 'DEPENDENT', linkage_type ENUM( 'PROJECTION', 'MISC', 'DEPENDENT',
'DIRECT', 'SEQUENCE_MATCH', 'DIRECT', 'SEQUENCE_MATCH',
'INFERRED_PAIR', 'PROBE', 'INFERRED_PAIR', 'PROBE',
'UNMAPPED', 'COORDINATE_OVERLAP' ), 'UNMAPPED', 'COORDINATE_OVERLAP',
'CHECKSUM'),
ox_status ENUM( 'DUMP_OUT','FAILED_PRIORITY', 'FAILED_CUTOFF', 'NO_DISPLAY', 'MULTI_DELETE') NOT NULL DEFAULT 'DUMP_OUT', ox_status ENUM( 'DUMP_OUT','FAILED_PRIORITY', 'FAILED_CUTOFF', 'NO_DISPLAY', 'MULTI_DELETE') NOT NULL DEFAULT 'DUMP_OUT',
-- set ox_status to 0 if non used priority_xref or failed cutoff -- set ox_status to 0 if non used priority_xref or failed cutoff
unused_priority INT UNSIGNED, unused_priority INT UNSIGNED,
......
...@@ -6,6 +6,11 @@ user=ensadmin ...@@ -6,6 +6,11 @@ user=ensadmin
password= password=
dir=/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping/xref dir=/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping/xref
#uniparc
#dbname=
#user=
#password=
species=mus_musculus species=mus_musculus
host=ecs4 host=ecs4
port=3306 port=3306
......
...@@ -18,6 +18,7 @@ use XrefMapper::XrefLoader; ...@@ -18,6 +18,7 @@ use XrefMapper::XrefLoader;
use XrefMapper::Interpro; use XrefMapper::Interpro;
use XrefMapper::DisplayXrefs; use XrefMapper::DisplayXrefs;
use XrefMapper::CoordinateMapper; use XrefMapper::CoordinateMapper;
use XrefMapper::UniParcMapper;
use XrefMapper::OfficialNaming; use XrefMapper::OfficialNaming;
use XrefMapper::DirectXrefs; use XrefMapper::DirectXrefs;
...@@ -230,11 +231,20 @@ if($status eq "alt_alleles_processed"){ ...@@ -230,11 +231,20 @@ if($status eq "alt_alleles_processed"){
} }
#UniParc checksum mapping
#Allow for reruns if required but needs a uniparc object available in the mapper
#as that means we can do it
$status = $mapper->xref_latest_status();
if($status eq 'official_naming_done' && defined $mapper->uniparc()) {
my $checksum_mapper = XrefMapper::UniParcMapper->new($mapper);
$checksum_mapper->process($upload);
}
# Coordinate xrefs # Coordinate xrefs
# tests # tests
$status = $mapper->xref_latest_status(); $status = $mapper->xref_latest_status();
if($status eq "official_naming_done" || $status eq "tests_started" || $status eq "tests_failed" ){ if($status eq "official_naming_done" || $status eq "checksum_xrefs_finished" || $status eq "tests_started" || $status eq "tests_failed" ){
my $tester = XrefMapper::TestMappings->new($mapper); my $tester = XrefMapper::TestMappings->new($mapper);
if($tester->unlinked_entries){ if($tester->unlinked_entries){
die "Problems found so will not load core database\n"; die "Problems found so will not load core database\n";
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment