diff --git a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm index 1be281bd2851740a07fa397184d2a0e50052849a..d7c4fefe2a95d6618583175622f8b60e1408b66a 100644 --- a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm +++ b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm @@ -4,10 +4,12 @@ use strict; use warnings; use Carp; use Cwd; -use DBI; use File::Basename; use IPC::Open3; +use XrefMapper::db; +use XrefMapper::uniparc; + =head2 new @@ -47,6 +49,23 @@ sub xref{ return $self->{_xref}; } +=head2 uniparc + + Arg [1] : (optional) + Example : $mapper->uniparc($new_uniparc); + Description: Getter / Setter for the uniparc. + info for the uniparc database. + Returntype : XrefMapper::uniparc + Exceptions : none + +=cut + +sub uniparc { + my ($self, $uniparc) = @_; + $self->{uniparc} = $uniparc if defined $uniparc; + return $self->{uniparc}; +} + =head2 farm_queue Arg [1] : (optional) @@ -192,11 +211,13 @@ sub process_file { my $xref=undef; my $ensembl=undef; + my $uniparc; my $type; my %xref_hash=(); my %species_hash=(); my %farm_hash=(); + my %uniparc_hash; open my $fh, "<", $file or croak ("\nCannot open input file '$file':\n $!\n"); while( my $line = <$fh> ) { @@ -225,6 +246,9 @@ sub process_file { elsif($type eq "farm"){ $farm_hash{lc($key)} = $value; } + elsif($type eq 'uniparc') { # Processing uniparc settings + $uniparc_hash{lc($key)} = $value; + } } close $fh or croak "Can't close file"; @@ -335,6 +359,19 @@ sub process_file { print "No xref database is too be used\n" if ($verbose) } + #If we had UniParc information then create the settings + if($uniparc_hash{dbname}) { + my %args = ( + -DBNAME => $uniparc_hash{dbname}, + -USER => $uniparc_hash{user} + ); + $args{-PASS} = $uniparc_hash{password} if $uniparc_hash{password}; + + $uniparc = XrefMapper::uniparc->new(%args); + $uniparc->method($uniparc_hash{method}); + $mapper->uniparc($uniparc); + } + if(defined($species_hash{'species'})){ diff --git a/misc-scripts/xref_mapping/XrefMapper/Methods/ChecksumBasic.pm b/misc-scripts/xref_mapping/XrefMapper/Methods/ChecksumBasic.pm new file mode 100644 index 0000000000000000000000000000000000000000..694746586680fa8c2def935f97d7cab1c595cdab --- /dev/null +++ b/misc-scripts/xref_mapping/XrefMapper/Methods/ChecksumBasic.pm @@ -0,0 +1,94 @@ +package XrefMapper::Methods::ChecksumBasic; + +use strict; +use warnings; + +use Bio::SeqIO; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Digest::MD5; + +my $DEFAULT_BATCH_SIZE = 10; + +sub new { + my ($class, @args) = @_; + my $self = bless({}, $class); + + my ($mapper, $batch_size) = rearrange([qw(mapper batch_size)], @args); + + throw 'No -MAPPER given' unless $mapper; + $batch_size = $DEFAULT_BATCH_SIZE unless $batch_size; + + $self->mapper($mapper); + $self->batch_size($batch_size); + return $self; +} + +sub mapper { + my ($self, $_mapper) = @_; + $self->{mapper} = $_mapper if defined $_mapper; + return $self->{mapper}; +} + +sub batch_size { + my ($self, $batch_size) = @_; + $self->{batch_size} = $batch_size if defined $batch_size; + return $self->{batch_size}; +} + +sub run { + my ($self, $target) = @_; + + if(! defined $target) { + $target = $self->mapper()->core()->protein_file(); + } + + my $reader = $self->_get_sequence_parser($target); + my @results; + my @tmp_list; + my $batch_size = $self->batch_size(); + my $count = 0; + while ( my $sequence = $reader->next_seq() ) { + push(@tmp_list, $sequence); + $count++; + if( ($count % $batch_size) == 0) { + my $res = $self->perform_mapping(\@tmp_list); + push(@results, @{$res}); + $self->mapper()->log_progress("Finished batch mapping of %d peptides\n", $batch_size); + $count = 0; + @tmp_list = (); + } + } + + #Final mapping if there were some left over + if(@tmp_list) { + $self->mapper()->log_progress("Finishing progess\n"); + my $res = $self->perform_mapping(\@tmp_list); + push(@results, @{$res}); + @tmp_list = (); + } + + $reader->close(); + return \@results; +} + +sub perform_mapping { + my ($self, $sequences) = @_; + throw('Override to perform the mapping you require'); +} + +sub _get_sequence_parser { + my ($self, $target) = @_; + throw "Cannot find the file '${target}'" unless -f $target; + my $reader = Bio::SeqIO->new(-FILE => $target, -FORMAT => 'fasta'); + return $reader; +} + +sub md5_checksum { + my ($self, $sequence) = @_; + my $digest = Digest::MD5->new(); + $digest->add($sequence->seq()); + return $digest->hexdigest(); +} + +1; \ No newline at end of file diff --git a/misc-scripts/xref_mapping/XrefMapper/Methods/OracleUniParc.pm b/misc-scripts/xref_mapping/XrefMapper/Methods/OracleUniParc.pm new file mode 100644 index 0000000000000000000000000000000000000000..220a35863a57cd1298b7e00b8c1ac349e055f48d --- /dev/null +++ b/misc-scripts/xref_mapping/XrefMapper/Methods/OracleUniParc.pm @@ -0,0 +1,82 @@ +package XrefMapper::Methods::OracleUniParc; + +use strict; +use warnings; + +use base qw/XrefMapper::Methods::ChecksumBasic/; + +use Bio::EnsEMBL::DBSQL::DBConnection; +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::SqlHelper; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use List::Util qw(max); + +my $DEFAULT_BATCH_SIZE = 10000; + +my $UNIPARC_SQL = <<'SQL'; +SELECT p.UPI +FROM UNIPARC.PROTEIN p +WHERE p.md5 = ? +SQL + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + my ($batch_size) = rearrange([qw(batch_size)], @args); + if(! $batch_size) { + $self->batch_size($DEFAULT_BATCH_SIZE); + } + return $self; +} + +sub checksum { + my ($self, $sequence) = @_; + return uc($self->md5_checksum($sequence)); +} + +sub perform_mapping { + my ($self, $sequences) = @_; + + my @final_results; + + $self->oracle_dbc()->batch(-SQL => $UNIPARC_SQL, -CALLBACK => sub { + my ($sth) = @_; + foreach my $sequence (@{$sequences}) { + my $checksum = $self->checksum($sequence); + $sth->execute($checksum); + my $upi; + while(my $row = $sth->fetchrow_arrayref()) { + my ($local_upi) = @{$row}; + if(defined $upi) { + throw sprintf('The sequence %s had a checksum of %s but this resulted in more than one UPI: [%s, %s]', $sequence->id(), $checksum, $upi, $local_upi); + } + $upi = $local_upi; + } + if(defined $upi){ + push(@final_results, { id => $sequence->id(), upi => $upi, object_type => 'Translation' }); + } + } + return; + }); + + return \@final_results; +} + +sub oracle_dbc { + my ($self) = @_; + if(! exists $self->{oracle_dbc}) { + my $dbc = $self->mapper()->uniparc()->dbc(); + $dbc->disconnect_when_inactive(0); + $dbc->driver('Oracle'); + $self->{oracle_dbc} = $dbc; + } + return $self->{oracle_dbc}; +} + +sub DESTROY { + my ($self) = @_; + $self->oracle_dbc()->disconnect_if_idle() if $self->oracle_dbc(); + return; +} + +1; \ No newline at end of file diff --git a/misc-scripts/xref_mapping/XrefMapper/UniParcMapper.pm b/misc-scripts/xref_mapping/XrefMapper/UniParcMapper.pm new file mode 100644 index 0000000000000000000000000000000000000000..02dbec5c740c578e9e7cd562432dbd1c060f1497 --- /dev/null +++ b/misc-scripts/xref_mapping/XrefMapper/UniParcMapper.pm @@ -0,0 +1,178 @@ +package XrefMapper::UniParcMapper; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw(throw); + +use base qw(XrefMapper::BasicMapper); + +my $DEFAULT_METHOD = 'XrefMapper::Methods::OracleUniParc'; + +sub new { + my($class, $mapper) = @_; + my $self = bless {}, $class; + $self->core($mapper->core); + $self->xref($mapper->xref); + $self->uniparc($mapper->uniparc()); + $self->mapper($mapper); + $self->method($self->uniparc()->method() || $DEFAULT_METHOD); + return $self; +} + +sub _xref_helper { + my ($self) = @_; + return $self->xref()->dbc()->sql_helper(); +} + +sub logic_name { + my ($self) = @_; + return 'XrefChecksum'; +} + +sub external_db_name { + my ($self) = @_; + return 'UniParc'; +} + +sub mapper { + my ($self, $mapper) = @_; + $self->{mapper} = $mapper if defined $mapper; + return $self->{mapper}; +} + +sub method { + my ($self, $method) = @_; + $self->{method} = $method if defined $method; + return $self->{method}; +} + +sub verbose { + my ($self) = @_; + return $self->mapper()->verbose(); +} + +sub process { + my ($self, $do_upload) = @_; + + $self->_update_status('checksum_xrefs_started'); + + my $method = $self->get_method(); + my $results = $method->run(); + + if($do_upload) { + $self->log_progress('Starting upload'); + $self->upload($results); + } + + $self->_update_status('checksum_xrefs_finished'); + return; +} + +sub upload { + my ($self, $results) = @_; + + #The elements come in as an array looking like + # [ { id => 1, upi => 'UPI00000A', object_type => 'Translation' } ] + + my $insert_xref = <<'SQL'; +INSERT INTO xref (source_id, accession, label, version, species_id, info_type) +values (?,?,?,?,?,?) +SQL + my $insert_object_xref = <<'SQL'; +INSERT INTO object_xref (ensembl_id, ensembl_object_type, xref_id, linkage_type, ox_status) +values (?,?,?,?,?) +SQL + + my $h = $self->_xref_helper(); + my $source_id = $self->source_id(); + my $species_id = $self->species_id(); + + $h->transaction(-CALLBACK => sub { + + $self->log_progress('Starting xref insertion'); + #Record UPIs to make sure we do not insert a UPI in more than once + my %upi_xref_id; + $h->batch(-SQL => $insert_xref, -CALLBACK => sub { + my ($sth) = @_; + foreach my $e (@{$results}) { + my $upi = $e->{upi}; + if(exists $upi_xref_id{$upi}) { + $e->{xref_id} = $upi_xref_id{$upi}; + } + else { + $sth->execute($source_id, $e->{upi}, $e->{upi}, 1, $species_id, 'CHECKSUM'); + my $id = $sth->{'mysql_insertid'}; + $e->{xref_id} = $id; + $upi_xref_id{$upi} = $id; + } + } + return; + }); + + $self->log_progress('Starting object_xref insertion'); + $h->batch(-SQL => $insert_object_xref, -CALLBACK => sub { + my ($sth) = @_; + foreach my $e (@{$results}) { + $sth->execute($e->{id}, $e->{object_type}, $e->{xref_id}, 'CHECKSUM', 'DUMP_OUT'); + } + return; + }); + }); + + $self->log_progress('Finished insertions'); + + return; +} + +sub source_id { + my ($self) = @_; + return $self->_xref_helper()->execute_single_result( + -SQL => 'select source_id from source where name=?', + -PARAMS => [$self->external_db_name()] + ); +} + +sub species_id { + my ($self) = @_; + my $species_id = $self->SUPER::species_id(); + if(! defined $species_id) { + $species_id = $self->get_id_from_species_name($self->core()->species()); + $self->SUPER::species_id($species_id); + } + return $species_id; +} + +sub get_method { + my ($self) = @_; + my $method_class = $self->method(); + eval "require ${method_class};"; + if($@) { + throw "Cannot require the class ${method_class}. Make sure your PERL5LIB is correct: $@"; + } + return $method_class->new( -MAPPER => $self ); +} + +############# INTERNAL METHODS + +sub _update_status { + my ($self, $status) = @_; + if($self->xref()) { + my $h = $self->_xref_helper(); + my $sql = q{insert into process_status (status, date) values(?,now())}; + $h->execute_update(-SQL => $sql, -PARAMS => [$status]); + } + else { + my $time = localtime(); + $self->log_progress(q{Status Update '%s' @ %s}."\n", $status, $time); + } + return; +} + +sub log_progress { + my ( $self, $fmt, @params ) = @_; + return if (!$self->verbose); + printf( STDERR "CHKSM==> %s", sprintf( $fmt, @params ) ); +} + +1; \ No newline at end of file diff --git a/misc-scripts/xref_mapping/XrefMapper/XrefLoader.pm b/misc-scripts/xref_mapping/XrefMapper/XrefLoader.pm index ca9a9246b76bb126d72a058c8d3f0bbb41ba8fc2..0e2f5d1fbc851a7bf131e2c8bffe3611e48af16d 100644 --- a/misc-scripts/xref_mapping/XrefMapper/XrefLoader.pm +++ b/misc-scripts/xref_mapping/XrefMapper/XrefLoader.pm @@ -188,7 +188,8 @@ sub update{ # Get analysis id's #################### - my %analysis_ids = $self->get_analysis(); # + my %analysis_ids = $self->get_analysis(); + my $checksum_analysis_id = $self->get_single_analysis('XrefChecksum'); print "xref offset is $xref_offset, object_xref offset is $object_xref_offset\n" if ($verbose); @@ -290,6 +291,25 @@ GSQL } print "DIRECT $count\n" if ($verbose); } + + ### IF CHECKSUM, xref, object_xref + # 1:m mapping between object & xref + elsif($type eq 'CHECKSUM') { + my $count = 0; + $direct_sth->execute($source_id, $type); + my $last_xref = 0; + while(my $row = $direct_sth->fetchrow_arrayref()) { + my ($xref_id, $acc, $label, $version, $desc, $info, $object_xref_id, $ensembl_id, $ensembl_type) = @{$row}; + if($last_xref != $xref_id) { + push @xref_list, $xref_id; + $count++; + $add_xref_sth->execute(($xref_id+$xref_offset), $ex_id, $acc, $label, $version, $desc, $type, $info || $where_from); + $last_xref = $xref_id; + } + $add_object_xref_sth->execute(($object_xref_id+$object_xref_offset), $ensembl_id, $ensembl_type, ($xref_id+$xref_offset), $checksum_analysis_id); + } + print "CHECKSUM $count\n" if ($verbose); + } ### If DEPENDENT, xref, object_xref , dependent_xref (order by xref_id) # maybe linked to more than one? @@ -779,43 +799,47 @@ WEL sub get_analysis{ my $self = shift; - - my %typeToLogicName = ( 'Gene' => 'XrefExonerateDNA', 'Transcript' => 'XrefExonerateDNA', - 'Translation' => 'XrefExonerateProtein' ); - + 'Translation' => 'XrefExonerateProtein'); my %analysis_id; - foreach my $key (qw(Gene Transcript Translation)){ - my $logic_name = $typeToLogicName{$key}; - - my $sth = $self->core->dbc->prepare("SELECT analysis_id FROM analysis WHERE logic_name='" . $logic_name ."'"); - - $sth->execute(); - - my $analysis_id; - - if (my @row = $sth->fetchrow_array()) { - - $analysis_id{$key} = $row[0]; - - } else { - - print "No analysis with logic_name $logic_name found, creating ...\n" if ($self->verbose); - $sth = $self->core->dbc->prepare("INSERT INTO analysis (logic_name, created) VALUES ('" . $logic_name. "',NOW())"); - # TODO - other fields in analysis table - $sth->execute(); - $analysis_id{$key} = $sth->{'mysql_insertid'}; - } - $sth->finish(); - + $analysis_id{$key} = $self->get_single_analysis($logic_name); } return %analysis_id; +} + +sub get_single_analysis { + my ($self, $logic_name) = @_; + my $h = $self->core->dbc()->sql_helper(); + my $analysis_ids = $h->execute_simple( + -SQL => 'SELECT analysis_id FROM analysis WHERE logic_name=?', + -PARAMS => [$logic_name] + ); + my $analysis_id; + + if(@{$analysis_ids}) { + $analysis_id = $analysis_ids->[0]; + } + else { + print "No analysis with logic_name $logic_name found, creating ...\n" if ($self->verbose); + # TODO - other fields in analysis table + $self->core()->dbc()->sql_helper()->execute_update( + -SQL => 'INSERT INTO analysis (logic_name, created) VALUES (?,NOW())', + -PARAMS => [$logic_name], + -CALLBACK => sub { + my ($sth) = @_; + $analysis_id = $sth->{'mysql_insertid'}; + return; + } + ); + } + return $analysis_id; } + 1; diff --git a/misc-scripts/xref_mapping/XrefMapper/uniparc.pm b/misc-scripts/xref_mapping/XrefMapper/uniparc.pm new file mode 100644 index 0000000000000000000000000000000000000000..190a91ea0fbba0fac952a18c78d4f9cb185b239d --- /dev/null +++ b/misc-scripts/xref_mapping/XrefMapper/uniparc.pm @@ -0,0 +1,11 @@ +package XrefMapper::uniparc; + +use base qw(XrefMapper::db); + +sub method { + my ($self, $method) = @_; + $self->{method} = $method if defined $method; + return $self->{method}; +} + +1; \ No newline at end of file diff --git a/misc-scripts/xref_mapping/sql/table.sql b/misc-scripts/xref_mapping/sql/table.sql index 88f8fa4aeec2942548a815ba6d0d4b2882106f6c..8df24753611494ebc7088860f7f026f3a788b3e6 100755 --- a/misc-scripts/xref_mapping/sql/table.sql +++ b/misc-scripts/xref_mapping/sql/table.sql @@ -289,7 +289,8 @@ CREATE TABLE process_status ( 'prioritys_flagged','processed_pairs','biomart_test_finished', 'source_level_move_finished','alt_alleles_processed', 'official_naming_done', - 'coordinate_xrefs_started','coordinate_xref_finished', + 'checksum_xrefs_started', 'checksum_xrefs_finished', + 'coordinate_xrefs_started','coordinate_xref_finished', 'tests_started','tests_failed','tests_finished', 'core_loaded','display_xref_done','gene_description_done'), date DATETIME NOT NULL, @@ -365,7 +366,8 @@ CREATE TABLE object_xref ( linkage_type ENUM( 'PROJECTION', 'MISC', 'DEPENDENT', 'DIRECT', 'SEQUENCE_MATCH', 'INFERRED_PAIR', 'PROBE', - 'UNMAPPED', 'COORDINATE_OVERLAP' ), + 'UNMAPPED', 'COORDINATE_OVERLAP', + 'CHECKSUM'), ox_status ENUM( 'DUMP_OUT','FAILED_PRIORITY', 'FAILED_CUTOFF', 'NO_DISPLAY', 'MULTI_DELETE') NOT NULL DEFAULT 'DUMP_OUT', -- set ox_status to 0 if non used priority_xref or failed cutoff unused_priority INT UNSIGNED, diff --git a/misc-scripts/xref_mapping/xref_mapper.input b/misc-scripts/xref_mapping/xref_mapper.input index f7cbe28cce84753f82bce31eaac3b1e0bf04d21c..de1bd9a4460275e6c1c521e675bd51b6b4e1f422 100644 --- a/misc-scripts/xref_mapping/xref_mapper.input +++ b/misc-scripts/xref_mapping/xref_mapper.input @@ -6,6 +6,11 @@ user=ensadmin password= dir=/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping/xref +#uniparc +#dbname= +#user= +#password= + species=mus_musculus host=ecs4 port=3306 diff --git a/misc-scripts/xref_mapping/xref_mapper.pl b/misc-scripts/xref_mapping/xref_mapper.pl index 4ef7e56743c066ddae087ea556300923c9a12747..a1c3f85a2e4da5b83f49884f7933d54826996bf8 100644 --- a/misc-scripts/xref_mapping/xref_mapper.pl +++ b/misc-scripts/xref_mapping/xref_mapper.pl @@ -18,6 +18,7 @@ use XrefMapper::XrefLoader; use XrefMapper::Interpro; use XrefMapper::DisplayXrefs; use XrefMapper::CoordinateMapper; +use XrefMapper::UniParcMapper; use XrefMapper::OfficialNaming; use XrefMapper::DirectXrefs; @@ -230,11 +231,20 @@ if($status eq "alt_alleles_processed"){ } +#UniParc checksum mapping +#Allow for reruns if required but needs a uniparc object available in the mapper +#as that means we can do it +$status = $mapper->xref_latest_status(); +if($status eq 'official_naming_done' && defined $mapper->uniparc()) { + my $checksum_mapper = XrefMapper::UniParcMapper->new($mapper); + $checksum_mapper->process($upload); +} + # Coordinate xrefs # tests $status = $mapper->xref_latest_status(); -if($status eq "official_naming_done" || $status eq "tests_started" || $status eq "tests_failed" ){ +if($status eq "official_naming_done" || $status eq "checksum_xrefs_finished" || $status eq "tests_started" || $status eq "tests_failed" ){ my $tester = XrefMapper::TestMappings->new($mapper); if($tester->unlinked_entries){ die "Problems found so will not load core database\n";