package XrefMapper::Methods::MySQLUniParc;

# Checksum-based mapping method. For every sequence it is given, it computes
# an upper-cased MD5 checksum and looks that checksum up in the checksum_xref
# table through the connection returned by mapper()->xref()->dbc().

use strict;
use warnings;

use base qw/XrefMapper::Methods::ChecksumBasic/;

use Bio::EnsEMBL::Utils::Exception qw(throw);

# Non-interpolating heredoc: the statement contains no Perl variables, and the
# single placeholder is bound once per sequence at execute() time.
my $UNIPARC_SQL = <<'SQL';
select accession
from checksum_xref
where checksum =?
SQL

# perform_mapping
#
#   Arg [1]    : ArrayRef of sequence objects. Each one is passed to
#                md5_checksum() and must respond to id().
#                NOTE(review): md5_checksum() is not defined in this package;
#                presumably inherited from ChecksumBasic - confirm.
#   Returntype : ArrayRef of HashRefs with the keys 'id' (the sequence id),
#                'upi' (the accession found) and 'object_type' (always
#                'Translation'). Sequences without a hit are left out.
#   Exceptions : Throws when one checksum resolves to more than one accession.
sub perform_mapping {
  my ($self, $sequences) = @_;

  my @final_results;

  # The statement is prepared once by batch() and re-executed per sequence
  # inside the callback.
  $self->mapper()->xref()->dbc()->sql_helper()->batch(-SQL => $UNIPARC_SQL, -CALLBACK => sub {
    my ($sth) = @_;
    foreach my $sequence (@{$sequences}) {
      # Checksums are compared in upper case.
      my $checksum = uc($self->md5_checksum($sequence));
      $sth->execute($checksum);
      my $upi;
      while(my $row = $sth->fetchrow_arrayref()) {
        my ($local_upi) = @{$row};
        # A second row for the same checksum means the mapping is ambiguous.
        if(defined $upi) {
          throw sprintf('The sequence %s had a checksum of %s but this resulted in more than one UPI: [%s, %s]', $sequence->id(), $checksum, $upi, $local_upi);
        }
        $upi = $local_upi;
      }
      if(defined $upi){
        push(@final_results, { id => $sequence->id(), upi => $upi, object_type => 'Translation' });
      }
    }
    return;
  });

  return \@final_results;
}

# Explicit true value so that require/use never depends on whichever
# statement happens to be evaluated last in this file.
1;
Bio::EnsEMBL::Utils::SqlHelper; -use Bio::EnsEMBL::Utils::Argument qw(rearrange); -use List::Util qw(max); - -my $DEFAULT_BATCH_SIZE = 10000; - -my $UNIPARC_SQL = <<'SQL'; -SELECT p.UPI -FROM UNIPARC.PROTEIN p -WHERE p.md5 = ? -SQL - -sub new { - my ($class, @args) = @_; - my $self = $class->SUPER::new(@args); - my ($batch_size) = rearrange([qw(batch_size)], @args); - if(! $batch_size) { - $self->batch_size($DEFAULT_BATCH_SIZE); - } - return $self; -} - -sub checksum { - my ($self, $sequence) = @_; - return uc($self->md5_checksum($sequence)); -} - -sub perform_mapping { - my ($self, $sequences) = @_; - - my @final_results; - - $self->oracle_dbc()->sql_helper()->batch(-SQL => $UNIPARC_SQL, -CALLBACK => sub { - my ($sth) = @_; - foreach my $sequence (@{$sequences}) { - my $checksum = $self->checksum($sequence); - $sth->execute($checksum); - my $upi; - while(my $row = $sth->fetchrow_arrayref()) { - my ($local_upi) = @{$row}; - if(defined $upi) { - throw sprintf('The sequence %s had a checksum of %s but this resulted in more than one UPI: [%s, %s]', $sequence->id(), $checksum, $upi, $local_upi); - } - $upi = $local_upi; - } - if(defined $upi){ - push(@final_results, { id => $sequence->id(), upi => $upi, object_type => 'Translation' }); - } - } - return; - }); - - return \@final_results; -} - -sub oracle_dbc { - my ($self) = @_; - if(! 
# meta_key
#
#   Returntype : String; the constant key 'array_express.exported'. Within
#                this package it is the value written to the meta_key column
#                by _insert_meta().
#   Note       : The invocant is not consulted, so the result is the same for
#                class and instance calls.
sub meta_key {
  return 'array_express.exported';
}
# _get_contents
#
#   Reads the first file in $files, one dataset name per line (for example
#   'hsapiens_gene_ensembl' or 'osativa_eg_gene'), and records the species
#   prefix of each line, i.e. everything before the FIRST underscore.
#
#   Arg [1]    : ArrayRef of file names; only element 0 is read
#   Arg [2]    : Boolean; when true the species found are printed to STDOUT
#   Returntype : HashRef keyed by species prefix, every value being 1
#   Exceptions : Croaks if no filehandle could be obtained, or if a non-blank
#                line has no '<species>_<rest>' shape or its <rest> does not
#                contain 'gene'.
sub _get_contents {
  my ($self, $files, $verbose) = @_;

  my $file = $files->[0];
  my $fh = $self->get_filehandle($file);
  croak "Could not get a filehandle for '$file'" if ! $fh;

  my %lookup;
  while(my $line = <$fh>) {
    chomp $line;
    # Blank (e.g. trailing) lines carry no dataset name.
    next if $line !~ /\S/;

    # The species must stop at the first underscore. A plain (\w+) is greedy
    # and \w also matches '_', which would turn 'hsapiens_gene_ensembl' into
    # species 'hsapiens_gene' and remainder 'ensembl'.
    my ($species, $remainder) = $line =~ /^([^\W_]+)_(.+)$/;
    if(! defined $remainder || $remainder !~ /gene/) {
      croak "The line '$line' is not linked to a gene set. This is unexpected.";
    }
    $lookup{$species} = 1;
  }
  close($fh);

  if($verbose) {
    # Sorted so the report is stable between runs.
    printf("ArrayExpress is using the species [%s]\n", join(q{, }, sort keys %lookup));
  }
  return \%lookup;
}
package XrefParser::UniParcParser;

# Loads a UniParc accession/checksum dump into the checksum_xref table.
#
# Input format looks like:
#
# UPI0001B45C00 71B80D7A684B1F2DEDDA7B5AEE1D029E
# UPI0002473BEA 4542D97F3AB3F7B656ABB941AED3F2BB
# UPI00024743AF A69E7EEE820CA54100AD43E86BE823E4

use strict;
use warnings;
use Carp;
use IO::File;
use base qw( XrefParser::BaseParser );

my $TABLE_NAME = 'checksum_xref';

# run
#
#   Converts the first input file into a tab-separated '<input>.mysqlinput'
#   file and bulk-loads that file into the checksum table.
#
#   Arg [1]    : HashRef with the keys source_id, species_id and files
#                (all required) and verbose (optional)
#   Returntype : None
#   Exceptions : Croaks when a required argument is missing, when the input
#                cannot be opened, or when the intermediate file cannot be
#                removed, created or closed.
sub run {
  my ($self, $ref_arg) = @_;

  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $files      = $ref_arg->{files};
  my $verbose    = $ref_arg->{verbose};

  if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){
    croak "Need to pass source_id, species_id and files as pairs";
  }
  $verbose ||= 0;

  my $input_file  = $files->[0];
  my $target_file = $input_file.'.mysqlinput';

  my $input_fh = $self->get_filehandle($input_file);
  croak "Could not get a filehandle for '$input_file'" if ! $input_fh;

  if(-f $target_file) {
    print "Target file '${target_file}' already exists; removing\n" if $verbose;
    unlink $target_file or croak "Could not remove '$target_file': $!";
  }
  my $output_fh = IO::File->new($target_file, 'w')
    or croak "Could not open '$target_file' for writing: $!";

  # The source id is written into every row, so it has to be handed on.
  $self->_transfer_contents($input_fh, $output_fh, $source_id);

  close($input_fh);
  # Buffered write errors only surface on close.
  close($output_fh) or croak "Could not close '$target_file': $!";

  $self->_load_table($target_file, $verbose);

  return;
}

# _transfer_contents
#
#   Copies '<accession> <checksum>' lines from the input handle to the output
#   handle as tab-separated rows of: running counter (starting at 1),
#   source id, accession, checksum.
#
#   Arg [1]    : Input filehandle
#   Arg [2]    : Output filehandle
#   Arg [3]    : Source id written into every row
#   Returntype : None
#   Note       : Blank lines are skipped silently; lines without both fields
#                are skipped with a warning and do not consume a counter value.
sub _transfer_contents {
  my ($self, $input_fh, $output_fh, $source_id) = @_;
  my $counter = 1;
  while(my $line = <$input_fh>) {
    chomp $line;
    next if $line !~ /\S/;

    # split(' ') also ignores leading whitespace.
    my ($upi, $checksum) = split(' ', $line);
    if(! defined $checksum) {
      carp "Skipping malformed line '$line'";
      next;
    }

    # Must go to the output handle; a bare print would write to STDOUT and
    # leave the load file empty.
    print {$output_fh} join("\t", $counter++, $source_id, $upi, $checksum), "\n";
  }
  return;
}

# _load_table
#
#   Empties the checksum table if it already holds rows and then bulk-loads
#   the given file into it.
#
#   Arg [1]    : Path of the tab-separated file to load
#   Arg [2]    : Boolean; when true progress is printed to STDOUT
#   Returntype : None
#   Note       : assumes dbi() returns a DBI database handle - the calls
#                below are all DBI methods; confirm against BaseParser.
sub _load_table {
  my ($self, $file, $verbose) = @_;
  my $dbh = $self->dbi();
  my ($count) = $dbh->selectrow_array('select count(*) from '.$TABLE_NAME);
  if($count) {
    print "'$TABLE_NAME' has rows; truncating\n" if $verbose;
    $dbh->do('truncate table '.$TABLE_NAME);
  }
  print "Loading data into '$TABLE_NAME' from '$file'\n" if $verbose;
  # The statement needs the TABLE keyword, and the path is quoted by the
  # driver so that quotes in the file name cannot break the SQL.
  my $load = sprintf('LOAD DATA LOCAL INFILE %s INTO TABLE %s', $dbh->quote($file), $TABLE_NAME);
  $dbh->do($load);
  print "Finished loading data into '$TABLE_NAME'\n" if $verbose;
  return;
}

1;
a/misc-scripts/xref_mapping/xref_config.ini +++ b/misc-scripts/xref_mapping/xref_config.ini @@ -406,6 +406,16 @@ parser = FlybaseParser release_uri = data_uri = ftp://ftp.flybase.net/genomes/Drosophila_melanogaster/dmel_r5.39_FB2011_07/gff/dmel-all-*.gff.gz +[source ArrayExpress::MULTI] +# Used by all +name = ArrayExpress +download = Y +order = 50 +priority = 1 +prio_descr = +parser = ArrayExpressParser +release_uri = +data_uri = http://www.ebi.ac.uk/gxa/dataExport/organisms [source CCDS::homo_sapiens] # Used by homo_sapiens @@ -2879,15 +2889,14 @@ parser = UniProtAltParser release_uri = [source UniParc::MULTI] -# Special source used by the UniParc checksum mapper -name = UniParc -download = N -order = 20 -priority = 1 -prio_descr = -parser = none -release_uri = - +name = UniParc +download = Y +order = 20 +priority = 1 +prio_descr = +parser = UniParcParser +release_uri = +data_uri = ftp://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis [source HGNC_curated_transcript::homo_sapiens] name = HGNC_curated_transcript_notransfer @@ -5699,6 +5708,7 @@ source = UniGene::sus_scrofa source = Uniprot/SPTREMBL::MULTI source = Uniprot/SWISSPROT::MULTI source = Uniprot/SWISSPROT::DIRECT +source = UniParc::MULTI source = RFAM::MULTI source = miRBase::MULTI source = goslim_goa::MULTI