use IPC::Open3;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use vars '@ISA';
@ISA = qw{ XrefMapper::db };
=head1 NAME
This is the basic mapper routine. It will create the necessary fasta files for
both the xref and ensembl sequences. These will then be matched using exonerate
and the results written to another file. By creating a <species>.pm file and
inheriting from this base class, different matching routines, parameters, data
sets etc. can be set.

Ian Longden
=head1 CONTACT
Post questions to the EnsEMBL development list
# Hashes to hold method-specific thresholds
# Both are keyed on mapping method name; run_mapping() fills them from the
# method object's query_identity_threshold() / target_identity_threshold().
my %method_query_threshold;
my %method_target_threshold;
# Various useful variables.
my %translation_to_transcript;
my %transcript_to_translation;
my %genes_to_transcripts;
my %xref_to_source;
# key: "<ensembl object type>|<ensembl object id>", value: list of
# (non-offset) xref_ids mapped to that object.
my %object_xref_mappings;
# ensembl object id -> xref id -> "query_identity" / "target_identity"
# -> integer percentage identity.
my %object_xref_identities;
my %xref_descriptions;
my %xref_accessions;
=head2 dump_seqs
Arg[1]: xref object which holds info needed for the dump of xref
Description: Dumps out the files for the mapping. Xref object should hold
the value of the databases and source to be used.
Returntype : none
Exceptions : will die if species not known or an error occurs while
: trying to write to files.
Caller : general
my ($self) = @_;
Arg[1]: xref object which holds info on method and files.
Description: runs the mapping of the list of files with species methods
Returntype : none
Exceptions : none
Caller : general
my ($self) = @_;
my @list=();
my $i = 0;
foreach my $method (@{$self->method()}){
my @dna=();
push @dna, $method;
push @dna, $self->xref->dir."/xref_".$i."_dna.fasta";
push @dna, $self->ensembl_dna_file();
push @list, \@dna;
my @pep=();
push @pep, $method;
push @pep, $self->xref->dir."/xref_".$i."_peptide.fasta";
push @pep, $self->ensembl_protein_file();
push @list, \@pep;
=head2 get_species_id_from_species_name
Arg[1]: species name
Description: get the species_id from the database for the named database.
Example : my $id = get_species_id_from_species_name('homo_sapiens');
Returntype : int (species_id)
Exceptions : will die if species does not exist in given xref database.
Caller : general
# Look up the species_id for a species name in the xref database.
#
# Arg[1]  : xref object; must provide dbi() returning a DBI database handle
# Arg[2]  : species name, e.g. 'homo_sapiens'
# Returns : species_id taken from the first matching row of the species table
# Dies    : when no row matches, after listing the known species on STDERR
sub get_species_id_from_species_name{
    my ($xref,$species) = @_;

    my $dbi = $xref->dbi();

    # Bind the name as a placeholder instead of interpolating it into the SQL.
    my $sth = $dbi->prepare("select species_id from species where name = ?");
    $sth->execute($species);
    my @row = $sth->fetchrow_array();
    $sth->finish();

    # An empty list means "no such species". (defined @row is a fatal error
    # in perl 5.22 and later, so test the array itself.)
    return $row[0] if @row;

    print STDERR "Couldn't get ID for species ".$species."\n";
    print STDERR "It must be one of :-\n";
    $sth = $dbi->prepare("select name from species");
    $sth->execute();
    while(my @name_row = $sth->fetchrow_array()){
        print STDERR $name_row[0]."\n";
    }
    die("Please try again :-)\n");
}
=head2 get_set_lists
Description: specifies the list of databases and source to be used in the
: generation of one or more data sets.
Returntype : list of lists
Example : my @lists =@{$self->get_set_lists()};
Exceptions : none
Caller : dump_xref
# Return the data-set specification used by dump_xref.
#
# The result is a reference to a list of sets. Each set is a list whose first
# element is the mapping method name, followed by one [species, source] pair
# per selection; "*" in either position is a wildcard.
# Species-specific subclasses can override this to choose other sets.
sub get_set_lists{
    my ($self) = @_;

    # Examples of other layouts:
    # return [["ExonerateGappedBest1", ["homo_sapiens","Uniprot/SWISSPROT"]]];
    # return [["method1",["homo_sapiens","RefSeq"],["homo_sapiens","UniProtSwissProt"]],
    #         ["method2",[$self->species,"*"]],
    #         ["method3",["*","*"]]];

    return [["ExonerateGappedBest1", ["homo_sapiens","*"], ["mus_musculus", "*"]]];
}
=head2 get_source_id_from_source_name
Arg[1]: source name
Description: get the source_id from the database for the named source.
Example : my $id = get_source_id_from_source_name('RefSeq');
Returntype : int (source_id)
Exceptions : will die if source does not exist in given xref database.
Caller : general
# Look up the source_id for a source name in the xref database.
#
# Arg[1]  : xref object; must provide dbi() returning a DBI database handle
# Arg[2]  : source name, e.g. 'RefSeq'
# Returns : source_id taken from the first matching row of the source table
# Dies    : when no non-empty id is found, after listing the known sources
#           on STDERR
sub get_source_id_from_source_name{
    my ($xref, $source) = @_;

    my $dbi = $xref->dbi();

    # Bind the name as a placeholder instead of interpolating it into the SQL.
    my $sth = $dbi->prepare("select source_id from source where name = ?");
    $sth->execute($source);
    my @row = $sth->fetchrow_array();
    $sth->finish();

    if (defined $row[0] and $row[0] ne '') {
        return $row[0];
    }

    print STDERR "Couldn't get ID for source ".$source."\n";
    print STDERR "It must be one of :-\n";
    $sth = $dbi->prepare("select name from source");
    $sth->execute();
    while(my @name_row = $sth->fetchrow_array()){
        print STDERR $name_row[0]."\n";
    }
    die("Please try again :-)\n");
}
=head2 dump_xref
Arg[1]: xref object which holds info on method and files.
Description: Dumps the Xref data as fasta file(s)
Returntype : none
Exceptions : none
Caller : dump_seqs
sub dump_xref{
my ($self) = @_;
my $xref =$self->xref();
my @method=();
my @lists =@{$self->get_set_lists()};
my $i=0;
Ian Longden
my $skip = 1;
foreach my $list (@lists){
if(!-e $xref->dir()."/xref_".$i."_dna.fasta"){
$skip = 0;
if(!-e $xref->dir()."/xref_".$i."_peptide.fasta"){
Ian Longden
$skip = 0;
my $k = 0;
foreach my $list (@lists){
$method[$k++] = shift @$list;
Ian Longden
foreach my $list (@lists){
Ian Longden
# print "method->".@$list[0]."\n";
$method[$i] = shift @$list;
my $j = 0;
my @species_id=();
foreach my $element (@$list){
while(my $species = shift(@$element)){
# print $j.")\t".$species."\n";
if($species ne "*"){
$species_id[$j] = get_species_id_from_species_name($xref,$species);
$species_id[$j] = -1;
my $source = shift(@$element);
if($source ne "*"){
$source_id[$j] = get_source_id_from_source_name($xref,$source);
$source_id[$j] = -1;
Ian Longden
# print $j."\t".$source. "\t".$source_id[$j] ."\n";
# print $j."\t".$species."\t".$species_id[$j]."\n";
#method data fully defined now
Ian Longden
=head2 dump_subset
Arg[1]: xref object which holds info on files.
Arg[2]: list of species to use.
Arg[3]: list of sources to use.
Arg[4]: index to be used in file creation.
Description: Dumps the Xref data for one set of species/databases
Returntype : none
Exceptions : none
Caller : dump_xref
# Dump the xref sequences for one set of species/source selections as fasta.
#
# Arg[1] : xref object; provides dir() (output directory) and dbi()
# Arg[2] : reference to a list of species ids (values <= 0 mean "any species")
# Arg[3] : reference to a list of source ids, parallel to Arg[2]
#          (values <= 0 mean "any source")
# Arg[4] : index used in the output file names
#
# Writes <dir>/xref_<index>_dna.fasta and <dir>/xref_<index>_peptide.fasta,
# one record per row with the xref_id as header and the sequence wrapped at
# 60 characters. Dies if a file cannot be opened or closed.
sub dump_subset{
    my ($self,$xref,$rspecies_id,$rsource_id,$index) = @_;

    # generate or condition list for species and sources
    my $final_clause = '';
    my $use_all = 0;
    my @or_list;
    for my $j (0 .. $#{$rspecies_id}) {
        my @condition;
        if($$rspecies_id[$j] > 0){
            push @condition, "x.species_id=" . $$rspecies_id[$j];
        }
        if($$rsource_id[$j] > 0){
            push @condition, "x.source_id=" . $$rsource_id[$j];
        }

        # note if both source and species are * (-1) there's no need for a final clause
        if ( !@condition ) {
            $use_all = 1;
            last;
        }
        push @or_list, join (" AND ", @condition);
    }
    $final_clause = " AND ((" . join(") OR (", @or_list) . "))" unless ($use_all) ;

    for my $sequence_type ('dna', 'peptide') {
        my $filename = $xref->dir() . "/xref_" . $index . "_" . $sequence_type . ".fasta";
        open(my $xref_dump, '>', $filename) or die "Could not open $filename: $!";

        my $sql = "SELECT p.xref_id, p.sequence, x.species_id , x.source_id ";
        $sql .= " FROM primary_xref p, xref x ";
        $sql .= " WHERE p.xref_id = x.xref_id AND ";
        $sql .= " p.sequence_type ='$sequence_type' ";
        $sql .= $final_clause;

        # Only cap the dump when a maximum has actually been configured;
        # an undefined value would otherwise produce a malformed LIMIT.
        my $maxdump = $self->maxdump();
        $sql .= " LIMIT ".$maxdump." " if defined $maxdump;

        my $sth = $xref->dbi()->prepare($sql);
        $sth->execute();
        while(my @row = $sth->fetchrow_array()){
            # wrap the sequence at 60 characters per line
            $row[1] =~ s/(.{60})/$1\n/g;
            print {$xref_dump} ">".$row[0]."\n".$row[1]."\n";
        }

        # buffered write errors only surface at close
        close($xref_dump) or die "Could not close $filename: $!";
    }
    return;
}
=head2 dump_ensembl
Description: Dumps the ensembl data to a file in fasta format.
Returntype : none
Exceptions : none
Caller : dump_seqs
sub dump_ensembl{
my ($self) = @_;
=head2 fetch_and_dump_seq
Description: Dumps the ensembl data to a file in fasta format.
Returntype : none
Exceptions : will die if there are errors in db connection or file creation.
Caller : dump_ensembl
sub fetch_and_dump_seq{
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-species => $self->species(),
-dbname => $self->dbname(),
-host => $self->host(),
-port => $self->port(),
-pass => $self->password(),
-user => $self->user(),
-group => 'core');
Ian Longden
# if no directory set then dump in the current directory.
# store ensembl protein file name and open it
Ian Longden
if(defined($self->dumpcheck()) and -e $self->ensembl_protein_file() and -e $self->ensembl_dna_file()){
|| die("Could not open dna file for writing: ".$self->ensembl_dna_file."\n");
|| die("Could not open protein file for writing: ".$self->ensembl_protein_file."\n");
my $gene_adap = $db->get_GeneAdaptor();
my @gene_ids = @{$gene_adap->list_dbIDs()};
Ian Longden
my $max = undef;
$max = $self->maxdump();
my $i =0;
foreach my $gene_id (@gene_ids){
my $gene = $gene_adap->fetch_by_dbID($gene_id);
foreach my $transcript (@{$gene->get_all_Transcripts()}) {
Ian Longden
my $seq = $transcript->spliced_seq();
$seq =~ s/(.{60})/$1\n/g;
print DNA ">" . $transcript->dbID() . "\n" .$seq."\n";
my $trans = $transcript->translation();
my $translation = $transcript->translate();
Ian Longden
my $pep_seq = $translation->seq();
$pep_seq =~ s/(.{60})/$1\n/g;
print PEP ">".$trans->dbID()."\n".$pep_seq."\n";
last if(defined($max) and $i > $max);
=head2 ensembl_protein_file
Arg [1] : (optional) string $arg
the fasta file name for the ensembl proteins
Example : $file_name = $self->ensembl_protein_file();
Description: Getter / Setter for the protein ensembl fasta file
Returntype : string
Exceptions : none
(defined $arg) &&
($self->{_ens_prot_file} = $arg );
return $self->{_ens_prot_file};
=head2 ensembl_dna_file
Arg [1] : (optional) string $arg
the fasta file name for the ensembl dna
Example : $file_name = $self->ensembl_dna_file();
Description: Getter / Setter for the dna ensembl fasta file
Returntype : string
Exceptions : none
(defined $arg) &&
($self->{_ens_dna_file} = $arg );
return $self->{_ens_dna_file};
=head2 method
Arg [1] : (optional) list reference $arg
reference to a list of method names
Example : my @methods = @{$self->method()};
Description: Getter / Setter for the methods
Returntype : list
Exceptions : none
# Getter / setter for the list of mapping method names.
#
# Arg[1]  : (optional) reference to a list of method names; stored when defined
# Returns : the stored list reference (undef if never set)
sub method{
    my ($self, $arg) = @_;

    $self->{_method} = $arg if defined $arg;
    return $self->{_method};
}
# Getter / setter for the xref object this mapper works with.
#
# Arg[1]  : (optional) xref object; stored when defined
# Returns : the stored xref object (undef if never set)
sub xref{
    my ($self, $arg) = @_;

    $self->{_xref} = $arg if defined $arg;
    return $self->{_xref};
}
=head2 run_mapping
Arg[1] : List of lists of (method, query, target)
Arg[2] :
Example : none
Description: Create and submit mapping jobs to LSF, and wait for them to finish.
Returntype : none
Exceptions : none
Caller : general
# Create and submit the mapping jobs, then wait for them to finish.
#
# Arg[1] : reference to a list of [method, query file, target file] lists
#
# For every entry the class XrefMapper::Methods::<method> is loaded and
# instantiated, and its identity thresholds are recorded in
# %method_query_threshold / %method_target_threshold. Unless
# use_existing_mappings is defined, old *.map/*.out/*.err files in dir() are
# removed first, each method's run() is called, and a dependent job is
# submitted that waits for all of them.
sub run_mapping {
    my ($self, $lists) = @_;

    my $use_existing = defined($self->use_existing_mappings);

    # delete old output files in target directory if we're going to produce new ones
    if (!$use_existing) {
        my $dir = $self->dir();
        unlink (<$dir/*.map $dir/*.out $dir/*.err>);
    }

    # foreach method, submit the appropriate job & keep track of the job name
    # note we check if use_existing_mappings is set here, not earlier, as we
    # still need to instantiate the method object in order to fill
    # method_query_threshold and method_target_threshold
    my @job_names;

    foreach my $list (@$lists){
        my ($method, $queryfile ,$targetfile) = @$list;

        my $obj_name = "XrefMapper::Methods::$method";

        # check that the appropriate object exists; require by file name inside
        # a block eval so the method name is never evaluated as Perl code
        (my $module_file = "$obj_name.pm") =~ s{::}{/}g;
        if (!eval { require $module_file; 1 }) {
            warn("Could not find object $obj_name corresponding to mapping method $method, skipping\n$@");
            next;
        }

        my $obj = $obj_name->new();
        $method_query_threshold{$method} = $obj->query_identity_threshold();
        $method_target_threshold{$method} = $obj->target_identity_threshold();

        if (!$use_existing) {
            my $job_name = $obj->run($queryfile, $targetfile, $self->dir());
            push @job_names, $job_name;
            sleep 1; # make sure unique names really are unique
        }
    } # foreach method

    if (!$use_existing) {
        # submit depend job to wait for all mapping jobs
        submit_depend_job($self->dir, @job_names);
    }
    return;
}
=head2 submit_depend_job
Arg[1] : Root directory in which depend.out and depend.err are written.
Arg[2..] : List of job names to wait for.
Example : none
Description: Submit an LSF job that waits for other jobs to finish.
Returntype : none
Exceptions : none
Caller : general
# Submit an LSF job that waits for other jobs to finish.
#
# Arg[1]   : root directory; depend.out / depend.err are written there
# Arg[2..] : names of the jobs to wait for
#
# Runs "bsub -K" with one -wended(<job>) clause per job name and /bin/true as
# the command, reads bsub's output to pick up the LSF job id, and prints it.
# Prints an error to STDERR if no job id could be parsed.
sub submit_depend_job {
    my ($root_dir, @job_names) = @_;

    # Submit a job that does nothing but wait on the main jobs to
    # finish. This job is submitted interactively so the exec does not
    # return until everything is finished.

    # build up the bsub command; first part
    my @depend_bsub = ('bsub', '-K');

    # one -wended clause for each main job
    foreach my $job (@job_names) {
        push @depend_bsub, "-wended($job)";
    }

    # rest of command
    push @depend_bsub, ('-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err", '/bin/true');

    #print "##depend bsub:\n" . join (" ", @depend_bsub) . "\n";

    my ($depend_wtr, $depend_rtr, $depend_etr, $depend_pid);
    $depend_pid = open3($depend_wtr, $depend_rtr, $depend_etr, @depend_bsub);

    my $depend_jobid;
    while (my $line = <$depend_rtr>) {
        if ($line =~ /Job <([0-9]+)> is/) {
            $depend_jobid = $1;
            print "LSF job ID for depend job: $depend_jobid \n" ;
        }
    }

    # open3 does not reap the child itself; collect it once its output ends
    waitpid($depend_pid, 0);

    if (!defined($depend_jobid)) {
        print STDERR "Error: could not get depend job ID\n";
    }
    return;
}
Arg[1] : The target file used in the exonerate run. Used to work out the Ensembl object type.
Arg[2] :
Example : none
Description: Parse exonerate output files and build files for loading into target db tables.
Returntype : List of strings
Exceptions : none
Caller : general
my $dir = $self->dir();
# get current max object_xref_id
# TODO use selectall_arrayref
my $row = @{$self->dbi()->selectall_arrayref("SELECT MAX(object_xref_id) FROM object_xref")}[0];
my $max_object_xref_id = @{$row}[0];
if (!defined $max_object_xref_id) {
print "Can't get highest existing object_xref_id, using 1\n";
} else {
print "Maximum existing object_xref_id = $max_object_xref_id\n";
$max_object_xref_id = 1;
$row = @{$self->dbi->selectall_arrayref("SELECT MAX(xref_id) FROM xref")}[0];
my $max_xref_id = @$row[0];
if (!defined $max_xref_id) {
print "Can't get highest existing xref_id, using 0\n)";
} else {
print "Maximum existing xref_id = $max_xref_id\n";
$max_object_xref_id = 1;
#my $ox_sth = $dbi->prepare("INSERT INTO object_xref(ensembl_id, ensembl_object_type, xref_id) VALUES(?,?,?)");
#my $ix_sth = $dbi->prepare("INSERT INTO identity_xref VALUES(?,?,?,?,?,?,?,?,?,?,?)");
# files to write table data to
open (OBJECT_XREF, ">$dir/object_xref.txt");
open (IDENTITY_XREF, ">$dir/identity_xref.txt");
my $total_lines = 0;
my $total_files = 0;
my $object_xref_id = $max_object_xref_id + 1;
# keep a (unique) list of xref IDs that need to be written out to file as well
# this is a hash of hashes, keyed on xref id that relates xrefs to e! objects (may be 1-many)
my %primary_xref_ids = ();
# also keep track of types of ensembl objects
my %ensembl_object_types;
# and a list of mappings of ensembl objects to xrefs
Glenn Proctor
# (primary now, dependent added in dump_core_xrefs)
# this is required for display_xref generation later
# format:
# key: ensembl object type:ensembl object id
# value: list of xref_id (with offset)
# Note %object_xref_mappings is global
foreach my $file (glob("$dir/*.map")) {
Glenn Proctor
#print "Parsing results from " . basename($file) . "\n";
open(FILE, $file);
# files are named Method_(dna|peptide)
my $type = get_ensembl_object_type($file);
my $method = get_method($file);
# get or create the appropriate analysis ID
# XXX restore when using writeable database
#my $analysis_id = $self->get_analysis_id($type);
my $analysis_id = 999;
while (<FILE>) {
my ($label, $query_id, $target_id, $identity, $query_length, $target_length, $query_start, $query_end, $target_start, $target_end, $cigar_line, $score) = split(/:/, $_);
$cigar_line =~ s/ //g;
# calculate percentage identities
my $query_identity = int (100 * $identity / $query_length);
my $target_identity = int (100 * $identity / $target_length);
# only take mappings where there is a good match on one or both sequences
next if ($query_identity < $method_query_threshold{$method} &&
$target_identity < $method_target_threshold{$method});
# note we add on $xref_id_offset to avoid clashes
print OBJECT_XREF "$object_xref_id\t$target_id\t$type\t" . ($query_id+$xref_id_offset) . "\n";
print IDENTITY_XREF join("\t", ($object_xref_id, $query_identity, $target_identity, $query_start+1, $query_end, $target_start+1, $target_end, $cigar_line, $score, "\\N", $analysis_id)) . "\n";
# TODO - evalue?
$ensembl_object_types{$target_id} = $type;
# store mapping for later - note NON-OFFSET xref_id is used
my $key = $type . "|" . $target_id;
my $xref_id = $query_id;
push @{$object_xref_mappings{$key}}, $xref_id;
# store query & target identities
# Note this is a hash (object id) of hashes (xref id) of hashes ("query_identity" or "target_identity")
$object_xref_identities{$target_id}->{$xref_id}->{"query_identity"} = $query_identity;
$object_xref_identities{$target_id}->{$xref_id}->{"target_identity"} = $target_identity;
# note the NON-OFFSET xref_id is stored here as the values are used in
# a query against the original xref database
$primary_xref_ids{$query_id}{$target_id} = $target_id;
#print "After $file, lines read increased by " . ($total_lines-$last_lines) . "\n";
$last_lines = $total_lines;
print "Read $total_lines lines from $total_files exonerate output files\n";
# write relevant xrefs to file
$self->dump_core_xrefs(\%primary_xref_ids, $object_xref_id+1, $xref_id_offset, \%ensembl_object_types);
# write comparison info. Can be removed after development
# Work out the Ensembl object type from an exonerate output file name.
#
# Arg[1]  : file name or path; only the base name is inspected
# Returns : "Transcript" when the base name contains _dna_,
#           "Translation" when it contains _peptide_ (both case-insensitive),
#           undef otherwise (a message is then printed to STDERR)
sub get_ensembl_object_type {
    my $filename = shift;

    $filename = basename($filename);

    return "Transcript"  if $filename =~ /_dna_/i;
    return "Translation" if $filename =~ /_peptide_/i;

    print STDERR "Cannot deduce Ensembl object type from filename $filename\n";
    return undef;
}
# Extract the mapping method name from an exonerate output file name.
#
# Arg[1]  : file name or path of the form <method>_(dna|peptide)_<number>.map
# Returns : the <method> part, or undef when the base name does not match
sub get_method {
    my $filename = shift;

    $filename = basename($filename);

    my ($method) = $filename =~ /^(.*)_(?:dna|peptide)_\d+\.map/;

    return $method;
}
# Get the analysis_id for the exonerate analysis of a sequence/object type,
# creating the analysis row if it does not exist yet.
#
# Arg[1]  : type, case-insensitive: 'dna' or 'protein', or the Ensembl object
#           types produced by get_ensembl_object_type ('Transcript',
#           'Translation')
# Returns : analysis_id of the existing or newly inserted row
# Dies    : when the type is not one of the known values
sub get_analysis_id {
    my ($self, $ensembl_type) = @_;

    # 'transcript'/'translation' are accepted alongside the original keys
    # because get_ensembl_object_type returns those names for dna/peptide files.
    my %typeToLogicName = ( 'dna'         => 'XrefExonerateDNA',
                            'protein'     => 'XrefExonerateProtein',
                            'transcript'  => 'XrefExonerateDNA',
                            'translation' => 'XrefExonerateProtein' );

    my $type_key   = defined $ensembl_type ? lc($ensembl_type) : '';
    my $logic_name = $typeToLogicName{$type_key};
    die "Cannot determine analysis logic_name for type '$type_key'\n"
        unless defined $logic_name;

    my $sth = $self->dbi()->prepare("SELECT analysis_id FROM analysis WHERE logic_name = ?");
    $sth->execute($logic_name);

    my $analysis_id;

    if (my @row = $sth->fetchrow_array()) {

        $analysis_id = $row[0];
        print "Found exising analysis ID ($analysis_id) for $logic_name\n";

    } else {

        print "No analysis with logic_name $logic_name found, creating ...\n";
        $sth = $self->dbi()->prepare("INSERT INTO analysis (logic_name, created) VALUES (?, NOW())");
        # TODO - other fields in analysis table
        $sth->execute($logic_name);
        # the id generated by the insert is only available after execute
        $analysis_id = $sth->{'mysql_insertid'};
        print "Done (analysis ID=" . $analysis_id. ")\n";

    }

    return $analysis_id;
}
Glenn Proctor
sub dump_core_xrefs {
my ($self, $xref_ids_hashref, $start_object_xref_id, $xref_id_offset, $ensembl_object_types_hashref) = @_;
my @xref_ids = keys %$xref_ids_hashref;
my %xref_to_objects = %$xref_ids_hashref;
my %ensembl_object_types = %$ensembl_object_types_hashref;
my $dir = $self->dir();
open (XREF, ">$dir/xref.txt");
open (OBJECT_XREF, ">>$dir/object_xref.txt");
open (EXTERNAL_SYNONYM, ">$dir/external_synonym.txt");
my $xref_dbi = $self->xref()->dbi();
my $core_dbi = $self->dbi();
# keep a unique list of source IDs to build the external_db table later
my %source_ids;
my $object_xref_id = $start_object_xref_id;
Glenn Proctor
# build cache of source id -> external_db id
my %source_to_external_db = $self->map_source_to_external_db();
# execute several queries with a max of 200 entries in each IN clause - more efficient
my $batch_size = 200;
# keep track of what xref_id & object_xref_ids have been written to prevent
# duplicates; e.g. several dependent xrefs may be dependent on the same master xref.
my %object_xrefs_written;
while(@xref_ids) {
my @ids;
if($#xref_ids > $batch_size) {
@ids = splice(@xref_ids, 0, $batch_size);
} else {
@ids = splice(@xref_ids, 0);
my $id_str;
if(@ids > 1) {
$id_str = "IN (" . join(',', @ids). ")";
} else {
$id_str = "= " . $ids[0];
my $sql = "SELECT * FROM xref WHERE xref_id $id_str";
my $xref_sth = $xref_dbi->prepare($sql);
my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id, $master_xref_id);
$xref_sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id);
# note the xref_id we write to the file is NOT the one we've just read
Glenn Proctor
# from the internal xref database as the ID may already exist in the
# core database so we add on $xref_id_offset
while ($xref_sth->fetch()) {
# make sure label is set to /something/ so that the website displays something
$label = $accession if (!$label);
if (!$xrefs_written{$xref_id}) {
my $external_db_id = $source_to_external_db{$source_id};
if ($external_db_id) { # skip "unknown" sources
print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
$xrefs_written{$xref_id} = 1;
$source_ids{$source_id} = $source_id;
# Now get the dependent xrefs for each of these xrefs and write them as well
Glenn Proctor
$sql = "SELECT DISTINCT(x.xref_id), dx.master_xref_id, x.accession, x.label, x.description, x.source_id, x.version FROM dependent_xref dx, xref x WHERE x.xref_id=dx.dependent_xref_id AND master_xref_id $id_str";
my $dep_sth = $xref_dbi->prepare($sql);
Glenn Proctor
$dep_sth->bind_columns(\$xref_id, \$master_xref_id, \$accession, \$label, \$description, \$source_id, \$version);