Skip to content
Snippets Groups Projects
BasicMapper.pm 64.7 KiB
Newer Older
Ian Longden's avatar
Ian Longden committed
package XrefMapper::BasicMapper;
Ian Longden's avatar
Ian Longden committed

use File::Basename;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
Ian Longden's avatar
Ian Longden committed
use Bio::EnsEMBL::Translation;
use XrefMapper::db;
Ian Longden's avatar
Ian Longden committed
use vars '@ISA';
Ian Longden's avatar
Ian Longden committed

Ian Longden's avatar
Ian Longden committed
=head1 NAME

XrefMapper::BasicMapper

=head1 DESCIPTION

This is the basic mapper routine. It will create the necessary fasta files for
both the xref and ensembl sequences. These will then be matched using exonerate
and the results written to another file. By creating a <species>.pm file and 
inheriting from this base class different matching routines, parameters, data 
sets etc can be set.
Ian Longden's avatar
Ian Longden committed

=head1 CONTACT

Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk


=cut

# Hashes to hold method-specific thresholds
my %method_query_threshold;
my %method_target_threshold;

# Various useful variables.
my %translation_to_transcript;
my %transcript_to_translation;
my %genes_to_transcripts;
my %xref_to_source;
my %object_xref_mappings;
my %object_xref_identities;
my %xref_descriptions;
my %xref_accessions;
my %xrefs_written;
my %object_xrefs_written;
=head2 new

  Description: Constructor for BasicMapper.
  Returntype : BasicMapper
  Exceptions : none
  Caller     : general

=cut

sub new{
  my($class, @args) = @_;

  my $self ={};
  bless $self,$class;

  return $self;
}

Ian Longden's avatar
Ian Longden committed
=head2 dump_seqs

  Arg[1]: xref object which holds info needed for the dump of xref

  Description: Dumps out the files for the mapping. Xref object should hold
              the value of the databases and source to be used.
  Returntype : none
  Exceptions : will die if species not known or an error occurs while
             : trying to write to files. 
  Caller     : general
 
=cut
 

Ian Longden's avatar
Ian Longden committed

=head2 build_list_and_map
Ian Longden's avatar
Ian Longden committed

  Arg[1]: xref object which holds info on method and files.

Glenn Proctor's avatar
Glenn Proctor committed
  Description: runs the mapping of the list of files with species methods
Ian Longden's avatar
Ian Longden committed
  Returntype : none
  Exceptions : none
  Caller     : general

=cut

sub build_list_and_map {


  my @list=();

  my $i = 0;
  foreach my $method (@{$self->method()}){
    my @dna=();
    push @dna, $method;
    push @dna, $self->xref->dir."/xref_".$i."_dna.fasta";
    push @list, \@dna;
    my @pep=();
    push @pep, $method;
Ian Longden's avatar
Ian Longden committed
    push @pep, $self->xref->dir."/xref_".$i."_peptide.fasta";
  $self->run_mapping(\@list);
Ian Longden's avatar
Ian Longden committed
=head2 get_species_id_from_species_name

  Arg[1]: species name

  Description: get the species_id from the database for the named database.
  Example    : my $id = get_species_id_from_species_name('homo_sapiens');
  Returntype : int (species_id)
  Exceptions : will die if species does not exist in given xref database.
  Caller     : general

=cut

sub get_species_id_from_species_name{
  my $sql = "select species_id from species where name = '".$species."'";
  $sth->execute();
  my @row = $sth->fetchrow_array();
  my $species_id;
    $species_id = $row[0];
  } else {
    print STDERR "Couldn't get ID for species ".$species."\n";
    print STDERR "It must be one of :-\n";
    $sql = "select name from species";
    $sth->execute();
    while(my @row = $sth->fetchrow_array()){
      print STDERR $row[0]."\n";
    }
    die("Please try again :-)\n");
Ian Longden's avatar
Ian Longden committed
  $sth->finish();
Ian Longden's avatar
Ian Longden committed
=head2 get_set_lists

  Description: specifies the list of databases and source to be used in the
             : generation of one or more data sets.
  Returntype : list of lists
  Example    : my @lists =@{$self->get_set_lists()};
  Exceptions : none
  Caller     : dump_xref

=cut

  return [["ExonerateGappedBest1", ["*","*"]]];
Ian Longden's avatar
Ian Longden committed
=head2 get_source_id_from_source_name

  Arg[1]: source name

  Description: get the source_id from the database for the named source.
  Example    : my $id = get_source_id_from_source_name('RefSeq');
  Returntype : int (source_id)
  Exceptions : will die if source does not exist in given xref database.
  Caller     : general

=cut

sub get_source_id_from_source_name{
  my $source_id;
  
  my $sql = "select source_id from source where name = '".$source."'";
  $sth->execute();
  my @row = $sth->fetchrow_array();
  if (defined $row[0] and $row[0] ne '') {
    $source_id = $row[0];
  } else {
    print STDERR "Couldn't get ID for source ".$source."\n";
    print STDERR "It must be one of :-\n";
    $sql = "select name from source";
    $sth->execute();
    while(my @row = $sth->fetchrow_array()){
      print STDERR $row[0]."\n";
    }
    die("Please try again :-)\n");
Ian Longden's avatar
Ian Longden committed
  $sth->finish();
Ian Longden's avatar
Ian Longden committed

=head2 dump_xref

  Arg[1]: xref object which holds info on method and files.

  Description: Dumps the Xref data as fasta file(s)
  Returntype : none
  Exceptions : none
  Caller     : dump_seqs

=cut

  if(!defined($xref->dir())){
    if(defined($self->dir)){
      $xref->species($self->dir);
    }
    else{
      $xref->dir(".");
    }
  }
  
  my @method=();
  
  my @lists =@{$self->get_set_lists()};
  
  my $i=0;
  if(defined($self->dumpcheck())){
    my $skip = 1;
    foreach my $list (@lists){
      if(!-e $xref->dir()."/xref_".$i."_dna.fasta"){ 
	$skip = 0;
      }
Ian Longden's avatar
Ian Longden committed
      if(!-e $xref->dir()."/xref_".$i."_peptide.fasta"){ 
Ian Longden's avatar
Ian Longden committed
      my $k = 0;
      foreach my $list (@lists){
	$method[$k++] = shift @$list;
      }
      $self->method(\@method);
  foreach my $list (@lists){
    $method[$i] = shift @$list;
Ian Longden's avatar
Ian Longden committed
    my @source_id=();
    my @species_id=();
    foreach my $element (@$list){
      while(my $species = shift(@$element)){
	if($species ne "*"){
	  $species_id[$j] = get_species_id_from_species_name($xref,$species);
	}
	else{
	  $species_id[$j] = -1;
	}
	my $source = shift(@$element);
	if($source ne "*"){
	  $source_id[$j] = get_source_id_from_source_name($xref,$source);
	}
	else{
	  $source_id[$j] = -1;
	}
	$j++;
      }
    }
    #method data fully defined now
    $self->dump_subset($xref,\@species_id,\@source_id,$i);    
Ian Longden's avatar
Ian Longden committed
=head2 dump_subset

  Arg[1]: xref object which holds info on files.
  Arg[2]: list of species to use.
  Arg[3]: list of sources to use.
  Arg[4]: index to be used in file creation.
  
  Description: Dumps the Xref data for one set of species/databases
  Returntype : none
  Exceptions : none
  Caller     : dump_xref

=cut

  my ($self,$xref,$rspecies_id,$rsource_id,$index) = @_;
  # generate or condition list for species and sources
  my $final_clause;
  my $use_all = 0;
  my @or_list;
  for (my $j = 0; $j < scalar(@$rspecies_id); $j++){
    my @condition;
    if($$rspecies_id[$j] > 0){
      push @condition, "x.species_id=" . $$rspecies_id[$j];
    if($$rsource_id[$j] > 0){
      push @condition, "x.source_id=" . $$rsource_id[$j];
Ian Longden's avatar
Ian Longden committed
    }
    # note if both source and species are * (-1) there's no need for a final clause

    if ( !@condition ) {
      $use_all = 1;
      last;
    }

    push @or_list, join (" AND ", @condition);

  $final_clause = " AND ((" . join(") OR (", @or_list) . "))" unless ($use_all) ;


  for my $sequence_type ('dna', 'peptide') {

    my $filename = $xref->dir() . "/xref_" . $index . "_" . $sequence_type . ".fasta";
    open(XREF_DUMP,">$filename") || die "Could not open $filename";

    my $sql = "SELECT p.xref_id, p.sequence, x.species_id , x.source_id ";
    $sql   .= "  FROM primary_xref p, xref x ";
    $sql   .= "  WHERE p.xref_id = x.xref_id AND ";
    $sql   .= "        p.sequence_type ='$sequence_type' ";
    $sql   .= $final_clause;

    if(defined($self->maxdump())){
      $sql .= " LIMIT ".$self->maxdump()." ";
    $sth->execute();
    while(my @row = $sth->fetchrow_array()){

      print XREF_DUMP ">".$row[0]."\n".$row[1]."\n";

Ian Longden's avatar
Ian Longden committed
    }
Ian Longden's avatar
Ian Longden committed
=head2 dump_ensembl

  Description: Dumps the ensembl data to a file in fasta format.
  Returntype : none
  Exceptions : none
  Caller     : dump_seqs

=cut

Ian Longden's avatar
Ian Longden committed

Ian Longden's avatar
Ian Longden committed
=head2 fetch_and_dump_seq

  Description: Dumps the ensembl data to a file in fasta format.
  Returntype : none
  Exceptions : wil die if the are errors in db connection or file creation.
  Caller     : dump_ensembl

=cut

sub fetch_and_dump_seq{
Ian Longden's avatar
Ian Longden committed

  my $ensembl = $self->core;
  my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-dbconn => $ensembl->dbc);
Ian Longden's avatar
Ian Longden committed

  #
  # store ensembl dna file name and open it
  #
  if(!defined($ensembl->dir())){
    $ensembl->dir(".");
  $ensembl->dna_file($ensembl->dir."/".$ensembl->species."_dna.fasta");


Ian Longden's avatar
Ian Longden committed
  #
  # store ensembl protein file name and open it
  #
  $ensembl->protein_file($ensembl->dir."/".$ensembl->species."_protein.fasta");

  if(defined($self->dumpcheck()) and -e $ensembl->protein_file() and -e $ensembl->dna_file()){
  open(DNA,">".$ensembl->dna_file()) 
    || die("Could not open dna file for writing: ".$ensembl->dna_file."\n");
  open(PEP,">".$ensembl->protein_file()) 
    || die("Could not open protein file for writing: ".$ensembl->protein_file."\n");
Ian Longden's avatar
Ian Longden committed

  my $gene_adaptor = $db->get_GeneAdaptor();


  # fetch by location, or everything if not defined
  my @genes;
  if ($location) {

    my $slice_adaptor = $db->get_SliceAdaptor();
    my $slice = $slice_adaptor->fetch_by_name($location);
    @genes = @{$gene_adaptor->fetch_all_by_Slice($slice)};

  } else {

    @genes = @{$gene_adaptor->fetch_all()};

  }

  my $max = undef;
  if(defined($self->maxdump())){
    $max = $self->maxdump();
  }
Ian Longden's avatar
Ian Longden committed
  my $i =0;
    foreach my $transcript (@{$gene->get_all_Transcripts()}) {
      my $seq = $transcript->spliced_seq(); 
      $seq =~ s/(.{60})/$1\n/g;
Ian Longden's avatar
Ian Longden committed
      print DNA ">" . $transcript->dbID() . "\n" .$seq."\n";
      my $trans = $transcript->translation();
      my $translation = $transcript->translate();
      if(defined($translation)){
	my $pep_seq = $translation->seq();
	$pep_seq =~ s/(.{60})/$1\n/g;
	print PEP ">".$trans->dbID()."\n".$pep_seq."\n";
      }
Ian Longden's avatar
Ian Longden committed
    }
Ian Longden's avatar
Ian Longden committed
  close DNA;
  close PEP;
Ian Longden's avatar
Ian Longden committed

###
# Getter/Setter methods
###




  Arg [1]    : (optional) list reference $arg
               reference to a list of method names 
  Example    : my @methods = @{$self->method()};
  Description: Getter / Setter for the methods 
  Returntype : list
Ian Longden's avatar
Ian Longden committed
  Exceptions : none

=cut
Ian Longden's avatar
Ian Longden committed

Ian Longden's avatar
Ian Longden committed
  my ($self, $arg) = @_;
Ian Longden's avatar
Ian Longden committed

Ian Longden's avatar
Ian Longden committed
  (defined $arg) &&
    ($self->{_method} = $arg );
  return $self->{_method};
Ian Longden's avatar
Ian Longden committed
}
Ian Longden's avatar
Ian Longden committed
=head2 core
 
  Arg [1]    : (optional) 
  Example    : $mapper->core($new_core);
  Description: Getter / Setter for the core. 
               info for the ensembl core database. 
  Returntype : XrefMapper::db
  Exceptions : none

=cut

  (defined $arg) &&
    ($self->{_core} = $arg );
  return $self->{_core};
}
Ian Longden's avatar
Ian Longden committed

=head2 dumpcheck
 
  Arg [1]    : (optional) 
  Example    : $mapper->dumpcheck("yes");
  Description: Getter / Setter for dumpcheck. 
               If set the mapper will not dump fasta files 
               if they exist already. 
  Returntype : scalar
  Exceptions : none

=cut

Ian Longden's avatar
Ian Longden committed
  my ($self, $arg) = @_;
Ian Longden's avatar
Ian Longden committed

Ian Longden's avatar
Ian Longden committed
  (defined $arg) &&
    ($self->{_dumpcheck} = $arg );
  return $self->{_dumpcheck};
Ian Longden's avatar
Ian Longden committed
}
Ian Longden's avatar
Ian Longden committed
=head2 maxdump
 
  Arg [1]    : (optional) 
  Example    : $mapper->maxdump(10);
  Description: Getter / Setter for maxdump. 
               If set the mapper will only dump that number of 
               sequences into the fasta files. (Mainly used for testing).
  Returntype : scalar
  Exceptions : none

=cut

  (defined $arg) &&
    ($self->{_maxdump} = $arg );
  return $self->{_maxdump};
}
Ian Longden's avatar
Ian Longden committed


=head2 use_existing_mappings

  Arg [1]    : (optional) 
  Example    : $mapper->use_existing_mappings("yes");
  Description: Getter / Setter for use_existing_mappings. 
               If set the mapper will not redo the mapping
               but parse the existing .map files.
  Returntype : scalar
  Exceptions : none

=cut

  my ($self, $arg) = @_;

  (defined $arg) &&
    ($self->{_use_existing_mappings} = $arg );
  return $self->{_use_existing_mappings};
Ian Longden's avatar
Ian Longden committed

=head2 xref
 
  Arg [1]    : (optional) 
  Example    : $mapper->core($new_core);
  Description: Getter / Setter for the core. 
               info for the xref database. 
  Returntype : XrefMapper::db
  Exceptions : none

=cut

sub xref{
  my ($self, $arg) = @_;

  (defined $arg) &&
    ($self->{_xref} = $arg );
  return $self->{_xref};
}

Ian Longden's avatar
Ian Longden committed


  Arg[1]     : List of lists of (method, query, target)
  Arg[2]     :
  Example    : none
  Description: Create and submit mapping jobs to LSF, and wait for them to finish.
  Returntype : none
  Exceptions : none
  Caller     : general

=cut

sub run_mapping {

  my ($self, $lists) = @_;

  # delete old output files in target directory if we're going to produce new ones
Ian Longden's avatar
Ian Longden committed
  if (!defined($self->use_existing_mappings())) {
    print "deleting out err and map files from output dir\n";
    unlink (<$dir/*.map $dir/*.out $dir/*.err>);
  }
  #disconnect so that we can then reconnect after the long mapping bit.
  $self->core->dbc->disconnect_if_idle(1);
  $self->xref->dbc->disconnect_if_idle(1);
  $self->core->dbc->disconnect_when_inactive(1);
  $self->xref->dbc->disconnect_when_inactive(1);

  # foreach method, submit the appropriate job & keep track of the job name
  # note we check if use_existing_mappings is set here, not earlier, as we
  # still need to instantiate the method object in order to fill
  # method_query_threshold and method_target_threshold

  my @job_names;

  foreach my $list (@$lists){

    my ($method, $queryfile ,$targetfile)  =  @$list;

    my $obj_name = "XrefMapper::Methods::$method";
    # check that the appropriate object exists
    eval "require $obj_name";
    if($@) {

      warn("Could not find object $obj_name corresponding to mapping method $method, skipping\n$@");

    } else {

      my $obj = $obj_name->new();
      $method_query_threshold{$method} = $obj->query_identity_threshold();
      $method_target_threshold{$method} = $obj->target_identity_threshold();
      if (!defined($self->use_existing_mappings)) {
	my $job_name = $obj->run($queryfile, $targetfile, $self->core->dir());
	push @job_names, $job_name;
	sleep 1; # make sure unique names really are unique
      }
  if (!defined($self->use_existing_mappings)) {
    # submit depend job to wait for all mapping jobs
    submit_depend_job($self->core->dir, @job_names);
} # run_mapping
=head2 submit_depend_job

  Arg[1]     : List of job names.
  Arg[2]     :
  Example    : none
  Description: Submit an LSF job that waits for other jobs to finish.
  Returntype : none
  Exceptions : none
  Caller     : general

=cut

sub submit_depend_job {

  my ($root_dir, @job_names) = @_;

  # Submit a job that does nothing but wait on the main jobs to
  # finish. This job is submitted interactively so the exec does not
  # return until everything is finished.

  # build up the bsub command; first part
  my @depend_bsub = ('bsub', '-K');

Glenn Proctor's avatar
 
Glenn Proctor committed
  # build -w 'ended(job1) && ended(job2)' clause
  my $ended_str = "-w ";
  my $i = 0;
  foreach my $job (@job_names) {
Glenn Proctor's avatar
 
Glenn Proctor committed
    $ended_str .= "ended($job)";
    $ended_str .= " && " if ($i < $#job_names);
    $i++;
Glenn Proctor's avatar
 
Glenn Proctor committed
  push @depend_bsub, $ended_str;

Glenn Proctor's avatar
 
Glenn Proctor committed
  push @depend_bsub, ('-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err");
  #print "##depend bsub:\n" . join (" ", @depend_bsub) . "\n";
Glenn Proctor's avatar
 
Glenn Proctor committed
  my $jobid = 0;
Glenn Proctor's avatar
 
Glenn Proctor committed
  eval {
    my $pid;
    my $reader;
Glenn Proctor's avatar
 
Glenn Proctor committed
    local *BSUB;
    local *BSUB_READER;

    if (($reader = open(BSUB_READER, '-|'))) {
      while (<BSUB_READER>) {
	if (/^Job <(\d+)> is submitted/) {
	  $jobid = $1;
	  print "LSF job ID for depend job: $jobid\n"
	}
      }
      close(BSUB_READER);
    } else {
      die("Could not fork : $!\n") unless (defined($reader));
      open(STDERR, ">&STDOUT");
      if (($pid = open(BSUB, '|-'))) {
	
	print BSUB "/bin/true\n";
	close BSUB;
	if ($? != 0) {
	  die("bsub exited with non-zero status ($?) - job not submitted\n");
	}
      } else {
	if (defined($pid)) {
	  exec(@depend_bsub);
	  die("Could not exec bsub : $!\n");
	} else {
	  die("Could not fork : $!\n");
	}
      }
      exit(0);
    }
  };

  if ($@) {
    # Something went wrong
    warn("Job submission failed:\n$@\n");
  }
=head2 parse_mappings

  Example    : none
  Description: Parse exonerate output files and build files for loading into target db tables.
  Returntype : List of strings
  Exceptions : none
  Caller     : general

=cut

sub parse_mappings {
  my $ensembl = $self->core;
  my $xref = $self->xref;
  my $dir = $ensembl->dir();
  $ensembl->dbc->disconnect_if_idle(0);
  $ensembl->dbc->disconnect_when_inactive(0);

  $xref->dbc->disconnect_if_idle(0);
  $xref->dbc->disconnect_when_inactive(0);
  # cache xref id->label info; useful for debugging
#  $self->cache_xref_labels($xref->dbc);
  # get current max object_xref_id
  my $row = @{$ensembl->dbc->db_handle->selectall_arrayref("SELECT MAX(object_xref_id) FROM object_xref")}[0];
  my $max_object_xref_id = @{$row}[0];
  if (!defined $max_object_xref_id) {
    print "No existing object_xref_ids, will start from 1\n";
  } else {
    print "Maximum existing object_xref_id = $max_object_xref_id\n";
  }
  my $object_xref_id_offset = $max_object_xref_id + 1;
  my $object_xref_id = $object_xref_id_offset;
  $row = @{$ensembl->dbc->db_handle->selectall_arrayref("SELECT MAX(xref_id) FROM xref")}[0];
Glenn Proctor's avatar
Glenn Proctor committed
  if (!defined $max_xref_id) {
    print "No existing xref_ids, will start from 1\n";
    $max_xref_id = -1; # so that generated xref_ids will be the same as in xref db
Glenn Proctor's avatar
Glenn Proctor committed
  } else {
    print "Maximum existing xref_id = $max_xref_id\n";
  }
  my $xref_id_offset = $max_xref_id + 1;

  # files to write table data to
  open (OBJECT_XREF,   ">$dir/object_xref.txt");
  open (IDENTITY_XREF, ">$dir/identity_xref.txt");
  my $last_lines = 0;
  my $total_files = 0;

  # keep a (unique) list of xref IDs that need to be written out to file as well
  # this is a hash of hashes, keyed on xref id that relates xrefs to e! objects (may be 1-many)
  my %primary_xref_ids = ();
  # also keep track of types of ensembl objects
  my %ensembl_object_types;

  # and a list of mappings of ensembl objects to xrefs
  # (primary now, dependent added in dump_core_xrefs)
  # this is required for display_xref generation later
  # format:
  #   key: ensembl object type:ensembl object id
  #   value: list of xref_id (with offset)
  # Note %object_xref_mappings is global
  foreach my $file (glob("$dir/*.map")) {
    #print "Parsing results from " . basename($file) .  "\n";
    open(FILE, $file);
    $total_files++;

    # files are named Method_(dna|peptide)_N.map
    my $type = get_ensembl_object_type($file);

    my $method = get_method($file);

    # get or create the appropriate analysis ID
    # XXX restore when using writeable database
Ian Longden's avatar
Ian Longden committed
    my $analysis_id = $self->get_analysis_id($type);
    #    my $analysis_id = 999;
    while (<FILE>) {

      $total_lines++;
      chomp();
      my ($label, $query_id, $target_id, $identity, $query_length, $target_length, $query_start, $query_end, $target_start, $target_end, $cigar_line, $score) = split(/:/, $_);
      $cigar_line =~ s/ //g;

      # calculate percentage identities
      my $query_identity = int (100 * $identity / $query_length);
      my $target_identity = int (100 * $identity / $target_length);
      # only take mappings where there is a good match on one or both sequences
      if ($query_identity  < $method_query_threshold{$method} &&
	  $target_identity < $method_target_threshold{$method}){
	my $reason = $target_id."|".$type."|".$query_identity."|".$target_identity."|";
	   $reason .= $method_query_threshold{$method}."|". $method_target_threshold{$method};
	$failed_xref_mappings{$query_id} = $reason;
	next;
      }
Glenn Proctor's avatar
Glenn Proctor committed
      # note we add on $xref_id_offset to avoid clashes
      print OBJECT_XREF "$object_xref_id\t$target_id\t$type\t" . ($query_id+$xref_id_offset) . "\n";
      print IDENTITY_XREF join("\t", ($object_xref_id, $query_identity, $target_identity, $query_start+1, $query_end, $target_start+1, $target_end, $cigar_line, $score, "\\N", $analysis_id)) . "\n";

      # TODO - evalue?
      $object_xref_id++;

      $ensembl_object_types{$target_id} = $type;

      # store mapping for later - note NON-OFFSET xref_id is used
      my $key = $type . "|" . $target_id;
      my $xref_id = $query_id;
      push @{$object_xref_mappings{$key}}, $xref_id;

      # Note this is a hash (type|object id) of hashes (xref id) of hashes ("query_identity" or "target_identity")
      $object_xref_identities{$key}->{$xref_id}->{"query_identity"} = $query_identity;
      $object_xref_identities{$key}->{$xref_id}->{"target_identity"} = $target_identity;
Glenn Proctor's avatar
Glenn Proctor committed
      # note the NON-OFFSET xref_id is stored here as the values are used in
      # a query against the original xref database
      $primary_xref_ids{$query_id}{$target_id} = $target_id;
    #print "After $file, lines read increased by " . ($total_lines-$last_lines) . "\n";
    $last_lines = $total_lines;
  }

  close(IDENTITY_XREF);
  close(OBJECT_XREF);

  print "Read $total_lines lines from $total_files exonerate output files\n";

  # write relevant xrefs to file
  $max_object_xref_id = $self->dump_core_xrefs(\%primary_xref_ids, $object_xref_id+1, $xref_id_offset, $object_xref_id_offset, \%ensembl_object_types);
  # dump xrefs that don't appear in either the primary_xref or dependent_xref tables
  $self->dump_orphan_xrefs($xref_id_offset);

  # dump triage type data
  $self->dump_triage_data($xref_id_offset);

  # dump interpro table as well
  $self->dump_interpro();

  # dump direct xrefs
  $self->dump_direct_xrefs($xref_id_offset, $max_object_xref_id);

Glenn Proctor's avatar
Glenn Proctor committed
  # write comparison info. Can be removed after development
  ###writes to xref.txt.Do not want to do this if loading data afterwards
  ####  $self->dump_comparison();
sub get_stable_ids(){
  my ($self, $type, $string, $hashref) = @_;

  my $sql = "SELECT ".$type."_id ,stable_id ";
  $sql .=      "FROM ".$type."_stable_id ";
  $sql .=          "WHERE ".$type."_id IN (".$string.")";

  my $sth = $self->core->dbc->prepare($sql);
  $sth->execute();
  my ($trans, $stable);
  $sth->bind_columns(\$trans,\$stable);
  while($sth->fetch()){
    $hashref->{$trans} = $stable;
  }
  $sth->finish;
}

sub dump_triage_data() {
  my ($self, $xref_id_offset) = @_;

  my $translation="";
  my $translation_count=0;
  my $transcript="";
  my $transcript_count=0;
  my $batch_size=200;
  my %translation_2_stable=();
  my %transcript_2_stable=();
  foreach my $temp (values %failed_xref_mappings){
    my ($id, $type) = split(/\|/,$temp);
    if($type =~ /Translation/){
      $translation_count++;
      if($translation_count > $batch_size){
	my $ex = $self->get_stable_ids("translation",$translation.$id,\%translation_2_stable);
	$translation_count = 0;
	$translation = "";
      }
      else{
	$translation .= "$id,";
      }
    }
    elsif($type =~ /Transcript/){
      $transcript_count++;
      if($transcript_count > $batch_size){
	$self->get_stable_ids("transcript",$transcript.$id,\%transcript_2_stable);
	$transcript_count=0;
	$transcript="";
      }