Skip to content
Snippets Groups Projects
BasicMapper.pm 23.2 KiB
Newer Older
Ian Longden's avatar
Ian Longden committed
package XrefMapper::BasicMapper;
Ian Longden's avatar
Ian Longden committed

use strict;
use DBI;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
Ian Longden's avatar
Ian Longden committed
use Bio::EnsEMBL::Translation;
use  XrefMapper::db;

use vars '@ISA';

@ISA = qw{ XrefMapper::db };

Ian Longden's avatar
Ian Longden committed

Ian Longden's avatar
Ian Longden committed
=head1 NAME

XrefMapper::BasicMapper

=head1 DESCIPTION

This is the basic mapper routine. It will create the necessary fasta files for
both the xref and ensembl sequences. These will then be matched using exonerate
and the results written to another file. By creating a <species>.pm file and 
inheriting from this base class different matching routines, parameters, data 
sets etc can be set.
Ian Longden's avatar
Ian Longden committed

=head1 CONTACT

Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk


=cut

Ian Longden's avatar
Ian Longden committed
=head2 dump_seqs

  Arg[1]: xref object which holds info needed for the dump of xref

  Description: Dumps out the files for the mapping. Xref object should hold
              the value of the databases and source to be used.
  Returntype : none
  Exceptions : will die if species not known or an error occurs while
             : trying to write to files. 
  Caller     : general
 
=cut
 

Ian Longden's avatar
Ian Longden committed

  my ($self, $xref) = @_;
  $self->dump_xref($xref);
Ian Longden's avatar
Ian Longden committed


=head2 run_matching

  Arg[1]: xref object which holds info on method and files.

  Description: runs the mapping of the list of files with specied methods
  Returntype : none
  Exceptions : none
  Caller     : general

=cut

Ian Longden's avatar
Ian Longden committed
sub run_matching{
  my ($self,$xref) = @_;

  my @list=();

  my $i = 0;
  foreach my $method (@{$self->method()}){
    my @dna=();
    push @dna, $method;
    push @dna, $xref->dir."/xref_".$i."_dna.fasta";
    push @dna, $self->ensembl_dna_file();
    push @list, \@dna;
    my @pep=();
    push @pep, $method;
    push @pep, $xref->dir."/xref_".$i."_prot.fasta";
    push @pep, $self->ensembl_protein_file();
    push @list, \@pep;
    $i++;
  }
  
  $self->run_mapping(\@list);
Ian Longden's avatar
Ian Longden committed
=head2 get_species_id_from_species_name

  Arg[1]: species name

  Description: get the species_id from the database for the named database.
  Example    : my $id = get_species_id_from_species_name('homo_sapiens');
  Returntype : int (species_id)
  Exceptions : will die if species does not exist in given xref database.
  Caller     : general

=cut

sub get_species_id_from_species_name{
  my ($xref,$species) = @_;
  my $sql = "select species_id from species where name = '".$species."'";
  my $dbi = $xref->dbi();
Ian Longden's avatar
Ian Longden committed
  my $sth = $dbi->prepare($sql);
  $sth->execute();
  my @row = $sth->fetchrow_array();
  my $species_id;
  if (defined @row) {
    $species_id = $row[0];
  } else {
    print STDERR "Couldn't get ID for species ".$species."\n";
    print STDERR "It must be one of :-\n";
    $sql = "select name from species";
    $sth->execute();
    while(my @row = $sth->fetchrow_array()){
      print STDERR $row[0]."\n";
    }
    die("Please try again :-)\n");
Ian Longden's avatar
Ian Longden committed
  $sth->finish();
  $dbi->disconnect();
Ian Longden's avatar
Ian Longden committed
=head2 get_set_lists

  Description: specifies the list of databases and source to be used in the
             : generation of one or more data sets.
  Returntype : list of lists
  Example    : my @lists =@{$self->get_set_lists()};
  Exceptions : none
  Caller     : dump_xref

=cut

sub get_set_lists{
  my ($self) = @_;  

  return [["method1",["homo_sapiens","RefSeq"],["homo_sapiens","UniProtSwissProt"]],
	  ["method2",[$self->species,"*"]],
	  ["method3",["*","*"]]];
}

Ian Longden's avatar
Ian Longden committed
=head2 get_source_id_from_source_name

  Arg[1]: source name

  Description: get the source_id from the database for the named source.
  Example    : my $id = get_source_id_from_source_name('RefSeq');
  Returntype : int (source_id)
  Exceptions : will die if source does not exist in given xref database.
  Caller     : general

=cut

sub get_source_id_from_source_name{
  my ($xref, $source) = @_;
  my $source_id;
  
  my $sql = "select source_id from source where name = '".$source."'";
  my $dbi = $xref->dbi();
  my $sth = $dbi->prepare($sql);
  $sth->execute();
  my @row = $sth->fetchrow_array();
  if (defined $row[0] and $row[0] ne '') {
    $source_id = $row[0];
#    print $source."\t*".$row[0]."*\n";
  } else {
    print STDERR "Couldn't get ID for source ".$source."\n";
    print STDERR "It must be one of :-\n";
    $sql = "select name from source";
    $sth = $dbi->prepare($sql);
    $sth->execute();
    while(my @row = $sth->fetchrow_array()){
      print STDERR $row[0]."\n";
    }
    die("Please try again :-)\n");
  }  
Ian Longden's avatar
Ian Longden committed
  $sth->finish();
  $dbi->disconnect();
Ian Longden's avatar
Ian Longden committed

=head2 dump_xref

  Arg[1]: xref object which holds info on method and files.

  Description: Dumps the Xref data as fasta file(s)
  Returntype : none
  Exceptions : none
  Caller     : dump_seqs

=cut

sub dump_xref{
  my ($self,$xref) = @_;
  
  if(!defined($xref->dir())){
    if(defined($self->dir)){
      $xref->species($self->dir);
    }
    else{
      $xref->dir(".");
    }
  }
  
  my @method=();
  
  my @lists =@{$self->get_set_lists()};
  
  my $i=0;
  foreach my $list (@lists){
    print "method->".@$list[0]."\n";
    $method[$i] = shift @$list;
    my $j = 1;
Ian Longden's avatar
Ian Longden committed
    my @source_id=();
    my @species_id=();
    foreach my $element (@$list){
      while(my $species = shift(@$element)){
	#	print $j.")\t".$species."\n";
	if($species ne "*"){
	  $species_id[$j] = get_species_id_from_species_name($xref,$species);
	}
	else{
	  $species_id[$j] = -1;
	}
	my $source = shift(@$element);
	if($source ne "*"){
	  $source_id[$j] = get_source_id_from_source_name($xref,$source);
	}
	else{
	  $source_id[$j] = -1;
	}
	print $j."\t".$source. "\t".$source_id[$j] ."\n";
	print $j."\t".$species."\t".$species_id[$j]."\n";
	$j++;
      }
    }
    #method data fully defined now
    dump_subset($xref,\@species_id,\@source_id,$i);    
    $i++;
  }
  
  $self->method(\@method);

  return;
  
}

Ian Longden's avatar
Ian Longden committed
=head2 dump_subset

  Arg[1]: xref object which holds info on files.
  Arg[2]: list of species to use.
  Arg[3]: list of sources to use.
  Arg[4]: index to be used in file creation.
  
  Description: Dumps the Xref data for one set of species/databases
  Returntype : none
  Exceptions : none
  Caller     : dump_xref

=cut


sub dump_subset{
  my ($xref,$rspecies_id,$rsource_id,$index) = @_;
  
  open(XDNA,">".$xref->dir()."/xref_".$index."_dna.fasta") 
    || die "Could not open xref_".$index."_dna.fasta";

  my $sql = "select p.xref_id, p.sequence, x.species_id , x.source_id ";
  $sql   .= "  from primary_xref p, xref x ";
  $sql   .= "  where p.xref_id = x.xref_id and ";
  $sql   .= "      p.sequence_type ='dna' ";
  
  
  for (my $j =1; $j<scalar(@$rspecies_id); $j++){
    print $j."\t".$$rspecies_id[$j]."\t".$$rsource_id[$j]."\n";
  }
  #  return $xref->dir."/xref_".$i."_dna.fasta";
  
  my $sth = $xref->dbi()->prepare($sql);
Ian Longden's avatar
Ian Longden committed
  my $i = 0;
  while(my @row = $sth->fetchrow_array()){
    my $pass = 0;
    for (my $j =1; $j<scalar(@$rspecies_id); $j++){
      if($$rspecies_id[$j] < 0 or $row[2] == $$rspecies_id[$j]){
	if($$rsource_id[$j] < 0 or  $row[3] == $$rsource_id[$j]){
	  $pass = 1;
	}
      }
    }
    if($pass){
      $i++;
      $row[1] =~ s/(.{60})/$1\n/g;
      print XDNA ">".$row[0]."\n".$row[1]."\n";
      if($i > 10){
	goto ENDDNA;
      }
Ian Longden's avatar
Ian Longden committed
    }

  open(XPRO,">".$xref->dir."/xref_".$index."_prot.fasta") 
    || die "Could not open xref_".$index."_prot.fasta";
  my $sql = "select p.xref_id, p.sequence, x.species_id , x.source_id ";
  $sql   .= "  from primary_xref p, xref x ";
  $sql   .= "  where p.xref_id = x.xref_id and ";
  $sql   .= "      p.sequence_type ='peptide' ";
  
  
  $sth = $xref->dbi()->prepare($sql);
Ian Longden's avatar
Ian Longden committed
  $i = 0;
  while(my @row = $sth->fetchrow_array()){
    my $pass = 0;
    for (my $j =1; $j<scalar(@$rspecies_id); $j++){
      if($$rspecies_id[$j] < 0 or $row[2] == $$rspecies_id[$j]){
	if($$rsource_id[$j] < 0 or  $row[3] == $$rsource_id[$j]){
	  $pass = 1;
	}
      }
    }
    if($pass){
      $i++;
      $row[1] =~ s/(.{60})/$1\n/g;
      print XPRO ">".$row[0]."\n".$row[1]."\n";
      if($i > 10){
	goto ENDPRO;
      }
Ian Longden's avatar
Ian Longden committed
    }
Ian Longden's avatar
Ian Longden committed
  $sth->finish();
Ian Longden's avatar
Ian Longden committed
=head2 dump_ensembl

  Description: Dumps the ensembl data to a file in fasta format.
  Returntype : none
  Exceptions : none
  Caller     : dump_seqs

=cut

sub dump_ensembl{
  my ($self) = @_;

  $self->fetch_and_dump_seq();
Ian Longden's avatar
Ian Longden committed

Ian Longden's avatar
Ian Longden committed
=head2 fetch_and_dump_seq

  Description: Dumps the ensembl data to a file in fasta format.
  Returntype : none
  Exceptions : wil die if the are errors in db connection or file creation.
  Caller     : dump_ensembl

=cut

sub fetch_and_dump_seq{
Ian Longden's avatar
Ian Longden committed
  my ($self) = @_;

  my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-species => $self->species(),
                           -dbname  => $self->dbname(),
                           -host    => $self->host(),
                           -port    => $self->port(),
                           -password => $self->password(),
Ian Longden's avatar
Ian Longden committed
                           -user     => $self->user(),
Ian Longden's avatar
Ian Longden committed

  #
  # store ensembl dna file name and open it
  #
  
  # if no directory set then dump in the current directory.
  if(!defined($self->dir())){
    $self->dir(".");
  }
Ian Longden's avatar
Ian Longden committed
  $self->ensembl_dna_file($self->dir."/".$self->species."_dna.fasta");
Ian Longden's avatar
Ian Longden committed
  open(DNA,">".$self->ensembl_dna_file()) 
    || die("Could not open dna file for writing: ".$self->ensembl_dna_file."\n");

Ian Longden's avatar
Ian Longden committed
  #
  # store ensembl protein file name and open it
  #
  $self->ensembl_protein_file($self->dir."/".$self->species."_protein.fasta");
Ian Longden's avatar
Ian Longden committed
  open(PEP,">".$self->ensembl_protein_file()) 
    || die("Could not open dna file for writing: ".$self->ensembl_protein_file."\n");

  my $gene_adap = $db->get_GeneAdaptor();
  my @gene_ids = @{$gene_adap->list_dbIDs()};
  my $i =0;
  foreach my $gene_id (@gene_ids){
    $i++;
    my $gene = $gene_adap->fetch_by_dbID($gene_id);
    foreach my $transcript (@{$gene->get_all_Transcripts()}) {
      my $seq = $transcript->spliced_seq(); 
      $seq =~ s/(.{60})/$1\n/g;
Ian Longden's avatar
Ian Longden committed
      print DNA ">" . $transcript->dbID() . "\n" .$seq."\n";
      my $trans = $transcript->translation();
      my $translation = $transcript->translate();
#      print "tranlation ".$translation."\n";
Ian Longden's avatar
Ian Longden committed
      my $pep_seq = $translation->seq();
      $pep_seq =~ s/(.{60})/$1\n/g;
      print PEP ">".$trans->dbID()."\n".$pep_seq."\n";
    }
    if($i > 10){
      goto FIN;
Ian Longden's avatar
Ian Longden committed
FIN:
  close DNA;
  close PEP;
Ian Longden's avatar
Ian Longden committed

###
# Getter/Setter methods
###



Ian Longden's avatar
Ian Longden committed
#=head2 xref_protein_file
# 
#  Arg [1]    : (optional) string $arg
#               the fasta file name for the protein xref
#  Example    : $file name = $xref->xref_protein_file();
#  Description: Getter / Setter for the protien xref fasta file 
#  Returntype : string
#  Exceptions : none
#
#=cut
#
#
#sub xref_protein_file{
#  my ($self, $arg) = @_;
#
#  (defined $arg) &&
#    ($self->{_xref_prot_file} = $arg );
#  return $self->{_xref_prot_file};
#}
#
#=head2 xref_dna_file
#
#  Arg [1]    : (optional) string $arg
#               the fasta file name for the dna xref
#  Example    : $file name = $xref->xref_dna_file();
#  Description: Getter / Setter for the dna xref fasta file 
#  Returntype : string
#  Exceptions : none
#
#=cut
#
#sub xref_dna_file{
#  my ($self, $arg) = @_;
#
#  (defined $arg) &&
#    ($self->{_xref_dna_file} = $arg );
#  return $self->{_xref_dna_file};
#}
Ian Longden's avatar
Ian Longden committed

Ian Longden's avatar
Ian Longden committed
=head2 ensembl_protein_file
 
  Arg [1]    : (optional) string $arg
               the fasta file name for the ensembl proteins 
  Example    : $file_name = $self->ensembl_protein_file();
  Description: Getter / Setter for the protien ensembl fasta file 
  Returntype : string
  Exceptions : none

=cut
Ian Longden's avatar
Ian Longden committed

Ian Longden's avatar
Ian Longden committed
sub ensembl_protein_file{
  my ($self, $arg) = @_;
Ian Longden's avatar
Ian Longden committed

Ian Longden's avatar
Ian Longden committed
  (defined $arg) &&
    ($self->{_ens_prot_file} = $arg );
  return $self->{_ens_prot_file};
}
Ian Longden's avatar
Ian Longden committed
=head2 ensembl_dna_file
 
  Arg [1]    : (optional) string $arg
               the fasta file name for the ensembl dna 
  Example    : $file_name = $self->ensembl_dna_file();
  Description: Getter / Setter for the protien ensembl fasta file 
  Returntype : string
  Exceptions : none

=cut

Ian Longden's avatar
Ian Longden committed
sub ensembl_dna_file{
  my ($self, $arg) = @_;
Ian Longden's avatar
Ian Longden committed

Ian Longden's avatar
Ian Longden committed
  (defined $arg) &&
    ($self->{_ens_dna_file} = $arg );
  return $self->{_ens_dna_file};
}
Ian Longden's avatar
Ian Longden committed
=head2 method
 
  Arg [1]    : (optional) list reference $arg
               reference to a list of method names 
  Example    : my @methods = @{$self->method()};
  Description: Getter / Setter for the methods 
  Returntype : list
  Exceptions : none

=cut


sub method{
  my ($self, $arg) = @_;

  (defined $arg) &&
    ($self->{_method} = $arg );
  return $self->{_method};
}

538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928
# --------------------------------------------------------------------------------
# GLENN


# Use exonerate (or other program) to find xref-ensembl obejct mappings

sub run_mapping {

  my ($self, $lists) = @_;

  # foreach method, submit the appropriate job & keep track of the job name
  my @job_names;

  foreach my $list (@$lists){

    my ($method, $queryfile ,$targetfile)  =  @$list;

    print "###$method $queryfile $targetfile\n";

    my $obj_name = "XrefMapper::Methods::$method";
    # check that the appropriate object exists
    eval "require $obj_name";
    if($@) {

      warn("Could not find object $obj_name corresponding to mapping method $method, skipping\n$@");

    } else {

      my $obj = $obj_name->new();
      my $job_name = $obj->run($queryfile, $targetfile);
      push @job_names, $job_name;
      print "Submitted LSF job $job_name to list\n";
      sleep 1; # make sure unique names really are unique

    }

  } # foreach method

  # submit depend job to wait for all mapping jobs
  submit_depend_job($self->dir, @job_names);


} # run_exonerate


sub submit_depend_job {

  my ($root_dir, @job_names) = @_;

  # Submit a job that does nothing but wait on the main jobs to
  # finish. This job is submitted interactively so the exec does not
  # return until everything is finished.

  # build up the bsub command; first part
  my @depend_bsub = ('bsub', '-K');

  # one -wended clause for each main job
  foreach my $job (@job_names) {
    push @depend_bsub, "-wended($job)";
  }

  # rest of command
  push @depend_bsub, ('-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err", '/bin/true');

  #print "depend bsub:\n" . join (" ", @depend_bsub) . "\n";

  my ($depend_wtr, $depend_rtr, $depend_etr, $depend_pid);
  $depend_pid = open3($depend_wtr, $depend_rtr, $depend_etr, @depend_bsub);
  my $depend_jobid;
  while (<$depend_rtr>) {
    if (/Job <([0-9]+)> is/) {
      $depend_jobid = $1;
      print "LSF job ID for depend job: $depend_jobid \n" ;
    }
  }
  if (!defined($depend_jobid)) {
    print STDERR "Error: could not get depend job ID\n";
  }



}

=head2 store

  Arg[1]     : The target file used in the exonerate run. Used to work out the Ensembl object type.
  Arg[2]     : 
  Example    : none
  Description: Parse exonerate output files and build files for loading into target db tables.
  Returntype : List of strings
  Exceptions : none
  Caller     : general

=cut

sub store {

  my ($target_file_name) = @_;

  my $type = get_ensembl_object_type($target_file_name);

  # get or create the appropriate analysis ID
  my $analysis_id = get_analysis_id($type);

  # TODO - get this from config
  my $dbi = DBI->connect("dbi:mysql:host=ecs1g;port=3306;database=arne_core_20_34",
			 "ensadmin",
			 "ensembl",
			 {'RaiseError' => 1}) || die "Can't connect to database";

 # get current max object_xref_id
  my $max_object_xref_id = 0;
  my $sth = $dbi->prepare("SELECT MAX(object_xref_id) FROM object_xref");
  $sth->execute();
  my $max_object_xref_id = ($sth->fetchrow_array())[0];
  if (!defined $max_object_xref_id) {
    print "Can't get highest existing object_xref_id, using 1\n)";
  } else {
    print "Maximum existing object_xref_id = $max_object_xref_id\n";
  }


  #my $ox_sth = $dbi->prepare("INSERT INTO object_xref(ensembl_id, ensembl_object_type, xref_id) VALUES(?,?,?)");

  #my $ix_sth = $dbi->prepare("INSERT INTO identity_xref VALUES(?,?,?,?,?,?,?,?,?,?,?)");

  # files to write table data to
  open (OBJECT_XREF, ">object_xref.txt");
  open (IDENTITY_XREF, ">identity_xref.txt");

  my $total_lines = 0;
  my $total_files = 0;

  my $object_xref_id = $max_object_xref_id + 1;

  # keep a (unique) list of xref IDs that need to be written out to file as well
  my %primary_xref_ids;

  foreach my $file (glob("*.map")) {

    print "Parsing results from $file \n";
    open(FILE, $file);
    $total_files++;

    while (<FILE>) {

      $total_lines++;
      chomp();
      my ($label, $query_id, $target_id, $query_start, $query_end, $target_start, $target_end, $cigar_line, $score) = split(/:/, $_);
      $cigar_line =~ s/ //;

      # TODO make sure query & target are the right way around

      print OBJECT_XREF "$object_xref_id\t$target_id\t$type\t$query_id\n";
      print IDENTITY_XREF "$object_xref_id\t$query_id\t$target_id\t$query_start\t$query_end\t$target_start\t$target_end\t$cigar_line\t$score\t\\N\t$analysis_id\n";
      # TODO - evalue?
      $object_xref_id++;

      $primary_xref_ids{$query_id} = $query_id;

      # Store in database
      # create entry in object_xref and get its object_xref_id
      #$ox_sth->execute($target_id, $type, $query_id) || warn "Error writing to object_xref table";
      #my $object_xref_id = $ox_sth->{'mysql_insertid'};

      # create entry in identity_xref
      #$ix_sth->execute($object_xref_id, $query_id, $target_id, $query_start, $query_end, $target_start, $target_end, $cigar_line, $score, undef, $analysis_id) || warn "Error writing to identity_xref table";

    }

    close(FILE);

  }

  close(IDENTITY_XREF);
  close(OBJECT_XREF);

  print "Read $total_lines lines from $total_files exonerate output files\n";

  # write relevant xrefs to file
  dump_xrefs(\%primary_xref_ids);

}


sub get_ensembl_object_type {

  my $filename = shift;
  my $type;

  if ($filename =~ /gene/i) {

    $type = "Gene";

  } elsif ($filename =~ /transcript/i) {

    $type = "Transcript";

  } elsif ($filename =~ /translation/i) {

    $type = "Translation";

  } else {

    print STDERR "Cannot deduce Ensembl object type from filename $filename";
  }

  return $type;

}


sub get_analysis_id {

  my $ensembl_type = shift;

  my %typeToLogicName = ( 'transcript' => 'XrefExonerateDNA',
			  'translation' => 'XrefExonerateProtein' );

  my $logic_name = $typeToLogicName{lc($ensembl_type)};

  # TODO - get these details from Config
  my $host = "ecs1g";
  my $port = 3306;
  my $database = "arne_core_20_34";
  my $user = "ensadmin";
  my $password = "ensembl";

  my $dbi = DBI->connect("dbi:mysql:host=$host;port=$port;database=$database",
			 "$user",
			 "$password",
			 {'RaiseError' => 1}) || die "Can't connect to database";


  my $sth = $dbi->prepare("SELECT analysis_id FROM analysis WHERE logic_name='" . $logic_name ."'");
  $sth->execute();

  my $analysis_id;

  if (my @row = $sth->fetchrow_array()) {

    $analysis_id = $row[0];
    print "Found exising analysis ID ($analysis_id) for $logic_name\n";

  } else {

    print "No analysis with logic_name $logic_name found, creating ...\n";
    $sth = $dbi->prepare("INSERT INTO analysis (logic_name, created) VALUES ('" . $logic_name. "', NOW())");
    # TODO - other fields in analysis table
    $sth->execute();
    $analysis_id = $sth->{'mysql_insertid'};
    print "Done (analysis ID=" . $analysis_id. ")\n";

  }

  return $analysis_id;

}


sub dump_xrefs {

  my $xref_ids_hashref = shift;
  my @xref_ids = keys %$xref_ids_hashref;

  open (XREF, ">xref.txt");

  # TODO - get this from config
  my $xref_dbi = DBI->connect("dbi:mysql:host=ecs1g;port=3306;database=glenn_test_xref",
			      "ensro",
			      "",
			      {'RaiseError' => 1}) || die "Can't connect to database";

  my $core_dbi = DBI->connect("dbi:mysql:host=ecs1g;port=3306;database=arne_core_20_34",
			      "ensro",
			      "",
			      {'RaiseError' => 1}) || die "Can't connect to database";

  # get current highest internal ID from xref
  my $max_xref_id = 0;
  my $core_sth = $core_dbi->prepare("SELECT MAX(xref_id) FROM xref");
  $core_sth->execute();
  my $max_xref_id = ($core_sth->fetchrow_array())[0];
  if (!defined $max_xref_id) {
    print "Can't get highest existing xref_id, using 0\n)";
  } else {
    print "Maximum existing xref_id = $max_xref_id\n";
  }
  my $core_xref_id = $max_xref_id + 1;

  # keep a unique list of source IDs to build the external_db table later
  my %source_ids;

  # execute several queries with a max of 200 entries in each IN clause - more efficient
  my $batch_size = 200;

  while(@xref_ids) {

    my @ids;
    if($#xref_ids > $batch_size) {
      @ids = splice(@xref_ids, 0, $batch_size);
    } else {
      @ids = splice(@xref_ids, 0);
    }

    my $id_str;
    if(@ids > 1)  {
      $id_str = "IN (" . join(',', @ids). ")";
    } else {
      $id_str = "= " . $ids[0];
    }


    my $sql = "SELECT * FROM xref WHERE xref_id $id_str";
    my $xref_sth = $xref_dbi->prepare($sql);
    $xref_sth->execute();

    my ($xref_id, $accession, $label, $description, $source_id, $species_id);
    $xref_sth->bind_columns(\$xref_id, \$accession, \$label, \$description, \$source_id, \$species_id);

    # note the xref_id we write to the file is NOT the one we've just read
    # from the internal xref database as the ID may already exist in the core database
    while (my @row = $xref_sth->fetchrow_array()) {
      print XREF "$core_xref_id\t$accession\t$label\t$description\n";
      $source_ids{$source_id} = $source_id;
      $core_xref_id++;
      if ($source_id == 1001) {
	print "xref $xref_id has source_id 1001\n";
      }
    }

    # Now get the dependent xrefs for each of these xrefs and write them as well
    $sql = "SELECT x.accession, x.label, x.description, x.source_id FROM dependent_xref dx, xref x WHERE x.xref_id=dx.master_xref_id AND master_xref_id $id_str";
    my $dep_sth = $xref_dbi->prepare($sql);
    $dep_sth->execute();

    $dep_sth->bind_columns(\$accession, \$label, \$description, \$source_id);
    while (my @row = $dep_sth->fetchrow_array()) {
      print XREF "$core_xref_id\t$accession\t$label\t$description\tDEPENDENT\n";
      $source_ids{$source_id} = $source_id;
      $core_xref_id++;
    }
    #print "source_ids: " . join(" ", keys(%source_ids)) . "\n";

  } # while @xref_ids

  close(XREF);

  # now write the exernal_db file - the %source_ids hash will contain the IDs of the
  # sources that need to be written as external_dbs
  open(EXTERNAL_DB, ">external_db.txt");

  # get current highest internal ID from external_db
  my $max_edb_id = 0;
  my $core_sth = $core_dbi->prepare("SELECT MAX(external_db_id) FROM external_db");
  $core_sth->execute();
  my $max_edb_id = ($core_sth->fetchrow_array())[0];
  if (!defined $max_edb_id) {
    print "Can't get highest existing external_db_id, using 0\n)";
  } else {
    print "Maximum existing external_db_id = $max_edb_id\n";
  }
  my $edb_id = $max_edb_id + 1;

  my @source_id_array = keys %source_ids;
  my $source_id_str;
  if(@source_id_array > 1)  {
    $source_id_str = "IN (" . join(',', @source_id_array). ")";
  } else {
    $source_id_str = "= " . $source_id_array[0];
  }

  my $source_sql = "SELECT name, release FROM source WHERE source_id $source_id_str";
  my $source_sth = $xref_dbi->prepare($source_sql);
  $source_sth->execute();

  my ($source_name, $release);
  $source_sth->bind_columns(\$source_name, \$release);

  while (my @row = $source_sth->fetchrow_array()) {
    print EXTERNAL_DB "$edb_id\t$source_name\t$release\tXREF\n";
    # TODO knownxref etc??
    $edb_id++;
  }

  close(EXTERNAL_DB);



}

Ian Longden's avatar
Ian Longden committed
1;