From bc80daea9e96e68525771ecc00d24a364fdcd567 Mon Sep 17 00:00:00 2001
From: Glenn Proctor <gp1@sanger.ac.uk>
Date: Mon, 11 Oct 2004 16:00:57 +0000
Subject: [PATCH] Added mapping stuff from old exonerate_prototype.pl

---
 .../xref_mapping/XrefMapper/BasicMapper.pm    | 405 +++++++++++++++++-
 1 file changed, 392 insertions(+), 13 deletions(-)

diff --git a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm
index 617c6ae63b..2a68fe1ae0 100644
--- a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm
+++ b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm
@@ -84,22 +84,10 @@ sub run_matching{
     $i++;
   }
   
-  glenn_call(\@list);
-  print "NOT done yet:-)\n";
-}
-
+  $self->run_mapping(\@list);
 
-sub glenn_call{
-  my ($lists) = @_;
-  foreach my $list (@$lists){
-    my ($meth, $q ,$t)  =  @$list;
-    print "HELLO: ".$meth."\n\t".$q."\n\t".$t."\n\n";
-  }
 }
 
-sub store{
-  print "NOT done yet Either :-)\n";
-}
 
 =head2 get_species_id_from_species_name
 
@@ -547,4 +535,395 @@ sub method{
   return $self->{_method};
 }
 
+# --------------------------------------------------------------------------------
+# GLENN
+
+
+# Use exonerate (or other program) to find xref-ensembl obejct mappings
+
+sub run_mapping {
+
+  my ($self, $lists) = @_;
+
+  # foreach method, submit the appropriate job & keep track of the job name
+  my @job_names;
+
+  foreach my $list (@$lists){
+
+    my ($method, $queryfile ,$targetfile)  =  @$list;
+
+    print "###$method $queryfile $targetfile\n";
+
+    my $obj_name = "XrefMapper::Methods::$method";
+    # check that the appropriate object exists
+    eval "require $obj_name";
+    if($@) {
+
+      warn("Could not find object $obj_name corresponding to mapping method $method, skipping\n$@");
+
+    } else {
+
+      my $obj = $obj_name->new();
+      my $job_name = $obj->run($queryfile, $targetfile);
+      push @job_names, $job_name;
+      print "Submitted LSF job $job_name to list\n";
+      sleep 1; # make sure unique names really are unique
+
+    }
+
+  } # foreach method
+
+  # submit depend job to wait for all mapping jobs
+  submit_depend_job($self->dir, @job_names);
+
+
+} # run_exonerate
+
+
+sub submit_depend_job {
+
+  my ($root_dir, @job_names) = @_;
+
+  # Submit a job that does nothing but wait on the main jobs to
+  # finish. This job is submitted interactively so the exec does not
+  # return until everything is finished.
+
+  # build up the bsub command; first part
+  my @depend_bsub = ('bsub', '-K');
+
+  # one -wended clause for each main job
+  foreach my $job (@job_names) {
+    push @depend_bsub, "-wended($job)";
+  }
+
+  # rest of command
+  push @depend_bsub, ('-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err", '/bin/true');
+
+  #print "depend bsub:\n" . join (" ", @depend_bsub) . "\n";
+
+  my ($depend_wtr, $depend_rtr, $depend_etr, $depend_pid);
+  $depend_pid = open3($depend_wtr, $depend_rtr, $depend_etr, @depend_bsub);
+  my $depend_jobid;
+  while (<$depend_rtr>) {
+    if (/Job <([0-9]+)> is/) {
+      $depend_jobid = $1;
+      print "LSF job ID for depend job: $depend_jobid \n" ;
+    }
+  }
+  if (!defined($depend_jobid)) {
+    print STDERR "Error: could not get depend job ID\n";
+  }
+
+
+
+}
+
+=head2 store
+
+  Arg[1]     : The target file used in the exonerate run. Used to work out the Ensembl object type.
+  Arg[2]     : 
+  Example    : none
+  Description: Parse exonerate output files and build files for loading into target db tables.
+  Returntype : List of strings
+  Exceptions : none
+  Caller     : general
+
+=cut
+
+sub store {
+
+  my ($target_file_name) = @_;
+
+  my $type = get_ensembl_object_type($target_file_name);
+
+  # get or create the appropriate analysis ID
+  my $analysis_id = get_analysis_id($type);
+
+  # TODO - get this from config
+  my $dbi = DBI->connect("dbi:mysql:host=ecs1g;port=3306;database=arne_core_20_34",
+			 "ensadmin",
+			 "ensembl",
+			 {'RaiseError' => 1}) || die "Can't connect to database";
+
+ # get current max object_xref_id
+  my $max_object_xref_id = 0;
+  my $sth = $dbi->prepare("SELECT MAX(object_xref_id) FROM object_xref");
+  $sth->execute();
+  my $max_object_xref_id = ($sth->fetchrow_array())[0];
+  if (!defined $max_object_xref_id) {
+    print "Can't get highest existing object_xref_id, using 1\n)";
+  } else {
+    print "Maximum existing object_xref_id = $max_object_xref_id\n";
+  }
+
+
+  #my $ox_sth = $dbi->prepare("INSERT INTO object_xref(ensembl_id, ensembl_object_type, xref_id) VALUES(?,?,?)");
+
+  #my $ix_sth = $dbi->prepare("INSERT INTO identity_xref VALUES(?,?,?,?,?,?,?,?,?,?,?)");
+
+  # files to write table data to
+  open (OBJECT_XREF, ">object_xref.txt");
+  open (IDENTITY_XREF, ">identity_xref.txt");
+
+  my $total_lines = 0;
+  my $total_files = 0;
+
+  my $object_xref_id = $max_object_xref_id + 1;
+
+  # keep a (unique) list of xref IDs that need to be written out to file as well
+  my %primary_xref_ids;
+
+  foreach my $file (glob("*.map")) {
+
+    print "Parsing results from $file \n";
+    open(FILE, $file);
+    $total_files++;
+
+    while (<FILE>) {
+
+      $total_lines++;
+      chomp();
+      my ($label, $query_id, $target_id, $query_start, $query_end, $target_start, $target_end, $cigar_line, $score) = split(/:/, $_);
+      $cigar_line =~ s/ //;
+
+      # TODO make sure query & target are the right way around
+
+      print OBJECT_XREF "$object_xref_id\t$target_id\t$type\t$query_id\n";
+      print IDENTITY_XREF "$object_xref_id\t$query_id\t$target_id\t$query_start\t$query_end\t$target_start\t$target_end\t$cigar_line\t$score\t\\N\t$analysis_id\n";
+      # TODO - evalue?
+      $object_xref_id++;
+
+      $primary_xref_ids{$query_id} = $query_id;
+
+      # Store in database
+      # create entry in object_xref and get its object_xref_id
+      #$ox_sth->execute($target_id, $type, $query_id) || warn "Error writing to object_xref table";
+      #my $object_xref_id = $ox_sth->{'mysql_insertid'};
+
+      # create entry in identity_xref
+      #$ix_sth->execute($object_xref_id, $query_id, $target_id, $query_start, $query_end, $target_start, $target_end, $cigar_line, $score, undef, $analysis_id) || warn "Error writing to identity_xref table";
+
+    }
+
+    close(FILE);
+
+  }
+
+  close(IDENTITY_XREF);
+  close(OBJECT_XREF);
+
+  print "Read $total_lines lines from $total_files exonerate output files\n";
+
+  # write relevant xrefs to file
+  dump_xrefs(\%primary_xref_ids);
+
+}
+
+
+sub get_ensembl_object_type {
+
+  my $filename = shift;
+  my $type;
+
+  if ($filename =~ /gene/i) {
+
+    $type = "Gene";
+
+  } elsif ($filename =~ /transcript/i) {
+
+    $type = "Transcript";
+
+  } elsif ($filename =~ /translation/i) {
+
+    $type = "Translation";
+
+  } else {
+
+    print STDERR "Cannot deduce Ensembl object type from filename $filename";
+  }
+
+  return $type;
+
+}
+
+
+sub get_analysis_id {
+
+  my $ensembl_type = shift;
+
+  my %typeToLogicName = ( 'transcript' => 'XrefExonerateDNA',
+			  'translation' => 'XrefExonerateProtein' );
+
+  my $logic_name = $typeToLogicName{lc($ensembl_type)};
+
+  # TODO - get these details from Config
+  my $host = "ecs1g";
+  my $port = 3306;
+  my $database = "arne_core_20_34";
+  my $user = "ensadmin";
+  my $password = "ensembl";
+
+  my $dbi = DBI->connect("dbi:mysql:host=$host;port=$port;database=$database",
+			 "$user",
+			 "$password",
+			 {'RaiseError' => 1}) || die "Can't connect to database";
+
+
+  my $sth = $dbi->prepare("SELECT analysis_id FROM analysis WHERE logic_name='" . $logic_name ."'");
+  $sth->execute();
+
+  my $analysis_id;
+
+  if (my @row = $sth->fetchrow_array()) {
+
+    $analysis_id = $row[0];
+    print "Found exising analysis ID ($analysis_id) for $logic_name\n";
+
+  } else {
+
+    print "No analysis with logic_name $logic_name found, creating ...\n";
+    $sth = $dbi->prepare("INSERT INTO analysis (logic_name, created) VALUES ('" . $logic_name. "', NOW())");
+    # TODO - other fields in analysis table
+    $sth->execute();
+    $analysis_id = $sth->{'mysql_insertid'};
+    print "Done (analysis ID=" . $analysis_id. ")\n";
+
+  }
+
+  return $analysis_id;
+
+}
+
+
+sub dump_xrefs {
+
+  my $xref_ids_hashref = shift;
+  my @xref_ids = keys %$xref_ids_hashref;
+
+  open (XREF, ">xref.txt");
+
+  # TODO - get this from config
+  my $xref_dbi = DBI->connect("dbi:mysql:host=ecs1g;port=3306;database=glenn_test_xref",
+			      "ensro",
+			      "",
+			      {'RaiseError' => 1}) || die "Can't connect to database";
+
+  my $core_dbi = DBI->connect("dbi:mysql:host=ecs1g;port=3306;database=arne_core_20_34",
+			      "ensro",
+			      "",
+			      {'RaiseError' => 1}) || die "Can't connect to database";
+
+  # get current highest internal ID from xref
+  my $max_xref_id = 0;
+  my $core_sth = $core_dbi->prepare("SELECT MAX(xref_id) FROM xref");
+  $core_sth->execute();
+  my $max_xref_id = ($core_sth->fetchrow_array())[0];
+  if (!defined $max_xref_id) {
+    print "Can't get highest existing xref_id, using 0\n)";
+  } else {
+    print "Maximum existing xref_id = $max_xref_id\n";
+  }
+  my $core_xref_id = $max_xref_id + 1;
+
+  # keep a unique list of source IDs to build the external_db table later
+  my %source_ids;
+
+  # execute several queries with a max of 200 entries in each IN clause - more efficient
+  my $batch_size = 200;
+
+  while(@xref_ids) {
+
+    my @ids;
+    if($#xref_ids > $batch_size) {
+      @ids = splice(@xref_ids, 0, $batch_size);
+    } else {
+      @ids = splice(@xref_ids, 0);
+    }
+
+    my $id_str;
+    if(@ids > 1)  {
+      $id_str = "IN (" . join(',', @ids). ")";
+    } else {
+      $id_str = "= " . $ids[0];
+    }
+
+
+    my $sql = "SELECT * FROM xref WHERE xref_id $id_str";
+    my $xref_sth = $xref_dbi->prepare($sql);
+    $xref_sth->execute();
+
+    my ($xref_id, $accession, $label, $description, $source_id, $species_id);
+    $xref_sth->bind_columns(\$xref_id, \$accession, \$label, \$description, \$source_id, \$species_id);
+
+    # note the xref_id we write to the file is NOT the one we've just read
+    # from the internal xref database as the ID may already exist in the core database
+    while (my @row = $xref_sth->fetchrow_array()) {
+      print XREF "$core_xref_id\t$accession\t$label\t$description\n";
+      $source_ids{$source_id} = $source_id;
+      $core_xref_id++;
+      if ($source_id == 1001) {
+	print "xref $xref_id has source_id 1001\n";
+      }
+    }
+
+    # Now get the dependent xrefs for each of these xrefs and write them as well
+    $sql = "SELECT x.accession, x.label, x.description, x.source_id FROM dependent_xref dx, xref x WHERE x.xref_id=dx.master_xref_id AND master_xref_id $id_str";
+    my $dep_sth = $xref_dbi->prepare($sql);
+    $dep_sth->execute();
+
+    $dep_sth->bind_columns(\$accession, \$label, \$description, \$source_id);
+    while (my @row = $dep_sth->fetchrow_array()) {
+      print XREF "$core_xref_id\t$accession\t$label\t$description\tDEPENDENT\n";
+      $source_ids{$source_id} = $source_id;
+      $core_xref_id++;
+    }
+    #print "source_ids: " . join(" ", keys(%source_ids)) . "\n";
+
+  } # while @xref_ids
+
+  close(XREF);
+
+  # now write the exernal_db file - the %source_ids hash will contain the IDs of the
+  # sources that need to be written as external_dbs
+  open(EXTERNAL_DB, ">external_db.txt");
+
+  # get current highest internal ID from external_db
+  my $max_edb_id = 0;
+  my $core_sth = $core_dbi->prepare("SELECT MAX(external_db_id) FROM external_db");
+  $core_sth->execute();
+  my $max_edb_id = ($core_sth->fetchrow_array())[0];
+  if (!defined $max_edb_id) {
+    print "Can't get highest existing external_db_id, using 0\n)";
+  } else {
+    print "Maximum existing external_db_id = $max_edb_id\n";
+  }
+  my $edb_id = $max_edb_id + 1;
+
+  my @source_id_array = keys %source_ids;
+  my $source_id_str;
+  if(@source_id_array > 1)  {
+    $source_id_str = "IN (" . join(',', @source_id_array). ")";
+  } else {
+    $source_id_str = "= " . $source_id_array[0];
+  }
+
+  my $source_sql = "SELECT name, release FROM source WHERE source_id $source_id_str";
+  my $source_sth = $xref_dbi->prepare($source_sql);
+  $source_sth->execute();
+
+  my ($source_name, $release);
+  $source_sth->bind_columns(\$source_name, \$release);
+
+  while (my @row = $source_sth->fetchrow_array()) {
+    print EXTERNAL_DB "$edb_id\t$source_name\t$release\tXREF\n";
+    # TODO knownxref etc??
+    $edb_id++;
+  }
+
+  close(EXTERNAL_DB);
+
+
+
+}
+
 1;
-- 
GitLab