Skip to content
Snippets Groups Projects
BasicMapper.pm 64.7 KiB
Newer Older
      else{
	$transcript .= "$id,";
      }
    }
    else{
      die "Unknown type *".$type."*\n".$temp."\n";
    }
  }
  if($transcript_count){
    chop $transcript; # remove last , from list
    $self->get_stable_ids("transcript",$transcript,\%transcript_2_stable);
  }
  if($translation_count){
    chop $translation;
    $self->get_stable_ids("translation",$translation,\%translation_2_stable);
  }


  open (XREF, ">>" . $self->core->dir() . "/xref.txt");
Ian Longden's avatar
Ian Longden committed
  open (XREF_MISSED, ">" . $self->core->dir() . "/triage.txt");

  foreach my $source ("%RefSeq%","UniProt%"){
    my $sql = "select x.xref_id, x.accession, x.version, x.label, x.description, x.source_id, x.species_id from xref x, source s where s.name like '".$source."' and x.source_id = s.source_id";
    my $sth = $self->xref->dbc->prepare($sql);
    $sth->execute();
    
    my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id);
    $sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id);
    while($sth->fetch()){
      if (!$xrefs_written{$xref_id}) {
	$xrefs_written{$xref_id} = 1;
	my $external_db_id = $source_to_external_db{$source_id};
	if(!defined($updated_source{$external_db_id})){
	  $self->cleanup_sources_file($external_db_id);
	}
	print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
	if(defined($failed_xref_mappings{$xref_id})){
	  my ($ensembl_id,$type,$q_perc,$t_perc,$q_cut,$t_cut) =  split(/\|/,$failed_xref_mappings{$xref_id});
	  print XREF_MISSED  ($xref_id+$xref_id_offset) . "\thighest match is $accession (".$q_perc."%) to ";
	  if($type  =~ /Translation/){
	    print XREF_MISSED $translation_2_stable{$ensembl_id};
	  }
	  elsif($type  =~ /Transcript/){
	    print XREF_MISSED $transcript_2_stable{$ensembl_id};
	  }
	  else{
	    die "type=*".$type."*\n".$failed_xref_mappings{$xref_id}."\n";
	  }
	  print XREF_MISSED " (".$t_perc."\%) which are below there respective cutoffs off $q_cut\% and $t_cut\%.\n";
	  print XREF_MISSED  ($xref_id+$xref_id_offset) . "\t$accession no match to Ensembl\n";
# dump xrefs that don't appear in either the primary_xref or dependent_xref tables
# e.g. Interpro xrefs

sub dump_orphan_xrefs() {

  my ($self, $xref_id_offset) = @_;

  my $count;

  open (XREF, ">>" . $self->core->dir() . "/xref.txt");
  open (XREF_MISSED, ">>" . $self->core->dir() . "/xref_failed.txt");

  # need a double left-join
  my $sql = "SELECT x.xref_id, x.accession, x.version, x.label, x.description, x.source_id, x.species_id FROM xref x LEFT JOIN primary_xref px ON px.xref_id=x.xref_id LEFT JOIN dependent_xref dx ON dx.dependent_xref_id=x.xref_id WHERE px.xref_id IS NULL AND dx.dependent_xref_id IS NULL";

  $sth->execute();

  my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id);
  $sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id);

  while ($sth->fetch()) {

    my $external_db_id = $source_to_external_db{$source_id};
    if ($external_db_id) { # skip "unknown" sources
	if(!defined($updated_source{$external_db_id})){
	  $self->cleanup_sources_file($external_db_id);
	}
	print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";

  print "Wrote $count xrefs that are neither primary nor dependent\n";

}

# Dump direct xrefs. Need to do stable ID -> internal ID mapping.

sub dump_direct_xrefs {

  my ($self, $xref_id_offset, $max_object_xref_id) = @_;
  my $object_xref_id = $max_object_xref_id + 1;

  print "Writing direct xrefs\n";

  open (XREF, ">>" . $self->core->dir() . "/xref.txt");
  open (OBJECT_XREF, ">>" . $self->core->dir() . "/object_xref.txt");
  # Will need to look up translation stable ID from transcript stable ID, build hash table
  print "Building transcript stable ID -> translation stable ID lookup table\n";
  my %transcript_stable_id_to_translation_stable_id;
  my $trans_sth = $self->core->dbc->prepare("SELECT tss.stable_id as transcript, tls.stable_id AS translation FROM translation tl, translation_stable_id tls, transcript_stable_id tss WHERE tss.transcript_id=tl.transcript_id AND tl.translation_id=tls.translation_id");
  $trans_sth->execute();
  my ($transcript_stable_id, $translation_stable_id);
  $trans_sth->bind_columns(\$transcript_stable_id, \$translation_stable_id);
  while ($trans_sth->fetch()) {
    $transcript_stable_id_to_translation_stable_id{$transcript_stable_id} = $translation_stable_id;
  }
  $trans_sth->finish();

  # Will need lookup tables for gene/transcript/translation stable ID to internal ID
  my $stable_id_to_internal_id = $self->build_stable_id_to_internal_id_hash();

  # SQL / statement handle for getting all direct xrefs
  my $xref_sql = "SELECT dx.general_xref_id, dx.ensembl_stable_id, dx.type, dx.linkage_xref, x.accession, x.version, x.label, x.description, x.source_id, x.species_id FROM direct_xref dx, xref x WHERE dx.general_xref_id=x.xref_id";
  my $xref_sth = $self->xref->dbc->prepare($xref_sql);

  $xref_sth->execute();

  my ($xref_id, $ensembl_stable_id, $type, $linkage_xref, $accession, $version, $label, $description, $source_id, $species_id);
  $xref_sth->bind_columns(\$xref_id, \$ensembl_stable_id, \$type, \$linkage_xref,\ $accession, \$version, \$label, \$description, \$source_id, \$species_id);

  while ($xref_sth->fetch()) {

    my $external_db_id = $source_to_external_db{$source_id};
    if ($external_db_id) {

      # In the case of CCDS xrefs, direct_xref is to transcript but we want
      # the mapping in the core db to be to the *translation*
      if ($source_id == get_source_id_from_source_name($self->xref(), "CCDS")) {
	$type = 'translation';
	my $tmp_esid = $ensembl_stable_id;
	$ensembl_stable_id = $transcript_stable_id_to_translation_stable_id{$tmp_esid};
	warn "Can't find translation for transcript $tmp_esid" if (!$ensembl_stable_id);
	#print "CCDS: transcript $tmp_esid -> translation $ensembl_stable_id\n";
      }

      my $ensembl_internal_id = $stable_id_to_internal_id->{$type}->{$ensembl_stable_id};
	  if(!defined($updated_source{$external_db_id})){
	    $self->cleanup_sources_file($external_db_id);
	  }
	  print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
	  $xrefs_written{$xref_id} = 1;
	}
	print OBJECT_XREF "$object_xref_id\t$ensembl_internal_id\t" . ucfirst($type) . "\t" . ($xref_id+$xref_id_offset) . "\n";
	# deal with UTR transcripts in Elegans and potentially others
	# Need to link xrefs that are listed as linking to e.g. ZK829.4
	# to each of ZK829.4.1, ZK829.4.2, ZK829.4.3
	my $old_object_xref_id = $object_xref_id;
	if ($source_id == get_source_id_from_source_name($self->xref(), "wormpep_id")) {

	  # search for matching stable IDs
	  my $pat = $ensembl_stable_id .  '\..+';
	  foreach my $stable_id (keys %{$stable_id_to_internal_id->{$type}}) {

	    if ($stable_id =~ /$pat/) {

	      if (!$xrefs_written{$xref_id}) {
		if(!defined($updated_source{$external_db_id})){
		  $self->cleanup_sources_file($external_db_id);
		}
		print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
		$xrefs_written{$xref_id} = 1;
	      }
	      $ensembl_internal_id = $stable_id_to_internal_id->{$type}->{$stable_id};
	      print OBJECT_XREF "$object_xref_id\t$ensembl_internal_id\t" . ucfirst($type) . "\t" . ($xref_id+$xref_id_offset) . "\n";
	      $object_xref_id++;

	    }

	  } # foreach stable_id

	} # if source_id

	# if we haven't changed $object_xref_id, nothing was written
	print STDERR "Can't find $type corresponding to stable ID $ensembl_stable_id in ${type}_stable_id, not writing record for xref $accession\n" if ($object_xref_id == $old_object_xref_id);
    }

  }

  close(OBJECT_XREF);
  close(XREF);

  $xref_sth->finish();

  print "Wrote $count direct xrefs\n";

}


# Dump the interpro table from the xref database
sub dump_interpro {

  my $self = shift;

  open (INTERPRO, ">" .  $self->core->dir() . "/interpro.txt");
  my $sth = $self->xref->dbc->prepare("SELECT * FROM interpro");
  $sth->execute();

  my ($interpro, $pfam);
  $sth->bind_columns(\$interpro, \$pfam);
  while ($sth->fetch()) {
    print INTERPRO $interpro . "\t" . $pfam . "\n";
  }
  $sth->finish();

  close (INTERPRO);

}

sub build_stable_id_to_internal_id_hash {


  my %stable_id_to_internal_id;

  foreach my $type ('gene', 'transcript', 'translation') { # Add exon here if required

    print "Caching stable ID -> internal ID links for ${type}s\n";

    my $core_sql = "SELECT ${type}_id, stable_id FROM ${type}_stable_id" ;
    my $sth = $self->core->dbc->prepare($core_sql);
    $sth->execute();
    my ($internal_id, $stable_id);
    $sth->bind_columns(\$internal_id, \$stable_id);

    while ($sth->fetch) {

      $stable_id_to_internal_id{$type}{$stable_id} = $internal_id;

    }

  }

  return \%stable_id_to_internal_id;

}

sub get_ensembl_object_type {

  my $filename = shift;
  my $type;

  $filename = basename($filename);

  if ($filename =~ /_dna_/i) {
  } elsif ($filename =~ /_peptide_/i) {
    print STDERR "Cannot deduce Ensembl object type from filename $filename\n";
sub get_method {

  my $filename = shift;

  $filename = basename($filename);

  my ($method) = $filename =~ /^(.*)_(dna|peptide)_\d+\.map/;

  return $method;

}
  my ($self, $ensembl_type) = @_;
Ian Longden's avatar
Ian Longden committed
  my %typeToLogicName = ( 'transcript' => 'XrefExonerateDNA',
			  'translation' => 'XrefExonerateProtein' );

  my $logic_name = $typeToLogicName{lc($ensembl_type)};

  my $sth = $self->core->dbc->prepare("SELECT analysis_id FROM analysis WHERE logic_name='" . $logic_name ."'");
  $sth->execute();

  my $analysis_id;

  if (my @row = $sth->fetchrow_array()) {

    $analysis_id = $row[0];
Ian Longden's avatar
Ian Longden committed
#    print "Found exising analysis ID ($analysis_id) for $logic_name\n";

  } else {

    print "No analysis with logic_name $logic_name found, creating ...\n";
    $sth = $self->core->dbc->prepare("INSERT INTO analysis (logic_name, created) VALUES ('" . $logic_name. "', NOW())");
    # TODO - other fields in analysis table
    $sth->execute();
    $analysis_id = $sth->{'mysql_insertid'};
    print "Done (analysis ID=" . $analysis_id. ")\n";

  }
  my ($self, $xref_ids_hashref, $start_object_xref_id, $xref_id_offset, $object_xref_id_offset,  $ensembl_object_types_hashref) = @_;
  my @xref_ids = keys %$xref_ids_hashref;
  my %xref_to_objects = %$xref_ids_hashref;
  my %ensembl_object_types = %$ensembl_object_types_hashref;

  open (XREF, ">$dir/xref.txt");
  open (OBJECT_XREF, ">>$dir/object_xref.txt");
  open (EXTERNAL_SYNONYM, ">$dir/external_synonym.txt");
  # Cache synonyms for later use
  # Do one big query to get a list of all the synonyms; note each xref may have
  # more than one synonym so they are stored in a hash of lists
  my $syn_count;
  my %synonyms;
  my $syn_sth = $self->xref->dbc->prepare("SELECT xref_id, synonym FROM synonym");
  $syn_sth->execute();

  my ($sxref_id, $synonym);
  $syn_sth->bind_columns(\$sxref_id, \$synonym);
  while ($syn_sth->fetch()) {

    push @{$synonyms{$sxref_id}}, $synonym;

  }

  # keep a unique list of source IDs to build the external_db table later
  my %source_ids;

  my $object_xref_id = $start_object_xref_id;

  # build cache of source id -> external_db id; note %source_to_external_db is global
  %source_to_external_db = $self->map_source_to_external_db();
  # execute several queries with a max of 200 entries in each IN clause - more efficient
  my $batch_size = 200;

  # keep track of what xref_id & object_xref_ids have been written to prevent
  # duplicates; e.g. several dependent xrefs may be dependent on the same master xref.
  # Note %xrefs_written and %object_xrefs_written are global
  while(@xref_ids) {

    my @ids;
    if($#xref_ids > $batch_size) {
      @ids = splice(@xref_ids, 0, $batch_size);
    } else {
      @ids = splice(@xref_ids, 0);
    }

    my $id_str;
    if(@ids > 1)  {
      $id_str = "IN (" . join(',', @ids). ")";
    } else {
      $id_str = "= " . $ids[0];
    }


    my $sql = "SELECT * FROM xref WHERE xref_id $id_str";
    my $xref_sth = $self->xref->dbc->prepare($sql);
    my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id, $master_xref_id, $linkage_annotation);
    $xref_sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id);

    # note the xref_id we write to the file is NOT the one we've just read
    # from the internal xref database as the ID may already exist in the
    # core database so we add on $xref_id_offset
    while ($xref_sth->fetch()) {
      # make sure label is set to /something/ so that the website displays something
      $label = $accession if (!$label);

      if (!$xrefs_written{$xref_id}) {
	my $external_db_id = $source_to_external_db{$source_id};
	if ($external_db_id) { # skip "unknown" sources
	  if(!defined($updated_source{$external_db_id})){
	    $self->cleanup_sources_file($external_db_id);
	  }
	  print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
	  $xrefs_written{$xref_id} = 1;
	  $source_ids{$source_id} = $source_id;
	}
    }

    # Now get the dependent xrefs for each of these xrefs and write them as well
    # Store the go_linkage_annotations as we go along (need for dumping go_xref)
    my $go_source_id = get_source_id_from_source_name($self->xref, "GO");

    $sql = "SELECT DISTINCT(x.xref_id), dx.master_xref_id, x.accession, x.label, x.description, x.source_id, x.version, dx.linkage_annotation FROM dependent_xref dx, xref x WHERE x.xref_id=dx.dependent_xref_id AND master_xref_id $id_str";
    my $dep_sth = $self->xref->dbc->prepare($sql);
    $dep_sth->bind_columns(\$xref_id, \$master_xref_id, \$accession, \$label, \$description, \$source_id, \$version, \$linkage_annotation);
    while ($dep_sth->fetch()) {
      my $external_db_id = $source_to_external_db{$source_id};
	if(!defined($updated_source{$external_db_id})){
	  $self->cleanup_sources_file($external_db_id);
	}
	print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\tDEPENDENT\n";
	$xrefs_written{$xref_id} = 1;
	$source_ids{$source_id} = $source_id;
      }
      # create an object_xref linking this (dependent) xref with any objects it maps to
      # write to file and add to object_xref_mappings
      if (defined $xref_to_objects{$master_xref_id}) {
	my @ensembl_object_ids = keys( %{$xref_to_objects{$master_xref_id}} ); 
	#print "xref $accession has " . scalar(@ensembl_object_ids) . " associated ensembl objects\n";
	foreach my $object_id (@ensembl_object_ids) {
	  my $type = $ensembl_object_types{$object_id};
	  my $full_key = $type."|".$object_id."|".$xref_id;
	  if (!$object_xrefs_written{$full_key}) {
	    print OBJECT_XREF "$object_xref_id\t$object_id\t$type\t" . ($xref_id+$xref_id_offset) . "\tDEPENDENT\n";
	    # Add this mapping to the list - note NON-OFFSET xref_id is used
	    my $key = $type . "|" . $object_id;
	    push @{$object_xref_mappings{$key}}, $xref_id;
	    $object_xrefs_written{$full_key} = 1;
	    # Also store *parent's* query/target identity for dependent xrefs
	    $object_xref_identities{$key}->{$xref_id}->{"target_identity"} = $object_xref_identities{$key}->{$master_xref_id}->{"target_identity"};
	    $object_xref_identities{$key}->{$xref_id}->{"query_identity"} = $object_xref_identities{$key}->{$master_xref_id}->{"query_identity"};
	    # write a go_xref with the appropriate linkage type
	    print GO_XREF $object_xref_id . "\t" . $linkage_annotation . "\n"  if ($source_id == $go_source_id);

	    $object_xref_id++;

    #print "source_ids: " . join(" ", keys(%source_ids)) . "\n";

  } # while @xref_ids
  # Dump any synonyms for xrefs we've written
  # Now write the synonyms we want to the file
  foreach my $xref_id (keys %synonyms) {
    foreach my $syn (@{$synonyms{$xref_id}}) {
      print EXTERNAL_SYNONYM ($xref_id+$xref_id_offset) . "\t" . $syn . "\n";
      $syn_count++;
    }
  }
  print "Wrote $syn_count synonyms\n";
  close(EXTERNAL_SYNONYM);
  # calculate display_xref_ids for transcripts and genes
  my $transcript_display_xrefs = $self->build_transcript_display_xrefs($xref_id_offset);

  $self->build_gene_display_xrefs($transcript_display_xrefs);

  # now build gene descriptions
  $self->build_gene_descriptions();
Glenn Proctor's avatar
Glenn Proctor committed
# produce output for comparison with existing ensembl mappings
# format is (with header)
# xref_accession ensembl_type ensembl_id

sub dump_comparison {

Glenn Proctor's avatar
Glenn Proctor committed
  print "Dumping comparison data\n";

  open (COMPARISON, ">comparison/xref_mappings.txt");
  print COMPARISON "xref_accession" . "\t" . "ensembl_type" . "\t" . "ensembl_id\n";
Glenn Proctor's avatar
Glenn Proctor committed

  # get the xref accession for each xref as the xref_ids are ephemeral
  # first read all the xrefs that were dumped and get an xref_id->accession map
  my %xref_id_to_accesson;
  open (XREF, "$dir/xref.txt");
Glenn Proctor's avatar
Glenn Proctor committed
  while (<XREF>) {
    my ($xref_id,$external_db_id,$accession,$label,$version,$description) = split;
Glenn Proctor's avatar
Glenn Proctor committed
    $xref_id_to_accesson{$xref_id} = $accession;
  }
  close (XREF);

  open (OBJECT_XREF, "$dir/object_xref.txt");
Glenn Proctor's avatar
Glenn Proctor committed
  while (<OBJECT_XREF>) {
    my ($object_xref_id,$object_id,$type,$xref_id) = split;
    print COMPARISON $xref_id_to_accesson{$xref_id} . "\t" . $type . "\t" . $object_id . "\n";
  }

  close (OBJECT_XREF);
  close (COMPARISON);
sub build_transcript_display_xrefs {

  # get a list of xref sources; format:
  # key: xref_id value: source_name
  # lots of these; if memory is a problem, just get the source ID (not the name)
  # and look it up elsewhere
  print "Building xref->source mapping table\n";
  my $sql = "SELECT x.xref_id, s.name FROM source s, xref x WHERE x.source_id=s.source_id";
  $sth->execute();

  my ($xref_id, $source_name);
  $sth->bind_columns(\$xref_id, \$source_name);

  while ($sth->fetch()) {
    $xref_to_source{$xref_id} = $source_name;
  }

  print "Got " . scalar(keys %xref_to_source) . " xref-source mappings\n";

  # Cache the list of translation->transcript mappings & vice versa
  print "Building translation to transcript mappings\n";
  $sth = $self->core->dbc->prepare("SELECT translation_id, transcript_id FROM translation");
  $sth->execute();

  my ($translation_id, $transcript_id);
  $sth->bind_columns(\$translation_id, \$transcript_id);

  while ($sth->fetch()) {
    $translation_to_transcript{$translation_id} = $transcript_id;
    $transcript_to_translation{$transcript_id} = $translation_id if ($translation_id);
  print "Building transcript display_xrefs\n";
  my @priorities = $self->transcript_display_xref_sources();


  # go through each object/xref mapping and store the best ones as we go along
  my %obj_to_best_xref;

  foreach my $key (keys %object_xref_mappings) {
    my ($type, $object_id) = split /\|/, $key;

    next if ($type !~ /(Transcript|Translation)/i);

    # if a transcript has more than one associated xref,
    # use the one with the highest priority, i.e. lower list position in @priorities
    my @xrefs = @{$object_xref_mappings{$key}};
    my ($best_xref, $best_xref_priority_idx);
    # store best query & target identities for each source
    my %best_qi;
    my %best_ti;
    $best_xref_priority_idx = 99999;
    foreach my $xref (@xrefs) {
      my $source = $xref_to_source{$xref};
      if ($source) {
	my $i = find_in_list($source, @priorities);
	my $key = $type . "|" . $object_id;
	my $query_identity  = $object_xref_identities{$key}->{$xref}->{"query_identity"};
	my $target_identity = $object_xref_identities{$key}->{$xref}->{"target_identity"};

	# Check if this source has a better priority than the current best one
	# Note if 2 sources are the same priority, the mappings are compared on
	# query_identity then target_identity
#	if ($i > -1 && $i < $best_xref_priority_idx &&
#	    (($query_identity > $best_query_identity) ||
#	    ($query_identity == $best_query_identity && $target_identity > $best_target_identity))) {
	if ($i > -1 && $i <= $best_xref_priority_idx && $query_identity > $best_qi{$s}) {
	  $best_xref = $xref;
	  $best_xref_priority_idx = $i;
	  $best_qi{$s} = $query_identity;
	  $best_ti{$s} = $target_identity;
	warn("Couldn't find a source for xref id $xref " . $xref_accessions{$xref_id} . "\n");
    # store object type, id, and best xref id and source priority
    if ($best_xref) {
      $obj_to_best_xref{$key} = $best_xref . "|" . $best_xref_priority_idx;
    }
  # Now go through each of the calculated best xrefs and convert any that are
  # calculated against translations to be associated with their transcript,
  # if the priority of the translation xref is higher than that of the transcript
  # xref.
  # Needs to be done this way to avoid clobbering higher-priority transcripts.

  # hash keyed on transcript id, value is xref_id|source prioirity index
  my %transcript_display_xrefs;

  # Write a .sql file that can be executed, and a .txt file that can be processed
  open (TRANSCRIPT_DX, ">$dir/transcript_display_xref.sql");
  open (TRANSCRIPT_DX_TXT, ">$dir/transcript_display_xref.txt");

  foreach my $key (keys %obj_to_best_xref) {

    my ($type, $object_id) = split /\|/, $key;

    my ($best_xref, $best_xref_priority_idx) = split /\|/, $obj_to_best_xref{$key};

    # If transcript has a translation, use the best xref out of the transcript & translation
    my $transcript_id;
    my $translation_id;
    if ($type =~ /Transcript/i) {
      $transcript_id = $object_id;
      $translation_id = $transcript_to_translation{$transcript_id};
    }
    elsif ($type =~ /Translation/i) {
      $translation_id = $object_id;
      $transcript_id = $translation_to_transcript{$translation_id};
      $object_id = $transcript_id;
    }
    else{
      next;
    }
    if ($translation_id) {
      my ($translation_xref, $translation_priority) = split /\|/, $obj_to_best_xref{"Translation|$translation_id"};
      my ($transcript_xref, $transcript_priority)   = split /\|/, $obj_to_best_xref{"Transcript|$transcript_id"};
      my $transcript_qi = $object_xref_identities{"Transcript|$object_id"}->{$transcript_xref}->{"query_identity"};
      my $translation_qi = $object_xref_identities{"Translation|$object_id"}->{$translation_xref}->{"query_identity"};
      #print "transcript xref $transcript_xref transcript pri $transcript_priority transcript qi: $transcript_qi translation xref $translation_xref translation pri $translation_priority translation qi $translation_qi\n";
      if(!$translation_xref){
	$best_xref = $transcript_xref;
	$best_xref_priority_idx = $transcript_priority;
      }
      if(!$transcript_xref){
	$best_xref = $translation_xref;
	$best_xref_priority_idx = $translation_priority;
      elsif ($translation_priority < $transcript_priority) {
	$best_xref = $translation_xref;
	$best_xref_priority_idx = $translation_priority;
      } else {
	$best_xref = $transcript_xref;
	$best_xref_priority_idx = $transcript_priority;
      }
      # Write record with xref_id_offset
      print TRANSCRIPT_DX "UPDATE transcript SET display_xref_id=" . ($best_xref+$xref_id_offset) . " WHERE transcript_id=" . $object_id . ";\n";
      print TRANSCRIPT_DX_TXT ($best_xref+$xref_id_offset) . "\t" . $object_id . "\n";

      my $value = ($best_xref+$xref_id_offset) . "|" . $best_xref_priority_idx;
      $transcript_display_xrefs{$object_id} = $value;
  }

  close(TRANSCRIPT_DX);

  print "Wrote $n transcript display_xref entries to transcript_display_xref.sql\n";

  return \%transcript_display_xrefs;

}


# Assign display_xrefs to genes based on transcripts
# Gene gets the display xref of the highest priority of all of its transcripts
# If more than one transcript with the same priority, longer transcript is used


  my ($self, $transcript_display_xrefs) = @_;

  my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-dbconn => $self->core->dbc);
  my $ta = $db->get_TranscriptAdaptor();

  print "Assigning display_xrefs to genes\n";

  open (GENE_DX, ">$dir/gene_display_xref.sql");
  open (GENE_DX_TXT, ">$dir/gene_display_xref.txt");
  my $trans_no_xref = 0;
  my $trans_xref = 0;
  foreach my $gene_id (keys %genes_to_transcripts) {

    my @transcripts = @{$genes_to_transcripts{$gene_id}};

    my $best_xref;
    my $best_xref_priority_idx = 99999;
    my $best_transcript_length = -1;
    foreach my $transcript_id (@transcripts) {
      if (!$transcript_display_xrefs->{$transcript_id}) {
	$trans_no_xref++;
	next;
      } else {
	$trans_xref++;
      }
      my ($xref_id, $priority) = split (/\|/, $transcript_display_xrefs->{$transcript_id});
      # 2 separate if clauses to avoid having to fetch transcripts unnecessarily
      if (($priority < $best_xref_priority_idx)) {
	$best_xref_priority_idx = $priority;
	$best_xref = $xref_id;
      } elsif ($priority == $best_xref_priority_idx) {

	# compare transcript lengths and use longest
	my $transcript = $ta->fetch_by_dbID($transcript_id);
	my $transcript_length = $transcript->length();
	if ($transcript_length > $best_transcript_length) {
	  $best_transcript_length = $transcript_length;
	  $best_xref_priority_idx = $priority;
	  $best_xref = $xref_id;
	}
      print GENE_DX "UPDATE gene SET display_xref_id=" . $best_xref . " WHERE gene_id=" . $gene_id . ";\n";
      print GENE_DX_TXT $best_xref . "\t" . $gene_id ."\n";
  print "Transcripts with no xrefs: $trans_no_xref with xrefs: $trans_xref\n";
  print "Wrote $hit gene display_xref entries to gene_display_xref.sql\n";
  print "Couldn't find display_xrefs for $miss genes\n" if ($miss > 0);
  print "Found display_xrefs for all genes\n" if ($miss eq 0);
}

# Display xref sources to be used for transcripts *in order of priority*
# Source names used must be identical to those in the source table.

sub transcript_display_xref_sources {

  return ('HUGO',
	  'MarkerSymbol',
	  'flybase_symbol',
	  'Anopheles_symbol',
	  'Genoscope_predicted_transcript',
	  'Genoscope_predicted_gene',
	  'Uniprot/SWISSPROT',
Glenn Proctor's avatar
 
Glenn Proctor committed
	  'RefSeq_peptide',
	  'RefSeq_dna',
	  'Uniprot/SPTREMBL',
# Get transcripts associated with each gene

sub build_genes_to_transcripts {

  my ($self) = @_;

  print "Getting transcripts for all genes\n";

  my $sql = "SELECT gene_id, transcript_id FROM transcript";
  $sth->execute();

  my ($gene_id, $transcript_id);
  $sth->bind_columns(\$gene_id, \$transcript_id);

  # Note %genes_to_transcripts is global
  while ($sth->fetch()) {
    push @{$genes_to_transcripts{$gene_id}}, $transcript_id;
  }

  print "Got " . scalar keys(%genes_to_transcripts) . " genes\n";

}
# Find the index of an item in a list(ref), or -1 if it's not in the list.
# Only look for exact matches (case insensitive)

sub find_in_list {

  my ($item, @list) = @_;

  for (my $i = 0; $i < scalar(@list); $i++) {
    if (lc($list[$i]) eq lc($item)) {
  return -1;

}

# Take a string and a list of regular expressions
# Find the index of the highest matching regular expression
# Return the index, or -1 if not found.

sub find_match {

 my ($str, @list) = @_;

 my $str2 = $str;
 my $highest_index = -1;

  for (my $i = 0; $i < scalar(@list); $i++) {
    my $re = $list[$i];
    if ($str2 =~ /$re/i) {
      $highest_index = $i;
    }
  }

  return $highest_index;
# Build a map of source id (in xref database) to external_db (in core database)

sub map_source_to_external_db {

  my $self = shift;

  my %source_to_external_db;

  # get all sources
  my $sth = $self->xref->dbc->prepare("SELECT source_id, name FROM source");
  $sth->execute();
  my ($source_id, $source_name);
  $sth->bind_columns(\$source_id, \$source_name);

  while($sth->fetchrow_array()) {

    # find appropriate external_db_id for each one
    my $sql = "SELECT external_db_id FROM external_db WHERE db_name=?";
    my $core_sth = $self->core->dbc->prepare($sql);
    $core_sth->execute($source_name);

    my @row = $core_sth->fetchrow_array();

    if (@row) {

      $source_to_external_db{$source_id} = $row[0];
      #print "Source name $source_name id $source_id corresponds to core external_db_id " . $row[0] . "\n";

    } else {

      print STDERR "Can't find external_db entry for source name $source_name; xrefs for this source will not be written. Consider adding $source_name to external_db\n"

sub cleanup_sources_file{
  my ($self,$id) = @_;

  $updated_source{$id} =1;

  my $dir = $self->core->dir();
  open (DEL, ">>$dir/cleanup.sql") || die "Could not open $dir/cleanup.sql\n";


  print DEL "DELETE external_synonym ";
  print DEL     "FROM external_synonym, xref ";
  print DEL       "WHERE external_synonym.xref_id = xref.xref_id ";
  print DEL         "AND xref.external_db_id = $id\n";


  print DEL "DELETE identity_xref ";
  print DEL     "FROM identity_xref, object_xref, xref ";
  print DEL       "WHERE identity_xref.object_xref_id = object_xref.object_xref_id ";
  print DEL         "AND object_xref.xref_id = xref.xref_id ";
  print DEL         "AND xref.external_db_id = $id \n";


  print DEL "DELETE object_xref ";
  print DEL     "FROM object_xref, xref ";
  print DEL       "WHERE object_xref.xref_id = xref.xref_id ";
  print DEL         "AND xref.external_db_id = $id\n";


  print DEL "DELETE FROM xref WHERE xref.external_db_id = $id \n";

  close DEL;

}

# Upload .txt files and execute .sql files.

sub do_upload {

  my $ensembl = $self->core;
  my $core_db = $ensembl->dbc;
  if(defined($self->delete_existing())){

    my $file = $ensembl->dir() . "/cleanup.sql";
    open(CLEAN,"<$file") || die "could not open $file for reading \n";
    while(<CLEAN>){