BasicMapper.pm

  while ($trans_sth->fetch()) {
    $transcript_stable_id_to_translation_stable_id{$transcript_stable_id} = $translation_stable_id;
  }
  $trans_sth->finish();

  # Will need lookup tables for gene/transcript/translation stable ID to internal ID
  my $stable_id_to_internal_id = $self->build_stable_id_to_internal_id_hash();

  # SQL / statement handle for getting all direct xrefs
  my $xref_sql = "SELECT dx.general_xref_id, dx.ensembl_stable_id, dx.type, dx.linkage_xref, x.accession, x.version, x.label, x.description, x.source_id, x.species_id FROM direct_xref dx, xref x WHERE dx.general_xref_id=x.xref_id";
  my $xref_sth = $self->xref->dbc->prepare($xref_sql);

  $xref_sth->execute();

  my ($xref_id, $ensembl_stable_id, $type, $linkage_xref, $accession, $version, $label, $description, $source_id, $species_id);
  $xref_sth->bind_columns(\$xref_id, \$ensembl_stable_id, \$type, \$linkage_xref,\ $accession, \$version, \$label, \$description, \$source_id, \$species_id);

  while ($xref_sth->fetch()) {

    my $external_db_id = $source_to_external_db{$source_id};
    if ($external_db_id) {

      # In the case of CCDS xrefs, direct_xref is to transcript but we want
      # the mapping in the core db to be to the *translation*
      if ($source_id == get_source_id_from_source_name($self->xref(), "CCDS")) {
	$type = 'translation';
	my $tmp_esid = $ensembl_stable_id;
	$ensembl_stable_id = $transcript_stable_id_to_translation_stable_id{$tmp_esid};
	warn "Can't find translation for transcript $tmp_esid" if (!$ensembl_stable_id);
	#print "CCDS: transcript $tmp_esid -> translation $ensembl_stable_id\n";
      }

      my $ensembl_internal_id = $stable_id_to_internal_id->{$type}->{$ensembl_stable_id};

      if ($ensembl_internal_id) {

	if (!$xrefs_written{$xref_id}) {
	  print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
	  $xrefs_written{$xref_id} = 1;
	}
	print OBJECT_XREF "$object_xref_id\t$ensembl_internal_id\t" . ucfirst($type) . "\t" . ($xref_id+$xref_id_offset) . "\n";
	$object_xref_id++;
	$count++;

      } else {

	# deal with UTR transcripts in Elegans and potentially others
	# Need to link xrefs that are listed as linking to e.g. ZK829.4
	# to each of ZK829.4.1, ZK829.4.2, ZK829.4.3
	my $old_object_xref_id = $object_xref_id;
	if ($source_id == get_source_id_from_source_name($self->xref(), "wormpep_id")) {

	  # search for matching stable IDs
	  my $pat = $ensembl_stable_id .  '\..+';
	  foreach my $stable_id (keys %{$stable_id_to_internal_id->{$type}}) {

	    if ($stable_id =~ /$pat/) {

	      if (!$xrefs_written{$xref_id}) {
		print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
		$xrefs_written{$xref_id} = 1;
	      }
	      $ensembl_internal_id = $stable_id_to_internal_id->{$type}->{$stable_id};
	      print OBJECT_XREF "$object_xref_id\t$ensembl_internal_id\t" . ucfirst($type) . "\t" . ($xref_id+$xref_id_offset) . "\n";
	      $object_xref_id++;

	    }

	  } # foreach stable_id

	} # if source_id

	# if we haven't changed $object_xref_id, nothing was written
	print STDERR "Can't find $type corresponding to stable ID $ensembl_stable_id in ${type}_stable_id, not writing record for xref $accession\n" if ($object_xref_id == $old_object_xref_id);

      }

     
    }

  }

  close(OBJECT_XREF);
  close(XREF);

  $xref_sth->finish();

  print "Wrote $count direct xrefs\n";

}


# Dump the interpro table from the xref database
sub dump_interpro {

  my $self = shift;

  open (INTERPRO, ">" .  $self->core->dir() . "/interpro.txt");

  my $sth = $self->xref->dbc->prepare("SELECT * FROM interpro");
  $sth->execute();

  my ($interpro, $pfam);
  $sth->bind_columns(\$interpro, \$pfam);
  while ($sth->fetch()) {
    print INTERPRO $interpro . "\t" . $pfam . "\n";
  }
  $sth->finish();

  close (INTERPRO);

}

sub build_stable_id_to_internal_id_hash {

  my ($self) = @_;

  my %stable_id_to_internal_id;

  foreach my $type ('gene', 'transcript', 'translation') { # Add exon here if required

    print "Caching stable ID -> internal ID links for ${type}s\n";

    my $core_sql = "SELECT ${type}_id, stable_id FROM ${type}_stable_id" ;
    my $sth = $self->core->dbc->prepare($core_sql);
    $sth->execute();
    my ($internal_id, $stable_id);
    $sth->bind_columns(\$internal_id, \$stable_id);

    while ($sth->fetch) {

      $stable_id_to_internal_id{$type}{$stable_id} = $internal_id;

    }

  }

  return \%stable_id_to_internal_id;

}

sub get_ensembl_object_type {

  my $filename = shift;
  my $type;

  $filename = basename($filename);

  if ($filename =~ /_dna_/i) {

    $type = "Transcript";

  } elsif ($filename =~ /_peptide_/i) {

    $type = "Translation";

  } else {

    print STDERR "Cannot deduce Ensembl object type from filename $filename\n";
  }

  return $type;

}

sub get_method {

  my $filename = shift;

  $filename = basename($filename);

  my ($method) = $filename =~ /^(.*)_(dna|peptide)_\d+\.map/;

  return $method;

}

sub get_analysis_id {

  my ($self, $ensembl_type) = @_;

  my %typeToLogicName = ( 'transcript' => 'XrefExonerateDNA',
			  'translation' => 'XrefExonerateProtein' );

  my $logic_name = $typeToLogicName{lc($ensembl_type)};

  my $sth = $self->core->dbc->prepare("SELECT analysis_id FROM analysis WHERE logic_name='" . $logic_name ."'");
  $sth->execute();

  my $analysis_id;

  if (my @row = $sth->fetchrow_array()) {

    $analysis_id = $row[0];
#    print "Found exising analysis ID ($analysis_id) for $logic_name\n";

  } else {

    print "No analysis with logic_name $logic_name found, creating ...\n";
    $sth = $self->core->dbc->prepare("INSERT INTO analysis (logic_name, created) VALUES ('" . $logic_name. "', NOW())");
    # TODO - other fields in analysis table
    $sth->execute();
    $analysis_id = $sth->{'mysql_insertid'};
    print "Done (analysis ID=" . $analysis_id. ")\n";

  }
  $sth->finish();

  return $analysis_id;

}


sub dump_core_xrefs {

  my ($self, $xref_ids_hashref, $start_object_xref_id, $xref_id_offset, $object_xref_id_offset,  $ensembl_object_types_hashref) = @_;

  my @xref_ids = keys %$xref_ids_hashref;
  my %xref_to_objects = %$xref_ids_hashref;
  my %ensembl_object_types = %$ensembl_object_types_hashref;

  my $dir = $self->core->dir();

  open (XREF, ">$dir/xref.txt");
  open (OBJECT_XREF, ">>$dir/object_xref.txt");
  open (EXTERNAL_SYNONYM, ">$dir/external_synonym.txt");
  open (GO_XREF, ">$dir/go_xref.txt");

  # keep a unique list of source IDs to build the external_db table later
  my %source_ids;

  my $object_xref_id = $start_object_xref_id;

  # build cache of source id -> external_db id; note %source_to_external_db is global
  %source_to_external_db = $self->map_source_to_external_db();

  # execute several queries with a max of 200 entries in each IN clause - more efficient
  my $batch_size = 200;

  # keep track of what xref_id & object_xref_ids have been written to prevent
  # duplicates; e.g. several dependent xrefs may be dependent on the same master xref.
  # Note %xrefs_written and %object_xrefs_written are global

  while(@xref_ids) {

    my @ids;
    if($#xref_ids > $batch_size) {
      @ids = splice(@xref_ids, 0, $batch_size);
    } else {
      @ids = splice(@xref_ids, 0);
    }

    my $id_str;
    if(@ids > 1)  {
      $id_str = "IN (" . join(',', @ids). ")";
    } else {
      $id_str = "= " . $ids[0];
    }


    my $sql = "SELECT * FROM xref WHERE xref_id $id_str";
    my $xref_sth = $self->xref->dbc->prepare($sql);
    $xref_sth->execute();

    my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id, $master_xref_id, $linkage_annotation);
    $xref_sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id);

    # note the xref_id we write to the file is NOT the one we've just read
    # from the internal xref database as the ID may already exist in the
    # core database so we add on $xref_id_offset
    while ($xref_sth->fetch()) {

      # make sure label is set to /something/ so that the website displays something
      $label = $accession if (!$label);

      if (!$xrefs_written{$xref_id}) {
	my $external_db_id = $source_to_external_db{$source_id};
	if ($external_db_id) { # skip "unknown" sources
	  print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
	  $xrefs_written{$xref_id} = 1;
	  $source_ids{$source_id} = $source_id;
	}
      }
    }

    # Now get the dependent xrefs for each of these xrefs and write them as well
    # Store the go_linkage_annotations as we go along (need for dumping go_xref)
    my $go_source_id = get_source_id_from_source_name($self->xref, "GO");

    $sql = "SELECT DISTINCT(x.xref_id), dx.master_xref_id, x.accession, x.label, x.description, x.source_id, x.version, dx.linkage_annotation FROM dependent_xref dx, xref x WHERE x.xref_id=dx.dependent_xref_id AND master_xref_id $id_str";

    my $dep_sth = $self->xref->dbc->prepare($sql);
    $dep_sth->execute();

    $dep_sth->bind_columns(\$xref_id, \$master_xref_id, \$accession, \$label, \$description, \$source_id, \$version, \$linkage_annotation);
    while ($dep_sth->fetch()) {

      my $external_db_id = $source_to_external_db{$source_id};
      next if (!$external_db_id);


      $label = $accession if (!$label);

      if (!$xrefs_written{$xref_id}) {
	print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\tDEPENDENT\n";
	$xrefs_written{$xref_id} = 1;
	$source_ids{$source_id} = $source_id;
      }

      # create an object_xref linking this (dependent) xref with any objects it maps to
      # write to file and add to object_xref_mappings
      if (defined $xref_to_objects{$master_xref_id}) {
	my @ensembl_object_ids = keys( %{$xref_to_objects{$master_xref_id}} ); 
	#print "xref $accession has " . scalar(@ensembl_object_ids) . " associated ensembl objects\n";
	foreach my $object_id (@ensembl_object_ids) {
	  my $type = $ensembl_object_types{$object_id};
	  my $full_key = $type."|".$object_id."|".$xref_id;
	  if (!$object_xrefs_written{$full_key}) {
	    print OBJECT_XREF "$object_xref_id\t$object_id\t$type\t" . ($xref_id+$xref_id_offset) . "\tDEPENDENT\n";

	    # Add this mapping to the list - note NON-OFFSET xref_id is used
	    my $key = $type . "|" . $object_id;
	    push @{$object_xref_mappings{$key}}, $xref_id;
	    $object_xrefs_written{$full_key} = 1;

	    # Also store *parent's* query/target identity for dependent xrefs
	    $object_xref_identities{$key}->{$xref_id}->{"target_identity"} = $object_xref_identities{$key}->{$master_xref_id}->{"target_identity"};
	    $object_xref_identities{$key}->{$xref_id}->{"query_identity"} = $object_xref_identities{$key}->{$master_xref_id}->{"query_identity"};

	    # write a go_xref with the appropriate linkage type
	    print GO_XREF $object_xref_id . "\t" . $linkage_annotation . "\n"  if ($source_id == $go_source_id);

	    $object_xref_id++;

	  }
	}
      }
    }

    #print "source_ids: " . join(" ", keys(%source_ids)) . "\n";

  } # while @xref_ids

  # Dump any synonyms for xrefs we've written
  # Do one big query to get a list of all the synonyms; note each xref may have
  # more than one synonym so they are stored in a hash of lists
  print "Dumping synonyms\n";
  my $syn_count;
  my %synonyms;
  my $syn_sth = $self->xref->dbc->prepare("SELECT xref_id, synonym FROM synonym");
  $syn_sth->execute();

  my ($xref_id, $synonym);
  $syn_sth->bind_columns(\$xref_id, \$synonym);
  while ($syn_sth->fetch()) {

    push @{$synonyms{$xref_id}}, $synonym;

  }

  # Now write the ones we want to the file
  foreach my $xref_id (keys %synonyms) {
    foreach my $syn (@{$synonyms{$xref_id}}) {
      print EXTERNAL_SYNONYM ($xref_id+$xref_id_offset) . "\t" . $syn . "\n";
      $syn_count++;
    }
  }

  print "Wrote $syn_count synonyms\n";

  close(XREF);
  close(OBJECT_XREF);
  close(EXTERNAL_SYNONYM);
  close(GO_XREF);

  # calculate display_xref_ids for transcripts and genes
  my $transcript_display_xrefs = $self->build_transcript_display_xrefs($xref_id_offset);

  $self->build_genes_to_transcripts();

  $self->build_gene_display_xrefs($transcript_display_xrefs);

  # now build gene descriptions
  $self->build_gene_descriptions();

  return $object_xref_id;

}


# produce output for comparison with existing ensembl mappings
# format is (with header)
# xref_accession ensembl_type ensembl_id

sub dump_comparison {

  my $self = shift;

  my $dir = $self->core->dir();

  print "Dumping comparison data\n";

  open (COMPARISON, ">comparison/xref_mappings.txt");
  print COMPARISON "xref_accession" . "\t" . "ensembl_type" . "\t" . "ensembl_id\n";

  # get the xref accession for each xref as the xref_ids are ephemeral
  # first read all the xrefs that were dumped and get an xref_id->accession map
  my %xref_id_to_accesson;
  open (XREF, "$dir/xref.txt");
  while (<XREF>) {
    my ($xref_id,$external_db_id,$accession,$label,$version,$description) = split;
    $xref_id_to_accesson{$xref_id} = $accession;
  }
  close (XREF);

  open (OBJECT_XREF, "$dir/object_xref.txt");
  while (<OBJECT_XREF>) {
    my ($object_xref_id,$object_id,$type,$xref_id) = split;
    print COMPARISON $xref_id_to_accesson{$xref_id} . "\t" . $type . "\t" . $object_id . "\n";
  }

  close (OBJECT_XREF);
  close (COMPARISON);

}

sub build_transcript_display_xrefs {

  my ($self, $xref_id_offset) = @_;

  my $dir = $self->core->dir();

  # get a list of xref sources; format:
  # key: xref_id value: source_name
  # lots of these; if memory is a problem, just get the source ID (not the name)
  # and look it up elsewhere
  # note %xref_to_source is global
  print "Building xref->source mapping table\n";
  my $sql = "SELECT x.xref_id, s.name FROM source s, xref x WHERE x.source_id=s.source_id";
  my $sth = $self->xref->dbc->prepare($sql);
  $sth->execute();

  my ($xref_id, $source_name);
  $sth->bind_columns(\$xref_id, \$source_name);

  while ($sth->fetch()) {
    $xref_to_source{$xref_id} = $source_name;
  }

  print "Got " . scalar(keys %xref_to_source) . " xref-source mappings\n";

  # Cache the list of translation->transcript mappings & vice versa
  # Note variables are global
  print "Building translation to transcript mappings\n";
  $sth = $self->core->dbc->prepare("SELECT translation_id, transcript_id FROM translation");
  $sth->execute();

  my ($translation_id, $transcript_id);
  $sth->bind_columns(\$translation_id, \$transcript_id);

  while ($sth->fetch()) {
    $translation_to_transcript{$translation_id} = $transcript_id;
    $transcript_to_translation{$transcript_id} = $translation_id if ($translation_id);
  }

  print "Building transcript display_xrefs\n";
  my @priorities = $self->transcript_display_xref_sources();

  my $n = 0;

  # go through each object/xref mapping and store the best ones as we go along
  my %obj_to_best_xref;

  foreach my $key (keys %object_xref_mappings) {

    my ($type, $object_id) = split /\|/, $key;

    next if ($type !~ /(Transcript|Translation)/i);

    # if a transcript has more than one associated xref,
    # use the one with the highest priority, i.e. lower list position in @priorities
    my @xrefs = @{$object_xref_mappings{$key}};
    my ($best_xref, $best_xref_priority_idx);
    # store best query & target identities for each source
    my %best_qi;
    my %best_ti;
    $best_xref_priority_idx = 99999;
    foreach my $xref (@xrefs) {

      my $source = $xref_to_source{$xref};
      if ($source) {
	my $i = find_in_list($source, @priorities);

	my $s = $source . "|" . $xref;
	my $key = $type . "|" . $object_id;
	my $query_identity  = $object_xref_identities{$key}->{$xref}->{"query_identity"};
	my $target_identity = $object_xref_identities{$key}->{$xref}->{"target_identity"};

	# Check if this source has a better priority than the current best one
	# Note if 2 sources are the same priority, the mappings are compared on
	# query_identity then target_identity
#	if ($i > -1 && $i < $best_xref_priority_idx &&
#	    (($query_identity > $best_query_identity) ||
#	    ($query_identity == $best_query_identity && $target_identity > $best_target_identity))) {
	if ($i > -1 && $i < $best_xref_priority_idx && $query_identity > $best_qi{$s}) {
	  $best_xref = $xref;
	  $best_xref_priority_idx = $i;
	  $best_qi{$s} = $query_identity;
	  $best_ti{$s} = $target_identity;
	}
      } else {
	warn("Couldn't find a source for xref $xref \n");
      }
    }
    # store object type, id, and best xref id and source priority
    if ($best_xref) {
      $obj_to_best_xref{$key} = $best_xref . "|" . $best_xref_priority_idx;
    }

  }


  # Now go through each of the calculated best xrefs and convert any that are
  # calculated against translations to be associated with their transcript,
  # if the priority of the translation xref is higher than that of the transcript
  # xref.
  # Needs to be done this way to avoid clobbering higher-priority transcripts.

  # hash keyed on transcript id, value is xref_id|source prioirity index
  my %transcript_display_xrefs;

  # Write a .sql file that can be executed, and a .txt file that can be processed
  open (TRANSCRIPT_DX, ">$dir/transcript_display_xref.sql");
  open (TRANSCRIPT_DX_TXT, ">$dir/transcript_display_xref.txt");

  foreach my $key (keys %obj_to_best_xref) {

    my ($type, $object_id) = split /\|/, $key;

    my ($best_xref, $best_xref_priority_idx) = split /\|/, $obj_to_best_xref{$key};

    # If transcript has a translation, use the best xref out of the transcript & translation

    my $transcript_id;
    my $translation_id;
    if ($type =~ /Transcript/i) {
      $transcript_id = $object_id;
      $translation_id = $transcript_to_translation{$transcript_id};
    }
    elsif ($type =~ /Translation/i) {
      $translation_id = $object_id;
      $transcript_id = $translation_to_transcript{$translation_id};
      $object_id = $transcript_id;
    }
    else{
      print "Cannot deal with type $type\n";
      next;
    }
    if ($translation_id) {
      my ($translation_xref, $translation_priority) = split /\|/, $obj_to_best_xref{"Translation|$translation_id"};
      my ($transcript_xref, $transcript_priority)   = split /\|/, $obj_to_best_xref{"Transcript|$transcript_id"};
      my $transcript_qi = $object_xref_identities{"Transcript|$object_id"}->{$transcript_xref}->{"query_identity"};
      my $translation_qi = $object_xref_identities{"Translation|$object_id"}->{$translation_xref}->{"query_identity"};

      #print "transcript xref $transcript_xref transcript pri $transcript_priority transcript qi: $transcript_qi translation xref $translation_xref translation pri $translation_priority translation qi $translation_qi\n";
      if(!$translation_xref){
	$best_xref = $transcript_xref;
	$best_xref_priority_idx = $transcript_priority;
      }
      if(!$transcript_xref){
	$best_xref = $translation_xref;
	$best_xref_priority_idx = $translation_priority;
      }
      elsif ($translation_priority < $transcript_priority) {
	$best_xref = $translation_xref;
	$best_xref_priority_idx = $translation_priority;
      } else {
	$best_xref = $transcript_xref;
	$best_xref_priority_idx = $transcript_priority;
      }

    }
    if ($best_xref) {

      # Write record with xref_id_offset
      print TRANSCRIPT_DX "UPDATE transcript SET display_xref_id=" . ($best_xref+$xref_id_offset) . " WHERE transcript_id=" . $object_id . ";\n";
      print TRANSCRIPT_DX_TXT ($best_xref+$xref_id_offset) . "\t" . $object_id . "\n";
      $n++;

      my $value = ($best_xref+$xref_id_offset) . "|" . $best_xref_priority_idx;
      $transcript_display_xrefs{$object_id} = $value;

    }

  }

  close(TRANSCRIPT_DX);
  close(TRANSCRIPT_DX_TXT);

  print "Wrote $n transcript display_xref entries to transcript_display_xref.sql\n";

  return \%transcript_display_xrefs;

}


# Assign display_xrefs to genes based on transcripts
# Gene gets the display xref of the highest priority of all of its transcripts
# If more than one transcript with the same priority, longer transcript is used

sub build_gene_display_xrefs {

  my ($self, $transcript_display_xrefs) = @_;

  my $dir = $self->core->dir();

  my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-dbconn => $self->core->dbc);

  my $ta = $db->get_TranscriptAdaptor();

  print "Assigning display_xrefs to genes\n";

  open (GENE_DX, ">$dir/gene_display_xref.sql");
  open (GENE_DX_TXT, ">$dir/gene_display_xref.txt");
  my $hit = 0;
  my $miss = 0;
  my $trans_no_xref = 0;
  my $trans_xref = 0;
  foreach my $gene_id (keys %genes_to_transcripts) {

    my @transcripts = @{$genes_to_transcripts{$gene_id}};

    my $best_xref;
    my $best_xref_priority_idx = 99999;
    my $best_transcript_length = -1;
    foreach my $transcript_id (@transcripts) {
      if (!$transcript_display_xrefs->{$transcript_id}) {
	$trans_no_xref++;
	next;
      } else {
	$trans_xref++;
      }
      my ($xref_id, $priority) = split (/\|/, $transcript_display_xrefs->{$transcript_id});

      # 2 separate if clauses to avoid having to fetch transcripts unnecessarily

      if (($priority < $best_xref_priority_idx)) {

	$best_xref_priority_idx = $priority;
	$best_xref = $xref_id;

      } elsif ($priority == $best_xref_priority_idx) {

	# compare transcript lengths and use longest
	my $transcript = $ta->fetch_by_dbID($transcript_id);
	my $transcript_length = $transcript->length();
	if ($transcript_length > $best_transcript_length) {
	  $best_transcript_length = $transcript_length;
	  $best_xref_priority_idx = $priority;
	  $best_xref = $xref_id;
	}
      }
    }

    if ($best_xref) {
      # Write record
      print GENE_DX "UPDATE gene SET display_xref_id=" . $best_xref . " WHERE gene_id=" . $gene_id . ";\n";
      print GENE_DX_TXT $best_xref . "\t" . $gene_id ."\n";
      $hit++;
    } else {
      $miss++;
    }

  }

  close (GENE_DX);
  close (GENE_DX_TXT);
  print "Transcripts with no xrefs: $trans_no_xref with xrefs: $trans_xref\n";
  print "Wrote $hit gene display_xref entries to gene_display_xref.sql\n";
  print "Couldn't find display_xrefs for $miss genes\n" if ($miss > 0);
  print "Found display_xrefs for all genes\n" if ($miss eq 0);

  return \%genes_to_transcripts;

}

# Display xref sources to be used for transcripts *in order of priority*
# Source names used must be identical to those in the source table.

sub transcript_display_xref_sources {

  return ('HUGO',
	  'MarkerSymbol',
#	  'wormbase_transcript',
	  'flybase_symbol',
	  'Anopheles_symbol',
	  'Genoscope_annotated_gene',
	  'Genoscope_predicted_transcript',
	  'Genoscope_predicted_gene',
	  'Uniprot/SWISSPROT',
	  'RefSeq_peptide',
	  'RefSeq_dna',
	  'Uniprot/SPTREMBL',
	  'LocusLink');

}

# Get transcripts associated with each gene

sub build_genes_to_transcripts {

  my ($self) = @_;

  print "Getting transcripts for all genes\n";

  my $sql = "SELECT gene_id, transcript_id FROM transcript";
  my $sth = $self->core->dbc->prepare($sql);
  $sth->execute();

  my ($gene_id, $transcript_id);
  $sth->bind_columns(\$gene_id, \$transcript_id);

  # Note %genes_to_transcripts is global
  while ($sth->fetch()) {
    push @{$genes_to_transcripts{$gene_id}}, $transcript_id;
  }

  print "Got " . scalar keys(%genes_to_transcripts) . " genes\n";

}

# Find the index of an item in a list(ref), or -1 if it's not in the list.
# Only look for exact matches (case insensitive)

sub find_in_list {

  my ($item, @list) = @_;

  for (my $i = 0; $i < scalar(@list); $i++) {
    if (lc($list[$i]) eq lc($item)) {
      return $i;
    }
  }

  return -1;

}

# Take a string and a list of regular expressions
# Find the index of the highest matching regular expression
# Return the index, or -1 if not found.

sub find_match {

 my ($str, @list) = @_;

 my $str2 = $str;
 my $highest_index = -1;

  for (my $i = 0; $i < scalar(@list); $i++) {
    my $re = $list[$i];
    if ($str2 =~ /$re/i) {
      $highest_index = $i;
    }
  }

  return $highest_index;

}

# Build a map of source id (in xref database) to external_db (in core database)

sub map_source_to_external_db {

  my $self = shift;

  my %source_to_external_db;

  # get all sources
  my $sth = $self->xref->dbc->prepare("SELECT source_id, name FROM source");
  $sth->execute();
  my ($source_id, $source_name);
  $sth->bind_columns(\$source_id, \$source_name);

  while($sth->fetchrow_array()) {

    # find appropriate external_db_id for each one
    my $sql = "SELECT external_db_id FROM external_db WHERE db_name=?";
    my $core_sth = $self->core->dbc->prepare($sql);
    $core_sth->execute($source_name);

    my @row = $core_sth->fetchrow_array();

    if (@row) {

      $source_to_external_db{$source_id} = $row[0];
      #print "Source name $source_name id $source_id corresponds to core external_db_id " . $row[0] . "\n";

    } else {

      print STDERR "Can't find external_db entry for source name $source_name; xrefs for this source will not be written. Consider adding $source_name to external_db\n"

    }

  } # while source

  return %source_to_external_db;
}

# Upload .txt files and execute .sql files.

sub do_upload {

  my ($self, $deleteexisting) = @_;

  my $ensembl = $self->core;
  my $core_db = $ensembl->dbc;
  # xref.txt etc

  # TODO warn if table not empty

  foreach my $table ("xref", "object_xref", "identity_xref", "external_synonym", "gene_description", "go_xref", "interpro") {

    my $file = $ensembl->dir() . "/" . $table . ".txt";
    my $sth;

    if ($deleteexisting) {

      $sth = $core_db->prepare("DELETE FROM $table");
      print "Deleting existing data in $table\n";
      $sth->execute();

    }

    # don't seem to be able to use prepared statements here
    $sth = $core_db->prepare("LOAD DATA INFILE \'$file\' IGNORE INTO TABLE $table");
    print "Uploading data in $file to $table\n";
    $sth->execute();

  }

  # gene_display_xref.sql etc
  foreach my $table ("gene", "transcript") {


    my $file = $ensembl->dir() . "/" . $table . "_display_xref.sql";
    my $sth;

    if ($deleteexisting) {

      $sth = $core_db->prepare("UPDATE $table SET display_xref_id=NULL");
      print "Setting all existing display_xref_id in $table to null\n";
      $sth->execute();

    }

    print "Setting $table display_xrefs from $file\n";
    my $str = "mysql -u " .$core_db->username() ." -p" . $core_db->password() . " -h " . $core_db->host() ." -P " . $core_db->port() . " " .$core_db->dbname() . " < $file";
    system $str;

    #$sth = $core_db->prepare("UPDATE $table SET display_xref_id=? WHERE ${table}_id=?");
    #open(DX_TXT, $file);
    #while (<DX_TXT>) {
    #  my ($xref_id, $object_id) = split;
    #  $sth->execute($xref_id, $object_id);
    #}
    #close(DX_TXT);
  }

}

# Assign gene descriptions
# Algorithm:
# foreach gene
#   get all transcripts & translations
#   get all associated xrefs
#   filter by regexp, discard blank ones
#   order by source & keyword
#   assign description of best xref to gene
# }
#
# One gene may have several associated peptides; the one to use is decided as follows.
# In decreasing order of precedence:
#
# - Consortium xref, e.g. ZFIN for zebrafish
#
# - UniProt/SWISSPROT
#     If there are several, the one with the best %query_id then %target_id is used
#
# - RefSeq
#    If there are several, the one with the best %query_id then %target_id is used
#
# - UniProt/SPTREMBL
#    If there are several, precedence is established on the basis of the occurrence of 
#    regular expression patterns in the description.

sub build_gene_descriptions {

  my ($self) = @_;

  # TODO - don't call this from, but after, gene_display_xref

  # Get all xref descriptions, filtered by regexp.
  # Discard any that are blank (i.e. regexp has removed everything)

  print "Getting & filtering xref descriptions\n";
 # Note %xref_descriptions & %xref_accessions are global

  my $sth = $self->xref->dbc->prepare("SELECT xref_id, accession, description FROM xref");
  $sth->execute();
  my ($xref_id, $accession, $description);
  $sth->bind_columns(\$xref_id, \$accession, \$description);

  my $removed = 0;
  my @regexps = $self->gene_description_filter_regexps();
  while ($sth->fetch()) {
    if ($description) {
      $description = filter_by_regexp($description, \@regexps);
      if ($description ne "") {
	$xref_descriptions{$xref_id} = $description;
	$xref_accessions{$xref_id} = $accession;
      } else {
	$removed++;
      }
    }
  }

  print "Regexp filtering (" . scalar(@regexps) . " regexps) removed $removed descriptions, left with " . scalar(keys %xref_descriptions) . "\n";

  my $dir = $self->core->dir();
  open(GENE_DESCRIPTIONS,">$dir/gene_description.txt") || die "Could not open $dir/gene_description.txt";

  # Foreach gene, get any xrefs associated with its transcripts or translations

  print "Assigning gene descriptions\n";


  foreach my $gene_id (keys %genes_to_transcripts) {

    my @gene_xrefs;

    my %local_xref_to_object;

    my @transcripts = @{$genes_to_transcripts{$gene_id}};
    foreach my $transcript (@transcripts) {

      my @xref_ids;

      my $key = "Transcript|$transcript";
      if ($object_xref_mappings{$key}) {

	@xref_ids = @{$object_xref_mappings{$key}};
	push @gene_xrefs, @xref_ids;
	foreach my $xref (@xref_ids) {
	  $local_xref_to_object{$xref} = $key;
	}

      }

      my $translation = $transcript_to_translation{$transcript};
      $key = "Translation|$translation";
      if ($object_xref_mappings{$key}) {

	@xref_ids = @{$object_xref_mappings{$key}};
	push @gene_xrefs, @xref_ids ;
	foreach my $xref (@xref_ids) {
	  $local_xref_to_object{$xref} = $key;
	}
      }

    }

    # Now sort through these and find the "best" description and write it

    if (@gene_xrefs) {

      @gene_xrefs = sort {compare_xref_descriptions($self->consortium(), $gene_id, \%local_xref_to_object)} @gene_xrefs;

      my $best_xref = $gene_xrefs[-1];
      my $description = $xref_descriptions{$best_xref};
      my $source = $xref_to_source{$best_xref};
      my $acc = $xref_accessions{$best_xref};

      print GENE_DESCRIPTIONS "$gene_id\t$description" . " [Source:$source;Acc:$acc]\n" if ($description);

    }

  } # foreach gene

  close(GENE_DESCRIPTIONS);

}

# remove a list of patterns from a string
sub filter_by_regexp {

  my ($str, $regexps) = @_;

  foreach my $regexp (@$regexps) {