BasicMapper.pm

    while ($dep_sth->fetch()) {

      my $external_db_id = $source_to_external_db{$source_id};
      next if (!$external_db_id);

      $label = $accession if (!$label);

      if (!$xrefs_written{$xref_id}) {
	print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\tDEPENDENT\n";
	$xrefs_written{$xref_id} = 1;
	$source_ids{$source_id} = $source_id;
      }

      # create an object_xref linking this (dependent) xref with any objects it maps to
      # write to file and add to object_xref_mappings
      if (defined $xref_to_objects{$master_xref_id}) {
	my @ensembl_object_ids = keys( %{$xref_to_objects{$master_xref_id}} ); 
	#print "xref $accession has " . scalar(@ensembl_object_ids) . " associated ensembl objects\n";
	foreach my $object_id (@ensembl_object_ids) {
	  my $type = $ensembl_object_types{$object_id};
	  my $full_key = $type."|".$object_id."|".$xref_id;
	  if (!$object_xrefs_written{$full_key}) {
	    print OBJECT_XREF "$object_xref_id\t$object_id\t$type\t" . ($xref_id+$xref_id_offset) . "\tDEPENDENT\n";
	    $object_xref_id++;
	    # Add this mapping to the list - note NON-OFFSET xref_id is used
	    my $key = $type . "|" . $object_id;
	    push @{$object_xref_mappings{$key}}, $xref_id;
	    $object_xrefs_written{$full_key} = 1;
	  }
	}
      }
    }

    # Now get the synonyms for each of these xrefs and write them to the external_synonym table
    $sql = "SELECT DISTINCT xref_id, synonym FROM synonym WHERE xref_id $id_str";

    my $syn_sth = $xref_dbi->prepare($sql);
    $syn_sth->execute();

    $syn_sth->bind_columns(\$xref_id, \$accession);
    while ($syn_sth->fetch()) {

      print EXTERNAL_SYNONYM ($xref_id+$xref_id_offset) . "\t" . $accession . "\n";

    }

    #print "source_ids: " . join(" ", keys(%source_ids)) . "\n";

  } # while @xref_ids

  close(XREF);
  close(OBJECT_XREF);
  close(EXTERNAL_SYNONYM);

  print "Before calling display_xref, object_xref_mappings size " . scalar (keys %object_xref_mappings) . "\n";

  # calculate display_xref_ids for transcripts and genes
  my $transcript_display_xrefs = $self->build_transcript_display_xrefs($xref_id_offset);

  $self->build_gene_display_xrefs_and_descriptions($transcript_display_xrefs);

}


# produce output for comparison with existing ensembl mappings
# format is (with header)
# xref_accession ensembl_type ensembl_id

sub dump_comparison {

  my $self = shift;

  my $dir = $self->dir();

  print "Dumping comparison data\n";

  open (COMPARISON, ">comparison/xref_mappings.txt");
  print COMPARISON "xref_accession" . "\t" . "ensembl_type" . "\t" . "ensembl_id\n";

  # get the xref accession for each xref as the xref_ids are ephemeral
  # first read all the xrefs that were dumped and get an xref_id->accession map
  my %xref_id_to_accesson;
  open (XREF, "$dir/xref.txt");
  while (<XREF>) {
    my ($xref_id,$external_db_id,$accession,$label,$version,$description) = split;
    $xref_id_to_accesson{$xref_id} = $accession;
  }
  close (XREF);

  open (OBJECT_XREF, "$dir/object_xref.txt");
  while (<OBJECT_XREF>) {
    my ($object_xref_id,$object_id,$type,$xref_id) = split;
    print COMPARISON $xref_id_to_accesson{$xref_id} . "\t" . $type . "\t" . $object_id . "\n";
  }

  close (OBJECT_XREF);
  close (COMPARISON);

}

sub build_transcript_display_xrefs {

  my ($self, $xref_id_offset) = @_;

  my $dir = $self->dir();

  # get a list of xref sources; format:
  # key: xref_id value: source_name
  # lots of these; if memory is a problem, just get the source ID (not the name)
  # and look it up elsewhere
  # note %xref_to_source is global
  print "Building xref->source mapping table\n";
  my $sql = "SELECT x.xref_id, s.name FROM source s, xref x WHERE x.source_id=s.source_id";
  my $sth = $self->xref->dbi()->prepare($sql);
  $sth->execute();

  my ($xref_id, $source_name);
  $sth->bind_columns(\$xref_id, \$source_name);

  while ($sth->fetch()) {
    $xref_to_source{$xref_id} = $source_name;
  }

  print "Got " . scalar(keys %xref_to_source) . " xref-source mappings\n";

  # Cache the list of translation->transcript mappings & vice versa
  # Nte variables are global
  print "Building translation to transcript mappings\n";
  my $sth = $self->dbi()->prepare("SELECT translation_id, transcript_id FROM translation");
  $sth->execute();

  my ($translation_id, $transcript_id);
  $sth->bind_columns(\$translation_id, \$transcript_id);

  while ($sth->fetch()) {
    $translation_to_transcript{$translation_id} = $transcript_id;
    $transcript_to_translation{$transcript_id} = $translation_id if ($translation_id);
  }

  print "Building transcript display_xrefs\n";
  my @priorities = $self->transcript_display_xref_sources();

  my $n = 0;

  # go through each object/xref mapping and store the best ones as we go along
  my %obj_to_best_xref;

  foreach my $key (keys %object_xref_mappings) {

    my ($type, $object_id) = split /\|/, $key;

    next if ($type !~ /(Transcript|Translation)/i);

    # if a transcript has more than one associated xref,
    # use the one with the highest priority, i.e. lower list position in @priorities
    my @xrefs = @{$object_xref_mappings{$key}};
    my ($best_xref, $best_xref_priority_idx);
    $best_xref_priority_idx = 99999;
    foreach my $xref (@xrefs) {

      my $source = $xref_to_source{$xref};
      if ($source) {
	my $i = find_in_list($source, @priorities);
	if ($i > -1 && $i < $best_xref_priority_idx) {
	  $best_xref = $xref;
	  $best_xref_priority_idx = $i;
	}
      } else {
	warn("Couldn't find a source for xref $xref \n");
      }
    }
    # store object type, id, and best xref id and source priority
    if ($best_xref) {
      $obj_to_best_xref{$key} = $best_xref . "|" . $best_xref_priority_idx;
    }

  }

  # Now go through each of the calculated best xrefs and convert any that are
  # calculated against translations to be associated with their transcript,
  # if the priority of the translation xref is higher than that of the transcript
  # xref.
  # Needs to be done this way to avoid clobbering higher-priority transcripts.

  # hash keyed on transcript id, value is xref_id|source prioirity index
  my %transcript_display_xrefs;

  # Write a .sql file that can be executed, and a .txt file that can be processed
  open (TRANSCRIPT_DX, ">$dir/transcript_display_xref.sql");
  open (TRANSCRIPT_DX_TXT, ">$dir/transcript_display_xref.txt");

  foreach my $key (keys %obj_to_best_xref) {

    my ($type, $object_id) = split /\|/, $key;

    my ($best_xref, $best_xref_priority_idx) = split /\|/, $obj_to_best_xref{$key};

    # If transcript has a translation, use the best xref out of the transcript & translation
    if ($type =~ /Transcript/i) {
      my $transcript_id = $object_id;
      my $translation_id = $transcript_to_translation{$transcript_id};

      if ($translation_id) {
	my ($translation_xref, $translation_priority) = split /\|/, $obj_to_best_xref{"Translation|$translation_id"};
	my ($transcript_xref, $transcript_priority)   = split /\|/, $obj_to_best_xref{"Transcript|$transcript_id"};

	if ($translation_priority < $transcript_priority) {
	  $best_xref = $translation_xref;
	  $best_xref_priority_idx = $translation_priority;
	} else {
	  $best_xref = $transcript_xref;
	  $best_xref_priority_idx = $transcript_priority;
	}

      }
    }

    if ($best_xref) {

      # Write record with xref_id_offset
      print TRANSCRIPT_DX "UPDATE transcript SET display_xref_id=" . ($best_xref+$xref_id_offset) . " WHERE transcript_id=" . $object_id . ";\n";
      print "wrote " . $best_xref . " (plus offset) for 95625\n" if ($object_id eq 95625);
      print TRANSCRIPT_DX_TXT ($best_xref+$xref_id_offset) . "\t" . $object_id . "\n";
      $n++;

      my $value = ($best_xref+$xref_id_offset) . "|" . $best_xref_priority_idx;
      $transcript_display_xrefs{$object_id} = $value;

    }

  }

  close(TRANSCRIPT_DX);
  close(TRANSCRIPT_DX_TXT);

  print "Wrote $n transcript display_xref entries to transcript_display_xref.sql\n";

  return \%transcript_display_xrefs;

}


# Assign display_xrefs to genes based on transcripts
# Gene gets the display xref of the highest priority of all of its transcripts
# If more than one transcript with the same priority, longer transcript is used

sub build_gene_display_xrefs_and_descriptions {

  my ($self, $transcript_display_xrefs) = @_;

  my $dir = $self->dir();

  my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-species => $self->species(),
					      -dbname  => $self->dbname(),
					      -host    => $self->host(),
					      -port    => $self->port(),
					      -pass    => $self->password(),
					      -user    => $self->user(),
					      -group   => 'core');
  my $ta = $db->get_TranscriptAdaptor();

  print "Building gene display_xrefs\n";
  print "Getting transcripts for all genes\n";

  my $sql = "SELECT gene_id, transcript_id FROM transcript";
  my $sth = $self->dbi()->prepare($sql);
  $sth->execute();

  my ($gene_id, $transcript_id);
  $sth->bind_columns(\$gene_id, \$transcript_id);

  # Note %genes_to_transcripts is global
  while ($sth->fetch()) {
    push @{$genes_to_transcripts{$gene_id}}, $transcript_id;
  }

  print "Got " . scalar keys(%genes_to_transcripts) . " genes\n";

  print "Assigning display_xrefs to genes\n";

  open (GENE_DX, ">$dir/gene_display_xref.sql");
  open (GENE_DX_TXT, ">$dir/gene_display_xref.txt");
  my $hit = 0;
  my $miss = 0;
  my $trans_no_xref = 0;
  my $trans_xref = 0;
  foreach my $gene_id (keys %genes_to_transcripts) {

    my @transcripts = @{$genes_to_transcripts{$gene_id}};

    my $best_xref;
    my $best_xref_priority_idx = 99999;
    my $best_transcript_length = -1;
    foreach my $transcript_id (@transcripts) {
      if (!$transcript_display_xrefs->{$transcript_id}) {
	$trans_no_xref++;
	next;
      } else {
	$trans_xref++;
      }
      my ($xref_id, $priority) = split (/\|/, $transcript_display_xrefs->{$transcript_id});
      #print "gene $gene_id orig:" . $transcript_display_xrefs->{$transcript_id} . " xref id: " . $xref_id . " pri " . $priority . "\n";
      # 2 separate if clauses to avoid having to fetch transcripts unnecessarily

      if (($priority lt $best_xref_priority_idx)) {

	$best_xref_priority_idx = $priority;
	$best_xref = $xref_id;

      } elsif ($priority eq $best_xref_priority_idx) {

	# compare transcript lengths and use longest
	my $transcript = $ta->fetch_by_dbID($transcript_id);
	my $transcript_length = $transcript->length();
	if ($transcript_length > $best_transcript_length) {
	  $best_transcript_length = $transcript_length;
	  $best_xref_priority_idx = $priority;
	  $best_xref = $xref_id;
	}
      }
    }

    if ($best_xref) {
      # Write record
      print GENE_DX "UPDATE gene SET display_xref_id=" . $best_xref . " WHERE gene_id=" . $gene_id . ";\n";
      print GENE_DX_TXT $best_xref . "\t" . $gene_id ."\n";
      $hit++;
    } else {
      $miss++;
    }

  }

  close (GENE_DX);
  close (GENE_DX_TXT);
  print "Transcripts with no xrefs: $trans_no_xref with xrefs: $trans_xref\n";
  print "Wrote $hit gene display_xref entries to gene_display_xref.sql\n";
  print "Couldn't find display_xrefs for $miss genes\n" if ($miss > 0);
  print "Found display_xrefs for all genes\n" if ($miss eq 0);

  # now build gene descriptions
  $self->build_gene_descriptions(\%genes_to_transcripts);

}

# Display xref sources to be used for transcripts *in order of priority*
# Source names used must be identical to those in the source table.

sub transcript_display_xref_sources {

  return ('HUGO',
	  'MarkerSymbol',
	  'wormbase_transcript',
	  'flybase_symbol',
	  'Anopheles_symbol',
	  'Genoscope_annotated_gene',
	  'Genoscope_predicted_transcript',
	  'Genoscope_predicted_gene',
	  'Uniprot/SWISSPROT',
	  'RefSeq',
	  'Uniprot/SPTREMBL',
	  'LocusLink');

}


# Find the index of an item in a list(ref), or -1 if it's not in the list.
# Only look for exact matches (case insensitive)

sub find_in_list {

  my ($item, @list) = @_;

  for (my $i = 0; $i < scalar(@list); $i++) {
    if (lc($list[$i]) eq lc($item)) {
      return $i;
    }
  }

  return -1;

}

# Take a string and a list of regular expressions
# Find the index of the highest matching regular expression
# Return the index, or -1 if not found.

sub find_match {

 my ($str, @list) = @_;

 my $str2 = $str;
 my $highest_index = -1;

  for (my $i = 0; $i < scalar(@list); $i++) {
    my $re = $list[$i];
    if ($str2 =~ /$re/i) {
      $highest_index = $i;
    }
  }

  return $highest_index;

}

# Build a map of source id (in xref database) to external_db (in core database)

sub map_source_to_external_db {

  my $self = shift;

  my %source_to_external_db;

  # get all sources
  my $sth = $self->xref->dbi()->prepare("SELECT source_id, name FROM source");
  $sth->execute();
  my ($source_id, $source_name);
  $sth->bind_columns(\$source_id, \$source_name);

  while($sth->fetchrow_array()) {

    # find appropriate external_db_id for each one
    my $sql = "SELECT external_db_id FROM external_db WHERE db_name=?";
    my $core_sth = $self->dbi()->prepare($sql);
    $core_sth->execute($source_name);

    my @row = $core_sth->fetchrow_array();

    if (@row) {

      $source_to_external_db{$source_id} = $row[0];
      #print "Source name $source_name id $source_id corresponds to core external_db_id " . $row[0] . "\n";

    } else {

      print STDERR "Can't find external_db entry for source name $source_name; xrefs for this source will not be written. Consider adding $source_name to external_db\n"

    }

  } # while source

  return %source_to_external_db;
}

# Upload .txt files and execute .sql files.

sub do_upload {

  my ($self, $deleteexisting) = @_;

  # xref.txt etc

  # TODO warn if table not empty

  foreach my $table ("xref", "object_xref", "identity_xref", "external_synonym", "gene_description") {

    my $file = $self->dir() . "/" . $table . ".txt";
    my $sth;

    if ($deleteexisting) {

      $sth = $self->dbi()->prepare("DELETE FROM $table");
      print "Deleting existing data in $table\n";
      $sth->execute();

    }

    # don't seem to be able to use prepared statements here
    $sth = $self->dbi()->prepare("LOAD DATA INFILE \'$file\' INTO TABLE $table");
    print "Uploading data in $file to $table\n";
    $sth->execute();

  }

  # gene_display_xref.sql etc
  foreach my $table ("gene", "transcript") {

    my $file = $self->dir() . "/" . $table . "_display_xref.txt";
    my $sth;

    if ($deleteexisting) {

      $sth = $self->dbi()->prepare("UPDATE $table SET display_xref_id=NULL");
      print "Setting all existing display_xref_id in $table to null\n";
      $sth->execute();

    }

    print "Setting $table display_xrefs from $file\n";
    # TODO this better
    #my $str = "mysql -u " .$self->user() ." -p" . $self->password() . " -h " . $self->host() ." -P " . $self->port() . " " .$self->dbname() . " < $file";
    #system $str;

    $sth = $self->dbi()->prepare("UPDATE $table SET display_xref_id=? WHERE ${table}_id=?");
    open(DX_TXT, $file);
    while (<DX_TXT>) {
      my ($xref_id, $object_id) = split;
      $sth->execute($xref_id, $object_id);
    }

    close(DX_TXT);
  }

}

# Assign gene descriptions
# Algorithm:
# foreach gene
#   get all transcripts & translations
#   get all associated xrefs
#   filter by regexp, discard blank ones
#   order by source & keyword
#   assign description of best xref to gene
# }
#
# One gene may have several associated peptides; the one to use is decided as follows.
# In decreasing order of precedence:
#
# - Consortium xref, e.g. ZFIN for zebrafish
#
# - UniProt/SWISSPROT
#     If there are several, the one with the best %query_id then %target_id is used
#
# - RefSeq
#    If there are several, the one with the best %query_id then %target_id is used
#
# - UniProt/SPTREMBL
#    If there are several, precedence is established on the basis of the occurrence of 
#    regular expression patterns in the description.

sub build_gene_descriptions {

  my ($self, $genes_to_transcripts) = @_;

  # TODO - don't call this from, but after, gene_display_xref

  # Get all xref descriptions, filtered by regexp.
  # Discard any that are blank (i.e. regexp has removed everything)

  print "Getting & filtering xref descriptions\n";
 # Note %xref_descriptions & %xref_accessions are global

  my $sth = $self->xref->dbi()->prepare("SELECT xref_id, accession, description FROM xref");
  $sth->execute();
  my ($xref_id, $accession, $description);
  $sth->bind_columns(\$xref_id, \$accession, \$description);

  my $removed = 0;
  my @regexps = $self->gene_description_filter_regexps();
  while ($sth->fetch()) {
    if ($description) {
      $description = filter_by_regexp($description, \@regexps);
      if ($description ne "") {
	$xref_descriptions{$xref_id} = $description;
	$xref_accessions{$xref_id} = $accession;
      } else {
	$removed++;
      }
    }
  }

  print "Regexp filtering (" . scalar(@regexps) . " regexps) removed $removed descriptions, left with " . scalar(keys %xref_descriptions) . "\n";

  my $dir = $self->dir();
  open(GENE_DESCRIPTIONS,">$dir/gene_description.txt") || die "Could not open $dir/gene_description.txt";

  # Foreach gene, get any xrefs associated with its transcripts or translations

  print "Assigning gene descriptions\n";


  foreach my $gene_id (keys %genes_to_transcripts) {

    my @gene_xrefs;

    my %local_xref_to_object;

    my @transcripts = @{$genes_to_transcripts{$gene_id}};
    foreach my $transcript (@transcripts) {

      my @xref_ids;

      my $key = "Transcript|$transcript";
      if ($object_xref_mappings{$key}) {

	@xref_ids = @{$object_xref_mappings{$key}};
	push @gene_xrefs, @xref_ids;
	foreach my $xref (@xref_ids) {
	  $local_xref_to_object{$xref} = $key;
	}
	
      }

      my $translation = $transcript_to_translation{$transcript};
      $key = "Translation|$translation";
      if ($object_xref_mappings{$key}) {

	push @gene_xrefs, @{$object_xref_mappings{$key}} ;
	foreach my $xref (@xref_ids) {
	  $local_xref_to_object{$xref} = $key;
	}
      }
      
    }

    # Now sort through these and find the "best" description and write it

    if (@gene_xrefs) {

      @gene_xrefs = sort {compare_xref_descriptions($self->consortium(), $gene_id, \%local_xref_to_object)} @gene_xrefs;

      my $best_xref = $gene_xrefs[-1];
      my $description = $xref_descriptions{$best_xref};
      my $source = $xref_to_source{$best_xref};
      my $acc = $xref_accessions{$best_xref};

      print GENE_DESCRIPTIONS "$gene_id\t$description" . " [Source:$source;Acc:$acc]\n" if ($description);

    }

  } # foreach gene

  close(GENE_DESCRIPTIONS);

}

# remove a list of patterns from a string
sub filter_by_regexp {

  my ($str, $regexps) = @_;

  foreach my $regexp (@$regexps) {
    $str =~ s/$regexp//ig;
  }

  return $str;

}

# Regexp used for filter out useless text from gene descriptions
# Method can be overridden in species-specific modules
sub gene_description_filter_regexps {

  return ();

}


# The "consortium" source for this species, should be the same as in
# source table

sub consortium {

  return "xxx"; # Default to something that won't be matched as a source

}

# Sort a list of xrefs by the priority of their sources
# Assumed this function is called by Perl sort, passed with parameter
# See comment for build_gene_descriptions for how precedence is decided.

sub compare_xref_descriptions {

  my ($consortium, $gene_id, $xref_to_object) = @_;

  my @sources = ("Uniprot/SPTREMBL", "RefSeq_dna", "RefSeq_peptide", "Uniprot/SWISSPROT", $consortium);
  my @words = qw(unknown hypothetical putative novel probable [0-9]{3} kDa fragment cdna protein);

  my $src_a = $xref_to_source{$a};
  my $src_b = $xref_to_source{$b};
  my $pos_a = find_in_list($src_a, @sources);
  my $pos_b = find_in_list($src_b, @sources);

  # If same source, need to do more work
  if ($pos_a == $pos_b) {

   if ($src_a eq "Uniprot/SWISSPROT" || $src_a =~ /RefSeq/) {

     # Compare on query identities, then target identities if queries are the same
     my $key_a = $xref_to_object->{$a}; # e.g. "Translation|1234"
     my $key_b = $xref_to_object->{$b};
     my ($type_a, $object_a) = split(/\|/, $key_a);
     my ($type_b, $object_b) = split(/\|/, $key_b);

     return 0 if ($type_a != $type_b); # only compare like with like

     my $query_identity_a = $object_xref_identities{$object_a}->{$a}->{"query_identity"};
     my $query_identity_b = $object_xref_identities{$object_b}->{$b}->{"query_identity"};

     return ($query_identity_a <=> $query_identity_b) if ($query_identity_a != $query_identity_b);

     my $target_identity_a = $object_xref_identities{$object_a}->{$a}->{"target_identity"};
     my $target_identity_b = $object_xref_identities{$object_b}->{$b}->{"target_identity"};

     return ($target_identity_a <=> $target_identity_b);

   } elsif ($src_a eq "Uniprot/SPTREMBL") {

     # Compare on words
     my $wrd_idx_a = find_match($xref_descriptions{$a}, @words);
     my $wrd_idx_b = find_match($xref_descriptions{$b}, @words);
     return $wrd_idx_a <=> $wrd_idx_b;

   } else {

     return 0;

   }
    return 0;

  } else {

    return $pos_a <=> $pos_b;

  }
}


1;