Skip to content
Snippets Groups Projects
BasicMapper.pm 34.5 KiB
Newer Older
    $source_id_str = "IN (" . join(',', @source_id_array). ")";
  } else {
    $source_id_str = "= " . $source_id_array[0];
  }

  # get source names; 
 #print STDERR $source_sql."\n";
  my $source_sql = "SELECT name, release, source_id FROM source WHERE source_id $source_id_str";
  my $source_sth = $xref_dbi->prepare($source_sql);
Glenn Proctor's avatar
Glenn Proctor committed
  #print STDERR $source_sql."\n";
  my ($source_name, $release, $source_id);
  $source_sth->bind_columns(\$source_name, \$release, \$source_id);
  while ($source_sth->fetch()) {
    print EXTERNAL_DB "$edb_id\t$source_name\t$release\tXREF\n";
    # TODO knownxref etc??
    $edb_id++;
  }

  close(EXTERNAL_DB);

  print "Before calling display_xref, object_xref_mappings size " . scalar (keys %{$object_xref_mappings}) . "\n";

  # calculate display_xref_ids for transcripts and genes
  my $transcript_display_xrefs = $self->build_transcript_display_xrefs($object_xref_mappings, $xref_id_offset);

  $self->build_gene_display_xrefs($transcript_display_xrefs);
Glenn Proctor's avatar
Glenn Proctor committed
# produce output for comparison with existing ensembl mappings
# format is (with header)
# xref_accession ensembl_type ensembl_id

sub dump_comparison {

  print "Dumping comparison data\n";

  open (COMPARISON, ">comparison/xref_mappings.txt");

  # get the xref accession for each xref as the xref_ids are ephemeral

  # first read all the xrefs that were dumped and get an xref_id->accession map
  my %xref_id_to_accesson;
  open (XREF, "xref.txt");
  print XREF "xref_accession" . "\t" . "ensembl_type" . "\t" . "ensembl_id\n";
Glenn Proctor's avatar
Glenn Proctor committed
  while (<XREF>) {
    my ($xref_id,$accession,$label,$description) = split;
    $xref_id_to_accesson{$xref_id} = $accession;
  }
  close (XREF);

  open (OBJECT_XREF, "object_xref.txt");
  while (<OBJECT_XREF>) {
    my ($object_xref_id,$object_id,$type,$xref_id) = split;
    print COMPARISON $xref_id_to_accesson{$xref_id} . "\t" . $type . "\t" . $object_id . "\n";
  }

Glenn Proctor's avatar
Glenn Proctor committed
  close (OBJECT_XREF);
  close (COMPARISON);
sub build_transcript_display_xrefs {

  my ($self, $object_xref_mappings, $xref_id_offset) = @_;

  # get a list of xref sources; format:
  # key: xref_id value: source_name
  # lots of these; if memory is a problem, just get the source ID (not the name)
  # and look it up elsewhere
  print "Building xref->source mapping table\n";
  my %xref_to_source;
  my $sql = "SELECT x.xref_id, s.name FROM source s, xref x WHERE x.source_id=s.source_id";
  my $sth = $self->xref->dbi()->prepare($sql);
  $sth->execute();

  my ($xref_id, $source_name);
  $sth->bind_columns(\$xref_id, \$source_name);

  while ($sth->fetch()) {
    $xref_to_source{$xref_id} = $source_name;
  }

  print "Got " . scalar(keys %xref_to_source) . " xref-source mappings\n";

  # Cache the list of translation->transcript mappings
  print "Building translation to transcript mappings\n";
  my %translation_to_transcript;
  my $sth = $self->dbi()->prepare("SELECT translation_id, transcript_id FROM translation");
  $sth->execute();

  my ($translation_id, $transcript_id);
  $sth->bind_columns(\$translation_id, \$transcript_id);

  while ($sth->fetch()) {
    $translation_to_transcript{$translation_id} = $transcript_id;
  }

  print "Building transcript display_xrefs\n";
  my @priorities = $self->transcript_display_xref_sources();

  open (TRANSCRIPT_DX, ">transcript_display_xref.sql");

  my $n = 0;
  # go through each object/xref mapping
  # store the best ones as we go along
  # hash keyed on transcript id, value is xref_id:source prioirity index
  # xref is stored with offset added
  # Note xrefs to translations are also considered; transcript ID is always stored
  my %transcript_display_xrefs;
  foreach my $key (keys %{$object_xref_mappings}) {

    my ($type, $obj) = split /:/, $key;

    next if ($type !~ /(Transcript|Translation)/i);

    # if a transcript has more than one associated xref,
    # use the one with the highest priority, i.e. lower list position in @priorities
    my @xrefs = @{$object_xref_mappings->{$key}};
    my ($best_xref, $best_xref_priority_idx);
    $best_xref_priority_idx = 99999;
    foreach my $xref (@xrefs) {
      my $source = $xref_to_source{$xref};
      if ($source) {
	my $i = find_in_list($source, @priorities);
	if ($i > -1 && $i < $best_xref_priority_idx) {
	  $best_xref = $xref;
	  $best_xref_priority_idx = $i;
	}
      } else {
	warn("Couldn't find a source for xref $xref \n");
      }
    }

    if (!$best_xref) {
      warn("Couldn't find a display xref for transcript id $obj\n");
    } else {

      # If transcript, store directly
      # If translation, lookup transcript id
      my $object_id;
      if ($type =~ /Transcript/i) {
	$object_id = $obj;
      } elsif ($type =~ /Translation/i) {
	$object_id = $translation_to_transcript{$obj};
      }
      # Write record with xref_id_offset
      print TRANSCRIPT_DX "UPDATE transcript SET display_xref_id=" . ($best_xref+$xref_id_offset) . " WHERE transcript_id=" . $object_id . ";\n";
      my $value = ($best_xref+$xref_id_offset) . ":" . $best_xref_priority_idx;
      $transcript_display_xrefs{$object_id} = $value;
  }

  close(TRANSCRIPT_DX);

  print "Wrote $n transcript display_xref entries to transcript_display_xref.sql\n";

  return \%transcript_display_xrefs;

}


# Assign display_xrefs to genes based on transcripts
# Gene gets the display xref of the highest priority of all of its transcripts
# If more than one transcript with the same priority, longer transcript is used

sub build_gene_display_xrefs {

  my ($self, $transcript_display_xrefs) = @_;

  my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-species => $self->species(),
					      -dbname  => $self->dbname(),
					      -host    => $self->host(),
					      -port    => $self->port(),
					      -password => $self->password(),
					      -user     => $self->user(),
					      -group    => 'core');
  my $ta = $db->get_TranscriptAdaptor();

  print "Building gene display_xrefs\n";
  print "Getting transcripts for all genes\n";

  my $sql = "SELECT gene_id, transcript_id FROM transcript";
  my $sth = $self->dbi()->prepare($sql);
  $sth->execute();

  my ($gene_id, $transcript_id);
  $sth->bind_columns(\$gene_id, \$transcript_id);

  my %genes_to_transcripts;
  while ($sth->fetch()) {
    push @{$genes_to_transcripts{$gene_id}}, $transcript_id;
  }

  print "Got " . scalar keys(%genes_to_transcripts) . " genes\n";

  print "Assigning display_xrefs to genes\n";

  open (GENE_DX, ">gene_display_xref.sql");
  my $hit = 0;
  my $miss = 0;
  my $trans_no_xref = 0;
  my $trans_xref = 0;
  foreach my $gene_id (keys %genes_to_transcripts) {

    my @transcripts = @{$genes_to_transcripts{$gene_id}};

    my $best_xref;
    my $best_xref_priority_idx = 99999;
    my $best_transcript_length = -1;
    foreach my $transcript_id (@transcripts) {
      if (!$transcript_display_xrefs->{$transcript_id}) {
	$trans_no_xref++;
	next;
      } else {
	$trans_xref++;
      }
      my ($xref_id, $priority) = split (/:/, $transcript_display_xrefs->{$transcript_id});
      #print "gene $gene_id orig:" . $transcript_display_xrefs->{$transcript_id} . " xref id: " . $xref_id . " pri " . $priority . "\n";
      my $transcript = $ta->fetch_by_dbID($transcript_id);
      my $transcript_length = $transcript->length();
      if (($priority lt $best_xref_priority_idx) ||
	  ($priority eq $best_xref_priority_idx && $transcript_length gt $best_transcript_length)) {
	$best_transcript_length = $transcript_length;
	$best_xref_priority_idx = $priority;
	$best_xref = $xref_id;
      }
    }

    if (!$best_xref) {
      #XXXwarn("Couldn't find a display xref for gene id $gene_id\n");
      $miss++;
      # Write record 
      print GENE_DX "UPDATE gene SET display_xref_id=" . $best_xref . " WHERE gene_id=" . $gene_id . ";\n";
      $hit++;
    }

  }

  close (GENE_DX);
  print "Transcripts with no xrefs: $trans_no_xref with xrefs: $trans_xref\n";
  print "Wrote $hit gene display_xref entries to gene_display_xref.sql\n";
  print "Couldn't find display_xrefs for $miss genes\n";

}

# Display xref sources to be used for transcripts *in order of priority*
# Source names used must be identical to those in the source table.

sub transcript_display_xref_sources {

  return ('HUGO',
	  'MarkerSymbol',
	  'wormbase_transcript',
	  'flybase_symbol',
	  'Anopheles_symbol',
	  'Genoscope_annotated_gene',
	  'Genoscope_predicted_transcript',
	  'Genoscope_predicted_gene',
	  'Uniprot/SWISSPROT',
	  'RefSeq',
	  'Uniprot/SPTREMBL',
	  'LocusLink');

}


# Find the index of an item in a list(ref), or -1 if it's not in the list

sub find_in_list {

  my ($item, @list) = @_;

  for (my $i = 0; $i < scalar(@list); $i++) {
    if ($list[$i] =~ /$item/) {
      return $i;
    }
  }

  return -1;

}

Ian Longden's avatar
Ian Longden committed
1;