Newer
Older
$source_id_str = "IN (" . join(',', @source_id_array). ")";
} else {
$source_id_str = "= " . $source_id_array[0];
}
#print STDERR $source_sql."\n";
my $source_sql = "SELECT name, release, source_id FROM source WHERE source_id $source_id_str";
my $source_sth = $xref_dbi->prepare($source_sql);
$source_sth->execute();
my ($source_name, $release, $source_id);
$source_sth->bind_columns(\$source_name, \$release, \$source_id);
while ($source_sth->fetch()) {
print EXTERNAL_DB "$edb_id\t$source_name\t$release\tXREF\n";
# TODO knownxref etc??
$edb_id++;
}
close(EXTERNAL_DB);
print "Before calling display_xref, object_xref_mappings size " . scalar (keys %{$object_xref_mappings}) . "\n";
# calculate display_xref_ids for transcripts and genes
my $transcript_display_xrefs = $self->build_transcript_display_xrefs($object_xref_mappings, $xref_id_offset);
$self->build_gene_display_xrefs($transcript_display_xrefs);
# produce output for comparison with existing ensembl mappings
# format is (with header)
# xref_accession ensembl_type ensembl_id
sub dump_comparison {
print "Dumping comparison data\n";
open (COMPARISON, ">comparison/xref_mappings.txt");
# get the xref accession for each xref as the xref_ids are ephemeral
# first read all the xrefs that were dumped and get an xref_id->accession map
my %xref_id_to_accesson;
open (XREF, "xref.txt");
print XREF "xref_accession" . "\t" . "ensembl_type" . "\t" . "ensembl_id\n";
while (<XREF>) {
my ($xref_id,$accession,$label,$description) = split;
$xref_id_to_accesson{$xref_id} = $accession;
}
close (XREF);
open (OBJECT_XREF, "object_xref.txt");
while (<OBJECT_XREF>) {
my ($object_xref_id,$object_id,$type,$xref_id) = split;
print COMPARISON $xref_id_to_accesson{$xref_id} . "\t" . $type . "\t" . $object_id . "\n";
}
sub build_transcript_display_xrefs {
my ($self, $object_xref_mappings, $xref_id_offset) = @_;
# get a list of xref sources; format:
# key: xref_id value: source_name
# lots of these; if memory is a problem, just get the source ID (not the name)
# and look it up elsewhere
print "Building xref->source mapping table\n";
my %xref_to_source;
my $sql = "SELECT x.xref_id, s.name FROM source s, xref x WHERE x.source_id=s.source_id";
my $sth = $self->xref->dbi()->prepare($sql);
$sth->execute();
my ($xref_id, $source_name);
$sth->bind_columns(\$xref_id, \$source_name);
$xref_to_source{$xref_id} = $source_name;
}
print "Got " . scalar(keys %xref_to_source) . " xref-source mappings\n";
# Cache the list of translation->transcript mappings
print "Building translation to transcript mappings\n";
my %translation_to_transcript;
my $sth = $self->dbi()->prepare("SELECT translation_id, transcript_id FROM translation");
$sth->execute();
my ($translation_id, $transcript_id);
$sth->bind_columns(\$translation_id, \$transcript_id);
$translation_to_transcript{$translation_id} = $transcript_id;
}
print "Building transcript display_xrefs\n";
my @priorities = $self->transcript_display_xref_sources();
open (TRANSCRIPT_DX, ">transcript_display_xref.sql");
my $n = 0;
# go through each object/xref mapping
# store the best ones as we go along
# hash keyed on transcript id, value is xref_id:source prioirity index
# xref is stored with offset added
# Note xrefs to translations are also considered; transcript ID is always stored
my %transcript_display_xrefs;
foreach my $key (keys %{$object_xref_mappings}) {
my ($type, $obj) = split /:/, $key;
next if ($type !~ /(Transcript|Translation)/i);
# if a transcript has more than one associated xref,
# use the one with the highest priority, i.e. lower list position in @priorities
my @xrefs = @{$object_xref_mappings->{$key}};
my ($best_xref, $best_xref_priority_idx);
$best_xref_priority_idx = 99999;
foreach my $xref (@xrefs) {
my $source = $xref_to_source{$xref};
if ($source) {
my $i = find_in_list($source, @priorities);
if ($i > -1 && $i < $best_xref_priority_idx) {
$best_xref_priority_idx = $i;
}
} else {
warn("Couldn't find a source for xref $xref \n");
}
}
if (!$best_xref) {
warn("Couldn't find a display xref for transcript id $obj\n");
} else {
# If transcript, store directly
# If translation, lookup transcript id
my $object_id;
if ($type =~ /Transcript/i) {
$object_id = $obj;
} elsif ($type =~ /Translation/i) {
$object_id = $translation_to_transcript{$obj};
}
print TRANSCRIPT_DX "UPDATE transcript SET display_xref_id=" . ($best_xref+$xref_id_offset) . " WHERE transcript_id=" . $object_id . ";\n";
my $value = ($best_xref+$xref_id_offset) . ":" . $best_xref_priority_idx;
$transcript_display_xrefs{$object_id} = $value;
}
close(TRANSCRIPT_DX);
print "Wrote $n transcript display_xref entries to transcript_display_xref.sql\n";
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
return \%transcript_display_xrefs;
}
# Assign display_xrefs to genes based on transcripts
# Gene gets the display xref of the highest priority of all of its transcripts
# If more than one transcript with the same priority, longer transcript is used
sub build_gene_display_xrefs {
my ($self, $transcript_display_xrefs) = @_;
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-species => $self->species(),
-dbname => $self->dbname(),
-host => $self->host(),
-port => $self->port(),
-password => $self->password(),
-user => $self->user(),
-group => 'core');
my $ta = $db->get_TranscriptAdaptor();
print "Building gene display_xrefs\n";
print "Getting transcripts for all genes\n";
my $sql = "SELECT gene_id, transcript_id FROM transcript";
my $sth = $self->dbi()->prepare($sql);
$sth->execute();
my ($gene_id, $transcript_id);
$sth->bind_columns(\$gene_id, \$transcript_id);
my %genes_to_transcripts;
push @{$genes_to_transcripts{$gene_id}}, $transcript_id;
}
print "Got " . scalar keys(%genes_to_transcripts) . " genes\n";
print "Assigning display_xrefs to genes\n";
open (GENE_DX, ">gene_display_xref.sql");
my $hit = 0;
my $miss = 0;
my $trans_no_xref = 0;
my $trans_xref = 0;
foreach my $gene_id (keys %genes_to_transcripts) {
my @transcripts = @{$genes_to_transcripts{$gene_id}};
my $best_xref;
my $best_xref_priority_idx = 99999;
my $best_transcript_length = -1;
foreach my $transcript_id (@transcripts) {
if (!$transcript_display_xrefs->{$transcript_id}) {
$trans_no_xref++;
next;
} else {
$trans_xref++;
}
my ($xref_id, $priority) = split (/:/, $transcript_display_xrefs->{$transcript_id});
#print "gene $gene_id orig:" . $transcript_display_xrefs->{$transcript_id} . " xref id: " . $xref_id . " pri " . $priority . "\n";
my $transcript = $ta->fetch_by_dbID($transcript_id);
my $transcript_length = $transcript->length();
if (($priority lt $best_xref_priority_idx) ||
($priority eq $best_xref_priority_idx && $transcript_length gt $best_transcript_length)) {
$best_transcript_length = $transcript_length;
$best_xref_priority_idx = $priority;
$best_xref = $xref_id;
}
}
if (!$best_xref) {
#XXXwarn("Couldn't find a display xref for gene id $gene_id\n");
$miss++;
} else{
# Write record
print GENE_DX "UPDATE gene SET display_xref_id=" . $best_xref . " WHERE gene_id=" . $gene_id . ";\n";
$hit++;
}
}
close (GENE_DX);
print "Transcripts with no xrefs: $trans_no_xref with xrefs: $trans_xref\n";
print "Wrote $hit gene display_xref entries to gene_display_xref.sql\n";
print "Couldn't find display_xrefs for $miss genes\n";
}
# Display xref sources to be used for transcripts *in order of priority*
# Source names used must be identical to those in the source table.
sub transcript_display_xref_sources {
return ('HUGO',
'MarkerSymbol',
'wormbase_transcript',
'flybase_symbol',
'Anopheles_symbol',
'Genoscope_annotated_gene',
'Genoscope_predicted_transcript',
'Genoscope_predicted_gene',
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
'LocusLink');
}
# Find the index of an item in a list(ref), or -1 if it's not in the list
sub find_in_list {
my ($item, @list) = @_;
for (my $i = 0; $i < scalar(@list); $i++) {
if ($list[$i] =~ /$item/) {
return $i;
}
}
return -1;
}