Newer
Older
else{
$transcript .= "$id,";
}
}
else{
die "Unknown type *".$type."*\n".$temp."\n";
}
}
if($transcript_count){
chop $transcript; # remove last , from list
$self->get_stable_ids("transcript",$transcript,\%transcript_2_stable);
}
if($translation_count){
chop $translation;
$self->get_stable_ids("translation",$translation,\%translation_2_stable);
}
open (XREF, ">>" . $self->core->dir() . "/xref.txt");
foreach my $source ("%RefSeq%","UniProt%"){
my $sql = "select x.xref_id, x.accession, x.version, x.label, x.description, x.source_id, x.species_id from xref x, source s where s.name like '".$source."' and x.source_id = s.source_id";
my $sth = $self->xref->dbc->prepare($sql);
$sth->execute();
my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id);
$sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id);
while($sth->fetch()){
if (!$xrefs_written{$xref_id}) {
$xrefs_written{$xref_id} = 1;
my $external_db_id = $source_to_external_db{$source_id};
if(!defined($updated_source{$external_db_id})){
$self->cleanup_sources_file($external_db_id);
}
print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
if(defined($failed_xref_mappings{$xref_id})){
my ($ensembl_id,$type,$q_perc,$t_perc,$q_cut,$t_cut) = split(/\|/,$failed_xref_mappings{$xref_id});
print XREF_MISSED ($xref_id+$xref_id_offset) . "\thighest match is $accession (".$q_perc."%) to ";
if($type =~ /Translation/){
print XREF_MISSED $translation_2_stable{$ensembl_id};
}
elsif($type =~ /Transcript/){
print XREF_MISSED $transcript_2_stable{$ensembl_id};
}
else{
die "type=*".$type."*\n".$failed_xref_mappings{$xref_id}."\n";
}
print XREF_MISSED " (".$t_perc."\%) which are below there respective cutoffs off $q_cut\% and $t_cut\%.\n";
}
else{
print XREF_MISSED ($xref_id+$xref_id_offset) . "\t$accession no match to Ensembl\n";
}
}
}
$sth->finish;
}
close(XREF);
close(XREF_MISSED);
}
# dump xrefs that don't appear in either the primary_xref or dependent_xref tables
# e.g. Interpro xrefs
sub dump_orphan_xrefs() {
my ($self, $xref_id_offset) = @_;
my $count;
Ian Longden
committed
open (XREF, ">>" . $self->core->dir() . "/xref.txt");
open (XREF_MISSED, ">>" . $self->core->dir() . "/xref_failed.txt");
# need a double left-join
my $sql = "SELECT x.xref_id, x.accession, x.version, x.label, x.description, x.source_id, x.species_id FROM xref x LEFT JOIN primary_xref px ON px.xref_id=x.xref_id LEFT JOIN dependent_xref dx ON dx.dependent_xref_id=x.xref_id WHERE px.xref_id IS NULL AND dx.dependent_xref_id IS NULL";
Ian Longden
committed
my $sth = $self->xref->dbc->prepare($sql);
$sth->execute();
my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id);
$sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id);
while ($sth->fetch()) {
my $external_db_id = $source_to_external_db{$source_id};
if ($external_db_id) { # skip "unknown" sources
Glenn Proctor
committed
if (!$xrefs_written{$xref_id}) {
if(!defined($updated_source{$external_db_id})){
$self->cleanup_sources_file($external_db_id);
}
Glenn Proctor
committed
print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
Glenn Proctor
committed
$xrefs_written{$xref_id} = 1;
$count++;
}
}
}
$sth->finish();
close(XREF);
close(XREF_MISSED);
print "Wrote $count xrefs that are neither primary nor dependent\n";
}
# Dump direct xrefs. Need to do stable ID -> internal ID mapping.
sub dump_direct_xrefs {
my ($self, $xref_id_offset, $max_object_xref_id) = @_;
my $object_xref_id = $max_object_xref_id + 1;
print "Writing direct xrefs\n";
Glenn Proctor
committed
my $count = 0;
Ian Longden
committed
open (XREF, ">>" . $self->core->dir() . "/xref.txt");
open (OBJECT_XREF, ">>" . $self->core->dir() . "/object_xref.txt");
# Will need to look up translation stable ID from transcript stable ID, build hash table
print "Building transcript stable ID -> translation stable ID lookup table\n";
my %transcript_stable_id_to_translation_stable_id;
Ian Longden
committed
my $trans_sth = $self->core->dbc->prepare("SELECT tss.stable_id as transcript, tls.stable_id AS translation FROM translation tl, translation_stable_id tls, transcript_stable_id tss WHERE tss.transcript_id=tl.transcript_id AND tl.translation_id=tls.translation_id");
$trans_sth->execute();
my ($transcript_stable_id, $translation_stable_id);
$trans_sth->bind_columns(\$transcript_stable_id, \$translation_stable_id);
while ($trans_sth->fetch()) {
$transcript_stable_id_to_translation_stable_id{$transcript_stable_id} = $translation_stable_id;
}
$trans_sth->finish();
# Will need lookup tables for gene/transcript/translation stable ID to internal ID
my $stable_id_to_internal_id = $self->build_stable_id_to_internal_id_hash();
# SQL / statement handle for getting all direct xrefs
my $xref_sql = "SELECT dx.general_xref_id, dx.ensembl_stable_id, dx.type, dx.linkage_xref, x.accession, x.version, x.label, x.description, x.source_id, x.species_id FROM direct_xref dx, xref x WHERE dx.general_xref_id=x.xref_id";
Ian Longden
committed
my $xref_sth = $self->xref->dbc->prepare($xref_sql);
$xref_sth->execute();
my ($xref_id, $ensembl_stable_id, $type, $linkage_xref, $accession, $version, $label, $description, $source_id, $species_id);
$xref_sth->bind_columns(\$xref_id, \$ensembl_stable_id, \$type, \$linkage_xref,\ $accession, \$version, \$label, \$description, \$source_id, \$species_id);
while ($xref_sth->fetch()) {
my $external_db_id = $source_to_external_db{$source_id};
if ($external_db_id) {
# In the case of CCDS xrefs, direct_xref is to transcript but we want
# the mapping in the core db to be to the *translation*
if ($source_id == get_source_id_from_source_name($self->xref(), "CCDS")) {
$type = 'translation';
my $tmp_esid = $ensembl_stable_id;
$ensembl_stable_id = $transcript_stable_id_to_translation_stable_id{$tmp_esid};
warn "Can't find translation for transcript $tmp_esid" if (!$ensembl_stable_id);
#print "CCDS: transcript $tmp_esid -> translation $ensembl_stable_id\n";
}
my $ensembl_internal_id = $stable_id_to_internal_id->{$type}->{$ensembl_stable_id};
Glenn Proctor
committed
if ($ensembl_internal_id) {
Glenn Proctor
committed
if (!$xrefs_written{$xref_id}) {
if(!defined($updated_source{$external_db_id})){
$self->cleanup_sources_file($external_db_id);
}
Glenn Proctor
committed
print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
$xrefs_written{$xref_id} = 1;
}
print OBJECT_XREF "$object_xref_id\t$ensembl_internal_id\t" . ucfirst($type) . "\t" . ($xref_id+$xref_id_offset) . "\n";
$object_xref_id++;
$count++;
} else {
Glenn Proctor
committed
# deal with UTR transcripts in Elegans and potentially others
# Need to link xrefs that are listed as linking to e.g. ZK829.4
# to each of ZK829.4.1, ZK829.4.2, ZK829.4.3
my $old_object_xref_id = $object_xref_id;
if ($source_id == get_source_id_from_source_name($self->xref(), "wormpep_id")) {
# search for matching stable IDs
my $pat = $ensembl_stable_id . '\..+';
foreach my $stable_id (keys %{$stable_id_to_internal_id->{$type}}) {
if ($stable_id =~ /$pat/) {
if (!$xrefs_written{$xref_id}) {
if(!defined($updated_source{$external_db_id})){
$self->cleanup_sources_file($external_db_id);
}
Glenn Proctor
committed
print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
$xrefs_written{$xref_id} = 1;
}
$ensembl_internal_id = $stable_id_to_internal_id->{$type}->{$stable_id};
print OBJECT_XREF "$object_xref_id\t$ensembl_internal_id\t" . ucfirst($type) . "\t" . ($xref_id+$xref_id_offset) . "\n";
$object_xref_id++;
}
} # foreach stable_id
} # if source_id
# if we haven't changed $object_xref_id, nothing was written
print STDERR "Can't find $type corresponding to stable ID $ensembl_stable_id in ${type}_stable_id, not writing record for xref $accession\n" if ($object_xref_id == $old_object_xref_id);
Glenn Proctor
committed
}
}
close(OBJECT_XREF);
close(XREF);
$xref_sth->finish();
print "Wrote $count direct xrefs\n";
}
# Dump the interpro table from the xref database
sub dump_interpro {
my $self = shift;
Ian Longden
committed
open (INTERPRO, ">" . $self->core->dir() . "/interpro.txt");
Ian Longden
committed
my $sth = $self->xref->dbc->prepare("SELECT * FROM interpro");
$sth->execute();
my ($interpro, $pfam);
$sth->bind_columns(\$interpro, \$pfam);
while ($sth->fetch()) {
print INTERPRO $interpro . "\t" . $pfam . "\n";
}
$sth->finish();
close (INTERPRO);
}
sub build_stable_id_to_internal_id_hash {
Glenn Proctor
committed
my ($self) = @_;
my %stable_id_to_internal_id;
foreach my $type ('gene', 'transcript', 'translation') { # Add exon here if required
print "Caching stable ID -> internal ID links for ${type}s\n";
my $core_sql = "SELECT ${type}_id, stable_id FROM ${type}_stable_id" ;
Ian Longden
committed
my $sth = $self->core->dbc->prepare($core_sql);
$sth->execute();
my ($internal_id, $stable_id);
$sth->bind_columns(\$internal_id, \$stable_id);
while ($sth->fetch) {
$stable_id_to_internal_id{$type}{$stable_id} = $internal_id;
}
}
return \%stable_id_to_internal_id;
}
sub get_ensembl_object_type {
my $filename = shift;
my $type;
$filename = basename($filename);
if ($filename =~ /_dna_/i) {
$type = "Transcript";
} elsif ($filename =~ /_peptide_/i) {
$type = "Translation";
} else {
print STDERR "Cannot deduce Ensembl object type from filename $filename\n";
}
return $type;
}
sub get_method {
my $filename = shift;
$filename = basename($filename);
my ($method) = $filename =~ /^(.*)_(dna|peptide)_\d+\.map/;
return $method;
}
sub get_analysis_id {
my ($self, $ensembl_type) = @_;
my %typeToLogicName = ( 'transcript' => 'XrefExonerateDNA',
'translation' => 'XrefExonerateProtein' );
my $logic_name = $typeToLogicName{lc($ensembl_type)};
Ian Longden
committed
my $sth = $self->core->dbc->prepare("SELECT analysis_id FROM analysis WHERE logic_name='" . $logic_name ."'");
$sth->execute();
my $analysis_id;
if (my @row = $sth->fetchrow_array()) {
$analysis_id = $row[0];
# print "Found exising analysis ID ($analysis_id) for $logic_name\n";
} else {
print "No analysis with logic_name $logic_name found, creating ...\n";
Ian Longden
committed
$sth = $self->core->dbc->prepare("INSERT INTO analysis (logic_name, created) VALUES ('" . $logic_name. "', NOW())");
# TODO - other fields in analysis table
$sth->execute();
$analysis_id = $sth->{'mysql_insertid'};
print "Done (analysis ID=" . $analysis_id. ")\n";
}
$sth->finish();
return $analysis_id;
}
Glenn Proctor
committed
sub dump_core_xrefs {
Glenn Proctor
committed
my ($self, $xref_ids_hashref, $start_object_xref_id, $xref_id_offset, $object_xref_id_offset, $ensembl_object_types_hashref) = @_;
my @xref_ids = keys %$xref_ids_hashref;
my %xref_to_objects = %$xref_ids_hashref;
my %ensembl_object_types = %$ensembl_object_types_hashref;
Ian Longden
committed
my $dir = $self->core->dir();
open (XREF, ">$dir/xref.txt");
open (OBJECT_XREF, ">>$dir/object_xref.txt");
open (EXTERNAL_SYNONYM, ">$dir/external_synonym.txt");
Glenn Proctor
committed
open (GO_XREF, ">$dir/go_xref.txt");
Glenn Proctor
committed
# Cache synonyms for later use
# Do one big query to get a list of all the synonyms; note each xref may have
# more than one synonym so they are stored in a hash of lists
my $syn_count;
my %synonyms;
my $syn_sth = $self->xref->dbc->prepare("SELECT xref_id, synonym FROM synonym");
$syn_sth->execute();
my ($sxref_id, $synonym);
$syn_sth->bind_columns(\$sxref_id, \$synonym);
while ($syn_sth->fetch()) {
push @{$synonyms{$sxref_id}}, $synonym;
}
# keep a unique list of source IDs to build the external_db table later
my %source_ids;
my $object_xref_id = $start_object_xref_id;
# build cache of source id -> external_db id; note %source_to_external_db is global
%source_to_external_db = $self->map_source_to_external_db();
Glenn Proctor
committed
# execute several queries with a max of 200 entries in each IN clause - more efficient
my $batch_size = 200;
Glenn Proctor
committed
# keep track of what xref_id & object_xref_ids have been written to prevent
# duplicates; e.g. several dependent xrefs may be dependent on the same master xref.
Glenn Proctor
committed
# Note %xrefs_written and %object_xrefs_written are global
while(@xref_ids) {
my @ids;
if($#xref_ids > $batch_size) {
@ids = splice(@xref_ids, 0, $batch_size);
} else {
@ids = splice(@xref_ids, 0);
}
my $id_str;
if(@ids > 1) {
$id_str = "IN (" . join(',', @ids). ")";
} else {
$id_str = "= " . $ids[0];
}
my $sql = "SELECT * FROM xref WHERE xref_id $id_str";
Ian Longden
committed
my $xref_sth = $self->xref->dbc->prepare($sql);
$xref_sth->execute();
Glenn Proctor
committed
my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id, $master_xref_id, $linkage_annotation);
$xref_sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id);
# note the xref_id we write to the file is NOT the one we've just read
Glenn Proctor
committed
# from the internal xref database as the ID may already exist in the
# core database so we add on $xref_id_offset
while ($xref_sth->fetch()) {
# make sure label is set to /something/ so that the website displays something
$label = $accession if (!$label);
if (!$xrefs_written{$xref_id}) {
my $external_db_id = $source_to_external_db{$source_id};
if ($external_db_id) { # skip "unknown" sources
if(!defined($updated_source{$external_db_id})){
$self->cleanup_sources_file($external_db_id);
}
print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
$xrefs_written{$xref_id} = 1;
$source_ids{$source_id} = $source_id;
}
}
# Now get the dependent xrefs for each of these xrefs and write them as well
Glenn Proctor
committed
# Store the go_linkage_annotations as we go along (need for dumping go_xref)
my $go_source_id = get_source_id_from_source_name($self->xref, "GO");
$sql = "SELECT DISTINCT(x.xref_id), dx.master_xref_id, x.accession, x.label, x.description, x.source_id, x.version, dx.linkage_annotation FROM dependent_xref dx, xref x WHERE x.xref_id=dx.dependent_xref_id AND master_xref_id $id_str";
Ian Longden
committed
my $dep_sth = $self->xref->dbc->prepare($sql);
$dep_sth->execute();
Glenn Proctor
committed
$dep_sth->bind_columns(\$xref_id, \$master_xref_id, \$accession, \$label, \$description, \$source_id, \$version, \$linkage_annotation);
while ($dep_sth->fetch()) {
Glenn Proctor
committed
my $external_db_id = $source_to_external_db{$source_id};
next if (!$external_db_id);
$label = $accession if (!$label);
Glenn Proctor
committed
Glenn Proctor
committed
if (!$xrefs_written{$xref_id}) {
if(!defined($updated_source{$external_db_id})){
$self->cleanup_sources_file($external_db_id);
}
Glenn Proctor
committed
print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\tDEPENDENT\n";
$xrefs_written{$xref_id} = 1;
$source_ids{$source_id} = $source_id;
}
# create an object_xref linking this (dependent) xref with any objects it maps to
# write to file and add to object_xref_mappings
if (defined $xref_to_objects{$master_xref_id}) {
my @ensembl_object_ids = keys( %{$xref_to_objects{$master_xref_id}} );
#print "xref $accession has " . scalar(@ensembl_object_ids) . " associated ensembl objects\n";
foreach my $object_id (@ensembl_object_ids) {
my $type = $ensembl_object_types{$object_id};
my $full_key = $type."|".$object_id."|".$xref_id;
if (!$object_xrefs_written{$full_key}) {
print OBJECT_XREF "$object_xref_id\t$object_id\t$type\t" . ($xref_id+$xref_id_offset) . "\tDEPENDENT\n";
Glenn Proctor
committed
# Add this mapping to the list - note NON-OFFSET xref_id is used
my $key = $type . "|" . $object_id;
push @{$object_xref_mappings{$key}}, $xref_id;
$object_xrefs_written{$full_key} = 1;
Glenn Proctor
committed
Glenn Proctor
committed
# Also store *parent's* query/target identity for dependent xrefs
$object_xref_identities{$key}->{$xref_id}->{"target_identity"} = $object_xref_identities{$key}->{$master_xref_id}->{"target_identity"};
$object_xref_identities{$key}->{$xref_id}->{"query_identity"} = $object_xref_identities{$key}->{$master_xref_id}->{"query_identity"};
Glenn Proctor
committed
Glenn Proctor
committed
# write a go_xref with the appropriate linkage type
print GO_XREF $object_xref_id . "\t" . $linkage_annotation . "\n" if ($source_id == $go_source_id);
$object_xref_id++;
}
#print "source_ids: " . join(" ", keys(%source_ids)) . "\n";
} # while @xref_ids
# Dump any synonyms for xrefs we've written
Glenn Proctor
committed
# Now write the synonyms we want to the file
foreach my $xref_id (keys %synonyms) {
foreach my $syn (@{$synonyms{$xref_id}}) {
print EXTERNAL_SYNONYM ($xref_id+$xref_id_offset) . "\t" . $syn . "\n";
$syn_count++;
}
}
close(OBJECT_XREF);
Glenn Proctor
committed
close(GO_XREF);
# calculate display_xref_ids for transcripts and genes
my $transcript_display_xrefs = $self->build_transcript_display_xrefs($xref_id_offset);
Ian Longden
committed
$self->build_genes_to_transcripts();
Glenn Proctor
committed
$self->build_gene_display_xrefs($transcript_display_xrefs);
# now build gene descriptions
$self->build_gene_descriptions();
return $object_xref_id;
# produce output for comparison with existing ensembl mappings
# format is (with header)
# xref_accession ensembl_type ensembl_id
sub dump_comparison {
my $self = shift;
Ian Longden
committed
my $dir = $self->core->dir();
print "Dumping comparison data\n";
open (COMPARISON, ">comparison/xref_mappings.txt");
Glenn Proctor
committed
print COMPARISON "xref_accession" . "\t" . "ensembl_type" . "\t" . "ensembl_id\n";
# get the xref accession for each xref as the xref_ids are ephemeral
# first read all the xrefs that were dumped and get an xref_id->accession map
my %xref_id_to_accesson;
open (XREF, "$dir/xref.txt");
Glenn Proctor
committed
my ($xref_id,$external_db_id,$accession,$label,$version,$description) = split;
$xref_id_to_accesson{$xref_id} = $accession;
}
close (XREF);
open (OBJECT_XREF, "$dir/object_xref.txt");
while (<OBJECT_XREF>) {
my ($object_xref_id,$object_id,$type,$xref_id) = split;
print COMPARISON $xref_id_to_accesson{$xref_id} . "\t" . $type . "\t" . $object_id . "\n";
}
close (OBJECT_XREF);
close (COMPARISON);
sub build_transcript_display_xrefs {
my ($self, $xref_id_offset) = @_;
Ian Longden
committed
my $dir = $self->core->dir();
# get a list of xref sources; format:
# key: xref_id value: source_name
# lots of these; if memory is a problem, just get the source ID (not the name)
# and look it up elsewhere
# note %xref_to_source is global
print "Building xref->source mapping table\n";
my $sql = "SELECT x.xref_id, s.name FROM source s, xref x WHERE x.source_id=s.source_id";
Ian Longden
committed
my $sth = $self->xref->dbc->prepare($sql);
$sth->execute();
my ($xref_id, $source_name);
$sth->bind_columns(\$xref_id, \$source_name);
$xref_to_source{$xref_id} = $source_name;
}
print "Got " . scalar(keys %xref_to_source) . " xref-source mappings\n";
# Cache the list of translation->transcript mappings & vice versa
# Note variables are global
print "Building translation to transcript mappings\n";
Ian Longden
committed
$sth = $self->core->dbc->prepare("SELECT translation_id, transcript_id FROM translation");
$sth->execute();
my ($translation_id, $transcript_id);
$sth->bind_columns(\$translation_id, \$transcript_id);
$translation_to_transcript{$translation_id} = $transcript_id;
$transcript_to_translation{$transcript_id} = $translation_id if ($translation_id);
print "Building transcript display_xrefs\n";
my @priorities = $self->transcript_display_xref_sources();
# go through each object/xref mapping and store the best ones as we go along
my %obj_to_best_xref;
foreach my $key (keys %object_xref_mappings) {
my ($type, $object_id) = split /\|/, $key;
next if ($type !~ /(Transcript|Translation)/i);
# if a transcript has more than one associated xref,
# use the one with the highest priority, i.e. lower list position in @priorities
my @xrefs = @{$object_xref_mappings{$key}};
my ($best_xref, $best_xref_priority_idx);
Glenn Proctor
committed
# store best query & target identities for each source
my %best_qi;
my %best_ti;
$best_xref_priority_idx = 99999;
my $source = $xref_to_source{$xref};
if ($source) {
my $i = find_in_list($source, @priorities);
Glenn Proctor
committed
my $s = $source . "|" . $xref;
my $key = $type . "|" . $object_id;
my $query_identity = $object_xref_identities{$key}->{$xref}->{"query_identity"};
my $target_identity = $object_xref_identities{$key}->{$xref}->{"target_identity"};
Glenn Proctor
committed
# Check if this source has a better priority than the current best one
# Note if 2 sources are the same priority, the mappings are compared on
# query_identity then target_identity
# if ($i > -1 && $i < $best_xref_priority_idx &&
# (($query_identity > $best_query_identity) ||
# ($query_identity == $best_query_identity && $target_identity > $best_target_identity))) {
Glenn Proctor
committed
if ($i > -1 && $i <= $best_xref_priority_idx && $query_identity > $best_qi{$s}) {
$best_xref_priority_idx = $i;
Glenn Proctor
committed
$best_qi{$s} = $query_identity;
$best_ti{$s} = $target_identity;
Glenn Proctor
committed
warn("Couldn't find a source for xref id $xref " . $xref_accessions{$xref_id} . "\n");
# store object type, id, and best xref id and source priority
if ($best_xref) {
$obj_to_best_xref{$key} = $best_xref . "|" . $best_xref_priority_idx;
}
}
# Now go through each of the calculated best xrefs and convert any that are
# calculated against translations to be associated with their transcript,
# if the priority of the translation xref is higher than that of the transcript
# xref.
# Needs to be done this way to avoid clobbering higher-priority transcripts.
# hash keyed on transcript id, value is xref_id|source prioirity index
my %transcript_display_xrefs;
# Write a .sql file that can be executed, and a .txt file that can be processed
open (TRANSCRIPT_DX, ">$dir/transcript_display_xref.sql");
open (TRANSCRIPT_DX_TXT, ">$dir/transcript_display_xref.txt");
foreach my $key (keys %obj_to_best_xref) {
my ($type, $object_id) = split /\|/, $key;
my ($best_xref, $best_xref_priority_idx) = split /\|/, $obj_to_best_xref{$key};
# If transcript has a translation, use the best xref out of the transcript & translation
Ian Longden
committed
my $transcript_id;
my $translation_id;
if ($type =~ /Transcript/i) {
$transcript_id = $object_id;
$translation_id = $transcript_to_translation{$transcript_id};
}
elsif ($type =~ /Translation/i) {
$translation_id = $object_id;
$transcript_id = $translation_to_transcript{$translation_id};
$object_id = $transcript_id;
}
else{
Glenn Proctor
committed
print "Cannot deal with type $type\n";
Ian Longden
committed
next;
}
if ($translation_id) {
my ($translation_xref, $translation_priority) = split /\|/, $obj_to_best_xref{"Translation|$translation_id"};
my ($transcript_xref, $transcript_priority) = split /\|/, $obj_to_best_xref{"Transcript|$transcript_id"};
my $transcript_qi = $object_xref_identities{"Transcript|$object_id"}->{$transcript_xref}->{"query_identity"};
my $translation_qi = $object_xref_identities{"Translation|$object_id"}->{$translation_xref}->{"query_identity"};
Glenn Proctor
committed
#print "transcript xref $transcript_xref transcript pri $transcript_priority transcript qi: $transcript_qi translation xref $translation_xref translation pri $translation_priority translation qi $translation_qi\n";
Ian Longden
committed
if(!$translation_xref){
$best_xref = $transcript_xref;
$best_xref_priority_idx = $transcript_priority;
}
if(!$transcript_xref){
$best_xref = $translation_xref;
$best_xref_priority_idx = $translation_priority;
elsif ($translation_priority < $transcript_priority) {
Ian Longden
committed
$best_xref = $translation_xref;
$best_xref_priority_idx = $translation_priority;
} else {
$best_xref = $transcript_xref;
$best_xref_priority_idx = $transcript_priority;
}
}
if ($best_xref) {
print TRANSCRIPT_DX "UPDATE transcript SET display_xref_id=" . ($best_xref+$xref_id_offset) . " WHERE transcript_id=" . $object_id . ";\n";
print TRANSCRIPT_DX_TXT ($best_xref+$xref_id_offset) . "\t" . $object_id . "\n";
my $value = ($best_xref+$xref_id_offset) . "|" . $best_xref_priority_idx;
$transcript_display_xrefs{$object_id} = $value;
close(TRANSCRIPT_DX_TXT);
print "Wrote $n transcript display_xref entries to transcript_display_xref.sql\n";
return \%transcript_display_xrefs;
}
# Assign display_xrefs to genes based on transcripts
# Gene gets the display xref of the highest priority of all of its transcripts
# If more than one transcript with the same priority, longer transcript is used
Glenn Proctor
committed
sub build_gene_display_xrefs {
my ($self, $transcript_display_xrefs) = @_;
Ian Longden
committed
my $dir = $self->core->dir();
Ian Longden
committed
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-dbconn => $self->core->dbc);
Ian Longden
committed
my $ta = $db->get_TranscriptAdaptor();
print "Assigning display_xrefs to genes\n";
open (GENE_DX, ">$dir/gene_display_xref.sql");
open (GENE_DX_TXT, ">$dir/gene_display_xref.txt");
my $hit = 0;
my $miss = 0;
my $trans_no_xref = 0;
my $trans_xref = 0;
foreach my $gene_id (keys %genes_to_transcripts) {
my @transcripts = @{$genes_to_transcripts{$gene_id}};
my $best_xref;
my $best_xref_priority_idx = 99999;
my $best_transcript_length = -1;
foreach my $transcript_id (@transcripts) {
if (!$transcript_display_xrefs->{$transcript_id}) {
$trans_no_xref++;
next;
} else {
$trans_xref++;
}
my ($xref_id, $priority) = split (/\|/, $transcript_display_xrefs->{$transcript_id});
Glenn Proctor
committed
# 2 separate if clauses to avoid having to fetch transcripts unnecessarily
if (($priority < $best_xref_priority_idx)) {
Glenn Proctor
committed
$best_xref_priority_idx = $priority;
$best_xref = $xref_id;
Glenn Proctor
committed
} elsif ($priority == $best_xref_priority_idx) {
Glenn Proctor
committed
# compare transcript lengths and use longest
my $transcript = $ta->fetch_by_dbID($transcript_id);
my $transcript_length = $transcript->length();
if ($transcript_length > $best_transcript_length) {
$best_transcript_length = $transcript_length;
$best_xref_priority_idx = $priority;
$best_xref = $xref_id;
}
}
}
if ($best_xref) {
# Write record
print GENE_DX "UPDATE gene SET display_xref_id=" . $best_xref . " WHERE gene_id=" . $gene_id . ";\n";
print GENE_DX_TXT $best_xref . "\t" . $gene_id ."\n";
} else {
$miss++;
}
}
close (GENE_DX);
close (GENE_DX_TXT);
print "Transcripts with no xrefs: $trans_no_xref with xrefs: $trans_xref\n";
print "Wrote $hit gene display_xref entries to gene_display_xref.sql\n";
print "Couldn't find display_xrefs for $miss genes\n" if ($miss > 0);
print "Found display_xrefs for all genes\n" if ($miss eq 0);
Glenn Proctor
committed
return \%genes_to_transcripts;
}
# Display xref sources to be used for transcripts *in order of priority*
# Source names used must be identical to those in the source table.
sub transcript_display_xref_sources {
return ('HUGO',
'MarkerSymbol',
Glenn Proctor
committed
# 'wormbase_transcript',
'flybase_symbol',
'Anopheles_symbol',
'Genoscope_annotated_gene',
'Genoscope_predicted_transcript',
'Genoscope_predicted_gene',
'EntrezGene');
Glenn Proctor
committed
# Get transcripts associated with each gene
sub build_genes_to_transcripts {
my ($self) = @_;
print "Getting transcripts for all genes\n";
my $sql = "SELECT gene_id, transcript_id FROM transcript";
Ian Longden
committed
my $sth = $self->core->dbc->prepare($sql);
Glenn Proctor
committed
$sth->execute();
my ($gene_id, $transcript_id);
$sth->bind_columns(\$gene_id, \$transcript_id);
# Note %genes_to_transcripts is global
while ($sth->fetch()) {
push @{$genes_to_transcripts{$gene_id}}, $transcript_id;
}
print "Got " . scalar keys(%genes_to_transcripts) . " genes\n";
}
# Find the index of an item in a list(ref), or -1 if it's not in the list.
# Only look for exact matches (case insensitive)
sub find_in_list {
my ($item, @list) = @_;
for (my $i = 0; $i < scalar(@list); $i++) {
if (lc($list[$i]) eq lc($item)) {
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
return -1;
}
# Take a string and a list of regular expressions
# Find the index of the highest matching regular expression
# Return the index, or -1 if not found.
sub find_match {
my ($str, @list) = @_;
my $str2 = $str;
my $highest_index = -1;
for (my $i = 0; $i < scalar(@list); $i++) {
my $re = $list[$i];
if ($str2 =~ /$re/i) {
$highest_index = $i;
}
}
return $highest_index;
Glenn Proctor
committed
# Build a map of source id (in xref database) to external_db (in core database)
sub map_source_to_external_db {
my $self = shift;
my %source_to_external_db;
# get all sources
Ian Longden
committed
my $sth = $self->xref->dbc->prepare("SELECT source_id, name FROM source");
Glenn Proctor
committed
$sth->execute();
my ($source_id, $source_name);
$sth->bind_columns(\$source_id, \$source_name);
while($sth->fetchrow_array()) {
# find appropriate external_db_id for each one
my $sql = "SELECT external_db_id FROM external_db WHERE db_name=?";
Ian Longden
committed
my $core_sth = $self->core->dbc->prepare($sql);
Glenn Proctor
committed
$core_sth->execute($source_name);
my @row = $core_sth->fetchrow_array();
if (@row) {
$source_to_external_db{$source_id} = $row[0];
#print "Source name $source_name id $source_id corresponds to core external_db_id " . $row[0] . "\n";
} else {
print STDERR "Can't find external_db entry for source name $source_name; xrefs for this source will not be written. Consider adding $source_name to external_db\n"
Glenn Proctor
committed
}
} # while source
return %source_to_external_db;
}
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
sub cleanup_sources_file{
my ($self,$id) = @_;
$updated_source{$id} =1;
my $dir = $self->core->dir();
open (DEL, ">>$dir/cleanup.sql") || die "Could not open $dir/cleanup.sql\n";
print DEL "DELETE external_synonym ";
print DEL "FROM external_synonym, xref ";
print DEL "WHERE external_synonym.xref_id = xref.xref_id ";
print DEL "AND xref.external_db_id = $id\n";
print DEL "DELETE identity_xref ";
print DEL "FROM identity_xref, object_xref, xref ";
print DEL "WHERE identity_xref.object_xref_id = object_xref.object_xref_id ";
print DEL "AND object_xref.xref_id = xref.xref_id ";
print DEL "AND xref.external_db_id = $id \n";
print DEL "DELETE object_xref ";
print DEL "FROM object_xref, xref ";
print DEL "WHERE object_xref.xref_id = xref.xref_id ";
print DEL "AND xref.external_db_id = $id\n";
print DEL "DELETE FROM xref WHERE xref.external_db_id = $id \n";
close DEL;
}
# Upload .txt files and execute .sql files.
sub do_upload {
Glenn Proctor
committed
my ($self) = @_;
Ian Longden
committed
my $ensembl = $self->core;
my $core_db = $ensembl->dbc;
# xref.txt etc
if(defined($self->delete_existing())){
my $file = $ensembl->dir() . "/cleanup.sql";
open(CLEAN,"<$file") || die "could not open $file for reading \n";
while(<CLEAN>){