Newer
Older
my $filename = shift;
$filename = basename($filename);
my ($method) = $filename =~ /^(.*)_(dna|peptide)_\d+\.map/;
return $method;
}
sub get_analysis_id {

  # Return the analysis_id (in the database reached through $self->dbi())
  # of the exonerate analysis matching $ensembl_type, creating the analysis
  # row first if it does not exist yet.
  #
  # Arguments:
  #   $ensembl_type - 'dna' or 'protein' (case-insensitive)
  # Returns:
  #   the existing analysis_id, or the id reported by the driver
  #   (mysql_insertid) for the freshly inserted row.

  my ($self, $ensembl_type) = @_;

  my %typeToLogicName = ( 'dna'     => 'XrefExonerateDNA',
                          'protein' => 'XrefExonerateProtein' );

  my $logic_name = $typeToLogicName{lc(defined $ensembl_type ? $ensembl_type : '')};

  # An unknown type used to be interpolated silently as an empty logic name;
  # keep that fallback so existing callers don't break, but make it visible.
  if (!defined $logic_name) {
    warn("Unknown type '" . (defined $ensembl_type ? $ensembl_type : '') . "' passed to get_analysis_id; using an empty logic_name\n");
    $logic_name = '';
  }

  # Placeholders rather than string concatenation, so the value is always
  # quoted correctly by the driver.
  my $sth = $self->dbi()->prepare("SELECT analysis_id FROM analysis WHERE logic_name=?");
  $sth->execute($logic_name);

  my $analysis_id;

  if (my @row = $sth->fetchrow_array()) {

    $analysis_id = $row[0];
    # Only the first row is wanted; release the handle explicitly.
    $sth->finish();
    print "Found exising analysis ID ($analysis_id) for $logic_name\n";

  } else {

    print "No analysis with logic_name $logic_name found, creating ...\n";
    $sth = $self->dbi()->prepare("INSERT INTO analysis (logic_name, created) VALUES (?, NOW())");
    # TODO - other fields in analysis table
    $sth->execute($logic_name);
    $analysis_id = $sth->{'mysql_insertid'};
    print "Done (analysis ID=" . $analysis_id. ")\n";

  }

  return $analysis_id;

}
Glenn Proctor
committed
sub dump_core_xrefs {
Glenn Proctor
committed
my ($self, $xref_ids_hashref, $start_object_xref_id, $xref_id_offset, $object_xref_id_offset, $ensembl_object_types_hashref) = @_;
my @xref_ids = keys %$xref_ids_hashref;
my %xref_to_objects = %$xref_ids_hashref;
my %ensembl_object_types = %$ensembl_object_types_hashref;
my $dir = $self->dir();
open (XREF, ">$dir/xref.txt");
open (OBJECT_XREF, ">>$dir/object_xref.txt");
open (EXTERNAL_SYNONYM, ">$dir/external_synonym.txt");
Glenn Proctor
committed
open (GO_XREF, ">$dir/go_xref.txt");
my $xref_dbi = $self->xref()->dbi();
my $core_dbi = $self->dbi();
# keep a unique list of source IDs to build the external_db table later
my %source_ids;
my $object_xref_id = $start_object_xref_id;
# build cache of source id -> external_db id; note %source_to_external_db is global
%source_to_external_db = $self->map_source_to_external_db();
Glenn Proctor
committed
# execute several queries with a max of 200 entries in each IN clause - more efficient
my $batch_size = 200;
# keep track of what xref_id & object_xref_ids have been written to prevent
# duplicates; e.g. several dependent xrefs may be dependent on the same master xref.
my %object_xrefs_written;
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
while(@xref_ids) {
my @ids;
if($#xref_ids > $batch_size) {
@ids = splice(@xref_ids, 0, $batch_size);
} else {
@ids = splice(@xref_ids, 0);
}
my $id_str;
if(@ids > 1) {
$id_str = "IN (" . join(',', @ids). ")";
} else {
$id_str = "= " . $ids[0];
}
my $sql = "SELECT * FROM xref WHERE xref_id $id_str";
my $xref_sth = $xref_dbi->prepare($sql);
$xref_sth->execute();
Glenn Proctor
committed
my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id, $master_xref_id, $linkage_annotation);
$xref_sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id);
# note the xref_id we write to the file is NOT the one we've just read
Glenn Proctor
committed
# from the internal xref database as the ID may already exist in the
# core database so we add on $xref_id_offset
while ($xref_sth->fetch()) {
# make sure label is set to /something/ so that the website displays something
$label = $accession if (!$label);
if (!$xrefs_written{$xref_id}) {
my $external_db_id = $source_to_external_db{$source_id};
if ($external_db_id) { # skip "unknown" sources
print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n";
$xrefs_written{$xref_id} = 1;
$source_ids{$source_id} = $source_id;
}
}
# Now get the dependent xrefs for each of these xrefs and write them as well
Glenn Proctor
committed
# Store the go_linkage_annotations as we go along (need for dumping go_xref)
my $go_source_id = get_source_id_from_source_name($self->xref, "GO");
$sql = "SELECT DISTINCT(x.xref_id), dx.master_xref_id, x.accession, x.label, x.description, x.source_id, x.version, dx.linkage_annotation FROM dependent_xref dx, xref x WHERE x.xref_id=dx.dependent_xref_id AND master_xref_id $id_str";
my $dep_sth = $xref_dbi->prepare($sql);
$dep_sth->execute();
Glenn Proctor
committed
$dep_sth->bind_columns(\$xref_id, \$master_xref_id, \$accession, \$label, \$description, \$source_id, \$version, \$linkage_annotation);
while ($dep_sth->fetch()) {
Glenn Proctor
committed
my $external_db_id = $source_to_external_db{$source_id};
next if (!$external_db_id);
$label = $accession if (!$label);
Glenn Proctor
committed
Glenn Proctor
committed
if (!$xrefs_written{$xref_id}) {
print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\tDEPENDENT\n";
$xrefs_written{$xref_id} = 1;
$source_ids{$source_id} = $source_id;
}
# create an object_xref linking this (dependent) xref with any objects it maps to
# write to file and add to object_xref_mappings
if (defined $xref_to_objects{$master_xref_id}) {
my @ensembl_object_ids = keys( %{$xref_to_objects{$master_xref_id}} );
#print "xref $accession has " . scalar(@ensembl_object_ids) . " associated ensembl objects\n";
foreach my $object_id (@ensembl_object_ids) {
my $type = $ensembl_object_types{$object_id};
my $full_key = $type."|".$object_id."|".$xref_id;
if (!$object_xrefs_written{$full_key}) {
print OBJECT_XREF "$object_xref_id\t$object_id\t$type\t" . ($xref_id+$xref_id_offset) . "\tDEPENDENT\n";
# Add this mapping to the list - note NON-OFFSET xref_id is used
my $key = $type . "|" . $object_id;
push @{$object_xref_mappings{$key}}, $xref_id;
$object_xrefs_written{$full_key} = 1;
Glenn Proctor
committed
# write a go_xref with the appropriate linkage type
print GO_XREF $object_xref_id . "\t" . $linkage_annotation . "\n" if ($source_id == $go_source_id);
$object_xref_id++;
}
# Now get the synonyms for each of these xrefs and write them to the external_synonym table
$sql = "SELECT DISTINCT xref_id, synonym FROM synonym WHERE xref_id $id_str";
my $syn_sth = $xref_dbi->prepare($sql);
$syn_sth->execute();
$syn_sth->bind_columns(\$xref_id, \$accession);
while ($syn_sth->fetch()) {
print EXTERNAL_SYNONYM ($xref_id+$xref_id_offset) . "\t" . $accession . "\n";
}
#print "source_ids: " . join(" ", keys(%source_ids)) . "\n";
} # while @xref_ids
close(XREF);
close(OBJECT_XREF);
Glenn Proctor
committed
close(GO_XREF);
print "Before calling display_xref, object_xref_mappings size " . scalar (keys %object_xref_mappings) . "\n";
# calculate display_xref_ids for transcripts and genes
my $transcript_display_xrefs = $self->build_transcript_display_xrefs($xref_id_offset);
$self->build_gene_display_xrefs_and_descriptions($transcript_display_xrefs);
return $object_xref_id;
# produce output for comparison with existing ensembl mappings
# format is (with header)
# xref_accession ensembl_type ensembl_id
sub dump_comparison {

  # Write comparison/xref_mappings.txt (relative to the current directory):
  # a header line followed by one "accession <tab> type <tab> object id"
  # line per record of the previously dumped object_xref.txt.
  # Accessions are looked up through the previously dumped xref.txt, both
  # files being read from $self->dir().
  #
  # This is a diagnostic dump, so files that cannot be opened produce a
  # warning instead of aborting the run.

  my $self = shift;

  my $dir = $self->dir();

  print "Dumping comparison data\n";

  my $comparison_file = "comparison/xref_mappings.txt";
  my $comparison_fh;
  if (!open($comparison_fh, '>', $comparison_file)) {
    warn("Could not open $comparison_file for writing: $!\n");
    return;
  }

  print $comparison_fh "xref_accession" . "\t" . "ensembl_type" . "\t" . "ensembl_id\n";

  # get the xref accession for each xref as the xref_ids are ephemeral

  # first read all the xrefs that were dumped and get an xref_id->accession map
  my %xref_id_to_accesson;
  my $xref_fh;
  if (open($xref_fh, '<', "$dir/xref.txt")) {
    while (my $line = <$xref_fh>) {
      # columns: xref_id, external_db_id, accession, label, version, description
      my ($xref_id, $external_db_id, $accession) = split(' ', $line);
      next if (!defined $accession);   # blank or truncated line
      $xref_id_to_accesson{$xref_id} = $accession;
    }
    close($xref_fh);
  } else {
    warn("Could not open $dir/xref.txt: $!\n");
  }

  my $object_xref_fh;
  if (open($object_xref_fh, '<', "$dir/object_xref.txt")) {
    while (my $line = <$object_xref_fh>) {
      my ($object_xref_id, $object_id, $type, $xref_id) = split(' ', $line);
      next if (!defined $xref_id);     # blank or truncated line
      my $accession = $xref_id_to_accesson{$xref_id};
      $accession = '' if (!defined $accession);
      print $comparison_fh $accession . "\t" . $type . "\t" . $object_id . "\n";
    }
    close($object_xref_fh);
  } else {
    warn("Could not open $dir/object_xref.txt: $!\n");
  }

  # Buffered write errors only surface when the handle is closed.
  close($comparison_fh) || warn("Error closing $comparison_file: $!\n");

  return;

}
sub build_transcript_display_xrefs {
my ($self, $xref_id_offset) = @_;
my $dir = $self->dir();
# get a list of xref sources; format:
# key: xref_id value: source_name
# lots of these; if memory is a problem, just get the source ID (not the name)
# and look it up elsewhere
# note %xref_to_source is global
print "Building xref->source mapping table\n";
my $sql = "SELECT x.xref_id, s.name FROM source s, xref x WHERE x.source_id=s.source_id";
my $sth = $self->xref->dbi()->prepare($sql);
$sth->execute();
my ($xref_id, $source_name);
$sth->bind_columns(\$xref_id, \$source_name);
$xref_to_source{$xref_id} = $source_name;
}
print "Got " . scalar(keys %xref_to_source) . " xref-source mappings\n";
# Cache the list of translation->transcript mappings & vice versa
# Note variables are global
print "Building translation to transcript mappings\n";
my $sth = $self->dbi()->prepare("SELECT translation_id, transcript_id FROM translation");
$sth->execute();
my ($translation_id, $transcript_id);
$sth->bind_columns(\$translation_id, \$transcript_id);
$translation_to_transcript{$translation_id} = $transcript_id;
$transcript_to_translation{$transcript_id} = $translation_id if ($translation_id);
print "Building transcript display_xrefs\n";
my @priorities = $self->transcript_display_xref_sources();
# go through each object/xref mapping and store the best ones as we go along
my %obj_to_best_xref;
foreach my $key (keys %object_xref_mappings) {
my ($type, $object_id) = split /\|/, $key;
next if ($type !~ /(Transcript|Translation)/i);
# if a transcript has more than one associated xref,
# use the one with the highest priority, i.e. lower list position in @priorities
my @xrefs = @{$object_xref_mappings{$key}};
my ($best_xref, $best_xref_priority_idx);
$best_xref_priority_idx = 99999;
my $source = $xref_to_source{$xref};
if ($source) {
my $i = find_in_list($source, @priorities);
if ($i > -1 && $i < $best_xref_priority_idx) {
$best_xref_priority_idx = $i;
}
} else {
warn("Couldn't find a source for xref $xref \n");
}
}
# store object type, id, and best xref id and source priority
if ($best_xref) {
$obj_to_best_xref{$key} = $best_xref . "|" . $best_xref_priority_idx;
}
}
# Now go through each of the calculated best xrefs and convert any that are
# calculated against translations to be associated with their transcript,
# if the priority of the translation xref is higher than that of the transcript
# xref.
# Needs to be done this way to avoid clobbering higher-priority transcripts.
# hash keyed on transcript id, value is xref_id|source priority index
my %transcript_display_xrefs;
# Write a .sql file that can be executed, and a .txt file that can be processed
open (TRANSCRIPT_DX, ">$dir/transcript_display_xref.sql");
open (TRANSCRIPT_DX_TXT, ">$dir/transcript_display_xref.txt");
foreach my $key (keys %obj_to_best_xref) {
my ($type, $object_id) = split /\|/, $key;
my ($best_xref, $best_xref_priority_idx) = split /\|/, $obj_to_best_xref{$key};
# If transcript has a translation, use the best xref out of the transcript & translation
if ($type =~ /Transcript/i) {
my $transcript_id = $object_id;
my $translation_id = $transcript_to_translation{$transcript_id};
if ($translation_id) {
my ($translation_xref, $translation_priority) = split /\|/, $obj_to_best_xref{"Translation|$translation_id"};
my ($transcript_xref, $transcript_priority) = split /\|/, $obj_to_best_xref{"Transcript|$transcript_id"};
if ($translation_priority < $transcript_priority) {
$best_xref = $translation_xref;
$best_xref_priority_idx = $translation_priority;
} else {
$best_xref = $transcript_xref;
$best_xref_priority_idx = $transcript_priority;
}
}
if ($best_xref) {
print TRANSCRIPT_DX "UPDATE transcript SET display_xref_id=" . ($best_xref+$xref_id_offset) . " WHERE transcript_id=" . $object_id . ";\n";
print "wrote " . $best_xref . " (plus offset) for 95625\n" if ($object_id eq 95625);
print TRANSCRIPT_DX_TXT ($best_xref+$xref_id_offset) . "\t" . $object_id . "\n";
my $value = ($best_xref+$xref_id_offset) . "|" . $best_xref_priority_idx;
$transcript_display_xrefs{$object_id} = $value;
close(TRANSCRIPT_DX_TXT);
print "Wrote $n transcript display_xref entries to transcript_display_xref.sql\n";
return \%transcript_display_xrefs;
}
# Assign display_xrefs to genes based on transcripts
# Gene gets the display xref of the highest priority of all of its transcripts
# If more than one transcript with the same priority, longer transcript is used
sub build_gene_display_xrefs_and_descriptions {

  # Choose a display xref for every gene from the display xrefs of its
  # transcripts, write the result to gene_display_xref.sql / .txt in
  # $self->dir(), then build the gene descriptions.
  #
  # Arguments:
  #   $transcript_display_xrefs - hashref: transcript id => "xref_id|priority",
  #                               where a lower priority number is better
  #
  # Selection rule: lowest priority number wins; between transcripts of equal
  # priority the longer transcript wins.

  my ($self, $transcript_display_xrefs) = @_;

  my $dir = $self->dir();

  my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-species => $self->species(),
                                               -dbname  => $self->dbname(),
                                               -host    => $self->host(),
                                               -port    => $self->port(),
                                               -pass    => $self->password(),
                                               -user    => $self->user(),
                                               -group   => 'core');

  my $ta = $db->get_TranscriptAdaptor();

  print "Building gene display_xrefs\n";
  print "Getting transcripts for all genes\n";

  my $sql = "SELECT gene_id, transcript_id FROM transcript";
  my $sth = $self->dbi()->prepare($sql);
  $sth->execute();

  my ($gene_id, $transcript_id);
  $sth->bind_columns(\$gene_id, \$transcript_id);

  # Note %genes_to_transcripts is global
  while ($sth->fetch()) {
    push @{$genes_to_transcripts{$gene_id}}, $transcript_id;
  }

  print "Got " . scalar keys(%genes_to_transcripts) . " genes\n";

  print "Assigning display_xrefs to genes\n";

  open(my $gene_dx_fh, '>', "$dir/gene_display_xref.sql") || die "Could not open $dir/gene_display_xref.sql: $!";
  open(my $gene_dx_txt_fh, '>', "$dir/gene_display_xref.txt") || die "Could not open $dir/gene_display_xref.txt: $!";

  my $hit = 0;
  my $miss = 0;
  my $trans_no_xref = 0;
  my $trans_xref = 0;

  foreach my $gene_id (keys %genes_to_transcripts) {

    my @transcripts = @{$genes_to_transcripts{$gene_id}};

    my $best_xref;
    my $best_xref_priority_idx = 99999;
    my $best_transcript_id;
    # Length of the current best transcript; only fetched when a tie on
    # priority makes it necessary, to avoid fetching transcripts needlessly.
    my $best_transcript_length;

    foreach my $transcript_id (@transcripts) {

      if (!$transcript_display_xrefs->{$transcript_id}) {
        $trans_no_xref++;
        next;
      }
      $trans_xref++;

      my ($xref_id, $priority) = split (/\|/, $transcript_display_xrefs->{$transcript_id});

      # Priorities are list indices, so they must be compared numerically:
      # as strings, "10" sorts before "9".
      if (!defined $best_transcript_id || $priority < $best_xref_priority_idx) {

        $best_xref_priority_idx = $priority;
        $best_xref = $xref_id;
        $best_transcript_id = $transcript_id;
        $best_transcript_length = undef;

      } elsif ($priority == $best_xref_priority_idx) {

        # compare transcript lengths and use longest
        if (!defined $best_transcript_length) {
          my $best_transcript = $ta->fetch_by_dbID($best_transcript_id);
          $best_transcript_length = $best_transcript ? $best_transcript->length() : -1;
        }

        my $transcript = $ta->fetch_by_dbID($transcript_id);
        my $transcript_length = $transcript ? $transcript->length() : -1;

        if ($transcript_length > $best_transcript_length) {
          $best_transcript_length = $transcript_length;
          $best_xref = $xref_id;
          $best_transcript_id = $transcript_id;
        }

      }

    }

    if ($best_xref) {

      # Write record
      print $gene_dx_fh "UPDATE gene SET display_xref_id=" . $best_xref . " WHERE gene_id=" . $gene_id . ";\n";
      print $gene_dx_txt_fh $best_xref . "\t" . $gene_id ."\n";
      $hit++;

    } else {

      $miss++;

    }

  }

  close($gene_dx_fh) || warn("Error closing $dir/gene_display_xref.sql: $!\n");
  close($gene_dx_txt_fh) || warn("Error closing $dir/gene_display_xref.txt: $!\n");

  print "Transcripts with no xrefs: $trans_no_xref with xrefs: $trans_xref\n";
  print "Wrote $hit gene display_xref entries to gene_display_xref.sql\n";
  print "Couldn't find display_xrefs for $miss genes\n" if ($miss > 0);
  print "Found display_xrefs for all genes\n" if ($miss == 0);

  # now build gene descriptions
  $self->build_gene_descriptions(\%genes_to_transcripts);

}
# Display xref sources to be used for transcripts *in order of priority*
# Source names used must be identical to those in the source table.
sub transcript_display_xref_sources {
return ('HUGO',
'MarkerSymbol',
'wormbase_transcript',
'flybase_symbol',
'Anopheles_symbol',
'Genoscope_annotated_gene',
'Genoscope_predicted_transcript',
'Genoscope_predicted_gene',
# Find the index of an item in a list(ref), or -1 if it's not in the list.
# Only look for exact matches (case insensitive)
sub find_in_list {

  # Return the index of the first element of @list equal to $item, ignoring
  # case, or -1 if there is none.
  #
  # Arguments:
  #   $item - string to look for; undef is never found
  #   @list - flat list of candidate strings (not a reference)

  my ($item, @list) = @_;

  return -1 if (!defined $item);

  my $wanted = lc($item);

  for my $i (0 .. $#list) {
    next if (!defined $list[$i]);
    return $i if (lc($list[$i]) eq $wanted);
  }

  return -1;

}
# Take a string and a list of regular expressions
# Find the index of the highest matching regular expression
# Return the index, or -1 if not found.
sub find_match {

  # Return the index of the last pattern in @list that matches $str
  # (case-insensitively), or -1 if none of them matches.
  #
  # Arguments:
  #   $str  - string to test; undef is treated as the empty string
  #   @list - regular expression sources, tried in order

  my ($str, @list) = @_;

  $str = '' if (!defined $str);

  my $highest_index = -1;

  for my $i (0 .. $#list) {
    my $re = $list[$i];
    # The (?:...) wrapper stops an empty $re from being interpreted as
    # "reuse the last successfully matched pattern".
    $highest_index = $i if ($str =~ /(?:$re)/i);
  }

  return $highest_index;

}
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
# Build a map of source id (in xref database) to external_db (in core database)
sub map_source_to_external_db {

  # Build a map from source_id (source table, read through $self->xref->dbi())
  # to external_db_id (external_db table, read through $self->dbi()), matching
  # source.name against external_db.db_name.
  #
  # Returns: a hash (list of key/value pairs) source_id => external_db_id.
  # Sources without a matching external_db row are left out and reported on
  # STDERR.

  my $self = shift;

  my %source_to_external_db;

  # get all sources
  my $sth = $self->xref->dbi()->prepare("SELECT source_id, name FROM source");
  $sth->execute();
  my ($source_id, $source_name);
  $sth->bind_columns(\$source_id, \$source_name);

  # Prepared once and re-executed for every source.
  my $core_sth = $self->dbi()->prepare("SELECT external_db_id FROM external_db WHERE db_name=?");

  # fetch() returns a row reference, so the loop test only becomes false
  # when the rows run out; a scalar-context fetchrow_array() would test a
  # column value instead and could stop early on a false one.
  while ($sth->fetch()) {

    # find appropriate external_db_id for each one
    $core_sth->execute($source_name);

    my @row = $core_sth->fetchrow_array();
    $core_sth->finish();   # only the first row is of interest

    if (@row) {

      $source_to_external_db{$source_id} = $row[0];
      #print "Source name $source_name id $source_id corresponds to core external_db_id " . $row[0] . "\n";

    } else {

      print STDERR "Can't find external_db entry for source name $source_name; xrefs for this source will not be written. Consider adding $source_name to external_db\n";

    }

  } # while source

  return %source_to_external_db;

}
# Upload .txt files and execute .sql files.
sub do_upload {

  # Load the dumped .txt files into the database reached through
  # $self->dbi() and apply the gene/transcript display xrefs.
  #
  # Arguments:
  #   $deleteexisting - if true, each table is emptied before it is loaded
  #                     and existing display_xref_ids are reset to NULL

  my ($self, $deleteexisting) = @_;

  # xref.txt etc

  # TODO warn if table not empty

  foreach my $table ("xref", "object_xref", "identity_xref", "external_synonym", "gene_description", "go_xref", "interpro") {

    my $file = $self->dir() . "/" . $table . ".txt";
    my $sth;

    if ($deleteexisting) {

      $sth = $self->dbi()->prepare("DELETE FROM $table");
      print "Deleting existing data in $table\n";
      $sth->execute();

    }

    # don't seem to be able to use prepared statements here
    $sth = $self->dbi()->prepare("LOAD DATA INFILE \'$file\' INTO TABLE $table");
    print "Uploading data in $file to $table\n";
    $sth->execute();

  }

  # gene_display_xref.sql etc
  foreach my $table ("gene", "transcript") {

    my $file = $self->dir() . "/" . $table . "_display_xref.txt";
    my $sth;

    if ($deleteexisting) {

      $sth = $self->dbi()->prepare("UPDATE $table SET display_xref_id=NULL");
      print "Setting all existing display_xref_id in $table to null\n";
      $sth->execute();

    }

    print "Setting $table display_xrefs from $file\n";

    # TODO this better
    $sth = $self->dbi()->prepare("UPDATE $table SET display_xref_id=? WHERE ${table}_id=?");

    my $dx_fh;
    if (!open($dx_fh, '<', $file)) {
      warn("Could not open $file: $!\n");
      next;
    }

    while (my $line = <$dx_fh>) {
      my ($xref_id, $object_id) = split(' ', $line);
      next if (!defined $object_id);   # blank or truncated line
      $sth->execute($xref_id, $object_id);
    }

    # The handle is closed once, after every line has been applied.
    close($dx_fh);

  }

}
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
# Assign gene descriptions
# Algorithm:
# foreach gene
# get all transcripts & translations
# get all associated xrefs
# filter by regexp, discard blank ones
# order by source & keyword
# assign description of best xref to gene
# }
#
# One gene may have several associated peptides; the one to use is decided as follows.
# In decreasing order of precedence:
#
# - Consortium xref, e.g. ZFIN for zebrafish
#
# - UniProt/SWISSPROT
# If there are several, the one with the best %query_id then %target_id is used
#
# - RefSeq
# If there are several, the one with the best %query_id then %target_id is used
#
# - UniProt/SPTREMBL
# If there are several, precedence is established on the basis of the occurrence of
# regular expression patterns in the description.
sub build_gene_descriptions {

  # Choose a description for each gene from the xrefs mapped to its
  # transcripts and translations, and write the result to
  # gene_description.txt in $self->dir().
  #
  # Arguments:
  #   $genes_to_transcripts - hashref: gene id => arrayref of transcript ids.
  #                           If it is not a hash reference, the global
  #                           %genes_to_transcripts is used instead.
  #
  # Reads the globals %object_xref_mappings, %transcript_to_translation and
  # %xref_to_source; fills the globals %xref_descriptions and %xref_accessions.

  my ($self, $genes_to_transcripts) = @_;

  if (ref($genes_to_transcripts) ne 'HASH') {
    $genes_to_transcripts = \%genes_to_transcripts;
  }

  # TODO - don't call this from, but after, gene_display_xref

  # Get all xref descriptions, filtered by regexp.
  # Discard any that are blank (i.e. regexp has removed everything)

  print "Getting & filtering xref descriptions\n";
  # Note %xref_descriptions & %xref_accessions are global

  my $sth = $self->xref->dbi()->prepare("SELECT xref_id, accession, description FROM xref");
  $sth->execute();
  my ($xref_id, $accession, $description);
  $sth->bind_columns(\$xref_id, \$accession, \$description);

  my $removed = 0;
  my @regexps = $self->gene_description_filter_regexps();

  while ($sth->fetch()) {
    if ($description) {
      $description = filter_by_regexp($description, \@regexps);
      if ($description ne "") {
        $xref_descriptions{$xref_id} = $description;
        $xref_accessions{$xref_id} = $accession;
      } else {
        $removed++;
      }
    }
  }

  print "Regexp filtering (" . scalar(@regexps) . " regexps) removed $removed descriptions, left with " . scalar(keys %xref_descriptions) . "\n";

  my $dir = $self->dir();
  open(my $descriptions_fh, '>', "$dir/gene_description.txt") || die "Could not open $dir/gene_description.txt";

  # The consortium source is the same for every comparison, so ask once.
  my $consortium = $self->consortium();

  # Foreach gene, get any xrefs associated with its transcripts or translations

  print "Assigning gene descriptions\n";

  foreach my $gene_id (keys %$genes_to_transcripts) {

    my @gene_xrefs;

    # xref id => "Transcript|id" or "Translation|id" it was reached through
    my %local_xref_to_object;

    foreach my $transcript (@{$genes_to_transcripts->{$gene_id}}) {

      my @object_keys = ("Transcript|$transcript");

      my $translation = $transcript_to_translation{$transcript};
      push @object_keys, "Translation|$translation" if ($translation);

      foreach my $key (@object_keys) {

        next if (!$object_xref_mappings{$key});

        # Each xref is recorded against the object it is actually mapped
        # to, for transcripts and translations alike.
        my @xref_ids = @{$object_xref_mappings{$key}};
        push @gene_xrefs, @xref_ids;
        foreach my $xref (@xref_ids) {
          $local_xref_to_object{$xref} = $key;
        }

      }

    }

    # Now sort through these and find the "best" description and write it

    if (@gene_xrefs) {

      @gene_xrefs = sort {compare_xref_descriptions($consortium, $gene_id, \%local_xref_to_object)} @gene_xrefs;

      # The comparator sorts ascending, so the preferred xref is last.
      my $best_xref = $gene_xrefs[-1];
      my $description = $xref_descriptions{$best_xref};
      my $source = $xref_to_source{$best_xref};
      my $acc = $xref_accessions{$best_xref};

      print $descriptions_fh "$gene_id\t$description" . " [Source:$source;Acc:$acc]\n" if ($description);

    }

  } # foreach gene

  close($descriptions_fh);

}
# remove a list of patterns from a string
sub filter_by_regexp {

  # Delete every case-insensitive match of each pattern in the array
  # referenced by $regexps from $str, applying the patterns in order, and
  # return what is left.

  my ($str, $regexps) = @_;

  $str =~ s/$_//gi for @{$regexps};

  return $str;

}
# Regexps used to filter out useless text from gene descriptions
# Method can be overridden in species-specific modules
sub gene_description_filter_regexps {
# Default implementation: an empty list, i.e. no patterns to filter with.
return ();
}
# The "consortium" source for this species, should be the same as in
# source table
sub consortium {

  # Default value: a placeholder that won't be matched as a source.
  my $unmatched_source = "xxx";

  return $unmatched_source;

}
# Sort a list of xrefs by the priority of their sources
# Assumes this function is called as a Perl sort comparator (so $a and $b are set), with the extra parameters passed explicitly
# See comment for build_gene_descriptions for how precedence is decided.
sub compare_xref_descriptions {

  # Sort comparator ordering two xref ids (the sort variables $a and $b) by
  # how suitable they are as a source of a gene description; the preferred
  # xref sorts last.
  #
  # Arguments:
  #   $consortium     - name of the consortium source, ranked highest
  #   $gene_id        - not used by the comparison itself
  #   $xref_to_object - hashref: xref id => "Type|object id"
  #
  # Reads the globals %xref_to_source, %object_xref_identities and
  # %xref_descriptions.
  # Returns -1, 0 or 1 as expected by sort.

  my ($consortium, $gene_id, $xref_to_object) = @_;

  # Later position means higher precedence.
  my @sources = ("Uniprot/SPTREMBL", "RefSeq_dna", "RefSeq_peptide", "Uniprot/SWISSPROT", $consortium);

  my @words = qw(unknown hypothetical putative novel probable [0-9]{3} kDa fragment cdna protein);

  my $src_a = $xref_to_source{$a};
  my $src_b = $xref_to_source{$b};
  $src_a = '' if (!defined $src_a);
  $src_b = '' if (!defined $src_b);

  my $pos_a = find_in_list($src_a, @sources);
  my $pos_b = find_in_list($src_b, @sources);

  # Different source ranks decide the order on their own.
  return $pos_a <=> $pos_b if ($pos_a != $pos_b);

  # Same rank, need to do more work

  if ($src_a eq "Uniprot/SWISSPROT" || $src_a =~ /RefSeq/) {

    # Compare on query identities, then target identities if queries are the same
    my $key_a = $xref_to_object->{$a}; # e.g. "Translation|1234"
    my $key_b = $xref_to_object->{$b};
    return 0 if (!defined $key_a || !defined $key_b);

    my ($type_a, $object_a) = split(/\|/, $key_a);
    my ($type_b, $object_b) = split(/\|/, $key_b);

    # only compare like with like; the types are strings, so this has to
    # be a string comparison
    return 0 if ($type_a ne $type_b);

    my $query_identity_a = $object_xref_identities{$object_a}->{$a}->{"query_identity"} || 0;
    my $query_identity_b = $object_xref_identities{$object_b}->{$b}->{"query_identity"} || 0;

    return ($query_identity_a <=> $query_identity_b) if ($query_identity_a != $query_identity_b);

    my $target_identity_a = $object_xref_identities{$object_a}->{$a}->{"target_identity"} || 0;
    my $target_identity_b = $object_xref_identities{$object_b}->{$b}->{"target_identity"} || 0;

    return ($target_identity_a <=> $target_identity_b);

  }

  if ($src_a eq "Uniprot/SPTREMBL") {

    # Compare on words
    my $wrd_idx_a = find_match($xref_descriptions{$a}, @words);
    my $wrd_idx_b = find_match($xref_descriptions{$b}, @words);
    return $wrd_idx_a <=> $wrd_idx_b;

  }

  return 0;

}