Newer
Older
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
} # foreach gene
close(GENE_DESCRIPTIONS);
}
# remove a list of patterns from a string
sub filter_by_regexp {
my ($str, $regexps) = @_;
foreach my $regexp (@$regexps) {
$str =~ s/$regexp//ig;
}
return $str;
}
# Regexp used for filter out useless text from gene descriptions
# Method can be overridden in species-specific modules
sub gene_description_filter_regexps {
return ();
}
# The "consortium" source for this species, should be the same as in
# source table
sub consortium {
return "xxx"; # Default to something that won't be matched as a source
}
# Sort a list of xrefs by the priority of their sources
# Assumed this function is called by Perl sort, passed with parameter
# See comment for build_gene_descriptions for how precedence is decided.
sub compare_xref_descriptions {
my ($consortium, $gene_id, $xref_to_object) = @_;
my @sources = ("Uniprot/SPTREMBL", "RefSeq_dna", "RefSeq_peptide", "Uniprot/SWISSPROT", $consortium);
my @words = qw(unknown hypothetical putative novel probable [0-9]{3} kDa fragment cdna protein);
my $src_a = $xref_to_source{$a};
my $src_b = $xref_to_source{$b};
my $pos_a = find_in_list($src_a, @sources);
my $pos_b = find_in_list($src_b, @sources);
# If same source, need to do more work
if ($pos_a == $pos_b) {
if ($src_a eq "Uniprot/SWISSPROT" || $src_a =~ /RefSeq/) {
# Compare on query identities, then target identities if queries are the same
my $key_a = $xref_to_object->{$a}; # e.g. "Translation|1234"
my $key_b = $xref_to_object->{$b};
my ($type_a, $object_a) = split(/\|/, $key_a);
my ($type_b, $object_b) = split(/\|/, $key_b);
return 0 if ($type_a != $type_b); # only compare like with like
my $query_identity_a = $object_xref_identities{$key_a}->{$a}->{"query_identity"};
my $query_identity_b = $object_xref_identities{$key_b}->{$b}->{"query_identity"};
#print "gene 78163 " . $xref_accessions{$a} . " key a $key_a qia $query_identity_a " . $xref_accessions{$b} . " key b $key_b qib $query_identity_b \n" if ($gene_id==78163);
return ($query_identity_a <=> $query_identity_b) if ($query_identity_a != $query_identity_b);
my $target_identity_a = $object_xref_identities{$key_a}->{$a}->{"target_identity"};
my $target_identity_b = $object_xref_identities{$key_b}->{$b}->{"target_identity"};
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
return ($target_identity_a <=> $target_identity_b);
} elsif ($src_a eq "Uniprot/SPTREMBL") {
# Compare on words
my $wrd_idx_a = find_match($xref_descriptions{$a}, @words);
my $wrd_idx_b = find_match($xref_descriptions{$b}, @words);
return $wrd_idx_a <=> $wrd_idx_b;
} else {
return 0;
}
return 0;
} else {
return $pos_a <=> $pos_b;
}
}
# load external_db (if it's empty) from ../external_db/external_dbs.txt
sub upload_external_db {
Ian Longden
committed
my ($self) = @_;
Ian Longden
committed
my $core_db = $self->core->dbc;
$core_db->connect();
my $row = @{$core_db->db_handle->selectall_arrayref("SELECT COUNT(*) FROM external_db")}[0];
my $count = @{$row}[0];
if ($count == 0) {
my $edb = cwd() . "/../external_db/external_dbs.txt";
print "external_db table is empty, uploading from $edb\n";
Ian Longden
committed
my $edb_sth = $core_db->prepare("LOAD DATA INFILE \'$edb\' INTO TABLE external_db");
$edb_sth->execute();
} else {
print "external_db table already has $count rows, will not change it\n";
}
}