Commit 914275ae authored by Monika Komorowska
Browse files

Changes to UniProtDirectParser - it can be used by any species

parent 50fa6136
......@@ -6,6 +6,7 @@ use Carp;
use DBI;
use base qw( XrefParser::BaseParser );
use XrefParser::Database;
# Parse a file of Uniprot records and assign direct xrefs.
# All xrefs are assumed to be linked to translations.
......@@ -13,30 +14,67 @@ use base qw( XrefParser::BaseParser );
# --------------------------------------------------------------------------------
sub run {
sub run_script {
my ($self, $ref_arg) = @_;
my $source_id = $ref_arg->{source_id};
my $species_id = $ref_arg->{species_id};
my $files = $ref_arg->{files};
my $file = $ref_arg->{file};
my $verbose = $ref_arg->{verbose};
if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){
croak "Need to pass source_id, species_id and files as pairs";
if((!defined $source_id) or (!defined $species_id) or (!defined $file) ){
croak "Need to pass source_id, species_id and file as pairs";
}
$verbose |=0;
my %prefix = (9606 => "ENSP0", 10090 => "ENSMUSP0", 10116 => "ENSRNOP0");
my $user = "ensro";
my $host;
my $port;
my $dbname;
my $wget = "";
if(!defined($prefix{$species_id})){
print "No prefix known for this species $species_id???\n";
if($file =~ /host[=][>](\S+?)[,]/){
$host = $1;
}
if($file =~ /port[=][>](\S+?)[,]/){
$port = $1;
}
if($file =~ /dbname[=][>](\S+?)[,]/){
$dbname = $1;
}
if($file =~ /wget[=][>](\S+?)[,]/){
$wget = $1;
}
my $ua = LWP::UserAgent->new();
$ua->timeout(10);
$ua->env_proxy();
my $response = $ua->get($wget);
if ( !$response->is_success() ) {
warn($response->status_line);
return 1;
}
my $filename = @{$files}[0];
my $production_db = XrefParser::Database->new({ host => $host,
port => $port,
user => $user,
dbname => $dbname,
pass => ""});
my $prod_dbi = $production_db->dbi();
my $file_io = $self->get_filehandle($filename);
if ( !defined($file_io) ) {
if(!defined($prod_dbi)){
return 1;
}
my ($prefix) = $prod_dbi->selectrow_array("SELECT species_prefix FROM species WHERE taxon = $species_id");
my %prefix = ($species_id => $prefix);
if(!defined($prefix{$species_id})){
print "No prefix known for this species $species_id???\n";
return 1;
}
......@@ -46,7 +84,9 @@ sub run {
my %prot2ensembl;
my $count = 0;
while ( defined( my $line = $file_io->getline() ) ) {
my @lines = split(/\n/,$response->content);
foreach my $line (@lines){
my ($prot, $ens) = split /\s+/,$line;
if($ens =~ /$prefix{$species_id}/){
push @{$prot2ensembl{$prot}}, $ens;
......
......@@ -2598,7 +2598,7 @@ data_uri = ftp://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/unipro
# swissprot entry
[source Uniprot/SWISSPROT::DIRECT]
# Originally used by homo_sapiens only; now listed by many species (see the [species ...] sections)
name = Uniprot/SWISSPROT
download = Y
order = 22
......@@ -2607,8 +2607,7 @@ prio_descr = uniprot_mapped
parser = UniProtDirectParser
dependent = Unprot/SWISSPROT
release_uri =
data_uri = ftp://ftp.ebi.ac.uk/pub/contrib/xrefs/ens-sp.map
data_uri = script:wget=>ftp://ftp.ebi.ac.uk/pub/contrib/xrefs/ens-sp.map,host=>ens-staging1,dbname=>ensembl_production,
[source Uniprot/SWISSPROT::MULTI-predicted]
......@@ -3563,6 +3562,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
[species anopheles_gambiae]
taxonomy_id = 7165
......@@ -3945,6 +3945,7 @@ source = RefSeq_peptide::bos_taurus
source = UniGene::bos_taurus
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
source = goslim_goa::MULTI
......@@ -3976,6 +3977,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
[species canis_familiaris]
taxonomy_id = 9615
......@@ -3990,6 +3992,7 @@ source = RefSeq_peptide::canis_familiaris
source = UniGene::canis_familiaris
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species ciona_intestinalis]
......@@ -4005,6 +4008,7 @@ source = RefSeq_peptide::MULTI-vertebrate_other
source = UniGene::ciona_intestinalis
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = cint_aniseed_v1::ciona_intestinalis
source = cint_aniseed_v2::ciona_intestinalis
source = cint_jgi_v1::ciona_intestinalis
......@@ -4025,6 +4029,7 @@ source = RefSeq_peptide::MULTI-vertebrate_other
source = UniGene::ciona_savignyi
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species culex_quinquefasciatus]
......@@ -4056,6 +4061,7 @@ source = RefSeq_peptide::danio_rerio
source = UniGene::danio_rerio
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ZFIN_ID::danio_rerio#01
source = ZFIN_ID::danio_rerio#02
source = ZFIN_ID::danio_rerio#03
......@@ -4072,6 +4078,7 @@ source = InterproGO::MULTI
source = Interpro::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species dictyostelium_discoideum]
......@@ -4328,6 +4335,7 @@ source = InterproGO::MULTI
source = Interpro::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species erinaceus_europaeus]
......@@ -4343,6 +4351,7 @@ source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = UniGene::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species equus_caballus]
......@@ -4358,6 +4367,7 @@ source = RefSeq_peptide::equus_caballus
source = UniGene::equus_caballus
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species felis_catus]
......@@ -4370,6 +4380,7 @@ source = InterproGO::MULTI
source = Interpro::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species gallus_gallus]
......@@ -4386,6 +4397,7 @@ source = RefSeq_peptide::gallus_gallus
source = UniGene::gallus_gallus
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species gasterosteus_aculeatus]
......@@ -4401,6 +4413,7 @@ source = RefSeq_dna::gasterosteus_aculeatus
source = RefSeq_peptide::MULTI-vertebrate_other
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species gorilla_gorilla]
......@@ -4415,6 +4428,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
......@@ -4470,6 +4484,7 @@ source = InterproGO::MULTI
source = Interpro::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
......@@ -4486,6 +4501,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = UniGene::macaca_mulatta
source = ncRNA::MULTI
......@@ -4519,6 +4535,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = UniGene::monodelphis_domestica
source = ncRNA::MULTI
......@@ -4565,6 +4582,7 @@ source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = UniGene::myotis_lucifugus
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species ochotona_princeps]
......@@ -4579,6 +4597,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species ornithorhynchus_anatinus]
......@@ -4592,6 +4611,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = Oxford_FGU_Oa_tscript::ornithorhynchus_anatinus
source = Oxford_FGU_Oa_gene::ornithorhynchus_anatinus
source = Platypus_olfactory_receptor::ornithorhynchus_anatinus
......@@ -4607,6 +4627,7 @@ source = InterproGO::MULTI
source = Interpro::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species oryzias_latipes]
......@@ -4622,6 +4643,7 @@ source = RefSeq_peptide::MULTI-vertebrate_other
source = UniGene::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = UniProt::protein_id
source = Uniprot::EMBL
source = ncRNA::MULTI
......@@ -4640,6 +4662,7 @@ source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = UniGene::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = Uniprot::EMBL
source = UniProt::protein_id
......@@ -4669,6 +4692,7 @@ source = Interpro::MULTI
source = UniGene::aedes_aegypti
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
[species pongo_abelii]
taxonomy_id = 9601
......@@ -4683,6 +4707,7 @@ source = RefSeq_peptide::pongo_abelii
source = UniGene::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species procavia_capensis]
......@@ -4697,6 +4722,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
......@@ -4712,6 +4738,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species rattus_norvegicus]
......@@ -4747,7 +4774,7 @@ source = RefSeq_peptide::MULTI-fungi
source = SGD::saccharomyces_cerevisiae
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
[species schizosaccharomyces_pombe]
taxonomy_id = 4896
......@@ -4778,6 +4805,7 @@ source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = UniGene::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = Uniprot::EMBL
source = UniProt::protein_id
......@@ -4795,6 +4823,7 @@ source = RefSeq_dna::MULTI-complete
source = RefSeq_peptide::MULTI-complete
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::EG
source = misc_EG::EG
......@@ -4908,6 +4937,7 @@ source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = UniGene::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species taeniopygia_guttata]
......@@ -4923,6 +4953,7 @@ source = RefSeq_peptide::MULTI-vertebrate_other
source = UniGene::taeniopygia_guttata
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species takifugu_rubripes]
......@@ -4938,6 +4969,7 @@ source = RefSeq_peptide::MULTI-vertebrate_other
source = UniGene::takifugu_rubripes
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species tarsius_syrichta]
......@@ -4952,6 +4984,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species tetraodon_nigroviridis]
......@@ -4966,6 +4999,7 @@ source = RefSeq_dna::MULTI-vertebrate_other
source = RefSeq_peptide::MULTI-vertebrate_other
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
source = Genoscope_pred_gene::tetraodon_nigroviridis
source = Genoscope_pred_transcript::tetraodon_nigroviridis
......@@ -4986,6 +5020,7 @@ source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = UniGene::MULTI
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species tursiops_truncatus]
......@@ -5000,6 +5035,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species vicugna_pacos]
......@@ -5014,6 +5050,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species choloepus_hoffmanni]
......@@ -5028,6 +5065,7 @@ source = RefSeq_dna::MULTI-vertebrate_mammalian
source = RefSeq_peptide::MULTI-vertebrate_mammalian
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
[species xenopus_tropicalis]
......@@ -5043,6 +5081,7 @@ source = RefSeq_peptide::MULTI-vertebrate_other
source = UniGene::xenopus_tropicalis
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = Xenopus_Jamboree::xenopus_tropicalis
source = ncRNA::MULTI
......@@ -5259,6 +5298,7 @@ source = RefSeq_peptide::sus_scrofa
source = UniGene::sus_scrofa
source = Uniprot/SPTREMBL::MULTI
source = Uniprot/SWISSPROT::MULTI
source = Uniprot/SWISSPROT::DIRECT
source = ncRNA::MULTI
source = goslim_goa::MULTI
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment