From bab48e012dc2f2c872277445cf00142e955c2b82 Mon Sep 17 00:00:00 2001 From: Emmanuel Mongin <mongin@sanger.ac.uk> Date: Fri, 13 Jun 2003 18:14:51 +0000 Subject: [PATCH] added organism specific checks --- misc-scripts/protein_match/get_Xmapping.pl | 158 +++++++++++++- .../load_transcript_display_id.pl | 15 +- misc-scripts/protein_match/mapping_conf.pl | 34 ++-- misc-scripts/protein_match/maps2db.pl | 192 +++++++++++++++++- misc-scripts/protein_match/pmatch.pl | 50 +++-- 5 files changed, 404 insertions(+), 45 deletions(-) diff --git a/misc-scripts/protein_match/get_Xmapping.pl b/misc-scripts/protein_match/get_Xmapping.pl index 8254dbd407..00a5f2388b 100644 --- a/misc-scripts/protein_match/get_Xmapping.pl +++ b/misc-scripts/protein_match/get_Xmapping.pl @@ -14,6 +14,8 @@ my %conf = %::mapping_conf; # configuration options # global vars +my $org_list = $conf{'organism_list'}; + #Get general options my $organism = $conf{'organism'}; my $sptr_swiss = $conf{'sptr_swiss'}; @@ -25,7 +27,6 @@ my $refseq_gnp = $conf{'refseq_gnp'}; #Get specific options for human my $ens1 = $conf{'ens1'}; my $ens4 = $conf{'ens4'}; -my $refseq_pred = $conf{'refseq_pred_gnp'}; my $go = $conf{'go'}; my $gkb = $conf{'gkb'}; @@ -43,11 +44,141 @@ my $eleg_nom = $conf{'eleg_nom'}; my $zeb_gene = $conf{'zeb_gene'}; my $zeb_dblink = $conf{'zeb_dblink'}; -#Get specific options for drosophila -my $ext_annot = $conf{'dros_ext_annot'}; my $briggsae_peptides = $conf{'briggsae_hybrid'}; +#Check that the configuration file has been well filled in for each different organism +#Beginning of check + +my %check; +my $seenorg = 0; + +#Check if the organism is correct +foreach my $or (@{$org_list}) { + if ($or eq $organism) { + $seenorg = 1; + } +} + +if ($seenorg == 0) { + print STDERR "Either the organism name you are using ($organism) is not define or is not allowed\n"; + print STDERR "Here is a list of authorised organisms:\n"; + foreach my $or (@{$org_list}) { + print STDERR "$or\n"; + } + + exit(); +} + + 
+#Organism specific checks +if($organism eq "human") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'refseq_gnp'} = $conf{'refseq_gnp'}; + $check{'ens1'} = $conf{'ens1'}; + $check{'ens4'} = $conf{'ens4'}; + $check{'go'} = $conf{'go'}; + $check{'gkb'} = $conf{'gkb'}; + + foreach my $k (keys %check) { + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +if ($organism eq "mouse") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'refseq_gnp'} = $conf{'refseq_gnp'}; + $check{'mgi_sp'} = $conf{'mgi_sp'}; + $check{'mgi_locus'} = $conf{'mgi_locus'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +if ($organism eq "elegans") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'elegans_nom'} = $conf{'elegans_nom'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +if ($organism eq "anopheles") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'submitted_genes'} = $conf{'submitted_genes'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +if ($organism eq "drosophila") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'refseq_gnp'} = $conf{'refseq_gnp'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +if ($organism eq "rat") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'refseq_gnp'} = $conf{'refseq_gnp'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +if ($organism eq "zebrafish") { + 
$check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'zeb_gene'} = $conf{'zeb_gene'}; + $check{'zeb_dblink'} = $conf{'zeb_dblink'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + + +#End of check + if ((!defined $organism) || (!defined $sptr_swiss) || (!defined $out)) { die "\nSome basic options have not been set up, have a look at mapping_conf\nCurrent set up (required options):\norganism: $organism\nsptr_swiss: $sptr_swiss\nx_map: $out\n"; } @@ -488,3 +619,24 @@ sub process_parsed_sp{ } } +sub usage { + + print STDERR <<HELP + +Usage: get_Xmapping.pl +One of the element of the configuration file has not been properly loaded +for the organism $organism +Please fill in properly your configuration file + +Here is your set up: +HELP +; + + foreach my $k (keys %check) { + print STDERR "$k:\t$check{$k}\n"; + } + + + + exit(); +} diff --git a/misc-scripts/protein_match/load_transcript_display_id.pl b/misc-scripts/protein_match/load_transcript_display_id.pl index 3ff2f79f26..7260142b2f 100644 --- a/misc-scripts/protein_match/load_transcript_display_id.pl +++ b/misc-scripts/protein_match/load_transcript_display_id.pl @@ -40,15 +40,14 @@ $priority{'SWISSPROT'} = 900; $priority{'RefSeq'} = 800; $priority{'SPTREMBL'} = 700; $priority{'LocusLink'} = 100; -#$priority{'Anopheles_paper'} = 50; -#$priority{'Celera_Gene'} = 50; + if (!defined $organism) { die "\nSome basic options have not been set up, have a look at mapping_conf\nCurrent set up (required options):\norganism: $organism\n\n"; } print STDERR "Connecting to the database...\n"; print STDERR "dealing with organism ".$organism."\n"; -#my $multi = MultiTestDB->new(); + my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new( -user => $user, @@ -58,7 +57,6 @@ my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new( -driver => 'mysql', ); -#my $db = $multi->get_DBAdaptor( 'core' ); my $transadaptor = 
$db->get_TranscriptAdaptor(); my $geneadaptor = $db->get_GeneAdaptor(); @@ -68,6 +66,7 @@ my $query = "select transcript_id from transcript"; my $sth = $db->prepare($query); $sth->execute(); +print STDERR "Getting transcript display xref_id\n"; while(my $id = $sth->fetchrow) { my $trans = $transadaptor->fetch_by_dbID($id); my $xrefs = $trans->get_all_DBLinks; @@ -87,8 +86,11 @@ while(my $id = $sth->fetchrow) { $transadaptor->update($trans); } -if ($organism ne "elegans") { +print STDERR "Done\n"; +print STDERR "Getting gene display_xref_id\n"; + +if ($organism ne "elegans") { my $query1 = "select gene_id from gene"; my $sth1 = $db->prepare($query1); $sth1->execute(); @@ -113,6 +115,7 @@ if ($organism ne "elegans") { } } +#Not sure if it is really needed if wormbase_gene is put in the priority list... Laura? elsif ($organism eq "elegans") { my $query1 = "select g.gene_id, x.xref_id from gene_stable_id g, xref x, external_db e where g.stable_id = x.display_label and x.external_db_id = e.external_db_id and e.db_name = 'wormbase_gene'"; my $sth1 = $db->prepare($query1); @@ -127,7 +130,7 @@ elsif ($organism eq "elegans") { } - +print STDERR "Done\n"; diff --git a/misc-scripts/protein_match/mapping_conf.pl b/misc-scripts/protein_match/mapping_conf.pl index 2a3dc067e7..f4579c7009 100644 --- a/misc-scripts/protein_match/mapping_conf.pl +++ b/misc-scripts/protein_match/mapping_conf.pl @@ -47,43 +47,43 @@ package main; #Location of the query peptide file (eg: Ensembl predicted protein) #'query' => '/work1/mongin/mapping/primary/ensembl110.pep', - 'query' => '/acari/work4/mongin/dros3_mapping/Primary/drosophila-release3-peptides.fasta', + 'query' => '/Users/emmanuelmongin/code_test/mapping_test/Primary', #Location of the sptr file, this file will be used as an #input to grep the specific sp entries to the organism #using grep_sp_entries.pl. This file is supposed to be - #in SP format + #in SP format. 
This option is not really used, but you are welcome to use it (most of the time it is simpler to get the data through SRS or from the files produced by Swiss-Prot). But if you have the whole SPTR somewhere you can easily access... 'total_sptr' => '', #Location of the sptr file in fasta format containing the entries specific to the organism #'sptr_fa' => '/work1/mongin/mapping/primary/HS.f', - 'sptr_fa' => '/acari/work4/mongin/dros3_mapping/Primary/7227.FASTAC', + 'sptr_fa' => '/Users/emmanuelmongin/code_test/mapping_test/Primary/7227.FASTAC', #Location of the sptr file in Swiss-Prot format containing the entries specific to the organism #'sptr_swiss' => '/ecs2/work1/lec/briggsae_peptides/briggsae.test', - 'sptr_swiss' => '/acari/work4/mongin/dros3_mapping/Primary/7227.SPC', + 'sptr_swiss' => '/Users/emmanuelmongin/code_test/mapping_test/Primary/Primary/7227.SPC', #Location of the file containing all refseq and all SP in fasta format (This file will be produced by running prepare_proteome.pl) - 'pmatch_input_fa' => '/acari/work4/mongin/dros3_mapping/Primary/total.fa', + 'pmatch_input_fa' => '/Users/emmanuelmongin/code_test/mapping_test/Primary/total.fa', #Output file containing the mapping of SP and refseq sequences to external databases - 'x_map_out' => '/acari/work4/mongin/dros3_mapping/Output/xmap.out', + 'x_map_out' => '/Users/emmanuelmongin/code_test/mapping_test/Output/xmap.out', #Output file from pmatch.pl and input file for maps2db.pl #'pmatch_out' => '/work1/mongin/mapping/outputs/pmatch_human1.txt', - 'pmatch_out' => '/acari/work4/mongin/dros3_mapping/Output/pmatch.out', + 'pmatch_out' => '/Users/emmanuelmongin/code_test/mapping_test/Primary/pmatch.out', #Location of the Refseq (proteins) file in fasta format #'refseq_fa' => '/work1/mongin/mapping/primary/refseq.fa', - 'refseq_fa' => '/acari/work4/mongin/dros3_mapping/Primary/fly.faa', + 'refseq_fa' => '', #Location of the Refseq (proteins) file in Genbank format #'refseq_gnp' => 
'/work1/mongin/mouse/mapping/primary/mouse.gnp', - 'refseq_gnp' => '/acari/work4/mongin/dros3_mapping/Primary/fly.gnp', + 'refseq_gnp' => '', ############################################ #Organism specific files for the X_mapping # @@ -152,7 +152,7 @@ package main; #drosophila# ############ - 'dros_ext_annot' => '/acari/work4/mongin/dros3_mapping/Primary/xrefs.txt', + 'dros_ext_annot' => '', ########## @@ -197,10 +197,13 @@ package main; #Organism related information# ############################## - #Name of the organism studied. Current keywords used(or planned to be used): human, drosophila, mouse, elegans, anopheles, zebrafish + #Name of the organism studied. Current keywords used(or planned to be used): human, drosophila, mouse, elegans, anopheles, zebrafish, rat #You can adapt the other scripts given the organisms (eg: do some specific x_mapping for a given organism) #'organism' => 'human' - 'organism' => 'drosophila', + 'organism' => 'human', + + #List of authorised organisms, don't forget to add a new organism to this list + 'organism_list' => ['human','drosophila','mouse','elegans','anopheles','zebrafish','rat'], #OX (Organism taxonomy cross-reference) number @@ -211,12 +214,7 @@ package main; #'ox' => '6238' briggsae 'ox' => '', - - - - - - + ################## #obslete options# diff --git a/misc-scripts/protein_match/maps2db.pl b/misc-scripts/protein_match/maps2db.pl index 6742085563..563428af7a 100644 --- a/misc-scripts/protein_match/maps2db.pl +++ b/misc-scripts/protein_match/maps2db.pl @@ -20,6 +20,7 @@ my %conf = %::mapping_conf; # configuration options # global vars +my $org_list = $conf{'organism_list'}; my $refseq_gnp = $conf{'refseq_gnp'}; my $xmap = $conf{'x_map_out'}; my $map = $conf{'pmatch_out'}; @@ -29,12 +30,19 @@ my $user = $conf{'dbuser'}; my $pass = $conf{'password'}; my $port = $conf{'port'}; my $organism = $conf{'organism'}; -my $type = $conf{'elegans_pseudo'}; -my $check = $conf{'check'}; my $query_pep = $conf{'query'}; my 
$refseq_pred = $conf{'refseq_pred_gnp'}; + +#Organism specific options +#Drosophila my $dros_ext_annot = $conf{'dros_ext_annot'}; + +#Elegans my $cefile = $conf{'eleg_nom'}; +my $type = $conf{'elegans_pseudo'}; + +#working option but obsolete +my $check = $conf{'check'}; my %map; my %cemap; @@ -45,10 +53,163 @@ my %embl2sp; my %errorflag; my %ref_map_pred; -if ((!defined $organism) || (!defined $xmap) || (!defined $map)) { - die "\nSome basic options have not been set up, have a look at mapping_conf\nCurrent set up (required options):\norganism: $organism\nx_map: $xmap\npmatch_out: $map\ndb: $dbname\nhost: $host\n\n"; +#Checks + +my %check; +my $seenorg = 0; + +#Check if the organism is correct +foreach my $or (@{$org_list}) { + if ($or eq $organism) { + $seenorg = 1; + } +} + +if ($seenorg == 0) { + print STDERR "Either the organism name you are using ($organism) is not define or is not allowed\n"; + print STDERR "Here is a list of authorised organisms:\n"; + foreach my $or (@{$org_list}) { + print STDERR "$or\n"; + } + + exit(); } +#Organism specific checks +if($organism eq "human") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'pmatch_out'} = $conf{'pmatch_out'}; + $check{'db'} = $conf{'db'}; + $check{'host'} = $conf{'host'}; + $check{'dbuser'} = $conf{'dbuser'}; + $check{'password'} = $conf{'password'}; + $check{'refseq_gnp'} = $conf{'refseq_gnp'}; + + foreach my $k (keys %check) { + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +if ($organism eq "mouse") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'refseq_gnp'} = $conf{'refseq_gnp'}; + $check{'pmatch_out'} = $conf{'pmatch_out'}; + $check{'db'} = $conf{'db'}; + $check{'host'} = $conf{'host'}; + $check{'dbuser'} = $conf{'dbuser'}; + $check{'password'} = $conf{'password'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + 
+if ($organism eq "elegans") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'elegans_nom'} = $conf{'elegans_nom'}; + $check{'elegans_pseudo'} = $conf{'elegans_pseudo'}; + $check{'pmatch_out'} = $conf{'pmatch_out'}; + $check{'db'} = $conf{'db'}; + $check{'host'} = $conf{'host'}; + $check{'dbuser'} = $conf{'dbuser'}; + $check{'password'} = $conf{'password'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +if ($organism eq "anopheles") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'pmatch_out'} = $conf{'pmatch_out'}; + $check{'db'} = $conf{'db'}; + $check{'host'} = $conf{'host'}; + $check{'dbuser'} = $conf{'dbuser'}; + $check{'password'} = $conf{'password'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +if ($organism eq "drosophila") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'refseq_gnp'} = $conf{'refseq_gnp'}; + $check{'pmatch_out'} = $conf{'pmatch_out'}; + $check{'db'} = $conf{'db'}; + $check{'host'} = $conf{'host'}; + $check{'dbuser'} = $conf{'dbuser'}; + $check{'password'} = $conf{'password'}; + $check{'dros_ext_annot'} = $conf{'dros_ext_annot'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +if ($organism eq "rat") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'refseq_gnp'} = $conf{'refseq_gnp'}; + $check{'pmatch_out'} = $conf{'pmatch_out'}; + $check{'db'} = $conf{'db'}; + $check{'host'} = $conf{'host'}; + $check{'dbuser'} = $conf{'dbuser'}; + $check{'password'} = $conf{'password'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +if ($organism 
eq "zebrafish") { + $check{'sptr_swiss'} = $conf{'sptr_swiss'}; + $check{'x_map_out'} = $conf{'x_map_out'}; + $check{'pmatch_out'} = $conf{'pmatch_out'}; + $check{'db'} = $conf{'db'}; + $check{'host'} = $conf{'host'}; + $check{'dbuser'} = $conf{'dbuser'}; + $check{'password'} = $conf{'password'}; + + foreach my $k (keys %check) { + print STDERR $check{$k}."\n"; + if ($check{$k} !~ /(\S+)/) { + usage(); + } + } + +} + +#End of checks + print STDERR "Connecting to the database... $dbname:$host\n"; print STDERR "dealing with organism ".$organism."\n"; @@ -437,6 +598,29 @@ if ($organism eq "elegans") { } } } + +sub usage { + + print STDERR <<HELP + +Usage: maps2db.pl +One of the element of the configuration file has not been properly loaded +for the organism $organism +Please fill in properly your configuration file + +Here is your set up: +HELP +; + + foreach my $k (keys %check) { + print STDERR "$k:\t$check{$k}\n"; + } + + + + exit(); +} + ############### #Some OO stuff# ############### diff --git a/misc-scripts/protein_match/pmatch.pl b/misc-scripts/protein_match/pmatch.pl index 866ce5c11a..6f92b024fb 100644 --- a/misc-scripts/protein_match/pmatch.pl +++ b/misc-scripts/protein_match/pmatch.pl @@ -30,9 +30,6 @@ my $refseq_fa = $conf{'refseq_fa'}; #Set the default percentage of idt -#$opt_p = 66; - -my $organism = $conf{'organism'}; my $opt_q = $conf{'query'}; my $opt_t = $conf{'pmatch_input_fa'}; @@ -43,7 +40,23 @@ my $q_thr = $conf{'query_idt'}; my $pmatch_bin = $conf{'pmatch'}; my ($opt_w,$opt_l,$opt_d); +#Check if the configuration file is correct +my %check; +$check{'query'} = $conf{'query'}; +$check{'pmatch_input_fa'} = $conf{'pmatch_input_fa'}; +$check{'pmatch_out'} = $conf{'pmatch_out'}; +$check{'target_id'} = $conf{'target_idt'}; +$check{'query_idt'} = $conf{'query_idt'}; +$check{'pmatch'} = $conf{'pmatch'}; + +foreach my $k (keys %check) { + if ($check{$k} !~ /(\S+)/) { + usage(); + } +} + +#End of checks ################################# @@ -51,17 +64,6 @@ 
my $query = $opt_q; my $target = $opt_t; my %hash2; -################################# -# make worm-specific protein set from SWALL if ($opt_w) - -if ($organism eq "worm") { - if ($opt_w) { - print STDERR "extract worm sequences from SWALL...\n"; - my $getz = "getz -f seq -sf fasta \'[swall-org:Caenorhabditis elegans]\' > $$.swall"; - system "$getz"; - $target = "$$.swall"; - } -} ################################# # run pmatch (Richard Durbin's fast protein matcher, rd@sanger.ac.uk) @@ -436,6 +438,26 @@ sub process_matches { } } +sub usage { + + print STDERR <<HELP + +Usage: pmatch.pl +One of the element of the configuration file has not been properly loaded +Please fill in properly your configuration file + +Here is your set up: +HELP +; + + foreach my $k (keys %check) { + print STDERR "$k:\t$check{$k}\n"; + } + + + + exit(); +} ########################################## -- GitLab