Commit c09efab8 authored by cvs2git's avatar cvs2git
Browse files

This commit was manufactured by cvs2svn to create tag 'mergepoint-vega-46-dev-head-1'.

Sprout from master 2008-02-14 08:57:57 UTC Glenn Proctor <gp1@sanger.ac.uk> 'Fix job naming.'
Cherrypick from master 2009-04-15 12:17:55 UTC Kerstin Howe <kj2@sanger.ac.uk> 'added FOS ends for zebrafish':
    misc-scripts/analysis_description/analysis.descriptions
    misc-scripts/analysis_description/apply_rules.pl
    misc-scripts/analysis_description/chech_web_data_column.pl
    misc-scripts/analysis_description/load_analysis_descriptions.pl
    misc-scripts/canonical_transcripts/set_canonical_transcripts.pl
    misc-scripts/db-space.pl
    misc-scripts/ebi_search_dump/ebi_search_dump.pl
    misc-scripts/id_mapping/synteny_rescore.pl
    misc-scripts/id_mapping/utils/compare_scores.pl
    misc-scripts/misc_feature/misc_set.descriptions
    misc-scripts/misc_feature/update_misc_set_descripitons.pl
    misc-scripts/translation_attribs.pl
    misc-scripts/translation_attribs_wrapper.pl
    misc-scripts/update_mapping_set.pl
    misc-scripts/xref_mapping/XrefMapper/CoreInfo.pm
    misc-scripts/xref_mapping/XrefMapper/DisplayXrefs.pm
    misc-scripts/xref_mapping/XrefMapper/Interpro.pm
    misc-scripts/xref_mapping/XrefMapper/LoadMapper.pm
    misc-scripts/xref_mapping/XrefMapper/ProcessMappings.pm
    misc-scripts/xref_mapping/XrefMapper/ProcessPaired.pm
    misc-scripts/xref_mapping/XrefMapper/ProcessPrioritys.pm
    misc-scripts/xref_mapping/XrefMapper/SubmitMapper.pm
    misc-scripts/xref_mapping/XrefMapper/TestMappings.pm
    misc-scripts/xref_mapping/XrefMapper/XrefLoader.pm
    misc-scripts/xref_mapping/XrefMapper/drosophila_ananassae.pm
    misc-scripts/xref_mapping/XrefMapper/drosophila_erecta.pm
    misc-scripts/xref_mapping/XrefMapper/drosophila_grimshawi.pm
    misc-scripts/xref_mapping/XrefMapper/drosophila_mojavensis.pm
    misc-scripts/xref_mapping/XrefMapper/drosophila_persimilis.pm
    misc-scripts/xref_mapping/XrefMapper/drosophila_pseudoobscura.pm
    misc-scripts/xref_mapping/XrefMapper/drosophila_sechellia.pm
    misc-scripts/xref_mapping/XrefMapper/drosophila_simulans.pm
    misc-scripts/xref_mapping/XrefMapper/drosophila_virilis.pm
    misc-scripts/xref_mapping/XrefMapper/drosophila_willistoni.pm
    misc-scripts/xref_mapping/XrefMapper/drosophila_yakuba.pm
    misc-scripts/xref_mapping/XrefParser/DBASSParser.pm
    misc-scripts/xref_mapping/XrefParser/HGNC_ENSTParser.pm
    misc-scripts/xref_mapping/XrefParser/HGNC_curated_transcriptParser.pm
    misc-scripts/xref_mapping/XrefParser/HPAParser.pm
    misc-scripts/xref_mapping/XrefParser/IlluminaWGParser.pm
    misc-scripts/xref_mapping/XrefParser/MGI_CCDS_Parser.pm
    misc-scripts/xref_mapping/XrefParser/MGI_Desc_Parser.pm
    misc-scripts/xref_mapping/XrefParser/MGI_Vega_Parser.pm
    misc-scripts/xref_mapping/XrefParser/MGI_curated_transcriptParser.pm
    misc-scripts/xref_mapping/XrefParser/Vega_TranParser.pm
    misc-scripts/xref_mapping/XrefParser/ncRNA_DBParser.pm
    misc-scripts/xref_mapping/xref_tracker.pl
    modules/Bio/EnsEMBL/Collection.pm
    modules/Bio/EnsEMBL/Collection/Exon.pm
    modules/Bio/EnsEMBL/Collection/Gene.pm
    modules/Bio/EnsEMBL/Collection/RepeatFeature.pm
    modules/Bio/EnsEMBL/Collection/Transcript.pm
    modules/Bio/EnsEMBL/DBSQL/GOTermAdaptor.pm
    modules/Bio/EnsEMBL/DBSQL/OntologyDBAdaptor.pm
    modules/Bio/EnsEMBL/DBSQL/OntologyTermAdaptor.pm
    modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm
    modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblExonGeneric.pm
    modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblGeneGeneric.pm
    modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm
    modules/Bio/EnsEMBL/IdMapping/StableIdGenerator/AedesAegypti.pm
    modules/Bio/EnsEMBL/IdMapping/StableIdGenerator/EnsemblGeneric.pm
    modules/Bio/EnsEMBL/OntologyTerm.pm
    modules/t/test-genome-DBs/homo_sapiens/core/mapping_set.sql
    modules/t/test-genome-DBs/homo_sapiens/core/mapping_set.txt
    modules/t/test-genome-DBs/homo_sapiens/core/seq_region_mapping.sql
    modules/t/test-genome-DBs/homo_sapiens/core/seq_region_mapping.txt
    sql/patch_49_50_a.sql
    sql/patch_49_50_b.sql
    sql/patch_49_50_c.sql
    sql/patch_49_50_d.sql
    sql/patch_49_50_e.sql
    sql/patch_50_51_a.sql
    sql/patch_50_51_b.sql
    sql/patch_50_51_c.sql
    sql/patch_50_51_d.sql
    sql/patch_50_51_e.sql
    sql/patch_50_51_f.sql
    sql/patch_50_51_g.sql
    sql/patch_50_51_h.sql
    sql/patch_50_51_i.sql
    sql/patch_51_52_a.sql
    sql/patch_51_52_b.sql
    sql/patch_51_52_c.sql
    sql/patch_51_52_d.sql
    sql/patch_52_53_a.sql
    sql/patch_52_53_b.sql
    sql/patch_52_53_c.sql
    sql/patch_52_53_d.sql
    sql/patch_53_54_a.sql
    sql/patch_53_54_b.sql
    sql/patch_53_54_c.sql
Delete:
    misc-scripts/protein_match/process_pmach.pl
    modules/Bio/EnsEMBL/Collection/DnaAlignFeature.pm
    modules/Bio/EnsEMBL/Collection/ProteinAlignFeature.pm
    modules/Bio/EnsEMBL/DBSQL/Clone.pm
parent 33afb9d1
This diff is collapsed.
#!/opt/local/bin/perl
###!/usr/local/ensembl/bin/perl
# POD documentation - main docs before the code
=pod
=head1 NAME
apply_rules.pl
=head1 SYNOPSIS
script applies additional rules to the analysis description table after the
descriptions have been loaded
=head1 DESCRIPTION
The rules (for details talk to Steve T.):
1) For each species, dna_align_features are displayed according to which
database they can be retrieved from:
- if they are present in both otherfeatures and core, then only the ones
from otherfeatures are to be displayed, ie we need to set the displayable
entry to 0 in core
- if they are only present in one of these two databases then show them
from that source
2) Logic names of human_cdna and mouse_cdna are slightly different in that
they should be switched off (ie set the displayable entry to 0) in human
and mouse core databases respectively - they are superseded by the cDNA
update features in the cDNA databases
3) These cDNA_update features should have a display label of 'Mouse cDNA' in
the mouse_cdna database, and 'Human cDNA' in the human_cdna database
4) All align_features from vega databases should be switched off
5) Genes with a logic_name of 'otter' have a display label of 'Vega Havana gene' in
Vega mouse and Vega human.
6) Anopheles, human and mouse have different web_data columns from the definition file
7) C.elegans has some unique web_data columns
8) human and mouse have a different set of align_features switched on by default than
is defined in the definition file (don't need the 'default'=>{'....'} entry)
=head1 OPTIONS
Database options
-dbhost host name for database (gets put as host= in locator)
-dbport For RDBs, what port to connect to (port= in locator)
-dbname For RDBs, what name to connect to (dbname= in locator)
-dbuser For RDBs, what username to connect as (dbuser= in locator)
-dbpass For RDBs, what password to use (dbpass= in locator)
-file Path to file containing descriptions. The file
analysis.descriptions in this directory can be used and is
supposed to be the reference file
-update Perform actual updates of analyses
-help print out documentation
=head1 EXAMPLES
=cut
use strict;
use warnings;
use Data::Dumper;
use Getopt::Long;
use Bio::EnsEMBL::Utils::Exception qw(warning throw);
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Gene;
# Unbuffer STDOUT so progress messages appear immediately and in order.
# ($| is the autoflush flag; $! is errno and must not be assigned here.)
$| = 1;

# $dsn/$dbh: DBI data source and server handle used to list databases.
# $update:   set by -update; when false the script only reports what it
#            would change and writes nothing back.
my ($dsn, $dbh, $update);

# Analysis adaptors (core, otherfeatures, cdna, vega)
my ($caa, $ofaa, $cdnaaa, $vegaaa);

my $dbhost = '';
my $dbuser;
my $dbpass;
my $dbport = 3306;
my $dbname;
my $help = 0;
my $version = 52;
my $file = 'analysis.descriptions';

GetOptions(
    'host|dbhost=s'       => \$dbhost,
    'port|dbport=s'       => \$dbport,
    'user|dbuser=s'       => \$dbuser,
    'pass|dbpass=s'       => \$dbpass,
    'dbname=s'            => \$dbname,
    'version=s'           => \$version,
    'file|descriptions=s' => \$file,
    'update'              => \$update,
    'h|help!'             => \$help
);

if (!$dbhost) {
    # -dbname is optional, so it may be undefined here; print it as an empty
    # string instead of triggering an "uninitialized value" warning.
    my $shown_dbname = defined $dbname ? $dbname : '';
    print("Need to pass in -dbhost $dbhost and -dbname $shown_dbname\n");
    $help = 1;
}

if ($help) {
    usage();
}
# Reference analysis descriptions read from the tab-separated file given with
# -file. Keyed by lower-cased logic_name; each value is a hash ref with the
# keys nr, description, display_label, displayable and web_data, all stored
# as strings exactly as read (missing columns become empty strings).
my %reference;
if ($file) {
    # Three-argument open with a lexical handle: the file name can never be
    # parsed as an open mode. The reason for a failed open is in $!, not $@.
    open(my $ref_fh, '<', $file)
        or throw("Failed to open reference file '$file': $!");

    while (my $line = <$ref_fh>) {
        chomp $line;
        next if $line =~ m/^\#/;      # skip comments
        next if $line =~ m/^$/;       # and blank lines
        next if $line =~ m/^\s+$/;    # and whitespace-only lines

        # split() drops trailing empty columns, so a short line leaves some
        # of the six fields undefined; normalise those to ''.
        my @fields = split(/\t/, $line);
        my ($nr, $logic_name, $description, $display_label, $displayable, $web_data)
            = map { defined $fields[$_] ? $fields[$_] : '' } 0 .. 5;

        #print join("\t", $logic_name, $description, $display_label, $displayable, $web_data), "\n";
        warn("Displayable flag for analysis '$logic_name' has to be either 0 or 1, but not '$displayable'!")
            unless ($displayable =~ m/^[01]$/);

        $reference{lc($logic_name)} = {
            nr            => $nr,
            description   => $description,
            display_label => $display_label,   # key was misspelt 'display_lable'
            displayable   => $displayable,
            web_data      => $web_data,
        };
    }
    close($ref_fh);
} else {
    throw("Need to pass reference file with analysis descriptions!");
}
# Connect to the database server itself (no schema selected) so that the
# available databases can be listed with SHOW DATABASES.
$dsn = "DBI:mysql:host=" . $dbhost . ";port=" . $dbport;
$dbh = eval {
    DBI->connect($dsn, $dbuser, $dbpass,
                 {'RaiseError' => 1,
                  'PrintError' => 0});
};

# With RaiseError set, connect() dies on failure. Report that error here
# instead of failing later with "Can't call method ... on an undefined value".
throw("Failed to connect to '$dsn': " . ($@ || 'unknown error'))
    unless defined $dbh;

# get core database(s): either the single database given with -dbname or
# every database whose name contains "core_<version>".
my $pat = defined $dbname ? $dbname : "%core_$version%";
my $sql = "show databases like '$pat'";
my $cdbs = $dbh->selectcol_arrayref($sql);
# Apply the display rules (see the DESCRIPTION section of the POD) to every
# core database selected by -dbname / -version. All changes go through
# update_analysis(), which only writes to the database when -update was given.
foreach my $cdb (@$cdbs) {
    # Species name = database name with the "_core_<version>_<digits><letter>"
    # suffix removed; if the name does not match, the full name is used.
    (my $species = $cdb) =~ s/(.+)_core_${version}_\d+[a-z]$/$1/;
    #print Dumper $species;

    # Forget the otherfeatures adaptor of the previously processed database so
    # that a species without an otherfeatures db can never update another
    # species' database by accident.
    $ofaa = undef;

    my $cdba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
        -host    => $dbhost,
        -user    => $dbuser,
        -dbname  => $cdb,
        -pass    => $dbpass,
        -port    => $dbport,
        -species => $species
    );
    #print Dumper $cdba;
    $caa = $cdba->get_AnalysisAdaptor();

    ### implements rule 1. ###
    my $cdaf_logic_names = get_af_logic_names($cdba, 'dna');
    #print Dumper $cdaf_logic_names;

    (my $ofdb = $cdb) =~ s/_core_/_otherfeatures_/;
    $sql = "show databases like '$ofdb'";
    my $ofdbs = $dbh->selectcol_arrayref($sql);

    if (scalar(@$ofdbs) == 0) {
        print("No otherfeatures db for " . $cdb . "! Setting all displayable entires to 1\n");
        # Same list as queried above; no need to hit the database again.
        foreach my $ln (@$cdaf_logic_names) {
            update_analysis($caa, $ln, 1);
        }
    } else {
        print("Both core and otherfeatures dbs exist. Need to analyse dna_align_features ...\n");
        my $ofdba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
            -host    => $dbhost,
            -user    => $dbuser,
            -dbname  => $ofdbs->[0],
            -pass    => $dbpass,
            -port    => $dbport,
            -species => $species,
            -group   => 'otherfeatures'
        );
        #print Dumper $ofdba;
        $ofaa = $ofdba->get_AnalysisAdaptor();
        my $ofdaf_logic_names = get_af_logic_names($ofdba, 'dna');
        #print Dumper $ofdaf_logic_names;

        # Count in how many of the two databases each (lower-cased) logic
        # name occurs: 2 means it is present in both.
        my %daf_logic_names;
        foreach my $ln (@$cdaf_logic_names, @$ofdaf_logic_names) {
            $daf_logic_names{lc($ln)}++;
        }

        foreach my $ln (@$ofdaf_logic_names) {
            if ($daf_logic_names{lc($ln)} == 2) {
                print("<$ln> exists in both, setting displayable 0 for core and 1 for otherfeatures\n");
                update_analysis($caa, $ln, 0);
                update_analysis($ofaa, $ln, 1);
            } else {
                print("<$ln> exists only in otherfeatures, setting displayable according to reference file\n");
                update_analysis($ofaa, $ln, $reference{lc($ln)}{displayable});
            }
            delete $daf_logic_names{lc($ln)};
        }

        # Whatever is left was seen in core only.
        foreach my $ln (keys %daf_logic_names) {
            print("<$ln> exists only in core, setting displayable according to reference file\n");
            update_analysis($caa, $ln, $reference{lc($ln)}{displayable});
        }
    }

    if ($species =~ m/^(homo_sapiens|mus_musculus)/) {
        ### new rule:in human and mouse, the only align_features on by default
        ### are cDNA update and CCDS
        my $core_daf_logic_names = get_af_logic_names($cdba, 'dna');
        my $core_paf_logic_names = get_af_logic_names($cdba, 'protein');
        foreach my $ln (@$core_daf_logic_names, @$core_paf_logic_names) {
            next if ($ln eq 'CCDS'); #this does not apply to CCDS
            my $ad = $caa->fetch_by_logic_name($ln);
            my $stored_web_data = $ad->web_data();
            if (defined $stored_web_data and $stored_web_data ne '') {
                # Drop the 'default' entry so the track is off by default.
                # Note this edits the hash held by the analysis object itself.
                my $new_web_data = $stored_web_data;
                delete $new_web_data->{'default'};
                # print "<$cdb> Switching align_feature '$ln' from " . $caa->dump_data($ad->web_data()) . " to " . $caa->dump_data($new_web_data) ."\n";
                update_analysis($caa, $ln, $ad->displayable(), $ad->display_label, $new_web_data);
            }
        }

        ### implements rule 2. and 3. ###
        (my $cdnadb = $cdb) =~ s/_core_/_cdna_/;
        my $cdnadba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
            -host    => $dbhost,
            -user    => $dbuser,
            -dbname  => $cdnadb,
            -pass    => $dbpass,
            -port    => $dbport,
            -species => $species,
            -group   => 'cdna'
        );
        #print Dumper $cdnadba;
        $cdnaaa = $cdnadba->get_AnalysisAdaptor();

        my %alias = (
            'homo_sapiens' => 'Human',
            'mus_musculus' => 'Mouse'
        );
        my $ln = lc($alias{$species}).'_cdna';
        print "<$cdb> Switching off displayable for $ln\n";
        update_analysis($caa, $ln, 0);
        my $dl = $alias{$species}.' cDNA';
        print "<$cdnadb> Updating display_label for cDNA_update to '$dl'\n";
        update_analysis($cdnaaa, 'cDNA_update', 1, $dl);

        ### implements rule 4. ###
        (my $vegadb = $cdb) =~ s/_core_/_vega_/;
        my $vegadba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
            -host    => $dbhost,
            -user    => $dbuser,
            -dbname  => $vegadb,
            -pass    => $dbpass,
            -port    => $dbport,
            -species => $species,
            -group   => 'vega'
        );
        #print Dumper $vegadba;
        $vegaaa = $vegadba->get_AnalysisAdaptor();
        my $vega_daf_logic_names = get_af_logic_names($vegadba, 'dna');
        my $vega_paf_logic_names = get_af_logic_names($vegadba, 'protein');
        foreach my $ln (@$vega_daf_logic_names, @$vega_paf_logic_names) {
            print "<$vegadb> Switching align_feature '$ln' displayable off\n";
            update_analysis($vegaaa, $ln, 0);
        }

        ### implements rule 5. ###
        my $display_label = 'Vega Havana gene';
        print "<$cdb> Updating display_label for logic_name otter to '$display_label'\n";
        update_analysis($vegaaa, 'otter', undef, $display_label);

        # 6) There are two more rules for the logic_name 'ensembl' in core databases
        # depending on the species:
        # - all but mouse, human and anopheles, use definition file.
        # - anopheles needs a display_label of 'VectorBase gene'
        # - mouse and human need a different web_data column:
        # '{'colour_key' => '[biotype]_[status]',
        # 'caption' => 'Ensembl/Havana gene','name' => 'Merged Ensembl and Havana Genes',
        # 'label_key' => '[text_label] [display_label]',
        # 'default' => {'contigviewbottom' => 'transcript_label',
        # 'contigviewtop' => 'gene_label',
        # 'cytoview' => 'gene_label'},'key' => 'ensembl'}'
        #web_data should be a hash reference
        my $web_data = {'colour_key' => '[biotype]_[status]',
                        'caption'    => 'Ensembl/Havana gene',
                        'name'       => 'Merged Ensembl and Havana Genes',
                        'label_key'  => '[text_label] [display_label]',
                        'default'    => {'contigviewbottom' => 'transcript_label',
                                         'contigviewtop'    => 'gene_label',
                                         'cytoview'         => 'gene_label'},
                        'key'        => 'ensembl'};
        print "<ensembl> Updating web_data\n";
        update_analysis($caa, 'ensembl', undef, undef, $web_data);
    }

    if ($species =~ m/^(anopheles_gambiae)/) {
        print "<ensembl> Updating display_label to 'VectorBase gene'\n";
        update_analysis($caa, 'ensembl', undef, 'VectorBase gene');
        print "<anopheles_cdna_est> Switching displayable on\n";
        update_analysis($caa, 'anopheles_cdna_est', 1);
        if (defined $ofaa) {
            print "<anopheles_cdna_est> Updating display_label to 'RNA (best)'\n";
            update_analysis($ofaa, 'anopheles_cdna_est', undef, 'RNA (best)');
        } else {
            # The label lives in the otherfeatures db; without one there is
            # nothing to update (and no adaptor to call).
            warning("No otherfeatures db for $cdb, cannot update display_label of 'anopheles_cdna_est'");
        }
    }

    #rule 7
    if ($species =~ m/^(caenorhabditis_elegans)/) {
        # In C.elegans we have a couple of custom web_data columns that are
        # perhaps most easily patched by reading from another logic_name
        # - logic_name of 'ncRNA' has the same web_data as logic_name of 'tRNA'
        # - logic_name of 'Pseudogene' has the same web_data as logic_name of 'wormbase'
        print "<$cdb> Overwriting web_data for logic_name 'ncRNA' ".
              "with web_data from 'tRNA'\n";
        my $tRNA = $caa->fetch_by_logic_name('tRNA');
        update_analysis($caa, 'ncRNA', undef, undef, $tRNA->web_data());
        print "<$cdb> Overwriting web_data for logic_name 'Pseudogene' ".
              "with web_data from 'wormbase'\n";
        my $wormbase = $caa->fetch_by_logic_name('wormbase');
        update_analysis($caa, 'Pseudogene', undef, undef, $wormbase->web_data());
    }
}
# Return an array ref with the distinct logic_names of all analyses that have
# at least one row in the <molecule>_align_feature table.
#   $db       - object whose ->dbc->db_handle yields a DBI database handle
#   $molecule - table name prefix; this script passes 'dna' or 'protein'
sub get_af_logic_names {
    my $db       = shift;
    my $molecule = shift;

    my $table = $molecule . "_align_feature";
    my $query = "select distinct logic_name from $table daf, analysis a "
              . "where daf.analysis_id=a.analysis_id;";

    my $handle = $db->dbc->db_handle;
    return $handle->selectcol_arrayref($query);
}
# Change selected fields of one analysis and report every change on STDOUT.
#   $aa            - analysis adaptor used to fetch (and optionally store) it
#   $logic_name    - logic_name of the analysis to change
#   $displayable   - new displayable value, or undef to leave it untouched
#   $display_label - new display label, or undef to leave it untouched
#   $web_data      - new web_data, or undef to leave it untouched
# Throws if no analysis with that logic_name can be fetched. The modified
# object is only written back through the adaptor when the script-level
# $update flag is true; otherwise the call is a dry run.
sub update_analysis {
    my ($aa, $logic_name, $displayable, $display_label, $web_data) = @_;

    my $analysis = $aa->fetch_by_logic_name($logic_name);
    defined $analysis
        or throw("Analysis '$logic_name' is not defined");

    if (defined $displayable) {
        my $db_name = $aa->db->dbc->dbname;
        my $current = $analysis->displayable();
        print "\t[$db_name] Updating '$logic_name' displayable from '$current' to '$displayable'\n";
        $analysis->displayable($displayable);
    }

    if (defined $display_label) {
        my $db_name = $aa->db->dbc->dbname;
        my $current = $analysis->display_label();
        print "\t[$db_name] Updating '$logic_name' display_label from '$current' to '$display_label'\n";
        $analysis->display_label($display_label);
    }

    if (defined $web_data) {
        my $db_name = $aa->db->dbc->dbname;
        my $before  = $aa->dump_data($analysis->web_data());
        my $after   = $aa->dump_data($web_data);
        print "\t[$db_name] Updating '$logic_name' web_data from \"$before\" to \"$after\"\n";
        $analysis->web_data($web_data);
    }

    if ($update) {
        $aa->update($analysis);
    }
}
# Display this script's POD with perldoc. exec() replaces the running process,
# so the exit below is only reached when perldoc could not be started.
sub usage {
    my @viewer = ('perldoc', $0);
    exec @viewer;
    exit;
}
#!/usr/local/ensembl/bin/perl
=pod
=head1 NAME
check_web_data_column.pl
=head1 SYNOPSIS
script to check that the web_data column can be eval'd into a hash.
=head1 DESCRIPTION
This column has been giving problems due to the peculiarity of the data
(string containing quotes that has to be eval'd into a hash ref). In
order to check it, this script should be run after load_analysis_description
and apply_rules
=head1 OPTIONS
Database options
-dbhost host name for database (default=ens-staging)
-dbport For RDBs, what port to connect to (default=3306)
-dbname For RDBs, what name to connect to (dbname= in locator)
-pattern check databases matching this PATTERN
Note that this is a database pattern of the form %core_53_%
rather than a regular expression
-dbuser For RDBs, what username to connect as (dbuser= in locator)
-dbpass For RDBs, what password to use (dbpass= in locator)
-help print out documentation
=head1 EXAMPLES
In order to run it for a set of databases
perl check_web_data_column -dbuser ensro -pattern '%core_53%_'
=cut
use strict;
use warnings;
use Getopt::Long;
use DBI;
use DBD::mysql;
use Data::Dumper;
# DBI data source string and server handle.
my ($dsn, $dbh);

# Connection settings and selection criteria, overridable on the command line.
my $dbhost = 'ens-staging';
my $dbuser;
my $dbpass;
my $dbport = 3306;
my $dbname;
my $pattern;
my $help = 0;

GetOptions(
    'host|dbhost=s' => \$dbhost,
    'port|dbport=s' => \$dbport,
    'user|dbuser=s' => \$dbuser,
    'pass|dbpass=s' => \$dbpass,
    'dbname=s'      => \$dbname,
    'pattern=s'     => \$pattern,
    'help|h!'       => \$help
);

if (!$dbhost) {
    print("Need to pass a dbhost\n");
    $help = 1;
}

# At least one of -dbname / -pattern is needed to select databases.
if (!$dbname and !$pattern) {
    # Trailing newline added so the message does not run into the usage text.
    print("Need to enter either a database name in -dbname or a pattern in -pattern\n");
    $help = 1;
}

if ($help) {
    usage();
}
#connect to database (server level, no schema selected)
$dsn = "DBI:mysql:host=" . $dbhost . ";port=" . $dbport;
$dbh = eval {
    DBI->connect($dsn, $dbuser, $dbpass,
                 {'RaiseError' => 1,
                  'PrintError' => 0});
};

# With RaiseError set, connect() dies on failure. Report that error here
# instead of failing later with "Can't call method ... on an undefined value".
die "Failed to connect to '$dsn': " . ($@ || "unknown error\n")
    unless defined $dbh;

# get all database names that match pattern (-pattern wins over -dbname)
my ($sth, $sql);
my $sql_pattern = $pattern || $dbname;
$sql = "SHOW DATABASES LIKE '". $sql_pattern ."'";
$sth = $dbh->prepare($sql);
$sth->execute;
# Columns of the current analysis_description row (bound below).
my $analysis_id;
my $display_label;
my $web_data;

my $dbs_checked    = 0;    # number of databases that matched the pattern
my $problems_found = 0;    # set once any web_data value fails the check

#for each of the database, check the web_data in the analysis_description
while (my ($db) = $sth->fetchrow_array) {
    $dbs_checked++;

    $sql = qq{SELECT analysis_id, display_label, web_data from $db.analysis_description};
    my $sth1 = $dbh->prepare($sql);
    $sth1->execute;
    $sth1->bind_col(1, \$analysis_id);
    $sth1->bind_col(2, \$display_label);
    $sth1->bind_col(3, \$web_data);

    while ($sth1->fetch) {
        next unless (defined $web_data and $web_data ne '');

        # NOTE(review): string eval runs whatever Perl code is stored in the
        # column. That is the point of this check (the column is meant to be
        # eval'd into a hash ref), but only run it against trusted databases.
        my $ref_web_data = eval($web_data);
        my $eval_error   = $@;
        #print Dumper($ref_web_data);

        if (ref($ref_web_data) ne 'HASH') {
            # display_label can be NULL; print it as an empty string.
            my $label = defined $display_label ? $display_label : '';
            print "Analysis $analysis_id with display_label $label in $db has a wrong web_data column--$web_data-- cannot be eval into hash\n";
            if ($eval_error) {
                chomp $eval_error;
                print "\teval error: $eval_error\n";
            }
            $problems_found = 1;
        }
    }
}

if (!$dbs_checked) {
    # Nothing matched, so nothing was verified; do not claim success.
    print "No database matches $sql_pattern, nothing was checked\n";
}
elsif (!$problems_found) {
    print "Web data column in $sql_pattern looks right\n";
}
# Show the POD of this script via perldoc. exec() hands the process over to
# perldoc; the exit only runs if that hand-over failed.
sub usage {
    my @perldoc_cmd = ('perldoc', $0);
    exec @perldoc_cmd;
    exit;
}
#!/usr/local/ensembl/bin/perl -w
# POD documentation - main docs before the code
=pod
=head1 NAME
load_analysis_descriptions.pl
=head1 SYNOPSIS
script loads the analysis description table
=head1 DESCRIPTION
The script reads the analysis description file also found in this directory
analysis.descriptions and loads the descriptions which match the logic names in
the analysis table. Display labels are also set from this file.
It will warn about analyses present in the database which don't have descriptions
in the file.
To not update analyses in the database you need to pass the -noupdate option.
=head1 OPTIONS
Database options
-dbhost host name for database (gets put as host= in locator)
-dbport For RDBs, what port to connect to (port= in locator)
-dbname For RDBs, what name to connect to (dbname= in locator)
-dbuser For RDBs, what username to connect as (dbuser= in locator)
-dbpass For RDBs, what password to use (dbpass= in locator)
-file Path to file containing descriptions. The file