Skip to content
Snippets Groups Projects
Commit 0de6a0f5 authored by Magali Ruffier's avatar Magali Ruffier
Browse files

scripts are now part of ensembl-production pipeline

removing ensembl-core copies
parent 1339de5f
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env perl
# Calculate per-gene GC content and store as gene attributes
use warnings;
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Utils::CliHelper;
my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new();
# get the basic options for connecting to a database server
my $optsd = $cli_helper->get_dba_opts();
# add the print option
push( @{$optsd}, "print" );
# process the command line with the supplied options plus a help subroutine
my $opts = $cli_helper->process_args( $optsd, \&usage );
# use the command line options to get an array of database details
for my $db_args ( @{ $cli_helper->get_dba_args_for_opts($opts) } ) {
# use the args to create a DBA
my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$db_args} );
print STDOUT "Processing species "
. $dba->species_id()
. " from database "
. $dba->dbc()->dbname()
. " on server "
. $dba->dbc()->host() . "\n";
delete_existing($dba) unless ( $opts->{print} );
print STDOUT "Calculating Gene GC attributes\n";
my $attribute_adaptor = $dba->get_AttributeAdaptor();
my $total_count = 0;
for my $gene ( @{ $dba->get_GeneAdaptor()->fetch_all() } ) {
my $gc = $gene->feature_Slice()->get_base_count->{'%gc'};
if ( !$opts->{print} ) {
# attribute types need to be propagated from production database to all dbs
# if the type exists it won't be overwritten
my $attribute =
Bio::EnsEMBL::Attribute->new(
-CODE => 'GeneGC',
-NAME => 'Gene GC',
-DESCRIPTION => 'Percentage GC content for this gene',
-VALUE => $gc );
my @attributes = ($attribute);
$attribute_adaptor->store_on_Gene( $gene->dbID, \@attributes );
$total_count++;
} else {
print $gene->stable_id() . " " . $gc . "\n";
}
}
if ( !$opts->{print} ) {
print STDOUT "Written $total_count 'GeneGC' gene attributes to species "
. $dba->species_id()
. " from database "
. $dba->dbc()->dbname()
. " on server "
. $dba->dbc()->host() . "\n";
}
} ## end for my $db_args ( @{ $cli_helper...})
# ----------------------------------------------------------------------
sub delete_existing {
my $dba = shift;
print STDOUT "Deleting existing 'GeneGC' gene attributes\n";
$dba->dbc()->sql_helper()->execute_update(
-SQL =>
q/DELETE ga FROM gene_attrib ga, attrib_type at, gene g, seq_region s, coord_system c
WHERE at.attrib_type_id=ga.attrib_type_id AND at.code='GeneGC'
AND ga.gene_id=g.gene_id AND g.seq_region_id=s.seq_region_id
AND c.coord_system_id=s.coord_system_id AND c.species_id=?
/,
PARAMS => [ $dba->species_id() ] );
return;
}
sub usage {
my $indent = ' ' x length($0);
print <<EOF; exit(0);
What does it do?
This script calculates per-gene GC content and stores it as gene attributes.
It deletes existing Gene GC attributes. Then fetches all genes in the
core db and calculates the %gc for each gene.
Input data: dna sequence
Output tables: gene_attrib
When to run it in the release cycle?
It can be run whenever the genes and sequence are stable, i.e. any time after
the genebuild handover, but before the handover to Mart.
Which databases to run it on?
It needs to be run across all core databases for every release.
How long does it take?
It takes a total of about 10 hours to run for all core databases in normal queue,
Usage:
$0 -h host [-port port] -u user -p password \\
$indent -pattern pattern [-print] \\
$indent [-help] \\
-h|host Database host to connect to
-port Database port to connect to (default 3306)
-u|user Database username
-p|pass Password for user
-pattern Database name regexp
-print Just print, don't insert or delete attributes
-help This message
EOF
} ## end sub usage
#!/software/bin/perl
=head1 NAME
translation_attribs.pl - script to calculate peptide statistics and store
them in translation_attrib table
=head1 SYNOPSIS
translation_attribs.pl [arguments]
Required arguments:
--user=user username for the database
--pass=pass password for database
Optional arguments:
--pattern=pattern calculate translation attribs for databases matching pattern
Note that this is a standard regular expression of the
form '^[a-b].*core.*' for all core databases starting with a or b
--binpath=PATH directory where the binary script to calculate
pepstats is stored (default: /software/pubseq/bin/emboss)
--tmpdir=dir directory to store tmp results of pepstats (default=/tmp)
--host=host server where the core databases are stored (default: ens-staging)
--dbname=dbname if you want a single database to calculate the pepstats
(all databases by default)
--port=port port (default=3306)
--help print help (this message)
=head1 DESCRIPTION
This script will calculate the peptide statistics for all core databases in the server
and store them as a translation_attrib values
=head1 EXAMPLES
Calculate translation_attributes for all core databases in ens-staging
$ ./translation_attribs.pl --user ensadmin --pass password
Calculate translation_attributes for core databases starting with [a-c] in ens-staging (output LSF to PWD)
$ ./translation_attribs.pl --user ensadmin --pass password --pattern '^[a-c].*core_50.*'
Calculate translation_attribs for a single database in a ens-genomics1
$ ./translation_attribs.pl --host ens-genomics1 --user ensadmin --pass password --dbname my_core_db
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Daniel Rios <dani@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
=cut
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Translation;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::Attribute;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Data::Dumper;
use DBI;
use Bio::EnsEMBL::Utils::Exception qw(throw);
use Bio::EnsEMBL::Utils::CliHelper;
my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new();
# get the basic options for connecting to a database server
my $optsd = $cli_helper->get_dba_opts();
# add the print option
push( @{$optsd}, "binpath:s" );
push( @{$optsd}, "tmpdir:s" );
push( @{$optsd}, "verbose|v" );
# process the command line with the supplied options plus a help subroutine
my $opts = $cli_helper->process_args( $optsd, \&pod2usage );
$opts->{binpath} ||= '/software/pubseq/bin/emboss';
$opts->{tmpdir} ||= '/tmp';
$opts->{port} ||= '3306';
$opts->{host} ||= 'ens-staging';
$opts->{user} ||= 'ensro';
$opts->{pattern} ||= qr/.+_core_.+/;
my %PEPSTATS_CODES = ( 'Number of residues' => 'NumResidues',
'Molecular weight' => 'MolecularWeight',
'Ave. residue weight' => 'AvgResWeight',
'Charge' => 'Charge',
'Isoelectric point' => 'IsoPoint'
);
my %MET_AND_STOP = ( 'Starts with methionine' => 'starts_met',
'Contains stop codon' => 'has_stop_codon'
);
my %attributes_to_delete = (%PEPSTATS_CODES);
my $translation_attribs = {};
my $translation;
my $dbID;
# use the command line options to get an array of database details
for my $db_args ( @{ $cli_helper->get_dba_args_for_opts($opts) } ) {
# use the args to create a DBA
my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$db_args} );
my $dbname = $dba->dbc()->dbname();
print STDOUT "Processing species "
. $dba->species_id()
. " from database "
. $dba->dbc()->dbname()
. " on server "
. $dba->dbc()->host() . "\n";
if ( $dbname =~ /(vega|otherfeatures)/ ) {
my $other_dbname = $dbname;
$other_dbname =~ s/$1/core/;
$opts->{dbname} = $other_dbname;
#for vega and otherfeatures databases, add the core as the dna database
my $core_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$db_args}, -DBNAME => $other_dbname, -GROUP => 'core' );
$dba->dnadb($core_db);
}
print "Removing attributes from database ", $dba->dbc->dbname, "\n";
remove_old_attributes( $dba, \%attributes_to_delete);
my $translationAdaptor = $dba->get_TranslationAdaptor();
my $attributeAdaptor = $dba->get_AttributeAdaptor();
print
"Going to update translation_attribs in",
$dba->dbc->dbname, "\n";
#for all the translations in the database, run pepstats and update the translation_attrib table
my @translation_ids = @{
$dba->dbc()->sql_helper()->execute_simple(
-SQL=>"SELECT tl.translation_id FROM translation tl, transcript tr, seq_region s, coord_system c WHERE tl.transcript_id = tr.transcript_id AND tr.seq_region_id = s.seq_region_id AND s.coord_system_id = c.coord_system_id AND c.species_id = ? order by tl.translation_id",
-PARAMS=>[$dba->species_id()] ) };
my $tmpfile = $opts->{tmpdir} . "/$$.pep";
open( TMP, "> $tmpfile" ) || warn "PEPSTAT: $!";
print "Retrieving translations\n";
for my $dbID (@translation_ids) {
#foreach translation, retrieve object
my $translation = $translationAdaptor->fetch_by_dbID($dbID);
if ( $opts->{verbose} ) {
print "Dumping translation dbID, $dbID...\n";
}
my $peptide_seq = $translation->seq();
if ( !( $peptide_seq =~ m/[BZX]/ig ) ) {
if ( $peptide_seq !~ /\n$/ ) { $peptide_seq .= "\n" }
$peptide_seq =~ s/\*$//;
print TMP ">$dbID\n$peptide_seq";
} else {
print "Skipping translation dbID $dbID due to ambiguity codes...\n";
}
}
close(TMP);
print "Running pepstat\n";
my $PEPSTATS = $opts->{binpath} . '/bin/pepstats';
throw("pepstats binary at $PEPSTATS cannot be executed")
if ( !-x $PEPSTATS );
open( OUT, "$PEPSTATS -filter < $tmpfile 2>&1 |" )
|| warn "PEPSTAT: $!";
my @lines = <OUT>;
close(OUT);
unlink($tmpfile);
my $attribs = {};
my $tId;
print "Parsing pepstat\n";
foreach my $line (@lines) {
if ( $line =~ /PEPSTATS of ([^ ]+)/ ) {
$tId = $1;
} elsif ( defined $tId ) {
if ( $line =~ /^Molecular weight = (\S+)(\s+)Residues = (\d+).*/ ) {
$attribs->{$tId}{'Number of residues'} = $3;
$attribs->{$tId}{'Molecular weight'} = $1;
} elsif ( $line =~
/^Average(\s+)(\S+)(\s+)(\S+)(\s+)=(\s+)(\S+)(\s+)(\S+)(\s+)=(\s+)(\S+)/ )
{
$attribs->{$tId}{'Ave. residue weight'} = $7;
$attribs->{$tId}{'Charge'} = $12;
} elsif ( $line =~ /^Isoelectric(\s+)(\S+)(\s+)=(\s+)(\S+)/ ) {
$attribs->{$tId}{'Isoelectric point'} = $5;
} elsif ( $line =~ /FATAL/ ) {
print STDERR "pepstats: $line\n";
}
}
}
for my $id ( keys %$attribs ) {
my $aas = $attribs->{$id};
if ( $opts->{verbose} ) {
print "Storing attribs for translation dbID, $id...\n";
}
store_translation_attrib( $attributeAdaptor, $id, $aas );
}
} ## end for my $db_args ( @{ $cli_helper...})
#will remove any entries in the translation_attrib table for the attributes, if any
#this method will try to remove the old starts_met and has_stop_codon attributes, if present
#this is to allow to be run on old databases, but removing the not used attributes
sub remove_old_attributes {
my $dba = shift;
my $attributes = shift;
print "removing all translation attributes for db, "
. $dba->{_dbc}->{_dbname} . "\n";
foreach my $value ( values %{$attributes} ) {
my $sql = "delete ta FROM translation_attrib ta, attrib_type at, translation tl, transcript tr, seq_region s, coord_system c WHERE at.attrib_type_id = ta.attrib_type_id AND ta.translation_id=tl.translation_id and tl.transcript_id = tr.transcript_id AND tr.seq_region_id = s.seq_region_id AND s.coord_system_id = c.coord_system_id AND c.species_id = ? and at.code=?";
$dba->dbc()->sql_helper()->execute_update(-SQL=>$sql,-PARAMS=>[$dba->species_id(),$value]);
}
return;
} ## end sub remove_old_attributes
sub store_translation_attrib {
my ( $attributeAdaptor, $translation, $attribs ) = @_;
my @attribs;
for my $key ( keys %$attribs ) {
my $value = $attribs->{$key};
push @attribs,
Bio::EnsEMBL::Attribute->new( '-code' => $PEPSTATS_CODES{$key},
'-name' => $key,
'-value' => $value );
}
$attributeAdaptor->store_on_Translation( $translation, \@attribs );
return;
}
#!/usr/local/ensembl/bin/perl
=head1 NAME
translation_attribs_wrapper.pl
Script to calculate peptide statistics and store them in
translation_attrib table. This is mainly a wrapper around the
translation_attribs.pl script to submit several jobs to the farm.
=head1 SYNOPSIS
translation_attribs_wrapper.pl [arguments]
Required arguments:
--user=user username for the database
--pass=pass password for database
--release=release release number
Optional arguments:
--binpath=PATH directory where the binary script to
calculate pepstats is stored (default:
/software/pubseq/bin/emboss)
--tmpdir=directory directory to store temporary results of pepstats
(default: /tmp)
--host=host server where the core databases are stored
(default: ens-staging)
--port=port port (default: 3306)
--path=path path where the LSF output will be stored
(default: the current directory)
--help display help (this message)
=head1 DESCRIPTION
This script will calculate the peptide statistics for all core databases
in the server and store them as a translation_attrib values. This is a
wraper around the translation_attrib and will simply submit jobs to the
farm grouping the core databases in patterns.
=head1 EXAMPLES
Calculate translation_attributes for all databases in ens-staging
$ ./translation_attribs_wrapper.pl --user ensadmin \
--pass password --release 51 --path /my/path/to/lsf/output
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Daniel Rios <dani@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
=cut
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Utils::Exception qw(throw);
## Command line options
my $binpath = "'/software/pubseq/bin/emboss'";
my $tmpdir = "'/tmp'";
my $host = "ens-staging";
my $path = $ENV{PWD};
my $release = undef;
my $user = undef;
my $pass = undef;
my $port = 3306;
my $help = undef;
GetOptions('binpath=s' => \$binpath,
'tmpdir=s' => \$tmpdir,
'host=s' => \$host,
'user=s' => \$user,
'pass=s' => \$pass,
'port=s' => \$port,
'release=i' => \$release,
'path=s' => \$path,
'help' => \$help
);
pod2usage(1) if($help);
throw("--user argument required") if (!defined($user));
throw("--pass argument required") if (!defined($pass));
throw("--release argument required") if (!defined($release));
my $queue = 'long';
my $memory = "'select[mem>4000] rusage[mem=4000]' -M4000000";
my $options = '';
if (defined $binpath){
$options .= "--binpath $binpath ";
}
if (defined $tmpdir){
$options .= "--tmpdir $tmpdir "
}
if (defined $host){
$options .= "--host $host ";
}
if (defined $port){
$options .= "--port $port ";
}
my @ranges = ('^[a-b]','^c','^d','^e','^f','^[g-h]','^[i-l]','^m[a-i]','^m[j-z]','^[n-o]','^p','^[q-r]','^[s-t]','^[u-z]');
my $core_db = ".*core_$release\_.*";
my $call;
foreach my $pattern (@ranges){
$call = "bsub "
. "-R 'select[(myens_staging1<=800)&&(myens_staging2<=800)]' "
. "-o ${path}/output_translation_${pattern}.txt "
. "-e ${path}/output_translation_${pattern}.err "
. "-q $queue "
. "-R$memory "
. "perl ./translation_attribs.pl --user $user --pass $pass $options";
$call .= " --pattern '" . $pattern . $core_db . "'";
system($call);
#print $call,"\n";
}
#we now need to run it for the otherfeatures|vega databases, but only the pepstats
my $vega_db = ".*_vega_$release\_.*";
$call = "bsub -R 'select[(myens_staging1<=800)&&(myens_staging2<=800)]' -o '" . "$path/output_translation_vega.txt" . "' -q $queue -R$memory perl ./translation_attribs.pl --user $user --pass $pass $options";
$call .= " --pattern '" . $vega_db. "'";
system($call);
#print $call,"\n";
@ranges = ('^[a-b]','^c','^[d-e]','^[f-h]','^[i-m]','^[n-o]','^p','^[q-s]','^[t-z]');
my $other_db = ".*_otherfeatures_$release\_.*";
foreach my $pattern (@ranges){
$call = "bsub -R 'select[(myens_staging1<=800)&&(myens_staging2<=800)]' -o '" . "$path/output_translation_other_$pattern.txt" . "' -q $queue -R$memory perl ./translation_attribs.pl --user $user --pass $pass $options";
$call .= " --pattern '" . $pattern . $other_db. "'";
system($call);
#print $call,"\n";
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment