scripts are now part of ensembl-production pipeline

removing ensembl-core copies

scripts are now part of ensembl-production pipeline
removing ensembl-core copies
0de6a0f5 · Magali Ruffier · 1339de5f · 1339de5f · 1339de5f · 1339de5f
Commit 0de6a0f5 authored 11 years ago by Magali Ruffier
--- a/misc-scripts/gene_gc.pl
+++ b/misc-scripts/gene_gc.pl
-#!/usr/bin/env perl
-# Calculate per-gene GC content and store as gene attributes
-use warnings;
-use strict;
-use Bio::EnsEMBL::DBSQL::DBAdaptor;
-use Bio::EnsEMBL::Utils::CliHelper;
-my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new();
-# get the basic options for connecting to a database server
-my $optsd = $cli_helper->get_dba_opts();
-# add the print option
-push( @{$optsd}, "print" );
-# process the command line with the supplied options plus a help subroutine
-my $opts = $cli_helper->process_args( $optsd, \&usage );
-# use the command line options to get an array of database details
-for my $db_args ( @{ $cli_helper->get_dba_args_for_opts($opts) } ) {
-	# use the args to create a DBA
-	my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$db_args} );
-	print STDOUT "Processing species "
-	  . $dba->species_id()
-	  . " from database "
-	  . $dba->dbc()->dbname()
-	  . " on server "
-	  . $dba->dbc()->host() . "\n";
-	delete_existing($dba) unless ( $opts->{print} );
-	print STDOUT "Calculating Gene GC attributes\n";
-	my $attribute_adaptor = $dba->get_AttributeAdaptor();
-	my $total_count       = 0;
-	for my $gene ( @{ $dba->get_GeneAdaptor()->fetch_all() } ) {
-		my $gc = $gene->feature_Slice()->get_base_count->{'%gc'};
-		if ( !$opts->{print} ) {
-	 # attribute types need to be propagated from production database to all dbs
-	 # if the type exists it won't be overwritten
-			my $attribute =
-			  Bio::EnsEMBL::Attribute->new(
-						  -CODE        => 'GeneGC',
-						  -NAME        => 'Gene GC',
-						  -DESCRIPTION => 'Percentage GC content for this gene',
-						  -VALUE       => $gc );
-			my @attributes = ($attribute);
-			$attribute_adaptor->store_on_Gene( $gene->dbID, \@attributes );
-			$total_count++;
-		} else {
-			print $gene->stable_id() . " " . $gc . "\n";
-		}
-	}
-	if ( !$opts->{print} ) {
-		print STDOUT "Written $total_count 'GeneGC' gene attributes to species "
-		  . $dba->species_id()
-		  . " from database "
-		  . $dba->dbc()->dbname()
-		  . " on server "
-		  . $dba->dbc()->host() . "\n";
-	}
-} ## end for my $db_args ( @{ $cli_helper...})
-# ----------------------------------------------------------------------
-sub delete_existing {
-	my $dba = shift;
-	print STDOUT "Deleting existing 'GeneGC' gene attributes\n";
-	$dba->dbc()->sql_helper()->execute_update(
-		-SQL =>
-q/DELETE ga FROM gene_attrib ga, attrib_type at, gene g, seq_region s, coord_system c 
-WHERE at.attrib_type_id=ga.attrib_type_id AND at.code='GeneGC'
-AND ga.gene_id=g.gene_id AND g.seq_region_id=s.seq_region_id 
-AND c.coord_system_id=s.coord_system_id AND c.species_id=?
-/,
-		PARAMS => [ $dba->species_id() ] );
-	return;
-}
-sub usage {
-	my $indent = ' ' x length($0);
-	print <<EOF; exit(0);
-What does it do?
-This script calculates per-gene GC content and stores it as gene attributes.
-It deletes existing Gene GC attributes. Then fetches all genes in the
-core db and calculates the %gc for each gene.
-Input data: dna sequence 
-Output tables: gene_attrib 
-When to run it in the release cycle?
-It can be run whenever the genes and sequence are stable, i.e. any time after 
-the genebuild handover, but before the handover to Mart.
-Which databases to run it on?
-It needs to be run across all core databases for every release.
-How long does it take?
-It takes a total of about 10 hours to run for all core databases in normal queue,
-Usage:
-  $0 -h host [-port port] -u user -p password \\
-  $indent -pattern pattern [-print] \\
-  $indent [-help]  \\
-  -h|host              Database host to connect to
-  -port                Database port to connect to (default 3306)
-  -u|user              Database username
-  -p|pass              Password for user
-  -pattern             Database name regexp
-  -print               Just print, don't insert or delete attributes
-  -help                This message
-EOF
-} ## end sub usage
--- a/misc-scripts/translation_attribs.pl
+++ b/misc-scripts/translation_attribs.pl
-#!/software/bin/perl
-=head1 NAME
-translation_attribs.pl - script to calculate peptide statistics and store 
-                         them in translation_attrib table
-=head1 SYNOPSIS
-translation_attribs.pl [arguments]
-Required arguments:
-  --user=user                         username for the database
-  --pass=pass                         password for database
-Optional arguments:
-  --pattern=pattern                   calculate translation attribs for databases matching pattern
-                                      Note that this is a standard regular expression of the
-                                      form '^[a-b].*core.*' for all core databases starting with a or b
-  --binpath=PATH                      directory where the binary script to calculate 
-                                      pepstats is stored (default: /software/pubseq/bin/emboss)
-  --tmpdir=dir                        directory to store tmp results of pepstats (default=/tmp)
-  --host=host                         server where the core databases are stored (default: ens-staging)
-  --dbname=dbname                     if you want a single database to calculate the pepstats
-                                      (all databases by default)
-  --port=port                         port (default=3306)
-  --help                              print help (this message)
-=head1 DESCRIPTION
-This script will calculate the peptide statistics for all core databases in the server 
-and store them as a translation_attrib values
-=head1 EXAMPLES
-Calculate translation_attributes for all core databases in ens-staging 
-  $ ./translation_attribs.pl --user ensadmin --pass password
-Calculate translation_attributes for core databases starting with [a-c] in ens-staging (output LSF to PWD) 
-  $ ./translation_attribs.pl --user ensadmin --pass password --pattern '^[a-c].*core_50.*'
-Calculate translation_attribs for a single database in a ens-genomics1
-  $ ./translation_attribs.pl  --host ens-genomics1 --user ensadmin --pass password --dbname my_core_db
-=head1 LICENCE
-This code is distributed under an Apache style licence. Please see
-http://www.ensembl.org/info/about/code_licence.html for details.
-=head1 AUTHOR
-Daniel Rios <dani@ebi.ac.uk>, Ensembl core API team
-=head1 CONTACT
-=cut
-use strict;
-use warnings;
-use Getopt::Long;
-use Pod::Usage;
-use Bio::EnsEMBL::Translation;
-use Bio::EnsEMBL::Registry;
-use Bio::EnsEMBL::Attribute;
-use Bio::EnsEMBL::DBSQL::DBAdaptor;
-use Data::Dumper;
-use DBI;
-use Bio::EnsEMBL::Utils::Exception qw(throw);
-use Bio::EnsEMBL::Utils::CliHelper;
-my $cli_helper = Bio::EnsEMBL::Utils::CliHelper->new();
-# get the basic options for connecting to a database server
-my $optsd = $cli_helper->get_dba_opts();
-# add the print option
-push( @{$optsd}, "binpath:s" );
-push( @{$optsd}, "tmpdir:s" );
-push( @{$optsd}, "verbose|v" );
-# process the command line with the supplied options plus a help subroutine
-my $opts = $cli_helper->process_args( $optsd, \&pod2usage );
-$opts->{binpath} ||= '/software/pubseq/bin/emboss';
-$opts->{tmpdir}  ||= '/tmp';
-$opts->{port}    ||= '3306';
-$opts->{host}    ||= 'ens-staging';
-$opts->{user}    ||= 'ensro';
-$opts->{pattern} ||= qr/.+_core_.+/;
-my %PEPSTATS_CODES = ( 'Number of residues' => 'NumResidues',
-		       'Molecular weight'   => 'MolecularWeight',
-		       'Ave. residue weight' => 'AvgResWeight',
-		       'Charge' => 'Charge',
-		       'Isoelectric point' => 'IsoPoint'
-		      );
-my %MET_AND_STOP = ( 'Starts with methionine' => 'starts_met', 
-		     'Contains stop codon' => 'has_stop_codon'
-		    );
-my %attributes_to_delete = (%PEPSTATS_CODES);
-my $translation_attribs = {};
-my $translation;
-my $dbID;
-# use the command line options to get an array of database details
-for my $db_args ( @{ $cli_helper->get_dba_args_for_opts($opts) } ) {
-	# use the args to create a DBA
-	my $dba    = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$db_args} );
-	my $dbname = $dba->dbc()->dbname();
-	print STDOUT "Processing species "
-	  . $dba->species_id()
-	  . " from database "
-	  . $dba->dbc()->dbname()
-	  . " on server "
-	  . $dba->dbc()->host() . "\n";
-	if ( $dbname =~ /(vega|otherfeatures)/ ) {
-		my $other_dbname = $dbname;
-		$other_dbname =~ s/$1/core/;
-		$opts->{dbname} = $other_dbname;
-		#for vega and otherfeatures databases, add the core as the dna database
-		my $core_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$db_args}, -DBNAME => $other_dbname, -GROUP => 'core' );
-		$dba->dnadb($core_db);
-	}
-	print "Removing attributes from database ", $dba->dbc->dbname, "\n";
-	remove_old_attributes( $dba, \%attributes_to_delete);
-	my $translationAdaptor = $dba->get_TranslationAdaptor();
-	my $attributeAdaptor   = $dba->get_AttributeAdaptor();
-	print
-	  "Going to update translation_attribs in",
-	  $dba->dbc->dbname, "\n";
-#for all the translations in the database, run pepstats and update the translation_attrib table
-	my @translation_ids = @{
-		$dba->dbc()->sql_helper()->execute_simple(
-SQL=>"SELECT tl.translation_id FROM translation tl, transcript tr, seq_region s, coord_system c WHERE tl.transcript_id = tr.transcript_id AND tr.seq_region_id = s.seq_region_id AND s.coord_system_id = c.coord_system_id AND c.species_id = ? order by tl.translation_id",
-			-PARAMS=>[$dba->species_id()] ) };
-	my $tmpfile      = $opts->{tmpdir} . "/$$.pep";
-	open( TMP, "> $tmpfile" ) || warn "PEPSTAT: $!";
-	print "Retrieving translations\n";
-	for my $dbID (@translation_ids) {
-		#foreach translation, retrieve object
-		my $translation = $translationAdaptor->fetch_by_dbID($dbID);
-		if ( $opts->{verbose} ) {
-			print "Dumping translation dbID, $dbID...\n";
-		}
-		my $peptide_seq = $translation->seq();
-		if ( !( $peptide_seq =~ m/[BZX]/ig ) ) {
-			if ( $peptide_seq !~ /\n$/ ) { $peptide_seq .= "\n" }
-			$peptide_seq =~ s/\*$//;
-			print TMP ">$dbID\n$peptide_seq";
-		} else {
-			print "Skipping translation dbID $dbID due to ambiguity codes...\n";
-		}
-	}
-	close(TMP);
-	print "Running pepstat\n";
-	my $PEPSTATS = $opts->{binpath} . '/bin/pepstats';
-	throw("pepstats binary at  $PEPSTATS cannot be executed")
-	  if ( !-x $PEPSTATS );
-	open( OUT, "$PEPSTATS -filter < $tmpfile 2>&1 |" )
-	  || warn "PEPSTAT: $!";
-	my @lines = <OUT>;
-	close(OUT);
-	unlink($tmpfile);
-	my $attribs = {};
-	my $tId;
-	print "Parsing pepstat\n";
-	foreach my $line (@lines) {
-		if ( $line =~ /PEPSTATS of ([^ ]+)/ ) {
-			$tId = $1;
-		} elsif ( defined $tId ) {
-			if ( $line =~ /^Molecular weight = (\S+)(\s+)Residues = (\d+).*/ ) {
-				$attribs->{$tId}{'Number of residues'} = $3;
-				$attribs->{$tId}{'Molecular weight'}   = $1;
-			} elsif ( $line =~
-/^Average(\s+)(\S+)(\s+)(\S+)(\s+)=(\s+)(\S+)(\s+)(\S+)(\s+)=(\s+)(\S+)/ )
-			{
-				$attribs->{$tId}{'Ave. residue weight'} = $7;
-				$attribs->{$tId}{'Charge'}              = $12;
-			} elsif ( $line =~ /^Isoelectric(\s+)(\S+)(\s+)=(\s+)(\S+)/ ) {
-				$attribs->{$tId}{'Isoelectric point'} = $5;
-			} elsif ( $line =~ /FATAL/ ) {
-				print STDERR "pepstats: $line\n";
-			}
-		}
-	}
-	for my $id ( keys %$attribs ) {
-		my $aas         = $attribs->{$id};
-		if ( $opts->{verbose} ) {
-			print "Storing attribs for translation dbID, $id...\n";
-		}
-		store_translation_attrib( $attributeAdaptor, $id, $aas );
-	}
-} ## end for my $db_args ( @{ $cli_helper...})
-#will remove any entries in the translation_attrib table for the attributes, if any
-#this method will try to remove the old starts_met and has_stop_codon attributes, if present
-#this is to allow to be run on old databases, but removing the not used attributes
-sub remove_old_attributes {
-	my $dba        = shift;
-	my $attributes = shift;
-		print "removing all translation attributes for db, "
-		  . $dba->{_dbc}->{_dbname} . "\n";
-		foreach my $value ( values %{$attributes} ) {
-		    my $sql = "delete ta FROM translation_attrib ta, attrib_type at, translation tl, transcript tr, seq_region s, coord_system c  WHERE at.attrib_type_id = ta.attrib_type_id AND ta.translation_id=tl.translation_id and  tl.transcript_id = tr.transcript_id AND tr.seq_region_id = s.seq_region_id AND s.coord_system_id = c.coord_system_id AND c.species_id = ? and at.code=?";
-			$dba->dbc()->sql_helper()->execute_update(-SQL=>$sql,-PARAMS=>[$dba->species_id(),$value]);
-		}
-		return;
-} ## end sub remove_old_attributes
-sub store_translation_attrib {
-	my ( $attributeAdaptor, $translation, $attribs ) = @_;
-	my @attribs;
-	for my $key ( keys %$attribs ) {
-		my $value = $attribs->{$key};
-		push @attribs,
-		  Bio::EnsEMBL::Attribute->new( '-code'  => $PEPSTATS_CODES{$key},
-										'-name'  => $key,
-										'-value' => $value );
-	}
-	$attributeAdaptor->store_on_Translation( $translation, \@attribs );
-	return;
-}
--- a/misc-scripts/translation_attribs_wrapper.pl
+++ b/misc-scripts/translation_attribs_wrapper.pl
-#!/usr/local/ensembl/bin/perl
-=head1 NAME
-translation_attribs_wrapper.pl
-Script to calculate peptide statistics and store them in
-translation_attrib table.  This is mainly a wrapper around the
-translation_attribs.pl script to submit several jobs to the farm.
-=head1 SYNOPSIS
-translation_attribs_wrapper.pl [arguments]
-Required arguments:
-  --user=user           username for the database
-  --pass=pass           password for database
-  --release=release     release number
-Optional arguments:
-  --binpath=PATH        directory where the binary script to
-                        calculate pepstats is stored (default:
-                        /software/pubseq/bin/emboss)
-  --tmpdir=directory    directory to store temporary results of pepstats
-                        (default: /tmp)
-  --host=host           server where the core databases are stored
-                        (default: ens-staging)
-  --port=port           port (default: 3306)
-  --path=path           path where the LSF output will be stored
-                        (default: the current directory)
-  --help                display help (this message)
-=head1 DESCRIPTION
-This script will calculate the peptide statistics for all core databases
-in the server and store them as a translation_attrib values.  This is a
-wraper around the translation_attrib and will simply submit jobs to the
-farm grouping the core databases in patterns.
-=head1 EXAMPLES
-Calculate translation_attributes for all databases in ens-staging
-  $ ./translation_attribs_wrapper.pl --user ensadmin \
-    --pass password --release 51 --path /my/path/to/lsf/output
-=head1 LICENCE
-This code is distributed under an Apache style licence. Please see
-http://www.ensembl.org/info/about/code_licence.html for details.
-=head1 AUTHOR
-Daniel Rios <dani@ebi.ac.uk>, Ensembl core API team
-=head1 CONTACT
-=cut
-use strict;
-use warnings;
-use Getopt::Long;
-use Pod::Usage;
-use Bio::EnsEMBL::Utils::Exception qw(throw);
-## Command line options
-my $binpath = "'/software/pubseq/bin/emboss'"; 
-my $tmpdir = "'/tmp'";
-my $host = "ens-staging";
-my $path = $ENV{PWD};
-my $release = undef;
-my $user = undef;
-my $pass = undef;
-my $port = 3306;
-my $help = undef;
-GetOptions('binpath=s' => \$binpath,
-	   'tmpdir=s' => \$tmpdir,
-	   'host=s'    => \$host,
-	   'user=s'    => \$user,
-	   'pass=s'    => \$pass,
-	   'port=s'    => \$port,
-	   'release=i' => \$release,
-	   'path=s'    => \$path,
-	   'help'    => \$help
-	   );
-pod2usage(1) if($help);
-throw("--user argument required") if (!defined($user));
-throw("--pass argument required") if (!defined($pass));
-throw("--release argument required") if (!defined($release));
-my $queue = 'long';
-my $memory = "'select[mem>4000] rusage[mem=4000]' -M4000000";
-my $options = '';
-if (defined $binpath){
-    $options .= "--binpath $binpath ";
-}
-if (defined $tmpdir){
-    $options .= "--tmpdir $tmpdir "
-}
-if (defined $host){
-    $options .= "--host $host ";
-}
-if (defined $port){
-    $options .= "--port $port ";
-}
-my @ranges = ('^[a-b]','^c','^d','^e','^f','^[g-h]','^[i-l]','^m[a-i]','^m[j-z]','^[n-o]','^p','^[q-r]','^[s-t]','^[u-z]');
-my $core_db = ".*core_$release\_.*";
-my $call;
-foreach my $pattern (@ranges){
-    $call = "bsub "
-      . "-R 'select[(myens_staging1<=800)&&(myens_staging2<=800)]' "
-      . "-o ${path}/output_translation_${pattern}.txt "
-      . "-e ${path}/output_translation_${pattern}.err "
-      . "-q $queue "
-      . "-R$memory "
-      . "perl ./translation_attribs.pl --user $user --pass $pass $options";
-    $call .= " --pattern '" . $pattern . $core_db . "'";
-    system($call);
-#print $call,"\n";
-}
-#we now need to run it for the otherfeatures|vega databases, but only the pepstats
-my $vega_db = ".*_vega_$release\_.*";
-$call = "bsub -R 'select[(myens_staging1<=800)&&(myens_staging2<=800)]' -o '" . "$path/output_translation_vega.txt" . "' -q $queue -R$memory perl ./translation_attribs.pl --user $user --pass $pass $options";
-$call .= " --pattern '" . $vega_db. "'";
-system($call);
-#print $call,"\n";
-@ranges = ('^[a-b]','^c','^[d-e]','^[f-h]','^[i-m]','^[n-o]','^p','^[q-s]','^[t-z]');
-my $other_db = ".*_otherfeatures_$release\_.*";
-foreach my $pattern (@ranges){
-    $call = "bsub -R 'select[(myens_staging1<=800)&&(myens_staging2<=800)]' -o '" . "$path/output_translation_other_$pattern.txt" . "' -q $queue -R$memory perl ./translation_attribs.pl --user $user --pass $pass $options";
-    $call .= " --pattern '" . $pattern . $other_db. "'";
-    system($call);
-#print $call,"\n";
-}