From 80c7932ea7f300c036ac5394692bf6ae04f8b6c1 Mon Sep 17 00:00:00 2001
From: Monika Komorowska <mk8@sanger.ac.uk>
Date: Thu, 23 Jun 2011 09:59:42 +0000
Subject: [PATCH] New script for submitting density feature jobs

---
 .../submit_density_features.pl                | 375 ++++++++++++++++++
 1 file changed, 375 insertions(+)
 create mode 100644 misc-scripts/density_feature/submit_density_features.pl

diff --git a/misc-scripts/density_feature/submit_density_features.pl b/misc-scripts/density_feature/submit_density_features.pl
new file mode 100644
index 0000000000..029c0c854c
--- /dev/null
+++ b/misc-scripts/density_feature/submit_density_features.pl
@@ -0,0 +1,375 @@
+
+# The script lists databases/species which should have density features updated at the specified stage in the release cycle. There's an option for submitting a selected script. 
+
+use strict;
+use Getopt::Long;
+use DBI qw( :sql_types );
+use Switch;
+
+# Command-line state: two mode switches, the output directory and one entry per database server in each list.
+my $getdbs;
+my $submit_script;
+my $outdir;
+my ( @host, @user, @pass, @port );
+
+GetOptions(
+    "getdbs|g"   => \$getdbs,
+    "submit|s=s" => \$submit_script,
+    "outdir|o=s" => \$outdir,
+    "host|h=s"   => \@host,
+    "user|u=s"   => \@user,
+    "pass|p=s"   => \@pass,
+    "port=s"     => \@port,
+    "help"       => \&usage,
+);
+
+my $host_count = @host;
+my $user_count = @user;
+my $pass_count = @pass;
+my $port_count = @port;
+
+# Every host needs its own user and password. The lists are tested for
+# emptiness directly because defined(@array) is a fatal error from Perl 5.22 on.
+usage() if ( !@host || !@user || !@pass || $user_count != $host_count || $pass_count != $host_count );
+
+# Pipe-separated host list, matched later against host names read from the production db and the data files.
+my $host_string = join("\|",@host);
+
+# Hosts given without a matching -port fall back to port 3306.
+for (my $i=0; $i<$host_count;$i++) {
+    if (!defined $port[$i]) {
+        $port[$i] = 3306;
+    }
+}
+
+
+# Location of the production/master database and the account used to read it.
+my ( $mhost, $mport ) = ( 'ens-staging1', '3306' );
+my ( $muser, $mpass ) = ( 'ensro',        undef );
+my $mdbname = 'ensembl_production';
+
+# Connect to the production database; RaiseError makes DBI die on any database error.
+my $prod_dsn = sprintf( 'DBI:mysql:host=%s;port=%d;database=%s',
+                     $mhost, $mport, $mdbname );
+my $prod_dbh = DBI->connect( $prod_dsn, $muser, $mpass,
+                          { 'PrintError' => 1, 'RaiseError' => 1 } );
+
+# Highest release number among the databases flagged as current.
+my ($current_release ) = $prod_dbh->selectrow_array('select max(db_release) from db where is_current = 1');
+
+# Staging servers; qw() splits on whitespace only, so the names must not be
+# comma-separated. Nothing else in this script reads this list yet.
+my @hosts = qw(ens-staging1 ens-staging2);
+
+# Farm job logs go to the current directory unless -outdir was given.
+if (!$outdir) {
+    $outdir = $ENV{'PWD'};
+} else {
+    # strip the final /
+    $outdir =~ s/\/$//;
+}
+
+# With no -submit option the script only lists what needs to be run.
+if ( !defined $submit_script ) { $getdbs = 1; }
+
+if ( defined $getdbs ) { 
+# ask the user which stage of the release cycle we're in
+  # Menu of release-cycle stages; shown again after every rejected answer.
+  my $stage_menu = <<CYCLE;
+
+Where in the release cycle are we (0,1,2,3)?
+
+0 - None of the below - exit program
+1 - Genebuild and genebuild xrefs complete (excluding projected xrefs), all gene healthchecks cleared
+2 - Compara homologies handed over and core xref projections complete
+3 - Variation dbs handed over
+
+CYCLE
+
+  # $response ends up holding a single digit 0-3 and is read by the stage blocks that follow.
+  my $response;
+  print $stage_menu;
+
+  while ( !defined $response ) {
+    # Read from STDIN explicitly: the bare <> operator would try to open any
+    # leftover command-line arguments as input files.
+    my $answer = <STDIN>;
+
+    # End of input: nothing more to ask, leave as if 0 had been chosen.
+    exit(0) if !defined $answer;
+
+    # Accept exactly one digit in the range 0-3, ignoring surrounding whitespace.
+    if ( $answer =~ /^\s*([0-3])\s*$/ ) {
+      $response = $1;
+    } else {
+      print "\nPlease specify a valid option: 0, 1, 2 or 3:\n", $stage_menu;
+    }
+  }
+
+  # Option 0 means there is nothing to list.
+  exit(0) if $response == 0;
+
+
+
+#core dbs of new species or with a changed assembly (db_name|host strings)
+my @new_sp_assem;
+
+#core dbs with a changed gene set
+my @chg_seq;
+
+#core dbs with changed repeat masking
+my @chg_repeats;
+
+# Stage 1: look up the affected core dbs, print the gene_gc.pl commands and write the input files for the percent_gc and repeat_coverage submissions.
+if ($response >= 1) {
+    # Each query returns one db_name|host string per matching core db.
+    #get new dbs, or changed assembly
+    @new_sp_assem =  map { $_->[0] }  @{ $prod_dbh->selectall_arrayref("select distinct concat(full_db_name,'|',db_host) from db_list dl join db d using (db_id) where is_current = 1 and db_type = 'core' and species_id not in (select distinct species_id from db where is_current <> 1 and db_type = 'core') union
+select distinct concat(full_db_name,'|',db_host) from db_list dl join db d using (db_id) where db_type = 'core' and is_current = 1 and species_id in (select distinct species_id from changelog_species cs join changelog c using (changelog_id) where is_current = 1 and status not in ('cancelled','postponed') and assembly = 'Y')") };
+    #get dbs with changed sequence
+    @chg_seq = map { $_->[0] }  @{ $prod_dbh->selectall_arrayref("select distinct concat(full_db_name,'|',db_host) from db_list dl join db d using (db_id) where db_type = 'core' and is_current = 1 and species_id in (select distinct species_id from changelog_species cs join changelog c using (changelog_id) where is_current = 1 and status not in ('cancelled','postponed') and gene_set = 'Y')") };
+
+    #get dbs with changed repeats
+    @chg_repeats =  map { $_->[0] }  @{ $prod_dbh->selectall_arrayref("select distinct concat(full_db_name,'|',db_host) from db_list dl join db d using (db_id) where db_type = 'core' and is_current = 1 and species_id in (select distinct species_id from changelog_species cs join changelog c using (changelog_id) where is_current = 1  and status not in ('cancelled','postponed') and repeat_masking = 'Y')") };
+
+    # Writes every core db from the list passed in that lives on one of the
+    # hosts given on the command line to $file_path, as tab delimited db name
+    # and host, and echoes the same lines to STDOUT.
+    my $write_db_list = sub {
+        my ($file_path, @dbnames_hosts) = @_;
+        open(my $datafile, '>', $file_path) or die("Failed to open file $file_path for writing\n");
+        foreach my $dbname_host (@dbnames_hosts) {
+            my ($db_name, $host) = split(/\|/,$dbname_host);
+            # An empty host would turn the match below into an empty pattern.
+            next if !defined $host || $host eq '';
+            # \Q...\E keeps dots and other regex characters in host names literal.
+            if ( ($db_name =~ /core_/) && ( $host_string =~ /\Q$host\E/) ) {
+                print {$datafile} $db_name."\t".$host."\n";
+                print $db_name."\t".$host."\n";
+            }
+        }
+        # Buffered write errors only surface when the handle is closed.
+        close($datafile) or die("Failed to write file $file_path\n");
+    };
+
+    print "1. Density features scripts which can be run when genebuild and genebuild xrefs (excluding projected xrefs) are complete and all gene healthchecks are cleared: \n\n";
+
+    print "gene_gc.pl - run on all core databases (use the commands below or script submit_density_features.pl -submit gene_gc):\n";
+
+    for (my $i=0; $i<$host_count;$i++) {
+        print "\nbsub -q normal -J genegc_stats -oo ".$outdir."/core_dbs_".$current_release."_".$host[$i]."_genegc.out -eo ".$outdir. "/core_dbs_".$current_release."_".$host[$i]."_genegc.err perl ../gene_gc.pl -h ".$host[$i]." -port ".$port[$i]." -u ".$user[$i]." -p ".$pass[$i]." -pattern 'core_".$current_release."'\n";
+    }
+
+    print "\npercent_gc_calc.pl – run on core databases for new species, or where sequence or assembly have changed (db names will be stored in file ./percent_gc_data.txt, to submit run submit_density_features.pl -submit percent_gc): \n";
+
+    # Union of the new/changed-assembly and changed-gene-set lists, without duplicates.
+    my %array_union = ();
+    foreach my $element (@new_sp_assem, @chg_seq) { $array_union{$element}++ }
+    my @dbnames_hosts = sort(keys %array_union);
+    $write_db_list->("./percent_gc_data.txt", @dbnames_hosts);
+
+    print "\n\nrepeat_coverage_calc.pl – run on core databases for new species, or where sequence, assembly or repeats have changed (db names will be stored in file ./repeat_coverage_data.txt, to submit run submit_density_features.pl -submit repeat_coverage): \n";   
+
+    # The repeat coverage list additionally covers the dbs with changed repeats.
+    foreach my $element (@chg_repeats) { $array_union{$element}++ }
+    @dbnames_hosts = sort(keys %array_union);
+    $write_db_list->("./repeat_coverage_data.txt", @dbnames_hosts);
+
+}
+
+
+# Stage 2: jobs that have to wait for the Compara homologies and the core xref projections.
+if ($response >= 2) {
+    my @log_stems = map { $outdir."/core_dbs_".$current_release."_".$_ } @host;
+    print "\n\n2. Density features scripts which can be run when Compara homologies are handed over and core xref projections are complete:\n\n",
+          "gene_density_calc.pl - run on all core dbs (use the commands below or script submit_density_features.pl -submit gene_density)\n";
+    foreach my $idx (0 .. $#host) {
+        print "\nbsub -q normal -J gene_density -oo $log_stems[$idx]_gene.out -eo $log_stems[$idx]_gene.err perl ./gene_density_calc.pl -h $host[$idx] -port $port[$idx] -u $user[$idx] -p $pass[$idx] -pattern 'core_${current_release}'\n";
+    }
+    print "\n\nseq_region_stats.pl (gene stats option only) - run on all core databases (use the commands below or script submit_density_features.pl -submit seq_region_stats_gene)\n";
+    foreach my $idx (0 .. $#host) {
+        print "\nbsub -q normal -J seqreg_stats_gene -oo $log_stems[$idx]_seqreg_gene.out -eo $log_stems[$idx]_seqreg_gene.err perl ./seq_region_stats.pl -h $host[$idx] -port $port[$idx] -u $user[$idx] -p $pass[$idx] -pattern 'core_${current_release}' -s gene\n";
+    }
+}
+
+
+# Stage 3: print command templates whose {placeholders} have to be filled in by hand.
+if ($response == 3) {
+    # get core dbs for which variation db has changed
+    # new field in changelog from release 64
+    my $var_log    = $outdir."/{species}_var";
+    my $seqreg_log = $outdir."/{db_name}_seqreg_snp";
+
+    print "\n\n3. Density features scripts which can be run when Variation dbs are handed over:\n",
+          "\nvariation_density.pl - run for new species or where the core assembly has changed, or if there are any changes to variation positions in the variation database\n",
+          "\nUse this command for each species which needs variation densities recalculated (replace variables in {} ) or store tab delimited {species} {host} in file ./variation_density_data.txt and submit using submit_density_features.pl -submit variation_density :\n",
+          "(the species will be listed from release 64)\n",
+          "\nbsub -q normal -J var_density -oo ".$var_log.".out -eo ".$var_log.".err perl ./variation_density.pl -h {host} -port {port} -u {user} -p {password} -s {species} \n",
+          "\n\nseq_region_stats.pl (snp stats option only) - run on core databases for new species or if the assembly changed, or if the variation positions have changed in the corresponding variation db\n",
+          "\nUse this command for each database (replace variables in {}) or store tab delimited {db_name} {host} in file ./seq_region_stats_data.txt and submit using submit_density_features.pl -submit seq_region_stats_snp\n",
+          "\nbsub -q normal -J seqreg_stats_snp -oo ".$seqreg_log.".out -eo ".$seqreg_log.".err perl ./seq_region_stats.pl -h {host} -port {port} -u {user} -p {password} -d {db_name} -s snp\n";
+}
+
+} else {
+
+#submit selected script
+    # Farm commands to run and, index for index, the message printed before each one.
+    my @cmd;
+    my @print_message;
+    # Set when a data file names a host that was not given on the command line.
+    my $error;
+    my %error_message;
+
+    # Settings of the scripts that are submitted once per line of a data file.
+    my $data_file;
+    my $queue;
+    my $file_name_end;
+    my $script;
+    my $script_title;
+    my $job_name;
+    my $option;
+
+    # The script name is looked up in two plain tables rather than with the
+    # source-filter based Switch module, which is no longer bundled with Perl
+    # (removed from the core distribution in 5.14).
+    # Scripts run once per host over all core dbs of the current release:
+    my %per_host_job = (
+        'gene_gc' => {
+            job_name => 'genegc_stats', log_end => '_genegc', title => 'gene GC calculation',
+            script   => '../gene_gc.pl', extra => '',
+        },
+        'gene_density' => {
+            job_name => 'gene_density', log_end => '_gene', title => 'gene density calculation',
+            script   => './gene_density_calc.pl', extra => '',
+        },
+        # The job name matches the one in the command listed by the -getdbs mode.
+        'seq_region_stats_gene' => {
+            job_name => 'seqreg_stats_gene', log_end => '_seqreg_gene', title => 'seq region gene stats',
+            script   => './seq_region_stats.pl', extra => ' -s gene',
+        },
+    );
+
+    # Scripts run once per line of a data file; option is placed right before the db or species name.
+    my %per_db_job = (
+        'percent_gc' => {
+            data_file => 'percent_gc_data.txt', queue => 'normal', job_name => 'gc_calc', file_name_end => '_gc',
+            script    => 'percent_gc_calc.pl', script_title => 'percent GC calculation', option => ' -d ',
+        },
+        'repeat_coverage' => {
+            data_file => 'repeat_coverage_data.txt', queue => 'long', job_name => 'repeat_cov', file_name_end => '_repeat',
+            script    => 'repeat_coverage_calc.pl', script_title => 'repeat coverage calculation', option => ' -d ',
+        },
+        'variation_density' => {
+            data_file => 'variation_density_data.txt', queue => 'normal', job_name => 'var_density', file_name_end => '_var',
+            script    => 'variation_density.pl', script_title => 'variation density calculation', option => ' -s ',
+        },
+        'seq_region_stats_snp' => {
+            data_file => 'seq_region_stats_data.txt', queue => 'normal', job_name => 'seqreg_stats_snp', file_name_end => '_seqreg_snp',
+            script    => 'seq_region_stats.pl', script_title => 'seq region snp stats', option => ' -s snp -d ',
+        },
+    );
+
+    if ( exists $per_host_job{$submit_script} ) {
+        my $job = $per_host_job{$submit_script};
+        for (my $i=0; $i<$host_count;$i++) {
+            my $log = $outdir."/core_dbs_".$current_release."_".$host[$i].$job->{log_end};
+            push(@cmd, "bsub -q normal -J ".$job->{job_name}." -oo ".$log.".out -eo ".$log.".err perl ".$job->{script}." -h ".$host[$i]." -port ".$port[$i]." -u ".$user[$i]." -p ".$pass[$i]." -pattern 'core_".$current_release."'".$job->{extra});
+            push(@print_message,"Submitting ".$job->{title}." for host ".$host[$i]." to queue 'normal'. The output from this job goes to the file ".$log.".out\n");
+        }
+    } elsif ( exists $per_db_job{$submit_script} ) {
+        ( $data_file, $queue, $job_name, $file_name_end, $script, $script_title, $option ) =
+            @{ $per_db_job{$submit_script} }{ qw(data_file queue job_name file_name_end script script_title option) };
+    } else {
+        usage();
+    }
+
+    # Scripts driven by a data file: build one farm command per db/species listed in it.
+    if (defined $data_file) {
+        open(my $datafile, '<', $data_file) or die("Failed to open file $data_file for reading\n");
+        while( my $line = <$datafile> ) {
+            chomp $line;
+            next if $line !~ /\S/;    # a blank line carries no db to submit
+            my ($db_name, $host_name) = split(/\t/,$line);
+            # Position of the first command-line host whose name contains the host from the file.
+            # \Q...\E keeps dots literal; a missing host is not looked up (empty pattern).
+            my $index;
+            if ( defined $host_name && $host_name ne '' ) {
+                ( $index ) = grep { $host[$_] =~ /\Q$host_name\E/ } 0..$#host;
+            } else {
+                $host_name = '';
+            }
+            if ( defined $index ) {
+                push(@cmd,  "bsub -q ".$queue." -J ".$job_name." -oo ".$outdir."/".$db_name.$file_name_end.".out -eo ".$outdir."/".$db_name.$file_name_end.".err perl ./".$script." -h ".$host_name." -port ".$port[$index]." -u ".$user[$index]." -p ".$pass[$index].$option. $db_name);
+                push(@print_message,"Submitting ".$script_title." for ".$db_name ." on host ".$host_name." to queue '".$queue."'. The output from this job goes to the file ".$outdir."/".$db_name.$file_name_end.".out\n");
+            } else {
+                $error = 1;
+                $error_message{$host_name} = "Host info for $host_name not found in the script's arguments. Please run the script again providing the user and password for host $host_name.\n";
+            }
+        }
+        close $datafile;
+    }
+
+    # Submit the jobs to the farm, or report the hosts that lack credentials.
+    if ($error) {
+        foreach my $host_in_error (sort(keys %error_message)) { print $error_message{$host_in_error}; }
+    } else {
+        foreach my $job (0 .. $#cmd) {
+            print $print_message[$job];
+            # system() returns after each bsub call; exec() replaced this process, so only the first job was ever submitted.
+            my $status = system($cmd[$job]);
+            warn "Job submission failed (wait status ".$status.")\n" if $status != 0;
+        }
+    }
+}
+
+# Print the help text (laid out around the script name in $0) and exit with status 0.
+sub usage {
+  my $indent = ' ' x length($0);
+  print <<EOF; exit(0);
+
+The script lists databases/species which should have density features updated at the specified stage in the release cycle. There's an option for submitting a selected script.
+
+Usage:
+
+  $0 -h host [-h host]* -u user [-u user]* -p password [-p password]* 
+  $indent -port port [-port port]*
+  $indent [-g] [-s script name]
+  $indent [-o output directory path]
+  $indent [-help]  
+
+
+  -h|host              Database host (multiple hosts can be specified) 
+
+  -u|user              Database user (each host needs a user specified)
+
+  -p|pass              User password (each host needs a password specified)
+
+  -port                Database port (default 3306)  
+
+  -g|getdbs            Use this option to generate input files for the -submit option
+
+  -s|submit            Use this option to submit a density feature script to the farm:
+
+                       gene_gc - the script will run on all core databases
+		       
+		       percent_gc - the script will run on dbs listed in ./percent_gc_data.txt
+		       
+                       repeat_coverage - the script will run on dbs listed in ./repeat_coverage_data.txt
+
+		       gene_density - the script will run on all core databases
+
+                       seq_region_stats_gene - the script will run on all core databases 
+
+		       variation_density - the script will run for species listed in ./variation_density_data.txt
+
+                       seq_region_stats_snp - the script will run on dbs listed in ./seq_region_stats_data.txt
+
+  -o|outdir            Output path for farm job commands (current path if not specified)
+
+  -help                This message
+
+
+EOF
+}
+
+  
-- 
GitLab