# The script lists databases/species which should have density features updated at the specified stage in the release cycle. There's an option for submitting a selected script. 

use strict;
use Getopt::Long;
use DBI qw( :sql_types );
use Switch;

# Command-line state. Host, user, password and port may each be given
# several times; the entries are matched up by position.
my $getdbs;
my $submit_script;
my $outdir;
my @host;
my @user;
my @pass;
my @port;

# Unknown or malformed options print the help text and stop.
GetOptions(
    "getdbs|g"   => \$getdbs,
    "submit|s=s" => \$submit_script,
    "host|h=s"   => \@host,
    "user|u=s"   => \@user,
    "pass|p=s"   => \@pass,
    "port=i"     => \@port,
    "outdir|o=s" => \$outdir,
    "help"       => \&usage,
) or usage();

my $host_count = @host;
my $user_count = @user;
my $pass_count = @pass;

# Every host needs its own user and password. An empty array is simply
# false in boolean context, so no "defined" test on the arrays is needed
# (that form is a compile-time error on current perls).
usage()
    if ( !@host
      || !@user
      || !@pass
      || $user_count != $host_count
      || $pass_count != $host_count );

my $port_count  = @port;
my $host_string = join( '|', @host );

# Hosts without an explicit -port fall back to the documented default.
if ( $port_count < $host_count ) {
    for my $i ( 0 .. $host_count - 1 ) {
        $port[$i] = 3306 if !defined $port[$i];
    }
}

# production/master database location:
my ( $mhost, $mport ) = ( 'ens-staging1', '3306' );
my ( $muser, $mpass ) = ( 'ensro',        undef );
my $mdbname = 'ensembl_production';

# Connect to the production database. RaiseError is set, so a failed
# connect or query dies instead of returning an error value.
my $prod_dsn = sprintf( 'DBI:mysql:host=%s;port=%d;database=%s',
                        $mhost, $mport, $mdbname );
my $prod_dbh = DBI->connect( $prod_dsn, $muser, $mpass,
                             { 'PrintError' => 1, 'RaiseError' => 1 } );

# get the current release number
my ($current_release) = $prod_dbh->selectrow_array(
    'select max(db_release) from db where is_current = 1');

# qw() splits on whitespace only, so a comma inside it would become part
# of the first host name; the list is therefore written without one.
my @hosts = qw(ens-staging1 ens-staging2);

# Work out where the farm job output files go: the current directory
# unless -outdir names an existing directory.
if ( !$outdir ) {
    $outdir = $ENV{'PWD'};
} else {
    # strip the final / (but leave a bare "/" alone, otherwise the root
    # directory would turn into an empty string and fail the test below)
    $outdir =~ s/\/$// if $outdir ne '/';

    # test if the directory exists
    if ( !-d $outdir ) {
        die("Directory $outdir does not exist.");
    }
}

# Without -submit the script only lists what needs to be run.
if ( !defined $submit_script ) {
    $getdbs = 1;
}

# Jobs that run once per host against every core database of the current
# release. The table is shared by the listing mode (which prints the
# commands) and the submit mode (which runs them), so both always agree.
# NOTE(review): each "script" value stops at the directory part; the script
# file names are absent from this file and have to be filled in before the
# generated commands can actually run.
my %all_core_job = (
    'gene_gc' => {
        job_name => 'genegc_stats',
        log_end  => '_genegc',
        script   => '../',
        extra    => '',
        title    => 'gene GC calculation',
    },
    'gene_density' => {
        job_name => 'gene_density',
        log_end  => '_gene',
        script   => './',
        extra    => '',
        title    => 'gene density calculation',
    },
    'seq_region_stats_gene' => {
        job_name => 'seqreg_stats_gene',
        log_end  => '_seqreg_gene',
        script   => './',
        extra    => ' -s gene',
        title    => 'seq region gene stats',
    },
);

# Jobs that run once per line of a tab delimited data file
# ({db name or species} <TAB> {host}).
# NOTE(review): "script" is empty for every entry for the same reason as
# above - confirm the intended script names.
my %data_file_job = (
    'percent_gc' => {
        data_file => "percent_gc_data.txt",
        queue     => "normal",
        job_name  => "gc_calc",
        log_end   => "_gc",
        script    => "",
        title     => "percent GC calculation",
        option    => " -d ",
    },
    'repeat_coverage' => {
        data_file => "repeat_coverage_data.txt",
        queue     => "long",
        job_name  => "repeat_cov",
        log_end   => "_repeat",
        script    => "",
        title     => "repeat coverage calculation",
        option    => " -d ",
    },
    'variation_density' => {
        data_file => "variation_density_data.txt",
        queue     => "normal",
        job_name  => "var_density",
        log_end   => "_var",
        script    => "",
        title     => "variation density calculation",
        option    => " -s ",
    },
    'seq_region_stats_snp' => {
        data_file => "seq_region_stats_data.txt",
        queue     => "normal",
        job_name  => "seqreg_stats_snp",
        log_end   => "_seqreg_snp",
        script    => "",
        title     => "seq region snp stats",
        option    => " -s snp -d ",
    },
);

# Returns the index into @host of the first command-line host whose name
# contains $name, or nothing when no host matches. A plain substring test
# is used so that characters such as "." in a host name are taken
# literally and an empty name never matches.
my $find_host_index = sub {
    my ($name) = @_;
    return if !defined $name || $name eq '';
    for my $i ( 0 .. $#host ) {
        return $i if index( $host[$i], $name ) >= 0;
    }
    return;
};

# Builds the bsub command for one "all core databases" job on host number
# $i. Returns the command and the path of its standard output file.
my $all_core_cmd = sub {
    my ( $mode, $i ) = @_;
    my $job      = $all_core_job{$mode};
    my $log_base = $outdir . "/core_dbs_" . $current_release . "_" . $host[$i] . $job->{log_end};
    my $cmd =
        "bsub -q normal -J " . $job->{job_name}
      . " -oo " . $log_base . ".out"
      . " -eo " . $log_base . ".err"
      . " perl " . $job->{script}
      . " -h " . $host[$i]
      . " -port " . $port[$i]
      . " -u " . $user[$i]
      . " -p " . $pass[$i]
      . " -pattern 'core_" . $current_release . "'"
      . $job->{extra};
    return ( $cmd, $log_base . ".out" );
};

if ( defined $getdbs ) {

    # ask the user which stage of the release cycle we're in
    my $menu = <<CYCLE;

Where in the release cycle are we (0,1,2,3)?

0 - None of the below - exit program
1 - Genebuild and genebuild xrefs complete (excluding projected xrefs), all gene healthchecks cleared
2 - Compara homologies handed over and core xref projections complete
3 - Variation dbs handed over

CYCLE

    print $menu;

    # Keep asking until a single digit 0-3 is entered. The answer is read
    # from STDIN explicitly so leftover command-line arguments are never
    # mistaken for input files. End of input counts as "exit".
    my $response;
    while ( !defined $response ) {
        my $answer = <STDIN>;
        exit(0) if !defined $answer;
        if ( $answer =~ /^\s*([0-3])\s*$/ ) {
            $response = $1;
        } else {
            print "\nPlease specify a valid option: 0, 1, 2 or 3:\n" . $menu;
        }
    }

    # 0 - None of the below - exit program
    exit(0) if $response == 0;

    #dbs for new species or changed assembly
    my @new_sp_assem;

    #dbs with changed gene sequence
    my @chg_seq;

    #dbs with changed repeats
    my @chg_repeats;

    if ( $response >= 1 ) {

        # Current core dbs ("name|host") whose species has a live changelog
        # entry with the given flag column set to 'Y'.
        my $changelog_sql = sub {
            my ($flag_column) = @_;
            return "select distinct concat(full_db_name,'|',db_host) from db_list dl join db d using (db_id) where db_type = 'core' and is_current = 1 and species_id in (select distinct species_id from changelog_species cs join changelog c using (changelog_id) where is_current = 1 and status not in ('cancelled','postponed') and " . $flag_column . " = 'Y')";
        };

        #get new dbs, or changed assembly
        @new_sp_assem = @{ $prod_dbh->selectcol_arrayref(
            "select distinct concat(full_db_name,'|',db_host) from db_list dl join db d using (db_id) where is_current = 1 and db_type = 'core' and species_id not in (select distinct species_id from db where is_current <> 1 and db_type = 'core') union\n"
              . $changelog_sql->('assembly') ) };

        #get dbs with changed sequence
        @chg_seq = @{ $prod_dbh->selectcol_arrayref( $changelog_sql->('gene_set') ) };

        #get dbs with changed repeats
        @chg_repeats = @{ $prod_dbh->selectcol_arrayref( $changelog_sql->('repeat_masking') ) };

        # Writes the core dbs that live on one of the command-line hosts
        # to $file_path (db name <TAB> host) and echoes them to STDOUT.
        my $write_db_list = sub {
            my ( $file_path, @names_hosts ) = @_;
            open( my $fh, '>', $file_path )
              or die("Failed to open file $file_path for writing: $!\n");
            for my $dbname_host (@names_hosts) {
                next if !defined $dbname_host;
                my ( $db_name, $db_host ) = split( /\|/, $dbname_host );
                next if !defined $db_name || $db_name !~ /core_/;
                next if !defined $find_host_index->($db_host);
                print {$fh} $db_name . "\t" . $db_host . "\n";
                print $db_name . "\t" . $db_host . "\n";
            }
            # buffered write errors only show up when the file is closed
            close($fh) or die("Failed to close file $file_path: $!\n");
        };

        print "1. Density features scripts which can be run when genebuild and genebuild xrefs (excluding projected xrefs) are complete and all gene healthchecks are cleared: \n\n";

        print " - run on all core databases (use the commands below or script -submit gene_gc):\n";

        for my $i ( 0 .. $host_count - 1 ) {
            my ($cmd) = $all_core_cmd->( 'gene_gc', $i );
            print "\n" . $cmd . "\n";
        }

        print " – run on core databases for new species, or where sequence or assembly have changed (db names will be stored in file ./percent_gc_data.txt, to submit run -submit percent_gc): \n";

        # The hash keys form the union of the lists without duplicates.
        my %array_union = ();
        foreach my $element ( @new_sp_assem, @chg_seq ) { $array_union{$element}++ }
        $write_db_list->( "./percent_gc_data.txt", sort( keys %array_union ) );

        print "\n – run on core databases for new species, or where sequence, assembly or repeats have changed (db names will be stored in file ./repeat_coverage_data.txt, to submit run -submit repeat_coverage): \n";

        # Same union as above, extended by the dbs with changed repeats.
        foreach my $element (@chg_repeats) { $array_union{$element}++ }
        $write_db_list->( "./repeat_coverage_data.txt", sort( keys %array_union ) );
    }

    if ( $response >= 2 ) {
        print "\n\n2. Density features scripts which can be run when Compara homologies are handed over and core xref projections are complete:\n\n";
        print " - run on all core dbs (use the commands below or script -submit gene_density)\n";
        for my $i ( 0 .. $host_count - 1 ) {
            my ($cmd) = $all_core_cmd->( 'gene_density', $i );
            print "\n" . $cmd . "\n";
        }

        print "\n (gene stats option only) - run on all core databases (use the commands below or script -submit seq_region_stats_gene)\n";
        for my $i ( 0 .. $host_count - 1 ) {
            my ($cmd) = $all_core_cmd->( 'seq_region_stats_gene', $i );
            print "\n" . $cmd . "\n";
        }
    }

    if ( $response == 3 ) {
        #get core dbs for which variation db has changed
        # new field in changelog from release 64
        print "\n\n3. Density features scripts which can be run when Variation dbs are handed over:\n";
        print " - run for new species or where the core assembly has changed, or if there are any changes to variation positions in the variation database\n";
        print "\nUse this command for each species which needs variation densities recalculated (replace variables in {} ) or store tab delimited {species} {host} in file ./variation_density_data.txt and submit using -submit variation_density :\n";
        print "(the species will be listed from release 64)\n";
        print "\nbsub -q normal -J var_density -oo " . $outdir . "/{species}_var.out -eo " . $outdir . "/{species}_var.err perl ./ -h {host} -port {port} -u {user} -p {password} -s {species} \n";
        print "\n (snp stats option only) - run on core databases for new species or if the assembly changed, or if the variation positions have changed in the corresponding variation db\n";
        print "\nUse this command for each database (replace variables in {}) or store tab delimited {db_name} {host} in file ./seq_region_stats_data.txt and submit using -submit seq_region_stats_snp\n";
        print "\nbsub -q normal -J seqreg_stats_snp -oo " . $outdir . "/{db_name}_seqreg_snp.out -eo " . $outdir . "/{db_name}_seqreg_snp.err perl ./ -h {host} -port {port} -u {user} -p {password} -d {db_name} -s snp\n";
    }

} else {

    #submit selected script
    my @cmd;
    my @print_message;
    my %error_message;

    # The script name given to -submit is looked up in the two job tables;
    # anything that is in neither prints the help text.
    if ( exists $all_core_job{$submit_script} ) {
        my $job = $all_core_job{$submit_script};
        for my $i ( 0 .. $host_count - 1 ) {
            my ( $cmd, $log_out ) = $all_core_cmd->( $submit_script, $i );
            push( @cmd, $cmd );
            push( @print_message, "Submitting " . $job->{title} . " for host " . $host[$i] . " to queue 'normal'. The output from this job goes to the file " . $log_out . "\n" );
        }
    } elsif ( exists $data_file_job{$submit_script} ) {
        my $job       = $data_file_job{$submit_script};
        my $data_file = $job->{data_file};

        open( my $fh, '<', $data_file )
          or die("Failed to open file $data_file for reading: $!\n");
        while ( my $line = <$fh> ) {
            chomp $line;
            next if $line =~ /^\s*$/;    # ignore blank lines

            my ( $db_name, $host_name ) = split( /\t/, $line );
            if ( !defined $host_name || $host_name eq '' ) {
                $error_message{"line $."} = "Line $. of $data_file has no host column (expected {name}<TAB>{host}).\n";
                next;
            }

            #get user and password for host
            my $index = $find_host_index->($host_name);
            if ( defined $index ) {
                my $log_base = $outdir . "/" . $db_name . $job->{log_end};
                push( @cmd, "bsub -q " . $job->{queue} . " -J " . $job->{job_name} . " -oo " . $log_base . ".out -eo " . $log_base . ".err perl ./" . $job->{script} . " -h " . $host_name . " -port " . $port[$index] . " -u " . $user[$index] . " -p " . $pass[$index] . $job->{option} . $db_name );
                push( @print_message, "Submitting " . $job->{title} . " for " . $db_name . " on host " . $host_name . " to queue '" . $job->{queue} . "'. The output from this job goes to the file " . $log_base . ".out\n" );
            } else {
                $error_message{$host_name} = "Host info for $host_name not found in the script's arguments. Please run the script again providing the user and password for host $host_name.\n";
            }
        }
        close($fh);
    } else {
        usage();
    }

    #submit jobs to the farm
    if (%error_message) {
        # Nothing is submitted unless every line of the data file could be
        # matched to a host given on the command line.
        foreach my $key_in_error ( sort( keys %error_message ) ) {
            print $error_message{$key_in_error};
        }
    } else {
        for my $n ( 0 .. $#cmd ) {
            print $print_message[$n];

            #for testing
            #print "\n\n". $cmd[$n] . "\n\n";

            # The command itself is not echoed on failure because it
            # contains the database password.
            my $status = system( $cmd[$n] );
            if ( $status != 0 ) {
                warn "Job submission failed (status " . $status . ").\n";
            }
        }
    }
}

# Prints the command-line help text to STDOUT and terminates the program
# with exit status 0. Any arguments (such as those passed when it is
# invoked as the -help option handler) are ignored. Never returns.
sub usage {
  # pads continuation lines of the synopsis to the width of the script name
  my $indent = ' ' x length($0);
  print <<EOF;

The script lists databases/species which should have density features updated at the specified stage in the release cycle. There's an option for submitting a selected script.


  $0 -h host [-h host]* -u user [-u user]* -p password [-p password]* 
  $indent -port port [-port port]*
  $indent [-g] [-s script name]
  $indent [-o output directory path]
  $indent [-help]  

  -h|host              Database host (multiple hosts can be specified) 

  -u|user              Database user (each host needs a user specified)

  -p|pass              User password (each host needs a password specified)

  -port                Database port (default 3306)  

  -g|getdbs            Use this option to generate input files for the -submit option

  -s|submit            Use this option to submit a density feature script to the farm:

                       gene_gc - the script will run on all core databases
		       percent_gc - the script will run on dbs listed in ./percent_gc_data.txt
                       repeat_coverage - the script will run on dbs listed in ./repeat_coverage_data.txt

		       gene_density - the script will run on all core databases

                       seq_region_stats_gene - the script will run on all core databases 

		       variation_density - the script will run for species listed in ./variation_density_data.txt

                       seq_region_stats_snp - the script will run on dbs listed in ./seq_region_stats_data.txt

  -o|outdir            Output path for farm job commands (current path if not specified)

  -help                This message

EOF
  exit(0);
}
