diff --git a/misc-scripts/ebi_search_dump/dump_ebi.pl b/misc-scripts/ebi_search_dump/dump_ebi.pl
index 35ef8ccfe6c95bc932728c1b7367e9bd79c1b494..5b2d4f9b258a3258de237559affcaa9e1213a008 100644
--- a/misc-scripts/ebi_search_dump/dump_ebi.pl
+++ b/misc-scripts/ebi_search_dump/dump_ebi.pl
@@ -18,22 +18,26 @@ use Bio::EnsEMBL::Variation::DBSQL::DBAdaptor;
 
 use HTML::Entities;
 
-my ($host, $user, $pass, $port, $dbpattern, $max_genes, $gzip, $no_variation);
+my ($host, $user, $pass, $port, $dbpattern, $max_genes, $nogzip, $no_variation, $parallel, $dir);
 
 GetOptions( "host=s", \$host,
             "user=s", \$user,
             "pass=s", \$pass,
             "port=i", \$port,
             "dbpattern|pattern=s", \$dbpattern,
-            "gzip!", \$gzip,
+            "nogzip!", \$nogzip,
             "max_genes=i", \$max_genes,
             "no_variation", \$no_variation,
+            "parallel", \$parallel,
+            "dir=s", \$dir,
             "help" , \&usage
           );
 
-if( !$host || !$dbpattern ) {
-  usage();
-}
+$user = $user || "ensro";
+$host = $host || "ens-staging";
+$port = $port || "3306";
+$dbpattern = $dbpattern || "_core_";
+$dir = $dir || "/lustre/scratch1/ensembl/gp1/xml";
 
 my $entry_count;
 
@@ -61,41 +65,104 @@ sub run() {
 
     next if ($dbname !~ /$dbpattern/);
 
-    my $file = $dbname . ".xml";
-    $file .= ".gz" if ($gzip);
+    my $file = $dir . "/" . $dbname . ".xml";
+    $file .= ".gz" unless ($nogzip);
+
+
 
-    if ($gzip) {
+    if ($parallel) {
 
-      $fh = new IO::Zlib;
-      $fh->open("$file", "wb9") || die ("Can't open compressed stream to $file");
+      submit($dbname, $file);
 
-    } else {
+    } else {
 
-      open(FILE, ">$file") || die "Can't open $file";
+      dump_single($dbname, $file);
 
     }
 
-    print "Dumping $dbname to $file\n";
+  }
+
+}
+
+# -------------------------------------------------------------------------------
+
+# Submit a batch job (via bsub) that re-runs this script for one database.
+#
+#   $dbname - database to dump; used as the job name and as the child's -dbpattern
+#   $file   - output file chosen by run(); unused here because the child
+#             derives the same path from -dir and the database name
+#
+# stdout/stderr of the job go to $dir/$dbname.out and $dir/$dbname.err.
+# NOTE: -pass is forwarded on the command line, so it is visible to anyone
+# who can list processes or inspect the submitted job.
+sub submit {
 
-    my $start_time = time;
+  my ($dbname, $file) = @_;
 
-    my $dba = new Bio::EnsEMBL::DBSQL::DBAdaptor('-host' => $host,
-                                                 '-port' => $port,
-                                                 '-user' => $user,
-                                                 '-pass' => $pass,
-                                                 '-dbname' => $dbname,
-                                                 '-species' => $dbname);
+  print "Submitting job for $dbname\n";
 
-    header($dba, $dbname);
+  my $o = $dir . "/" . $dbname . ".out";
+  my $e = $dir . "/" . $dbname . ".err";
 
-    content($dba);
+  # List form: no local shell, so option values are handed to bsub untouched.
+  my @cmd = ('bsub', '-o', $o, '-e', $e, '-J', $dbname,
+             'perl', $0,
+             '-user', $user, '-host', $host, '-port', $port,
+             '-dbpattern', $dbname, '-dir', $dir);
 
-    footer();
+  # Forward the optional settings so the child behaves like a serial run.
+  push @cmd, '-pass', $pass           if ($pass);
+  push @cmd, '-nogzip'                if ($nogzip);
+  push @cmd, '-max_genes', $max_genes if ($max_genes);
+  push @cmd, '-no_variation'          if ($no_variation);
 
-    print_time($start_time);
+  system(@cmd) == 0 || warn "bsub failed for $dbname (exit status $?)\n";
+
+}
+
+# -------------------------------------------------------------------------------
+
+# Dump a single database to $file.
+#
+#   $dbname - name of the database to connect to (also passed as -species)
+#   $file   - output path; written through IO::Zlib unless -nogzip was given
+#
+# Opens the output handle that p() and footer() write to and close (FILE when
+# -nogzip is set, $fh otherwise), then emits header, content and footer.
+sub dump_single {
+
+  my ($dbname, $file) = @_;
+
+  if ($nogzip) {
+
+    open(FILE, '>', $file) || die "Can't open $file: $!";
+
+  } else {
+
+    $fh = IO::Zlib->new();
+    $fh->open($file, "wb9") || die ("Can't open compressed stream to $file");
 
   }
 
+  print "Dumping $dbname to $file\n";
+
+  my $start_time = time;
+
+  my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new('-host' => $host,
+                                                '-port' => $port,
+                                                '-user' => $user,
+                                                '-pass' => $pass,
+                                                '-dbname' => $dbname,
+                                                '-species' => $dbname);
+
+  header($dba, $dbname);
+
+  content($dba);
+
+  footer();
+
+  print_time($start_time);
+
 }
 
 # -------------------------------------------------------------------------------
@@ -245,12 +312,14 @@ sub footer {
 
   print "Dumped $entry_count entries\n";
 
-  if ($gzip) {
+  if ($nogzip) {
 
-    $fh->close();
+    close(FILE);
 
   } else {
-    close(FILE);
+
+    $fh->close();
+
   }
 
 }
@@ -266,13 +335,13 @@ sub p {
 
   $str .= "\n";
 
-  if ($gzip) {
+  if ($nogzip) {
 
-    print $fh $str;
+    print FILE $str;
 
   } else {
 
-    print FILE $str;
+    print $fh $str;
 
   }
 
@@ -362,22 +431,26 @@ sub usage {
 
   Usage: perl $0 <options>
 
-  -host Database host to connect to.
+  -host Database host to connect to. Defaults to ens-staging.
 
-  -port Database port to connect to.
+  -port Database port to connect to. Defaults to 3306.
 
-  -dbpattern Database name regexp
+  -dbpattern Database name regexp. Defaults to _core_
 
-  -user Database username.
+  -user Database username. Defaults to ensro.
 
   -pass Password for user.
 
-  -gzip Compress output as it's written.
+  -dir Directory to write output to. Defaults to /lustre/scratch1/ensembl/gp1/xml.
+
+  -nogzip Don't compress output as it's written.
 
   -max_genes Only dump this many genes for testing.
 
   -no_variation Don't dump variation IDs.
 
+  -parallel Submit jobs in parallel.
+
   -help This message.
 
 EOF