diff --git a/modules/Bio/EnsEMBL/Utils/ConversionSupport.pm b/modules/Bio/EnsEMBL/Utils/ConversionSupport.pm new file mode 100644 index 0000000000000000000000000000000000000000..f8585eec5b756ddc9b3f1dec7b2d6f9d09ccf488 --- /dev/null +++ b/modules/Bio/EnsEMBL/Utils/ConversionSupport.pm @@ -0,0 +1,769 @@ +package Bio::EnsEMBL::Utils::ConversionSupport; + +=head1 NAME + +Bio::EnsEMBL::Utils::ConversionSupport - Utility module for Vega release and +schema conversion scripts + +=head1 SYNOPSIS + + my $serverroot = '/path/to/ensembl'; + my $suport = new Bio::EnsEMBL::Utils::ConversionSupport($serverroot); + + # parse common options + $support->parse_common_options; + + # parse extra options for your script + $support->parse_extra_options('string_opt=s', 'numeric_opt=n'); + + # ask user if he wants to run script with these parameters + $support->confirm_params; + + # see individual method documentation for more stuff + +=head1 DESCRIPTION + +This module is a collection of common methods and provides helper functions +for the Vega release and schema conversion scripts. Amongst others, it reads +options from a config file, parses commandline options and does logging. + +=head1 LICENCE + +This code is distributed under an Apache style licence: +Please see http://www.ensembl.org/code_licence.html for details + +=head1 AUTHOR + +Patrick Meidl <pm2@sanger.ac.uk> + +=head1 CONTACT + +Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk + +=cut + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Getopt::Long; +use Text::Wrap; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use FindBin qw($Bin $Script); +use POSIX qw(strftime); + +=head2 new + + Arg[1] : String $serverroot - root directory of your ensembl sandbox + Example : my $support = new Bio::EnsEMBL::Utils::ConversionSupport( + '/path/to/ensembl'); + Description : constructor + Return type : Bio::EnsEMBL::Utils::ConversionSupport object + Exceptions : thrown if no serverroot is provided + Caller : general + +=cut + +sub new { + my $class = shift; + (my $serverroot = shift) or throw("You must supply a serverroot"); + my $self = { + '_serverroot' => $serverroot, + '_param' => { interactive => 1 }, + '_warnings' => 0, + }; + bless ($self, $class); + return $self; +} + +=head2 parse_common_options + + Example : $support->parse_common_options; + Description : This method reads options from a configuration file and parses + some commandline options that are common to all scripts (like + db connection settings, help, dry-run). Commandline options + will override config file settings. + + All options will be accessible via $self->param('name'). + Return type : true on success + Exceptions : thrown if configuration file can't be opened + Caller : general + +=cut + +sub parse_common_options { + my $self = shift; + + # read commandline options + my %h; + Getopt::Long::Configure("pass_through"); + &GetOptions( \%h, + 'dbname|db_name=s', + 'host|dbhost|db_host=s', + 'port|dbport|db_port=n', + 'user|dbuser|db_user=s', + 'pass|dbpass|db_pass=s', + 'driver|dbdriver|db_driver=s', + 'conffile|conf=s', + 'logfile|log=s', + 'interactive|i=s', + 'dry_run|dry|n=s', + 'help|h|?', + ); + + # reads config file + my $conffile = $h{'conffile'} || $self->serverroot . "/conf/Conversion.ini"; + if (-e $conffile) { + open(CONF, $conffile) or throw( + "Unable to open configuration file $conffile for reading: $!"); + while (<CONF>) { + # remove comments + s/\s+[;].*$//; + s/^[#;].*$//; + + # read options into internal parameter datastructure + /(\w\S*)\s*=\s*(.*)/; + $self->param($1, $2); + } + } else { + warning("Unable to open configuration file $conffile for reading: $!"); + } + + # override configured parameter with commandline options + map { $self->param($_, $h{$_}) } keys %h; + return(1); +} + +=head2 parse_extra_options + + Arg[1-N] : option descriptors that will be passed on to Getopt::Long + Example : $support->parse_extra_options('string_opt=s', 'numeric_opt=n'); + Description : Parse extra commandline options by passing them on to + Getopt::Long and storing parameters in $self->param('name). + Return type : true on success + Exceptions : none (caugth by $self->error) + Caller : general + +=cut + +sub parse_extra_options { + my ($self, @params) = @_; + Getopt::Long::Configure("no_pass_through"); + eval { + # catch warnings to pass to $self->error + local $SIG{__WARN__} = sub { die @_; }; + &GetOptions(\%{ $self->{'_param'} }, @params); + }; + $self->error($@) if $@; + return(1); +} + +=head2 confirm_params + + Example : $support->confirm_params; + Description : Prints a table of parameters that were collected from config + file and commandline and asks user to confirm if he wants + to proceed. + Return type : true on success + Exceptions : none + Caller : general + +=cut + +sub confirm_params { + my $self = shift; + + # print parameter table + print "Running script with these parameters:\n\n"; + print $self->list_all_params; + + # ask user if he wants to proceed + $self->user_confirm; + + return(1); +} + +=head2 list_all_params + + Example : print LOG $support->list_all_params; + Description : prints a table of the parameters used in the script + Return type : String - the table to print + Exceptions : none + Caller : general + +=cut + +sub list_all_params { + my $self = shift; + my $txt = sprintf " %-20s%-40s\n", qw(PARAMETER VALUE); + $txt .= " " . "-"x70 . "\n"; + $Text::Wrap::colums = 72; + foreach my $key (sort keys %{ $self->{'_param'} }) { + my @vals = $self->param($key); + $txt .= Text::Wrap::wrap( sprintf(' %-20s', $key), + ' 'x24, + join(", ", @vals) + ) . "\n"; + } + $txt .= "\n"; + return $txt; +} + +=head2 user_confirm + + Example : print "Do you want to continue?\n"; + $support->user_confirm; + Description : If running interactively, the user is asked if he wants to + proceed. + Return type : true on success. + Exceptions : none + Caller : general + +=cut + +sub user_confirm { + my $self = shift; + + if ($self->param('interactive')) { + print "Continue? [y/N] "; + my $input = lc(<>); + chomp $input; + unless ($input eq 'y') { + print "Aborting.\n"; + exit(0); + } + } + + return(1); +} + +=head2 comma_to_list + + Arg[1-N] : list of parameter names to parse + Example : $support->comma_to_list('chromosomes'); + Description : Transparently converts comma-separated lists into arrays (to + allow different styles of commandline options, see perldoc + Getopt::Long for details). Parameters are converted in place + (accessible through $self->param('name')). + Return type : true on success + Exceptions : none + Caller : general + +=cut + +sub comma_to_list { + my $self = shift; + foreach my $param (@_) { + $self->param($param, + split (/,/, join (',', $self->param($param)))); + } + return(1); +} + +=head2 list_to_file + + Arg[1] : Name of parameter to parse + Example : $support->list_to_file('gene_stable_id'); + Description : Determines whether a parameter holds a list or it is a filename + to read the list entries from. + Return type : true on success + Exceptions : thrown if list file can't be opened + Caller : general + +=cut + +sub list_or_file { + my ($self, $param) = @_; + my @vals = $self->param($param); + return unless (@vals); + + my $firstval = $vals[0]; + if (scalar(@vals) == 1 && -e $firstval) { + # we didn't get a list of values, but a file to read values from + @vals = (); + open(IN, $firstval) or throw("Cannot open $firstval for reading: $!"); + while(<IN>){ + chomp; + push(@vals, $_); + } + close(IN); + $self->param($param, @vals); + } + $self->comma_to_list($param); + return(1); +} + +=head2 param + + Arg[1] : Parameter name + Arg[2-N] : (optional) List of values to set + Example : my $dbname = $support->param('dbname'); + $support->param('port', 3306); + $support->chromosomes(1, 6, 'X'); + Description : Getter/setter for parameters. Accepts single-value params and + list params. + Return type : Scalar value for single-value parameters, array of values for + list parameters + Exceptions : thrown if no parameter name is supplied + Caller : general + +=cut + +sub param { + my $self = shift; + my $name = shift or throw("You must supply a parameter name"); + + # setter + if (@_) { + if (scalar(@_) == 1) { + # single value + $self->{'_param'}->{$name} = shift; + } else { + # list of values + @{ $self->{'_param'}->{$name} } = @_; + } + } + + # getter + if (ref($self->{'_param'}->{$name}) eq 'ARRAY') { + # list parameter + return @{ $self->{'_param'}->{$name} }; + } elsif (defined($self->{'_param'}->{$name})) { + # single-value parameter + return $self->{'_param'}->{$name}; + } else { + return (); + } +} + +=head2 error + + Arg[1] : (optional) String - error message + Example : $support->error("An error occurred: $@"); + exit(0) if $support->error; + Description : Getter/setter for error messages + Return type : String - error message + Exceptions : none + Caller : general + +=cut + +sub error { + my $self = shift; + $self->{'_error'} = shift if (@_); + return $self->{'_error'}; +} + +=head2 warnings + + Example : print LOG "There were ".$support->warnings." warnings.\n"; + Description : Returns the number of warnings encountered while running the + script (the warning counter is increased by $self->log_warning). + Return type : Int - number of warnings + Exceptions : none + Caller : general + +=cut + +sub warnings { + my $self = shift; + return $self->{'_warnings'}; +} + +=head2 serverroot + + Arg[1] : (optional) String - root directory of your ensembl sandbox + Example : my $serverroot = $support->serverroot; + Description : Getter/setter for the root directory of your ensembl sandbox. + This is set when ConversionSupport object is created, so + usually only used as a getter. + Return type : String - the server root directory + Exceptions : none + Caller : general + +=cut + +sub serverroot { + my $self = shift; + $self->{'_serverroot'} = shift if (@_); + return $self->{'_serverroot'}; +} + +=head2 get_database + + Arg[1] : String $database - the type of database to connect to + (eg core, otter) + Example : my $db = $support->get_database('core'); + Description : Connects to the database specified. + Return type : DBAdaptor of the appropriate type + Exceptions : thrown if asking for unknown database + Caller : general + +=cut + +sub get_database { + my $self = shift; + my $database = shift or throw("You must provide a database"); + $self->check_required_params(qw(host port user pass dbname)); + + my %adaptors = ( + core => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + ensembl => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + otter => 'Bio::Otter::DBSQL::DBAdaptor', + vega => 'Bio::Otter::DBSQL::DBAdaptor', + ); + my %valid = map { $_ => 1 } keys %adaptors; + throw("Unknown database: $database") unless $valid{$database}; + + $self->dynamic_use($adaptors{$database}); + my $dba = $adaptors{$database}->new( + -host => $self->param('host'), + -port => $self->param('port'), + -user => $self->param('user'), + -pass => $self->param('pass'), + -dbname => $self->param('dbname'), + ); + return $dba; +} + +=head2 check_required_params + + Arg[1-N] : List @params - parameters to check + Example : $self->check_required_params(qw(dbname host port)); + Description : Checks $self->param to make sure the requested parameters + have been set. Dies if parameters are missing. + Return type : true on success + Exceptions : none + Caller : general + +=cut + +sub check_required_params { + my ($self, @params) = @_; + my @missing = (); + foreach my $param (@params) { + push @missing, $param unless $self->param($param); + } + if (@missing) { + throw("Missing parameters: @missing.\nYou must specify them on the commandline or in your conffile.\n"); + } + return(1); +} + +=head2 dynamic_use + + Arg [1] : String $classname - The name of the class to require/import + Example : $self->dynamic_use('Bio::EnsEMBL::DBSQL::DBAdaptor'); + Description: Requires and imports the methods for the classname provided, + checks the symbol table so that it doesnot re-require modules + that have already been required. + Returntype : true on success + Exceptions : Warns to standard error if module fails to compile + Caller : internal + +=cut + +sub dynamic_use { + my ($self, $classname) = @_; + my ($parent_namespace, $module) = $classname =~/^(.*::)(.*)$/ ? + ($1,$2) : ('::', $classname); + no strict 'refs'; + # return if module has already been imported + return 1 if $parent_namespace->{$module.'::'}; + + eval "require $classname"; + throw("Failed to require $classname: $@") if ($@); + $classname->import(); + + return 1; +} + +=head2 get_chrlength + + Arg[1] : Bio::EnsEMBL::DBSQL::DBAdaptor $dba + Example : my $chr_length = $support->get_chrlength($dba); + Description : Get all chromosomes and their length from the database. Return + chr_name/length for the chromosomes the user requested (or all + chromosomes by default) + Return type : Hashref - chromosome_name => length + Exceptions : thrown if not passing a Bio::EnsEMBL::DBSQL::DBAdaptor + Caller : general + +=cut + +sub get_chrlength { + my ($self, $dba) = @_; + throw("get_chrlength should be passed a Bio::EnsEMBL::DBSQL::DBAdaptor\n") + unless ($dba->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')); + + my $sa = $dba->get_SliceAdaptor; + my @chromosomes = map { $_->seq_region_name } + @{ $sa->fetch_all('chromosome') }; + my %chr = map { $_ => $sa->fetch_by_region('chromosome', $_)->length } + @chromosomes; + + my @wanted = $self->param('chromosomes'); + if (@wanted) { + # check if user supplied invalid chromosome names + foreach my $chr (@wanted) { + my $found = 0; + foreach my $chr_from_db (keys %chr) { + if ($chr_from_db eq $chr) { + $found = 1; + last; + } + } + unless ($found) { + warning("Didn't find chromosome $chr in database " . + $self->param('dbname')); + } + } + + # filter to requested chromosomes only + HASH: + foreach my $chr_from_db (keys %chr) { + foreach my $chr (@wanted) { + if ($chr_from_db eq $chr) { + next HASH; + } + } + delete($chr{$chr_from_db}); + } + } + + return \%chr; +} + +=head2 sort_chromosomes + + Arg[1] : Hashref $chr_hashref - Hashref with chr_name as keys + Example : my $chr = { '6-COX' => 1, '1' => 1, 'X' => 1 }; + my @sorted = $support->sort_chromosomes($chr); + Description : Sorts chromosomes in an intuitive way (numerically, then + alphabetically) + Return type : List - sorted chromosome names + Exceptions : thrown if no hashref is provided + Caller : general + +=cut + +sub sort_chromosomes { + my ($self, $chr_hashref) = @_; + throw("You have to pass a hashref of your chromosomes") + unless ($chr_hashref and ref($chr_hashref) eq 'HASH'); + return (sort _by_chr_num keys %$chr_hashref); +} + +=head2 _by_chr_num + + Example : my @sorted = sort _by_chr_num qw(X, 6-COX, 14, 7); + Description : Subroutine to use in sort for sorting chromosomes. Sorts + numerically, then alphabetically + Return type : values to be used by sort + Exceptions : none + Caller : internal ($self->sort_chromosomes) + +=cut + +sub _by_chr_num { + my @awords = split /-/, $a; + my @bwords = split /-/, $b; + + my $anum = $awords[0]; + my $bnum = $bwords[0]; + + if ($anum !~ /^[0-9]*$/) { + if ($bnum !~ /^[0-9]*$/) { + return $anum cmp $bnum; + } else { + return 1; + } + } + if ($bnum !~ /^[0-9]*$/) { + return -1; + } + + if ($anum <=> $bnum) { + return $anum <=> $bnum; + } else { + if ($#awords == 0) { + return -1; + } elsif ($#bwords == 0) { + return 1; + } else { + return $awords[1] cmp $bwords[1]; + } + } +} + +=head2 log + + Arg[1] : String $txt - the text to log + Arg[2] : Int $indent - indentation level for log message + Example : my $log = $support->log_filehandle('>>'); + $support->log('Log foo.\n', 1); + Description : Logs a message to the filehandle initialised by calling + $self->log_filehandle(). You can supply an indentation level + to get nice hierarchical log messages. + Return type : true on success + Exceptions : thrown when no filehandle can be obtained + Caller : general + +=cut + +sub log { + my ($self, $txt, $indent) = @_; + $indent ||= 0; + $txt = " "x$indent . $txt; + my $fh = $self->{'_log_filehandle'}; + throw("Unable to obtain log filehandle") unless $fh; + print $fh "$txt"; + return(1); +} + +=head2 log_warning + + Arg[1] : String $txt - the warning text to log + Arg[2] : Int $indent - indentation level for log message + Example : my $log = $support->log_filehandle('>>'); + $support->log_warning('Log foo.\n', 1); + Description : Logs a message via $self->log and increases the warning counter. + Return type : true on success + Exceptions : none + Caller : general + +=cut + +sub log_warning { + my ($self, $txt, $indent) = @_; + $txt = "WARNING: " . $txt; + $self->log($txt, $indent); + $self->{'_warnings'}++; + return(1); +} + +=head2 log_filehandle + + Arg[1] : String $mode - file access mode + Example : my $log = $support->log_filehandle('>>'); + # print to the filehandle + print $log 'Lets start logging...\n'; + # log via the wrapper $self->log() + $support->log('Another log message.\n'); + Description : Returns a filehandle for logging (STDERR by default, logfile if + set from config or commandline). You can use the filehandle + directly to print to, or use the smart wrapper $self->log() + Return type : Filehandle - the filehandle to log to + Exceptions : thrown if logfile can't be opened + Caller : general + +=cut + +sub log_filehandle { + my ($self, $mode) = @_; + $mode ||= ">"; + my $fh = \*STDERR; + if (my $logfile = $self->param('logfile')) { + open($fh, "$mode", $logfile) or throw( + "Unable to open $logfile for writing: $!"); + } + $self->{'_log_filehandle'} = $fh; + return $self->{'_log_filehandle'}; +} + +=head2 init_log + + Example : print LOG $support->init_log; + Description : Returns some header information for a logfile. This includes + script name, date, user running the script and parameters the + script will be running with + Return type : String - the log text + Exceptions : none + Caller : general + +=cut + +sub init_log { + my $self = shift; + + # print script name, date, user who is running it + my $hostname = `hostname`; + chomp $hostname; + my $script = "$hostname:$Bin/$Script"; + my $user = `whoami`; + chomp $user; + my $txt = "Script: $script\nDate: ".$self->date."\nUser: $user\n"; + + # print parameters the script is running with + $txt .= "Parameters:\n\n"; + $txt .= $self->list_all_params; + + return $txt; +} + +=head2 finish_log + + Example : print LOG $support->finish_log; + Description : Return footer information to write to a logfile. This includes + the number of logged warnings, timestamp and memory footprint. + Return type : String - the log text + Exceptions : none + Caller : general + +=cut + +sub finish_log { + my $self = shift; + my $txt = "All done. ".$self->warnings." warnings. ".$self->date_and_mem."\n"; + return $txt; +} + +=head2 date_and_mem + + Example : print LOG "Time, memory usage: ".$support->date_and_mem."\n"; + Description : Prints a timestamp and the memory usage of your script. + Return type : String - timestamp and memory usage + Exceptions : none + Caller : general + +=cut + +sub date_and_mem { + my $date = strftime "%Y-%m-%d %T", localtime; + my $mem = `ps $$ -o vsz |tail -1`; + chomp $mem; + return "[$date, mem $mem]"; +} + +=head2 date + + Example : print "Date: " . $support->date . "\n"; + Description : Prints a nicely formatted timestamp (YYYY-DD-MM hh:mm:ss) + Return type : String - the timestamp + Exceptions : none + Caller : general + +=cut + +sub date { + return strftime "%Y-%m-%d %T", localtime; +} + +=head2 mem + + Example : print "Memory usage: " . $support->mem . "\n"; + Description : Prints the memory used by your script. Not sure about platform + dependence of this call ... + Return type : String - memory usage + Exceptions : none + Caller : general + +=cut + +sub mem { + my $mem = `ps $$ -o vsz |tail -1`; + chomp $mem; + return $mem; +} + +1;