Skip to content
Snippets Groups Projects
ConversionSupport.pm 24.6 KiB
Newer Older
package Bio::EnsEMBL::Utils::ConversionSupport;

=head1 NAME

Bio::EnsEMBL::Utils::ConversionSupport - Utility module for Vega release and
schema conversion scripts

=head1 SYNOPSIS

    my $serverroot = '/path/to/ensembl';
    my $suport = new Bio::EnsEMBL::Utils::ConversionSupport($serverroot);

    # parse common options
    $support->parse_common_options;

    # parse extra options for your script
    $support->parse_extra_options('string_opt=s', 'numeric_opt=n');

    # ask user if he wants to run script with these parameters
    $support->confirm_params;

    # see individual method documentation for more stuff

=head1 DESCRIPTION

This module is a collection of common methods and provides helper functions 
for the Vega release and schema conversion scripts. Amongst others, it reads
options from a config file, parses commandline options and does logging.

=head1 LICENCE

This code is distributed under an Apache style licence:
Please see http://www.ensembl.org/code_licence.html for details

=head1 AUTHOR

Patrick Meidl <pm2@sanger.ac.uk>

=head1 CONTACT

Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk

=cut

use strict;
use warnings;
no warnings 'uninitialized';

use Getopt::Long;
use Text::Wrap;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use FindBin qw($Bin $Script);
use POSIX qw(strftime);

=head2 new

  Arg[1]      : String $serverroot - root directory of your ensembl sandbox
  Example     : my $support = new Bio::EnsEMBL::Utils::ConversionSupport(
                                        '/path/to/ensembl');
  Description : constructor
  Return type : Bio::EnsEMBL::Utils::ConversionSupport object
  Exceptions  : thrown if no serverroot is provided
  Caller      : general

=cut

sub new {
    my $class = shift;
    (my $serverroot = shift) or throw("You must supply a serverroot");
    my $self = {
        '_serverroot'   => $serverroot,
        '_param'        => { interactive => 1 },
        '_warnings'     => 0,
    };
    bless ($self, $class);
    return $self;
}

=head2 parse_common_options

  Example     : $support->parse_common_options;
  Description : This method reads options from a configuration file and parses
                some commandline options that are common to all scripts (like
                db connection settings, help, dry-run). Commandline options
                will override config file settings. 

                All options will be accessible via $self->param('name').
  Return type : true on success 
  Exceptions  : thrown if configuration file can't be opened
  Caller      : general

=cut

sub parse_common_options {
    my $self = shift;

    # read commandline options
    my %h;
    Getopt::Long::Configure("pass_through");
    &GetOptions( \%h,
        'dbname|db_name=s',
        'host|dbhost|db_host=s',
        'port|dbport|db_port=n',
        'user|dbuser|db_user=s',
        'pass|dbpass|db_pass=s',
        'driver|dbdriver|db_driver=s',
        'conffile|conf=s',
        'logfile|log=s',
        'verbose|v=s',
        'interactive|i=s',
        'dry_run|dry|n=s',
        'help|h|?',
    );

    # reads config file
    my $conffile = $h{'conffile'} || $self->serverroot . "/conf/Conversion.ini";
    if (-e $conffile) {
        open(CONF, $conffile) or throw( 
            "Unable to open configuration file $conffile for reading: $!");
        while (<CONF>) {

            # read options into internal parameter datastructure
            next unless (/(\w\S*)\s*=\s*(.*)/);
            $self->param($1, $2);
        }
    } else {
        warning("Unable to open configuration file $conffile for reading: $!");
    }
    
    # override configured parameter with commandline options
    map { $self->param($_, $h{$_}) } keys %h;
    return(1);
}

=head2 parse_extra_options

  Arg[1-N]    : option descriptors that will be passed on to Getopt::Long
  Example     : $support->parse_extra_options('string_opt=s', 'numeric_opt=n');
  Description : Parse extra commandline options by passing them on to
                Getopt::Long and storing parameters in $self->param('name).
  Return type : true on success
  Exceptions  : none (caugth by $self->error)
  Caller      : general

=cut

sub parse_extra_options {
    my ($self, @params) = @_;
    Getopt::Long::Configure("no_pass_through");
    eval {
        # catch warnings to pass to $self->error
        local $SIG{__WARN__} = sub { die @_; };
        &GetOptions(\%{ $self->{'_param'} }, @params);
    };
    $self->error($@) if $@;
    return(1);
}

=head2 confirm_params

  Example     : $support->confirm_params;
  Description : Prints a table of parameters that were collected from config
                file and commandline and asks user to confirm if he wants
                to proceed.
  Return type : true on success
  Exceptions  : none
  Caller      : general

=cut

sub confirm_params {
    my $self = shift;

    # print parameter table
    print "Running script with these parameters:\n\n";
    print $self->list_all_params;

    # ask user if he wants to proceed
    exit unless $self->user_proceed("Continue?");
    
    return(1);
}

=head2 list_all_params

  Example     : print LOG $support->list_all_params;
  Description : prints a table of the parameters used in the script
  Return type : String - the table to print
  Exceptions  : none
  Caller      : general

=cut

sub list_all_params {
    my $self = shift;
    my $txt = sprintf "    %-20s%-40s\n", qw(PARAMETER VALUE);
    $txt .= "    " . "-"x70 . "\n";
    $Text::Wrap::colums = 72;
    foreach my $key (sort keys %{ $self->{'_param'} }) {
        my @vals = $self->param($key);
        $txt .= Text::Wrap::wrap( sprintf('    %-20s', $key),
                                  ' 'x24,
                                  join(", ", @vals)
                                ) . "\n";
    }
    $txt .= "\n";
    return $txt;
}

=head2 user_proceed

  Arg[1]      : (optional) String $text - notification text to present to user
  Example     : # run a code snipped conditionally
                if ($support->user_proceed("Run the next code snipped?") {
                    # run some code
                }
                # exit if requested by user
                exit unless ($support->user_proceed("Want to continue?"));
  Description : If running interactively, the user is asked if he wants to
                perform a script action. If he doesn't, this section is skipped
                and the script proceeds with the code. When running
                non-interactively, the section is run by default.
  Return type : TRUE to proceed, FALSE to skip.
sub user_proceed {
    my ($self, $text) = @_;
        print "$text\n" if $text;
        print "[y/N] ";
        my $input = lc(<>);
        chomp $input;
        unless ($input eq 'y') {
            print "Skipping.\n";
            return(0);
=head2 user_confirm

  Description : DEPRECATED - please use user_proceed() instead

=cut

sub user_confirm {
    my $self = shift;
    exit unless $self->user_proceed("Continue?");
}

=head2 read_user_input

  Arg[1]      : (optional) String $text - notification text to present to user
  Example     : my $ret = $support->read_user_input("Choose a number [1/2/3]");
                if ($ret == 1) {
                    # do something
                } elsif ($ret == 2) {
                    # do something else
                }
  Description : If running interactively, the user is asked for input.
  Return type : String - user's input
  Exceptions  : none
  Caller      : general

=cut

sub read_user_input {
    my ($self, $text) = @_;

    if ($self->param('interactive')) {
        print "$text\n" if $text;
        my $input = <>;
        chomp $input;
        return $input;
    }
}

=head2 comma_to_list

  Arg[1-N]    : list of parameter names to parse
  Example     : $support->comma_to_list('chromosomes');
  Description : Transparently converts comma-separated lists into arrays (to
                allow different styles of commandline options, see perldoc
                Getopt::Long for details). Parameters are converted in place
                (accessible through $self->param('name')).
  Return type : true on success
  Exceptions  : none
  Caller      : general

=cut

sub comma_to_list {
    my $self = shift;
    foreach my $param (@_) {
        $self->param($param,
            split (/,/, join (',', $self->param($param))));
    }
    return(1);
}

=head2 list_to_file

  Arg[1]      : Name of parameter to parse
  Example     : $support->list_to_file('gene_stable_id');
  Description : Determines whether a parameter holds a list or it is a filename
                to read the list entries from.
  Return type : true on success
  Exceptions  : thrown if list file can't be opened
  Caller      : general

=cut

sub list_or_file {
    my ($self, $param) = @_;
    my @vals = $self->param($param);
    return unless (@vals);

    my $firstval = $vals[0];
    if (scalar(@vals) == 1 && -e $firstval) {
        # we didn't get a list of values, but a file to read values from
        @vals = ();
        open(IN, $firstval) or throw("Cannot open $firstval for reading: $!");
        while(<IN>){
            chomp;
            push(@vals, $_);
        }
        close(IN);
        $self->param($param, @vals);
    }
    $self->comma_to_list($param);
    return(1);
}

=head2 param

  Arg[1]      : Parameter name
  Arg[2-N]    : (optional) List of values to set
  Example     : my $dbname = $support->param('dbname');
                $support->param('port', 3306);
                $support->param('chromosomes', 1, 6, 'X');
  Description : Getter/setter for parameters. Accepts single-value params and
                list params.
  Return type : Scalar value for single-value parameters, array of values for
                list parameters
  Exceptions  : thrown if no parameter name is supplied
  Caller      : general

=cut

sub param {
    my $self = shift;
    my $name = shift or throw("You must supply a parameter name");

    # setter
    if (@_) {
        if (scalar(@_) == 1) {
            # single value
            $self->{'_param'}->{$name} = shift;
        } else {
            # list of values
            @{ $self->{'_param'}->{$name} } = @_;
        }
    }

    # getter
    if (ref($self->{'_param'}->{$name}) eq 'ARRAY') {
        # list parameter
        return @{ $self->{'_param'}->{$name} };
    } elsif (defined($self->{'_param'}->{$name})) {
        # single-value parameter
        return $self->{'_param'}->{$name};
    } else {
        return ();
    }
}

=head2 error

  Arg[1]      : (optional) String - error message
  Example     : $support->error("An error occurred: $@");
                exit(0) if $support->error;
  Description : Getter/setter for error messages
  Return type : String - error message
  Exceptions  : none
  Caller      : general

=cut

sub error {
    my $self = shift;
    $self->{'_error'} = shift if (@_);
    return $self->{'_error'};
}

=head2 warnings

  Example     : print LOG "There were ".$support->warnings." warnings.\n";
  Description : Returns the number of warnings encountered while running the
                script (the warning counter is increased by $self->log_warning).
  Return type : Int - number of warnings
  Exceptions  : none
  Caller      : general

=cut

sub warnings {
    my $self = shift;
    return $self->{'_warnings'};
}

=head2 serverroot

  Arg[1]      : (optional) String - root directory of your ensembl sandbox
  Example     : my $serverroot = $support->serverroot;
  Description : Getter/setter for the root directory of your ensembl sandbox.
                This is set when ConversionSupport object is created, so
                usually only used as a getter.
  Return type : String - the server root directory
  Exceptions  : none
  Caller      : general

=cut

sub serverroot {
    my $self = shift;
    $self->{'_serverroot'} = shift if (@_);
    return $self->{'_serverroot'};
}

=head2 get_database

  Arg[1]      : String $database - the type of database to connect to
                (eg core, otter)
  Example     : my $db = $support->get_database('core');
  Description : Connects to the database specified.
  Return type : DBAdaptor of the appropriate type
  Exceptions  : thrown if asking for unknown database
  Caller      : general

=cut

sub get_database {
    my $self = shift;
    my $database = shift or throw("You must provide a database");
    $self->check_required_params(qw(host port user pass dbname));

    my %adaptors = (
        core    => 'Bio::EnsEMBL::DBSQL::DBAdaptor',
        ensembl => 'Bio::EnsEMBL::DBSQL::DBAdaptor',
        otter   => 'Bio::Otter::DBSQL::DBAdaptor',
        vega    => 'Bio::Otter::DBSQL::DBAdaptor',
    );
    my %valid = map { $_ => 1 } keys %adaptors;
    throw("Unknown database: $database") unless $valid{$database};

    $self->dynamic_use($adaptors{$database});
    my $dba = $adaptors{$database}->new(
            -host   => $self->param('host'),
            -port   => $self->param('port'),
            -user   => $self->param('user'),
            -pass   => $self->param('pass'),
            -dbname => $self->param('dbname'),
    );
    return $dba;
}

=head2 check_required_params

  Arg[1-N]    : List @params - parameters to check
  Example     : $self->check_required_params(qw(dbname host port));
  Description : Checks $self->param to make sure the requested parameters
                have been set. Dies if parameters are missing.
  Return type : true on success
  Exceptions  : none
  Caller      : general

=cut

sub check_required_params {
    my ($self, @params) = @_;
    my @missing = ();
    foreach my $param (@params) {
        push @missing, $param unless $self->param($param);
    }
    if (@missing) {
        throw("Missing parameters: @missing.\nYou must specify them on the commandline or in your conffile.\n");
    }
    return(1);
}

=head2 dynamic_use

  Arg [1]    : String $classname - The name of the class to require/import
  Example    : $self->dynamic_use('Bio::EnsEMBL::DBSQL::DBAdaptor');
  Description: Requires and imports the methods for the classname provided,
               checks the symbol table so that it doesnot re-require modules
               that have already been required.
  Returntype : true on success
  Exceptions : Warns to standard error if module fails to compile
  Caller     : internal

=cut

sub dynamic_use {
    my ($self, $classname) = @_;
    my ($parent_namespace, $module) = $classname =~/^(.*::)(.*)$/ ?
                                        ($1,$2) : ('::', $classname);
    no strict 'refs';
    # return if module has already been imported
    return 1 if $parent_namespace->{$module.'::'};
    
    eval "require $classname";
    throw("Failed to require $classname: $@") if ($@);
    $classname->import();
    
    return 1;
}

=head2 get_chrlength

  Arg[1]      : Bio::EnsEMBL::DBSQL::DBAdaptor $dba
  Example     : my $chr_length = $support->get_chrlength($dba);
  Description : Get all chromosomes and their length from the database. Return
                chr_name/length for the chromosomes the user requested (or all
                chromosomes by default)
  Return type : Hashref - chromosome_name => length
  Exceptions  : thrown if not passing a Bio::EnsEMBL::DBSQL::DBAdaptor
  Caller      : general

=cut

sub get_chrlength {
    my ($self, $dba) = @_;
    throw("get_chrlength should be passed a Bio::EnsEMBL::DBSQL::DBAdaptor\n")
        unless ($dba->isa('Bio::EnsEMBL::DBSQL::DBAdaptor'));

    my $sa = $dba->get_SliceAdaptor;
    my @chromosomes = map { $_->seq_region_name } 
                            @{ $sa->fetch_all('chromosome') };
    my %chr = map { $_ => $sa->fetch_by_region('chromosome', $_)->length }
                            @chromosomes;

    my @wanted = $self->param('chromosomes');
    if (@wanted) {
        # check if user supplied invalid chromosome names
        foreach my $chr (@wanted) {
            my $found = 0;
            foreach my $chr_from_db (keys %chr) {
                if ($chr_from_db eq $chr) {
                    $found = 1;
                    last;
                }
            }
            unless ($found) {
                warning("Didn't find chromosome $chr in database " .
                         $self->param('dbname'));
            }
        }
        
        # filter to requested chromosomes only
        HASH: 
        foreach my $chr_from_db (keys %chr) {
            foreach my $chr (@wanted) {
                if ($chr_from_db eq $chr) {
                    next HASH;
                }
            }
            delete($chr{$chr_from_db});
        }
    }

    return \%chr;
}

=head2 get_taxonomy_id

  Arg[1]      : Bio::EnsEMBL::DBSQL::DBAdaptor $dba
  Example     : my $sid = $support->get_taxonony_id($dba);
  Description : Retrieves the taxononmy ID from the meta table
  Return type : Int - the taxonomy ID
  Exceptions  : thrown if no taxonomy ID is found in the database
  Caller      : general

=cut

sub get_taxonomy_id {
    my ($self, $dba) = @_;
    my $sql = 'SELECT meta_value FROM meta WHERE meta_key = "species.taxonomy_id"';
    my $sth = $dba->dbc->db_handle->prepare($sql);
    $sth->execute;
    my ($tid) = $sth->fetchrow_array;
    $sth->finish;
    $self->throw("Could not determine taxonomy_id from database.") unless $tid;
    return $tid;
}

=head2 get_species_scientific_name

  Arg[1]      : Bio::EnsEMBL::DBSQL::DBAdaptor $dba
  Example     : my $species = $support->get_species_scientific_name($dba);
  Description : Retrieves the species scientific name (Genus species) from the
                meta table
  Return type : String - species scientific name
  Exceptions  : thrown if species name can't be determined from db
  Caller      : general

=cut

sub get_species_scientific_name {
    my ($self, $dba) = @_;
    my $sql = qq(
        SELECT
                meta_value
        FROM
                meta
        WHERE meta_key = "species.classification"
        ORDER BY meta_id
        LIMIT 2
    );
    my $sth = $dba->dbc->db_handle->prepare($sql);
    $sth->execute;
    my @sp;
    while (my @row = $sth->fetchrow_array) {
        push @sp, $row[0];
    }
    $sth->finish;
    my $species = join(" ", reverse @sp);
    $self->throw("Could not determine species scientific name from database.")
        unless $species;
    return $species;
}

=head2 sort_chromosomes

  Arg[1]      : Hashref $chr_hashref - Hashref with chr_name as keys
  Example     : my $chr = { '6-COX' => 1, '1' => 1, 'X' => 1 };
                my @sorted = $support->sort_chromosomes($chr);
  Description : Sorts chromosomes in an intuitive way (numerically, then
                alphabetically)
  Return type : List - sorted chromosome names
  Exceptions  : thrown if no hashref is provided
  Caller      : general

=cut

sub sort_chromosomes {
    my ($self, $chr_hashref) = @_;
    throw("You have to pass a hashref of your chromosomes")
        unless ($chr_hashref and ref($chr_hashref) eq 'HASH');
    return (sort _by_chr_num keys %$chr_hashref);
}

=head2 _by_chr_num

  Example     : my @sorted = sort _by_chr_num qw(X, 6-COX, 14, 7);
  Description : Subroutine to use in sort for sorting chromosomes. Sorts
                numerically, then alphabetically
  Return type : values to be used by sort
  Exceptions  : none
  Caller      : internal ($self->sort_chromosomes)

=cut

sub _by_chr_num {
    my @awords = split /-/, $a;
    my @bwords = split /-/, $b;

    my $anum = $awords[0];
    my $bnum = $bwords[0];

    if ($anum !~ /^[0-9]*$/) {
        if ($bnum !~ /^[0-9]*$/) {
            return $anum cmp $bnum;
        } else {
            return 1;
        }
    }
    if ($bnum !~ /^[0-9]*$/) {
        return -1;
    }

    if ($anum <=> $bnum) {
        return $anum <=> $bnum;
    } else {
        if ($#awords == 0) {
            return -1;
        } elsif ($#bwords == 0) {
            return 1;
        } else {
            return $awords[1] cmp $bwords[1];
        }
    }
}

=head2 log

  Arg[1]      : String $txt - the text to log
  Arg[2]      : Int $indent - indentation level for log message
  Example     : my $log = $support->log_filehandle('>>');
                $support->log('Log foo.\n', 1);
  Description : Logs a message to the filehandle initialised by calling
                $self->log_filehandle(). You can supply an indentation level
                to get nice hierarchical log messages.
  Return type : true on success
  Exceptions  : thrown when no filehandle can be obtained
  Caller      : general

=cut

sub log {
    my ($self, $txt, $indent) = @_;
    $indent ||= 0;
    $txt = "    "x$indent . $txt;
    my $fh = $self->{'_log_filehandle'};
    throw("Unable to obtain log filehandle") unless $fh;
    print $fh "$txt";
    return(1);
}

=head2 log_warning

  Arg[1]      : String $txt - the warning text to log
  Arg[2]      : Int $indent - indentation level for log message
  Example     : my $log = $support->log_filehandle('>>');
                $support->log_warning('Log foo.\n', 1);
  Description : Logs a message via $self->log and increases the warning counter.
  Return type : true on success
  Exceptions  : none
  Caller      : general

=cut

sub log_warning {
    my ($self, $txt, $indent) = @_;
    $txt = "WARNING: " . $txt;
    $self->log($txt, $indent);
    $self->{'_warnings'}++;
    return(1);
}

=head2 log_filehandle

  Arg[1]      : String $mode - file access mode
  Example     : my $log = $support->log_filehandle('>>');
                # print to the filehandle
                print $log 'Lets start logging...\n';
                # log via the wrapper $self->log()
                $support->log('Another log message.\n');
  Description : Returns a filehandle for logging (STDERR by default, logfile if
                set from config or commandline). You can use the filehandle
                directly to print to, or use the smart wrapper $self->log()
  Return type : Filehandle - the filehandle to log to
  Exceptions  : thrown if logfile can't be opened
  Caller      : general

=cut

sub log_filehandle {
    my ($self, $mode) = @_;
    $mode ||= ">";
    my $fh = \*STDERR;
    if (my $logfile = $self->param('logfile')) {
        if (my $logpath = $self->param('logpath')) {
            $logfile = "$logpath/$logfile";
        }
        open($fh, "$mode", $logfile) or throw(
            "Unable to open $logfile for writing: $!");
    }
    $self->{'_log_filehandle'} = $fh;
    return $self->{'_log_filehandle'};
}

=head2 init_log

  Example     : print LOG $support->init_log;
  Description : Returns some header information for a logfile. This includes
                script name, date, user running the script and parameters the
                script will be running with
  Return type : String - the log text
  Exceptions  : none
  Caller      : general

=cut

sub init_log {
    my $self = shift;

    # print script name, date, user who is running it
    my $hostname = `hostname`;
    chomp $hostname;
    my $script = "$hostname:$Bin/$Script";
    my $user = `whoami`;
    chomp $user;
    my $txt = "Script: $script\nDate: ".$self->date."\nUser: $user\n";

    # print parameters the script is running with
    $txt .= "Parameters:\n\n";
    $txt .= $self->list_all_params;

    return $txt;
}

=head2 finish_log

  Example     : print LOG $support->finish_log;
  Description : Return footer information to write to a logfile. This includes
                the number of logged warnings, timestamp and memory footprint.
  Return type : String - the log text
  Exceptions  : none
  Caller      : general

=cut

sub finish_log {
    my $self = shift;
    my $txt = "All done. ".$self->warnings." warnings. ".$self->date_and_mem."\n";
    return $txt;
}

=head2 date_and_mem

  Example     : print LOG "Time, memory usage: ".$support->date_and_mem."\n";
  Description : Prints a timestamp and the memory usage of your script.
  Return type : String - timestamp and memory usage
  Exceptions  : none
  Caller      : general

=cut

sub date_and_mem {
    my $date = strftime "%Y-%m-%d %T", localtime;
    my $mem = `ps -p $$ -o vsz |tail -1`;
    chomp $mem;
    return "[$date, mem $mem]";
}

=head2 date

  Example     : print "Date: " . $support->date . "\n";
  Description : Prints a nicely formatted timestamp (YYYY-DD-MM hh:mm:ss)
  Return type : String - the timestamp
  Exceptions  : none
  Caller      : general

=cut

sub date {
    return strftime "%Y-%m-%d %T", localtime;
}

=head2 mem

  Example     : print "Memory usage: " . $support->mem . "\n";
  Description : Prints the memory used by your script. Not sure about platform
                dependence of this call ...
  Return type : String - memory usage
  Exceptions  : none
  Caller      : general

=cut

sub mem {
    my $mem = `ps -p $$ -o vsz |tail -1`;