Skip to content
Snippets Groups Projects
Commit ac81f789 authored by Andy Yates's avatar Andy Yates
Browse files

Adding new script for autogenerating aliases

parent aa689741
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long qw( :config no_ignore_case );
use Pod::Usage;
use POSIX;
use Bio::EnsEMBL::DBSQL::DBConnection;
# Script entry point, called as a class method.
#
# Builds the script object, parses and validates the command line options,
# looks up the requested species and then, in production name order, works
# out which automatic aliases are missing for each one and hands them to
# _write_aliases().
sub run {
    my ($class) = @_;
    my $self = bless {}, $class;

    $self->args();
    $self->check_opts();

    my $species = $self->_species();
    my @ordered = sort { $a->{production} cmp $b->{production} } values %{$species};

    for my $entry (@ordered) {
        $self->v( 'Processing %s', $entry->{production} );
        my $missing = $self->_aliases_to_add($entry);
        $self->_write_aliases( $missing, $entry );
        $self->v('Done');
    }

    return;
}
# Parses the command line into $self->{opts}.
#
# Defaults are supplied for the production database location, the (empty)
# species list and the write flag. --help and --man print the POD and exit
# with status 0; an unparsable command line prints usage and exits with 1.
#
# The --write (-w) flag must be declared here: _write_aliases() only inserts
# rows when $self->{opts}->{write} is true, so without the option the script
# could never leave dry-run mode.
sub args {
    my ($self) = @_;

    my $opts = {
        # Master database location:
        mhost     => 'ens-staging1',
        mport     => 3306,
        mdatabase => 'ensembl_production',
        species   => [],
        # Dry run unless --write is given
        write     => 0
    };

    my @cmd_opts = qw/
        mhost|mh=s
        mport|mP=i
        muser|mu=s
        mpass|mp=s
        mdatabase|md=s
        species|s=s@
        write|w!
        verbose|v!
        help
        man
    /;

    GetOptions( $opts, @cmd_opts ) or pod2usage( -verbose => 1, -exitval => 1 );
    pod2usage( -verbose => 1, -exitval => 0 ) if $opts->{help};
    pod2usage( -verbose => 2, -exitval => 0 ) if $opts->{man};

    $self->{opts} = $opts;
    return;
}
# Validates the parsed options held in $self->{opts}.
#
# --mhost and --muser must be true values and at least one --species must
# have been given; otherwise the usage message is printed with an
# explanation and the script exits with status 1.
sub check_opts {
    my ($self) = @_;
    my $opts = $self->{opts};

    for my $name (qw/mhost muser/) {
        next if $opts->{$name};
        pod2usage(
            -msg     => "Required parameter --${name} was not given",
            -verbose => 1,
            -exitval => 1
        );
    }

    my $species_count = scalar @{ $opts->{species} };
    if ( $species_count == 0 ) {
        pod2usage(
            -msg     => "Required parameter --species was not given",
            -verbose => 1,
            -exitval => 1
        );
    }

    return;
}
# Inserts the given aliases for one species into species_alias.
#
# Arguments:
#   $aliases - array ref of alias strings to add
#   $species - species hash ref; only its 'id' entry is used here
#
# Rows are only inserted when the 'write' option is true; otherwise each
# would-be insert is reported through v(). All inserts for a species run
# inside a single transaction.
sub _write_aliases {
    my ($self, $aliases, $species) = @_;
    my $dbc = $self->_production_dbc();
    $dbc->sql_helper()->transaction(sub {
        my $sql = 'insert into species_alias (species_id, alias, is_current, created_at) values (?,?,?, NOW())';
        my $id = $species->{id};
        $dbc->sql_helper()->batch(-SQL => $sql, -CALLBACK => sub {
            my ($sth) = @_;
            # Named $alias rather than $a so the sort() global is not shadowed
            foreach my $alias (@{$aliases}) {
                if($self->{opts}->{write}) {
                    # Bind values must follow the column order of the
                    # statement: species_id, alias, is_current. New aliases
                    # are inserted as current so that _aliases() sees them on
                    # the next run.
                    $sth->execute($id, $alias, 1);
                }
                else {
                    $self->v('Would have inserted the alias %s for species_id %d', $alias, $id);
                }
            }
            return;
        });
    });
    return;
}
# Looks up every current species whose production_name matches one of the
# --species patterns (SQL LIKE syntax).
#
# Returns a hash ref keyed by species_id. Each value is a hash ref holding
# id, production, common, web, scientific and url, and is enriched with its
# existing and automatic aliases before being returned. A species matched by
# more than one pattern is recorded only once.
sub _species {
    my ($self) = @_;
    my $dbc    = $self->_production_dbc();
    my $helper = $dbc->sql_helper();

    my $sql = <<'SQL';
select species_id, common_name, web_name, scientific_name, production_name, url_name
from species
where production_name like ?
and is_current = 1
SQL

    my %found;

    # Row order follows the select list above
    my $collect = sub {
        my ($row) = @_;
        my %record;
        @record{qw(id common web scientific production url)} = @{$row};
        my $id = $record{id};
        $found{$id} = \%record unless exists $found{$id};
        return;
    };

    for my $pattern ( @{ $self->{opts}->{species} } ) {
        $self->v( 'Querying production for current species like %s', $pattern );
        $helper->execute_no_return(
            -SQL      => $sql,
            -PARAMS   => [$pattern],
            -CALLBACK => $collect
        );
    }

    # Enrich after executing
    $self->_enrich( \%found );
    return \%found;
}
# Adds two entries to every species record in the given hash ref:
#   aliases           - the aliases already stored for the species
#   automatic_aliases - the aliases this script would generate for it
sub _enrich {
    my ($self, $species) = @_;

    for my $record ( values %{$species} ) {
        $record->{aliases}           = $self->_aliases($record);
        $record->{automatic_aliases} = $self->_automatic_aliases($record);
    }

    return;
}
# Fetches the current aliases stored for a species.
#
# Takes a species hash ref (its 'id' is used as the query parameter) and
# returns a hash ref whose keys are the alias strings, each mapped to 1.
sub _aliases {
    my ($self, $s) = @_;
    my $dbc = $self->_production_dbc();

    my $rows = $dbc->sql_helper()->execute_simple(
        -SQL    => 'select alias from species_alias where is_current = 1 and species_id =?',
        -PARAMS => [ $s->{id} ]
    );

    my %lookup;
    $lookup{$_} = 1 for @{$rows};
    return \%lookup;
}
# Generates the default aliases for a species from its production name.
#
# Takes a species hash ref (only 'production' is read) and returns a hash
# ref whose keys are the generated aliases, each mapped to 1. For
# homo_sapiens the keys are: homo_sapiens, homo sapiens, hsapiens, hsap and
# homsap.
#
# An abbreviated form is only added when the production name actually has
# the shape it needs (an underscore, and for the short forms at least three
# characters on the relevant side). This avoids building aliases from stale
# or undefined regex captures. An undefined or empty production name yields
# no aliases at all.
sub _automatic_aliases {
    my ($self, $species) = @_;
    my $production_name = $species->{production};
    my $automatic_aliases = {};

    return $automatic_aliases
        if !defined $production_name || $production_name eq q{};

    # *** Assume homo_sapiens ***

    #1). homo_sapiens
    $automatic_aliases->{$production_name} = 1;

    #2). homo sapiens
    ( my $spaced = $production_name ) =~ tr [_] [ ];
    $automatic_aliases->{$spaced} = 1;

    #3). hsapiens
    if ( $production_name =~ /^(.)[^_]*_(.*)$/ ) {
        $automatic_aliases->{ $1 . $2 } = 1;
    }

    #4). hsap
    if ( $production_name =~ /^(.)[^_]*_(...).*$/ ) {
        $automatic_aliases->{ $1 . $2 } = 1;
    }

    #5). homsap
    if ( $production_name =~ /^(...)[^_]*_(...).*$/ ) {
        $automatic_aliases->{ $1 . $2 } = 1;
    }

    return $automatic_aliases;
}
# Works out which automatic aliases are not yet stored for a species.
#
# Takes an enriched species hash ref (with 'aliases' and 'automatic_aliases'
# entries) and returns an array ref of the automatic aliases that are absent
# from 'aliases'. Each decision is reported through v().
sub _aliases_to_add {
    my ($self, $species) = @_;
    my @missing;

    for my $candidate ( keys %{ $species->{automatic_aliases} } ) {
        if ( !exists $species->{aliases}->{$candidate} ) {
            push @missing, $candidate;
            $self->v( '%s is a new alias', $candidate );
            next;
        }
        $self->v( 'Skipping %s as it already registered for this species', $candidate );
    }

    return \@missing;
}
# Builds a new connection object for the production database from the
# mhost, mport, mdatabase and muser options. The password is only passed on
# when the mpass option holds a true value.
sub _production_dbc {
    my ($self) = @_;
    my $opts = $self->{opts};

    my @connection_args = (
        -HOST   => $opts->{mhost},
        -PORT   => $opts->{mport},
        -DBNAME => $opts->{mdatabase},
        -USER   => $opts->{muser}
    );
    if ( $opts->{mpass} ) {
        push @connection_args, ( -PASS => $opts->{mpass} );
    }

    return Bio::EnsEMBL::DBSQL::DBConnection->new(@connection_args);
}
# Prints a timestamped message to the currently selected output handle when
# the verbose option is on; does nothing otherwise.
#
# Arguments:
#   $msg  - sprintf format string
#   @args - values for the format string
#
# The timestamp has the form [DD-MM-YYYY HH:MM:SS] in local time.
sub v {
    my ( $self, $msg, @args ) = @_;
    return unless $self->{opts}->{verbose};
    my $s_msg = sprintf( $msg, @args );
    my ( $sec, $min, $hour, $mday, $mon, $year ) = localtime( time() );
    # localtime() returns the month as 0..11 and the year as an offset from
    # 1900, so both need adjusting before they are printed.
    printf(
        "[%02d-%02d-%04d %02d:%02d:%02d] %s\n",
        $mday, $mon + 1, $year + 1900,
        $hour, $min, $sec, $s_msg
    );
    return;
}
# Run the script: call run() as a class method on the current package.
__PACKAGE__->run();
__END__
=pod
=head1 NAME
generate_default_aliases.pl
=head1 SYNOPSIS
./generate_default_aliases.pl
[-mh host] -mu user [-mp password] [-mP port] \\
[-md database] \\
-s species [-s species ...] \\
[-v]
=head1 DESCRIPTION
A script used to generate a minimal set of required aliases. Assuming the
production_name I<homo_sapiens>, we would generate the following aliases:
=over 8
=item B<homo_sapiens>
=item B<homo sapiens>
=item B<hsapiens>
=item B<hsap>
=item B<homsap>
=back
It is up to the user to add more via the admin interface. This script never
removes aliases.
=head1 OPTIONS
=over 8
=item B<-mh|--mhost>
Host for the production database
=item B<-mP|--mport>
Port for the production database
=item B<-mu|--muser>
User for the production database
=item B<-mp|--mpass>
Pass for the production database
=item B<-md|--mdatabase>
Name for the production database.
=item B<-s|--species>
Species to generate the names for. Can use a SQL pattern here and multiple
cmd line entries. Please use B<production_names>.
=item B<-v|--verbose>
Make the script chatty
=item B<--help>
Help message
=item B<--man>
Man page
=back
=cut
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment