From 2fb78fe12f8c504663689166fd471101a72be7f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kusalananda=20K=C3=A4h=C3=A4ri?= <ak4@sanger.ac.uk>
Date: Tue, 11 Aug 2009 10:11:46 +0000
Subject: [PATCH] Info: Script to populate the meta table with species aliases.

The script reads the already existing aliases from the
meta table (meta_key 'species.alias') and adds to this
aliases computed from the species name. It also uses the
information stored for the meta_keys species.taxonomy_id,
species.common_name, species.ensembl_common_name, and
species.ensembl_alias_name as aliases.

If the -n or --dryrun options are *not* specified, the existing
list of aliases is deleted from the meta table and the new list
is inserted. In any case, the list of aliases will be displayed
on the console.

If the -d or --dbname options are *not* used, the script will
iterate over all Core databases. If the -d or --dbname option
*is* used, only that Core database will be examined.

This script assumes that the database is a single-species
database.

This script does not check for alias duplications between
species.

Usage:
    ./add_species_aliases.pl [-n] -h dbhost [-P dbport] \
                             -u dbuser [-p dbpass] \
                             [-d dbname]

    ./add_species_aliases.pl -?

Arguments:
    -n/--dryrun         Dry run, don't write to database
    -h/--host dbhost    Database server host name
    -P/--port dbport    Database server port (optional)
    -u/--user dbuser    Database user name
    -p/--pass dbpass    User password (optional)
    -d/--name dbname    Database name (optional)
    -?/--help           Displays this information
---
Review notes (this section is ignored by git-am):
 - Abbreviated aliases are only added when their pattern matches the
   species name, instead of reusing undefined or left-over captures.
 - Aliases are lower-cased when collected, so the printed list is the
   list that is inserted.
 - NULL or empty meta values are never used as aliases.
 - The database name filter runs before a database handle is requested.
 - Bad or missing options exit with status 1.
 - The commit id above and the blob id on the "index" line below are
   those of the unrevised script.

 misc-scripts/add_species_aliases.pl | 204 ++++++++++++++++++++++++++++
 1 file changed, 204 insertions(+)
 create mode 100755 misc-scripts/add_species_aliases.pl

diff --git a/misc-scripts/add_species_aliases.pl b/misc-scripts/add_species_aliases.pl
new file mode 100755
index 0000000000..e9cd1ae20a
--- /dev/null
+++ b/misc-scripts/add_species_aliases.pl
@@ -0,0 +1,204 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+use Bio::EnsEMBL::Registry;
+
+use Getopt::Long qw( :config no_ignore_case );
+
+#-----------------------------------------------------------------------
+
+# Prints the description of the script, its usage line and its list of
+# arguments to standard output.  Takes no arguments.
+sub usage {
+  print("Info:\n");
+  print <<EOT;
+        Script to populate the meta table with species aliases.
+
+        The script reads the already existing aliases from the
+        meta table (meta_key 'species.alias') and adds to this
+        aliases computed from the species name. It also uses the
+        information stored for the meta_keys species.taxonomy_id,
+        species.common_name, species.ensembl_common_name, and
+        species.ensembl_alias_name as aliases.
+
+        If the -n or --dryrun options are *not* specified, the existing
+        list of aliases is deleted from the meta table and the new list
+        is inserted. In any case, the list of aliases will be displayed
+        on the console.
+
+        If the -d or --dbname options are *not* used, the script will
+        iterate over all Core databases. If the -d or --dbname option
+        *is* used, only that Core database will be examined.
+
+        This script assumes that the database is a single-species
+        database.
+
+        This script does not check for alias duplications between
+        species.
+
+
+EOT
+
+  print("Usage:\n");
+  printf( "\t%s\t[-n] -h dbhost [-P dbport] \\\n"
+      . "\t%s\t-u dbuser [-p dbpass] \\\n"
+      . "\t%2\$s\t[-d dbname]\n",
+    $0, ' ' x length($0) );
+  print("\n");
+  printf( "\t%s\t-?\n", $0 );
+  print("\n");
+  print("Arguments:\n");
+  print("\t-n/--dryrun\t\tDry run, don't write to database\n");
+  print("\t-h/--host dbhost\tDatabase server host name\n");
+  print("\t-P/--port dbport\tDatabase server port (optional)\n");
+  print("\t-u/--user dbuser\tDatabase user name\n");
+  print("\t-p/--pass dbpass\tUser password (optional)\n");
+  print("\t-d/--name dbname\tDatabase name (optional)\n");
+  print("\t-?/--help\t\tDisplays this information\n");
+}
+
+# Returns the list of lower-cased aliases that can be derived from a
+# species name of the form "genus_species": the name itself, the name
+# with underscores replaced by spaces, and up to three abbreviations.
+#
+# An abbreviation is only added when its pattern matches the name.  A
+# name without an underscore, or with a part shorter than three
+# characters, would otherwise produce an alias built from undefined or
+# left-over capture variables.
+sub computed_aliases {
+  my ($species) = @_;
+
+  my @computed = ($species);
+
+  ( my $spaced = $species ) =~ tr [_] [ ];
+  push( @computed, $spaced );
+
+  # First character of the first part followed by everything after
+  # the first underscore.
+  if ( $species =~ /^(.)[^_]*_(.*)$/ ) {
+    push( @computed, $1 . $2 );
+  }
+
+  # First character of the first part followed by the first three
+  # characters after the first underscore.
+  if ( $species =~ /^(.)[^_]*_(...).*$/ ) {
+    push( @computed, $1 . $2 );
+  }
+
+  # First three characters of the first part followed by the first
+  # three characters after the first underscore.
+  if ( $species =~ /^(...)[^_]*_(...).*$/ ) {
+    push( @computed, $1 . $2 );
+  }
+
+  return map { lc($_) } @computed;
+}
+
+#-----------------------------------------------------------------------
+
+my $dryrun;
+my ( $dbhost, $dbport );
+my ( $dbuser, $dbpass );
+my $dbname;
+
+if (
+  !GetOptions(
+    'dryrun|n'        => \$dryrun,
+    'dbhost|host|h=s' => \$dbhost,
+    'dbport|port|P=i' => \$dbport,
+    'dbuser|user|u=s' => \$dbuser,
+    'dbpass|pass|p=s' => \$dbpass,
+    'dbname|name|d=s' => \$dbname,
+    'help|?'          => sub { usage(); exit } )
+  || !defined($dbhost)
+  || !defined($dbuser) )
+{
+  # Bad or missing options are an error, so exit with a non-zero status
+  # (asking for help with -? exits with status 0 above).
+  usage();
+  exit(1);
+}
+
+my $registry = 'Bio::EnsEMBL::Registry';
+
+$registry->load_registry_from_db(
+  '-host' => $dbhost,
+  '-port' => $dbport,
+  '-user' => $dbuser,
+  '-pass' => $dbpass,
+);
+
+# Everything already stored in the meta table that is to be used as an
+# alias, lower-cased by the database server.
+my $select_stmt = qq(
+SELECT DISTINCT LCASE(meta_value)
+FROM meta
+WHERE meta_key IN (
+  'species.alias', 'species.taxonomy_id', 'species.common_name',
+  'species.ensembl_common_name', 'species.ensembl_alias_name'
+)
+  AND species_id = 1
+);
+
+my @dbas = @{ $registry->get_all_DBAdaptors( '-group' => 'Core' ) };
+
+foreach my $dba (@dbas) {
+  my $dbc = $dba->dbc();
+
+  # Skip unwanted databases before requesting a database handle.
+  if ( defined($dbname) && $dbname ne $dbc->dbname() ) { next }
+
+  my $species = $dba->species();
+  if ( $species =~ /^Ancestral/ ) { next }
+
+  my $dbh = $dbc->db_handle();
+
+  # The hash keys are the lower-cased aliases, which removes duplicates
+  # and makes the list printed below the same as the list inserted.
+  my %aliases = map { $_ => 1 } computed_aliases($species);
+
+  my $select_sth = $dbh->prepare($select_stmt);
+
+  $select_sth->execute();
+
+  my $meta_value;
+
+  $select_sth->bind_columns( \$meta_value );
+
+  while ( $select_sth->fetch() ) {
+    # Never turn a NULL or empty meta_value into an alias.
+    if ( !defined($meta_value) || $meta_value eq '' ) { next }
+    $aliases{$meta_value} = 1;
+  }
+
+  my @aliases =
+    sort { length($a) <=> length($b) || $a cmp $b } keys(%aliases);
+
+  # The aliases are quoted by the driver as they go into the SQL text.
+  my $insert_stmt = sprintf(
+    "INSERT IGNORE INTO meta (species_id, meta_key, meta_value) "
+      . "VALUES %s",
+    join(
+      ', ',
+      map {
+        sprintf( "( 1, 'species.alias', %s )", $dbh->quote($_) )
+      } @aliases
+    ) );
+
+  printf( "Database = %s\n", $dbc->dbname() );
+  printf( "Aliases = \n\t%s\n", join( "\n\t", @aliases ) );
+
+  if ( !$dryrun ) {
+    # Delete old aliases.
+    $dbh->do( "DELETE FROM meta WHERE species_id = 1 "
+        . "AND meta_key = 'species.alias'" );
+
+    # Insert new aliases.
+    $dbh->do($insert_stmt);
+  } else {
+    print("(not writing to database)\n");
+  }
+
+} ## end foreach my $dba (@dbas)
-- 
GitLab