From 2fb78fe12f8c504663689166fd471101a72be7f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kusalananda=20K=C3=A4h=C3=A4ri?=
 <ak4@sanger.ac.uk>
Date: Tue, 11 Aug 2009 10:11:46 +0000
Subject: [PATCH] Info:     Script to populate the meta table with species
 aliases.

    The script reads the already existing aliases from the
    meta table (meta_key 'species.alias') and adds to this
    aliases computed from the species name.  It also uses the
    information stored for the meta_keys species.taxonomy_id,
    species.common_name, species.ensembl_common_name, and
    species.ensembl_alias_name as aliases.

    If the -n or --dryrun options are *not* specified, the existing
    list of aliases is deleted from the meta table and the new list
    is inserted.  In any case, the list of aliases will be displayed
    on the console.

    If the -d or --dbname options are *not* used, the script will
    iterate over all Core databases.  If the -d or --dbname option
    *is* used, only that Core database will be examined.

    This script assumes that the database is a single-species
    database.

    This script does not check for alias duplications between
    species.


Usage:
        ./add_species_aliases.pl        [-n] -h dbhost [-P dbport] \
                                        -u dbuser [-p dbpass] \
                                        [-d dbname]

        ./add_species_aliases.pl        -?

Arguments:
        -n/--dryrun             Dry run, don't write to database
        -h/--host dbhost        Database server host name
        -P/--port dbport        Database server port (optional)
        -u/--user dbuser        Database user name
        -p/--pass dbpass        User password (optional)
        -d/--name dbname        Database name (optional)
        -?/--help               Displays this information
---
 misc-scripts/add_species_aliases.pl | 170 ++++++++++++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100755 misc-scripts/add_species_aliases.pl

diff --git a/misc-scripts/add_species_aliases.pl b/misc-scripts/add_species_aliases.pl
new file mode 100755
index 0000000000..e9cd1ae20a
--- /dev/null
+++ b/misc-scripts/add_species_aliases.pl
@@ -0,0 +1,170 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+use Bio::EnsEMBL::Registry;
+
+use Getopt::Long qw( :config no_ignore_case );
+
+#-----------------------------------------------------------------------
+
+sub usage {
+  print("Info:\n");
+  print <<EOT;
+    Script to populate the meta table with species aliases.
+
+    The script reads the already existing aliases from the
+    meta table (meta_key 'species.alias') and adds to this
+    aliases computed from the species name.  It also uses the
+    information stored for the meta_keys species.taxonomy_id,
+    species.common_name, species.ensembl_common_name, and
+    species.ensembl_alias_name as aliases.
+
+    If the -n or --dryrun options are *not* specified, the existing
+    list of aliases is deleted from the meta table and the new list
+    is inserted.  In any case, the list of aliases will be displayed
+    on the console.
+
+    If the -d or --dbname options are *not* used, the script will
+    iterate over all Core databases.  If the -d or --dbname option
+    *is* used, only that Core database will be examined.
+
+    This script assumes that the database is a single-species
+    database.
+
+    This script does not check for alias duplications between
+    species.
+
+
+EOT
+
+  print("Usage:\n");
+  printf( "\t%s\t[-n] -h dbhost [-P dbport] \\\n"
+      . "\t%s\t-u dbuser [-p dbpass] \\\n"
+      . "\t%2\$s\t[-d dbname]\n",
+    $0, ' ' x length($0) );
+  print("\n");
+  printf( "\t%s\t-?\n", $0 );
+  print("\n");
+  print("Arguments:\n");
+  print("\t-n/--dryrun\t\tDry run, don't write to database\n");
+  print("\t-h/--host dbhost\tDatabase server host name\n");
+  print("\t-P/--port dbport\tDatabase server port (optional)\n");
+  print("\t-u/--user dbuser\tDatabase user name\n");
+  print("\t-p/--pass dbpass\tUser password (optional)\n");
+  print("\t-d/--name dbname\tDatabase name (optional)\n");
+  print("\t-?/--help\t\tDisplays this information\n");
+}
+
+#-----------------------------------------------------------------------
+
+my $dryrun;
+my ( $dbhost, $dbport );
+my ( $dbuser, $dbpass );
+my $dbname;
+
+if (
+  !GetOptions(
+    'dryrun|n'        => \$dryrun,
+    'dbhost|host|h=s' => \$dbhost,
+    'dbport|port|P=i' => \$dbport,
+    'dbuser|user|u=s' => \$dbuser,
+    'dbpass|pass|p=s' => \$dbpass,
+    'dbname|name|d=s' => \$dbname,
+    'help|?'          => sub { usage(); exit } )
+  || !defined($dbhost)
+  || !defined($dbuser) )
+{
+  usage();
+  exit;
+}
+
+my $registry = 'Bio::EnsEMBL::Registry';
+
+$registry->load_registry_from_db(
+  '-host' => $dbhost,
+  '-port' => $dbport,
+  '-user' => $dbuser,
+  '-pass' => $dbpass,
+);
+
+my $select_stmt = qq(
+SELECT DISTINCT LCASE(meta_value)
+FROM    meta
+WHERE meta_key IN (
+  'species.alias', 'species.taxonomy_id', 'species.common_name',
+  'species.ensembl_common_name', 'species.ensembl_alias_name'
+)
+  AND species_id = 1
+);
+
+my @dbas = @{ $registry->get_all_DBAdaptors( '-group' => 'Core' ) };
+
+foreach my $dba (@dbas) {
+  my $dbh = $dba->dbc()->db_handle();
+  if ( defined($dbname) && $dbname ne $dba->dbc()->dbname() ) { next }
+
+  my $species = $dba->species();
+  if ( $species =~ /^Ancestral/ ) { next }
+
+  my %aliases;
+
+  my $alias = $species;
+  $aliases{$alias} = 1;
+
+  $alias =~ tr [_] [ ];
+  $aliases{$alias} = 1;
+
+  $species =~ /^(.)[^_]*_(.*)$/;
+  $alias = $1 . $2;
+  $aliases{$alias} = 1;
+
+  $species =~ /^(.)[^_]*_(...).*$/;
+  $alias = $1 . $2;
+  $aliases{$alias} = 1;
+
+  $species =~ /^(...)[^_]*_(...).*$/;
+  $alias = $1 . $2;
+  $aliases{$alias} = 1;
+
+  my $select_sth = $dbh->prepare($select_stmt);
+
+  $select_sth->execute();
+
+  my $meta_value;
+
+  $select_sth->bind_columns( \$meta_value );
+
+  while ( $select_sth->fetch() ) {
+    $aliases{$meta_value} = 1;
+  }
+
+  my @aliases =
+    sort { length($a) <=> length($b) || $a cmp $b } keys(%aliases);
+
+  my $insert_stmt = sprintf(
+    "INSERT IGNORE INTO meta (species_id, meta_key, meta_value) "
+      . "VALUES %s",
+    join(
+      ', ',
+      map {
+        sprintf( "( 1, 'species.alias', %s )", $dbh->quote( lc($_) ) )
+        } @aliases
+    ) );
+
+  printf( "Database = %s\n", $dba->dbc()->dbname() );
+  printf( "Aliases  = \n\t%s\n", join( "\n\t", @aliases ) );
+
+  if ( !$dryrun ) {
+    # Delete old aliases.
+    $dbh->do( "DELETE FROM meta WHERE species_id = 1 "
+        . "AND meta_key = 'species.alias'" );
+
+    # Insert new aliases.
+    $dbh->do($insert_stmt);
+  } else {
+    print("(not writing to database)\n");
+  }
+
+} ## end foreach my $dba (@dbas)
-- 
GitLab