#!/usr/bin/env perl

# misc-scripts/stable_id_lookup/populate_stable_id_lookup.pl
# (added by commit 837d452aed46cd211784b102da25907fb48661c2, 2012-05-22:
#  "This script creates and populates the ensembl_stable_ids_xx db")
#
# The script populates a stable_id lookup database with all stable ids found
# in databases on a specified server (or servers) for a specified db release.
# The stable ids are copied for objects listed in hash %group_objects.
#
# With -create the lookup database is dropped, re-created, and its schema is
# loaded from ./sql/tables.sql through the mysql command line client.
# Without -create the script connects to the existing lookup database and
# only adds rows to it; nothing is deleted.

use strict;
use warnings;
use DBI qw( :sql_types );
use Getopt::Long;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::ApiVersion;

# Schema source used by -create. The path is relative to the current
# directory, which is why the script has to be run from its own directory.
my $SCHEMA_FILE = './sql/tables.sql';

my $lhost;
my $lport;
my $luser;
my $lpass;
my $ldbname;
my $create;
my $db_version;
my @host;
my @user;
my @port;

# A failed parse (unknown option, bad value) is a usage error, not something
# to carry on from with half-initialised settings.
GetOptions(
    "lhost|lh=s"   => \$lhost,
    "lport=i"      => \$lport,
    "luser|lu=s"   => \$luser,
    "lpass|lp=s"   => \$lpass,
    "ldbname|ld=s" => \$ldbname,
    "create!"      => \$create,
    "db_version=i" => \$db_version,
    "host|h=s"     => \@host,
    "user|u=s"     => \@user,
    "port=s"       => \@port,
    "help"         => sub { usage(0) },
) or usage(1);

usage(1)
    if (   !defined $lhost
        || !defined $luser
        || !defined $lpass
        || !@host
        || !@user );

my $host_count = @host;

# if we have fewer user names specified than hosts copy user name from the
# first -u parameter
while ( @user < $host_count ) {
    push( @user, $user[0] );
}

# same for ports: hosts without a port get the first -port value, which
# itself defaults to 3306
if ( !defined $port[0] ) {
    $port[0] = 3306;
}
while ( @port < $host_count ) {
    push( @port, $port[0] );
}

$db_version ||= software_version();
$ldbname    ||= "ensembl_stable_ids_$db_version";
#$ldbname ||= "ensemblgenomes_stable_ids_$db_version";

my $registry = "Bio::EnsEMBL::Registry";

if ( $host_count == 1 ) {
    $registry->load_registry_from_db(
        -host       => $host[0],
        -port       => $port[0],
        -user       => $user[0],
        -db_version => $db_version
    );
}
else {

    # Each hash is handed to load_registry_from_db, so the release has to be
    # repeated per server; leaving it out makes the registry ignore
    # -db_version whenever more than one host is given.
    my @server_array;
    for my $i ( 0 .. $#host ) {
        push @server_array,
            {
            -host       => $host[$i],
            -user       => $user[$i],
            -port       => $port[$i],
            -db_version => $db_version
            };
    }
    $registry->load_registry_from_multiple_dbs(@server_array);
}
$registry->set_disconnect_when_inactive();

my @dbas = @{ $registry->get_all_DBAdaptors() };

if ( !@dbas ) {
    die(  "No DBAdaptors found on "
        . join( ',', @host )
        . " for db version $db_version\n" );
}

# Handles shared with insert_stable_ids() and insert_species_id() below.
my $dbh;
my $species_insert_sth;
my $species_sth;
my $stable_id_insert_sth;

# db adaptors exist, so connect to the stable id lookup server; when creating
# the database we cannot select it in the DSN because it may not exist yet
my $dsn = "DBI:mysql:host=$lhost;";
if ($lport) {
    $dsn .= "port=$lport;";
}
if ( !$create ) {
    $dsn .= "database=$ldbname";
}
$dbh = DBI->connect( $dsn, $luser, $lpass,
    { 'PrintError' => 1, 'RaiseError' => 1 } );

if ($create) {
    print "Creating database $ldbname\n";

    # Check the schema source before dropping anything, so a wrong working
    # directory does not cost us the existing database.
    if ( !-r $SCHEMA_FILE ) {
        die("Cannot read schema file $SCHEMA_FILE, database $ldbname left untouched\n");
    }

    eval {
        my $quoted_dbname = $dbh->quote_identifier($ldbname);

        $dbh->do("drop database if exists $quoted_dbname");
        $dbh->do("create database $quoted_dbname");

        _load_schema($SCHEMA_FILE);

        $dbh->do("use $quoted_dbname");

        $dbh->do(
            "INSERT INTO meta(species_id,meta_key,meta_value) VALUES (NULL,'schema_version',?)",
            undef, $db_version
        );
    };

    if ($@) {
        die("An SQL error occured while creating database $ldbname:\n$@");
    }
}

#statements used when populating the species table
$species_sth =
    $dbh->prepare("SELECT species_id FROM species WHERE name = ?");
$species_insert_sth =
    $dbh->prepare("INSERT INTO species(name,taxonomy_id) values (?,?)");

#statements used when populating stable_id_lookup table
$stable_id_insert_sth =
    $dbh->prepare("INSERT INTO stable_id_lookup VALUES(?,?,?,?)");

# Object types whose stable ids are copied, keyed by DBAdaptor group.
my %group_objects = (
    core => {
        Exon             => 1,
        Gene             => 1,
        Transcript       => 1,
        Translation      => 1,
        Operon           => 1,
        OperonTranscript => 1,
    },
    compara => {
        GeneTree => 1,
        Family   => 1,
    },
);

#hash which stores species we have already processed
my %dba_species;

#populate stable_id_lookup table with stable ids
foreach my $dba (@dbas) {

    my $species_name = $dba->species();
    next if ( exists( $dba_species{$species_name} ) );

    # Look the group up without dereferencing it, so unknown groups are
    # skipped instead of being autovivified into %group_objects.
    my $objects = $group_objects{ $dba->group() };
    next if ( !$objects );

    my @stable_id_objects = keys %{$objects};
    next if ( !@stable_id_objects );

    $species_sth->bind_param( 1, $species_name, SQL_VARCHAR );
    $species_sth->execute();
    my ($species_id) = $species_sth->fetchrow_array();
    $species_sth->finish();

    if ( !$species_id ) {
        $species_id = insert_species_id($dba);
    }

    if ($species_id) {
        $dba_species{$species_name} = 1;
    }

    foreach my $object_name (@stable_id_objects) {

        my $adaptor = $dba->get_adaptor($object_name);
        if ( !$adaptor ) {
            warn("No $object_name adaptor available for $species_name, skipping\n");
            next;
        }

        # Collect ids as hash keys to drop duplicates; undefined and empty
        # stable ids are left out.
        my %stable_ids;

        if ( $adaptor->can('list_stable_ids') ) {
            foreach my $stable_id ( @{ $adaptor->list_stable_ids() } ) {
                next if ( !defined $stable_id || $stable_id eq '' );
                $stable_ids{$stable_id} = 1;
            }
        }
        else {
            # no cheap listing method: fetch the objects and ask each one
            foreach my $feature ( @{ $adaptor->fetch_all() } ) {
                my $stable_id = $feature->stable_id();
                next if ( !defined $stable_id || $stable_id eq '' );
                $stable_ids{$stable_id} = 1;
            }
        }

        my @stable_ids = keys %stable_ids;

        if (@stable_ids) {
            insert_stable_ids( \@stable_ids, $dba, $object_name,
                $species_id );
        }
    }
}

$species_sth->finish()          if ($species_sth);
$species_insert_sth->finish()   if ($species_insert_sth);
$stable_id_insert_sth->finish() if ($stable_id_insert_sth);

$dbh->disconnect() if ($dbh);


# Feeds the schema file to the mysql command line client for the newly
# created lookup database.
#
# The client is started with a list-form pipe open, so no shell is involved
# and host, user, password or database names containing shell metacharacters
# cannot break (or extend) the command.
#
# Arg     : path of the SQL file to load
# Returns : nothing
# Dies    : if the file cannot be opened, mysql cannot be started, or mysql
#           exits with a non-zero status
sub _load_schema {

    my ($schema_file) = @_;

    my @cmd = ( 'mysql', '-h', $lhost );
    if ($lport) {
        push @cmd, '-P', $lport;
    }
    push @cmd, '-u', $luser, "--password=$lpass", $ldbname;

    open( my $schema_fh, '<', $schema_file )
        or die("cannot open $schema_file: $!\n");

    # If mysql exits early a write must fail quietly instead of killing the
    # script with SIGPIPE; the failure is then picked up from the exit status
    # when the pipe is closed.
    local $SIG{PIPE} = 'IGNORE';

    open( my $mysql_fh, '|-', @cmd )
        or die("cannot run mysql: $!\n");

    while ( my $line = <$schema_fh> ) {
        print {$mysql_fh} $line;
    }
    close($schema_fh);

    # close() on a pipe is false when the child exited with non-zero status
    close($mysql_fh)
        or die("error encountered when creating schema for database $ldbname\n");

    return;
}


# Inserts one stable_id_lookup row per stable id, using the prepared
# statement $stable_id_insert_sth.
#
# Args    : array ref of stable ids,
#           DBAdaptor the ids came from (its group is stored as db type),
#           object type name (e.g. 'Gene'),
#           species_id of the species in the lookup database
# Returns : nothing
# Output  : a summary line on STDOUT on success; on failure the error and
#           one line per failed tuple on STDERR. Failures are reported, not
#           fatal, so the remaining objects are still processed.
sub insert_stable_ids {

    my ( $stable_ids, $dba, $object, $lookup_db_species_id ) = @_;

    return if ( !@{$stable_ids} );

    my $db_type = $dba->group();

    # Only the stable ids vary per row; execute_array() reuses a plain
    # scalar bind value for every tuple, so the constant columns do not need
    # arrays of their own.
    my $tuples;
    my @tuple_status;
    eval {
        $tuples = $stable_id_insert_sth->execute_array(
            { ArrayTupleStatus => \@tuple_status },
            $stable_ids,
            $lookup_db_species_id,
            $db_type,
            $object,
        );
    };
    my $error = $@;

    if ($tuples) {
        print STDOUT
            "Successfully inserted $tuples stable_ids for species_id: $lookup_db_species_id,db_type: $db_type, object_type: $object\n";
    }
    else {
        if ($error) {
            print STDERR "Error while inserting $object stable_ids: $error";
            print STDERR "\n" if ( $error !~ /\n\z/ );
        }
        for my $tuple ( 0 .. $#{$stable_ids} ) {
            my $status = $tuple_status[$tuple];

            # no status recorded means the tuple was never attempted
            $status = [ 0, "Skipped" ] unless defined $status;

            # a non-reference status is a row count, i.e. the tuple succeeded
            next unless ref $status;
            printf STDERR "Failed to insert (%s, %s, %s, %s): %s\n",
                $stable_ids->[$tuple], $lookup_db_species_id, $db_type,
                $object, $status->[1];
        }
    }

    return;
}


# Adds a row for the DBAdaptor's species to the species table of the lookup
# database. The taxonomy id is the first 'species.taxonomy_id' value from the
# adaptor's MetaContainer, or NULL when none is available.
#
# Arg     : DBAdaptor
# Returns : species_id of the new row
# Dies    : if no id is reported for the inserted row
sub insert_species_id {

    my $dba          = shift;
    my $species_name = $dba->species();

    my $taxonomy_id;
    my $meta_container = $dba->get_adaptor('MetaContainer');
    if ($meta_container) {
        my $values =
            $meta_container->list_value_by_key('species.taxonomy_id');
        if ($values) {
            $taxonomy_id = $values->[0];
        }
    }

    #add row to the species table
    $species_insert_sth->bind_param( 1, $species_name, SQL_VARCHAR );
    $species_insert_sth->bind_param( 2, $taxonomy_id,  SQL_INTEGER );
    $species_insert_sth->execute();

    my $species_id =
        $dbh->last_insert_id( undef, undef, 'species', 'species_id' );

    if ( !$species_id ) {
        die("Failed to insert row for species $species_name\n");
    }
    return $species_id;
}


# Prints the help text and exits.
#
# Arg     : optional exit status. 0 (the default) prints to STDOUT, anything
#           else prints to STDERR. Non-numeric arguments, such as the option
#           name passed when the sub is used directly as a GetOptions
#           handler, are treated as 0.
# Returns : never
sub usage {

    my $exit_code = shift;
    if ( !defined $exit_code || $exit_code !~ /^\d+$/ ) {
        $exit_code = 0;
    }

    my $indent = ' ' x length($0);
    my $out = $exit_code ? \*STDERR : \*STDOUT;

    print {$out} <<EOF;

The script populates a stable_id lookup database with all stable ids found in databases
on a specified server (or servers) for a specified db release.
Stable ids are copied for objects listed in hash %group_objects

Options -lhost -luser -lpass are mandatory and specify the credentials for the server on
which a stable id lookup database exists or is to be created (if using option -create).
If an argument for option -ldbname is not provided, the default name for the database
will be used: 'ensembl_stable_ids_xx', where xx is the database release
(option -db_version).

Options -host -user -port specify the credentials of the server(s) where stable ids are
to be copied from.

To run the script cd into the directory where the script lives eg:
cd ensembl/misc-scripts/stable_id_lookup/


This command will create database ensembl_stable_ids_67 on server ens-staging1 and will
copy stable ids from databases for release 67 found on ens-staging1 and ens-staging2:

populate_stable_id_lookup.pl -lhost ens-staging1 -luser ensadmin -lpass xxxx -create -db_version 67 -host ens-staging1 -host ens-staging2 -user ensro


Usage:

  $0 -lhost host_name -luser user_name -lpass password
  $indent [-ldbname database_name] [-lport port_number]
  $indent -host host_name [-host host_name2] -user user_name [-user user_name2]
  $indent [-port port_number [-port port_number2]]
  $indent [-create] [-db_version version]
  $indent [-help]


  -h|host         Database host where stable_ids are to be copied from (multiple hosts can be specified)

  -u|user         Database user where stable_ids are to be copied from (each host needs a user specified,
                  if multiple -h|host options are given and fewer -u|user options are specified,
                  the first user name will be used for the hosts where no user name was given)

  -port           Database port where stable_ids are to be copied from (if more than one host is specified
                  multiple ports can be provided)

  -lh|lhost       Database host where stable_id lookup database exists or is to be created

  -lu|luser       Database user where stable_id lookup database exists or is to be created

  -lp|lpass       Database password where stable_id lookup database exists or is to be created

  -lport          Database port where stable_id lookup database exists or is to be created

  -ld|ldbname     Database name for the stable id lookup database

  -create         Create the stable id lookup database using sql source ./sql/tables.sql

  -db_version     If not specified, software_version() returned by the ApiVersion module will be used

  -help           This message


EOF

    exit($exit_code);
}