From 78b1069f7cb75fed350e05da7f93b21804a2c695 Mon Sep 17 00:00:00 2001 From: Ian Longden <ianl@sanger.ac.uk> Date: Wed, 12 May 2010 07:31:51 +0000 Subject: [PATCH] many changes but the most important is to only add seq_region mapping if the seq_region_ids are different --- misc-scripts/update_mapping_set.pl | 186 +++++++++++++++++------------ 1 file changed, 110 insertions(+), 76 deletions(-) diff --git a/misc-scripts/update_mapping_set.pl b/misc-scripts/update_mapping_set.pl index ad7bd0927f..4b6eb88f49 100644 --- a/misc-scripts/update_mapping_set.pl +++ b/misc-scripts/update_mapping_set.pl @@ -24,12 +24,20 @@ Optional arguments: --host=host server where the core databases are stored (default: ens-staging) + --oldhost = oldhost server where the old release databases are stored + --dbname=dbname if you want a single database to update the mapping_set information (default: all databases) --port=port port (default: 3306) + --oldport=port old database server port (default: 5306) + + --olduser=user old database server username + + --oldpass=pass password for old database server + --help print help (this message) =head1 DESCRIPTION @@ -51,7 +59,7 @@ Update mapping_set information for all databases in ens-staging in release NN (the usual use case in release process): $ ./update_mapping_set.pl --user ensadmin \ - --pass password --release NN + --pass password --release NN --old_host ensembldb-ensembl.org Update mapping_set information only for pig database in ens-genomics1: @@ -88,10 +96,14 @@ use constant NEW_MAPPING => 3; ## Command line options my $host = 'ens-staging'; +my $oldhost = 'ensembldb.ensembl.org'; my $dbname = undef; my $user = undef; my $pass = undef; my $port = 3306; +my $oldport = 5306; +my $olduser = "anonymous"; +my $oldpass = undef; my $help = undef; my $release = undef; @@ -101,7 +113,11 @@ GetOptions('host=s' => \$host, 'pass=s' => \$pass, 'port=s' => \$port, 'release=i' => \$release, - 'help' => \$help + 'oldhost=s' => \$oldhost, + 'oldport=s' => \$oldport, + 'olduser=s' => \$olduser, + 'oldpass=s' => \$oldpass, + 'help' => \$help ); pod2usage(1) if($help); @@ -111,65 +127,78 @@ throw("--release argument required") if(!defined($release)); my $database = 'information_schema'; my $dbh = DBI->connect("DBI:mysql:database=$database;host=$host;port=$port",$user,$pass); +my $old_dbh = DBI->connect("DBI:mysql:database=$database;host=$oldhost;port=$oldport",$olduser,$oldpass); my $status; my $database_name; #since there is no database defined, will run it agains all core databases my $pattern; if (!defined ($dbname)){ - $pattern = '_core_\$release_'; + $pattern = "_core_".$release."_"; } else{ $pattern = $dbname; } #fetch all databases matching the pattern +print STDERR $pattern."\n"; my $sth = $dbh->prepare("SHOW DATABASES WHERE `database` REGEXP \'$pattern\'"); $sth->execute(); my $dbs = $sth->fetchall_arrayref(); my $schema_build; foreach my $db_name (@{$dbs}){ - print STDERR "Going to update mapping for $db_name->[0]....\n"; - my $mapping_set_id; - my $current_seq_region = (); # hash containing the relation seq_region_name->seq_region_id for the current database - my $old_seq_region = (); #hash containing the previous database relation seq_region_name->seq_region_id - my $sth_seq_mapping = $dbh->prepare("INSERT INTO $db_name->[0].seq_region_mapping VALUES(?,?,?)"); - my $sth_mapping_set = $dbh->prepare("INSERT INTO $db_name->[0].mapping_set VALUES(?,?)"); - $status = &mapping_status($dbh,$db_name->[0],\$mapping_set_id,$release); - $schema_build = get_schema_and_build($db_name->[0]); - #add mapping_set information - $sth_mapping_set->execute($mapping_set_id,$schema_build); - if ($status == INITIAL_MAPPING){ - #first time run the script, create new entry in mapping_set and copy seq_region entries in seq_region_mapping - $current_seq_region = &read_seq_region($dbh,$db_name->[0]); - #copy the seq_region_id in the seq_region_mapping - foreach my $seq_region_name (keys %{$current_seq_region}){ - #when copying there won't be any ambiguity with coord_systems - foreach my $region_id (values %{$current_seq_region->{$seq_region_name}}){ - $sth_seq_mapping->execute($region_id,$region_id,$mapping_set_id); - } - } - } - elsif ($status == SAME_MAPPING){ - #seq_region_mapping has not change, nothing to do for the moment.... - - } - elsif ($status == NEW_MAPPING){ - #there has been a seq_region change between releases, add a new mapping_set and the relation old_seq_region_id->new_seq_region_id - my $previous_dbname = &get_previous_dbname($dbh,$db_name->[0],$release); - $current_seq_region = &read_seq_region($dbh,$db_name->[0]); - $old_seq_region = &read_seq_region($dbh,$previous_dbname); - #update the seq_region_mapping table with the old->new seq_region_id relation - foreach my $seq_region_name (keys %{$old_seq_region}){ - next if (!defined $current_seq_region->{$seq_region_name}); #the seq_region might have disappeared - foreach my $coord_system_id (keys %{$old_seq_region->{$seq_region_name}}){ - next if (!defined $current_seq_region->{$seq_region_name}->{$coord_system_id}); #the coord_system might have been removed in current database - $sth_seq_mapping->execute($old_seq_region->{$seq_region_name}->{$coord_system_id},$current_seq_region->{$seq_region_name}->{$coord_system_id},$mapping_set_id); - } - } - } - else{ - throw("Mapping status not recognized by script: $status \n\n"); - } + print STDERR "Going to update mapping for $db_name->[0]....\n"; + my $mapping_set_id; + my $current_seq_region = (); # hash containing the relation seq_region_name->seq_region_id for the current database + my $old_seq_region = (); #hash containing the previous database relation seq_region_name->seq_region_id + my $sth_seq_mapping = $dbh->prepare("INSERT INTO $db_name->[0].seq_region_mapping VALUES(?,?,?)"); + my $sth_mapping_set = $dbh->prepare("INSERT INTO $db_name->[0].mapping_set VALUES(?,?)"); + $status = &mapping_status($dbh,$old_dbh, $db_name->[0],\$mapping_set_id,$release); + $schema_build = get_schema_and_build($db_name->[0]); + + #add mapping_set information + if ($status == INITIAL_MAPPING){ + $sth_mapping_set->execute($mapping_set_id,$schema_build); + #first time run the script, create new entry in mapping_set and copy seq_region entries in seq_region_mapping + + ############ + #Actually NO only store the differences so for the initial one it is NONE. + ############ + + # $current_seq_region = &read_seq_region($dbh,$db_name->[0]); + # #copy the seq_region_id in the seq_region_mapping + # foreach my $seq_region_name (keys %{$current_seq_region}){ + # #when copying there won't be any ambiguity with coord_systems + # foreach my $region_id (values %{$current_seq_region->{$seq_region_name}}){ + # $sth_seq_mapping->execute($region_id,$region_id,$mapping_set_id); + # } + # } + } + elsif ($status == SAME_MAPPING){ + #seq_region_mapping has not change, nothing to do for the moment.... + + } + elsif ($status == NEW_MAPPING){ + $sth_mapping_set->execute($mapping_set_id,$schema_build); + #there has been a seq_region change between releases, add a new mapping_set and the relation old_seq_region_id->new_seq_region_id + my $previous_dbname = &get_previous_dbname($old_dbh,$db_name->[0],$release); + $current_seq_region = &read_seq_region($dbh,$db_name->[0]); + $old_seq_region = &read_seq_region($old_dbh,$previous_dbname); + #update the seq_region_mapping table with the old->new seq_region_id relation + my $count = 0; + foreach my $seq_region_name (keys %{$old_seq_region}){ + next if (!defined $current_seq_region->{$seq_region_name}); #the seq_region might have disappeared + foreach my $coord_system_id (keys %{$old_seq_region->{$seq_region_name}}){ + next if (!defined $current_seq_region->{$seq_region_name}->{$coord_system_id}); #the coord_system might have been removed in current database + next if ($old_seq_region->{$seq_region_name}->{$coord_system_id} == $current_seq_region->{$seq_region_name}->{$coord_system_id}); # if no change no need to write out + $sth_seq_mapping->execute($old_seq_region->{$seq_region_name}->{$coord_system_id},$current_seq_region->{$seq_region_name}->{$coord_system_id},$mapping_set_id); + $count++; + } + } + print STDERR "Added $count seq_region_mapping entry\n\n"; + } + else{ + throw("Mapping status not recognized by script: $status \n\n"); + } } #will for a given database, will return the seq_region_name->seq_region_id relation @@ -194,43 +223,48 @@ sub read_seq_region{ #method to check the status of the current core database: INITIAL_MAPPING, SAME_MAPPING and NEW_MAPPING are the possible states sub mapping_status{ - my $dbh = shift; - my $dbname = shift; - my $mapping_set_id_ref = shift; - my $release = shift; - - my $sth_max_mapping = $dbh->prepare("select max(mapping_set_id) from $dbname.mapping_set"); - $sth_max_mapping->execute(); - ( $$mapping_set_id_ref ) = $sth_max_mapping->fetchrow_array(); - if (! $$mapping_set_id_ref){ - #the table is empty, first mapping - $$mapping_set_id_ref = 1; - return INITIAL_MAPPING; - } - else{ - #there is information, find out if it is the same mapping as previous release - my $previous_dbname = &get_previous_dbname($dbh,$dbname,$release); - throw("No previous database present for $dbname\n") if (!defined $previous_dbname); - my $cur_seq_region_size = &get_seq_region_size($dbh,$dbname); - my $previous_seq_region_size = &get_seq_region_size($dbh,$previous_dbname); - if ($cur_seq_region_size == $previous_seq_region_size){ - #if both tables have same size, SAME_MAPPING - return SAME_MAPPING; - } - else{ - #if tables have different size, NEW_MAPPING - $$mapping_set_id_ref++; - return NEW_MAPPING; - } - } + my $dbh = shift; + my $old_dbh = shift; + my $dbname = shift; + my $mapping_set_id_ref = shift; + my $release = shift; + + # my $sth_max_mapping = $dbh->prepare("select max(mapping_set_id) from $dbname.mapping_set"); + # $sth_max_mapping->execute(); + # ( $$mapping_set_id_ref ) = $sth_max_mapping->fetchrow_array(); + # if (! $$mapping_set_id_ref){ + # #the table is empty, first mapping + # $$mapping_set_id_ref = 1; + # return INITIAL_MAPPING; + # } + # else{ + #there is information, find out if it is the same mapping as previous release + + my $previous_dbname = &get_previous_dbname($old_dbh,$dbname,$release); + if(!defined($previous_dbname)){ + print "No previous database present for $dbname so cannot do diff so will initialise with this as the first version of the database\n"; + $$mapping_set_id_ref = 1; + return INITIAL_MAPPING; + } + my $cur_seq_region_size = &get_seq_region_size($dbh,$dbname); + my $previous_seq_region_size = &get_seq_region_size($old_dbh,$previous_dbname); + if ($cur_seq_region_size == $previous_seq_region_size){ + #if both tables have same size, SAME_MAPPING + return SAME_MAPPING; + } + else{ + #if tables have different size, NEW_MAPPING + $$mapping_set_id_ref++; + return NEW_MAPPING; + } + #} } - #for a given database, returns the size of the seq_region_table sub get_seq_region_size{ my $dbh = shift; my $dbname = shift; - my $sth_status = $dbh->prepare("show table status from $dbname like 'seq_region'"); + my $sth_status = $dbh->prepare("show table status from $dbname like 'seq_region'") ; $sth_status->execute(); my @table_status = $sth_status->fetchrow_array(); return $table_status[6]; #return the size of the table -- GitLab