diff --git a/misc-scripts/update_mapping_set.pl b/misc-scripts/update_mapping_set.pl index 517932244f1a0dc6cf1a5f6bfbd0a1dd9c37aa0f..303efbf690c23790a9ad9618db3ff8644bfa251b 100644 --- a/misc-scripts/update_mapping_set.pl +++ b/misc-scripts/update_mapping_set.pl @@ -32,7 +32,7 @@ Optional arguments: --port=port port (default: 3306) - --oldport=port old database server port (default: 5306) + --oldport=port old database server port (default: 3306) --olduser=user old database server username @@ -89,21 +89,17 @@ use DBI qw(:sql_types); use Bio::EnsEMBL::Utils::Exception qw(throw); -use constant INITIAL_MAPPING => 1; -use constant SAME_MAPPING => 2; -use constant NEW_MAPPING => 3; - ## Command line options my $host = 'ens-staging'; my $host2 = 'ens-staging2'; -my $oldhost = 'ensembldb.ensembl.org'; +my $oldhost = 'ens-livemirror'; my $dbname = undef; my $user = undef; my $pass = undef; my $port = 3306; -my $oldport = 5306; -my $olduser = "anonymous"; +my $oldport = 3306; +my $olduser = "ensro"; my $oldpass = undef; my $help = undef; my $release = undef; @@ -132,8 +128,6 @@ my $old_dbh = DBI->connect("DBI:mysql:database=$database;host=$oldhost;port=$ol foreach my $h ($host,$host2) { my $dbh = DBI->connect("DBI:mysql:database=$database;host=$h;port=$port",$user,$pass); - my $status; - my $database_name; #since there is no database defined, will run it agains all core databases my $pattern; @@ -143,173 +137,121 @@ foreach my $h ($host,$host2) { else{ $pattern = $dbname; } - #fetch all databases matching the pattern -# print STDERR $pattern."\n"; - my $sth = $dbh->prepare("SHOW DATABASES WHERE `database` REGEXP \'$pattern\'"); + +# Fetch all databases matching the pattern + my $sth = $dbh->prepare("SHOW DATABASES like \'%$pattern%\'"); $sth->execute(); my $dbs = $sth->fetchall_arrayref(); - my $schema_build; foreach my $db_name (@{$dbs}){ - print STDERR "Going to update mapping for $db_name->[0]....\n"; - my $mapping_set_id = 1; - my $sth_seq_mapping = $dbh->prepare("INSERT INTO $db_name->[0].seq_region_mapping VALUES(?,?,?)"); - my $sth_mapping_set = $dbh->prepare("INSERT INTO $db_name->[0].mapping_set VALUES(?,?)"); - $schema_build = get_schema_and_build($db_name->[0]); + my $current_dbname = $db_name->[0]; + print STDERR "Going to update mapping for $current_dbname....\n"; + my $mapping_set_id = get_max_mapping_set_id($dbh, $current_dbname) + 1; + my $sth_seq_mapping = $dbh->prepare("INSERT INTO $current_dbname.seq_region_mapping VALUES(?,?,?)"); + my $sth_mapping_set = $dbh->prepare("INSERT INTO $current_dbname.mapping_set VALUES(?,?)"); + my $schema_build = get_schema_and_build($current_dbname); + my $current_assembly = get_assembly($dbh, $current_dbname) ; - my $current_assembly = get_assembly($dbh, $db_name->[0]) ; - my $previous_dbname = &get_previous_dbname($old_dbh,$db_name->[0],$release); + $sth_mapping_set->execute($mapping_set_id,$schema_build) unless $dry_run; # If there is no previous database, no mapping needed + my $previous_dbname = &get_previous_dbname($old_dbh,$current_dbname,$release); if (!defined($previous_dbname)) { - $sth_mapping_set->execute($mapping_set_id, $schema_build) unless $dry_run; - next; + print STDERR "First instance known for $current_dbname, no mapping needed\n"; + next; } - my $old_assembly = get_assembly($old_dbh, $previous_dbname); # If it is a new assembly, no mapping needed + my $old_assembly = get_assembly($old_dbh, $previous_dbname); if ($old_assembly ne $current_assembly) { - $sth_mapping_set->execute($mapping_set_id, $schema_build) unless $dry_run; + print STDERR "New assembly for $current_dbname, no mapping needed\n" ; next; } - my $current_seq_region = (); # hash containing the relation seq_region_name->seq_region_id for the current database - my $old_seq_region = (); #hash containing the previous database relation seq_region_name->seq_region_id - $status = &mapping_status($dbh,$old_dbh, $db_name->[0],\$mapping_set_id,$release); - -## prune newly added mapping_set entries -# $dbh->do(qq(delete from $db_name->[0].mapping_set where schema_build = '$schema_build')); -# next; - - #add mapping_set information - if ($status == INITIAL_MAPPING){ - #first time run the script, create new entry in mapping_set - $sth_mapping_set->execute($mapping_set_id,$schema_build) unless $dry_run; +# If there has been no change in seq_region, no mapping needed + my $cur_seq_region_checksum = &get_seq_region_checksum($dbh,$dbname); + my $previous_seq_region_checksum = &get_seq_region_checksum($old_dbh,$previous_dbname); + if ($cur_seq_region_checksum == $previous_seq_region_checksum) { + print STDERR "No change in seq_region for $current_dbname, no mapping needed\n"; + next; } - elsif ($status == SAME_MAPPING){ - #seq_region_mapping has not change, just add a new entry for this version - $sth_mapping_set->execute($mapping_set_id,$schema_build) unless $dry_run; - } - elsif ($status == NEW_MAPPING){ - $sth_mapping_set->execute($mapping_set_id,$schema_build) unless $dry_run; - #there has been a seq_region change between releases, add a new mapping_set and the relation old_seq_region_id->new_seq_region_id - $current_seq_region = &read_seq_region($dbh,$db_name->[0]); - $old_seq_region = &read_seq_region($old_dbh,$previous_dbname); - #update the seq_region_mapping table with the old->new seq_region_id relation - my $count = 0; - foreach my $seq_region_name (keys %{$old_seq_region}){ - next if (!defined $current_seq_region->{$seq_region_name}); #the seq_region might have disappeared - foreach my $coord_system_id (keys %{$old_seq_region->{$seq_region_name}}){ - next if (!defined $current_seq_region->{$seq_region_name}->{$coord_system_id}); #the coord_system might have been removed in current database - next if ($old_seq_region->{$seq_region_name}->{$coord_system_id} == $current_seq_region->{$seq_region_name}->{$coord_system_id}); # if no change no need to write out - $sth_seq_mapping->execute($old_seq_region->{$seq_region_name}->{$coord_system_id},$current_seq_region->{$seq_region_name}->{$coord_system_id},$mapping_set_id) unless $dry_run; - $count++; - } +# There has been a seq_region change between releases, add the relation old_seq_region_id->new_seq_region_id + my $current_seq_region = &read_seq_region($dbh,$current_dbname); + my $old_seq_region = &read_seq_region($old_dbh,$previous_dbname); + +# Update the seq_region_mapping table with the old->new seq_region_id relation + my $count = 0; + foreach my $seq_region_name (keys %{$old_seq_region}){ + my $current_name_hash = $current_seq_region->{$seq_region_name}; + my $old_name_hash = $old_seq_region->{$seq_region_name}; +# The seq_region might have disappeared + if (!defined $current_name_hash) { + next; + } + foreach my $coord_system_id (keys %{$old_name_hash}){ + my $current_cs_hash = $current_name_hash->{$coord_system_id}; + my $old_cs_hash = $old_name_hash->{$coord_system_id}; +# The coord_system might have been removed in the current database + if (!defined $current_cs_hash) { + next; + } + foreach my $length (keys %{$old_cs_hash}) { + my $current_id = $current_cs_hash->{$length}; + my $old_id = $old_cs_hash->{$length}; +# If no change, no need to write out + if (!defined $current_id || $old_id == $current_id) { + next; + } + $sth_seq_mapping->execute($old_id, $current_id, $mapping_set_id) unless $dry_run; + $count++; + } } - print STDERR "Added $count seq_region_mapping entry\n\n"; - } - else{ - throw("Mapping status not recognized by script: $status \n\n"); } + print STDERR "Added $count seq_region_mapping entry for $dbname\n\n"; } } -#will for a given database, will return the seq_region_name->seq_region_id relation +# For a given database, will return the seq_region_name->seq_region_id relation sub read_seq_region{ my $dbh = shift; my $dbname = shift; my %seq_region_hash; - my $seq_region_id; - my $seq_region_name; - my $coord_system_id; - my $sth = $dbh->prepare("SELECT seq_region_id, name, coord_system_id FROM $dbname.seq_region"); + my ($seq_region_id, $seq_region_name, $coord_system_id, $length, $cs_name); + my $sth = $dbh->prepare("SELECT seq_region_id, name, length, coord_system_id FROM $dbname.seq_region"); $sth->execute(); $sth->bind_col(1,\$seq_region_id); $sth->bind_col(2,\$seq_region_name); - $sth->bind_col(3,\$coord_system_id); + $sth->bind_col(3,\$length); + $sth->bind_col(4,\$coord_system_id); while ($sth->fetch){ - #there might be more than one assembly in the core database, thus we need the coord_system_id to remove ambiguity - $seq_region_hash{$seq_region_name}{$coord_system_id} = $seq_region_id; + $seq_region_hash{$seq_region_name}{$coord_system_id}{$length} = $seq_region_id; } return \%seq_region_hash; } -#method to check the status of the current core database: INITIAL_MAPPING, SAME_MAPPING and NEW_MAPPING are the possible states -sub mapping_status{ - my $dbh = shift; - my $old_dbh = shift; - my $dbname = shift; - my $mapping_set_id_ref = shift; - my $release = shift; - - my $sth_max_mapping = $dbh->prepare("select max(mapping_set_id) from $dbname.mapping_set"); - $sth_max_mapping->execute(); - ( $$mapping_set_id_ref ) = $sth_max_mapping->fetchrow_array(); - if (! $$mapping_set_id_ref){ - #the table is empty, first mapping - my $previous_dbname = &get_previous_dbname($old_dbh,$dbname,$release); - if ($previous_dbname) { - - my $old_sth_max_mapping = $old_dbh->prepare("select max(mapping_set_id) from $previous_dbname.mapping_set"); - $old_sth_max_mapping->execute(); - ( $$mapping_set_id_ref ) = $old_sth_max_mapping->fetchrow_array(); - if ($$mapping_set_id_ref) { - print (" There are mappings in the previous version of the database $dbname, need to do something with them ?\n"); - } - else { - print " Previous version of the $dbname has no entries, this is wrong\n"; - } - } - else { - print " This is the first version of a this database $dbname, so just creating a new entry\n"; - $$mapping_set_id_ref = 1; - } - return INITIAL_MAPPING; - } - else{ - #there is information, find out if it is the same mapping as previous release - my $previous_dbname = &get_previous_dbname($old_dbh,$dbname,$release); - if(!defined($previous_dbname)){ - print " No previous database present for $dbname so cannot do diff so will initialise with this as the first version of the database\n"; - $$mapping_set_id_ref = 1; - return INITIAL_MAPPING; - } - my $cur_seq_region_size = &get_seq_region_size($dbh,$dbname); - my $previous_seq_region_size = &get_seq_region_size($old_dbh,$previous_dbname); - if ($cur_seq_region_size == $previous_seq_region_size){ - #if both tables have same size, SAME_MAPPING - return SAME_MAPPING; - } - else{ - #if tables have different size, NEW_MAPPING - $$mapping_set_id_ref++; - return NEW_MAPPING; - } - } -} - -#for a given database, returns the size of the seq_region_table -sub get_seq_region_size{ +# For a given database, returns the size of the seq_region_table +sub get_seq_region_checksum{ my $dbh = shift; my $dbname = shift; - my $sth_status = $dbh->prepare("show table status from $dbname like 'seq_region'") ; + my $sth_status = $dbh->prepare("checksum table $dbname.seq_region") ; $sth_status->execute(); - my @table_status = $sth_status->fetchrow_array(); - return $table_status[6]; #return the size of the table + my $table_status = $sth_status->fetchrow_array(); + return $table_status; #return the size of the table } -#will return the max mapping_set_id being used in the mapping_set table +# Will return the max mapping_set_id being used in the mapping_set table sub get_max_mapping_set_id{ my $dbh = shift; my $dbname = shift; - my $sth_mapping = $dbh->prepare("select max(mapping_set_id) from mapping_set"); + my $sth_mapping = $dbh->prepare("select max(mapping_set_id) from $dbname.mapping_set"); $sth_mapping->execute(); my ($max_mapping_set_id) = $sth_mapping->fetchrow_array(); + if (!defined $max_mapping_set_id) { return 0; } return $max_mapping_set_id; } -#this method will return the name of the previous database to release for same species (assuming is present) +# This method will return the name of the previous database to release for same species (assuming is present) sub get_previous_dbname{ my $dbh = shift; my $dbname = shift; @@ -317,23 +259,22 @@ sub get_previous_dbname{ my $previous_dbname; $dbname =~ /(^[a-z]+_[a-z]+_core_)/; - + if (!$1) { throw("Database name $dbname is not in the right format"); } my $previous_release_name = $1 . (--$release); my $previous_sth = $dbh->prepare("show databases like \'%$previous_release_name%\'"); $previous_sth->execute(); ($previous_dbname) = $previous_sth->fetchrow_array() ; return $previous_dbname; - } -#for a standard ensembl database name, returns the release number and assembly +# For a standard ensembl database name, returns the release number and assembly sub get_schema_and_build{ my ($dbname) = @_; my @dbname = split/_/, $dbname; return join "_",$dbname[($#dbname -1)], $dbname[($#dbname)]; } - +# Returna the assembly name for a given database sub get_assembly{ my ($dbh, $dbname) = @_; my $sth = $dbh->prepare("select meta_value from $dbname.meta where meta_key = 'assembly.default'");