Skip to content
Snippets Groups Projects
Commit 545d3b05 authored by Magali Ruffier's avatar Magali Ruffier
Browse files

a bit of text formatting

methods are cleaner
added length check to decide if it is the same region
parent 923eea01
No related branches found
No related tags found
No related merge requests found
......@@ -32,7 +32,7 @@ Optional arguments:
--port=port port (default: 3306)
--oldport=port old database server port (default: 5306)
--oldport=port old database server port (default: 3306)
--olduser=user old database server username
......@@ -89,21 +89,17 @@ use DBI qw(:sql_types);
use Bio::EnsEMBL::Utils::Exception qw(throw);
use constant INITIAL_MAPPING => 1;
use constant SAME_MAPPING => 2;
use constant NEW_MAPPING => 3;
## Command line options
my $host = 'ens-staging';
my $host2 = 'ens-staging2';
my $oldhost = 'ensembldb.ensembl.org';
my $oldhost = 'ens-livemirror';
my $dbname = undef;
my $user = undef;
my $pass = undef;
my $port = 3306;
my $oldport = 5306;
my $olduser = "anonymous";
my $oldport = 3306;
my $olduser = "ensro";
my $oldpass = undef;
my $help = undef;
my $release = undef;
......@@ -132,8 +128,6 @@ my $old_dbh = DBI->connect("DBI:mysql:database=$database;host=$oldhost;port=$ol
foreach my $h ($host,$host2) {
my $dbh = DBI->connect("DBI:mysql:database=$database;host=$h;port=$port",$user,$pass);
my $status;
my $database_name;
#since there is no database defined, will run it agains all core databases
my $pattern;
......@@ -143,173 +137,121 @@ foreach my $h ($host,$host2) {
else{
$pattern = $dbname;
}
#fetch all databases matching the pattern
# print STDERR $pattern."\n";
my $sth = $dbh->prepare("SHOW DATABASES WHERE `database` REGEXP \'$pattern\'");
# Fetch all databases matching the pattern
my $sth = $dbh->prepare("SHOW DATABASES like \'%$pattern%\'");
$sth->execute();
my $dbs = $sth->fetchall_arrayref();
my $schema_build;
foreach my $db_name (@{$dbs}){
print STDERR "Going to update mapping for $db_name->[0]....\n";
my $mapping_set_id = 1;
my $sth_seq_mapping = $dbh->prepare("INSERT INTO $db_name->[0].seq_region_mapping VALUES(?,?,?)");
my $sth_mapping_set = $dbh->prepare("INSERT INTO $db_name->[0].mapping_set VALUES(?,?)");
$schema_build = get_schema_and_build($db_name->[0]);
my $current_dbname = $db_name->[0];
print STDERR "Going to update mapping for $current_dbname....\n";
my $mapping_set_id = get_max_mapping_set_id($dbh, $current_dbname) + 1;
my $sth_seq_mapping = $dbh->prepare("INSERT INTO $current_dbname.seq_region_mapping VALUES(?,?,?)");
my $sth_mapping_set = $dbh->prepare("INSERT INTO $current_dbname.mapping_set VALUES(?,?)");
my $schema_build = get_schema_and_build($current_dbname);
my $current_assembly = get_assembly($dbh, $current_dbname) ;
my $current_assembly = get_assembly($dbh, $db_name->[0]) ;
my $previous_dbname = &get_previous_dbname($old_dbh,$db_name->[0],$release);
$sth_mapping_set->execute($mapping_set_id,$schema_build) unless $dry_run;
# If there is no previous database, no mapping needed
my $previous_dbname = &get_previous_dbname($old_dbh,$current_dbname,$release);
if (!defined($previous_dbname)) {
$sth_mapping_set->execute($mapping_set_id, $schema_build) unless $dry_run;
next;
print STDERR "First instance known for $current_dbname, no mapping needed\n";
next;
}
my $old_assembly = get_assembly($old_dbh, $previous_dbname);
# If it is a new assembly, no mapping needed
my $old_assembly = get_assembly($old_dbh, $previous_dbname);
if ($old_assembly ne $current_assembly) {
$sth_mapping_set->execute($mapping_set_id, $schema_build) unless $dry_run;
print STDERR "New assembly for $current_dbname, no mapping needed\n" ;
next;
}
my $current_seq_region = (); # hash containing the relation seq_region_name->seq_region_id for the current database
my $old_seq_region = (); #hash containing the previous database relation seq_region_name->seq_region_id
$status = &mapping_status($dbh,$old_dbh, $db_name->[0],\$mapping_set_id,$release);
## prune newly added mapping_set entries
# $dbh->do(qq(delete from $db_name->[0].mapping_set where schema_build = '$schema_build'));
# next;
#add mapping_set information
if ($status == INITIAL_MAPPING){
#first time run the script, create new entry in mapping_set
$sth_mapping_set->execute($mapping_set_id,$schema_build) unless $dry_run;
# If there has been no change in seq_region, no mapping needed
my $cur_seq_region_checksum = &get_seq_region_checksum($dbh,$dbname);
my $previous_seq_region_checksum = &get_seq_region_checksum($old_dbh,$previous_dbname);
if ($cur_seq_region_checksum == $previous_seq_region_checksum) {
print STDERR "No change in seq_region for $current_dbname, no mapping needed\n";
next;
}
elsif ($status == SAME_MAPPING){
#seq_region_mapping has not change, just add a new entry for this version
$sth_mapping_set->execute($mapping_set_id,$schema_build) unless $dry_run;
}
elsif ($status == NEW_MAPPING){
$sth_mapping_set->execute($mapping_set_id,$schema_build) unless $dry_run;
#there has been a seq_region change between releases, add a new mapping_set and the relation old_seq_region_id->new_seq_region_id
$current_seq_region = &read_seq_region($dbh,$db_name->[0]);
$old_seq_region = &read_seq_region($old_dbh,$previous_dbname);
#update the seq_region_mapping table with the old->new seq_region_id relation
my $count = 0;
foreach my $seq_region_name (keys %{$old_seq_region}){
next if (!defined $current_seq_region->{$seq_region_name}); #the seq_region might have disappeared
foreach my $coord_system_id (keys %{$old_seq_region->{$seq_region_name}}){
next if (!defined $current_seq_region->{$seq_region_name}->{$coord_system_id}); #the coord_system might have been removed in current database
next if ($old_seq_region->{$seq_region_name}->{$coord_system_id} == $current_seq_region->{$seq_region_name}->{$coord_system_id}); # if no change no need to write out
$sth_seq_mapping->execute($old_seq_region->{$seq_region_name}->{$coord_system_id},$current_seq_region->{$seq_region_name}->{$coord_system_id},$mapping_set_id) unless $dry_run;
$count++;
}
# There has been a seq_region change between releases, add the relation old_seq_region_id->new_seq_region_id
my $current_seq_region = &read_seq_region($dbh,$current_dbname);
my $old_seq_region = &read_seq_region($old_dbh,$previous_dbname);
# Update the seq_region_mapping table with the old->new seq_region_id relation
my $count = 0;
foreach my $seq_region_name (keys %{$old_seq_region}){
my $current_name_hash = $current_seq_region->{$seq_region_name};
my $old_name_hash = $old_seq_region->{$seq_region_name};
# The seq_region might have disappeared
if (!defined $current_name_hash) {
next;
}
foreach my $coord_system_id (keys %{$old_name_hash}){
my $current_cs_hash = $current_name_hash->{$coord_system_id};
my $old_cs_hash = $old_name_hash->{$coord_system_id};
# The coord_system might have been removed in the current database
if (!defined $current_cs_hash) {
next;
}
foreach my $length (keys %{$old_cs_hash}) {
my $current_id = $current_cs_hash->{$length};
my $old_id = $old_cs_hash->{$length};
# If no change, no need to write out
if (!defined $current_id || $old_id == $current_id) {
next;
}
$sth_seq_mapping->execute($old_id, $current_id, $mapping_set_id) unless $dry_run;
$count++;
}
}
print STDERR "Added $count seq_region_mapping entry\n\n";
}
else{
throw("Mapping status not recognized by script: $status \n\n");
}
print STDERR "Added $count seq_region_mapping entry for $dbname\n\n";
}
}
#will for a given database, will return the seq_region_name->seq_region_id relation
# For a given database, will return the seq_region_name->seq_region_id relation
sub read_seq_region{
my $dbh = shift;
my $dbname = shift;
my %seq_region_hash;
my $seq_region_id;
my $seq_region_name;
my $coord_system_id;
my $sth = $dbh->prepare("SELECT seq_region_id, name, coord_system_id FROM $dbname.seq_region");
my ($seq_region_id, $seq_region_name, $coord_system_id, $length, $cs_name);
my $sth = $dbh->prepare("SELECT seq_region_id, name, length, coord_system_id FROM $dbname.seq_region");
$sth->execute();
$sth->bind_col(1,\$seq_region_id);
$sth->bind_col(2,\$seq_region_name);
$sth->bind_col(3,\$coord_system_id);
$sth->bind_col(3,\$length);
$sth->bind_col(4,\$coord_system_id);
while ($sth->fetch){
#there might be more than one assembly in the core database, thus we need the coord_system_id to remove ambiguity
$seq_region_hash{$seq_region_name}{$coord_system_id} = $seq_region_id;
$seq_region_hash{$seq_region_name}{$coord_system_id}{$length} = $seq_region_id;
}
return \%seq_region_hash;
}
#method to check the status of the current core database: INITIAL_MAPPING, SAME_MAPPING and NEW_MAPPING are the possible states
sub mapping_status{
my $dbh = shift;
my $old_dbh = shift;
my $dbname = shift;
my $mapping_set_id_ref = shift;
my $release = shift;
my $sth_max_mapping = $dbh->prepare("select max(mapping_set_id) from $dbname.mapping_set");
$sth_max_mapping->execute();
( $$mapping_set_id_ref ) = $sth_max_mapping->fetchrow_array();
if (! $$mapping_set_id_ref){
#the table is empty, first mapping
my $previous_dbname = &get_previous_dbname($old_dbh,$dbname,$release);
if ($previous_dbname) {
my $old_sth_max_mapping = $old_dbh->prepare("select max(mapping_set_id) from $previous_dbname.mapping_set");
$old_sth_max_mapping->execute();
( $$mapping_set_id_ref ) = $old_sth_max_mapping->fetchrow_array();
if ($$mapping_set_id_ref) {
print (" There are mappings in the previous version of the database $dbname, need to do something with them ?\n");
}
else {
print " Previous version of the $dbname has no entries, this is wrong\n";
}
}
else {
print " This is the first version of a this database $dbname, so just creating a new entry\n";
$$mapping_set_id_ref = 1;
}
return INITIAL_MAPPING;
}
else{
#there is information, find out if it is the same mapping as previous release
my $previous_dbname = &get_previous_dbname($old_dbh,$dbname,$release);
if(!defined($previous_dbname)){
print " No previous database present for $dbname so cannot do diff so will initialise with this as the first version of the database\n";
$$mapping_set_id_ref = 1;
return INITIAL_MAPPING;
}
my $cur_seq_region_size = &get_seq_region_size($dbh,$dbname);
my $previous_seq_region_size = &get_seq_region_size($old_dbh,$previous_dbname);
if ($cur_seq_region_size == $previous_seq_region_size){
#if both tables have same size, SAME_MAPPING
return SAME_MAPPING;
}
else{
#if tables have different size, NEW_MAPPING
$$mapping_set_id_ref++;
return NEW_MAPPING;
}
}
}
#for a given database, returns the size of the seq_region_table
sub get_seq_region_size{
# For a given database, returns the size of the seq_region_table
sub get_seq_region_checksum{
my $dbh = shift;
my $dbname = shift;
my $sth_status = $dbh->prepare("show table status from $dbname like 'seq_region'") ;
my $sth_status = $dbh->prepare("checksum table $dbname.seq_region") ;
$sth_status->execute();
my @table_status = $sth_status->fetchrow_array();
return $table_status[6]; #return the size of the table
my $table_status = $sth_status->fetchrow_array();
return $table_status; #return the size of the table
}
#will return the max mapping_set_id being used in the mapping_set table
# Will return the max mapping_set_id being used in the mapping_set table
sub get_max_mapping_set_id{
my $dbh = shift;
my $dbname = shift;
my $sth_mapping = $dbh->prepare("select max(mapping_set_id) from mapping_set");
my $sth_mapping = $dbh->prepare("select max(mapping_set_id) from $dbname.mapping_set");
$sth_mapping->execute();
my ($max_mapping_set_id) = $sth_mapping->fetchrow_array();
if (!defined $max_mapping_set_id) { return 0; }
return $max_mapping_set_id;
}
#this method will return the name of the previous database to release for same species (assuming is present)
# This method will return the name of the previous database to release for same species (assuming is present)
sub get_previous_dbname{
my $dbh = shift;
my $dbname = shift;
......@@ -317,23 +259,22 @@ sub get_previous_dbname{
my $previous_dbname;
$dbname =~ /(^[a-z]+_[a-z]+_core_)/;
if (!$1) { throw("Database name $dbname is not in the right format"); }
my $previous_release_name = $1 . (--$release);
my $previous_sth = $dbh->prepare("show databases like \'%$previous_release_name%\'");
$previous_sth->execute();
($previous_dbname) = $previous_sth->fetchrow_array() ;
return $previous_dbname;
}
#for a standard ensembl database name, returns the release number and assembly
# For a standard ensembl database name, returns the release number and assembly
sub get_schema_and_build{
my ($dbname) = @_;
my @dbname = split/_/, $dbname;
return join "_",$dbname[($#dbname -1)], $dbname[($#dbname)];
}
# Returna the assembly name for a given database
sub get_assembly{
my ($dbh, $dbname) = @_;
my $sth = $dbh->prepare("select meta_value from $dbname.meta where meta_key = 'assembly.default'");
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment