Commit f9bd2451 authored by Steve Trevanion's avatar Steve Trevanion
Browse files

changes to get it to do what I think it should do - 1) staging and staging2,...

changes to get it to do what I think it should do - 1) staging and staging2, 2) add an initial mapping_set entry if this is a new species, 3) warn if there is no mapping_set entry but this is not a new species - in this case I think the genebuilders should be adding the old one, since this is the scenario we want to catch! (need to check the assembly hasn't changed), 4) dry_run, i.e. preview mode
parent fa1fff85
......@@ -95,7 +95,8 @@ use constant NEW_MAPPING => 3;
## Command line options
my $host = 'ens-staging';
my $host = 'ens-staging';
my $host2 = 'ens-staging2';
my $oldhost = 'ensembldb.ensembl.org';
my $dbname = undef;
my $user = undef;
......@@ -106,6 +107,7 @@ my $olduser = "anonymous";
my $oldpass = undef;
my $help = undef;
my $release = undef;
my $dry_run = undef;
GetOptions('host=s' => \$host,
'dbname=s' => \$dbname,
......@@ -117,7 +119,8 @@ GetOptions('host=s' => \$host,
'oldport=s' => \$oldport,
'olduser=s' => \$olduser,
'oldpass=s' => \$oldpass,
'help' => \$help
'help' => \$help,
'dry_run' => \$dry_run,
);
pod2usage(1) if($help);
......@@ -126,99 +129,94 @@ throw("--pass argument required") if (!defined($pass));
throw("--release argument required") if(!defined($release));
my $database = 'information_schema';
my $dbh = DBI->connect("DBI:mysql:database=$database;host=$host;port=$port",$user,$pass);
my $old_dbh = DBI->connect("DBI:mysql:database=$database;host=$oldhost;port=$oldport",$olduser,$oldpass);
my $status;
my $database_name;
#since there is no database defined, will run it against all core databases
my $pattern;
if (!defined ($dbname)){
foreach my $h ($host,$host2) {
my $dbh = DBI->connect("DBI:mysql:database=$database;host=$h;port=$port",$user,$pass);
my $status;
my $database_name;
#since there is no database defined, will run it against all core databases
my $pattern;
if (!defined ($dbname)){
$pattern = "_core_".$release."_";
}
else{
$pattern = $dbname;
}
#fetch all databases matching the pattern
print STDERR $pattern."\n";
my $sth = $dbh->prepare("SHOW DATABASES WHERE `database` REGEXP \'$pattern\'");
$sth->execute();
my $dbs = $sth->fetchall_arrayref();
my $schema_build;
foreach my $db_name (@{$dbs}){
print STDERR "Going to update mapping for $db_name->[0]....\n";
my $mapping_set_id;
my $current_seq_region = (); # hash containing the relation seq_region_name->seq_region_id for the current database
my $old_seq_region = (); #hash containing the previous database relation seq_region_name->seq_region_id
my $sth_seq_mapping = $dbh->prepare("INSERT INTO $db_name->[0].seq_region_mapping VALUES(?,?,?)");
my $sth_mapping_set = $dbh->prepare("INSERT INTO $db_name->[0].mapping_set VALUES(?,?)");
$status = &mapping_status($dbh,$old_dbh, $db_name->[0],\$mapping_set_id,$release);
$schema_build = get_schema_and_build($db_name->[0]);
#add mapping_set information
if ($status == INITIAL_MAPPING){
$sth_mapping_set->execute($mapping_set_id,$schema_build);
#first time run the script, create new entry in mapping_set and copy seq_region entries in seq_region_mapping
############
#Actually NO only store the differences so for the initial one it is NONE.
############
# $current_seq_region = &read_seq_region($dbh,$db_name->[0]);
# #copy the seq_region_id in the seq_region_mapping
# foreach my $seq_region_name (keys %{$current_seq_region}){
# #when copying there won't be any ambiguity with coord_systems
# foreach my $region_id (values %{$current_seq_region->{$seq_region_name}}){
# $sth_seq_mapping->execute($region_id,$region_id,$mapping_set_id);
# }
# }
}
elsif ($status == SAME_MAPPING){
#seq_region_mapping has not change, nothing to do for the moment....
else{
$pattern = $dbname;
}
elsif ($status == NEW_MAPPING){
$sth_mapping_set->execute($mapping_set_id,$schema_build);
#there has been a seq_region change between releases, add a new mapping_set and the relation old_seq_region_id->new_seq_region_id
my $previous_dbname = &get_previous_dbname($old_dbh,$db_name->[0],$release);
$current_seq_region = &read_seq_region($dbh,$db_name->[0]);
$old_seq_region = &read_seq_region($old_dbh,$previous_dbname);
#update the seq_region_mapping table with the old->new seq_region_id relation
my $count = 0;
foreach my $seq_region_name (keys %{$old_seq_region}){
next if (!defined $current_seq_region->{$seq_region_name}); #the seq_region might have disappeared
foreach my $coord_system_id (keys %{$old_seq_region->{$seq_region_name}}){
next if (!defined $current_seq_region->{$seq_region_name}->{$coord_system_id}); #the coord_system might have been removed in current database
next if ($old_seq_region->{$seq_region_name}->{$coord_system_id} == $current_seq_region->{$seq_region_name}->{$coord_system_id}); # if no change no need to write out
$sth_seq_mapping->execute($old_seq_region->{$seq_region_name}->{$coord_system_id},$current_seq_region->{$seq_region_name}->{$coord_system_id},$mapping_set_id);
$count++;
#fetch all databases matching the pattern
# print STDERR $pattern."\n";
my $sth = $dbh->prepare("SHOW DATABASES WHERE `database` REGEXP \'$pattern\'");
$sth->execute();
my $dbs = $sth->fetchall_arrayref();
my $schema_build;
foreach my $db_name (@{$dbs}){
print STDERR "Going to update mapping for $db_name->[0]....\n";
my $mapping_set_id;
my $current_seq_region = (); # hash containing the relation seq_region_name->seq_region_id for the current database
my $old_seq_region = (); #hash containing the previous database relation seq_region_name->seq_region_id
my $sth_seq_mapping = $dbh->prepare("INSERT INTO $db_name->[0].seq_region_mapping VALUES(?,?,?)");
my $sth_mapping_set = $dbh->prepare("INSERT INTO $db_name->[0].mapping_set VALUES(?,?)");
$status = &mapping_status($dbh,$old_dbh, $db_name->[0],\$mapping_set_id,$release);
$schema_build = get_schema_and_build($db_name->[0]);
## prune newly added mapping_set entries
# $dbh->do(qq(delete from $db_name->[0].mapping_set where schema_build = '$schema_build'));
# next;
#add mapping_set information
if ($status == INITIAL_MAPPING){
#first time run the script, create new entry in mapping_set
$sth_mapping_set->execute($mapping_set_id,$schema_build) unless $dry_run;
}
elsif ($status == SAME_MAPPING){
#seq_region_mapping has not changed, just add a new entry for this version
$sth_mapping_set->execute($mapping_set_id,$schema_build) unless $dry_run;
}
elsif ($status == NEW_MAPPING){
$sth_mapping_set->execute($mapping_set_id,$schema_build) unless $dry_run;
#there has been a seq_region change between releases, add a new mapping_set and the relation old_seq_region_id->new_seq_region_id
my $previous_dbname = &get_previous_dbname($old_dbh,$db_name->[0],$release);
$current_seq_region = &read_seq_region($dbh,$db_name->[0]);
$old_seq_region = &read_seq_region($old_dbh,$previous_dbname);
#update the seq_region_mapping table with the old->new seq_region_id relation
my $count = 0;
foreach my $seq_region_name (keys %{$old_seq_region}){
next if (!defined $current_seq_region->{$seq_region_name}); #the seq_region might have disappeared
foreach my $coord_system_id (keys %{$old_seq_region->{$seq_region_name}}){
next if (!defined $current_seq_region->{$seq_region_name}->{$coord_system_id}); #the coord_system might have been removed in current database
next if ($old_seq_region->{$seq_region_name}->{$coord_system_id} == $current_seq_region->{$seq_region_name}->{$coord_system_id}); # if no change no need to write out
$sth_seq_mapping->execute($old_seq_region->{$seq_region_name}->{$coord_system_id},$current_seq_region->{$seq_region_name}->{$coord_system_id},$mapping_set_id) unless $dry_run;
$count++;
}
}
print STDERR "Added $count seq_region_mapping entry\n\n";
}
else{
throw("Mapping status not recognized by script: $status \n\n");
}
print STDERR "Added $count seq_region_mapping entry\n\n";
}
else{
throw("Mapping status not recognized by script: $status \n\n");
}
}
#for a given database, returns the seq_region_name->coord_system_id->seq_region_id relation
# Read the seq_region table of one database into a nested lookup.
#
# Arguments:
#   $dbh    - DBI database handle connected to the server hosting $dbname
#   $dbname - name of the database whose seq_region table is read
#
# Returns a hash reference of the form
#   { seq_region_name => { coord_system_id => seq_region_id } }
# (an empty hash reference when the table has no rows).
#
# Throws if the query cannot be prepared or executed, so that a failed read
# is never mistaken for an empty seq_region table.
sub read_seq_region{
    my $dbh    = shift;
    my $dbname = shift;

    my %seq_region_hash;

    my $sth = $dbh->prepare("SELECT seq_region_id, name, coord_system_id FROM $dbname.seq_region")
        or throw("Could not prepare seq_region query for $dbname: " . $dbh->errstr);
    $sth->execute()
        or throw("Could not execute seq_region query for $dbname: " . $sth->errstr);

    # Bind one scalar per selected column; each fetch refreshes them in place.
    my ($seq_region_id, $seq_region_name, $coord_system_id);
    $sth->bind_col(1, \$seq_region_id);
    $sth->bind_col(2, \$seq_region_name);
    $sth->bind_col(3, \$coord_system_id);

    while ($sth->fetch){
        #there might be more than one assembly in the core database, thus we need the coord_system_id to remove ambiguity
        $seq_region_hash{$seq_region_name}{$coord_system_id} = $seq_region_id;
    }

    return \%seq_region_hash;
}
#method to check the status of the current core database: INITIAL_MAPPING, SAME_MAPPING and NEW_MAPPING are the possible states
......@@ -229,35 +227,49 @@ sub mapping_status{
my $mapping_set_id_ref = shift;
my $release = shift;
# my $sth_max_mapping = $dbh->prepare("select max(mapping_set_id) from $dbname.mapping_set");
# $sth_max_mapping->execute();
# ( $$mapping_set_id_ref ) = $sth_max_mapping->fetchrow_array();
# if (! $$mapping_set_id_ref){
# #the table is empty, first mapping
# $$mapping_set_id_ref = 1;
# return INITIAL_MAPPING;
# }
# else{
#there is information, find out if it is the same mapping as previous release
my $previous_dbname = &get_previous_dbname($old_dbh,$dbname,$release);
if(!defined($previous_dbname)){
print "No previous database present for $dbname so cannot do diff so will initialise with this as the first version of the database\n";
$$mapping_set_id_ref = 1;
my $sth_max_mapping = $dbh->prepare("select max(mapping_set_id) from $dbname.mapping_set");
$sth_max_mapping->execute();
( $$mapping_set_id_ref ) = $sth_max_mapping->fetchrow_array();
if (! $$mapping_set_id_ref){
#the table is empty, first mapping
my $previous_dbname = &get_previous_dbname($old_dbh,$dbname,$release);
if ($previous_dbname) {
my $old_sth_max_mapping = $old_dbh->prepare("select max(mapping_set_id) from $previous_dbname.mapping_set");
$old_sth_max_mapping->execute();
( $$mapping_set_id_ref ) = $sth_max_mapping->fetchrow_array();
if ($$mapping_set_id_ref) {
print (" There are mappings in the previous version of the database, need to do something with them ?\n");
}
else {
print " Previous version of the database has no entries, this is wrong\n";
}
}
else {
print " This is the first version of a this species, so just creating a new entry\n";
$$mapping_set_id_ref = 1;
}
return INITIAL_MAPPING;
}
my $cur_seq_region_size = &get_seq_region_size($dbh,$dbname);
my $previous_seq_region_size = &get_seq_region_size($old_dbh,$previous_dbname);
if ($cur_seq_region_size == $previous_seq_region_size){
#if both tables have same size, SAME_MAPPING
return SAME_MAPPING;
}
else{
#if tables have different size, NEW_MAPPING
$$mapping_set_id_ref++;
return NEW_MAPPING;
}
#}
#there is information, find out if it is the same mapping as previous release
my $previous_dbname = &get_previous_dbname($old_dbh,$dbname,$release);
if(!defined($previous_dbname)){
print " No previous database present for $dbname so cannot do diff so will initialise with this as the first version of the database\n";
$$mapping_set_id_ref = 1;
return INITIAL_MAPPING;
}
my $cur_seq_region_size = &get_seq_region_size($dbh,$dbname);
my $previous_seq_region_size = &get_seq_region_size($old_dbh,$previous_dbname);
if ($cur_seq_region_size == $previous_seq_region_size){
#if both tables have same size, SAME_MAPPING
return SAME_MAPPING;
}
else{
#if tables have different size, NEW_MAPPING
$$mapping_set_id_ref++;
return NEW_MAPPING;
}
}
}
#for a given database, returns the size of the seq_region_table
......@@ -286,13 +298,14 @@ sub get_previous_dbname{
my $dbh = shift;
my $dbname = shift;
my $release = shift;
my $previous_dbname;
$dbname =~ /(^[a-z]+_[a-z]+_core_)/;
my $previous_release_name = $1 . (--$release);
my $previous_sth = $dbh->prepare("show databases like \'%$previous_release_name%\'");
$previous_sth->execute();
my ($previous_dbname) = $previous_sth->fetchrow_array();
($previous_dbname) = $previous_sth->fetchrow_array() ;
return $previous_dbname;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment