update_mapping_set.pl 10.5 KB
Newer Older
1
#!/usr/local/ensembl/bin/perl
2 3 4

=head1 NAME

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
5 6 7 8
update_mapping_set.pl   - script to update 2 tables: mapping_set and
                          seq_region_id mapping.  These tables are
                          supposed to store different mapping of
                          seq_region_id between releases
9 10 11 12 13 14 15

=head1 SYNOPSIS

update_mapping_set.pl [arguments]

Required arguments:

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
16
  --user=user             username for the database
17

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
18
  --pass=pass             password for the database
19

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
20
  --release=num_release   release we want to run the updating
21 22 23

Optional arguments:

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
24 25
  --host=host             server where the core databases are stored
                          (default: ens-staging)
26

27 28
  --oldhost = oldhost   server where the old release databases are stored

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
29 30 31
  --dbname=dbname         if you want a single database to update
                          the mapping_set information (default: all
                          databases)
32

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
33
  --port=port             port (default: 3306)
34

35 36 37 38 39 40
  --oldport=port          old database server port (default: 5306)

  --olduser=user          old database server username

  --oldpass=pass          password for old database server

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
41
  --help                  print help (this message)
42 43 44

=head1 DESCRIPTION

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
45 46 47 48 49 50 51 52 53 54
This script will update the mapping_set table with the current
schema_build information and indicate if the seq_region table has
changed from previous release.

If it has, the table seq_region_mapping will contain the
relation between old (external_seq_region_id) and the current
(internal_seq_region_id).

If it hasn't, a single entry in the mapping_set table with the same
mapping_set_id as the previous release will be stored.
55 56 57

=head1 EXAMPLES

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
58
Update mapping_set information for all databases in ens-staging in
59
release NN (the usual use case in release process):
60

61
  $ ./update_mapping_set.pl --user ensadmin \
62
    --pass password --release NN --old_host ensembldb-ensembl.org
63

64
Update mapping_set information only for pig database in ens-genomics1:
65

66 67
  $ ./update_mapping_set.pl --host ens-genomics1 \
    --user ensadmin --pass password --dbname my_pig_db --release 52
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98

=head1 LICENCE

This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.

=head1 AUTHOR

Daniel Rios <dani@ebi.ac.uk>, Ensembl core API team

=head1 CONTACT

=cut

use strict;
use warnings;

use Getopt::Long;
use Pod::Usage;

use DBI qw(:sql_types);

use Bio::EnsEMBL::Utils::Exception qw(throw);

use constant INITIAL_MAPPING => 1;
use constant SAME_MAPPING => 2;
use constant NEW_MAPPING => 3;

## Command line options

my $host = 'ens-staging';
99
my $oldhost = 'ensembldb.ensembl.org';
100 101 102 103
my $dbname = undef;
my $user = undef;
my $pass = undef;
my $port = 3306;
104 105 106
my $oldport = 5306;
my $olduser = "anonymous";
my $oldpass = undef;
107 108 109 110 111 112 113 114 115
my $help = undef;
my $release = undef;

GetOptions('host=s'    => \$host,
	   'dbname=s'  => \$dbname,
	   'user=s'    => \$user,
	   'pass=s'    => \$pass,
	   'port=s'    => \$port,
           'release=i' => \$release,
116 117 118 119 120
	   'oldhost=s' => \$oldhost,
	   'oldport=s' => \$oldport,
	   'olduser=s' => \$olduser,
	   'oldpass=s' => \$oldpass,
	   'help'      => \$help
121 122 123 124 125 126 127 128 129
	   );

pod2usage(1) if($help);
throw("--user argument required") if (!defined($user));
throw("--pass argument required") if (!defined($pass));
throw("--release argument required") if(!defined($release));

my $database = 'information_schema';
my $dbh = DBI->connect("DBI:mysql:database=$database;host=$host;port=$port",$user,$pass);
130
my $old_dbh =  DBI->connect("DBI:mysql:database=$database;host=$oldhost;port=$oldport",$olduser,$oldpass);
131 132 133 134 135 136
my $status;
my $database_name;

#since there is no database defined, will run it agains all core databases
my $pattern;
if (!defined ($dbname)){
137
    $pattern = "_core_".$release."_";
138 139 140 141 142
}
else{
$pattern = $dbname;
}
#fetch all databases matching the pattern
143
print STDERR $pattern."\n";
144 145 146 147 148
my $sth = $dbh->prepare("SHOW DATABASES WHERE `database` REGEXP \'$pattern\'");
$sth->execute();
my $dbs = $sth->fetchall_arrayref();
my $schema_build;
foreach my $db_name (@{$dbs}){
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201
  print STDERR "Going to update mapping for $db_name->[0]....\n";
  my $mapping_set_id;
  my $current_seq_region = (); # hash containing the relation seq_region_name->seq_region_id for the current database
  my $old_seq_region = (); #hash containing the previous database relation seq_region_name->seq_region_id
  my $sth_seq_mapping = $dbh->prepare("INSERT INTO $db_name->[0].seq_region_mapping VALUES(?,?,?)");
  my $sth_mapping_set = $dbh->prepare("INSERT INTO $db_name->[0].mapping_set VALUES(?,?)");
  $status = &mapping_status($dbh,$old_dbh, $db_name->[0],\$mapping_set_id,$release);
  $schema_build = get_schema_and_build($db_name->[0]);

  #add mapping_set information
  if ($status == INITIAL_MAPPING){
    $sth_mapping_set->execute($mapping_set_id,$schema_build);
    #first time run the script, create new entry in mapping_set and copy seq_region entries in seq_region_mapping

    ############
    #Actually NO only store the differences so for the initial one it is NONE.
    ############

    #        $current_seq_region =  &read_seq_region($dbh,$db_name->[0]);
    #        #copy the seq_region_id in the seq_region_mapping
    #        foreach my $seq_region_name (keys %{$current_seq_region}){
    #            #when copying there won't be any ambiguity with coord_systems
    #            foreach my $region_id (values %{$current_seq_region->{$seq_region_name}}){
    #                $sth_seq_mapping->execute($region_id,$region_id,$mapping_set_id);
    #            }
    #     }
  }
  elsif ($status == SAME_MAPPING){
    #seq_region_mapping has not change, nothing to do for the moment....

  }
  elsif ($status == NEW_MAPPING){
    $sth_mapping_set->execute($mapping_set_id,$schema_build);
    #there has been a seq_region change between releases, add a new mapping_set and the relation old_seq_region_id->new_seq_region_id
    my $previous_dbname = &get_previous_dbname($old_dbh,$db_name->[0],$release);
    $current_seq_region =  &read_seq_region($dbh,$db_name->[0]);
    $old_seq_region = &read_seq_region($old_dbh,$previous_dbname);
    #update the seq_region_mapping table with the old->new seq_region_id relation
    my $count = 0;
    foreach my $seq_region_name (keys %{$old_seq_region}){
      next if (!defined $current_seq_region->{$seq_region_name}); #the seq_region might have disappeared
      foreach my $coord_system_id (keys %{$old_seq_region->{$seq_region_name}}){
	next if (!defined $current_seq_region->{$seq_region_name}->{$coord_system_id}); #the coord_system might have been removed in current database
	next if ($old_seq_region->{$seq_region_name}->{$coord_system_id} == $current_seq_region->{$seq_region_name}->{$coord_system_id}); # if no change no need to write out
	$sth_seq_mapping->execute($old_seq_region->{$seq_region_name}->{$coord_system_id},$current_seq_region->{$seq_region_name}->{$coord_system_id},$mapping_set_id);
	$count++;
      }
    }
    print STDERR "Added $count seq_region_mapping entry\n\n";
  }
  else{
    throw("Mapping status not recognized by script: $status \n\n");
  }
202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
}

#will for a given database, will return the seq_region_name->seq_region_id relation
sub read_seq_region{
    my $dbh = shift;
    my $dbname = shift;
    my %seq_region_hash;
    my $seq_region_id;
    my $seq_region_name;
    my $coord_system_id;
    my $sth = $dbh->prepare("SELECT seq_region_id, name, coord_system_id FROM $dbname.seq_region");
    $sth->execute();
    $sth->bind_col(1,\$seq_region_id);
    $sth->bind_col(2,\$seq_region_name);
    $sth->bind_col(3,\$coord_system_id);
    while ($sth->fetch){
	#there might be more than one assembly in the core database, thus we need the coord_system_id to remove ambiguity
	$seq_region_hash{$seq_region_name}{$coord_system_id} = $seq_region_id;
    }
    return \%seq_region_hash;
}

#method to check the status of the current core database: INITIAL_MAPPING, SAME_MAPPING and NEW_MAPPING are the possible states
sub mapping_status{
226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
  my $dbh = shift;
  my $old_dbh = shift;
  my $dbname = shift;
  my $mapping_set_id_ref = shift;
  my $release = shift;
  
  #    my $sth_max_mapping = $dbh->prepare("select max(mapping_set_id) from $dbname.mapping_set");
  #    $sth_max_mapping->execute();
  #    ( $$mapping_set_id_ref ) = $sth_max_mapping->fetchrow_array();
  #    if (! $$mapping_set_id_ref){
  #	#the table is empty, first mapping
  #	$$mapping_set_id_ref = 1;
  #	return INITIAL_MAPPING;
  #    }
  #    else{
  #there is information, find out if it is the same mapping as previous release
  
  my $previous_dbname = &get_previous_dbname($old_dbh,$dbname,$release);
  if(!defined($previous_dbname)){
    print "No previous database present for $dbname so cannot do diff so will initialise with this as the first version of the database\n";
    $$mapping_set_id_ref = 1;
    return INITIAL_MAPPING;
  }
  my $cur_seq_region_size = &get_seq_region_size($dbh,$dbname);
  my $previous_seq_region_size = &get_seq_region_size($old_dbh,$previous_dbname);
  if ($cur_seq_region_size == $previous_seq_region_size){
    #if both tables have same size, SAME_MAPPING
    return SAME_MAPPING;
  }
  else{
    #if tables have different size, NEW_MAPPING
    $$mapping_set_id_ref++;
    return NEW_MAPPING;
  }	
  #}
261 262 263 264 265 266
}

#for a given database, returns the size of the seq_region_table
sub get_seq_region_size{
    my $dbh = shift;
    my $dbname = shift;
267
    my $sth_status = $dbh->prepare("show table status from $dbname like 'seq_region'") ;
268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
    $sth_status->execute();
    my @table_status = $sth_status->fetchrow_array();
    return $table_status[6]; #return the size of the table
}

#will return the max mapping_set_id being used in the mapping_set table
sub get_max_mapping_set_id{
    my $dbh = shift;
    my $dbname = shift;

    my $sth_mapping = $dbh->prepare("select max(mapping_set_id) from mapping_set");
    $sth_mapping->execute();
    my ($max_mapping_set_id) = $sth_mapping->fetchrow_array();
    return $max_mapping_set_id;
}

#this method will return the name of the previous database to release for same species (assuming is present)
sub get_previous_dbname{
    my $dbh = shift;
    my $dbname = shift;
    my $release = shift;

    $dbname =~ /(^[a-z]+_[a-z]+_core_)/;

    my $previous_release_name = $1 . (--$release);
    my $previous_sth = $dbh->prepare("show databases like \'%$previous_release_name%\'");
    $previous_sth->execute();
    my ($previous_dbname) = $previous_sth->fetchrow_array();
    return $previous_dbname;
    
}

#for a standard ensembl database name, returns the release number and assembly
sub get_schema_and_build{
  my ($dbname) = @_;
  my @dbname = split/_/, $dbname;
  return join "_",$dbname[($#dbname -1)], $dbname[($#dbname)];
}