build.pl: Program for building binary delta between two database revisions.

apply.pl: Program for applying above delta. ... plus README files.

build.pl: Program for building binary delta between two database revisions.
apply.pl: Program for applying above delta. ... plus README files.
4e7b9ad1 · Andreas Kusalananda Kähäri · 6d953f48 · 4e7b9ad1 · 4e7b9ad1 · 4e7b9ad1
Commit 4e7b9ad1 authored 21 years ago by Andreas Kusalananda Kähäri
--- a/misc-scripts/binary_delta/apply.README
+++ b/misc-scripts/binary_delta/apply.README
+$Id$
+Andreas Kähäri, andreas.kahari@ebi.ac.uk
+
+    ======================================================================
+                                About "apply.pl"
+    ======================================================================
+
+    The apply.pl program is a Perl script that will run on any Unix system
+    with a Perl interpreter installed (along with some common non-standard
+    Perl modules).  It also makes use of the xdelta program (more on this
+    below).
+
+    The program will patch an older  release of an Ensembl database into a
+    newer release by applying binary "delta files" created by the build.pl
+    program (discussed elsewhere).  The delta files are applied to the raw
+    MySQL database files  and will thus incorporate any  schema changes as
+    well as data changes.  It is hoped that the process of downloading the
+    delta files and applying them to  the older release of the database on
+    an external  site will be  much quicker than downloading  the complete
+    new release.
+
+    Given a directory  of delta files created by build.pl  and a directory
+    containing  the correct  and  untouched old  revision  of a  database,
+    apply.pl  will create  a new  directory and populate it  with the  new
+    revision of the database.
+
+
+Requirements / Configuration
+
+    To work,  apply.pl needs the  following components, which  are usually
+    not part of your every-day Unix base-system.
+
+    1. The xdelta program (version 1.1.3, not version 2),
+       http://sourceforge.net/projects/xdelta/
+
+    2. The following Perl modules, some available as standard modules,
+       others available from CPAN at http://www.cpan.org/
+
+       * Compress::Zlib
+       * Cwd
+       * Digest::MD5
+       * File::Basename
+       * File::Copy
+       * Getopt::Std
+
+    Check your  distribution CDs  before downloading and  installing these
+    prerequisites from the web.
+
+
+Usage
+
+    Running  the  apply.pl program  without  any  arguments generates  the
+    following informational text (or something very similar to it):
+
+        Usage:  ./apply.pl [options] [--] database old_v new_v
+
+        database    The database to work on, e.g. "homo_sapiens_core".
+        old_v       The older version, e.g. "11_31".
+        new_v       The newer version, e.g. "12_31".
+
+        The options may be any of these:
+
+        -c cmd  Path to xdelta executable.
+                Default: "xdelta".
+        -s path Path to the directory where the delta directory is stored.
+                Default: "."
+        -d path Path to the directory holding the old version of the
+                database, and where the new version of the database
+                should be created.
+                Default: "."
+
+    Assuming the  current directory  holds a sub-directory  containing the
+    11_31  release of  e.g.  the homo_sapiens_core  Ensembl database,  and
+    another sub-directory  containing the  delta files, the  12_31 release
+    may be created by doing this:
+
+        ./apply.pl homo_sapiens_core 11_31 12_31 | tee apply.out
+
+    Note  that  the  non-optional  arguments   are  exactly  the  same  as
+    those  used  with build.pl  to  create  the  delta files.   The  delta
+    files  for   this  example  are   assumed  to  be  available   in  the
+    homo_sapiens_core_11_31_delta_12_31 directory.
+
+    The  apply.pl program  will  verify  the MD5  checksums  of all  files
+    involved in the  patch, including the delta files.   The patching will
+    fail if any checksum fails.  This means that databases modified on the
+    external site can not be updated in this way.
+
+
+
+vim: et
--- a/misc-scripts/binary_delta/apply.pl
+++ b/misc-scripts/binary_delta/apply.pl
+#!/usr/bin/perl -w
+
+# $Id$
+#
+# apply.pl
+#
+# A program that uses a previously created set of binary delta
+# files to produce a new revision of an ensembl database out of
+# an older revision of the same database.  The delta files must
+# have been built with the build.pl Perl program.
+#
+# See also apply.README
+#
+# Author: Andreas Kahari, <andreas.kahari@ebi.ac.uk>
+#
+
+use strict;
+use warnings;
+
+use Cwd;
+
+use File::Basename;
+use File::Copy;
+
+use Getopt::Std;
+
+use aux qw(:default :apply);
+
+# Option defaults.  They are kept in their own hash so that the usage
+# text always shows the real defaults, even if getopts() has already
+# stored user-supplied values in %opts by the time it fails.
+my %defaults = (
+    'c' => 'xdelta',    # xdelta executable
+    's' => '.',         # where the delta directory is stored
+    'd' => '.',         # where the old and new database directories live
+);
+my %opts = %defaults;
+
+if (!getopts('c:s:d:', \%opts)) {
+    usage_apply(\%defaults);
+    exit 1;
+}
+
+my $xdelta_cmd = $opts{'c'};
+my $src_prefix = $opts{'s'};
+my $dst_prefix = $opts{'d'};
+
+# Exactly three non-option arguments are required.
+if (@ARGV != 3) {
+    usage_apply(\%defaults);
+    exit 1;
+}
+
+my ($db, $v1, $v2) = @ARGV;
+
+# As documented in the usage text: the old and the new database
+# directories live under the -d path, the delta directory under the
+# -s path.
+my $v1_dir    = sprintf "%s/%s_%s", $dst_prefix, $db, $v1;
+my $v2_dir    = sprintf "%s/%s_%s", $dst_prefix, $db, $v2;
+my $delta_dir = sprintf "%s/%s_%s_delta_%s", $src_prefix, $db, $v1, $v2;
+
+# "-d" does not reliably set $!, so the messages are spelled out.
+die "The old database directory '$v1_dir' does not exist\n"
+    if (! -d $v1_dir);
+die "The delta directory '$delta_dir' does not exist\n"
+    if (! -d $delta_dir);
+die "Whoa, the directory '$v2_dir' already exists\n"
+    if (-d $v2_dir);
+
+printf STDERR "Creating the directory '%s'\n", $v2_dir;
+mkdir($v2_dir) or die "Can not create the directory '$v2_dir': $!\n";
+
+# Compare the MD5 checksum of $file with $expected and report the
+# outcome for the file described by $label ("old", "delta" or "new").
+# An expected checksum of '(none)' means that no checksum was recorded
+# in the info file; the file is then not read at all.  Dies on a
+# mismatch.
+sub verify_checksum
+{
+    my ($label, $expected, $file) = @_;
+
+    if ($expected ne '(none)' && $expected ne make_checksum($file)) {
+        print "\tChecksum mismatch for $label file\n";
+        print "\tCan not continue\n";
+        die "Checksum mismatch for $label file '$file'\n";
+    }
+
+    print "\tChecksum ok for $label file\n";
+}
+
+# Every file of the new revision is described by one "*.info" file in
+# the delta directory.
+foreach my $info_file (glob($delta_dir . '/*.info')) {
+    my $base_name = basename($info_file);
+    $base_name =~ s/\.info$//;
+
+    my $v1_file    = sprintf "%s/%s", $v1_dir, $base_name;
+    my $v2_file    = sprintf "%s/%s", $v2_dir, $base_name;
+    my $delta_file = sprintf "%s/%s", $delta_dir, $base_name;
+
+    printf "Processing '%s'\n", $base_name;
+
+    # The info file holds four lines: the patch command, followed by
+    # "checksum<TAB>size" for the old, the new and the delta file.
+    open(my $info, '<', $info_file)
+        or die "Can not open '$info_file': $!\n";
+    my @info_lines = <$info>;
+    close $info;
+    chomp @info_lines;
+
+    die "The info file '$info_file' is incomplete\n"
+        if (@info_lines < 4);
+
+    my $patch_command = $info_lines[0];
+
+    # Only the checksums are needed here, the sizes are ignored.
+    my ($v1_sum)    = split /\s+/, $info_lines[1];
+    my ($v2_sum)    = split /\s+/, $info_lines[2];
+    my ($delta_sum) = split /\s+/, $info_lines[3];
+
+    die "The info file '$info_file' is malformed\n"
+        if (grep { !defined($_) } $v1_sum, $v2_sum, $delta_sum);
+
+    verify_checksum('old', $v1_sum, $v1_file);
+    verify_checksum('delta', $delta_sum, $delta_file);
+
+    if ($patch_command eq 'PATCH') {
+        print "\tPatching file\n";
+        my $status = system($xdelta_cmd, 'patch',
+            $delta_file, $v1_file, $v2_file);
+
+        die "Can not run '$xdelta_cmd': $!\n" if ($status == -1);
+
+        # A patch that went wrong is caught by the checksum of the new
+        # file below, so a non-zero exit status is only reported here.
+        warn sprintf("\t'%s patch' exited with status %d\n",
+            $xdelta_cmd, $status >> 8) if ($status != 0);
+    } elsif ($patch_command eq 'ADD') {
+        print "\tAdding new file\n";
+        copy($delta_file, $v2_file)
+            or die "Can not copy '$delta_file' to '$v2_file': $!\n";
+    } elsif ($patch_command eq 'COPY') {
+        print "\tCopying old file\n";
+        copy($v1_file, $v2_file)
+            or die "Can not copy '$v1_file' to '$v2_file': $!\n";
+    } elsif ($patch_command eq 'ZIP') {
+        print "\tDecompressing compressed file\n";
+        do_decompress($delta_file, $v2_file);
+    } else {
+        # No new file can be produced, so there is nothing to verify.
+        die "\tStrange patch command: $patch_command\n";
+    }
+
+    verify_checksum('new', $v2_sum, $v2_file);
+}
--- a/misc-scripts/binary_delta/aux.pm
+++ b/misc-scripts/binary_delta/aux.pm
+package aux;
+
+# $Id$
+#
+# aux.pm
+#
+# Auxiliary subroutines for the apply.pl and build.pl programs.
+#
+# Author: Andreas Kahari, <andreas.kahari@ebi.ac.uk>
+#
+
+use strict;
+use warnings;
+
+require Exporter;
+our @ISA = qw(Exporter);
+
+# Nothing is exported by default.  The callers ask for the export
+# tags below: ":default" for the subroutines shared by both programs,
+# ":apply" for those used by apply.pl and ":build" for those used by
+# build.pl.
+our @EXPORT_OK   = qw( make_human_readable make_checksum
+                       usage_apply do_decompress
+                       usage_build do_compress );
+
+our %EXPORT_TAGS = (
+    default  => [ qw( make_human_readable make_checksum ) ],
+    apply    => [ qw( usage_apply do_decompress ) ],
+    build    => [ qw( usage_build do_compress ) ]
+);
+
+use Digest::MD5;
+use Compress::Zlib;
+
+#================= :default
+
+# Compute the MD5 checksum of a file.
+#
+# Takes the path of the file to read.  Returns the checksum as a
+# hex string.  Dies if the file can not be opened.
+sub make_checksum
+{
+    my ($file_path) = @_;
+
+    # Three-argument open, so that a path with leading or trailing
+    # whitespace or with characters such as '>' or '|' is always
+    # treated as a plain file name.
+    open(my $fh, '<', $file_path)
+        or die "Can not open '$file_path': $!\n";
+    binmode $fh;
+
+    my $hex = Digest::MD5->new->addfile($fh)->hexdigest;
+    close $fh;
+
+    return $hex;
+}
+
+# Converts a byte count to a form more easily read by humans.
+#
+# Takes the number of bytes.  Returns a string consisting of a float
+# (two decimal places), a space, and a suffix ("b", "Kb", ... "Pb").
+# The value is divided by 1024 for as long as it is greater than
+# 10000, so up to five digits may precede the decimal point.
+sub make_human_readable
+{
+    my ($bytes) = @_;
+
+    my @prefix = qw(b Kb Mb Gb Tb Pb);
+    my $step = 0;
+
+    # Stop at the largest known suffix rather than indexing past the
+    # end of @prefix for absurdly large values.
+    while ($bytes > 10000 && $step < $#prefix) {
+        $bytes /= 1024;
+        ++$step;
+    }
+
+    return sprintf("%.2f %s", $bytes, $prefix[$step]);
+}
+
+#================= :apply
+
+# Write the command line synopsis of the apply.pl program to standard
+# error.  The argument is a reference to a hash whose 'c', 's' and 'd'
+# entries are shown as the defaults of the corresponding options.
+sub usage_apply
+{
+    my ($defaults) = @_;
+
+    my $text = <<"EOT";
+Usage:  $0 [options] [--] database old_v new_v
+
+database    The database to work on, e.g. "homo_sapiens_core".
+old_v       The older version, e.g. "11_31".
+new_v       The newer version, e.g. "12_31".
+
+The options may be any of these:
+
+-c cmd  Path to xdelta executable.
+        Default: "$defaults->{'c'}".
+-s path Path to the directory where the delta directory is stored.
+        Default: "$defaults->{'s'}"
+-d path Path to the directory holding the old version of the
+        database, and where the new version of the database
+        should be created.
+        Default: "$defaults->{'d'}"
+
+EOT
+
+    print STDERR $text;
+}
+
+# Decompress a file.
+#
+# Takes the path of the gzip-compressed input file and the path of
+# the output file, which is created or overwritten.  Dies if either
+# file can not be opened, or if reading or writing fails.
+sub do_decompress
+{
+    my ($zfile_path, $file_path) = @_;
+
+    # Open the input first so that a missing or unreadable input does
+    # not leave an empty output file behind.
+    my $gz = gzopen($zfile_path, "rb");
+
+    if (!defined($gz)) {
+        die "Can not open '$zfile_path': $gzerrno\n";
+    }
+
+    my $out;
+    if (!open($out, '>', $file_path)) {
+        my $error = $!;
+        $gz->gzclose();
+        die "Can not create '$file_path': $error\n";
+    }
+    binmode $out;
+
+    # gzread() returns the number of bytes read, 0 at the end of the
+    # file and -1 on error, so only positive counts carry data.
+    my $buffer;
+    my $bytesread;
+    while (($bytesread = $gz->gzread($buffer)) > 0) {
+        print {$out} substr($buffer, 0, $bytesread)
+            or die "Can not write to '$file_path': $!\n";
+    }
+
+    if ($bytesread < 0) {
+        my $error = "$gzerrno";
+        $gz->gzclose();
+        close $out;
+        die "Can not decompress '$zfile_path': $error\n";
+    }
+
+    $gz->gzclose();
+
+    # Buffered write errors only show up when the file is closed.
+    close($out) or die "Can not write to '$file_path': $!\n";
+}
+
+#================= :build
+
+# Write the command line synopsis of the build.pl program to standard
+# error.  The argument is a reference to a hash whose 'c', 's' and 'd'
+# entries are shown as the defaults of the corresponding options.
+sub usage_build
+{
+    my ($defaults) = @_;
+
+    my $text = <<"EOT";
+Usage:  $0 [options] [--] database old_v new_v
+
+database    The database to work on, e.g. "homo_sapiens_core".
+old_v       The older version, e.g. "11_31".
+new_v       The newer version, e.g. "12_31".
+
+The options may be any of these:
+
+-c cmd  Path to xdelta executable.
+        Default: "$defaults->{'c'}".
+-s path Path to the directory where the databases are stored.
+        Default: "$defaults->{'s'}"
+-d path Path to the directory within which the delta
+        directory should be created.
+        Default: "$defaults->{'d'}"
+
+EOT
+
+    print STDERR $text;
+}
+
+# Compress a file.
+#
+# Takes the path of the input file and the path of the gzip-compressed
+# output file, which is created or overwritten.  Dies if either file
+# can not be opened, or if reading or writing fails.
+sub do_compress
+{
+    my ($file_path, $zfile_path) = @_;
+
+    open(my $in, '<', $file_path)
+        or die "Can not open '$file_path': $!\n";
+    binmode $in;
+
+    my $gz = gzopen($zfile_path, "wb");
+
+    if (!defined($gz)) {
+        close $in;
+        die "Can not create '$zfile_path': $gzerrno\n";
+    }
+
+    # Read fixed-size chunks.  The input is binary data, so reading it
+    # "line by line" could pull an arbitrarily large part of the file
+    # into memory if it happens to contain few newline bytes.
+    my $buffer;
+    while (1) {
+        my $bytesread = read($in, $buffer, 65536);
+
+        die "Can not read from '$file_path': $!\n"
+            if (!defined($bytesread));
+        last if ($bytesread == 0);
+
+        # gzwrite() returns the number of bytes written, 0 on error.
+        $gz->gzwrite($buffer)
+            or die "Can not write to '$zfile_path': $gzerrno\n";
+    }
+
+    $gz->gzclose();
+    close $in;
+}
+
+1;
--- a/misc-scripts/binary_delta/build.README
+++ b/misc-scripts/binary_delta/build.README
+$Id$
+Andreas Kähäri, andreas.kahari@ebi.ac.uk
+
+    ======================================================================
+                                About "build.pl"
+    ======================================================================
+
+    The build.pl program is a Perl script that will run on any Unix system
+    with a Perl interpreter installed (along with some common non-standard
+    Perl modules).  It also makes use of the xdelta program (more on this
+    below).
+
+    The program  will compute binary  "delta files"  that may be  used for
+    upgrading from  one release of  an Ensembl  database to the  next one.
+    The deltas  are computed from  the raw  MySQL database files  and will
+    thus incorporate  any schema changes as  well as data changes.   It is
+    hoped that the process of downloading the delta files and applying them
+    to the older release of the database  on an external site will be much
+    quicker than downloading the complete new release.
+
+    The  build.pl program  creates a  new directory  that apart  from some
+    extra files *looks*  exactly the same as the MySQL  data directory for
+    the new release.  The files in it, however, ought to be a lot smaller.
+
+    The new release may then be acquired by applying the delta files to the
+    old release using the apply.pl program (discussed elsewhere).
+
+
+Requirements / Configuration
+
+    To work,  build.pl needs the  following components, which  are usually
+    not part of your every-day Unix base-system.
+
+    1. The xdelta program (version 1.1.3, not version 2),
+       http://sourceforge.net/projects/xdelta/
+
+    2. The following Perl modules, some available as standard modules,
+       others available from CPAN at http://www.cpan.org/
+
+       * Compress::Zlib
+       * Cwd
+       * Digest::MD5
+       * File::Basename
+       * File::Copy
+       * Getopt::Std
+
+    Check your  distribution CDs  before downloading and  installing these
+    prerequisites from the web.
+
+
+Usage
+
+    Running  the  build.pl program  without  any  arguments generates  the
+    following informational text (or something very similar to it):
+
+        Usage:  ./build.pl [options] [--] database old_v new_v
+
+        database    The database to work on, e.g. "homo_sapiens_core".
+        old_v       The older version, e.g. "11_31".
+        new_v       The newer version, e.g. "12_31".
+
+        The options may be any of these:
+
+        -c cmd  Path to xdelta executable.
+                Default: "xdelta".
+        -s path Path to the directory where the databases are stored.
+                Default: "."
+        -d path Path to the directory within which the delta
+                directory should be created.
+                Default: "."
+
+
+    To create  the delta  files containing all  changes between  the 11_31
+    release  and  the  12_31  release of  the  homo_sapiens_core  database
+    located in the current directory, do this:
+
+        ./build.pl homo_sapiens_core 11_31 12_31 | tee build.out
+
+    This   creates    a   third   directory   called,    in   this   case,
+    homo_sapiens_core_11_31_delta_12_31  (in the  current directory)  into
+    which the  generated delta files will  be put.
+
+    The "| tee build.out" bit ensures that the output that the program
+    produces (which includes statistics about how much space was saved
+    for each file etc.) is both displayed in the console and saved to
+    the specified file, "build.out" in this case.
+
+    To specify alternate locations for the databases, the generated files,
+    or  for  the xdelta  executable,  use  the  -s,  -d, and  -c  switches
+    respectively as described above.
+
+    For each file in the new release, the program will check whether the
+    file
+
+        * was not present in the old release,
+        * hasn't changed in the new release,
+        * is larger than 2 Gb (in which case xdelta can't be used), or
+        * needs patching.
+
+    This information is stored in the  *.info files in the delta directory
+    and is later used by apply.pl to create the new release.
+
+    A set of MD5  checksums of each file is also  calculated and stored in
+    the *.info files  in the delta directory.  For each  file, up to three
+    checksums are  calculated; that of  the old  release, that of  the new
+    release, and that of  the delta file.  This is done  to assert that no
+    errors were introduced in the  data transmission to the external site,
+    and so  that we're absolutely  sure we  are patching the  correct file
+    later on.  The  xdelta program also calculates and stores  its own MD5
+    checksum with the delta files it creates.  Better safe than sorry.
+
+
+
+vim: et
--- a/misc-scripts/binary_delta/build.pl
+++ b/misc-scripts/binary_delta/build.pl
+#!/usr/bin/perl -w
+
+# $Id$
+#
+# build.pl
+#
+# A program that creates binary delta files containing the
+# differences between two revisions of an ensembl database.  The
+# delta files must be applied with the apply.pl Perl program.
+#
+# See also build.README
+#
+# Author: Andreas Kahari, <andreas.kahari@ebi.ac.uk>
+#
+
+use strict;
+use warnings;
+
+use Cwd;
+
+use File::Basename;
+use File::Copy;
+
+use Getopt::Std;
+
+use aux qw(:default :build);
+
+# Option defaults.  They are kept in their own hash so that the usage
+# text always shows the real defaults, even if getopts() has already
+# stored user-supplied values in %opts by the time it fails.
+my %defaults = (
+    'c' => 'xdelta',    # xdelta executable
+    's' => '.',         # where the database directories are stored
+    'd' => '.',         # where the delta directory should be created
+);
+my %opts = %defaults;
+
+# Files of this size or larger are compressed rather than handed to
+# xdelta.
+my $too_big     = 2_147_483_648;    # 2 Gb
+#my $too_big    =       307_200;    # 300 Kb (for debugging gzipping)
+
+if (!getopts('c:s:d:', \%opts)) {
+    usage_build(\%defaults);
+    exit 1;
+}
+
+my $xdelta_cmd = $opts{'c'};
+my $src_prefix = $opts{'s'};
+my $dst_prefix = $opts{'d'};
+
+# Exactly three non-option arguments are required.
+if (@ARGV != 3) {
+    usage_build(\%defaults);
+    exit 1;
+}
+
+my ($db, $v1, $v2) = @ARGV;
+
+my $v1_dir    = sprintf "%s/%s_%s", $src_prefix, $db, $v1;
+my $v2_dir    = sprintf "%s/%s_%s", $src_prefix, $db, $v2;
+my $delta_dir = sprintf "%s/%s_%s_delta_%s", $dst_prefix, $db, $v1, $v2;
+
+# "-d" does not reliably set $!, so the messages are spelled out.
+die "The old database directory '$v1_dir' does not exist\n"
+    if (! -d $v1_dir);
+die "The new database directory '$v2_dir' does not exist\n"
+    if (! -d $v2_dir);
+
+if (! -d $delta_dir) {
+    printf STDERR "Creating delta directory '%s'\n", $delta_dir;
+    mkdir($delta_dir)
+        or die "Can not create the directory '$delta_dir': $!\n";
+}
+
+# Format one line of size statistics.  Takes the sizes (in bytes) of
+# the old file(s), the new file(s) and the delta file(s).
+sub size_summary
+{
+    my ($old_size, $new_size, $delta_size) = @_;
+
+    return sprintf "Old %s, New %s, Delta %s, Saved %s (%.2f%%)\n",
+        make_human_readable($old_size),
+        make_human_readable($new_size),
+        make_human_readable($delta_size),
+        make_human_readable($new_size - $delta_size),
+        ($new_size == 0 ? 0 : 100 * (1.0 - $delta_size / $new_size));
+}
+
+my $v1_all_size = 0;
+my $v2_all_size = 0;
+my $delta_all_size = 0;
+
+foreach my $v2_file (glob($v2_dir . '/*')) {
+    # Only plain files can be checksummed and patched.
+    next if (! -f $v2_file);
+
+    my $base_name  = basename($v2_file);
+    my $v1_file    = sprintf "%s/%s", $v1_dir, $base_name;
+    my $delta_file = sprintf "%s/%s", $delta_dir, $base_name;
+
+    printf "Processing '%s'\n", $base_name;
+
+    # '(none)' marks a checksum that does not apply to this file; it
+    # tells apply.pl to skip the corresponding check.
+    my $v1_sum = '(none)';
+    my $delta_sum = '(none)';
+
+    print "\tCalculating checksum of new file\n";
+    my $v2_sum = make_checksum($v2_file);
+
+    my $v1_size = 0;
+    my $v2_size = (stat $v2_file)[7];
+    my $delta_size = 0;
+
+    my $patch_command;
+
+    if (-f $v1_file) {
+        print "\tCalculating checksum of old file\n";
+        $v1_sum = make_checksum($v1_file);
+
+        $v1_size = (stat $v1_file)[7];
+
+        if ($v1_sum eq $v2_sum && $v1_size == $v2_size) {
+            # Nothing is stored in the delta directory but the info
+            # file; apply.pl copies the old file.
+            $patch_command = 'COPY';
+            print "\tThe files are identical\n";
+        } elsif ($v1_size >= $too_big || $v2_size >= $too_big) {
+            $patch_command = 'ZIP';
+            print "\tFiles are huge, compressing new file\n";
+            do_compress($v2_file, $delta_file);
+        } else {
+            $patch_command = 'PATCH';
+            print "\tCreating delta file\n";
+            my $status = system($xdelta_cmd, 'delta', '-9',
+                $v1_file, $v2_file, $delta_file);
+
+            die "Can not run '$xdelta_cmd': $!\n" if ($status == -1);
+
+            # NOTE(review): xdelta 1.x is understood to use diff-like
+            # exit codes for "delta" (0: no differences, 1: differences
+            # found, 2: error) -- confirm against the xdelta
+            # documentation.  Hence only a signal or an exit code above
+            # 1 is treated as a failure.
+            die sprintf("'%s delta' failed for '%s' (status %d)\n",
+                $xdelta_cmd, $base_name, $status)
+                if (($status & 127) || ($status >> 8) > 1);
+        }
+    } else {
+        $patch_command = 'ADD';     # use 'ZIP' here as well?
+        print "\tCopying file\n";
+        copy($v2_file, $delta_file)
+            or die "Can not copy '$v2_file' to '$delta_file': $!\n";
+    }
+
+    if ($patch_command ne 'COPY') {
+        print "\tCalculating checksum of delta file\n";
+        $delta_sum = make_checksum($delta_file);
+        $delta_size = (stat $delta_file)[7];
+    }
+
+    # The info file holds the patch command followed by
+    # "checksum<TAB>size" for the old, the new and the delta file.
+    print "\tWriting info file\n";
+    my $info_file = $delta_file . '.info';
+    open(my $info, '>', $info_file)
+        or die "Can not create '$info_file': $!\n";
+    printf {$info} "%s\n%s\t%d\n%s\t%d\n%s\t%d\n",
+        $patch_command,
+        $v1_sum, $v1_size,
+        $v2_sum, $v2_size,
+        $delta_sum, $delta_size;
+    close($info) or die "Can not write to '$info_file': $!\n";
+
+    $v1_all_size += $v1_size;
+    $v2_all_size += $v2_size;
+    $delta_all_size += $delta_size;
+
+    print "This file:\n",
+        size_summary($v1_size, $v2_size, $delta_size);
+    print "Overall:\n",
+        size_summary($v1_all_size, $v2_all_size, $delta_all_size),
+        "\n";
+}