From 4e7b9ad19ae4101d301fb9ff1ec44248b64aab24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kusalananda=20K=C3=A4h=C3=A4ri?= <ak4@sanger.ac.uk> Date: Fri, 11 Jul 2003 12:41:18 +0000 Subject: [PATCH] build.pl: Program for building binary delta between two database revisions. apply.pl: Program for applying above delta. ... plus README files. --- misc-scripts/binary_delta/apply.README | 91 +++++++++++++ misc-scripts/binary_delta/apply.pl | 134 +++++++++++++++++++ misc-scripts/binary_delta/aux.pm | 171 +++++++++++++++++++++++++ misc-scripts/binary_delta/build.README | 114 +++++++++++++++++ misc-scripts/binary_delta/build.pl | 143 +++++++++++++++++++++ 5 files changed, 653 insertions(+) create mode 100644 misc-scripts/binary_delta/apply.README create mode 100755 misc-scripts/binary_delta/apply.pl create mode 100644 misc-scripts/binary_delta/aux.pm create mode 100644 misc-scripts/binary_delta/build.README create mode 100755 misc-scripts/binary_delta/build.pl diff --git a/misc-scripts/binary_delta/apply.README b/misc-scripts/binary_delta/apply.README new file mode 100644 index 0000000000..2afc0bc3f9 --- /dev/null +++ b/misc-scripts/binary_delta/apply.README @@ -0,0 +1,91 @@ +$Id$ +Andreas Kähäri, andreas.kahari@ebi.ac.uk + + ====================================================================== + About "apply.pl" + ====================================================================== + + The apply.pl program is a Perl script that will run on any Unix system + with a Perl interpreter installed (along with some common non-standard + Perl modules). It also makes use of the xdelta program (more on this + below). + + The program will patch an older release of an Ensembl database into a + newer release by applying binary "delta files" created by the build.pl + program (discussed elsewhere). The delta files are applied to the raw + MySQL database files and will thus incorporate any schema changes as + well as data changes. 
It is hoped that the process of downloading the + delta files and applying them to the older release of the database on + an external site will be much quicker than downloading the complete + new release. + + Given a directory of delta files created by build.pl and a directory + containing the correct and untouched old revision of a database, + apply.pl will create a new directory and populate it with the new + revision of the database. + + +Requirements / Configuration + + To work, apply.pl needs the following components, which are usually + not part of your every-day Unix base-system. + + 1. The xdelta program (version 1.1.3, not version 2), + http://sourceforge.net/projects/xdelta/ + + 2. The following Perl modules, some available as standard modules, + others available from CPAN at http://www.cpan.org/ + + * Compress::Zlib + * Cwd + * Digest::MD5 + * File::Basename + * File::Copy + * Getopt::Std + + Check your distribution CDs before downloading and installing these + prerequisites from the web. + + +Usage + + Running the apply.pl program without any arguments generates the + following informational text (or something very similar to it): + + Usage: ./apply.pl [options] [--] database old_v new_v + + database The database to work on, e.g. "homo_sapiens_core". + old_v The older version, e.g. "11_31". + new_v The newer version, e.g. "12_31". + + The options may be any of these: + + -c cmd Path to xdelta executable. + Default: "xdelta". + -s path Path to the directory where the delta directory is stored. + Default: "." + -d path Path to the directory holding the old version of the + database, and where the new version of the database + should be created. + Default: "." + + Assuming the current directory holds a sub-directory containing the + 11_31 release of e.g. 
the homo_sapiens_core Ensembl database, and + another sub-directory containing the delta files, the 12_31 release + may be created by doing this: + + ./apply.pl homo_sapiens_core 11_31 12_31 | tee apply.out + + Note that the non-optional arguments are exactly the same as + those used with build.pl to create the delta files. The delta + files for this example are assumed to be available in the + homo_sapiens_core_11_31_delta_12_31 directory. + + The apply.pl program will verify the MD5 checksums of all files + involved in the patch, including the delta files. The patching will + fail if any checksum fails. This means that databases modified on the + external site can not be updated in this way. + + + +vim: et diff --git a/misc-scripts/binary_delta/apply.pl b/misc-scripts/binary_delta/apply.pl new file mode 100755 index 0000000000..debe4c5770 --- /dev/null +++ b/misc-scripts/binary_delta/apply.pl @@ -0,0 +1,134 @@ +#!/usr/bin/perl -w + +# $Id$ +# +# apply.pl +# +# A program that uses a previously created set of binary delta +# files to produce a new revision of an ensembl database out of +# an older revision of the same database. The delta files must +# have been built with the build.pl Perl program. 
+#
+# See also apply.README
+#
+# Author: Andreas Kahari, <andreas.kahari@ebi.ac.uk>
+#
+
+use strict;
+use warnings;
+
+use Cwd;
+
+use File::Basename;
+use File::Copy;
+
+use Getopt::Std;
+
+use aux qw(:default :apply);
+
+my %opts;
+my $xdelta_cmd = $opts{'c'} = 'xdelta';
+my $src_prefix = $opts{'s'} = '.';
+my $dst_prefix = $opts{'d'} = '.';
+
+if (!getopts('c:s:d:', \%opts)) {
+    usage_apply(\%opts);
+    die;
+}
+
+$xdelta_cmd = $opts{'c'};
+$src_prefix = $opts{'s'};
+$dst_prefix = $opts{'d'};
+
+if ($#ARGV != 2) {
+    usage_apply(\%opts);
+    die;
+}
+
+my $db = $ARGV[0];
+my $v1 = $ARGV[1]; my $v1_dir = sprintf "%s/%s_%s", $dst_prefix, $db, $v1;
+my $v2 = $ARGV[2]; my $v2_dir = sprintf "%s/%s_%s", $dst_prefix, $db, $v2;
+# As documented by usage_apply(): -d holds the databases, -s the deltas.
+my $delta_dir = sprintf "%s/%s_%s_delta_%s", $src_prefix, $db, $v1, $v2;
+
+die "No such directory '$v1_dir'\n" if (! -d $v1_dir);
+die "No such directory '$delta_dir'\n" if (! -d $delta_dir);
+
+if (-d $v2_dir) {
+    printf STDERR "Whoa, the directory '%s' already exists\n", $v2_dir;
+    die "Can not continue\n";
+}
+
+printf STDERR "Creating the directory '%s'\n", $v2_dir;
+mkdir($v2_dir) or die "Can not create '$v2_dir': $!\n";
+
+my $v1_all_size = 0;
+my $v2_all_size = 0;
+my $delta_all_size = 0;
+# Each .info file: patch command, then "checksum size" for old, new, delta.
+foreach my $info_file (glob($delta_dir . '/*.info')) {
+    my $base_name = basename($info_file);
+
+    $base_name =~ s/\.info$//;
+
+    my $v1_file = sprintf "%s/%s", $v1_dir, $base_name;
+    my $v2_file = sprintf "%s/%s", $v2_dir, $base_name;
+    my $delta_file = sprintf "%s/%s", $delta_dir, $base_name;
+
+    printf "Processing '%s'\n", $base_name;
+
+    open(my $info, '<', $info_file) or die "Can not open '$info_file': $!\n";
+
+    my $patch_command = <$info>; chomp $patch_command;
+
+    my $v1_line = <$info>; chomp $v1_line;
+    my ($v1_sum, $v1_size) = split /\s+/, $v1_line;
+
+    my $v2_line = <$info>; chomp $v2_line;
+    my ($v2_sum, $v2_size) = split /\s+/, $v2_line;
+
+    my $delta_line = <$info>; chomp $delta_line;
+    my ($delta_sum, $delta_size) = split /\s+/, $delta_line;
+
+    close $info;
+    die "\tTruncated info file '$info_file'\n" if (!defined($delta_line));
+    if ($v1_sum ne '(none)' && $v1_sum ne make_checksum($v1_file)) {
+        print "\tChecksum mismatch for old file\n";
+        print "\tCan not continue\n";
+        die;
+    } else {
+        print "\tChecksum ok for old file\n";
+    }
+
+    if ($delta_sum ne '(none)' && $delta_sum ne make_checksum($delta_file)) {
+        print "\tChecksum mismatch for delta file\n";
+        print "\tCan not continue\n";
+        die;
+    } else {
+        print "\tChecksum ok for delta file\n";
+    }
+
+    if ($patch_command eq 'PATCH') {
+        print "\tPatching file\n";
+        system($xdelta_cmd, 'patch', $delta_file, $v1_file, $v2_file);
+    } elsif ($patch_command eq 'ADD') {
+        print "\tAdding new file\n";
+        copy($delta_file, $v2_file) or die "\tCopy failed: $!\n";
+    } elsif ($patch_command eq 'COPY') {
+        print "\tCopying old file\n";
+        copy($v1_file, $v2_file) or die "\tCopy failed: $!\n";
+    } elsif ($patch_command eq 'ZIP') {
+        print "\tDecompressing compressed file\n";
+        do_decompress($delta_file, $v2_file);
+    } else {
+        die "\tStrange patch command: $patch_command\n";
+    }
+    # A failed or incomplete patch step is caught by this final checksum.
+    if ($v2_sum ne '(none)' && $v2_sum ne make_checksum($v2_file)) {
+        print "\tChecksum mismatch for new file\n";
+        print "\tCan not continue\n";
+        die;
+    } else {
+        print "\tChecksum ok for new file\n";
+    }
+}
diff --git a/misc-scripts/binary_delta/aux.pm b/misc-scripts/binary_delta/aux.pm
new file mode 100644
index 0000000000..023a157730
--- 
/dev/null
+++ b/misc-scripts/binary_delta/aux.pm
@@ -0,0 +1,171 @@
+package aux;
+
+# $Id$
+#
+# aux.pm
+#
+# Auxiliary subroutines for the apply.pl and build.pl programs.
+#
+# Author: Andreas Kahari, <andreas.kahari@ebi.ac.uk>
+#
+use strict;
+require Exporter;
+our @ISA = qw(Exporter);
+
+our @EXPORT_OK = qw( make_human_readable make_checksum
+                     usage_apply do_decompress
+                     usage_build do_compress );
+
+our %EXPORT_TAGS = (
+    default => [ qw( make_human_readable make_checksum ) ],
+    apply   => [ qw( usage_apply do_decompress ) ],
+    build   => [ qw( usage_build do_compress ) ]
+);
+
+use Digest::MD5;
+use Compress::Zlib;
+
+#================= :default
+
+# Compute the MD5 checksum of a file.  Returns the checksum as a
+# hex string; dies if the file can not be opened.
+sub make_checksum
+{
+    my $file_path = shift;
+
+    my $digest = Digest::MD5->new;
+
+    open(my $fh, '<', $file_path) or die "Can not open '$file_path': $!\n";
+    binmode $fh;
+
+    $digest->addfile($fh);
+
+    my $hex = $digest->hexdigest;
+    close $fh;
+
+    return $hex;
+}
+
+# Converts a byte count to a form more easily read by humans.
+# Returns a string consisting of a float (two decimal places),
+# a space, and a suffix.  Negative counts are scaled by magnitude.
+sub make_human_readable
+{
+    my $bytes = shift;
+
+    my @prefix = qw(b Kb Mb Gb Tb Pb);
+    my $step = 0;
+    # Stop at the last suffix so that $prefix[$step] is always defined.
+    while (abs($bytes) > 10000 && $step < $#prefix) {
+        $bytes /= 1024;
+        ++$step;
+    }
+
+    return sprintf("%.2f %s", $bytes, $prefix[$step]);
+}
+
+#================= :apply
+
+# Display usage information for the apply.pl program.
+sub usage_apply
+{
+    my $opts = shift;
+
+    print STDERR <<EOT;
+Usage: $0 [options] [--] database old_v new_v
+
+database    The database to work on, e.g. "homo_sapiens_core".
+old_v       The older version, e.g. "11_31".
+new_v       The newer version, e.g. "12_31".
+
+The options may be any of these:
+
+-c cmd      Path to xdelta executable.
+            Default: "$opts->{'c'}".
+-s path     Path to the directory where the delta directory is stored.
+            Default: "$opts->{'s'}"
+-d path     Path to the directory holding the old version of the
+            database, and where the new version of the database
+            should be created.
+            Default: "$opts->{'d'}"
+
+EOT
+}
+
+# Decompress the gzip file $zfile_path into $file_path; dies on any error.
+sub do_decompress
+{
+    my $zfile_path = shift;
+    my $file_path = shift;
+
+    open(my $out, '>', $file_path) or die "Can not create '$file_path': $!\n";
+    binmode $out;
+
+    my $gz = gzopen($zfile_path, "r");
+
+    if (!defined($gz)) {
+        close $out;
+        die $gzerrno;
+    }
+
+    my ($buffer, $bytesread);
+    while (($bytesread = $gz->gzread($buffer)) > 0) {
+        print {$out} substr($buffer, 0, $bytesread) or die $!;
+    }
+    die $gzerrno if ($bytesread < 0);   # gzread() returns -1 on error
+    $gz->gzclose();
+    close $out or die $!;
+}
+
+#================= :build
+
+# Display usage information for the build.pl program.
+sub usage_build
+{
+    my $opts = shift;
+
+    print STDERR <<EOT;
+Usage: $0 [options] [--] database old_v new_v
+
+database    The database to work on, e.g. "homo_sapiens_core".
+old_v       The older version, e.g. "11_31".
+new_v       The newer version, e.g. "12_31".
+
+The options may be any of these:
+
+-c cmd      Path to xdelta executable.
+            Default: "$opts->{'c'}".
+-s path     Path to the directory where the databases are stored.
+            Default: "$opts->{'s'}"
+-d path     Path to the directory within which the delta
+            directory should be created.
+            Default: "$opts->{'d'}"
+
+EOT
+}
+
+# Compress a file.
+sub do_compress
+{
+    my $file_path = shift;
+    my $zfile_path = shift;
+
+    open(my $in, '<', $file_path) or die "Can not open '$file_path': $!\n";
+    binmode $in;
+
+    my $gz = gzopen($zfile_path, "w");
+
+    if (!defined($gz)) {
+        close $in;
+        die $gzerrno;
+    }
+    my ($input, $count);    # fixed-size blocks: input is binary, maybe huge
+    while ($count = read($in, $input, 1 << 20)) {
+        $gz->gzwrite($input) or die $gzerrno;
+    }
+    defined($count) or die "Can not read '$file_path': $!\n";
+    $gz->gzclose() == 0 or die $gzerrno;
+    close $in;
+}
+
+1;
diff --git a/misc-scripts/binary_delta/build.README b/misc-scripts/binary_delta/build.README
new file mode 100644
index 0000000000..840f3b3e29
--- /dev/null
+++ b/misc-scripts/binary_delta/build.README
@@ -0,0 +1,114 @@
+$Id$
+Andreas Kähäri, andreas.kahari@ebi.ac.uk
+
+ ======================================================================
+ About "build.pl"
+ ======================================================================
+
+ The build.pl program is a Perl script that will run on any Unix system
+ with a Perl interpreter installed (along with some common non-standard
+ Perl modules). It also makes use of the xdelta program (more on this
+ below).
+
+ The program will compute binary "delta files" that may be used for
+ upgrading from one release of an Ensembl database to the next one.
+ The deltas are computed from the raw MySQL database files and will
+ thus incorporate any schema changes as well as data changes. It is
+ hoped that the process of downloading the delta files and applying them
+ to the older release of the database on an external site will be much
+ quicker than downloading the complete new release.
+
+ The build.pl program creates a new directory that apart from some
+ extra files *looks* exactly the same as the MySQL data directory for
+ the new release. The files in it, however, ought to be a lot smaller.
+
+ The new release may then be acquired by applying the delta files on the
+ old release using the apply.pl program (discussed elsewhere).
+
+
+Requirements / Configuration
+
+ To work, build.pl needs the following components, which are usually
+ not part of your every-day Unix base-system.
+
+ 1. 
The xdelta program (version 1.1.3, not version 2), + http://sourceforge.net/projects/xdelta/ + + 2. The following Perl modules, some available as standard modules, + others available from CPAN at http://www.cpan.org/ + + * Compress::Zlib + * Cwd + * Digest::MD5 + * File::Basename + * File::Copy + * Getopt::Std + + Check your distribution CDs before downloading and installing these + prerequisites from the web. + + +Usage + + Running the build.pl program without any arguments generates the + following informational text (or something very similar to it): + + Usage: ./build.pl [options] [--] database old_v new_v + + database The database to work on, e.g. "homo_sapiens_core". + old_v The older version, e.g. "11_31". + new_v The newer version, e.g. "12_31". + + The options may be any of these: + + -c cmd Path to xdelta executable. + Default: "xdelta". + -s path Path to the directory where the databases are stored. + Default: "." + -d path Path to the directory within which the delta + directory should be created. + Default: "." + + + To create the delta files containing all changes between the 11_31 + release and the 12_31 release of the homo_sapiens_core database + located in the current directory, do this: + + ./build.pl homo_sapiens_core 11_31 12_31 | tee build.out + + This creates a third directory called, in this case, + homo_sapiens_core_11_31_delta_12_31 (in the current directory) into + which the generated delta files will be put. + + The "| tee build.out" bit ensures that the output that the program + produces (which includes statistics about how much space was + saved for each file etc.) is both displayed in the console and saved + to the specified file, "build.out" in this case. + + To specify alternate locations for the databases, the generated files, + or for the xdelta executable, use the -s, -d, and -c switches + respectively as described above. 
+ + For each file in the new release, the program will check whether the + file + + * was not present in the old release, + * hasn't changed in the new release, + * is larger than 2 Gb (in which case xdelta can't be used), or + * needs patching. + + This information is stored in the *.info files in the delta directory + and is later used by apply.pl to create the new release. + + A set of MD5 checksums of each file is also calculated and stored in + the *.info files in the delta directory. For each file, up to three + checksums are calculated; that of the old release, that of the new + release, and that of the delta file. This is done to assert that no + errors were introduced in the data transmission to the external site, + and so that we're absolutely sure we are patching the correct file + later on. The xdelta program also calculates and stores its own MD5 + checksum with the delta files it creates. Better safe than sorry. + + + +vim: et diff --git a/misc-scripts/binary_delta/build.pl b/misc-scripts/binary_delta/build.pl new file mode 100755 index 0000000000..afa2ef91d5 --- /dev/null +++ b/misc-scripts/binary_delta/build.pl @@ -0,0 +1,143 @@ +#!/usr/bin/perl -w + +# $Id$ +# +# build.pl +# +# A program that creates binary delta files containing the +# differences between two revisions of an ensembl database. The +# delta files must be applied with the apply.pl Perl program. 
+#
+# See also build.README
+#
+# Author: Andreas Kahari, <andreas.kahari@ebi.ac.uk>
+#
+
+use strict;
+use warnings;
+
+use Cwd;
+
+use File::Basename;
+use File::Copy;
+
+use Getopt::Std;
+
+use aux qw(:default :build);
+
+my %opts;
+my $xdelta_cmd = $opts{'c'} = 'xdelta';
+my $src_prefix = $opts{'s'} = '.';
+my $dst_prefix = $opts{'d'} = '.';
+
+my $too_big = 2_147_483_648;    # 2 Gb
+#my $too_big = 307_200;         # 300 Kb (for debugging gzipping)
+
+if (!getopts('c:s:d:', \%opts)) {
+    usage_build(\%opts);
+    die;
+}
+
+$xdelta_cmd = $opts{'c'};
+$src_prefix = $opts{'s'};
+$dst_prefix = $opts{'d'};
+
+if ($#ARGV != 2) {
+    usage_build(\%opts);
+    die;
+}
+
+my $db = $ARGV[0];
+my $v1 = $ARGV[1]; my $v1_dir = sprintf "%s/%s_%s", $src_prefix, $db, $v1;
+my $v2 = $ARGV[2]; my $v2_dir = sprintf "%s/%s_%s", $src_prefix, $db, $v2;
+
+my $delta_dir = sprintf "%s/%s_%s_delta_%s", $dst_prefix, $db, $v1, $v2;
+
+die "No such directory '$v1_dir'\n" if (! -d $v1_dir);
+die "No such directory '$v2_dir'\n" if (! -d $v2_dir);
+
+if (! -d $delta_dir) {
+    printf STDERR "Creating delta directory '%s'\n", $delta_dir;
+    mkdir($delta_dir) or die "Can not create '$delta_dir': $!\n";
+}
+
+my $v1_all_size = 0;
+my $v2_all_size = 0;
+my $delta_all_size = 0;
+# Classify each file of the new release as COPY, ZIP, PATCH or ADD.
+foreach my $v2_file (glob($v2_dir . '/*')) {
+    my $base_name = basename($v2_file);
+    my $v1_file = sprintf "%s/%s", $v1_dir, $base_name;
+    my $delta_file = sprintf "%s/%s", $delta_dir, $base_name;
+
+    printf "Processing '%s'\n", $base_name;
+    my $v1_sum = '(none)';
+    my $v2_sum;
+    my $delta_sum = '(none)';
+
+    print "\tCalculating checksum of new file\n";
+    $v2_sum = make_checksum($v2_file);
+
+    my $v1_size = 0;
+    my $v2_size = (stat $v2_file)[7];
+    my $delta_size = 0;
+
+    my $patch_command;
+
+    if (-f $v1_file) {
+        print "\tCalculating checksum of old file\n";
+        $v1_sum = make_checksum($v1_file);
+
+        $v1_size = (stat $v1_file)[7];
+
+        if ($v1_sum eq $v2_sum && $v1_size == $v2_size) {
+            $patch_command = 'COPY';
+            print "\tThe files are identical\n";
+        } elsif ($v1_size >= $too_big || $v2_size >= $too_big) {
+            $patch_command = 'ZIP';
+            print "\tFiles are huge, compressing new file\n";
+            do_compress($v2_file, $delta_file);
+        } else {
+            $patch_command = 'PATCH';
+            print "\tCreating delta file\n";
+            system($xdelta_cmd, 'delta', '-9', $v1_file, $v2_file, $delta_file);
+            die "\txdelta failed\n" if ($? == -1 || ($? & 127) || ($? >> 8) > 1);
+        }
+    } else {
+        $patch_command = 'ADD';   # use 'ZIP' here as well?
+        print "\tCopying file\n";
+        copy($v2_file, $delta_file) or die "\tCopy failed: $!\n";
+    }
+
+    if ($patch_command ne 'COPY') {
+        print "\tCalculating checksum of delta file\n";
+        $delta_sum = make_checksum($delta_file);
+        $delta_size = (stat $delta_file)[7];
+    }
+
+    print "\tWriting info file\n";
+    open(my $info, '>', $delta_file . '.info') or die "Can not write info: $!\n";
+    printf $info "%s\n%s\t%d\n%s\t%d\n%s\t%d\n",
+        $patch_command,
+        $v1_sum, $v1_size,
+        $v2_sum, $v2_size,
+        $delta_sum, $delta_size;
+    close $info or die "Can not write info: $!\n";
+
+    $v1_all_size += $v1_size;
+    $v2_all_size += $v2_size;
+    $delta_all_size += $delta_size;
+
+    printf "This file:\nOld %s, New %s, Delta %s, Saved %s (%.2f%%)\n",
+        make_human_readable($v1_size),
+        make_human_readable($v2_size),
+        make_human_readable($delta_size),
+        make_human_readable($v2_size - $delta_size),
+        ($v2_size == 0 ? 0 : 100 * (1.0 - $delta_size / $v2_size));
+    printf "Overall:\nOld %s, New %s, Delta %s, Saved %s (%.2f%%)\n\n",
+        make_human_readable($v1_all_size),
+        make_human_readable($v2_all_size),
+        make_human_readable($delta_all_size),
+        make_human_readable($v2_all_size - $delta_all_size),
+        ($v2_all_size == 0 ? 0 : 100 * (1.0 - $delta_all_size / $v2_all_size));
+}
-- 
GitLab