--- nfo/perl/libs/Data/Transfer/Sync/Compare/Checksum.pm 2003/02/11 09:53:07 1.2 +++ nfo/perl/libs/Data/Transfer/Sync/Compare/Checksum.pm 2004/06/19 01:48:03 1.6 @@ -1,4 +1,4 @@ -## $Id: Checksum.pm,v 1.2 2003/02/11 09:53:07 joko Exp $ +## $Id: Checksum.pm,v 1.6 2004/06/19 01:48:03 joko Exp $ ## ## Copyright (c) 2002 Andreas Motl ## @@ -6,6 +6,20 @@ ## ## ---------------------------------------------------------------------------------------- ## $Log: Checksum.pm,v $ +## Revision 1.6 2004/06/19 01:48:03 joko +## introduced infrastructure for "local checksum"-mechanism +## modified _calcChecksum +## added sub _dumpCompact +## +## Revision 1.5 2004/05/11 19:43:02 joko +## don't use File::RsyncP::Digest on win32 +## +## Revision 1.4 2004/05/06 12:53:07 jonen +## + added use of File::RsycP::digest +## +## Revision 1.3 2003/05/13 08:19:00 joko +## switched to crc32 +## ## Revision 1.2 2003/02/11 09:53:07 joko ## + metadata-structure-change, fixed some code here ## @@ -15,6 +29,15 @@ ## ---------------------------------------------------------------------------------------- +=pod + +=head1 Todo + + o Data::Transfer::Sync::Compare::Slot using Compare::Struct + o Load checksum algorithm on demand, do some negotiation + +=cut + package Data::Transfer::Sync::Compare::Checksum; use strict; @@ -22,54 +45,91 @@ use mixin::with qw( Data::Transfer::Sync ); - -# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - main - +use shortcuts qw( RUNNING_IN_HELL ); use Data::Dumper; -use Digest::MD5 qw(md5 md5_hex md5_base64); +# TODO: Load these appropriatly at runtime. +use Digest::MD5 qw( md5 md5_hex md5_base64 ); +use String::CRC32; +use Data::Mungle::Transform::Deep qw( deep_copy ); + +# don't use File::RsyncP::Digest on win32 +# TODO: enhance here! (e.g. negotiate proper checksum-algorithm first, apply afterwards) +if (not RUNNING_IN_HELL()) { + eval "use File::RsyncP::Digest;"; +} # get logger instance my $logger = Log::Dispatch::Config->instance; +# Maybe refactor to shortcuts::checksum? +# calculate checksum for current object sub _calcChecksum { my $self = shift; my $descent = shift; - my $specifier = shift; + my $options = shift; - # calculate checksum for current object + my $checksum; + my $ident = $self->{node}->{$descent}->{ident}; # build dump of this node my $payload = $self->{node}->{$descent}->{payload}; + $payload = deep_copy($payload) if $options->{clone}; + + # new 2004-06-17: remove attributes if request before dumping/calculating + map { delete $payload->{$_} } @{$options->{exclude}} if $options->{exclude}; + + # various methods to dump this thing #my $dump = $ident . "\n" . $item->quickdump(); #my $dump = $ident . "\n" . Dumper($item); - my $dump = $ident . "\n" . $self->_dumpCompact($payload); + my $dump = $self->_dumpCompact($payload); + $dump = $ident . $dump if $self->{options}->{ident}; # TODO: $logger->dump( ... ); #$logger->debug( __PACKAGE__ . ": " . $dump ); #$logger->dump( __PACKAGE__ . ": " . $dump ); # calculate checksum from dump - # note: the 32-bit integer hash from DBI seems - # to generate duplicates with small payloads already in ranges of hundreds of items/rows!!! - # try to avoid to use it or try to use it only for payloads greater than, hmmm, let's say 30 chars? - # (we had about 15 chars average per item (row)) - - # md5-based fingerprint, base64 encoded (from Digest::MD5) - $self->{node}->{$descent}->{checksum} = md5_base64($dump) . '=='; - # 32-bit integer "hash" value (maybe faster?) (from DBI) - #$self->{node}->{$descent}->{checksum} = DBI::hash($dump, 1); + + # 1. md5-based fingerprint, base64 encoded (from Digest::MD5) + #$checksum = md5_base64($dump) . '=='; + + # 2. 32-bit integer "hash" value (maybe faster?) (from DBI) + # Note: The 32-bit integer hash from DBI seems to generate duplicates + # with small payloads already in ranges of hundreds of items/rows!!! + # Try to avoid it or try to use it only for payloads greater than, hmmm, let's say 30 chars? + # (we had about 15 chars average per item (row)) + # Possible (generic) solution: Just generate checksum, if length(checksum(payload)) < length(payload) + #$checksum = DBI::hash($dump, 1); + + # 3. good old crc32??? + #$checksum = crc32($dump); + + # 4. File::RsyncP::Digest - Perl interface to rsync message digest algorithms + if (not RUNNING_IN_HELL()) { + my $rsDigest = new File::RsyncP::Digest; + $rsDigest->add($dump); + my $digest = $rsDigest->digest(); + $checksum = unpack("H*", $digest); + + } else { # fallback to Digest::MD5 on win32 + #print $dump, " ==> "; + $checksum = md5_base64($dump) . '=='; + #print $checksum, "\n"; + } + + # 5. some more modern Digest::SHA1 or similar? # signal good - return 1; + return $checksum; } -sub _readChecksum { +sub readChecksum { my $self = shift; my $descent = shift; @@ -88,7 +148,7 @@ # get checksum for current entry # TODO: don't have the checksum column/property hardcoded as "cs" here, make this configurable somehow if ($self->{meta}->{$descent}->{isChecksumAuthority}) { - $self->_calcChecksum($descent); + $self->{node}->{$descent}->{checksum} = $self->_calcChecksum($descent, { ident => 1 }); } else { $self->{node}->{$descent}->{checksum} = $self->{node}->{$descent}->{payload}->{cs}; } @@ -98,5 +158,60 @@ } +sub _readLocalChecksum { + my $self = shift; + my $descent = shift; + $self->{node}->{$descent}->{checksum_local_storage} = $self->{node}->{$descent}->{payload}->{cs_local}; + $self->{node}->{$descent}->{checksum_local_storage} ||= ""; +} + +sub _calcLocalChecksum { + my $self = shift; + my $descent = shift; + # TODO: don't hardcode this (oid, cs, cs_local) + $self->{node}->{$descent}->{checksum_local_calculated} = + $self->_calcChecksum($descent, { clone => 1, exclude => [qw( oid cs cs_local )] }); + $self->{node}->{$descent}->{checksum_local_calculated} ||= ""; +} + +sub handleLocalChecksum { + my $self = shift; + my $descent = shift; + $self->_readLocalChecksum($descent); + $self->_calcLocalChecksum($descent); +} + +sub _dumpCompact { + my $self = shift; + + my @data = (); + + my $count = 0; + + # walk through all arguments + foreach (@_) { + my $item = {}; + # assume item is a hashref and walk through all entries + foreach my $key (keys %$_) { + my $val = $_->{$key}; + # dive into tied/hidden types and resolve deep structure inside + if (ref $val eq 'Set::Object') { + $item->{$key} = $val->members(); + } else { + $item->{$key} = $val; + } + } + push @data, $item; + $count++; + } + + my $indent_save = $Data::Dumper::Indent; + $Data::Dumper::Indent = 0; + my $result = Dumper(@data); + $Data::Dumper::Indent = $indent_save; + return $result; + +} 1; +__END__