Archive-BagIt

 view release on metacpan or  search on metacpan

lib/Archive/BagIt.pm  view on Meta::CPAN

            my ($key, $value) = %entry;
            if (__case_aware_compare_for_baginfo($key, $searchkey, $lc_flag)) {
                return $idx;
            }
        }
    }
    return;
}

###############################################

sub _collect_errors {
    my ($self, $res) = @_;
    push @{$self->{errors}}, $res;
    return;
}

###############################################

sub _if_error_push {
    my ($self, $res) = @_;
    if ($res ne "") {
        return $self->_collect_errors($res);
    }
    return 1;
}

###############################################

sub _check_baginfo_keys_generically {
    my ($self, $info) = @_;
    my %keys;
    my $ret=1;
    foreach my $entry (@{$info}) {
        my ($key, $value) = %{$entry};
        my $res = _check_key($key); # check key
        $ret &&= $self->_if_error_push($res);
        $res = _check_value($value); # check value
        $ret &&= $self->_if_error_push($res);
        # code to prepare check of uniqueness
        if ($self->is_baginfo_key_reserved($key)) {
            $keys{ lc $key }++;
        } else {
            $keys{ $key }++
        }
    }
    # check for uniqueness
    foreach my $key (keys %keys) {
        if (
            ($self->is_baginfo_key_reserved_as_uniq($key))
                and ($keys{$key} > 1)
        ) {
            $ret &&=$self->_collect_errors("Baginfo key '$key' exists $keys{$key}, but should be uniq!");
        }
    }
    return $ret;
}

###############################################

sub _verify_baginfo {
    my ($self, $info) = @_;
    my $ret = 1;

    if (!defined $info) {
        if (exists $self->{bag_info_file}) {
            $ret &&= $self->_collect_errors("'bag-info.txt' exists, but is not (partially) parseable!");
        }
    } else {
        $ret &&= $self->_check_baginfo_keys_generically($info);
        # check for payload oxum
        my ($loaded_payloadoxum) = $self->get_baginfo_values_by_key('Payload-Oxum');
        if (!defined $loaded_payloadoxum) {
            push @{$self->{warnings}}, "Payload-Oxum was expected in bag-info.txt, but not found!"; # payload-oxum is recommended, but optional
        } else {
            my ($octets, $streamcount) = $self->calc_payload_oxum();
            if ("$octets.$streamcount" ne $loaded_payloadoxum) {
                $ret &&= $self->_collect_errors( "Payload-Oxum differs, calculated $octets.$streamcount but $loaded_payloadoxum was expected by bag-info.txt");
            }
        }
    }
    return $ret;
}

###############################################

sub verify_baginfo {
    my ($self) = @_;
    my $info = $self->bag_info();
    if (List::Util::any { /the baginfo file .* could not be parsed correctly/ } @{$self->{'errors'}}) {
        return;
    }
    return $self->_verify_baginfo($info);
}

###############################################


sub delete_baginfo_by_key {
    my ($self, $searchkey) = @_;
    my $idx = $self->_find_baginfo_idx($searchkey);
    if (defined $idx) {
        splice @{$self->{bag_info}}, $idx, 1; # delete nth indexed entry
    }
    return 1;
}

###############################################


sub exists_baginfo_key {
    my ($self, $searchkey) =@_;
    return (defined  $self->_find_baginfo_idx($searchkey));
}

###############################################

sub _replace_baginfo_by_first_match {
    my ($self, $searchkey, $newvalue) = @_;
    my $idx = $self->_find_baginfo_idx( $searchkey);
    if (defined $idx) {
        $self->{bag_info}[$idx] = {$searchkey => $newvalue};
        return $idx;
    }
    return;
}

###############################################

sub _check_key {
    my ($key) = @_;
    if (!defined $key) {
        return "key should match '[^\\r\\n:]+', but is not defined";
    }
    if ($key =~ m/[\r\n]/s) {
        return "key should match '[^\\r\\n:]+', but contains newlines (key='$key')";
    }
    if ($key =~ m/:/) {
        return "key should not contain a colon! (key='$key')";
    }
    if ($key =~ m/^$/) {
        return "key should have a length > null (key='')";
    }
    return "";
}

###############################################

sub _check_key_or_croak {
    my ($key) = @_;
    my $res = _check_key($key);
    if ($res eq "") { return 1;}
    croak $res;

lib/Archive/BagIt.pm  view on Meta::CPAN

sub _build_forced_fixity_algorithm {
    my ($self) = @_;
    if ($self->use_plugins()) {
        return;
    } else {
        if ($self->bag_version() >= 1.0) {
            return Archive::BagIt::Plugin::Algorithm::SHA512->new(bagit => $self);
        }
        else {
            return Archive::BagIt::Plugin::Algorithm::MD5->new(bagit => $self);
        }
    }
}

###############################################


sub load_plugins {
    my ($self, @plugins) = @_;
    my $loaded_plugins = $self->plugins;
    if (defined $loaded_plugins) {
        @plugins = grep {not exists $loaded_plugins->{$_}} @plugins;
    }
    return if @plugins == 0;
    if (exists $ENV{TEST_ACTIVE}) {
        use Cwd;
        my $dir = getcwd();
        my @dirs = File::Spec->splitdir($dir);
        if ($dirs[-1] eq 't') {
            push @INC, "../lib";
        }
    }
    foreach my $plugin (@plugins) {
        load_class ($plugin) or croak ("Can't load $plugin");
        $plugin->new({bagit => $self});
    }

    return 1;
}

###############################################


sub load {
    my ($self) = @_;
    # call trigger
    $self->bag_path;
    $self->bag_version;
    $self->bag_encoding;
    $self->bag_info;
    $self->payload_path;
    $self->manifest_files;
    $self->checksum_algos;
    $self->tagmanifest_files;
    return 1;
}

###############################################


sub verify_bag {
    my ($self,$opts) = @_;
    #removed the ability to pass in a bag in the parameters, but might want options
    #like $return all errors rather than dying on first one
    my $bagit = $self->bag_path;
    my $version = $self->bag_version(); # to call trigger
    my $encoding = $self->bag_encoding(); # to call trigger
    my $baginfo = $self->verify_baginfo(); #to call trigger

    my $fetch_file = File::Spec->catfile($self->metadata_path, "fetch.txt");
    my $payload_dir   = $self->payload_path;
    my $return_all_errors = $opts->{return_all_errors};

    if (-f $fetch_file) {
        croak("Fetching via file '$fetch_file' is not supported by current Archive::BagIt implementation")
    }
    # check forced fixity
    if ($self->has_forced_fixity_algorithm()) {
        my $forced_fixity_alg = $self->forced_fixity_algorithm()->name();
        my $manifest_file = File::Spec->catfile($self->metadata_path, "manifest-$forced_fixity_alg.txt");
        croak("Manifest '$manifest_file' is not a regular file or does not exist for given bagit version '$version'") unless -f ($manifest_file);
    }
    croak("Payload-directory '$payload_dir' is not a directory or does not exist") unless -d ($payload_dir);

    unless ($version > .95) {
        croak ("Bag Version $version is unsupported");
    }

    my @errors;


    # check for manifests
    foreach my $algorithm ( keys %{ $self->manifests }) {
        my $res = $self->manifests->{$algorithm}->verify_manifest($self->payload_files, $return_all_errors);
        if ((defined $res) && ($res ne "1")) { push @errors, $res; }
    }
    #check for tagmanifests
    foreach my $algorithm ( keys %{ $self->manifests }) {
        my $res = $self->manifests->{$algorithm}->verify_tagmanifest($self->non_payload_files, $return_all_errors);
        if ((defined $res) && ($res ne "1")) { push @errors, $res; }
    }
    push @{$self->{errors}}, @errors;
    my $err = $self->errors();
    my @err =  @{ $err };
    if (scalar( @err ) > 0) {
        croak join("\n","bag verify for bagit version '$version' failed with invalid files.", @err);
    }
    return 1;
}


sub calc_payload_oxum {
    my($self) = @_;
    my @payload = @{$self->payload_files};
    my $octets=0;
    my $streamcount = scalar @payload;
    foreach my $local_name (@payload) {# local_name is relative to bagit base
        my $file = File::Spec->catfile($self->bag_path(), $local_name);
        if (-e $file) {
            my $filesize = 0;
            $filesize = -s $file or carp "empty file $file detected";
            $octets += $filesize;
        } else { croak "file $file does not exist, $!"; }
    }
    return ($octets, $streamcount);
}


sub calc_bagsize {
    my($self) = @_;
    my ($octets,$streamcount) = $self->calc_payload_oxum();
    if ($octets < 1024) { return "$octets B"; }
    elsif ($octets < 1024*1024) {return sprintf("%0.1f kB", $octets/1024); }
    elsif ($octets < 1024*1024*1024) {return sprintf "%0.1f MB", $octets/(1024*1024); }
    elsif ($octets < 1024*1024*1024*1024) {return sprintf "%0.1f GB", $octets/(1024*1024*1024); }
    else { return sprintf "%0.2f TB", $octets/(1024*1024*1024*1024); }
}


sub create_bagit {
    my($self) = @_;
    my $metadata_path = $self->metadata_path();
    my $bagit_path = File::Spec->catfile( $metadata_path, "bagit.txt");
    open(my $BAGIT, ">:encoding(UTF-8)", $bagit_path) or croak("Can't open $bagit_path for writing: $!");
    print($BAGIT "BagIt-Version: 1.0\nTag-File-Character-Encoding: UTF-8");
    close($BAGIT);
    return 1;
}


sub create_baginfo {
    my($self) = @_; # because bag-info.txt allows multiple key-value-entries, hash is replaced
    $self->add_or_replace_baginfo_by_key('Bagging-Date', POSIX::strftime("%Y-%m-%d", gmtime(time)));
    $self->add_or_replace_baginfo_by_key('Bag-Software-Agent', 'Archive::BagIt <https://metacpan.org/pod/Archive::BagIt>');
    my ($octets, $streams) = $self->calc_payload_oxum();
    $self->add_or_replace_baginfo_by_key('Payload-Oxum', "$octets.$streams");
    $self->add_or_replace_baginfo_by_key('Bag-Size', $self->calc_bagsize());
    # The RFC does not allow reordering:
    my $metadata_path = $self->metadata_path();
    my $bag_info_path = File::Spec->catfile( $metadata_path, "bag-info.txt");
    if (
            (exists $self->{errors})
            and ((scalar @{$self->{errors}}) > 0)
    ) {
        croak "Could not create baginfo, because current file $bag_info_path has parsing errors!";
    }

lib/Archive/BagIt.pm  view on Meta::CPAN

    return $self;
}





__PACKAGE__->meta->make_immutable;

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Archive::BagIt - The main module to handle bags.

=head1 VERSION

version 0.101

=head1 NAME

Achive::BagIt - The main module to handle Bags

=head1 SOURCE

The original development version was on github at L<http://github.com/rjeschmi/Archive-BagIt>
and may be cloned from there.

The actual development version is available at L<https://git.fsfe.org/art1pirat/Archive-BagIt>

=head1 Conformance to RFC8493

The module should fulfill the RFC requirements, with following limitations:

=over

=item only encoding UTF-8 is supported

=item version 0.97 or 1.0 allowed

=item version 0.97 requires tag-/manifest-files with md5-fixity

=item version 1.0 requires tag-/manifest-files with sha512-fixity

=item BOM is not supported

=item Carriage Return in bagit-files are not allowed

=item fetch.txt is unsupported

=back

At the moment only filepaths in linux-style are supported.

To get an more detailled overview, see the testsuite under F<t/verify_bag.t> and corresponding test bags from the BagIt conformance testsuite of Library of Congress under F<bagit_conformance_suite/>.

See L<https://datatracker.ietf.org/doc/rfc8493/?include_text=1> for details.

=head1 TODO

=over

=item enhanced testsuite

=item reduce complexity

=item use modern perl code

=item add flag to enable very strict verify

=back

=head1 Backward Compatibility

To reduce the complexity of code in current module the support for

=over

=item parallel processing
=item synchronous I/O

=back

is removed. The existing code is very fast, so there is no performance loss.

In near future the support for L<Archive::BagIt::Fast> will be  removed, because it needs hooks, which increase code
complexity in current module without any performance benefit.

=head1 FAQ

=head2 How to access the manifest-entries directly?

Try this:

   foreach my $algorithm ( keys %{ $self->manifests }) {
       my $entries_ref = $self->manifests->{$algorithm}->manifest_entries();
       # $entries_ref returns a hashref like:
       # {
       #     data/hello.txt   "e7c22b994c59d9cf2b48e549b1e24666636045930d3da7c1acb299d1c3b7f931f94aae41edda2c2b207a36e10f8bcb8d45223e54878f5b316e7ce3b6bc019629"
       # }
   }

Similar for tagmanifests

=head2 How fast is L<Archive::BagIt>?

I have made great efforts to optimize Archive::BagIt for high throughput. There are two limiting factors:

=over

=item calculation of checksums, by switching from the module "Digest" to OpenSSL by using L<Net::SSLeay> a significant
   speed increase could be achieved.

=item loading the files referenced in the manifest files was previously done serially and using synchronous I/O. By
   using the L<IO::Async> module, the files are loaded asynchronously, the performance gain is huge.

=back

On my system with 8cores, SSD and a large 9GB bag with 568 payload files the results for C<verify_bag()> are:

                    processing time          run time             throughput
   Version       user time    system time    total time    total    MB/s
    v0.71        38.31s        1.60s         39.938s       100%     230
    v0.81        25.48s        1.68s         27.1s          67%     340
    v0.82        48.85s        3.89s          6.84s         17%    1346

=head2 How fast is L<Archive::BagIt::Fast>?

It depends. On my system with 8cores, SSD and a 38MB bag with 48 payload files the results for C<verify_bag()> are:

                  Rate         Base         Fast
   Base         3.01/s           --         -21%
   Fast         3.80/s          26%           --

On my system with 8cores, SSD and a large 9GB bag with 568 payload files the results for C<verify_bag()> are:

                s/iter         Base         Fast
   Base           74.6           --          -9%
   Fast           68.3           9%           --

But you should measure which variant is best for you. In general the default L<Archive::BagIt> is fast enough.

=head2 How to update an old bag of version v0.97 to v1.0?

You could try this:

   use Archive::BagIt;
   my $bag=Archive::BagIt->new( $my_old_bag_filepath );
   $bag->load();
   $bag->store();

=head2 How to create UTF-8 based paths under MS Windows?

For versions < Windows10: I have no idea and suggestions for a portable solution are very welcome!
For Windows 10: Thanks to L<https://superuser.com/questions/1033088/is-it-possible-to-set-locale-of-a-windows-application-to-utf-8/1451686#1451686>
you have to enable UTF-8 support via 'System Administration' -> 'Region' -> 'Administrative'
-> 'Region Settings' -> Flag 'Use Unicode UTF-8 for worldwide language support'

Hint: The better way is to use only portable filenames. See L<perlport> for details.

=head1 BUGS

None known yet.

=head1 THANKS

Thanks to Rob Schmidt <rjeschmi@gmail.com> for the trustful handover of the project and thanks for your initial work!
I would also like to thank Patrick Hochstenbach and Rusell McOrmond for their valuable and especially detailed advice!
And without the helpful, sometimes rude help of the IRC channel #perl I would have been stuck in a lot of problems.
Without the support of my colleagues at SLUB Dresden, the project would never have made it this far.

=head1 SYNOPSIS

This modules will hopefully help with the basic commands needed to create
and verify a bag. This part supports BagIt 1.0 according to RFC 8493 ([https://tools.ietf.org/html/rfc8493](https://tools.ietf.org/html/rfc8493)).

You only need to know the following methods first:

=head2 read a BagIt

    use Archive::BagIt;

    #read in an existing bag:
    my $bag_dir = "/path/to/bag";
    my $bag = Archive::BagIt->new($bag_dir);

=head2 construct a BagIt around a payload

    use Archive::BagIt;
    my $bag2 = Archive::BagIt->make_bag($bag_dir);

=head2 verify a BagIt-dir

    use Archive::BagIt;

    # Validate a BagIt archive against its manifest
    my $bag3 = Archive::BagIt->new($bag_dir);
    my $is_valid1 = $bag3->verify_bag();

    # Validate a BagIt archive against its manifest, report all errors
    my $bag4 = Archive::BagIt->new($bag_dir);
    my $is_valid2 = $bag4->verify_bag( {report_all_errors => 1} );

=head2 read a BagIt-dir, change something, store

Because all methods operate lazy, you should ensure to parse parts of the bag *BEFORE* you modify it.
Otherwise it will be overwritten!

    use Archive::BagIt;
    my $bag5 = Archive::BagIt->new($bag_dir); # lazy, nothing happened
    $bag5->load(); # this updates the object representation by parsing the given $bag_dir
    $bag5->store(); # this writes the bag new

=head1 METHODS

=head2 Constructor

The constructor sub, will create a bag with a single argument,

    use Archive::BagIt;

    #read in an existing bag:
    my $bag_dir = "/path/to/bag";
    my $bag = Archive::BagIt->new($bag_dir);

or use hashreferences

    use Archive::BagIt;

    #read in an existing bag:
    my $bag_dir = "/path/to/bag";
    my $bag = Archive::BagIt->new(
        bag_path => $bag_dir,
    );

The arguments are:

=over 1

=item C<bag_path> - path to bag-directory

=item C<force_utf8> - if set the warnings about non portable filenames are disabled (default: enabled)

=item C<use_plugins> - expected manifest plugin strings, if set it uses the requested plugins,
      example C<Archive::BagIt::Plugin::Manifest::SHA256>.
      HINT: this option *disables* the forced fixity check in C<verify_bag()>!

=back

The bag object will use $bag_dir, BUT an existing $bag_dir is not read. If you use C<store()> an existing bag will be overwritten!

See C<load()> if you want to parse/modify an existing bag.

=head2 has_force_utf8()

to check if force_utf8() was set.

If set it ignores warnings about potential filepath problems.

=head2 bag_path([$new_value])

Getter/setter for bag path

=head2 metadata_path()

Getter for metadata path

=head2 payload_path()

Getter for payload path

=head2 checksum_algos()

Getter for registered Checksums

=head2 bag_version()

Getter for bag version

=head2 bag_encoding()

Getter for bag encoding.

HINT: the current version of Archive::BagIt only supports UTF-8, but the method could return other values depending on given Bags.

=head2 bag_info([$new_value])

Getter/Setter for bag info. Expects/returns an array of HashRefs implementing simple key-value pairs.

HINT: RFC8493 does not allow *reordering* of entries!

=head2 has_bag_info()

returns true if bag info exists.

=head2 errors()

Getter to return collected errors after a C<verify_bag()> call with Option C<report_all_errors>

=head2 warnings()

Getter to return collected warnings after a C<verify_bag()> call

=head2 digest_callback()

This method could be reimplemented by derived classes to handle fixity checks in own way. The
getter returns an anonymous function with following interface:

   my $digest = $self->digest_callback;
   &$digest( $digestobject, $filename);

This anonymous function MUST use the C<get_hash_string()> function of the L<Archive::BagIt::Role::Algorithm> role,
which is implemented by each L<Archive::BagIt::Plugin::Algorithm::XXXX> module.

See L<Archive::BagIt::Fast> for details.

=head2 get_baginfo_values_by_key($searchkey)

Returns all values which match $searchkey, undef otherwise

=head2 is_baginfo_key_reserved_as_uniq($searchkey)

returns true if key is reserved and should be uniq

=head2 is_baginfo_key_reserved( $searchkey )

returns true if key is reserved

=head2 verify_baginfo()

checks baginfo-keys, returns true if all fine, otherwise returns undef and the message is pushed to C<errors()>.
Warnings pushed to C< warnings() >

=head2 delete_baginfo_by_key( $searchkey )

deletes an entry of given $searchkey if exists.
If multiple entries with $searchkey exists, only the last one is deleted.

=head2 exists_baginfo_key( $searchkey )

returns true if a given $searchkey exists

=head2 append_baginfo_by_key($searchkey, $newvalue)

Appends a key value pair to bag_info.

HINT: check return code if append was successful, because some keys needs to be uniq.

=head2 add_or_replace_baginfo_by_key($searchkey, $newvalue)

It replaces the first entry with $newvalue if $searchkey exists, otherwise it appends.

=head2 forced_fixity_algorithm()

Getter to return the forced fixity algorithm depending on BagIt version

=head2 manifest_files()

Getter to find all manifest-files

=head2 tagmanifest_files()

Getter to find all tagmanifest-files

=head2 payload_files()

Getter to find all payload-files

=head2 non_payload_files()

Getter to find all non payload-files

=head2 plugins()

Getter/setter to algorithm plugins

=head2 manifests()

Getter/Setter to all manifests (objects)

=head2 algos()

Getter/Setter to all registered Algorithms

=head2 load_plugins

As default SHA512 and MD5 will be loaded and therefore used. If you want to create a bag only with one or a specific
checksum-algorithm, you could use this method to (re-)register it. It expects list of strings with namespace of type:
Archive::BagIt::Plugin::Algorithm::XXX where XXX is your chosen fixity algorithm.

=head2 load()

Triggers loading of an existing bag

=head2 verify_bag($opts)

A method to verify a bag deeply. If C<$opts> is set with C<{return_all_errors}> all fixity errors are reported.
The default ist to croak with error message if any error is detected.

HINT: You might also want to check L<Archive::BagIt::Fast> to see a more direct way of accessing files (and thus faster).

=head2 calc_payload_oxum()

returns an array with octets and streamcount of payload-dir

=head2 calc_bagsize()

returns a string with human readable size of paylod

=head2 create_bagit()

creates a bagit.txt file

=head2 create_baginfo()

creates a bag-info.txt file

Hint: the entries 'Bagging-Date', 'Bag-Software-Agent', 'Payload-Oxum' and 'Bag-Size' will be automagically set,
existing values in internal bag-info representation will be overwritten!

=head2 store()

store a bagit-obj if bagit directory-structure was already constructed.

=head2 init_metadata( $bag_path, $options)

A constructor that will just create the metadata directory

This won't make a bag, but it will create the conditions to do that eventually

=head2 make_bag( $bag_path, $options )

A constructor that will make and return a bag from a directory,

It expects a preliminary bagit-dir exists.
If there a data directory exists, assume it is already a bag (no checking for invalid files in root)

=head1 AVAILABILITY

The latest version of this module is available from the Comprehensive Perl
Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN
site near you, or see L<https://metacpan.org/module/Archive::BagIt/>.

=head1 BUGS AND LIMITATIONS

You can make new bug reports, and view existing ones, through the
web interface at L<http://rt.cpan.org>.

=head1 AUTHOR

Andreas Romeyke <cpan@andreas.romeyke.de>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2025 by Rob Schmidt <rjeschmi@gmail.com>, William Wueppelmann and Andreas Romeyke.

This is free software; you can redistribute it and/or modify it under



( run in 0.761 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )