Archive-BagIt

 view release on metacpan or  search on metacpan

lib/Archive/BagIt.pm  view on Meta::CPAN

        foreach my $local_entry (@paths) {
            my $path_entry = File::Spec->catdir($current_dir, $self->__handle_nonportable_local_entry($local_entry, $dir));
            if ((defined $excludedir) && ($path_entry eq $excludedir)) {
                # ignore it, because excluded
            } elsif (-f $path_entry) {
                push @tmp_file_paths, $path_entry;
            } elsif (-d $path_entry) {
                push @todo, $path_entry;
            } else {
                croak "not a file nor a dir found '$path_entry'";
            }
        }
        push @file_paths, sort @tmp_file_paths;
        foreach my $subdir (sort @todo) {
            &$finder($subdir);
        }
    };
    my $absolute = File::Spec->rel2abs( $dir );
    &$finder($absolute);
    @file_paths = map { File::Spec->abs2rel( $_, $dir)} @file_paths;
    return @file_paths;
}

sub _build_payload_files{
    my ($self) = @_;
    my $payload_dir = $self->payload_path;
    my $reldir = File::Spec->abs2rel($payload_dir, $self->bag_path());
    $reldir =~ s/^\.$//;
    my @payload = map {
        $reldir eq "" ? $_ : File::Spec->catfile($reldir, $_)
    } $self->__file_find($payload_dir, File::Spec->rel2abs($self->metadata_path));
    return wantarray ? @payload : \@payload;
}


sub __build_read_bagit_txt {
    my($self) = @_;
    my $bagit = $self->metadata_path;
    my $file = File::Spec->catfile($bagit, "bagit.txt");
    open(my $BAGIT, "<:encoding(UTF-8)", $file) or croak("Cannot read '$file': $!");
    my $version_string = <$BAGIT>;
    my $encoding_string = <$BAGIT>;
    close($BAGIT);
    if (defined $version_string) {
        $version_string =~ s/[\r\n]//;
    }
    if (defined $encoding_string) {
        $encoding_string =~s/[\r\n]//;
    }
    return ($version_string, $encoding_string, $file);
}

sub _build_bag_version {
    my($self) = @_;
    my ($version_string, $encoding_string, $file) = $self->__build_read_bagit_txt();
    croak "Version line missed in '$file" unless defined $version_string;
    if ($version_string =~ /^BagIt-Version: ([01]\.[0-9]+)$/) {
        return $1;
    } else {
        $version_string =~ s/\r/<CR>/;
        $version_string =~ s/^\N{U+FEFF}/<BOM>/;
        croak "Version string '$version_string' of '$file' is incorrect";
    };
}

sub _build_bag_encoding {
    my($self) = @_;
    my ($version_string, $encoding_string, $file) = $self->__build_read_bagit_txt();
    croak "Encoding line missed in '$file" unless defined $encoding_string;
    croak "Encoding '$encoding_string' of '$file' not supported by current Archive::BagIt module!" unless ($encoding_string ne "UTF-8");
    return $encoding_string;
}

sub __sort_bag_info {
    my @sorted = sort {
        my %tmpa = %{$a};
        my %tmpb = %{$b};
        my ($ka, $va) = each %tmpa;
        my ($kb, $vb) = each %tmpb;
        my $kres = $ka cmp $kb;
        if ($kres != 0) {
            return $kres;
        } else {
            return $va cmp $vb;
        }
    } @_;
    return @sorted;
}


sub _extract_key_from_textblob {
    my ($self, $textblob) = @_;
    if (!defined $textblob) {
        push @{$self->{errors}}, "the baginfo file '" . $self->{bag_info_file} . "' could not be parsed correctly, because textblob for key extraction is undefined";
        return (undef, undef);
    }
    my $key;
    my $rx_word = qr{[^: \t\r\n]+};# Hint: this word definition for bag-info.txt-keys differs from word definition of bag-info.txt-values!
    my $rx_spc = qr{\s}; #qr{[\t ]};
    if ($textblob =~ s/\A($rx_word)$rx_spc*:[\t ]*//m) {
        # label if starts with chars not colon or whitespace followed by zero or more spaces, a colon, zero or more spaces
        if ($textblob eq "") {
            push @{$self->{errors}}, "the baginfo file '" . $self->{bag_info_file} . "' could not be parsed correctly, because following text blob not fullfill the match requirements for values: '$textblob', empty value detected";
            return ($1, undef);
        }
        $key = $1;
    } else {
        push @{$self->{errors}}, "the baginfo file '".$self->{bag_info_file}."' could not be parsed correctly, because following text blob not fullfill the match requirements for keys: '$textblob'";
    }
    return ($key, $textblob);
}

sub _extract_value_from_textblob {
    my ($self, $textblob) = @_;
    if (!defined $textblob) {
        push @{$self->{errors}}, "the baginfo file '" . $self->{bag_info_file} . "' could not be parsed correctly, because textblob for value extraction is undefined";
        return (undef, undef);
    }
    if ($textblob eq "") {
        push @{$self->{errors}}, "the baginfo file '" . $self->{bag_info_file} . "' could not be parsed correctly, because textblob for value extraction is empty";
        return (undef, "");

lib/Archive/BagIt.pm  view on Meta::CPAN

}


sub make_bag {
    my ($class, $bag_path, $options) = @_;
    my $isa = ref $class;
    if ($isa eq "Archive::BagIt") { # not a class, but an object!
        croak "make_bag() only a class subroutine, not useable with objects. Try store() instead!\n";
    }
    my $self = $class->init_metadata($bag_path, $options);
    return $self;
}





__PACKAGE__->meta->make_immutable;

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Archive::BagIt - The main module to handle bags.

=head1 VERSION

version 0.101

=head1 NAME

Achive::BagIt - The main module to handle Bags

=head1 SOURCE

The original development version was on github at L<http://github.com/rjeschmi/Archive-BagIt>
and may be cloned from there.

The actual development version is available at L<https://git.fsfe.org/art1pirat/Archive-BagIt>

=head1 Conformance to RFC8493

The module should fulfill the RFC requirements, with following limitations:

=over

=item only encoding UTF-8 is supported

=item version 0.97 or 1.0 allowed

=item version 0.97 requires tag-/manifest-files with md5-fixity

=item version 1.0 requires tag-/manifest-files with sha512-fixity

=item BOM is not supported

=item Carriage Return in bagit-files are not allowed

=item fetch.txt is unsupported

=back

At the moment only filepaths in linux-style are supported.

To get an more detailled overview, see the testsuite under F<t/verify_bag.t> and corresponding test bags from the BagIt conformance testsuite of Library of Congress under F<bagit_conformance_suite/>.

See L<https://datatracker.ietf.org/doc/rfc8493/?include_text=1> for details.

=head1 TODO

=over

=item enhanced testsuite

=item reduce complexity

=item use modern perl code

=item add flag to enable very strict verify

=back

=head1 Backward Compatibility

To reduce the complexity of code in current module the support for

=over

=item parallel processing
=item synchronous I/O

=back

is removed. The existing code is very fast, so there is no performance loss.

In near future the support for L<Archive::BagIt::Fast> will be  removed, because it needs hooks, which increase code
complexity in current module without any performance benefit.

=head1 FAQ

=head2 How to access the manifest-entries directly?

Try this:

   foreach my $algorithm ( keys %{ $self->manifests }) {
       my $entries_ref = $self->manifests->{$algorithm}->manifest_entries();
       # $entries_ref returns a hashref like:
       # {
       #     data/hello.txt   "e7c22b994c59d9cf2b48e549b1e24666636045930d3da7c1acb299d1c3b7f931f94aae41edda2c2b207a36e10f8bcb8d45223e54878f5b316e7ce3b6bc019629"
       # }
   }

Similar for tagmanifests

=head2 How fast is L<Archive::BagIt>?



( run in 1.327 second using v1.01-cache-2.11-cpan-524268b4103 )