Apache-Tika-Async

 view release on metacpan or  search on metacpan

lib/Apache/Tika/Async.pm  view on Meta::CPAN

    default => sub { [ ] },
);

=head2 B<java>

  java => '/opt/openjdk-11-jre/bin/java',

Sets the Java executable to be used.

=cut

# Read/write attribute holding the Java executable to launch.
# When not supplied it defaults to the bare command name 'java'.
has 'java' => (
    is      => 'rw',
    default => sub { 'java' },
    # isa   => 'Str',
);

=head2 B<java_args>

  java_args => [],

Sets the Java options to be used. If not given, a single option is used
that raises the PDFBox push-back buffer size to 1000000.

=cut

# Read/write attribute holding the array reference of options that are
# placed between the Java executable and '-jar' on the command line.
has 'java_args' => (
    is      => 'rw',
    # isa   => 'Array',
    builder => sub {
        # So that Tika can re-read some problematic PDF files better
        my @jvm_options = (
            '-Dorg.apache.pdfbox.baseParser.pushBackSize=1000000',
        );
        return \@jvm_options;
    },
);

# Render a Tika configuration document as a single-line XML string.
#
# Every key/value pair in %entries becomes one <key>value</key> element
# inside <properties><server><params>. Elements are emitted in sorted key
# order. Values are inserted verbatim; no XML escaping is applied.
# $self is accepted but not used.
sub _tika_config_xml {
    my $self    = shift;
    my %entries = @_;

    my $xml = '<?xml version="1.0" encoding="UTF-8"?>'
            . '<properties>'
            . '<!-- <parsers etc.../> -->'
            . '<server>'
            . '<params>';

    for my $name (sort keys %entries) {
        $xml .= "<$name>" . $entries{ $name } . "</$name>";
    }

    $xml .= '</params>'
          . '</server>'
          . '</properties>';

    return $xml;
}

# Return the Tika configuration XML for this object.
#
# The object's loglevel is supplied as the 'logLevel' entry; any pairs in
# %entries are appended after it, so a caller-supplied 'logLevel' wins.
# The actual rendering is delegated to _tika_config_xml.
sub tika_config {
    my $self    = shift;
    my %entries = @_;

    my @params = ( logLevel => $self->loglevel, %entries );

    return $self->_tika_config_xml(@params);
}

# Write the Tika configuration (see tika_config) to a fresh temporary file
# and return the name of that file.
#
# %entries is forwarded unchanged to tika_config. The file is written in
# binary mode. This sub does not delete the file; it is still present when
# the name is returned.
#
# Dies if the configuration cannot be written or the file cannot be closed,
# so that a truncated configuration is never handed to Tika silently.
sub tika_config_temp_file {
    my( $self, %entries ) = @_;

    my( $fh, $name ) = tempfile();
    binmode $fh;

    print {$fh} $self->tika_config(%entries)
        or die "Couldn't write Tika configuration to '$name': $!";

    # Buffered write errors (e.g. a full disk) only surface on close
    close $fh
        or die "Couldn't close Tika configuration file '$name': $!";

    return $name;
}

# Pick the jar file with the highest version from @files.
#
# The version is read from a "server-<major>.<minor>[.<patch>]" or
# "server-standard-<major>.<minor>[.<patch>]" part of each name. Versions are
# compared numerically component by component, so 1.10 beats 1.9 and
# 1.24.1 beats 1.24. Names without a recognizable version sort last.
#
# $package is the invocant (the sub is written to be called as a class
# method) and is not used. Returns the best file name, or nothing (undef in
# scalar context) when @files is empty.
sub best_jar_file {
    my( $package, @files ) = @_;

    my $version_re = qr/\bserver-(?:standard-|)(\d+)\.(\d+)(?:\.(\d+))?/;

    # Parse every name only once, then sort on the parsed components
    return (
        map  { $_->[0] }
        sort {
               $b->[1] <=> $a->[1]
            || $b->[2] <=> $a->[2]
            || $b->[3] <=> $a->[3]
        }
        map  {
            my @version = $_ =~ $version_re
                ? ( $1, $2, defined $3 ? $3 : 0 )
                : ( -1, -1, -1 );    # no version found: rank below any real one
            [ $_, @version ]
        } @files
    )[0];
}

# Assemble the command line used to launch Tika, as a list of words:
#
#   <java> <java_args...> -jar <jarfile> --config <temp config file> <tika_args...>
#
# Each call writes a new configuration file via tika_config_temp_file.
sub cmdline {
    my $self = shift;

    my $java_binary  = $self->java;
    my @jvm_options  = @{ $self->java_args };
    my $jar          = $self->jarfile;
    my $config_file  = $self->tika_config_temp_file;
    my @tika_options = @{ $self->tika_args };

    return (
        $java_binary,
        @jvm_options,
        '-jar', $jar,
        '--config', $config_file,
        @tika_options,
    );
}

# Run the command from cmdline with $options{type} and $options{filename}
# appended, and return everything the command printed to STDOUT as one
# string (the empty string if the command could not be run).
#
# Every word is wrapped in double quotes and the words are joined into one
# command string. readpipe() takes a single scalar expression, so passing
# it the array directly would execute the number of elements instead of
# the command.
#
# NOTE(review): the quoting is naive - a double quote (or, on POSIX shells,
# a '$' or backtick) inside a word is not escaped. Do not pass untrusted
# file names.
# NOTE(review): get_meta/get_text in this file call methods on the value
# returned by fetch, so subclasses presumably override this sub - confirm.
sub fetch {
    my( $self, %options )= @_;
    my @cmd= $self->cmdline;
    push @cmd, $options{ type };
    push @cmd, $options{ filename };

    my $command = join ' ', map { qq{"$_"} } @cmd;
    #die "Fetching from local process is currently disabled";
    #warn "[$command]";

    my $output = readpipe($command);
    return defined $output ? '' . $output : '';
}

# Extract every non-empty double-quoted field from $line.
#
# In list context this returns the field contents without their quotes.
# Empty fields ("") are not matched and therefore skipped.
sub decode_csv {
    my $self = shift;
    my $line = shift;

    return $line =~ m!"([^"]+)"!g;
}

# Fetch $file with type 'meta' and return the metadata.
#
# The value returned by fetch must provide a ->meta method, and the result
# of that call must provide ->get; whatever ->get returns is handed back.
sub get_meta {
    my $self = shift;
    my $file = shift;

    my $response = $self->fetch( filename => $file, type => 'meta' );
    my $meta     = $response->meta;

    return $meta->get;
}

# Fetch $file with type 'text' and return the result of calling ->get on
# the value fetch hands back.
sub get_text {
    my $self = shift;
    my $file = shift;

    my $response = $self->fetch( filename => $file, type => 'text' );

    return $response->get;
}

# Fetch $file with type 'test' and return the result of calling ->get on
# the value fetch hands back.
sub get_test {
    my $self = shift;
    my $file = shift;

    my $response = $self->fetch( filename => $file, type => 'test' );

    return $response->get;
}

sub get_all {
    my( $self, $file )= @_;



( run in 0.935 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )