Apache-Tika-Async
view release on metacpan or search on metacpan
lib/Apache/Tika/Async.pm view on Meta::CPAN
default => sub { [ ] },
);
=head2 B<java>

    java => '/opt/openjdk-11-jre/bin/java',

Sets the Java executable to be used.

=cut
# Read-write attribute naming the Java executable. When the caller does not
# supply one, the bare command name 'java' is used.
has 'java' => (
    is      => 'rw',
    #isa    => 'Str',
    default => 'java',
);
=head2 B<java_args>

    java_args => [],

Sets the Java options to be used.

=cut
# Read-write attribute holding an array reference of options for the Java
# executable. When the caller does not supply one, the builder below
# provides the default list.
has 'java_args' => (
    is      => 'rw',
    #isa    => 'Array',
    builder => sub {
        # So that Tika can re-read some problematic PDF files better
        return [ '-Dorg.apache.pdfbox.baseParser.pushBackSize=1000000' ];
    },
);
# Render a Tika configuration document as one XML string without any
# whitespace between the elements.
#
#   %entries - element name => text content. Each pair becomes one child
#              element of <params>, emitted in sorted key order. Names and
#              values are inserted verbatim (no XML escaping is applied).
#
# Returns the complete XML document as a single string.
sub _tika_config_xml {
    my( $self, %entries ) = @_;

    my $params = '';
    for my $name ( sort keys %entries ) {
        $params .= join '', "<$name>", $entries{ $name }, "</$name>";
    }

    return '<?xml version="1.0" encoding="UTF-8"?>'
         . '<properties>'
         . '<!-- <parsers etc.../> -->'
         . '<server>'
         . '<params>'
         . $params
         . '</params>'
         . '</server>'
         . '</properties>';
}
# Build the Tika configuration XML for this object.
#
#   %entries - additional name => value pairs for the configuration. They
#              are passed after the logLevel pair, so an explicit
#              'logLevel' entry replaces the one taken from the object.
#
# Returns whatever _tika_config_xml returns for the combined pairs.
sub tika_config {
    my $self    = shift;
    my %entries = @_;

    # The object's own log level always comes first in the list
    my @defaults = ( logLevel => $self->loglevel );

    return $self->_tika_config_xml( @defaults, %entries );
}
# Write the Tika configuration to a freshly created temporary file.
#
#   %entries - passed through unchanged to tika_config.
#
# Returns the name of the temporary file. The file is not removed by this
# method; cleaning it up is left to the caller.
#
# Dies if the configuration cannot be written completely, so that a
# truncated or empty configuration file is never handed out.
sub tika_config_temp_file {
    my( $self, %entries ) = @_;

    # Build the content first: if that fails, no stray file is left behind
    my @config = $self->tika_config(%entries);

    my( $fh, $name ) = tempfile();
    binmode $fh;
    print {$fh} @config
        or die "Couldn't write Tika configuration to '$name': $!";
    # Buffered write errors only surface when the handle is closed
    close $fh
        or die "Couldn't close Tika configuration file '$name': $!";

    return $name;
}
# Pick the file with the highest Tika server version from a list of names.
#
#   $package - the invocant (unused).
#   @files   - candidate file names. The version is taken from the part
#              following "server-" or "server-standard-", for example
#              "tika-server-1.24.1.jar" or "tika-server-standard-2.9.1.jar".
#
# Returns the name with the highest major.minor.patch version. A missing
# patch level counts as 0. Names without a recognizable version compare as
# 0.0.0 and therefore lose against every versioned name. For an empty list,
# nothing is returned (empty list in list context, undef in scalar context).
sub best_jar_file {
    my( $package, @files ) = @_;

    # Extract [major, minor, patch] from a file name
    my $version_of = sub {
        my( $file ) = @_;
        return [ 0, 0, 0 ]
            unless defined $file
               and $file =~ /\bserver-(?:standard-)?(\d+)\.(\d+)(?:\.(\d+))?/;
        return [ $1, $2, defined $3 ? $3 : 0 ];
    };

    # Parse every name only once, then sort descending by version
    return (
        map  { $_->[1] }
        sort {
               $b->[0][0] <=> $a->[0][0]
            or $b->[0][1] <=> $a->[0][1]
            or $b->[0][2] <=> $a->[0][2]
        }
        map  { [ $version_of->( $_ ), $_ ] } @files
    )[0];
}
# Assemble the command line for launching Tika.
#
# Returns a list consisting of the Java executable, the Java options,
# '-jar' and the jar file, '--config' and the name of a configuration file
# obtained from tika_config_temp_file, and finally the Tika arguments.
sub cmdline {
    my $self = shift;

    return (
        $self->java,
        @{ $self->java_args },
        '-jar'     => $self->jarfile,
        '--config' => $self->tika_config_temp_file,
        @{ $self->tika_args },
    );
}
# Run the Tika command line for one file and return what it prints.
#
#   %options:
#     type     - appended to the command line as the second to last argument
#     filename - appended to the command line as the last argument
#
# Every argument is wrapped in double quotes and the arguments are joined
# with single spaces into one command string.
#
# Returns the standard output of the command as one string.
sub fetch {
    my( $self, %options )= @_;

    my @cmd = map { qq{"$_"} }
              ( $self->cmdline, $options{ type }, $options{ filename } );

    # NOTE(review): wrapping in double quotes does not protect against
    # arguments that themselves contain double quotes or characters the
    # shell expands inside double quotes - confirm that file names are
    # trusted before relying on this.

    #warn "[@cmd]";

    # readpipe takes exactly one scalar operand. Handing it the array
    # directly would evaluate the array in scalar context and run its
    # element count as the command, so build the command string explicitly.
    return '' . readpipe( join ' ', @cmd );
}
# Extract the double-quoted fields from one line of text.
#
#   $line - the text to scan, for example '"key","value"'.
#
# In list context, returns the contents of every "..." field in order of
# appearance, without the surrounding quotes. Empty fields ("") are
# returned as empty strings so that the following fields keep their
# position. Text outside of quotes is ignored, and quotes embedded in a
# field are not supported.
sub decode_csv {
    my( $self, $line )= @_;

    # [^"]* rather than [^"]+ : an empty field must be consumed as a field,
    # otherwise its closing quote is taken for the start of the next one
    return $line =~ m!"([^"]*)"!g;
}
# Retrieve the metadata for a file.
#
#   $file - passed to fetch as 'filename', together with type 'meta'.
#
# Returns the result of calling ->meta->get on whatever fetch returned.
# NOTE(review): this presumes that fetch returns an object providing a
# meta method - confirm against the fetch implementation in use.
sub get_meta {
    my $self = shift;
    my( $file ) = @_;

    my $response = $self->fetch( filename => $file, type => 'meta' );

    return $response->meta->get;
}
# Retrieve the text for a file.
#
#   $file - passed to fetch as 'filename', together with type 'text'.
#
# Returns the result of calling ->get on whatever fetch returned.
# NOTE(review): this presumes that fetch returns an object providing a
# get method - confirm against the fetch implementation in use.
sub get_text {
    my $self = shift;
    my( $file ) = @_;

    my $response = $self->fetch( filename => $file, type => 'text' );

    return $response->get;
}
# Run a fetch of type 'test' for a file.
#
#   $file - passed to fetch as 'filename', together with type 'test'.
#
# Returns the result of calling ->get on whatever fetch returned.
# NOTE(review): this presumes that fetch returns an object providing a
# get method - confirm against the fetch implementation in use.
sub get_test {
    my $self = shift;
    my( $file ) = @_;

    my $response = $self->fetch( filename => $file, type => 'test' );

    return $response->get;
}
sub get_all {
my( $self, $file )= @_;
( run in 0.935 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )