Apache-Tika-Async
view release on metacpan or search on metacpan
lib/Apache/Tika/Server.pm view on Meta::CPAN
sub spawn_child( $self, @cmd ) {
my ($pid);
if( $^O =~ /mswin/i ) {
$pid = $self->spawn_child_win32(@cmd)
} else {
$pid = $self->spawn_child_posix(@cmd)
};
return $pid
}
sub launch( $self ) {
if( !$self->pid ) {
my $cmdline= join " ", $self->cmdline; # well, for Windows...
#warn $cmdline;
my $pid= $self->spawn_child( $self->cmdline )
or croak "Couldn't launch [$cmdline]: $!/$^E";
$self->pid( $pid );
sleep 2; # Java...
};
}
sub url {
# Should return URI instead
my( $self, $type )= @_;
$type||= 'text';
my $url= {
text => 'rmeta',
test => 'tika', # but GET instead of PUT
meta => 'rmeta',
#all => 'all',
language => 'language/string',
all => 'rmeta',
# unpack
}->{ $type };
sprintf
'http://%s:%s/%s',
$self->host,
$self->port,
$url
};
# /rmeta
# /unpacker
# /all
# /tika
# /language
# hello world
sub fetch {
my( $self, %options )= @_;
$options{ type }||= 'text';
my $url= $self->url( $options{ type } );
if(! $options{ content } and $options{ filename }) {
# read $options{ filename }
open my $fh, '<', $options{ filename }
or croak "Couldn't read '$options{ filename }': $!";
binmode $fh;
local $/;
$options{ content } = <$fh>;
};
my $method;
if( 'test' eq $options{ type } ) {
$method= 'get';
} else {
$method= 'put';
;
};
my $headers = $options{ headers } || {};
#my ($code,$res) = await
# $self->ua->request( $method, $url, $options{ content }, %$headers );
return $self->ua->request( $method, $url, $options{ content }, %$headers )
->then(sub( $code, $res ) {
my $info;
if( 'all' eq $options{ type }
or 'text' eq $options{ type }
or 'meta' eq $options{ type } ) {
if( $code !~ /^2..$/ ) {
croak "Got HTTP error code $code for '$options{ filename }'";
};
my $item = $res->[0];
# Should/could this be lazy?
my $c = delete $item->{'X-TIKA:content'};
# Ghetto-strip HTML we don't want:
if( $c =~ m!<body>(.*)</body>!s or $c =~ m!<body\s*/>!) {
$c = $1;
if( $item->{"Content-Type"} and $item->{"Content-Type"} =~ m!^text/plain\b!) {
# Also strip the enclosing <p>..</p>
$c =~ s!\A\s*<p>(.*)\s*</p>\s*\z!$1!s;
};
} else {
warn "Couldn't find HTML body in response: $c";
};
$info= Apache::Tika::DocInfo->new({
content => $c,
meta => $item,
});
if( ! defined $info->{meta}->{"meta:language"} ) {
# Yay. Two requests.
my $lang_meta = $self->fetch(%options, type => 'language', 'Content-Type' => $item->{'Content-Type'})->get;
$info->{meta}->{"meta:language"} = $lang_meta->meta->{"info"};
};
} else {
# Must be '/language'
if( $code !~ /^2..$/ ) {
croak "Got HTTP error code $code";
};
if( ref $res ) {
$res = $res->[0];
} else {
( run in 0.542 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )