Apache-Tika-Async

 view release on metacpan or  search on metacpan

lib/Apache/Tika/Server.pm  view on Meta::CPAN


sub spawn_child( $self, @cmd ) {
    my ($pid);
    if( $^O =~ /mswin/i ) {
        $pid = $self->spawn_child_win32(@cmd)
    } else {
        $pid = $self->spawn_child_posix(@cmd)
    };

    return $pid
}

sub launch( $self ) {
    if( !$self->pid ) {
        my $cmdline= join " ", $self->cmdline; # well, for Windows...
        #warn $cmdline;
        my $pid= $self->spawn_child( $self->cmdline )
            or croak "Couldn't launch [$cmdline]: $!/$^E";
        $self->pid( $pid );
        sleep 2; # Java...
    };
}

sub url {
    # Should return URI instead
    my( $self, $type )= @_;
    $type||= 'text';

    my $url= {
        text => 'rmeta',
        test => 'tika', # but GET instead of PUT
        meta => 'rmeta',
        #all => 'all',
        language => 'language/string',
        all => 'rmeta',
        # unpack
    }->{ $type };

    sprintf
        'http://%s:%s/%s',
        $self->host,
        $self->port,
        $url
};

# /rmeta
# /unpacker
# /all
# /tika
# /language
#    hello world
sub fetch {
    my( $self, %options )= @_;
    $options{ type }||= 'text';
    my $url= $self->url( $options{ type } );

    if(! $options{ content } and $options{ filename }) {
        # read $options{ filename }
        open my $fh, '<', $options{ filename }
            or croak "Couldn't read '$options{ filename }': $!";
        binmode $fh;
        local $/;
        $options{ content } = <$fh>;
    };

    my $method;
    if( 'test' eq $options{ type } ) {
        $method= 'get';

    } else {
        $method= 'put';
        ;
    };

    my $headers = $options{ headers } || {};

    #my ($code,$res) = await
    #    $self->ua->request( $method, $url, $options{ content }, %$headers );
    return $self->ua->request( $method, $url, $options{ content }, %$headers )
    ->then(sub( $code, $res ) {
        my $info;
        if(    'all' eq $options{ type }
            or 'text' eq $options{ type }
            or 'meta' eq $options{ type } ) {
            if( $code !~ /^2..$/ ) {
                croak "Got HTTP error code $code for '$options{ filename }'";
            };
            my $item = $res->[0];
            # Should/could this be lazy?
            my $c = delete $item->{'X-TIKA:content'};
            # Ghetto-strip HTML we don't want:
            if( $c =~ m!<body>(.*)</body>!s or $c =~ m!<body\s*/>!) {
                $c = $1;

                if( $item->{"Content-Type"} and $item->{"Content-Type"} =~ m!^text/plain\b!) {
                    # Also strip the enclosing <p>..</p>
                    $c =~ s!\A\s*<p>(.*)\s*</p>\s*\z!$1!s;
                };
            } else {
                warn "Couldn't find HTML body in response: $c";
            };

            $info= Apache::Tika::DocInfo->new({
                content => $c,
                meta => $item,
            });

            if( ! defined $info->{meta}->{"meta:language"} ) {
                # Yay. Two requests.
                my $lang_meta = $self->fetch(%options, type => 'language', 'Content-Type' => $item->{'Content-Type'})->get;
                $info->{meta}->{"meta:language"} = $lang_meta->meta->{"info"};
            };

        } else {
            # Must be '/language'
            if( $code !~ /^2..$/ ) {
                croak "Got HTTP error code $code";
            };
            if( ref $res ) {
                $res = $res->[0];
            } else {



( run in 0.542 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )