Apache-Tika-Async

 view release on metacpan or  search on metacpan

lib/Apache/Tika/Server.pm  view on Meta::CPAN

    '-jar',
    $self->jarfile,
    #'--port', $self->port,
    '--config', $self->tika_config_temp_file,
    @{$self->tika_args},
};

# Start @cmd as a background process on Windows.
#
# Uses the special form system(1, ...), which perlport documents for Win32
# as "spawn the program and return without waiting for it".
#
# Returns whatever system() hands back, which on Win32 is the process
# identifier of the new child.
sub spawn_child_win32( $self, @cmd ) {
    my $child = system(1, @cmd);
    return $child;
}

# Start @cmd as a detached background process on POSIX-like systems.
#
# The process is forked. The parent immediately gets the PID of the child
# back. The child changes its directory to "/", starts a new session,
# rewires its standard handles and then replaces itself with @cmd.
#
# Parameters:
#   $self - the invocant (not used by this method)
#   @cmd  - program and arguments, handed to exec() as a list
#
# Returns the PID of the child (in the parent process only).
# Dies with "can't fork: ..." if fork() fails.
#
# The child never returns from this sub: it either becomes @cmd or reports
# the problem on STDERR and terminates with exit status 1.
sub spawn_child_posix( $self, @cmd ) {
    require POSIX;

    defined(my $pid = fork())
        or die "can't fork: $!";
    return $pid
        if $pid;    # non-zero means we are the parent

    # From here on we are the child. Every failure is caught locally so that
    # an exception can never unwind into the caller's code and leave a second
    # copy of the calling program running.
    eval {
        no warnings 'exec';    # a failed exec is reported by the die below

        chdir("/")
            or die "can't chdir to /: $!";
        # Called fully qualified so nothing gets imported into this class
        (POSIX::setsid() != -1)
            or die "Can't start a new session: $!";

        # STDERR is duplicated before STDOUT gets redirected, so it keeps
        # pointing at whatever STDOUT was when we were called. Only STDIN
        # and STDOUT end up on /dev/null.
        open(STDERR, '>&', \*STDOUT)
            or die "can't dup stdout: $!";
        open(STDIN,  '<', '/dev/null')
            or die "can't read /dev/null: $!";
        open(STDOUT, '>', '/dev/null')
            or die "can't write to /dev/null: $!";

        exec @cmd
            or die "can't exec [@cmd]: $!";
    };

    # Only reached if something above failed
    print STDERR $@;

    # _exit() instead of exit(): END blocks and destructors inherited from
    # the parent must not run a second time in this short-lived child
    POSIX::_exit(1);
}

# Start @cmd as a background process, using the mechanism that fits the
# operating system we are running on.
#
# On Windows ($^O matches /mswin/i) the work is delegated to
# ->spawn_child_win32, everywhere else to ->spawn_child_posix.
#
# Returns whatever the platform-specific method returned, evaluated in
# scalar context.
sub spawn_child( $self, @cmd ) {
    my $spawner = $^O =~ /mswin/i
                ? 'spawn_child_win32'
                : 'spawn_child_posix';

    my $child = $self->$spawner(@cmd);
    return $child
}

# Start the server process unless this object already knows a PID.
#
# The command comes from ->cmdline and is started through ->spawn_child.
# The PID of the new process is stored via ->pid. Afterwards we wait two
# seconds (presumably to give the JVM time to come up).
#
# Croaks with "Couldn't launch [...]" if ->spawn_child returns a false value.
sub launch( $self ) {
    if( ! $self->pid ) {
        # One-line rendition of the command, only used for error reporting
        my $printable = join " ", $self->cmdline; # well, for Windows...
        #warn $printable;

        my $child = $self->spawn_child( $self->cmdline );
        croak "Couldn't launch [$printable]: $!/$^E"
            unless $child;

        $self->pid( $child );
        sleep 2; # Java...
    };
}

# Build the URL of the server endpoint that handles requests of kind $type.
#
# Parameters:
#   $type - one of 'text', 'test', 'meta', 'language' or 'all';
#           a false or missing value means 'text'
#
# Returns the URL as a plain string, "http://<host>:<port>/<path>", with
# host and port taken from ->host and ->port. A type that is not listed
# below has no path, so the URL then ends right after the slash.
sub url {
    # Should return URI instead
    my $self = shift;
    my $type = shift || 'text';

    my %endpoint_for = (
        text     => 'rmeta',
        test     => 'tika', # but GET instead of PUT
        meta     => 'rmeta',
        #all     => 'all',
        language => 'language/string',
        all      => 'rmeta',
        # unpack
    );
    my $endpoint = $endpoint_for{ $type };

    return sprintf 'http://%s:%s/%s', $self->host, $self->port, $endpoint;
}

# /rmeta
# /unpacker
# /all
# /tika
# /language
#    hello world
sub fetch {
    my( $self, %options )= @_;
    $options{ type }||= 'text';
    my $url= $self->url( $options{ type } );

    if(! $options{ content } and $options{ filename }) {
        # read $options{ filename }
        open my $fh, '<', $options{ filename }
            or croak "Couldn't read '$options{ filename }': $!";
        binmode $fh;
        local $/;
        $options{ content } = <$fh>;
    };

    my $method;
    if( 'test' eq $options{ type } ) {
        $method= 'get';

    } else {
        $method= 'put';
        ;
    };

    my $headers = $options{ headers } || {};

    #my ($code,$res) = await
    #    $self->ua->request( $method, $url, $options{ content }, %$headers );
    return $self->ua->request( $method, $url, $options{ content }, %$headers )
    ->then(sub( $code, $res ) {
        my $info;
        if(    'all' eq $options{ type }
            or 'text' eq $options{ type }
            or 'meta' eq $options{ type } ) {
            if( $code !~ /^2..$/ ) {
                croak "Got HTTP error code $code for '$options{ filename }'";
            };
            my $item = $res->[0];
            # Should/could this be lazy?
            my $c = delete $item->{'X-TIKA:content'};
            # Ghetto-strip HTML we don't want:



( run in 0.811 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )