CWB
view release on metacpan or search on metacpan
lib/CWB/CQP.pm view on Meta::CPAN
# activate corpus in managed mode (automatic character encoding conversion)
$cqp->activate($corpus);
# execute CQP command (blocking mode) and check for error
@lines = $cqp->exec($my_cmd);
unless ($cqp->ok) {
@cqp_error_message = $cqp->error_message;
my_error_handler();
}
# it's easier to use an automatic error handler
$cqp->set_error_handler(\&my_error_handler); # user-defined
$cqp->set_error_handler('die'); # built-in, useful for one-off scripts
# read TAB-delimited table from count, group, tabulate, ...
@table = $cqp->exec_rows($my_cmd);
# run CQP command in background (non-blocking mode)
$cqp->run($my_cmd);
if ($cqp->ready) { # specify optional timeout in seconds
my $line = $cqp->getline;
my @fields = $cqp->getrow; # TAB-delimited output
}
@lines = $cqp->getlines(10); # reads 10 lines, blocking if necessary
# execute in query lock mode (to improve security of CGI scripts)
$cqp->begin_query;
# execute untrusted CQP queries
$cqp->end_query;
@lines = $cqp->exec_query($untrusted_query); # convenience wrapper
# dump/undump a named query into/from a table of corpus positions
@matches = $cqp->dump("Last" [, $from, $to]);
$cqp->undump("Copy", @matches); # produces copy of "Last"
# safely quote regular expressions and literal strings for CQP queries
$query = $cqp->quote('[0-9]+"-[a-z-]+'); # picks single or double quotes
$query = $cqp->quote($cqp->quotemeta($word)); # escape all metacharacters
# activate CQP progress messages during query execution
$cqp->progress_on;
$status = $cqp->progress; # after starting CQP command with run()
($total, $pass, $n_passes, $msg, $percent) = $cqp->progress_info;
$cqp->progress_off;
$cqp->set_progress_handler(\&my_progress_handler); # user-defined handler
# shut down CQP server (exits gracefully)
undef $cqp;
=head1 DESCRIPTION
A B<CWB::CQP> object represents an instance of the corpus query processor CQP
running as a background process. By calling suitable methods on this object,
arbitrary CQP commands can be executed and their output can be captured.
The C<STDERR> stream of the CQP process is monitored for error messages,
which can automatically trigger an error handler.
Every B<CWB::CQP> object has its own CQP background process and communication is
fully asynchronous. This enables scripts to perform other actions while a long
CQP command is executing, or to run multiple CQP instances in parallel.
In managed mode (enabled with the B<activate> method), the API works consistently
with Perl Unicode strings, which are automatically translated to the character
encoding of the CWB corpus in the background.
=cut
use warnings;
use strict;
use sigtrap qw(die PIPE); # catch write errors to background CQP process
use CWB;
use Carp;
use FileHandle;
use IPC::Open3;
use IO::Select;
use Encode;
use POSIX ":sys_wait_h"; # for non-blocking waitpid
## package global variables
our @CQP_options = "-c"; # always run CQP in child mode
our $CQP_version = "3.5.0"; # required version of CQP (checked at startup)
our %Child = (); # keep track of running CQP processes
sub SIGCHLD_handler {
foreach my $child_pid (keys %Child) {
my $reaped = waitpid($child_pid, WNOHANG);
die "CWB::CQP: Child process #$child_pid terminated unexpectedly -- not safe to continue\n"
if $reaped > 0;
}
return; # allow other signal handlers to reap the SIGCHLD
}
$SIG{CHLD} = \&SIGCHLD_handler;
=head1 METHODS
The following methods are available:
=over 4
=item I<$cqp> = B<new> CWB::CQP;
=item I<$cqp> = B<new> CWB::CQP '-r /corpora/registry', '-l /data/cqpresults';
Spawn new CQP background process. The object I<$cqp> can then be used to communicate with
this CQP instance. Optional arguments of the B<new> method are passed as command-line
options to CQP. Use at your own risk.
=cut
## CWB::CQP object constructor
sub new {
my $class = shift; # class name
my $self = {}; # namespace for new CQP class object
my @options = @_; # CQP command-line options (use at your own risk)
# split options with values, e.g. "-r /my/registry" => "-r", "/my/registry" (doesn't work for multiple options in one string)
lib/CWB/CQP.pm view on Meta::CPAN
while ($self->{'select_any'}->can_read($timeout)) {
## STDERR -- read all available output on stderr first
if ($self->{'select_err'}->can_read(0)) {
sysread $self->{'err'}, $stderr_buffer, $self->{'block_size'}, length($stderr_buffer); # append to $stderr_buffer
}
## STDOUT -- if there is no more data on stderr, we should be able to read from stdout
elsif ($self->{'select_out'}->can_read(0)) {
sysread $self->{'out'}, $self->{'buffer'}, $self->{'block_size'}, length($self->{'buffer'}); # append to object's input buffer
if ($self->{'buffer'} =~ /\n/) {
## if there's a complete line in the input buffer, split off all lines
my @new_lines = split /\n/, $self->{'buffer'}, -1; # make sure that last line is handled correctly if buffer ends in \n
$self->{'buffer'} = pop @new_lines; # last entry is incomplete line ("" if buffer ended in \n) => return to input buffer
foreach my $line (@new_lines) {
## skip blank line printed after each CQP command
next if $line eq "";
## handle progress messages if ProgressBar has been activated
if ($self->{'progress'} and $line =~ /^-::-PROGRESS-::-/) {
my ($pass, $n_passes, $message);
(undef, $pass, $n_passes, $message) = split /\t/, $line;
my $percent = ($message =~ /([0-9]+)\%\s*complete/) ? $1+0 : undef; # extract progress percentage, if present
my $total_percent = (100 * ($pass - 1) + ($percent || 0)) / $n_passes; # estimate total progress ($percent assumed to be 0% if not given)
$self->{'progress_info'} = [$total_percent, $pass, $n_passes, $message, $percent];
my $handler = $self->{'progress_handler'};
if (ref($handler) eq 'CODE') {
$handler->($total_percent, $pass, $n_passes, $message, $percent); # call user-defined progress handler
}
}
## regular output lines are collected in object's line buffer
else {
$line = $encoder->decode($line) if defined $encoder;
push @{$self->{'lines'}}, $line;
$lines++;
}
}
}
last if $lines > 0; # if we have read a line and there is no output on stderr, return from function
}
## ERROR -- we should never reach this point
else {
die "CWB::CQP: INTERNAL ERROR in _update() -- no data on stdout or stderr of CQP child process";
}
}
if ($stderr_buffer ne "") {
$self->{'status'} = 'error'; # any output on stderr indicates that something went wrong
my @lines = split /\n/, $stderr_buffer;
@lines = map {$encoder->decode($_)} @lines
if defined $encoder;
$self->error(@lines); # may call error handler and abort, or print message and continue
# note that error() method automatically adds lines to internal error_message buffer
}
return $lines;
}
=item I<$cqp>->B<run>(I<$cmd>);
Start a single CQP command I<$cmd> in the background. This method returns immediately.
Command output can then be read with the B<getline>, B<getlines> and B<getrow> methods.
If asynchronous communication is desired, use B<ready> to check whether output is available.
It is an error to B<run> a new command before the output of the previous command has completely
been processed.
=cut
sub run {
croak 'USAGE: $cqp->run($cmd);'
unless @_ == 2;
my $self = shift;
my $cmd = shift;
my $debug = $self->{'debug'};
my $encoder = $self->{'encoder'};
$cmd =~ s/\n+/ /g; # make sure there are no newline characters (to be on the safe side)
$cmd =~ s/(;\s*)+$//; # ";" will be added when $cmd is sent to CQP
my $active_cmd = $self->{'command'};
croak "Error: new CQP command issued while '$active_cmd' is still being processed"
if $active_cmd;
$self->{'command'} = "$cmd;";
$self->{'status'} = 'ok';
$self->{'buffer'} = "";
$self->{'lines'} = [];
$self->{'error_message'} = [];
print "CQP << $cmd;\n"
if $debug;
$cmd = $encoder->encode($cmd, Encode::FB_PERLQQ) # turn invalid chars into hex escapes, so CQP can try to deal with them
if defined $encoder;
$self->{'in'}->print("$cmd;\n .EOL.;\n"); # append .EOL. command to mark end of CQP output
}
=item I<$num_of_lines> = I<$cqp>->B<ready>;
=item I<$num_of_lines> = I<$cqp>->B<ready>(I<$timeout>);
Check if output from current CQP command is available for reading with B<getline> etc.,
returning the number of lines currently held in the input buffer (possibly including an
end-of-output marker line that will not be returned by B<getline> etc.). If there is no
active command, returns B<undef>.
The first form of the command returns immediately. The second form waits up to I<$timeout>
seconds for CQP output to become available. Use a negative I<$timeout> for blocking mode.
=cut
sub ready {
my $self = shift;
my $timeout = shift;
my $lines = @{$self->{'lines'}};
return $lines # output has already been buffered => ready to read
if $lines > 0;
return undef # no command active => undefined state
unless $self->{'command'};
return $self->_update($timeout); # try to read from CQP process & return number of lines available (NB: line buffer was empty before)
}
( run in 0.764 second using v1.01-cache-2.11-cpan-5837b0d9d2c )