CWB

 view release on metacpan or  search on metacpan

lib/CWB/CQP.pm  view on Meta::CPAN

  # activate corpus in managed mode (automatic character encoding conversion)
  $cqp->activate($corpus);

  # execute CQP command (blocking mode) and check for error
  @lines = $cqp->exec($my_cmd);
  unless ($cqp->ok) {
    @cqp_error_message = $cqp->error_message;
    my_error_handler();
  }

  # it's easier to use an automatic error handler
  $cqp->set_error_handler(\&my_error_handler); # user-defined
  $cqp->set_error_handler('die'); # built-in, useful for one-off scripts

  # read TAB-delimited table from count, group, tabulate, ...
  @table = $cqp->exec_rows($my_cmd);

  # run CQP command in background (non-blocking mode)
  $cqp->run($my_cmd);
  if ($cqp->ready) {  # specify optional timeout in seconds
    my $line = $cqp->getline;
    my @fields = $cqp->getrow; # TAB-delimited output
  }
  @lines = $cqp->getlines(10); # reads 10 lines, blocking if necessary

  # execute in query lock mode (to improve security of CGI scripts)
  $cqp->begin_query;
    # execute untrusted CQP queries
  $cqp->end_query;
  
  @lines = $cqp->exec_query($untrusted_query); # convenience wrapper
  
  # dump/undump a named query into/from a table of corpus positions
  @matches = $cqp->dump("Last" [, $from, $to]);
  $cqp->undump("Copy", @matches);  # produces copy of "Last"

  # safely quote regular expressions and literal strings for CQP queries
  $query = $cqp->quote('[0-9]+"-[a-z-]+');      # picks single or double quotes
  $query = $cqp->quote($cqp->quotemeta($word)); # escape all metacharacters

  # activate CQP progress messages during query execution
  $cqp->progress_on;
  $status = $cqp->progress; # after starting CQP command with run()
  ($total, $pass, $n_passes, $msg, $percent) = $cqp->progress_info;
  $cqp->progress_off;

  $cqp->set_progress_handler(\&my_progress_handler); # user-defined handler

  # shut down CQP server (exits gracefully)
  undef $cqp;

=head1 DESCRIPTION

A B<CWB::CQP> object represents an instance of the corpus query processor CQP
running as a background process.  By calling suitable methods on this object,
arbitrary CQP commands can be executed and their output can be captured.
The C<STDERR> stream of the CQP process is monitored for error messages,
which can automatically trigger an error handler.

Every B<CWB::CQP> object has its own CQP background process and communication is
fully asynchronous.  This enables scripts to perform other actions while a long
CQP command is executing, or to run multiple CQP instances in parallel.

In managed mode (enabled with the B<activate> method), the API works consistently
with Perl Unicode strings, which are automatically translated to the character
encoding of the CWB corpus in the background.

=cut

use warnings;
use strict;

use sigtrap qw(die PIPE);       # catch write errors to background CQP process

use CWB;
use Carp;
use FileHandle;
use IPC::Open3;
use IO::Select;
use Encode;

use POSIX ":sys_wait_h";         # for non-blocking waitpid

## package global variables
our @CQP_options = "-c";         # always run CQP in child mode
our $CQP_version = "3.5.0";      # required version of CQP (checked at startup)

our %Child = ();                 # keep track of running CQP processes

sub SIGCHLD_handler {
  foreach my $child_pid (keys %Child) {
    my $reaped = waitpid($child_pid, WNOHANG);
    die "CWB::CQP: Child process #$child_pid terminated unexpectedly -- not safe to continue\n"
      if $reaped > 0;
  }
  return; # allow other signal handlers to reap the SIGCHLD
}
$SIG{CHLD} = \&SIGCHLD_handler;

=head1 METHODS

The following methods are available:

=over 4

=item I<$cqp> = B<new> CWB::CQP;

=item I<$cqp> = B<new> CWB::CQP '-r /corpora/registry', '-l /data/cqpresults';

Spawn new CQP background process.  The object I<$cqp> can then be used to communicate with 
this CQP instance.  Optional arguments of the B<new> method are passed as command-line
options to CQP.  Use at your own risk.

=cut

## CWB::CQP object constructor
sub new {
  my $class = shift;            # class name
  my $self = {};                # namespace for new CQP class object
  my @options = @_;             # CQP command-line options (use at your own risk)
  # split options with values, e.g. "-r /my/registry" => "-r", "/my/registry" (doesn't work for multiple options in one string)

lib/CWB/CQP.pm  view on Meta::CPAN

  while ($self->{'select_any'}->can_read($timeout)) {
    ## STDERR -- read all available output on stderr first
    if ($self->{'select_err'}->can_read(0)) {
      sysread $self->{'err'}, $stderr_buffer, $self->{'block_size'}, length($stderr_buffer); # append to $stderr_buffer
    }

    ## STDOUT -- if there is no more data on stderr, we should be able to read from stdout
    elsif ($self->{'select_out'}->can_read(0)) {
      sysread $self->{'out'}, $self->{'buffer'}, $self->{'block_size'}, length($self->{'buffer'}); # append to object's input buffer
      if ($self->{'buffer'} =~ /\n/) {
        ## if there's a complete line in the input buffer, split off all lines
        my @new_lines = split /\n/, $self->{'buffer'}, -1; # make sure that last line is handled correctly if buffer ends in \n
        $self->{'buffer'} = pop @new_lines; # last entry is incomplete line ("" if buffer ended in \n) => return to input buffer
        foreach my $line (@new_lines) {
          ## skip blank line printed after each CQP command
          next if $line eq "";
          ## handle progress messages if ProgressBar has been activated
          if ($self->{'progress'} and $line =~ /^-::-PROGRESS-::-/) {
              my ($pass, $n_passes, $message); 
              (undef, $pass, $n_passes, $message) = split /\t/, $line;
              my $percent = ($message =~ /([0-9]+)\%\s*complete/) ? $1+0 : undef; # extract progress percentage, if present
              my $total_percent = (100 * ($pass - 1) + ($percent || 0)) / $n_passes; # estimate total progress ($percent assumed to be 0% if not given)
              $self->{'progress_info'} = [$total_percent, $pass, $n_passes, $message, $percent];
              my $handler = $self->{'progress_handler'};
              if (ref($handler) eq 'CODE') {
                $handler->($total_percent, $pass, $n_passes, $message, $percent); # call user-defined progress handler
              }
          }
          ## regular output lines are collected in object's line buffer
          else {
            $line = $encoder->decode($line) if defined $encoder;
            push @{$self->{'lines'}}, $line;
            $lines++;
          }
        }
      }
      last if $lines > 0;       # if we have read a line and there is no output on stderr, return from function
    }

    ## ERROR -- we should never reach this point
    else {
      die "CWB::CQP: INTERNAL ERROR in _update() -- no data on stdout or stderr of CQP child process";
    }
  }

  if ($stderr_buffer ne "") {
    $self->{'status'} = 'error'; # any output on stderr indicates that something went wrong
    my @lines = split /\n/, $stderr_buffer;
    @lines = map {$encoder->decode($_)} @lines
      if defined $encoder;
    $self->error(@lines); # may call error handler and abort, or print message and continue 
    # note that error() method automatically adds lines to internal error_message buffer
  }
  return $lines;
}

=item I<$cqp>->B<run>(I<$cmd>);

Start a single CQP command I<$cmd> in the background.  This method returns immediately.
Command output can then be read with the B<getline>, B<getlines> and B<getrow> methods.
If asynchronous communication is desired, use B<ready> to check whether output is available.

It is an error to B<run> a new command before the output of the previous command has completely
been processed.

=cut

sub run {
  croak 'USAGE:  $cqp->run($cmd);'
    unless @_ == 2;
  my $self = shift;
  my $cmd = shift;
  my $debug = $self->{'debug'};
  my $encoder = $self->{'encoder'};

  $cmd =~ s/\n+/ /g;            # make sure there are no newline characters (to be on the safe side)
  $cmd =~ s/(;\s*)+$//;         # ";" will be added when $cmd is sent to CQP

  my $active_cmd = $self->{'command'};
  croak "Error: new CQP command issued while '$active_cmd' is still being processed"
    if $active_cmd;

  $self->{'command'} = "$cmd;";
  $self->{'status'} = 'ok';
  $self->{'buffer'} = "";
  $self->{'lines'} = [];
  $self->{'error_message'} = [];

  print "CQP << $cmd;\n"
    if $debug;
  $cmd = $encoder->encode($cmd, Encode::FB_PERLQQ) # turn invalid chars into hex escapes, so CQP can try to deal with them
    if defined $encoder;
  $self->{'in'}->print("$cmd;\n .EOL.;\n"); # append .EOL. command to mark end of CQP output
}

=item I<$num_of_lines> = I<$cqp>->B<ready>;

=item I<$num_of_lines> = I<$cqp>->B<ready>(I<$timeout>);

Check if output from current CQP command is available for reading with B<getline> etc.,
returning the number of lines currently held in the input buffer (possibly including an
end-of-output marker line that will not be returned by B<getline> etc.).  If there is no
active command, returns B<undef>.

The first form of the command returns immediately.  The second form waits up to I<$timeout>
seconds for CQP output to become available.  Use a negative I<$timeout> for blocking mode.

=cut

sub ready {
  my $self = shift;
  my $timeout = shift;

  my $lines = @{$self->{'lines'}};
  return $lines            # output has already been buffered => ready to read
    if $lines > 0;
  return undef             # no command active => undefined state
    unless $self->{'command'};
  return $self->_update($timeout); # try to read from CQP process & return number of lines available (NB: line buffer was empty before)
}



( run in 0.764 second using v1.01-cache-2.11-cpan-5837b0d9d2c )