CWB

 view release on metacpan or  search on metacpan

lib/CWB/CQP.pm  view on Meta::CPAN

# -*-cperl-*-

=head1 NAME

CWB::CQP - Interact with a CQP process running in the background

=head1 SYNOPSIS

  use CWB::CQP;

  # start CQP server process in the background
  $cqp = new CWB::CQP;
  $cqp = new CWB::CQP("-r /corpora/registry", "-I /global/init.cqp");

  # check for specified or newer CQP version
  $ok = $cqp->check_version($major, $minor, $beta);

  # activate corpus in managed mode (automatic character encoding conversion)
  $cqp->activate($corpus);

  # execute CQP command (blocking mode) and check for error
  @lines = $cqp->exec($my_cmd);
  unless ($cqp->ok) {
    @cqp_error_message = $cqp->error_message;
    my_error_handler();
  }

  # it's easier to use an automatic error handler
  $cqp->set_error_handler(\&my_error_handler); # user-defined
  $cqp->set_error_handler('die'); # built-in, useful for one-off scripts

  # read TAB-delimited table from count, group, tabulate, ...
  @table = $cqp->exec_rows($my_cmd);

  # run CQP command in background (non-blocking mode)
  $cqp->run($my_cmd);
  if ($cqp->ready) {  # specify optional timeout in seconds
    my $line = $cqp->getline;
    my @fields = $cqp->getrow; # TAB-delimited output
  }
  @lines = $cqp->getlines(10); # reads 10 lines, blocking if necessary

  # execute in query lock mode (to improve security of CGI scripts)
  $cqp->begin_query;
    # execute untrusted CQP queries
  $cqp->end_query;
  
  @lines = $cqp->exec_query($untrusted_query); # convenience wrapper
  
  # dump/undump a named query into/from a table of corpus positions
  @matches = $cqp->dump("Last" [, $from, $to]);
  $cqp->undump("Copy", @matches);  # produces copy of "Last"

  # safely quote regular expressions and literal strings for CQP queries
  $query = $cqp->quote('[0-9]+"-[a-z-]+');      # picks single or double quotes
  $query = $cqp->quote($cqp->quotemeta($word)); # escape all metacharacters

  # activate CQP progress messages during query execution
  $cqp->progress_on;
  $status = $cqp->progress; # after starting CQP command with run()
  ($total, $pass, $n_passes, $msg, $percent) = $cqp->progress_info;
  $cqp->progress_off;

  $cqp->set_progress_handler(\&my_progress_handler); # user-defined handler

  # shut down CQP server (exits gracefully)
  undef $cqp;

=head1 DESCRIPTION

A B<CWB::CQP> object represents an instance of the corpus query processor CQP
running as a background process.  By calling suitable methods on this object,
arbitrary CQP commands can be executed and their output can be captured.
The C<STDERR> stream of the CQP process is monitored for error messages,
which can automatically trigger an error handler.

Every B<CWB::CQP> object has its own CQP background process and communication is
fully asynchronous.  This enables scripts to perform other actions while a long
CQP command is executing, or to run multiple CQP instances in parallel.

In managed mode (enabled with the B<activate> method), the API works consistently
with Perl Unicode strings, which are automatically translated to the character
encoding of the CWB corpus in the background.

=cut

use warnings;
use strict;

use sigtrap qw(die PIPE);       # catch write errors to background CQP process

use CWB;
use Carp;
use FileHandle;
use IPC::Open3;
use IO::Select;
use Encode;

use POSIX ":sys_wait_h";         # for non-blocking waitpid

## package global variables
our @CQP_options = "-c";         # always run CQP in child mode
our $CQP_version = "3.5.0";      # required version of CQP (checked at startup)

our %Child = ();                 # keep track of running CQP processes

sub SIGCHLD_handler {
  foreach my $child_pid (keys %Child) {
    my $reaped = waitpid($child_pid, WNOHANG);
    die "CWB::CQP: Child process #$child_pid terminated unexpectedly -- not safe to continue\n"
      if $reaped > 0;
  }
  return; # allow other signal handlers to reap the SIGCHLD
}
$SIG{CHLD} = \&SIGCHLD_handler;

=head1 METHODS

The following methods are available:

=over 4

lib/CWB/CQP.pm  view on Meta::CPAN

Spawn new CQP background process.  The object I<$cqp> can then be used to communicate with 
this CQP instance.  Optional arguments of the B<new> method are passed as command-line
options to CQP.  Use at your own risk.

=cut

## CWB::CQP object constructor
sub new {
  my $class = shift;            # class name
  my $self = {};                # namespace for new CQP class object
  my @options = @_;             # CQP command-line options (use at your own risk)
  # split options with values, e.g. "-r /my/registry" => "-r", "/my/registry" (doesn't work for multiple options in one string)
  @options = map { (/^(--?[A-Za-z0-9]+)\s+(.+)$/) ? ($1, $2) : $_ } @options;

  ## run CQP server in the background
  my $in = $self->{'in'} = new FileHandle;   # stdin of CQP
  my $out = $self->{'out'} = new FileHandle; # stdout of CQP
  my $err = $self->{'err'} = new FileHandle; # stderr of CQP
  my $pid = open3($in, $out, $err, $CWB::CQP, @CQP_options, @options);
  $self->{'pid'} = $pid; # child process ID (so process can be killed if necessary)
  $Child{$pid} = 1;      # a weak ref to $self might be better here
  $in->autoflush(1);     # make sure that commands sent to CQP are always flushed immediately

  $self->{'encoder'} = undef; # in managed mode, an Encode object for character encoding conversion
  binmode($in, ":raw");
  binmode($out, ":raw");
  binmode($err, ":raw");

  my ($need_major, $need_minor, $need_beta) = split /\./, $CQP_version; # required CQP version
  $need_beta = 0 unless $need_beta;

  my $version_string = $out->getline; # child mode (-c) should print version on startup
  chomp $version_string;
  croak "ERROR: CQP backend startup failed ('$CWB::CQP @CQP_options @options')\n"
    unless $version_string =~ /^CQP\s+(?:\w+\s+)*([0-9]+)\.([0-9]+)(?:\.b?([0-9]+))?(?:\s+(.*))?$/;
  $self->{'major_version'} = $1;
  $self->{'minor_version'} = $2;
  $self->{'beta_version'} = $3 || 0;
  $self->{'compile_date'} = $4 || "unknown";
  croak "ERROR: CQP version too old, need at least v$CQP_version ($version_string)\n"
    unless ($1 > $need_major or
            $1 == $need_major
            and ($2 > $need_minor or
                 ($2 == $need_minor and $3 >= $need_beta)));

  ## command execution
  $self->{'command'} = undef; # CQP command string that is currently being processed (undef = last command has been completed)
  $self->{'lines'} = [];      # array of output lines read from CQP process
  $self->{'buffer'} = "";     # read buffer for standard output from CQP process
  $self->{'block_size'} = 256;  # block size for reading from CQP's output and error streams
  $self->{'query_lock'} = undef;# holds random key while query lock mode is active

  ## error handling (messages on stderr)
  $self->{'error_handler'} = undef; # set to subref for user-defined error handler
  $self->{'status'} = 'ok';         # status of last executed command ('ok' or 'error')
  $self->{'error_message'} = [];    # arrayref to array containing message produced by last command (if any)

  ## handling of CQP progress messages
  $self->{'progress'} = 0;             # whether progress messages are activated
  $self->{'progress_handler'} = undef; # optional callback for progress messages
  $self->{'progress_info'} = [];       # contains last available progress information: [$total_percent, $pass, $n_passes, $message, $percent]

  ## debugging (prints more or less everything on stdout)
  $self->{'debug'} = 0;

  ## select vectors for CQP output (stdout, stderr, stdout|stderr)
  $self->{'select_err'} = new IO::Select($err);
  $self->{'select_out'} = new IO::Select($out);
  $self->{'select_any'} = new IO::Select($err, $out);

  ## CQP object setup complete
  bless($self, $class);

  ## the following command will collect and ignore any output which may have been produced during startup
  $self->exec("set PrettyPrint off"); # pretty-printing should be turned off for non-interactive use

  return $self;
}

=item B<undef> I<$cqp>;

Exit CQP background process gracefully by issuing an C<exit;> command.
This is done automatically when the variable I<$cqp> goes out of scope.
Note that there may be a slight delay while B<CWB::CQP> waits for the CQP
process to terminate.

B<Do NOT> send an C<exit;> command to CQP explicitly (with B<exec> or B<run>).
This looks like a program crash to B<CWB::CQP> and will result in immediate
termination of the Perl script.

=cut

sub DESTROY {
  my $self = shift;
  my $pid = $self->{'pid'};
  my $alive = delete $Child{$pid}; # remove from list of children so no longer caught by signal handler
  
  if ($alive && $self->{'command'}) {
    while ($self->_update) {} # read pending output from active command
  }
  my $out = $self->{'out'};
  if (defined $out) {
    $out->print("exit")         # exit CQP backend
      if $alive;
    $out->close;
  }
  my $in = $self->{'in'};
  if (defined $in) {
    $in->close;
  }
  waitpid $pid, 0  # wait for CQP to exit and reap background process
}

=item I<$ok> = I<$cqp>->B<check_version>(I<$major>, I<$minor>, I<$beta>);

Check for minimum required CQP version, i.e. the background process has
to be CQP version I<$major>.I<$minor>.I<$beta> or newer.
I<$minor> and I<$beta> may be omitted, in which case they default to 0.
Note that the B<CWB::CQP> module automatically checks whether the CQP version
is compatible with its own requirements when a new object is created.
The B<check_version> method can subsequently be used to check for a more

lib/CWB/CQP.pm  view on Meta::CPAN

    $corpus = uc($corpus) unless $corpus =~ /:/; # enforce uppercase unless a subcorpus is activated
    $self->exec($corpus); # activate corpus (will raise an error in the usual way if corpus cannot be activated)
    if ($self->ok) {
      my @info = $self->exec("info");
      my $charset = undef;
      foreach (@info) {
        if (/^Charset:\s+(\S+)$/) {
          $charset = $1;
          last;
        }
      }
      my $encoder = (defined $charset) ? find_encoding($charset) : undef;
      if (defined $encoder) {
        $self->{'encoder'} = $encoder;
      }
      else {
        $self->error("Corpus $corpus does not declare a known character encoding.  Switching to non-managed mode.");
        $self->{'encoder'} = undef;
      }
    }
  }
}

## INTERNAL:
##    $lines_read = $self->_update([$timeout]);
## This is the main "workhorse" of the CWB::CQP module.  It checks for output from CQP process
## (stdout and stderr), updates progress status, fills internal buffers, and calls error and
## progress handlers if necessary.  The optional $timeout specifies how many seconds to wait for
## output; the default is 0 seconds, i.e. non-blocking mode, while a negative value blocks.
## NB: $lines_read includes the .EOL. terminator line, so it is safe to keep calling _update()
## until a non-zero value is returned (even if a CQP command fails with an error message).
sub _update {
  my $self = shift;
  my $timeout = shift || 0;
  $timeout = undef
    if $timeout < 0;
  my $stderr_buffer = "";
  my $lines = 0; # how many lines have successfully been read from stdout
  my $encoder = $self->{'encoder'};

  while ($self->{'select_any'}->can_read($timeout)) {
    ## STDERR -- read all available output on stderr first
    if ($self->{'select_err'}->can_read(0)) {
      sysread $self->{'err'}, $stderr_buffer, $self->{'block_size'}, length($stderr_buffer); # append to $stderr_buffer
    }

    ## STDOUT -- if there is no more data on stderr, we should be able to read from stdout
    elsif ($self->{'select_out'}->can_read(0)) {
      sysread $self->{'out'}, $self->{'buffer'}, $self->{'block_size'}, length($self->{'buffer'}); # append to object's input buffer
      if ($self->{'buffer'} =~ /\n/) {
        ## if there's a complete line in the input buffer, split off all lines
        my @new_lines = split /\n/, $self->{'buffer'}, -1; # make sure that last line is handled correctly if buffer ends in \n
        $self->{'buffer'} = pop @new_lines; # last entry is incomplete line ("" if buffer ended in \n) => return to input buffer
        foreach my $line (@new_lines) {
          ## skip blank line printed after each CQP command
          next if $line eq "";
          ## handle progress messages if ProgressBar has been activated
          if ($self->{'progress'} and $line =~ /^-::-PROGRESS-::-/) {
              my ($pass, $n_passes, $message); 
              (undef, $pass, $n_passes, $message) = split /\t/, $line;
              my $percent = ($message =~ /([0-9]+)\%\s*complete/) ? $1+0 : undef; # extract progress percentage, if present
              my $total_percent = (100 * ($pass - 1) + ($percent || 0)) / $n_passes; # estimate total progress ($percent assumed to be 0% if not given)
              $self->{'progress_info'} = [$total_percent, $pass, $n_passes, $message, $percent];
              my $handler = $self->{'progress_handler'};
              if (ref($handler) eq 'CODE') {
                $handler->($total_percent, $pass, $n_passes, $message, $percent); # call user-defined progress handler
              }
          }
          ## regular output lines are collected in object's line buffer
          else {
            $line = $encoder->decode($line) if defined $encoder;
            push @{$self->{'lines'}}, $line;
            $lines++;
          }
        }
      }
      last if $lines > 0;       # if we have read a line and there is no output on stderr, return from function
    }

    ## ERROR -- we should never reach this point
    else {
      die "CWB::CQP: INTERNAL ERROR in _update() -- no data on stdout or stderr of CQP child process";
    }
  }

  if ($stderr_buffer ne "") {
    $self->{'status'} = 'error'; # any output on stderr indicates that something went wrong
    my @lines = split /\n/, $stderr_buffer;
    @lines = map {$encoder->decode($_)} @lines
      if defined $encoder;
    $self->error(@lines); # may call error handler and abort, or print message and continue 
    # note that error() method automatically adds lines to internal error_message buffer
  }
  return $lines;
}

=item I<$cqp>->B<run>(I<$cmd>);

Start a single CQP command I<$cmd> in the background.  This method returns immediately.
Command output can then be read with the B<getline>, B<getlines> and B<getrow> methods.
If asynchronous communication is desired, use B<ready> to check whether output is available.

It is an error to B<run> a new command before the output of the previous command has completely
been processed.

=cut

sub run {
  croak 'USAGE:  $cqp->run($cmd);'
    unless @_ == 2;
  my $self = shift;
  my $cmd = shift;
  my $debug = $self->{'debug'};
  my $encoder = $self->{'encoder'};

  $cmd =~ s/\n+/ /g;            # make sure there are no newline characters (to be on the safe side)
  $cmd =~ s/(;\s*)+$//;         # ";" will be added when $cmd is sent to CQP

  my $active_cmd = $self->{'command'};
  croak "Error: new CQP command issued while '$active_cmd' is still being processed"
    if $active_cmd;

  $self->{'command'} = "$cmd;";
  $self->{'status'} = 'ok';
  $self->{'buffer'} = "";
  $self->{'lines'} = [];

lib/CWB/CQP.pm  view on Meta::CPAN

  my $self = shift;
  my @quoted = ();
  $self->{'status'} = 'ok'; # method raises an error if string cannot be quoted
  while (@_) {
    my $r = shift;
    if ($r =~ /(\\+)$/) {
      $self->error("Cannot quote string '$r' ending in unescaped backslash.")
        if (length($1) % 2) != 0;
    }
    if ($r !~ /"/) {
      push @quoted, "\"$r\""; # quote with "..." if string doesn't containt double quote
    }
    elsif ($r !~ /'/) {
      push @quoted, "'$r'";   # else quote with '...' if string doesn't contain single quote (= apostrophe)
    }
    else {
      # use double quotes, but escape all inner double quotes by doubling (unless already escaped with backslash)
      $r =~ s/(\\*)"/ (length($1) % 2) ? $& : "$&\"" /ge;
      push @quoted, "\"$r\"";
    }
  }
  return (wantarray) ? @quoted : shift @quoted;  
}

sub quotemeta {
  my $self = shift;
  my @quoted = ();
  while (@_) {
    my $s = shift;
    $s =~ s/([(){}\[\]|.?*+\$\\])/\\$1/g;
    $s =~ s/\^/[^]/g; # work around latex escapes like \^o in CQP
    push @quoted, $s;
  }
  return (wantarray) ? @quoted : shift @quoted;
}

=item I<$cqp>->B<debug>(1);

=item I<$cqp>->B<debug>(0);

Activate/deactivate debugging mode, which logs all executed commands and their complete output on STDOUT.  The B<debug> method returns the previous status for convenience.

=cut

sub debug {
  croak 'USAGE:  $prev_status = $cqp->debug( 1 | 0 ) ;'
    unless @_ == 2;
  my $self = shift;
  my $on = shift;
  my $prev = $self->{'debug'};
  $self->{'debug'} = $on;
  return $prev;
}

=item I<$cqp>->B<progress_on>;

=item I<$cqp>->B<progress_off>;

=item I<$message> = I<$cqp>->B<progress>;

=item (I<$total>, I<$pass>, I<$n_passes>, I<$msg>, I<$percent>) = I<$cqp>->B<progress_info>;

CQP progress messages can be activated and deactivated with the B<progress_on> and B<progress_off> methods (corresponding to C<set ProgressBar on|off;> in CQP).

If active, progress information can be obtained with the method B<progress>, which returns the last progress message received from CQP.  The B<progress_info> returns pre-parsed progress information, consisting of estimated total percentage of complet...

It is an error to call B<progress> or B<progress_info> without activating progress messages first.

=cut

## activate / deactivate CQP progress messages
sub progress_on {
  my $self = shift;
  if ($self->{'progress'}) {
    carp 'CWB::CQP:  Progress messages have already been activated (ignored)';
  }
  else {
    $self->exec("set ProgressBar on");
    $self->{'progress'} = 1;
  }
}

sub progress_off {
  my $self = shift;
  if ( $self->{'progress'}) {
    $self->exec("set ProgressBar off");
    $self->{'progress'} = 0;
  }
  else {
    carp 'CWB::CQP:  Progress messages have not been turned on yet (ignored)';
  }
}

## poll current progress status (estimated total percentage, or detailed information)
sub progress {
  my $self = shift;
  croak 'CWB::CQP:  No progress() information available, please call progress_on() first'
    unless $self->{'progress'};
  if (not $self->{'command'}) {
    carp 'CWB::CQP:  No active command, progress() does not make sense';
    return undef;
  }
  else {
    ## if input is already available, return corresponding progress state; otherwise check for new progress messages
    $self->_update
      unless @{$self->{'lines'}} > 0;
    return $self->{'progress_info'}->[0];
  }
}

sub progress_info {
  croak 'USAGE:  ($total_percent, $pass, $n_passes, $message, $percent) = $cqp->progress_info;'
    unless @_ == 1 and wantarray;
  my $self = shift;
  croak 'CWB::CQP:  No progress_info() available, please call progress_on() first'
    unless $self->{'progress'};
  if (not $self->{'command'}) {
    carp 'CWB::CQP:  No active command, progress() does not make sense';
    return undef;
  }
  else {
    ## if input is already available, return corresponding progress state; otherwise check for new progress messages
    $self->_update
      unless @{$self->{'lines'}} > 0;
    return @{$self->{'progress_info'}}
  }
}

=item I<$cqp>->B<set_progress_handler>(I<&my_progress_handler>);

Set a user-defined progress handler, which will be invoked whenever new progress information is received from CQP.  The argument must be a named or anonymous subroutine, which will be called with the information returned by B<progress_info>.  Note th...

Calling B<set_progress_handler> with B<undef> (or without an argument) disables the user-defined progress handler.

=cut

## set user-defined handler for CQP progress messages (does not automatically activate progress messages!)
sub set_progress_handler {
  my $self = shift;
  my $handler = shift;

  croak 'USAGE:  $cqp->set_progress_handler(\&my_progress_handler);'
    unless (not defined $handler) or (ref $handler eq 'CODE');
  if ($handler) {
    $self->{'progress_handler'} = $handler;
  }
  else {
    $self->{'progress_handler'} = undef;
  }
}

=back

=cut

return 1;

__END__

=head1 COPYRIGHT

Copyright (C) 2002-2022 Stephanie Evert [https://purl.org/stephanie.evert]

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut



( run in 0.926 second using v1.01-cache-2.11-cpan-97f6503c9c8 )