CWB-Web

 view release on metacpan or  search on metacpan

lib/CWB/Web/Cache.pm  view on Meta::CPAN

    carp "Required argument(s) [@missing] not specified.";
    return 0;
  }
  return 1;
}

## INTERNAL:  @files = list_directory($dir);
## get directory listing from DirHandle module (local file names only!), skipping anything that doesn't look like an identifier
sub list_directory {
  my $dir = shift;
  my $dh = new DirHandle $dir;
  return ()
    unless defined $dh;
  my @files = grep {/^[A-Z0-9_-]+:/} $dh->read;
  undef $dh;
  return @files;
}

## INTERNAL:  $kbytes = directory_size($dir);
## determine size of directory with "du -ks" (returns size in kbytes)
sub directory_size {
  my $dir = shift;
  my @output = ();
  CWB::Shell::Cmd("du -ks $dir", \@output);
  croak "Can't parse output of 'du -ks $dir' (@output)"
    unless @output == 1 and $output[0] =~ /^([0-9]+)\s+\S+/;
  my $size = $1;
  return $size;
}

## INTERNAL:  delete_files(@files);
## unlink list of files and croak() in case of an error
sub delete_files {
  foreach my $file (@_) {
    croak "Error: can't unlink file $file ($!)"
      unless 1 == unlink $file;
  }
}

## INTERNAL:  @filenames = $cache->index_files;
## INTERNAL:  @matching_files = $cache->index_files($substring);
sub index_files {
  my $self = shift;
  my @files = list_directory($self->{'index'});
  if (@_) {
    my $sub = shift;
    @files = grep { index($_, $sub) >= 0 } @files;
  }
  return @files;
}

## INTERNAL:  $cache->make_dirs;
## ensure that cache directory and subdirectories exist, otherwise create them
sub make_dirs {
  my $self = shift;
  my $dir = $self->{'dir'};
  my $index = $self->{'index'};
  my $data = $self->{'data'};
  unless (-d $dir) {
    CWB::Shell::Cmd("mkdir $dir");
    CWB::Shell::Cmd("chmod 777 $dir");
  }
  unless (-d $index) {
    CWB::Shell::Cmd("mkdir $index");
    CWB::Shell::Cmd("chmod 777 $index");
  }
  unless (-d $data) {
    CWB::Shell::Cmd("mkdir $data");
    CWB::Shell::Cmd("chmod 777 $data");
  }
}

## INTERNAL:  $cache->sweep_cache;
## check size of cache data directory and delete old files when it exceeds the limit
sub sweep_cache {
  my $self = shift;
  my $force = shift;
  $force = (defined $force and $force eq "-force") ? 1 : 0;
  my $index = $self->{'index'};
  my $data = $self->{'data'};
  my $max_size = $self->{'size'} * 1024;        # maximal allowed size in kBytes
  my $expire = $self->{'expire'} / 24;          # maximal storage time in days
  my $size = directory_size($data);
  return                                        # return if cache size is below threshold (unless called with "-force")
    unless $force or $size > $max_size;
  # remove 'dangling' data files where index is missing (if there are any)
  my @files = grep { not -f "$index/$_" } list_directory($data);
  if (@files) {
    delete_files(map { "$index/$_"} @files);
  }
  # delete all files that are older than the maximal expiration time
  @files = grep { -M "$index/$_" > $expire } list_directory($index);
  if (@files) {
    delete_files(map { "$index/$_"} @files);
    delete_files(map { "$data/$_"} @files);
  }
  $size = directory_size($data);                # return if these steps have already reduced the cache size
  return unless $size > $max_size;
  # finally, sort remaining files by age (oldest first) and find out how many must be removed to reduce cache size below limit
  @files =
    sort { $b->[1] <=> $a->[1] }
      map { [$_, (-M "$index/$_"), (-s "$data/$_" || 0) / 1024] }
        list_directory($index);
  my $req_size = $size - 0.8 * $max_size;       # reduce to 80% of maximal cache size to reduce number of relatively slow cache sweeps
  my $cum_size = 0;
  while (@files and $cum_size < $req_size) {
    my $f = shift @files;
    my $file = $f->[0];
    unless (-M "$index/$file" < $f->[1]) {      # just to be sure that index file hasn't been touched in the meantime
      delete_files("$index/$file");
      if (-f "$data/$file") {
        delete_files("$data/$file");
      }
      $cum_size += $f->[2];
    }
  }
  $size = directory_size($data);                # check that cache sweep was successful
  croak 'Error: cache sweep failed for directory '.$self->{'dir'}
    if $size > $max_size;
}

## @lines = $cache->get_metadata($unique_name);
## read metadata for unique filename (read from index file), returns () if index file does not exist
sub get_metadata {
  my $self = shift;
  my $file = shift;
  my $index = $self->{'index'};
  my @lines = ();
  if (-f "$index/$file") {
    my $fh = CWB::OpenFile "$index/$file";
    while (<$fh>) {
      chomp;
      push @lines, $_;
    }
    $fh->close;
  }
  return @lines;
}

## $unique_name = $cache->store($named_query [, @metadata]);
## make named query result $named_query (which must be fully qualified with corpus name) persistent,
## returning unique name for retrieval; optional metadata is s(tored in index file when the data file has been created
sub store {
  my $self = shift;
  my $name = shift;
  my $index = $self->{'index'};
  my $data = $self->{'data'};
  my $cqp = $self->{'cqp'};
  croak "Error: query name $name illegal or not fully specified."
    unless $name =~ /^([A-Z0-9_-]+):([A-Za-z0-9_-]+)$/;
  my $corpus = $1;
  my $localname = $2;
  # create cache directory and subdirectories if necessary
  $self->make_dirs;
  # check size of cache directory and remove old entries if necessary
  $self->sweep_cache;
  # extend $name to unique filename (in index directory)
  my %index_file = map {$_ => 1} $self->index_files("$name-");
  my $ext = 1;
  while (exists $index_file{"$name-$ext"}) {
    $ext++;
  }
  my $unique = "$name-$ext";
  my $unique_local = "$localname-$ext";
  # now create empty index file to "lock" the unique name from other processes
  my $fh = CWB::OpenFile "> $index/$unique";
  $fh->close;
  CWB::Shell::Cmd("chmod 666 $index/$unique");
  # set DataDirectory in CQP session and re-activate base corpus
  $cqp->exec("set DataDirectory '$data'");
  $cqp->exec("$corpus");
  # copy named query to unique name and store it in data directory
  $cqp->exec("$unique_local = $name");
  $cqp->exec("save $unique");
  CWB::Shell::Cmd("chmod 666 $data/$unique");
  # re-write index file if caller has passed metadata
  if (@_) {
    $fh = CWB::OpenFile "> $index/$unique";
    while (@_) {
      my $line = shift;
      chomp $line;                              # normalize whitespace in metadata (esp. linebreaks)
      $line =~ s/\s+/ /g;
      print $fh "$line\n";
    }
    $fh->close;
  }
  return $unique;
}

## $size = $cache->retrieve($unique_name);
## load persistent query result identified by $unique_name into CQP process,
## returning number of matches (>= 0) or 'undef' if the query result has expired from the cache
sub retrieve {
  my $self = shift;
  my $unique = shift;
  my $index = $self->{'index'};
  my $data = $self->{'data'};
  my $cqp = $self->{'cqp'};
  my $expire = $self->{'expire'} / 24;          # maximal storage time in days
  croak "Error: illegal unique query name $unique"
    unless $unique =~ /^([A-Z0-9_-]+):([A-Za-z0-9_-]+)$/;
  my $corpus = $1;
  my $unique_local = $2;
  return undef                                  # may have expired from cache during sweep ...
    unless -f "$index/$unique";
  if (-M "$index/$unique" > $expire) {          # ... or because it exceeded the maximal storage time
    $self->sweep_cache("-force");               # delete expired files so they won't hide exact matches
    return undef;
  }
  CWB::Shell::Cmd("touch $index/$unique");      # touch index file to mark last access time
  # set DataDirectory in CQP session and re-activate base corpus
  $cqp->exec("set DataDirectory '$data'");
  $cqp->exec("$corpus");
  # force loading of named query result in CQP session
  my ($size) = $cqp->exec("size $unique");
  return $size;
}

## ($unique_name [, $matching_lines]) = $cache->retrieve_matching($prefix, ["-partial",] @metadata);
## load persistent query result which is a unique extension of $prefix and matches the specified metadata;
## if flag -partial is specified, a query result matching only the first N lines of @metadata will also
## be accepted; N is returned as $matching_lines and must be checked by the caller
sub retrieve_matching {
  my $self = shift;
  my $prefix = shift;
  my $partial = 0;
  if (@_ and lc($_[0]) eq "-partial") {
    $partial = 1;
    shift @_;
  }
  my $index = $self->{'index'};
  my $data = $self->{'data'};
  my $cqp = $self->{'cqp'};
  my $expire = $self->{'expire'} / 24;          # maximal storage time in days
  croak "Error: query name prefix $prefix is illegal or not fully specified."



( run in 0.534 second using v1.01-cache-2.11-cpan-39bf76dae61 )