CWB-Web
view release on metacpan or search on metacpan
lib/CWB/Web/Cache.pm view on Meta::CPAN
carp "Required argument(s) [@missing] not specified.";
return 0;
}
return 1;
}
## INTERNAL: @files = list_directory($dir);
## get directory listing from DirHandle module (local file names only!), skipping anything that doesn't look like an identifier
sub list_directory {
my $dir = shift;
my $dh = new DirHandle $dir;
return ()
unless defined $dh;
my @files = grep {/^[A-Z0-9_-]+:/} $dh->read;
undef $dh;
return @files;
}
## INTERNAL: $kbytes = directory_size($dir);
## determine size of directory with "du -ks" (returns size in kbytes)
sub directory_size {
my $dir = shift;
my @output = ();
CWB::Shell::Cmd("du -ks $dir", \@output);
croak "Can't parse output of 'du -ks $dir' (@output)"
unless @output == 1 and $output[0] =~ /^([0-9]+)\s+\S+/;
my $size = $1;
return $size;
}
## INTERNAL: delete_files(@files);
## unlink list of files and croak() in case of an error
sub delete_files {
foreach my $file (@_) {
croak "Error: can't unlink file $file ($!)"
unless 1 == unlink $file;
}
}
## INTERNAL: @filenames = $cache->index_files;
## INTERNAL: @matching_files = $cache->index_files($substring);
sub index_files {
my $self = shift;
my @files = list_directory($self->{'index'});
if (@_) {
my $sub = shift;
@files = grep { index($_, $sub) >= 0 } @files;
}
return @files;
}
## INTERNAL: $cache->make_dirs;
## ensure that cache directory and subdirectories exist, otherwise create them
sub make_dirs {
my $self = shift;
my $dir = $self->{'dir'};
my $index = $self->{'index'};
my $data = $self->{'data'};
unless (-d $dir) {
CWB::Shell::Cmd("mkdir $dir");
CWB::Shell::Cmd("chmod 777 $dir");
}
unless (-d $index) {
CWB::Shell::Cmd("mkdir $index");
CWB::Shell::Cmd("chmod 777 $index");
}
unless (-d $data) {
CWB::Shell::Cmd("mkdir $data");
CWB::Shell::Cmd("chmod 777 $data");
}
}
## INTERNAL: $cache->sweep_cache;
## check size of cache data directory and delete old files when it exceeds the limit
sub sweep_cache {
my $self = shift;
my $force = shift;
$force = (defined $force and $force eq "-force") ? 1 : 0;
my $index = $self->{'index'};
my $data = $self->{'data'};
my $max_size = $self->{'size'} * 1024; # maximal allowed size in kBytes
my $expire = $self->{'expire'} / 24; # maximal storage time in days
my $size = directory_size($data);
return # return if cache size is below threshold (unless called with "-force")
unless $force or $size > $max_size;
# remove 'dangling' data files where index is missing (if there are any)
my @files = grep { not -f "$index/$_" } list_directory($data);
if (@files) {
delete_files(map { "$index/$_"} @files);
}
# delete all files that are older than the maximal expiration time
@files = grep { -M "$index/$_" > $expire } list_directory($index);
if (@files) {
delete_files(map { "$index/$_"} @files);
delete_files(map { "$data/$_"} @files);
}
$size = directory_size($data); # return if these steps have already reduced the cache size
return unless $size > $max_size;
# finally, sort remaining files by age (oldest first) and find out how many must be removed to reduce cache size below limit
@files =
sort { $b->[1] <=> $a->[1] }
map { [$_, (-M "$index/$_"), (-s "$data/$_" || 0) / 1024] }
list_directory($index);
my $req_size = $size - 0.8 * $max_size; # reduce to 80% of maximal cache size to reduce number of relatively slow cache sweeps
my $cum_size = 0;
while (@files and $cum_size < $req_size) {
my $f = shift @files;
my $file = $f->[0];
unless (-M "$index/$file" < $f->[1]) { # just to be sure that index file hasn't been touched in the meantime
delete_files("$index/$file");
if (-f "$data/$file") {
delete_files("$data/$file");
}
$cum_size += $f->[2];
}
}
$size = directory_size($data); # check that cache sweep was successful
croak 'Error: cache sweep failed for directory '.$self->{'dir'}
if $size > $max_size;
}
## @lines = $cache->get_metadata($unique_name);
## read metadata for unique filename (read from index file), returns () if index file does not exist
sub get_metadata {
my $self = shift;
my $file = shift;
my $index = $self->{'index'};
my @lines = ();
if (-f "$index/$file") {
my $fh = CWB::OpenFile "$index/$file";
while (<$fh>) {
chomp;
push @lines, $_;
}
$fh->close;
}
return @lines;
}
## $unique_name = $cache->store($named_query [, @metadata]);
## make named query result $named_query (which must be fully qualified with corpus name) persistent,
## returning unique name for retrieval; optional metadata is s(tored in index file when the data file has been created
sub store {
my $self = shift;
my $name = shift;
my $index = $self->{'index'};
my $data = $self->{'data'};
my $cqp = $self->{'cqp'};
croak "Error: query name $name illegal or not fully specified."
unless $name =~ /^([A-Z0-9_-]+):([A-Za-z0-9_-]+)$/;
my $corpus = $1;
my $localname = $2;
# create cache directory and subdirectories if necessary
$self->make_dirs;
# check size of cache directory and remove old entries if necessary
$self->sweep_cache;
# extend $name to unique filename (in index directory)
my %index_file = map {$_ => 1} $self->index_files("$name-");
my $ext = 1;
while (exists $index_file{"$name-$ext"}) {
$ext++;
}
my $unique = "$name-$ext";
my $unique_local = "$localname-$ext";
# now create empty index file to "lock" the unique name from other processes
my $fh = CWB::OpenFile "> $index/$unique";
$fh->close;
CWB::Shell::Cmd("chmod 666 $index/$unique");
# set DataDirectory in CQP session and re-activate base corpus
$cqp->exec("set DataDirectory '$data'");
$cqp->exec("$corpus");
# copy named query to unique name and store it in data directory
$cqp->exec("$unique_local = $name");
$cqp->exec("save $unique");
CWB::Shell::Cmd("chmod 666 $data/$unique");
# re-write index file if caller has passed metadata
if (@_) {
$fh = CWB::OpenFile "> $index/$unique";
while (@_) {
my $line = shift;
chomp $line; # normalize whitespace in metadata (esp. linebreaks)
$line =~ s/\s+/ /g;
print $fh "$line\n";
}
$fh->close;
}
return $unique;
}
## $size = $cache->retrieve($unique_name);
## load persistent query result identified by $unique_name into CQP process,
## returning number of matches (>= 0) or 'undef' if the query result has expired from the cache
sub retrieve {
my $self = shift;
my $unique = shift;
my $index = $self->{'index'};
my $data = $self->{'data'};
my $cqp = $self->{'cqp'};
my $expire = $self->{'expire'} / 24; # maximal storage time in days
croak "Error: illegal unique query name $unique"
unless $unique =~ /^([A-Z0-9_-]+):([A-Za-z0-9_-]+)$/;
my $corpus = $1;
my $unique_local = $2;
return undef # may have expired from cache during sweep ...
unless -f "$index/$unique";
if (-M "$index/$unique" > $expire) { # ... or because it exceeded the maximal storage time
$self->sweep_cache("-force"); # delete expired files so they won't hide exact matches
return undef;
}
CWB::Shell::Cmd("touch $index/$unique"); # touch index file to mark last access time
# set DataDirectory in CQP session and re-activate base corpus
$cqp->exec("set DataDirectory '$data'");
$cqp->exec("$corpus");
# force loading of named query result in CQP session
my ($size) = $cqp->exec("size $unique");
return $size;
}
## ($unique_name [, $matching_lines]) = $cache->retrieve_matching($prefix, ["-partial",] @metadata);
## load persistent query result which is a unique extension of $prefix and matches the specified metadata;
## if flag -partial is specified, a query result matching only the first N lines of @metadata will also
## be accepted; N is returned as $matching_lines and must be checked by the caller
sub retrieve_matching {
my $self = shift;
my $prefix = shift;
my $partial = 0;
if (@_ and lc($_[0]) eq "-partial") {
$partial = 1;
shift @_;
}
my $index = $self->{'index'};
my $data = $self->{'data'};
my $cqp = $self->{'cqp'};
my $expire = $self->{'expire'} / 24; # maximal storage time in days
croak "Error: query name prefix $prefix is illegal or not fully specified."
( run in 0.534 second using v1.01-cache-2.11-cpan-39bf76dae61 )