File-Raw
view release on metacpan or search on metacpan
IV idx;
SV *idx_sv;
if (items < 1)
croak("Usage: file::lines_iter(path [, plugin => ..., key => value ...])");
path = SvPV_nolen(ST(0));
/* Plugin path: slurp + dispatch READ, wrap the resulting AoA in an
* iterator that walks records in order. This is eager (whole AoA
* held in memory) - for true streaming use each_line($p, $cb,
* plugin => ...). The iterator interface itself is preserved so
* code that stores the iterator handle still composes. */
if (items > 1) {
HV *opts;
SV *bytes;
SV *out;
AV *records;
LineIterEntry *entry;
opts = file_plugin_build_opts(aTHX_ &ST(0), 1, items, "lines_iter");
if (items < 2)
croak("Usage: file::each_line(path, callback [, plugin => ..., key => value ...])");
path = SvPV_nolen(ST(0));
callback = ST(1);
if (!SvROK(callback) || SvTYPE(SvRV(callback)) != SVt_PVCV) {
croak("Second argument must be a code reference");
}
/* Plugin path: route through streaming dispatch. The plugin's
* stream fn owns the record emission and calls back to `callback`
* per record (typically once for each parsed CSV row, etc.). */
if (items > 2) {
HV *opts = file_plugin_build_opts(aTHX_ &ST(0), 2, items, "each_line");
(void)file_plugin_dispatch_stream(aTHX_ opts, path, callback);
SvREFCNT_dec((SV *)opts);
XSRETURN_EMPTY;
}
block_cv = (CV*)SvRV(callback);
});
The bridge allocates a PerlPluginBridge holding the coderef SVs plus
a FilePlugin block whose function pointers are static C thunks. The
thunks recover the bridge from FilePluginContext::plugin_state and
call the appropriate coderef. The bridge is pinned in
g_perl_plugins so we can free it on unregister.
The 'stream' phase is intentionally not supported from Perl: a Perl
stream plugin would be invoked once per chunk by file.c's read loop,
and the per-call call_sv overhead defeats the point of streaming.
Perl plugins that need record-by-record callbacks should implement
the 'record' phase instead - File::Raw drives the iteration.
============================================ */
typedef struct PerlPluginBridge {
char *name; /* strdup'd; pointer is stored in plugin.name */
SV *read_cv;
SV *write_cv;
SV *record_cv;
FilePlugin plugin;
include/file_plugin.h view on Meta::CPAN
#include "EXTERN.h"
#include "perl.h"
/* Dispatch phases a plugin can take part in.
 *
 * Implementing every phase is not required: a plugin leaves the function
 * pointer for any phase it does not support as NULL, and a caller that
 * requests such a phase gets a croak. */
typedef enum {
    /* whole-file slurp transform */
    FILE_PLUGIN_PHASE_READ = 1,

    /* whole-file spew/append transform */
    FILE_PLUGIN_PHASE_WRITE = 2,

    /* per-record dispatch (predicate/map) */
    FILE_PLUGIN_PHASE_RECORD = 3,

    /* chunked feed for streaming */
    FILE_PLUGIN_PHASE_STREAM = 4
} FilePluginPhase;
/* Per-call dispatch context. Lifetime: single dispatch call.
 *
 * One context describes a single plugin invocation: which file is being
 * processed, the data or callback involved, the per-call options, and the
 * phase being run. Two opaque slots carry state: plugin_state is copied
 * from FilePlugin::state, while call_state is scratch space owned by the
 * plugin for the duration of one dispatch. */
typedef struct FilePluginContext {
const char *path; /* file path, NUL-terminated, may be NULL */
SV *data; /* read: bytes from disk; write: payload */
SV *callback; /* per-record cb when streaming, else NULL */
HV *options; /* per-call opts; mortal; never NULL */
int phase; /* FILE_PLUGIN_PHASE_* */
/* NOTE(review): presumably written by the plugin and checked by the
 * dispatcher - confirm which side sets and which side reads it. */
int cancel; /* set non-zero to cancel op */
void *plugin_state; /* opaque, copied from FilePlugin::state */
void *call_state; /* NULL on entry; plugin scratch slot for */
/* per-dispatch state. Survives across */
/* chunks within one STREAM dispatch; reset */
/* every fresh dispatch. Plugin owns the */
/* alloc/free lifecycle. */
} FilePluginContext;
lib/File/Raw.pm view on Meta::CPAN
my $line = $iter->next;
}
$iter->close;
Returns a line iterator object. Without a plugin tail, the iterator
streams bytes lazily and is memory-efficient. With a plugin tail
(C<lines_iter($path, plugin =E<gt> 'csv', ...)>) the iterator is eager:
the file is slurped and parsed into an AoA at construction time, and
C<next> walks the array; the C<header =E<gt> 1> and
C<header =E<gt> [names]> options are honoured. For memory-bounded
streaming through a plugin use C<each_line> instead.
B<Note:> For maximum performance, prefer C<each_line()>, which uses the
MULTICALL optimization and is significantly faster. Use C<lines_iter()>
when you need iterator control (e.g., early exit, multiple iterators).
=head2 mmap_open
my $mmap = File::Raw::mmap_open($path);
my $mmap = File::Raw::mmap_open($path, 1); # writable
lib/File/Raw.pm view on Meta::CPAN
Accepts the standard plugin tail; with C<plugin =E<gt> 'csv'> (or any
plugin returning AoA) the range is applied to the parsed records:
# Rows 100..149 of a CSV
my $page = File::Raw::range_lines($p, 100, 50,
plugin => 'csv', header => 1);
Same eager trade-off as C<lines_iter> with a plugin: the file is
slurped and parsed in full before the slice is taken. For
memory-bounded streaming through a plugin, use C<each_line> with a
counter and C<die> to bail out early.
=head2 atomic_spew
my $ok = File::Raw::atomic_spew($path, $data);
Write data to a temporary file then atomically rename. This ensures
the file is never in a partial state. Returns true on success.
=head2 grep_lines
lib/File/Raw.pm view on Meta::CPAN
=back
C<slurp_raw> and the stat / dir / path families are intentionally
plugin-free.
C<lines_iter> with a plugin tail is B<eager> (it slurps the file once
into an AoA at construction time and the iterator walks that array).
The iterator interface is preserved so callers can still store the
handle, call C<next>/C<eof>/C<close>, etc., but it is not
memory-bounded: for true streaming over huge files use C<each_line>
with the same plugin tail.
=head2 Plugin chains
The C<plugin> value can be an arrayref of plugin names instead of a
single name. The chain describes the file's encoding stack from
outermost wrapper to innermost format; same spelling for both
directions.
# data.csv.gz: gzip wraps csv. Slurp unwraps left-to-right -
lib/File/Raw.pm view on Meta::CPAN
chain. Byte-transform plugins (C<gzip>, C<base64>, C<encoding>) are
chain-friendly anywhere.
=head3 Phase coverage
Chains are supported for B<READ> and B<WRITE> only. The record-derived
helpers (C<grep_lines>, C<count_lines>, C<find_line>, C<map_lines>)
get chain support transparently because they slurp + transform via
READ before iterating records.
C<each_line> (the true streaming path) rejects arrayref C<plugin>
values: composing two streams would need a record-to-chunk adapter,
which is a design problem of its own. Pass a single plugin name there.
The C<record> phase is also single-plugin only - chaining record
functions would require records to keep the same shape across links.
=head3 Plugin-author notes
Existing plugins keep working without recompilation: C<FilePlugin> and
C<FilePluginContext> are unchanged. A plugin's C<read>/C<write>
callback is invoked the same way whether it's standalone or part of a
lib/File/Raw.pm view on Meta::CPAN
phase name. A plugin may implement any subset of phases; absent ones
cause a clear error if the user requests them.
File::Raw::register_plugin('csv', {
read => sub { my ($path, $bytes, $opts) = @_; ... }, # bytes -> AoA
write => sub { my ($path, $rows, $opts) = @_; ... }, # rows -> bytes
record => sub { my ($path, $record, $opts) = @_; ... }, # transform/filter
});
The C<stream> phase is intentionally not exposed from Perl - per-chunk
C<call_sv> overhead defeats the purpose of streaming. Plugins that need
record-by-record callbacks should implement C<record>; File::Raw drives
the iteration itself. Streaming plugins must be written in C.
Re-registering a name without C<$override> croaks; pass a true
C<$override> to replace.
=head2 unregister_plugin
File::Raw::unregister_plugin($name);
lib/File/Raw.pm view on Meta::CPAN
my $is_file = File::Raw::is_file($path);
# FAST: 1 syscall
my $st = File::Raw::stat($path);
my ($size, $mtime, $is_file) = @{$st}{qw(size mtime is_file)};
=head1 XS API
File::Raw exposes a plugin C API via C<include/file_plugin.h>. Downstream
XS modules can register C-level plugins that File::Raw's read / write /
streaming dispatch routes calls into - no per-record C<call_sv>
overhead. The shared object is loaded with C<RTLD_GLOBAL> so symbols
resolve at load time without an explicit link step on Linux/macOS.
=head2 Types
=over 4
=item B<FilePluginPhase>
FILE_PLUGIN_PHASE_READ /* whole-file slurp transform */
FILE_PLUGIN_PHASE_WRITE /* whole-file spew/append transform */
FILE_PLUGIN_PHASE_RECORD /* per-record dispatch */
FILE_PLUGIN_PHASE_STREAM /* chunked feed for streaming */
=item B<FilePluginContext>
Per-call dispatch context (lifetime: single dispatch call).
typedef struct FilePluginContext {
const char *path; /* file path */
SV *data; /* read: bytes; write: payload */
SV *callback; /* per-record cb (stream phase) */
HV *options; /* per-call opts; mortal; never NULL*/
t/025-plugin-lines-iter.t view on Meta::CPAN
#!/usr/bin/perl
use strict;
use warnings;
use Test::More;
use File::Raw;
use File::Temp qw(tempdir);
# lines_iter($p, plugin => 'name', ...) wraps the plugin's READ output
# (an AoA) in an iterator. Trade-off vs each_line: eager whole-AoA in
# memory, but preserves the iterator handle interface (storable,
# lazy-on-the-Perl-side, supports early close). For true streaming use
# each_line($p, $cb, plugin => 'name').
# Plugin that returns an arrayref of single-element rows (one per word).
# Only the 'read' phase is registered. The callback ignores $p and $opts:
# it splits $bytes on runs of whitespace and wraps each piece in its own
# one-element arrayref, producing the AoA shape described above.
# NOTE(review): split /\s+/ yields a leading empty field when $bytes starts
# with whitespace - confirm the fixtures used with this plugin never do.
File::Raw::register_plugin('words', {
read => sub {
my ($p, $bytes, $opts) = @_;
return [ map { [$_] } split /\s+/, $bytes ];
},
});
( run in 1.589 second using v1.01-cache-2.11-cpan-140bd7fdf52 )