streaming results from the CPAN

File-Raw


    use File::Raw qw(slurp spew);

    file_spew('/path/to/file', "data");  # Write data
    my $content = file_slurp('/path/to/file');  # Read data

=head2 slurp

    my $content = File::Raw::slurp($path);

Read entire file into a scalar. Returns undef on error.
Pre-allocates the buffer based on file size for optimal performance.

=head2 slurp_raw

    my $content = File::Raw::slurp_raw($path);

Same as C<slurp>, but explicitly in binary mode. C<slurp_raw> does not
accept a plugin tail.

=head2 spew

    my $ok = File::Raw::spew($path, $data);

Write data to file (creates or truncates). Returns true on success.

=head2 append

    my $ok = File::Raw::append($path, $data);

Append data to file. Returns true on success.

=head2 lines

    my $lines = File::Raw::lines($path);

Returns arrayref of all lines (without newlines).

=head2 each_line

    File::Raw::each_line($path, sub {
        print "Line: $_\n";  # line available via $_
    });

Process each line with a callback. Memory efficient - doesn't load
entire file into memory.

=head2 lines_iter

    my $iter = File::Raw::lines_iter($path);
    while (!$iter->eof) {
        my $line = $iter->next;
    }
    $iter->close;

Returns a line iterator object. Without a plugin tail, the iterator
streams bytes lazily and is memory-efficient. With a plugin tail
(C<lines_iter($path, plugin =E<gt> 'csv', ...)>) the iterator is eager:
the file is slurped and parsed into an AoA at construction time, and
C<next> walks the array; the C<header =E<gt> 1> and
C<header =E<gt> [names]> options are honoured. For memory-bounded
streaming through a plugin use C<each_line> instead.

B<Note:> For maximum performance, prefer C<each_line()> which uses
MULTICALL optimization and is significantly faster. Use C<lines_iter()>
when you need iterator control (e.g., early exit, multiple iterators).

=head2 mmap_open

    my $mmap = File::Raw::mmap_open($path);
    my $mmap = File::Raw::mmap_open($path, 1);  # writable

Memory-map a file. Returns a File::Raw::mmap object.

=head3 File::Raw::mmap methods

=over 4

=item data() - Returns the mapped content as a scalar (zero-copy)

=item sync() - Flush changes to disk (for writable maps)

=item close() - Unmap the file

=back

=head2 size

    my $bytes = File::Raw::size($path);

Returns file size in bytes, or -1 on error.

=head2 mtime

    my $epoch = File::Raw::mtime($path);

Returns modification time as epoch seconds, or -1 on error.

=head2 exists

    if (File::Raw::exists($path)) { ... }

Returns true if path exists.

=head2 is_file

    if (File::Raw::is_file($path)) { ... }

Returns true if path is a regular file.

=head2 is_dir

    if (File::Raw::is_dir($path)) { ... }

Returns true if path is a directory.

=head2 is_readable

    if (File::Raw::is_readable($path)) { ... }

Returns true if path is readable.

lib/File/Raw.pm view on Meta::CPAN


    my $mode = File::Raw::mode($path);

Returns the file permission bits (e.g., 0644), or -1 on error.

=head2 is_link

    if (File::Raw::is_link($path)) { ... }

Returns true if path is a symbolic link.

=head2 is_executable

    if (File::Raw::is_executable($path)) { ... }

Returns true if path is executable.

=head2 chmod

    my $ok = File::Raw::chmod($path, $mode);

Change file permissions. Returns true on success.

    File::Raw::chmod($path, 0755);

=head2 head

    my $lines = File::Raw::head($path);      # First 10 lines
    my $lines = File::Raw::head($path, 20);  # First 20 lines

Returns arrayref of first N lines (default 10).

=head2 tail

    my $lines = File::Raw::tail($path);      # Last 10 lines
    my $lines = File::Raw::tail($path, 20);  # Last 20 lines

Returns arrayref of last N lines (default 10).

=head2 range_lines

    my $lines = File::Raw::range_lines($path, $from, $count);

Returns arrayref of C<$count> lines starting at line C<$from>.
1-based: C<range_lines($p, 5, 3)> returns lines 5, 6, 7.
C<range_lines($p, 1, 10)> is equivalent to C<head($p, 10)>.

If C<$from> is past EOF, or C<$count <= 0>, or C<$from < 1>, returns
an empty arrayref. If fewer than C<$count> lines remain after C<$from>,
returns whatever is available (no error).

Accepts the standard plugin tail; with C<plugin =E<gt> 'csv'> (or any
plugin returning AoA) the range is applied to the parsed records:

    # Rows 100..149 of a CSV
    my $page = File::Raw::range_lines($p, 100, 50,
                                      plugin => 'csv', header => 1);

Same eager trade-off as C<lines_iter> with a plugin: the file is
slurped and parsed in full before the slice is taken. For
memory-bounded streaming through a plugin use C<each_line> with a
counter and C<die> to bail.

=head2 atomic_spew

    my $ok = File::Raw::atomic_spew($path, $data);

Write data to a temporary file then atomically rename. This ensures
the file is never in a partial state. Returns true on success.

=head2 grep_lines

    my $lines = File::Raw::grep_lines($path, \&predicate);
    my $lines = File::Raw::grep_lines($path, 'not_blank');

Filter lines matching a predicate. The predicate can be a coderef
or a registered predicate name.

Built-in predicates: blank, not_blank, empty, not_empty, comment, not_comment

    # Using coderef
    my $lines = File::Raw::grep_lines($path, sub { /pattern/ });
    
    # Using built-in predicate
    my $lines = File::Raw::grep_lines($path, 'not_blank');

=head2 count_lines

    my $count = File::Raw::count_lines($path);
    my $count = File::Raw::count_lines($path, \&predicate);
    my $count = File::Raw::count_lines($path, 'not_blank');

Count lines in a file. Optionally filter by predicate.

=head2 find_line

    my $line = File::Raw::find_line($path, \&predicate);
    my $line = File::Raw::find_line($path, 'not_blank');

Find the first line matching a predicate. Returns undef if not found.

=head2 map_lines

    my $results = File::Raw::map_lines($path, \&transform);

Transform each line with a callback, returns arrayref of results.

    my $lengths = File::Raw::map_lines($path, sub { length($_) });

=head2 register_predicate

    File::Raw::register_predicate($name, \&predicate);

Register a custom named predicate for use with grep_lines / count_lines /
find_line. The coderef receives the line in C<$_>.

    File::Raw::register_predicate('has_error', sub { /ERROR/ });
    my $errors = File::Raw::grep_lines($path, 'has_error');

=head2 list_predicates

    my $names = File::Raw::list_predicates();

Returns arrayref of registered predicate names (built-ins plus any
custom ones).

=head1 PLUGINS

Most read / write / iteration functions accept a B<plugin tail>:

    File::Raw::slurp($path, plugin => 'csv', sep => ';', header => 1);
    File::Raw::spew ($path, $rows, plugin => 'csv');
    File::Raw::each_line($path, sub { ... }, plugin => 'csv');

The tail is parsed as C<key =E<gt> value> pairs; the C<plugin> key is
mandatory whenever options are supplied. The named plugin must be
registered via L</register_plugin> (Perl) or
C<file_register_plugin()> (C, see L</XS API>) before the call.

The following functions are plugin-aware:

=over 4

=item * Read: C<slurp>, C<lines>, C<head>, C<tail>, C<range_lines>

=item * Write: C<spew>, C<append>, C<atomic_spew>

=item * Streaming: C<each_line>

=item * Iterator: C<lines_iter>

=item * Record-derived: C<grep_lines>, C<count_lines>, C<find_line>, C<map_lines>

=back

C<slurp_raw> and the stat / dir / path families are intentionally
plugin-free.

C<lines_iter> with a plugin tail is B<eager> (it slurps the file once
into an AoA at construction time and the iterator walks that array).
The iterator interface is preserved so callers can still store the
handle, call C<next>/C<eof>/C<close>, etc., but it is not
memory-bounded: for true streaming over huge files use C<each_line>
with the same plugin tail.

=head2 Plugin chains

The C<plugin> value can be an arrayref of plugin names instead of a
single name. The chain describes the file's encoding stack from
outermost wrapper to innermost format. The same list, in the same
order, is used for both reading and writing; only the order in which
the plugins are applied differs.

    # data.csv.gz: gzip wraps csv. Slurp unwraps left-to-right -
    # gunzip first, then parse csv - and returns an AoA.
    my $rows = File::Raw::slurp($path,
        plugin => ['gzip', 'csv']);

    # spew applies right-to-left: csv-encode the AoA into bytes,
    # then gzip the bytes, then write the result.
    File::Raw::spew($path, $rows,
        plugin => ['gzip', 'csv']);

The single-plugin scalar form (C<plugin =E<gt> 'csv'>) keeps its
current semantics exactly; chains are purely additive.

=head3 Per-plugin options

When the chain has more than one plugin, give each one its own
sub-hash. Keys outside any sub-hash are shared across the whole chain
(visible to every plugin); per-plugin keys win on conflict.

    File::Raw::slurp($path,
        plugin => ['gzip', 'csv'],
        gzip   => { level => 9 },         # only gzip sees this
        csv    => { sep => ';' },         # only csv sees this
        strict => 1,                      # both gzip and csv see this
    );

The single-plugin scalar form takes a flat options bag (top-level
keys go straight to the lone plugin) - no sub-hash required.

=head3 Type contract

File::Raw doesn't statically enforce the chain's type contract; each
plugin sees its predecessor's return value verbatim. The convention
is:

=over 4

=item *

For READ: every plugin except the last must return bytes. The last
plugin can return any shape (bytes, AoA, AoH, ...).

=item *

For WRITE: plugins are applied right-to-left, so the last-listed plugin
runs first and sees the user's payload (which may itself be
structured); every other plugin must accept bytes.

=back

In practice that means structured-output plugins (C<csv>, C<json>,
C<yaml>) belong B<last> in the chain list: they run last on READ and
first on WRITE. Byte-transform plugins (C<gzip>, C<base64>,
C<encoding>) are chain-friendly anywhere.

=head3 Phase coverage

Chains are supported for B<READ> and B<WRITE> only. The record-derived
helpers (C<grep_lines>, C<count_lines>, C<find_line>, C<map_lines>)
get chain support transparently because they slurp + transform via
READ before iterating records.

C<each_line> (the true streaming path) rejects arrayref C<plugin>
values: composing two streams needs a record-to-chunk adapter that's
its own design problem. Pass a single plugin name there. C<record>
phase is also single-plugin only - chaining record functions would
require records to keep the same shape across links.

=head3 Plugin-author notes

Existing plugins keep working without recompilation: C<FilePlugin> and
C<FilePluginContext> are unchanged. A plugin's C<read>/C<write>
callback is invoked the same way whether it's standalone or part of a
chain; the dispatcher builds a per-iteration C<ctx-E<gt>options> HV
that contains the shared keys overlaid with the plugin's own sub-hash
(if any).

=head2 register_plugin

    File::Raw::register_plugin($name, \%phases);
    File::Raw::register_plugin($name, \%phases, $override);

Register a plugin that will be invoked when callers pass
C<plugin =E<gt> $name>. C<%phases> is a hashref of coderefs keyed by
phase name. A plugin may implement any subset of phases; absent ones
cause a clear error if the user requests them.

    File::Raw::register_plugin('csv', {
        read   => sub { my ($path, $bytes,  $opts) = @_; ... },  # bytes -> AoA
        write  => sub { my ($path, $rows,   $opts) = @_; ... },  # rows  -> bytes
        record => sub { my ($path, $record, $opts) = @_; ... },  # transform/filter
    });

The C<stream> phase is intentionally not exposed from Perl - per-chunk
C<call_sv> overhead defeats the purpose of streaming. Plugins that need
record-by-record callbacks should implement C<record>; File::Raw drives
the iteration itself. Streaming plugins must be written in C.

Re-registering a name without C<$override> croaks; pass a true
C<$override> to replace.

=head2 unregister_plugin

    File::Raw::unregister_plugin($name);

Remove a previously-registered plugin.

=head2 list_plugins

    my $names = File::Raw::list_plugins();

Returns arrayref of currently registered plugin names. The built-in
C<'predicate'> plugin is always present.

=head2 The built-in 'predicate' plugin

A C plugin, registered at boot time, that owns the six built-in line
predicates, each available under two names (C<blank>/C<is_blank>,
C<not_blank>/C<is_not_blank>, C<empty>/C<is_empty>,
C<not_empty>/C<is_not_empty>, C<comment>/C<is_comment>,
C<not_comment>/C<is_not_comment>), plus any predicate added via
L</register_predicate>. The legacy 2-arg form

    File::Raw::grep_lines($path, 'is_blank');

is sugar for going through this plugin.

=head1 IMPORT STYLE

    use File::Raw qw(:all);              # Import all functions as file_*
    use File::Raw qw(slurp spew lines);  # Import specific functions
    use File::Raw qw(import);            # Same as :all (backwards compat)

When imported, the functions are installed with a C<file_> prefix and use
custom ops for maximum performance (eliminating function call overhead).

    use File::Raw qw(slurp spew);
    
    my $content = file_slurp($path);
    file_spew($path, $data);

Available imports: slurp, slurp_raw, spew, append, atomic_spew, lines,
exists, size, mtime, atime, ctime, mode, is_file, is_dir, is_link,
is_readable, is_writable, is_executable, unlink, mkdir, rmdir, touch,
copy, move, chmod, readdir, basename, dirname, extname, clear_stat_cache.

=head1 PERFORMANCE NOTES

=head2 Platform Optimizations

=over 4

=item * macOS: Uses copyfile() for native file copying

=item * Linux: Uses sendfile() for zero-copy file transfer

=item * Linux/BSD: Uses posix_fadvise() to hint sequential reads

=back

=head2 When to use File::Raw::stat

If you need multiple attributes from a file (size, mtime, is_file, etc.),
use C<File::Raw::stat()> instead of calling individual functions:

    # SLOW: 3 syscalls (one per call)
    my $size    = File::Raw::size($path);
    my $mtime   = File::Raw::mtime($path);
    my $is_file = File::Raw::is_file($path);
    
    # FAST: 1 syscall
    my $st = File::Raw::stat($path);
    my ($size, $mtime, $is_file) = @{$st}{qw(size mtime is_file)};

=head1 XS API

File::Raw exposes a plugin C API via C<include/file_plugin.h>. Downstream
XS modules can register C-level plugins that File::Raw's read / write /
streaming dispatch routes calls into - no per-record C<call_sv>
overhead. The shared object is loaded with C<RTLD_GLOBAL> so symbols
resolve at load time without an explicit link step on Linux/macOS.

=head2 Types

=over 4

=item B<FilePluginPhase>

    FILE_PLUGIN_PHASE_READ      /* whole-file slurp transform           */
    FILE_PLUGIN_PHASE_WRITE     /* whole-file spew/append transform     */
    FILE_PLUGIN_PHASE_RECORD    /* per-record dispatch                  */
    FILE_PLUGIN_PHASE_STREAM    /* chunked feed for streaming           */

=item B<FilePluginContext>

Per-call dispatch context (lifetime: single dispatch call).

    typedef struct FilePluginContext {
        const char  *path;          /* file path                        */
        SV          *data;          /* read: bytes; write: payload      */
        SV          *callback;      /* per-record cb (stream phase)     */
        HV          *options;       /* per-call opts; mortal; never NULL*/
        int          phase;
        int          cancel;        /* set non-zero to cancel op        */
        void        *plugin_state;  /* opaque, copied from plugin->state*/
    } FilePluginContext;

=item B<FilePlugin>

Registration block; the caller owns the storage (typically a file-scope
static) and must keep it alive for as long as the plugin is registered.

    typedef struct FilePlugin {
        const char            *name;
        file_plugin_read_fn    read_fn;    /* NULL if not implemented */
        file_plugin_write_fn   write_fn;
        file_plugin_record_fn  record_fn;
        file_plugin_stream_fn  stream_fn;
        void                  *state;
    } FilePlugin;

Phase signatures:

    typedef SV*  (*file_plugin_read_fn)   (pTHX_ FilePluginContext *ctx);
    typedef SV*  (*file_plugin_write_fn)  (pTHX_ FilePluginContext *ctx);
    typedef SV*  (*file_plugin_record_fn) (pTHX_ FilePluginContext *ctx, SV *record);
    typedef int  (*file_plugin_stream_fn) (pTHX_ FilePluginContext *ctx,
                                           const char *chunk, size_t len, int eof);

=back

=head2 Functions

=over 4

=item B<file_register_plugin>

    int file_register_plugin(pTHX_ const FilePlugin *plugin);

Returns 1 on success, 0 if a plugin with the same name is already
registered (use C<file_unregister_plugin> first), -1 on invalid input
(NULL plugin, NULL/empty name). Call during module initialisation only
(not thread-safe).

=item B<file_unregister_plugin>

    int file_unregister_plugin(pTHX_ const char *name);

Remove a plugin by name. Returns 1 if found and removed.

=item B<file_lookup_plugin>

( run in 2.351 seconds using v1.01-cache-2.11-cpan-140bd7fdf52 )