File-Raw

 view release on metacpan or  search on metacpan

file.c  view on Meta::CPAN

    IV idx;
    SV *idx_sv;

    if (items < 1)
        croak("Usage: file::lines_iter(path [, plugin => ..., key => value ...])");

    path = SvPV_nolen(ST(0));

    /* Plugin path: slurp + dispatch READ, wrap the resulting AoA in an
     * iterator that walks records in order. This is eager (whole AoA
     * held in memory) - for true streaming use each_line($p, $cb,
     * plugin => ...). The iterator interface itself is preserved so
     * code that stores the iterator handle still composes. */
    if (items > 1) {
        HV *opts;
        SV *bytes;
        SV *out;
        AV *records;
        LineIterEntry *entry;

        opts = file_plugin_build_opts(aTHX_ &ST(0), 1, items, "lines_iter");

file.c  view on Meta::CPAN

    if (items < 2)
        croak("Usage: file::each_line(path, callback [, plugin => ..., key => value ...])");

    path = SvPV_nolen(ST(0));
    callback = ST(1);

    if (!SvROK(callback) || SvTYPE(SvRV(callback)) != SVt_PVCV) {
        croak("Second argument must be a code reference");
    }

    /* Plugin path: route through streaming dispatch. The plugin's
     * stream fn owns the record emission and calls back to `callback`
     * per record (typically once for each parsed CSV row, etc.). */
    if (items > 2) {
        HV *opts = file_plugin_build_opts(aTHX_ &ST(0), 2, items, "each_line");
        (void)file_plugin_dispatch_stream(aTHX_ opts, path, callback);
        SvREFCNT_dec((SV *)opts);
        XSRETURN_EMPTY;
    }

    block_cv = (CV*)SvRV(callback);

file.c  view on Meta::CPAN

       });

   The bridge allocates a PerlPluginBridge holding the coderef SVs plus
   a FilePlugin block whose function pointers are static C thunks. The
   thunks recover the bridge from FilePluginContext::plugin_state and
   call the appropriate coderef. The bridge is pinned in
   g_perl_plugins so we can free it on unregister.

   The 'stream' phase is intentionally not supported from Perl: a Perl
   stream plugin would be invoked once per chunk by file.c's read loop,
   and the per-call call_sv overhead defeats the point of streaming.
   Perl plugins that need record-by-record callbacks should implement
   the 'record' phase instead - File::Raw drives the iteration.
   ============================================ */

typedef struct PerlPluginBridge {
    char        *name;     /* strdup'd; pointer is stored in plugin.name */
    SV          *read_cv;
    SV          *write_cv;
    SV          *record_cv;
    FilePlugin   plugin;

include/file_plugin.h  view on Meta::CPAN

#include "EXTERN.h"
#include "perl.h"

/* Phase identifiers: the kinds of operation a plugin can be dispatched
 * for. A plugin only has to implement the phases it cares about;
 * unimplemented phases are NULL function pointers and any caller that
 * requests them gets a croak.
 *
 * The values are assigned explicitly and start at 1. The per-call
 * context (FilePluginContext::phase) carries one of these constants. */
typedef enum {
    FILE_PLUGIN_PHASE_READ      = 1,  /* whole-file slurp transform           */
    FILE_PLUGIN_PHASE_WRITE     = 2,  /* whole-file spew/append transform     */
    FILE_PLUGIN_PHASE_RECORD    = 3,  /* per-record dispatch (predicate/map)  */
    FILE_PLUGIN_PHASE_STREAM    = 4   /* chunked feed for streaming           */
} FilePluginPhase;

/* Per-call dispatch context. Lifetime: single dispatch call.
 *
 * Which fields carry meaning depends on `phase`: `data` holds the bytes
 * read from disk for a read and the payload for a write, while
 * `callback` is only set when streaming. `phase` is declared as a plain
 * int but holds a FILE_PLUGIN_PHASE_* value.
 *
 * Two state slots are provided: `plugin_state` is copied from the
 * plugin's registration (FilePlugin::state), whereas `call_state` is
 * scratch space scoped to one dispatch and managed by the plugin. */
typedef struct FilePluginContext {
    const char  *path;          /* file path, NUL-terminated, may be NULL    */
    SV          *data;          /* read: bytes from disk; write: payload     */
    SV          *callback;      /* per-record cb when streaming, else NULL   */
    HV          *options;       /* per-call opts; mortal; never NULL         */
    int          phase;         /* FILE_PLUGIN_PHASE_*                       */
    int          cancel;        /* set non-zero to cancel op                 */
    void        *plugin_state;  /* opaque, copied from FilePlugin::state     */
    void        *call_state;    /* NULL on entry; plugin scratch slot for    */
                                /* per-dispatch state. Survives across       */
                                /* chunks within one STREAM dispatch; reset  */
                                /* every fresh dispatch. Plugin owns the     */
                                /* alloc/free lifecycle.                     */
} FilePluginContext;

lib/File/Raw.pm  view on Meta::CPAN

        my $line = $iter->next;
    }
    $iter->close;

Returns a line iterator object. Without a plugin tail, the iterator
streams bytes lazily and is memory-efficient. With a plugin tail
(C<lines_iter($path, plugin =E<gt> 'csv', ...)>) the iterator is eager:
the file is slurped and parsed into an AoA at construction time, and
C<next> walks the array; the C<header =E<gt> 1> and
C<header =E<gt> [names]> options are honoured. For memory-bounded
streaming through a plugin use C<each_line> instead.

B<Note:> For maximum performance, prefer C<each_line()>, which uses
the MULTICALL optimization and is significantly faster. Use C<lines_iter()>
when you need iterator control (e.g., early exit, multiple iterators).

=head2 mmap_open

    my $mmap = File::Raw::mmap_open($path);
    my $mmap = File::Raw::mmap_open($path, 1);  # writable

lib/File/Raw.pm  view on Meta::CPAN


Accepts the standard plugin tail; with C<plugin =E<gt> 'csv'> (or any
plugin returning AoA) the range is applied to the parsed records:

    # Rows 100..149 of a CSV
    my $page = File::Raw::range_lines($p, 100, 50,
                                      plugin => 'csv', header => 1);

Same eager trade-off as C<lines_iter> with a plugin: the file is
slurped and parsed in full before the slice is taken. For
memory-bounded streaming through a plugin, use C<each_line>, keep a
counter in the callback, and C<die> to bail out once you have the
records you need.

=head2 atomic_spew

    my $ok = File::Raw::atomic_spew($path, $data);

Write data to a temporary file then atomically rename. This ensures
the file is never in a partial state. Returns true on success.

=head2 grep_lines

lib/File/Raw.pm  view on Meta::CPAN


=back

C<slurp_raw> and the stat / dir / path families are intentionally
plugin-free.

C<lines_iter> with a plugin tail is B<eager> (it slurps the file once
into an AoA at construction time and the iterator walks that array).
The iterator interface is preserved so callers can still store the
handle, call C<next>/C<eof>/C<close>, etc., but it is not
memory-bounded: for true streaming over huge files use C<each_line>
with the same plugin tail.

=head2 Plugin chains

The C<plugin> value can be an arrayref of plugin names instead of a
single name. The chain describes the file's encoding stack from
outermost wrapper to innermost format; same spelling for both
directions.

    # data.csv.gz: gzip wraps csv. Slurp unwraps left-to-right -

lib/File/Raw.pm  view on Meta::CPAN

chain. Byte-transform plugins (C<gzip>, C<base64>, C<encoding>) are
chain-friendly anywhere.

=head3 Phase coverage

Chains are supported for B<READ> and B<WRITE> only. The record-derived
helpers (C<grep_lines>, C<count_lines>, C<find_line>, C<map_lines>)
get chain support transparently because they slurp + transform via
READ before iterating records.

C<each_line> (the true streaming path) rejects arrayref C<plugin>
values: composing two streams needs a record-to-chunk adapter, which is
a separate design problem. Pass a single plugin name there. The
C<record> phase is also single-plugin only - chaining record functions
would require records to keep the same shape across links.

=head3 Plugin-author notes

Existing plugins keep working without recompilation: C<FilePlugin> and
C<FilePluginContext> are unchanged. A plugin's C<read>/C<write>
callback is invoked the same way whether it's standalone or part of a

lib/File/Raw.pm  view on Meta::CPAN

phase name. A plugin may implement any subset of phases; absent ones
cause a clear error if the user requests them.

    File::Raw::register_plugin('csv', {
        read   => sub { my ($path, $bytes,  $opts) = @_; ... },  # bytes -> AoA
        write  => sub { my ($path, $rows,   $opts) = @_; ... },  # rows  -> bytes
        record => sub { my ($path, $record, $opts) = @_; ... },  # transform/filter
    });

The C<stream> phase is intentionally not exposed from Perl - per-chunk
C<call_sv> overhead defeats the purpose of streaming. Plugins that need
record-by-record callbacks should implement C<record>; File::Raw drives
the iteration itself. Streaming plugins must be written in C.

Re-registering a name without C<$override> croaks; pass a true
C<$override> to replace.

=head2 unregister_plugin

    File::Raw::unregister_plugin($name);

lib/File/Raw.pm  view on Meta::CPAN

    my $is_file = File::Raw::is_file($path);
    
    # FAST: 1 syscall
    my $st = File::Raw::stat($path);
    my ($size, $mtime, $is_file) = @{$st}{qw(size mtime is_file)};

=head1 XS API

File::Raw exposes a plugin C API via C<include/file_plugin.h>. Downstream
XS modules can register C-level plugins, and File::Raw's read / write /
streaming dispatch calls into them directly, with no per-record
C<call_sv> overhead. The shared object is loaded with C<RTLD_GLOBAL> so
symbols resolve at load time without an explicit link step on Linux/macOS.

=head2 Types

=over 4

=item B<FilePluginPhase>

    FILE_PLUGIN_PHASE_READ      /* whole-file slurp transform           */
    FILE_PLUGIN_PHASE_WRITE     /* whole-file spew/append transform     */
    FILE_PLUGIN_PHASE_RECORD    /* per-record dispatch                  */
    FILE_PLUGIN_PHASE_STREAM    /* chunked feed for streaming           */

=item B<FilePluginContext>

Per-call dispatch context (lifetime: single dispatch call).

    typedef struct FilePluginContext {
        const char  *path;          /* file path                        */
        SV          *data;          /* read: bytes; write: payload      */
        SV          *callback;      /* per-record cb (stream phase)     */
        HV          *options;       /* per-call opts; mortal; never NULL*/

t/025-plugin-lines-iter.t  view on Meta::CPAN

#!/usr/bin/perl
use strict;
use warnings;
use Test::More;
use File::Raw;
use File::Temp qw(tempdir);

# lines_iter($p, plugin => 'name', ...) wraps the plugin's READ output
# (an AoA) in an iterator. Trade-off vs each_line: eager whole-AoA in
# memory, but preserves the iterator handle interface (storable,
# lazy-on-the-Perl-side, supports early close). For true streaming use
# each_line($p, $cb, plugin => 'name').

# 'words' fixture plugin: its read handler splits the slurped bytes on
# whitespace and hands back an AoA with one single-element row per word.
File::Raw::register_plugin(
    'words',
    {
        read => sub {
            # Only the bytes are used; path and options are ignored.
            my (undef, $content) = @_;
            my @rows = map { [$_] } split /\s+/, $content;
            return \@rows;
        },
    },
);



( run in 0.549 second using v1.01-cache-2.11-cpan-140bd7fdf52 )