Acme-InputRecordSeparatorIsRegexp

 view release on metacpan or  search on metacpan

lib/Acme/InputRecordSeparatorIsRegexp.pm  view on Meta::CPAN

our @ISA = 'Exporter';
our @EXPORT_OK = ('open','autochomp','input_record_separator','binmode');
our %EXPORT_TAGS = ( all =>  [ @EXPORT_OK ] );

BEGIN {
    no strict 'refs';
    *{ 'Acme::IRSRegexp' . "::" } = \*{ __PACKAGE__ . "::" };
}

our $VERSION = '0.07';

sub TIEHANDLE {
    my ($pkg, @opts) = @_;
    my $handle;
    if (@opts % 2) {
	$handle = Symbol::gensym;
    } else {
	my $fh = *{shift @opts};
        # will fail if open for $fh failed, but that's not important
	eval { CORE::open $handle, '<&+', $fh };
    }
    my $rs = shift @opts;
    my %opts = @opts;
    $opts{maxrecsize} ||= ($opts{bufsize} || 16384) / 4;
    $opts{bufsize} ||= $opts{maxrecsize} * 4;
    my $self = bless {
	%opts,
	handle => $handle,
	rs => $rs,
	records => [],
	buffer => ''
    }, $pkg;
    $self->_compile_rs;
    return $self;
}

# We abuse the PerlIO layers syntax to attach
# a regexp specification to a filehandle. This
# function extracts an ':irs(REGEXP)' layer from
# a string.
sub _extract_irs {
    my ($mode) = @_;
    
    my $irs = "";
    my $p0 = index($mode,":irs(");
    my $p1 = $p0 + 5;
    my $nest = 1;
    while ($nest) {
        my $c = eval { substr($mode,$p1++,1) };
        if ($@ || !defined($c)) {
            carp "Argument list not closed for PerlIO layer \"$irs\"";
            return;
        }
        if ($c eq "\\") {
            $c .= substr($mode,$p1++,1);
        }
        if ($c eq "(") { $nest++ }
        if ($c eq ")") { $nest-- }
        if ($nest) { $irs .= $c; }
    }
    substr($mode,$p0,length($irs)+6, "");
    $_[0] = $mode;
    return $irs;
}

sub open (*;$@) {
    no strict 'refs';        # or else bareword file handles will break
    my (undef,$mode,$expr,@list) = @_;
    if (!defined $_[0]) {
        $_[0] = Symbol::gensym;
    }
    my $glob = $_[0];
    if (!ref($glob) && $glob !~ /::/) {
        $glob = join("::",caller(0) || "", $glob);
    }

    if ($mode && index($mode,":irs(") >= 0) {
        my $irs = _extract_irs($mode);
        my $z = @list ? CORE::open *$glob, $mode, $expr, @list
                      : CORE::open *$glob, $mode, $expr;
        tie *$glob, __PACKAGE__, *$glob, $irs;
        return $z;
    }
    if (@list) {
        return CORE::open(*$glob,$mode,$expr,@list);
    } elsif ($expr) {
        return CORE::open(*$glob,$mode,$expr);
    } elsif ($mode) {
        return CORE::open(*$glob,$mode);
    } else {
        return CORE::open(*$glob);
    }
}

sub binmode (*;$) {
    my ($glob,$mode) = @_;
    $mode ||= ":raw";

    if (index($mode,":irs(") >= 0) {
        my $irs = _extract_irs($mode);
        input_record_separator($glob,$irs);
        return 1 unless $mode;
    }
    return CORE::binmode($glob,$mode);
}

sub _compile_rs {
    my $self = shift;
    my $rs = $self->{rs};

    my $q = eval { my @q = split /(?<=${rs})/,""; 1 };
    if ($q) {
	$self->{rsc} = qr/(?<=${rs})/s;
        if ($rs =~ /\?\^\w*m/) {
            $self->{rsc} = qr/(?<=${rs})/ms;
        }
	$self->{can_use_lookbehind} = 1;
    } else {
	$self->{rsc} = qr/(.*?(?:${rs}))/s;
        if ($rs =~ /\?\^\w*m/) {
            $self->{rsc} = qr/(.*?(?:${rs}))/ms;
        }
	$self->{can_use_lookbehind} = 0;
    }
    return;
}

sub READLINE {
    my $self = shift;
    if (wantarray) {
	local $/ = undef;
	$self->{buffer} .= readline($self->{handle});
	push @{$self->{records}}, $self->_split;
	$self->{buffer} = "";
	my @rec = splice @{$self->{records}};
	if (@rec && $self->{autochomp}) {
	    $self->chomp( @rec );
	}
	return @rec;
    }
    # want scalar
    if (!@{$self->{records}}) {
	$self->_populate_buffer;
    }
    my $rec = shift @{$self->{records}};
    if (defined($rec) && $self->{autochomp}) {
	$self->chomp( $rec );
    }
    return $rec;
}

sub _populate_buffer {
    my $self = shift;
    my $handle = $self->{handle};
    return if !$handle || eof($handle);
    
    my @rec;
    {
	my $buffer = '';
	my $n = read $handle, $buffer, $self->{bufsize};
	$self->{buffer} .= $buffer;
	@rec = $self->_split;
	redo if !eof($handle) && @rec == 1;
    }
    push @{$self->{records}}, @rec;
    $self->{buffer} = '';
    if (eof($handle)) {
	return;
    }

    if (@{$self->{records}} > 1) {
	$self->{buffer} = pop @{$self->{records}};
    }
    return;
}

sub EOF {
    my $self = shift;
    foreach my $rec (@{$self->{records}}, $self->{buffer}) {
	return if length($rec) > 0;
    }
    return eof($self->{handle});
}

sub _split {
    my $self = shift;
    if (!defined $self->{can_use_lookbehind}) {
	$self->_compile_rs;
    }
    my $rs = $self->{rsc};
    my @rec = split $rs, $self->{buffer};
    if ($self->{can_use_lookbehind}) {
	return @rec;
    } else {
	return grep length, @rec;
    }
}

sub CLOSE {
    my $self = shift;
    $self->_clear_buffer;
    my $z = close $self->{handle};
    # delete $self->{handle};
    return $z;
}

sub _clear_buffer {
    my $self = shift;
    $self->{buffer} = '';
    $self->{records} = [];
}

sub OPEN {
    my ($self, $mode, @args) = @_;
    if ($self->{handle}) {
	# close $self->{handle};
    }
    my $z = CORE::open $self->{handle}, $mode, @args;
    if ($z) {
	$self->_clear_buffer;
    }
    return $z;
}

sub FILENO {
    my $self = shift;
    return fileno($self->{handle});
}

sub WRITE {
    my ($self, $buf, $len, $offset) = @_;
    $offset ||= 0;
    if (!defined $len) {
	$len = length($buf)-$offset;
    }
    $self->PRINT( substr($buf,$offset,$len) );
}

sub PRINT {
    my ($self, @msg) = @_;
    if ($self->TELL() != tell($self->{handle})) {
	$self->SEEK(0,1);
    } else {
	$self->_clear_buffer;
    }
    print {$self->{handle}} @msg;
}

sub PRINTF {
    my ($self, $template, @args) = @_;
    $self->PRINT(sprintf($template,@args));
}

sub READ {
    my $self = shift;
    my $bufref = \$_[0];
    my (undef, $len, $offset) = @_;
    my $nread = 0;

    while ($len > 0 && @{$self->{records}}) {
	if (length($self->{records}[0])>=$len) {
	    my $rec = shift @{$self->{records}};
	    my $reclen = length($rec);
	    substr( $$bufref, $offset, $reclen, $rec);
	    $len -= $reclen;
	    $offset += $reclen;
	    $nread += $reclen;
	} else {
	    my $rec = substr($self->{records}[0], 0, $len, "");
	    substr( $$bufref, $offset, $len, $rec);
	    $offset += $len;
	    $nread += $len;
	    $len = 0;
	}
    }
    if ($len > 0 && length($self->{buffer}) > 0) {
	my $reclen = length($self->{buffer});
	if ($reclen >= $len) {
	    my $rec = substr( $self->{buffer}, 0, $len, "" );
	    substr( $$bufref, $offset, $len, $rec );
	    $offset += $len;
	    $nread += $len;
	    $len = 0;
	} else {
	    substr( $$bufref, $offset, $reclen, $self->{buffer} );
	    $self->{buffer} = "";
	    $offset += $reclen;
	    $nread += $reclen;
	    $len -= $reclen;
	}
    }
    if ($len > 0) {
	return $nread + read $self->{handle}, $$bufref, $len, $offset;
    } else {
	return $nread;
    }
}

sub GETC {
    my $self = shift;
    if (@{$self->{records}}==0 && 0 == length($self->{buffer})) {
	$self->_populate_buffer;
    }

    if (@{$self->{records}}) {
	my $c = substr( $self->{records}[0], 0, 1, "" );
	if (0 == length($self->{records}[0])) {
	    shift @{$self->{records}};
	}
	return $c;
    } elsif (0 != length($self->{buffer})) {
	my $c = substr( $self->{buffer}, 0, 1, "" );
	return $c;
    } else {
	# eof?
	return undef;
    }
}

sub BINMODE {
    my $self = shift;
    my $handle = $self->{handle};
    if (@_) {
	CORE::binmode $handle, @_;
    } else {
	CORE::binmode $handle;
    }    
}

sub SEEK {
    my ($self, $pos, $whence) = @_;

    if ($whence == 1) {
	$whence = 0;
	$pos += $self->TELL;
    }

    # easy implementation:
    #     on any seek, clear records, buffer

    $self->_clear_buffer;
    seek $self->{handle}, $pos, $whence;

    # more sophisticated implementation
    #     on a seek forward, remove bytes from the front
    #     of buffered data
}

sub TELL {
    my $self = shift;
    # virtual cursor position is actual position on the file handle
    # minus the length of any buffered data
    my $tell = tell $self->{handle};
    $tell -= length($self->{buffer});
    $tell -= length($_) for @{$self->{records}};
    return $tell;
}


no warnings 'redefine';
sub IO::Handle::input_record_separator {
    my $self = shift;
    if (ref($self) eq 'GLOB' || ref(\$self) eq 'GLOB') {
        if (tied(*$self)) {
            if (ref(tied(*$self)) eq __PACKAGE__) {
                return input_record_separator($self,@_);
            }
            my $z = eval { (tied *$self)->input_record_separator(@_) };
            if ($@) {
                carp "input_record_separator is not supported on tied handle";
            }
            return $z;
        }
        if (!@_) { return $/ }
        $self = tie *$self, __PACKAGE__, $self, quotemeta($/);
        return input_record_separator($self,@_);
    } else {
        carp "input to input_record_separator was not a handle";
        return;
    }
}


sub input_record_separator {
    my $self = shift;
    if (ref($self) eq 'GLOB' || ref(\$self) eq 'GLOB') {
        if (!tied *$self) {
            if (!@_) {
                return IO::Handle::input_record_separator(*$self);
            }
            $self = tie *$self, __PACKAGE__, $self, quotemeta($/);
        } else {
            $self = tied *$self;
        }
    }
    if (@_) {
	$self->{rs} = shift;
	delete $self->{can_use_lookbehind};
    }
    $self->_compile_rs;
    return $self->{rs};
}

sub autochomp {
    my $self = shift;
    if (ref($self) eq 'GLOB' || ref(\$self) eq 'GLOB') {
        if (!tied *$self) {
            if (!@_) {
                return 0;
            }
            $self = tie *$self, __PACKAGE__, $self, quotemeta($/);
        } else {
            $self = tied *$self;
        }
    }
    my $val = $self->{autochomp} || 0;
    if (@_) {
        $self->{autochomp} = 0+!!$_[0];
    }
    return 0+$val;
}

sub chomp {
    my $self = shift;
    my $removed = 0;
    my $rs = $self->{rs};
    foreach my $line (@_) {
	$line =~ s/($rs)$//;
	if (defined($1)) {
	    $removed += length($1);
	}
    }
    return $removed;
}

1; # 

__END__

=head1 NAME

Acme::InputRecordSeparatorIsRegexp - awk doesn't have to be better at something.

=head1 VERSION

Version 0.07

=head1 SYNOPSIS

    use Acme::InputRecordSeparatorIsRegexp;

    # open-then-tie
    open my $fh, '<', 'file-with-Win-Mac-and-Unix-line-endings';
    tie *$fh, 'Acme::IRSRegexp', $fh, '\r\n|\n|\r';
    while (<$fh>) {
        # $_ could have "\r\n", "\n", or "\r" line ending now
    }

    # tie-then-open
    tie *{$fh=Symbol::gensym}, 'Acme::IRSRegExp', qr/\r\n|[\r\n]/;
    open $fh, '<', 'file-with-ambiguous-line-endings';
    $line = <$fh>;

    # import open function and use :irs pseudo-layer
    use Acme::InputRecordSeparatorIsRegexp 'open';
    open my $fh, '<:irs(\r\n|\r|\n)', 'ambiguous.txt';
    $line = <$fh>;

    # import binmode and use :irs pseudo-layer
    use Acme::InputRecordSeparatorIsRegexp 'binmode';
    open my $fh, '<', 'ambiguous.txt';
    binmode $fh, ':irs(\r\n|\r|\n)';
    $line = <$fh>;

=head1 DESCRIPTION

In the section about the L<"input record separator"|perlvar/"$/">,
C<perlvar> famously quips

=over 4

Remember: the value of $/ is a string, not a regex. B<awk>
has to be better for something. :-)

=back

This module provides a mechanism to read records from a file
using a regular expression as a record separator.

A common use case for this module is to read a text file 

lib/Acme/InputRecordSeparatorIsRegexp.pm  view on Meta::CPAN

with the file handle. For regular file handles, this is always
the current value of L<< C<$/>|perlvar/"$INPUT_RECORD_SEPARATOR" >>.

    use Acme::InputRecordSeperatorIsRegexp ':all';

    open my $fh_reg, "<", "some_text_file";
    open my $fh_pkg, "<:irs(\d)", "some_text_file";

    $rs = $fh_reg->input_record_separator;    #   "\n"
    $rs = input_record_separator($fh_reg);    #   "\n"
    $rs = $fh_pkg->input_record_separator;    #   '\d'
    $rs = input_record_separator($fh_pkg);    #   '\d'

With two or more arguments, sets the input record separator for
the file handle as the regular expression indicated by the second
argument (any argument after the second is ignored). For regular
file handles, a side effect is that the file handle will be tied 
to this package

    print ref(tied *$fh_reg);        #   ""
    $fh_reg->input_record_separator(qr/\r\n|\r|\n/);
    print ref(tied *$fh_reg);        #   "Acme::InputRecordSeparatorIsRegexp"

If you are just looking for the functionality of setting different
input record separators on different file handles but don't care about
applying regular expressions to determine line endings, this function
provides an (inefficient) way to do that for arbitrary file handles.
Note that the argument to set the input record separator is treated
as a regular expression, so apply C<quotemeta> to it as necessary.

=head1 METHODS

=head2 chomp

    my $chars_removed = (tied *$fh)->chomp($line_from_fh);
    my $chars_removed = (tied *$fh)->chomp(@lines_from_fh);

Like the builtin L<< C<chomp>|"chomp"/perlvar >> function,
but removes the trailing string from lines that correspond to
the file handle's custom input record separator regular
expression instead of C<$/>. Like the builtin C<chomp>,
returns the total number of characters removed from
all its arguments. See also L<"autochomp">.

=head1 INTERNALS

In unusual circumstances, you may be interested in some of the
internals of the tied file handle object. You can set the values
of these internals by passing additional arguments to the
C<tie> statement or passing a hash reference to this package's 
L<"open"> function, for example:

    my $th = Acme::InputRecordSeparatorIsRegexp->open( $regex, '<', $filename,
    			{ bufsize => 65336 } );

=head2 bufsize

The amount of data, in bytes, to read from the input stream at
a time. For performance reasons, this should be at least a few kilobytes.
B<For the module to work correctly, it should also be much larger
than the length of any sequence of characters that could be construed
as a line ending.>

=head1 ALIAS

The package C<Acme::IRSRegexp> is an alias for
C<Acme::InputRecordSeparatorIsRegexp>, allowing you to write

    use Acme::InputRecordSeparatorIsRegexp;
    tie *$fh, 'Acme::IRSRegexp', 

=head1 AUTHOR

Marty O'Brien, C<< <mob at cpan.org> >>

=head1 BUGS, LIMITATIONS, AND OTHER NOTES

Because this package must often pre-fetch input to determine where
a line-ending is, it is generally not appropriate to apply this
package to C<STDIN> or other terminal-like input.

Changing C<$/> will have no affect on a file handle that has
already been tied to this package.

Calling L<< C<chomp>|"chomp"/perlfunc >> on a return value from this
package will operate with C<$/>, B<not> with the regular expression
associated with the tied file handle. Use the construction
C<< tied(*$fh)->chomp(...) >> to perform the chomp operation on
a filehandle that has customized its input record separator with
this package. Or see the L<< C<autochomp>|"autochomp" >> method
to automatically get chomped input.

Please report any bugs or feature requests to 
C<bug-acme-inputrecordseparatorisregexp at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Acme-InputRecordSeparatorIsRegexp>.  
I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.


=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Acme::InputRecordSeparatorIsRegexp

You can also look for information at:

=over 4

=item * RT: CPAN's request tracker (report bugs here)

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Acme-InputRecordSeparatorIsRegexp>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Acme-InputRecordSeparatorIsRegexp>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Acme-InputRecordSeparatorIsRegexp>

lib/Acme/InputRecordSeparatorIsRegexp.pm  view on Meta::CPAN


Any use, modification, and distribution of the Standard or Modified
Versions is governed by this Artistic License. By using, modifying or
distributing the Package, you accept this license. Do not use, modify,
or distribute the Package, if you do not accept this license.

If your Modified Version has been derived from a Modified Version made
by someone other than you, you are nevertheless required to ensure that
your Modified Version complies with the requirements of this license.

This license does not grant you the right to use any trademark, service
mark, tradename, or logo of the Copyright Holder.

This license includes the non-exclusive, worldwide, free-of-charge
patent license to make, have made, use, offer to sell, sell, import and
otherwise transfer the Package with respect to any patent claims
licensable by the Copyright Holder that are necessarily infringed by the
Package. If you institute patent litigation (including a cross-claim or
counterclaim) against any party alleging that the Package constitutes
direct or contributory patent infringement, then this Artistic License
to you shall terminate on the date that such litigation is filed.

Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


=cut

tied object members

    handle           => *
    buffered_records => []
    buffer           => $
    rs               => $
    bufsize          => $
  X maxrecsize       => $

    autochomp        => bool

private methods

    _populate_buffer
    _split


test plan:

    read methods
    ------------
    READLINE
        scalar context
        list context
    READ
        with and without offset
        length < = > buffer allocated
        before and after populate buffer (i.e., after scalar READLINE)
    GETC
        with and without populated buffer (after SCALAR READLINE)
        at eof
    EOF
        with buffer populated

    SEEK & TELL
        move around, keep reading

    write methods
    -------------
    PRINT, PRINTF, WRITE (syswrite)

    misc methods
    ------------
    FILENO
    BINMODE
    OPEN

    data sources
    ------------
    regular file in < mode
    piped input? socket?
    regular file in <+ mode
    regular file in >>+ mode
    DATA filehandle
    in memory filehandle
    mock handle already tied to something else?

test data:
    data with random, different line endings (\n, \r, \r\n)

    random capital letters, split on ..., I dunno, [A-Z][XY]

    join a sequence of integers, split on 120|12|345|
    join a sequence of integers, split on 12|120|345|

TODO

    test, doc, release can_use_lookbehind



( run in 2.294 seconds using v1.01-cache-2.11-cpan-524268b4103 )