Acme-InputRecordSeparatorIsRegexp
view release on metacpan or search on metacpan
lib/Acme/InputRecordSeparatorIsRegexp.pm view on Meta::CPAN
package Acme::InputRecordSeparatorIsRegexp;
use 5.006;
use strict;
use warnings FATAL => 'all';
use Symbol;
use Carp;
use IO::Handle;
require Exporter;
our @ISA = 'Exporter';
our @EXPORT_OK = ('open','autochomp','input_record_separator','binmode');
our %EXPORT_TAGS = ( all => [ @EXPORT_OK ] );
BEGIN {
no strict 'refs';
*{ 'Acme::IRSRegexp' . "::" } = \*{ __PACKAGE__ . "::" };
}
our $VERSION = '0.07';
sub TIEHANDLE {
my ($pkg, @opts) = @_;
my $handle;
if (@opts % 2) {
$handle = Symbol::gensym;
} else {
my $fh = *{shift @opts};
# will fail if open for $fh failed, but that's not important
eval { CORE::open $handle, '<&+', $fh };
}
my $rs = shift @opts;
my %opts = @opts;
$opts{maxrecsize} ||= ($opts{bufsize} || 16384) / 4;
$opts{bufsize} ||= $opts{maxrecsize} * 4;
my $self = bless {
%opts,
handle => $handle,
rs => $rs,
records => [],
buffer => ''
}, $pkg;
$self->_compile_rs;
return $self;
}
# We abuse the PerlIO layers syntax to attach
# a regexp specification to a filehandle. This
# function extracts an ':irs(REGEXP)' layer from
# a string.
sub _extract_irs {
my ($mode) = @_;
my $irs = "";
my $p0 = index($mode,":irs(");
my $p1 = $p0 + 5;
my $nest = 1;
while ($nest) {
my $c = eval { substr($mode,$p1++,1) };
if ($@ || !defined($c)) {
carp "Argument list not closed for PerlIO layer \"$irs\"";
return;
}
if ($c eq "\\") {
$c .= substr($mode,$p1++,1);
}
if ($c eq "(") { $nest++ }
if ($c eq ")") { $nest-- }
if ($nest) { $irs .= $c; }
}
substr($mode,$p0,length($irs)+6, "");
$_[0] = $mode;
return $irs;
}
sub open (*;$@) {
no strict 'refs'; # or else bareword file handles will break
my (undef,$mode,$expr,@list) = @_;
if (!defined $_[0]) {
$_[0] = Symbol::gensym;
}
my $glob = $_[0];
if (!ref($glob) && $glob !~ /::/) {
$glob = join("::",caller(0) || "", $glob);
}
if ($mode && index($mode,":irs(") >= 0) {
my $irs = _extract_irs($mode);
my $z = @list ? CORE::open *$glob, $mode, $expr, @list
: CORE::open *$glob, $mode, $expr;
tie *$glob, __PACKAGE__, *$glob, $irs;
return $z;
}
if (@list) {
return CORE::open(*$glob,$mode,$expr,@list);
} elsif ($expr) {
return CORE::open(*$glob,$mode,$expr);
} elsif ($mode) {
return CORE::open(*$glob,$mode);
} else {
return CORE::open(*$glob);
}
}
sub binmode (*;$) {
my ($glob,$mode) = @_;
$mode ||= ":raw";
if (index($mode,":irs(") >= 0) {
my $irs = _extract_irs($mode);
input_record_separator($glob,$irs);
return 1 unless $mode;
}
return CORE::binmode($glob,$mode);
}
sub _compile_rs {
my $self = shift;
my $rs = $self->{rs};
my $q = eval { my @q = split /(?<=${rs})/,""; 1 };
if ($q) {
$self->{rsc} = qr/(?<=${rs})/s;
if ($rs =~ /\?\^\w*m/) {
$self->{rsc} = qr/(?<=${rs})/ms;
}
$self->{can_use_lookbehind} = 1;
} else {
$self->{rsc} = qr/(.*?(?:${rs}))/s;
if ($rs =~ /\?\^\w*m/) {
$self->{rsc} = qr/(.*?(?:${rs}))/ms;
}
$self->{can_use_lookbehind} = 0;
}
return;
}
sub READLINE {
my $self = shift;
if (wantarray) {
local $/ = undef;
$self->{buffer} .= readline($self->{handle});
push @{$self->{records}}, $self->_split;
$self->{buffer} = "";
my @rec = splice @{$self->{records}};
if (@rec && $self->{autochomp}) {
$self->chomp( @rec );
}
return @rec;
}
# want scalar
if (!@{$self->{records}}) {
$self->_populate_buffer;
}
my $rec = shift @{$self->{records}};
if (defined($rec) && $self->{autochomp}) {
$self->chomp( $rec );
}
return $rec;
}
sub _populate_buffer {
my $self = shift;
my $handle = $self->{handle};
return if !$handle || eof($handle);
my @rec;
{
my $buffer = '';
my $n = read $handle, $buffer, $self->{bufsize};
$self->{buffer} .= $buffer;
@rec = $self->_split;
redo if !eof($handle) && @rec == 1;
}
lib/Acme/InputRecordSeparatorIsRegexp.pm view on Meta::CPAN
substr( $$bufref, $offset, $reclen, $rec);
$len -= $reclen;
$offset += $reclen;
$nread += $reclen;
} else {
my $rec = substr($self->{records}[0], 0, $len, "");
substr( $$bufref, $offset, $len, $rec);
$offset += $len;
$nread += $len;
$len = 0;
}
}
if ($len > 0 && length($self->{buffer}) > 0) {
my $reclen = length($self->{buffer});
if ($reclen >= $len) {
my $rec = substr( $self->{buffer}, 0, $len, "" );
substr( $$bufref, $offset, $len, $rec );
$offset += $len;
$nread += $len;
$len = 0;
} else {
substr( $$bufref, $offset, $reclen, $self->{buffer} );
$self->{buffer} = "";
$offset += $reclen;
$nread += $reclen;
$len -= $reclen;
}
}
if ($len > 0) {
return $nread + read $self->{handle}, $$bufref, $len, $offset;
} else {
return $nread;
}
}
sub GETC {
my $self = shift;
if (@{$self->{records}}==0 && 0 == length($self->{buffer})) {
$self->_populate_buffer;
}
if (@{$self->{records}}) {
my $c = substr( $self->{records}[0], 0, 1, "" );
if (0 == length($self->{records}[0])) {
shift @{$self->{records}};
}
return $c;
} elsif (0 != length($self->{buffer})) {
my $c = substr( $self->{buffer}, 0, 1, "" );
return $c;
} else {
# eof?
return undef;
}
}
sub BINMODE {
my $self = shift;
my $handle = $self->{handle};
if (@_) {
CORE::binmode $handle, @_;
} else {
CORE::binmode $handle;
}
}
sub SEEK {
my ($self, $pos, $whence) = @_;
if ($whence == 1) {
$whence = 0;
$pos += $self->TELL;
}
# easy implementation:
# on any seek, clear records, buffer
$self->_clear_buffer;
seek $self->{handle}, $pos, $whence;
# more sophisticated implementation
# on a seek forward, remove bytes from the front
# of buffered data
}
sub TELL {
my $self = shift;
# virtual cursor position is actual position on the file handle
# minus the length of any buffered data
my $tell = tell $self->{handle};
$tell -= length($self->{buffer});
$tell -= length($_) for @{$self->{records}};
return $tell;
}
no warnings 'redefine';
sub IO::Handle::input_record_separator {
my $self = shift;
if (ref($self) eq 'GLOB' || ref(\$self) eq 'GLOB') {
if (tied(*$self)) {
if (ref(tied(*$self)) eq __PACKAGE__) {
return input_record_separator($self,@_);
}
my $z = eval { (tied *$self)->input_record_separator(@_) };
if ($@) {
carp "input_record_separator is not supported on tied handle";
}
return $z;
}
if (!@_) { return $/ }
$self = tie *$self, __PACKAGE__, $self, quotemeta($/);
return input_record_separator($self,@_);
} else {
carp "input to input_record_separator was not a handle";
return;
}
}
sub input_record_separator {
my $self = shift;
if (ref($self) eq 'GLOB' || ref(\$self) eq 'GLOB') {
lib/Acme/InputRecordSeparatorIsRegexp.pm view on Meta::CPAN
return 0;
}
$self = tie *$self, __PACKAGE__, $self, quotemeta($/);
} else {
$self = tied *$self;
}
}
my $val = $self->{autochomp} || 0;
if (@_) {
$self->{autochomp} = 0+!!$_[0];
}
return 0+$val;
}
sub chomp {
my $self = shift;
my $removed = 0;
my $rs = $self->{rs};
foreach my $line (@_) {
$line =~ s/($rs)$//;
if (defined($1)) {
$removed += length($1);
}
}
return $removed;
}
1; #
__END__
=head1 NAME
Acme::InputRecordSeparatorIsRegexp - awk doesn't have to be better at something.
=head1 VERSION
Version 0.07
=head1 SYNOPSIS
use Acme::InputRecordSeparatorIsRegexp;
# open-then-tie
open my $fh, '<', 'file-with-Win-Mac-and-Unix-line-endings';
tie *$fh, 'Acme::IRSRegexp', $fh, '\r\n|\n|\r';
while (<$fh>) {
# $_ could have "\r\n", "\n", or "\r" line ending now
}
# tie-then-open
tie *{$fh=Symbol::gensym}, 'Acme::IRSRegExp', qr/\r\n|[\r\n]/;
open $fh, '<', 'file-with-ambiguous-line-endings';
$line = <$fh>;
# import open function and use :irs pseudo-layer
use Acme::InputRecordSeparatorIsRegexp 'open';
open my $fh, '<:irs(\r\n|\r|\n)', 'ambiguous.txt';
$line = <$fh>;
# import binmode and use :irs pseudo-layer
use Acme::InputRecordSeparatorIsRegexp 'binmode';
open my $fh, '<', 'ambiguous.txt';
binmode $fh, ':irs(\r\n|\r|\n)';
$line = <$fh>;
=head1 DESCRIPTION
In the section about the L<"input record separator"|perlvar/"$/">,
C<perlvar> famously quips
=over 4
Remember: the value of $/ is a string, not a regex. B<awk>
has to be better for something. :-)
=back
This module provides a mechanism to read records from a file
using a regular expression as a record separator.
A common use case for this module is to read a text file
that you don't know whether it uses Unix (C<\n>),
Windows/DOS (C<\r\n>), or Mac (C<\r>) style line-endings,
or even if it might contain all three. To properly parse
this file, you could tie its file handle to this package with
the appropriate regular expression:
my $fh = Symbol::gensym;
tie *$fh, 'Acme::InputRecordSeparatorIsRegexp', '\r\n|\r|\n';
open $fh, '<', 'file-with-ambiguous-line-endings';
@lines = <$fh>;
# or
while (my $line = <$fh>) { ... }
The lines produced by the C<< <$fh> >> expression, like the
builtin C<readline> function and operator, include the record
separator at the end of the line, so the lines returned may end
in C<\r\n>, C<\r>, or C<\n>.
Another use case is files that contain multiple types of records
where a different sequence of characters is used to denote the
end of different types of records.
=head1 tie STATEMENT
A typical use of this package might look like
my $fh = Symbol::gensym;
tie *$fh, 'Acme::InputRecordSeparatorIsRegexp', $record_sep_regex;
open $fh, '<', $filename;
where C<$record_sep_regexp> is a string or a C<Regexp> object
(specified with the
L<< C<qr/.../>|"Quote and quote-like operators"/perlop >> notation)
containing the regular expression
you want to use for a file's line endings. Also see the convenience
method L<"open"> for an alternate way to obtain a file handle with
the features of this package.
=head1 FUNCTIONS
=head2 open
Another way of using this package to attach a regular expression
to the input record separator of a file handle, available since
v0.04, is to import this package's C<open> function and to
specify an C<:irs(...)> I<pseudo-layer>.
use Acme::InputRecordSeparatorIsRegexp 'open';
$result = open FILEHANDLE, "<:irs(REGEXP)", EXPR
$result = open FILEHANDLE, "<:irs(REGEXP)", EXPR, LIST
$result = open FILEHANDLE, "<:irs(REGEXP)", REFERENCE
$result = open my $fh, "<:irs(\r|\n|\r\n)", "ambiguous-line-endings.txt"
The C<:irs(...)> layer may be combined with other layers.
open my $fh, "<:encoding(UTF-16):irs(\R)", "ambiguous.txt"
See also: L<"binmode">
=head2 autochomp
Returns the current setting, or sets the C<autochomp> attribute
of a file handle associated with this package. When the
C<autochomp> attribute of the file handle is enabled, any lines
read from the file handle through the C<readline> function
or C<< <> >> operator will be returned with the (custom) line
endings automatically removed.
use Acme::InputRecordSeparatorIsRegexp 'open','autochomp';
open my $fh, '<:irs(\R)', 'ambiguous.txt';
autochomp($fh,1); # enable autochomp
my $is_autochomped = autochomp($fh);
autochomp(tied(*$fh), 0); # disable
This function can also be called as a method on the I<tied>
file handle.
(tied *$fh)->autochomp(1); # enable
$fh->autochomp(0); # not OK, must use tied handle
Enabling C<autochomp> with this function on a regular file handle
will tie the file handle into this package using the current
value of C<$/> as the handle's record separator. If you are
just looking for autochomp functionality and don't care about
applying regular expressions to determine line endings, this
function provides an (inefficient) way to do that to
arbitrary file handles.
The default attribute value is false.
=head2 binmode FILEHANDLE, LAYER
Overrides Perl's builtin L<binmode|perlfunc/"binmode"> function.
If the I<pseudo-layer> C<:irs(...)> is specified, then apply the
given regular expression as the dynamic input record separator for
the given filehandle.
Any other layers specified are passed to Perl's builtin C<binmode>
function.
=head2 input_record_separator
Returns the current setting, or changes the setting, of a file handle's
input record separator, I<including file handles that have not
already been tied to this package>. This overcomes a limitation
in L<IO::Handle::input_record_separator|IO::Handle/"METHODS">
where input record separators are not supported on a per-filehandle
basis.
With no arguments, returns the input record separator associated
with the file handle. For regular file handles, this is always
the current value of L<< C<$/>|perlvar/"$INPUT_RECORD_SEPARATOR" >>.
use Acme::InputRecordSeperatorIsRegexp ':all';
open my $fh_reg, "<", "some_text_file";
open my $fh_pkg, "<:irs(\d)", "some_text_file";
$rs = $fh_reg->input_record_separator; # "\n"
$rs = input_record_separator($fh_reg); # "\n"
$rs = $fh_pkg->input_record_separator; # '\d'
$rs = input_record_separator($fh_pkg); # '\d'
With two or more arguments, sets the input record separator for
the file handle as the regular expression indicated by the second
argument (any argument after the second is ignored). For regular
file handles, a side effect is that the file handle will be tied
to this package
print ref(tied *$fh_reg); # ""
$fh_reg->input_record_separator(qr/\r\n|\r|\n/);
print ref(tied *$fh_reg); # "Acme::InputRecordSeparatorIsRegexp"
If you are just looking for the functionality of setting different
input record separators on different file handles but don't care about
applying regular expressions to determine line endings, this function
provides an (inefficient) way to do that for arbitrary file handles.
Note that the argument to set the input record separator is treated
as a regular expression, so apply C<quotemeta> to it as necessary.
=head1 METHODS
=head2 chomp
my $chars_removed = (tied *$fh)->chomp($line_from_fh);
my $chars_removed = (tied *$fh)->chomp(@lines_from_fh);
Like the builtin L<< C<chomp>|"chomp"/perlvar >> function,
but removes the trailing string from lines that correspond to
the file handle's custom input record separator regular
expression instead of C<$/>. Like the builtin C<chomp>,
returns the total number of characters removed from
all its arguments. See also L<"autochomp">.
=head1 INTERNALS
In unusual circumstances, you may be interested in some of the
( run in 0.500 second using v1.01-cache-2.11-cpan-39bf76dae61 )