# File-LoadLines


README.md

![Language Perl](https://img.shields.io/badge/Language-Perl-blue)

File::LoadLines provides an easy way to load the contents of a 
disk file or network resource into your program.

It can deliver the contents untouched (as a blob), but its most
useful purpose is to deliver the contents of a text file as an array
of lines. Hence the name, File::LoadLines.

It automatically handles ASCII, Latin-1 and UTF-8 text.
When the file has a BOM, it handles UTF-8, UTF-16 LE and BE, and
UTF-32 LE and BE.

Recognized line terminators are NL (Unix, Linux), CRLF (DOS, Windows)
and CR (classic Mac OS).
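A minimal usage sketch (the file name `sample.txt` is made up; the snippet writes its own sample file so it is self-contained):

```perl
use strict;
use warnings;
use File::LoadLines;

# Create a small sample file with DOS (CRLF) line endings.
my $fn = "sample.txt";
open( my $fh, ">:raw", $fn ) or die("$fn: $!\n");
print $fh "first line\r\nsecond line\r\n";
close($fh);

# Encoding and line terminators are detected automatically;
# the terminators are stripped from the resulting lines.
my @lines = loadlines($fn);
print scalar(@lines), " lines\n";    # 2 lines

unlink($fn);
```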

## SUPPORT AND DOCUMENTATION

Development of this module takes place on GitHub:
https://github.com/sciurius/perl-File-LoadLines.

lib/File/LoadLines.pm

=head1 DESCRIPTION

File::LoadLines provides an easy way to load the contents of a text
file into an array of lines. It is intended for small to moderate size files
like config files that are often produced by weird tools (and users).

It will transparently fetch data from the network if the provided file
name is a URL.

File::LoadLines automatically handles ASCII, Latin-1 and UTF-8 text.
When the file has a BOM, it handles UTF-8, UTF-16 LE and BE, and
UTF-32 LE and BE.

Recognized line terminators are NL (Unix, Linux), CRLF (DOS, Windows)
and CR (classic Mac OS).

The function loadblob(), exported on demand, fetches the content and
returns it without further processing, similar to File::Slurp and its ilk.
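For example (loadblob() must be named in the import list; the file name is made up, and the snippet writes its own data so it can run stand-alone):

```perl
use strict;
use warnings;
use File::LoadLines qw(loadlines loadblob);

# Write a few raw bytes, including a CRLF that must survive untouched.
my $fn = "raw.bin";
open( my $fh, ">:raw", $fn ) or die("$fn: $!\n");
print $fh "\x00\x01\r\n\x02";
close($fh);

# loadblob() returns the exact bytes: no decoding, no line splitting.
my $blob = loadblob($fn);
printf "%d bytes\n", length($blob);    # 5 bytes

unlink($fn);
```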

=head1 EXPORT

lib/File/LoadLines.pm

	# Do not touch.
	$options->{encoding} = 'Blob';
    }
    elsif ( $encoded ) {
	# Nothing to do, already dealt with.
	$options->{encoding} //= 'Perl';
    }

    # Detect Byte Order Mark.
    elsif ( $data =~ /^\xEF\xBB\xBF/ ) {
	warn("$name is UTF-8 (BOM)\n") if $options->{debug};
	$options->{encoding} = 'UTF-8';
	$data = decode( "UTF-8", substr($data, 3) );
    }
    elsif ( $data =~ /^\xFE\xFF/ ) {
	warn("$name is UTF-16BE (BOM)\n") if $options->{debug};
	$options->{encoding} = 'UTF-16BE';
	$data = decode( "UTF-16BE", substr($data, 2) );
    }
    elsif ( $data =~ /^\xFF\xFE\x00\x00/ ) {
	warn("$name is UTF-32LE (BOM)\n") if $options->{debug};
	$options->{encoding} = 'UTF-32LE';
	$data = decode( "UTF-32LE", substr($data, 4) );
    }
    elsif ( $data =~ /^\xFF\xFE/ ) {
	warn("$name is UTF-16LE (BOM)\n") if $options->{debug};
	$options->{encoding} = 'UTF-16LE';
	$data = decode( "UTF-16LE", substr($data, 2) );
    }
    elsif ( $data =~ /^\x00\x00\xFE\xFF/ ) {
	warn("$name is UTF-32BE (BOM)\n") if $options->{debug};
	$options->{encoding} = 'UTF-32BE';
	$data = decode( "UTF-32BE", substr($data, 4) );
    }

    # No BOM, did user specify an encoding?
    elsif ( $options->{encoding} ) {
	warn("$name is ", $options->{encoding}, " (fallback)\n")
	  if $options->{debug};
	$data = decode( $options->{encoding}, $data, 1 );
    }

    # Try UTF-8, fall back to ISO-8859-1.
    else {
	my $d = eval { decode( "UTF-8", $data, 1 ) };
	if ( $@ ) {

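When the data carries no BOM, the encoding option supplies the decoding to use instead of the UTF-8 autodetection. A sketch (the file name is hypothetical; the snippet creates its own test file):

```perl
use strict;
use warnings;
use File::LoadLines;

# A Latin-1 file: the byte 0xE9 is "é" in ISO-8859-1, but is not
# valid UTF-8 on its own.
my $fn = "latin1.txt";
open( my $fh, ">:raw", $fn ) or die("$fn: $!\n");
print $fh "caf\xE9\n";
close($fh);

my @lines = loadlines( $fn, { encoding => "ISO-8859-1" } );
print length($lines[0]), " characters\n";    # 4 characters

unlink($fn);
```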
lib/File/LoadLines.pm

    croak("Missing filename.\n") unless defined $filename;
    croak("Invalid options.\n")
      if defined($options) && ref($options) ne "HASH";
    $options //= {};
    $options->{blob} = 1;
    loadlines( $filename, $options );
}
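As the code above shows, loadblob() is a thin wrapper: it sets the blob option and tail-calls loadlines(), so the two calls below return identical raw bytes (the file name is made up for the sketch):

```perl
use strict;
use warnings;
use File::LoadLines qw(loadlines loadblob);

my $fn = "blob.dat";
open( my $fh, ">:raw", $fn ) or die("$fn: $!\n");
print $fh "\xEF\xBB\xBFhello\n";    # UTF-8 BOM plus some text
close($fh);

# Both calls return the untouched bytes, BOM included.
my $b1 = loadblob($fn);
my $b2 = loadlines( $fn, { blob => 1 } );
print $b1 eq $b2 ? "same\n" : "different\n";

unlink($fn);
```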

=head1 SEE ALSO

There are currently no other modules that handle BOM detection and
line splitting.

I have a faint hope that future versions of Perl and Raku will deal
with this transparently, but I fear the worst.

=head1 HINTS

When you have raw file data (e.g. a member extracted from a zip
archive), you can still use loadlines() to decode it and split it
into lines:

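A minimal sketch, assuming (as the module's documentation describes) that loadlines() also accepts a reference to a string holding the data:

```perl
use strict;
use warnings;
use File::LoadLines;

# Raw bytes already in memory, e.g. extracted from a zip member.
# "caf\xC3\xA9" is the UTF-8 encoding of "café".
my $raw = "caf\xC3\xA9\r\nbar\r\n";

# Pass a reference to the data instead of a file name.
my @lines = loadlines( \$raw );
print scalar(@lines), " lines\n";    # 2 lines
```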
t/10-basic.t

	$line++;
	$tally++ if /€urø/;
    }
    is( $tally, 4, "matches" );
}

# test1.dat: UTF-8 Unicode text
testlines("test1.dat");
# test2.dat: UTF-8 Unicode text, with CRLF line terminators
testlines("test2.dat");
# test3.dat: UTF-8 Unicode (with BOM) text
testlines("test3.dat");
# test4.dat: UTF-8 Unicode (with BOM) text, with CRLF line terminators
testlines("test4.dat");
# test5.dat: Little-endian UTF-16 Unicode text
testlines("test5.dat");
# test6.dat: Little-endian UTF-16 Unicode text, with CRLF, CR line terminators
testlines("test6.dat");
# test7.dat: UTF-8 Unicode text, with CR line terminators
testlines("test7.dat");
# test8.dat: UTF-8 Unicode (with BOM) text, with CR line terminators
testlines("test8.dat");
# test9.dat: Little-endian UTF-16 Unicode text, with CR line terminators
testlines("test9.dat");

t/11-basic.t

	$line++;
	$tally++ if /€urø/;
    }
    is( $tally, 4, "matches" );
}

# test1.dat: UTF-8 Unicode text
testlines("test1.dat");
# test2.dat: UTF-8 Unicode text, with CRLF line terminators
testlines("test2.dat");
# test3.dat: UTF-8 Unicode (with BOM) text
testlines("test3.dat");
# test4.dat: UTF-8 Unicode (with BOM) text, with CRLF line terminators
testlines("test4.dat");
# test5.dat: Little-endian UTF-16 Unicode text
testlines("test5.dat");
# test6.dat: Little-endian UTF-16 Unicode text, with CRLF, CR line terminators
testlines("test6.dat");
# test7.dat: UTF-8 Unicode text, with CR line terminators
testlines("test7.dat");
# test8.dat: UTF-8 Unicode (with BOM) text, with CR line terminators
testlines("test8.dat");
# test9.dat: Little-endian UTF-16 Unicode text, with CR line terminators
testlines("test9.dat");

t/13-nochomp.t

	is( length($_), $delta+$lengths[$line], "line $line" );
	$line++;
	$tally++ if /€urø/;
    }
    is( $tally, 4, "matches" );
}

$delta = 1;
# test1.dat: UTF-8 Unicode text
testlines("test1.dat");
# test3.dat: UTF-8 Unicode (with BOM) text
testlines("test3.dat");
# test5.dat: Little-endian UTF-16 Unicode text
testlines("test5.dat");
# test7.dat: UTF-8 Unicode text, with CR line terminators
testlines("test7.dat");
# test8.dat: UTF-8 Unicode (with BOM) text, with CR line terminators
testlines("test8.dat");
# test9.dat: Little-endian UTF-16 Unicode text, with CR line terminators
testlines("test9.dat");

$delta = 2;
# test2.dat: UTF-8 Unicode text, with CRLF line terminators
testlines("test2.dat");
# test4.dat: UTF-8 Unicode (with BOM) text, with CRLF line terminators
testlines("test4.dat");
# test6.dat: Little-endian UTF-16 Unicode text, with CRLF, CR line terminators
testlines("test6.dat");

t/28-encoding.t


# Reference data.
my @data = ( "{title: Swing Low Sweet Chariot}", "{subtitle: Sub Títlë}" );

mkdir("out") unless -d "out";

# Recode to UTF-8.
my $data = join("\n", @data) . "\n";
$data = encode("UTF-8", $data);

my @BOMs = qw( UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE );
my @noBOMs = qw( ISO-8859-1 UTF-8 );

my %enc2bom = map { $_ => encode($_, "\x{feff}") } @BOMs;

enctest( $_, 1 ) for @noBOMs;
enctest($_) for @BOMs;

done_testing( 4 * 3 * (@noBOMs + @BOMs) );

sub enctest {
    my ( $enc, $nobom ) = @_;
    my $encoded = $data;
    _enctest( $encoded, $enc, $nobom );
    $encoded = $data;
    $encoded =~ s/\n/\x0a/g;
    _enctest( $encoded, $enc, $nobom, "LF" );
    $encoded = $data;
    $encoded =~ s/\n/\x0d/g;

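The %enc2bom map above exploits the fact that a BOM is nothing more than the encoded form of U+FEFF; Encode (a core module) produces the familiar byte sequences:

```perl
use strict;
use warnings;
use Encode qw(encode);

# A BOM is simply U+FEFF run through the encoder.
my %enc2bom = map { $_ => encode( $_, "\x{feff}" ) }
    qw( UTF-8 UTF-16BE UTF-16LE );

# The UTF-8 BOM is the three bytes EF BB BF.
print join( " ", map { sprintf "%02X", ord }
    split //, $enc2bom{"UTF-8"} ), "\n";    # EF BB BF
```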
t/28-encoding.t

    from_to( $encoded, "UTF-8", $enc );
    unless ( $nobom ) {
	BAIL_OUT("Unknown encoding: $enc") unless $enc2bom{$enc};
	$encoded = $enc2bom{$enc} . $encoded;
    }

    my $fn = "out/$enc.cho";
    open( my $fh, ">:raw", $fn ) or die("$fn: $!\n");
    print $fh $encoded;
    close($fh);
    $enc .= " (no BOM)" if $nobom;
    $enc .= " ($crlf)" if $crlf;

    my $opts = { fail => "soft" };
    my @d = loadlines( $fn, $opts );
    note("$fn: " . $opts->{error} ) unless @d;
    ok( scalar( @d ) == 2, "$enc: Two lines" );
    is( $d[0], $data[0], "$enc: Line 1" );
    is( $d[1], $data[1], "$enc: Line 2" );

    unlink($fn);


