BOM results from the CPAN

Badger

    version => 0.01,
    base    => 'Badger::Codec::Encode';

use Encode qw();
use bytes;

# Default encoding, used by encode() when the caller does not pass an
# explicit encoding name.
our $ENCODING = 'UTF-8';

# Byte Order Markers for different UTF encodings.
#
# This is deliberately a flat array reference of (encoding name, BOM bytes)
# pairs rather than a hash: decode() walks it two elements at a time and
# stops at the first BOM that prefixes the data, so the order of the pairs
# is significant.  In particular the UTF-32LE BOM (ff fe 00 00) begins with
# the same two bytes as the UTF-16LE BOM (ff fe), so the 32-bit entries
# must be listed before the 16-bit ones.
our $UTFBOMS = [
    'UTF-8'    => "\x{ef}\x{bb}\x{bf}",
    'UTF-32BE' => "\x{0}\x{0}\x{fe}\x{ff}",
    'UTF-32LE' => "\x{ff}\x{fe}\x{0}\x{0}",
    'UTF-16BE' => "\x{fe}\x{ff}",
    'UTF-16LE' => "\x{ff}\x{fe}",
];

sub encode {
    my ($self, $enc, $data) = @_ == 3 ? @_ : (shift, $ENCODING, shift);
    Encode::encode($enc, $data);

lib/Badger/Codec/Unicode.pm view on Meta::CPAN


sub decode {
    my $self = shift;
    if (@_ >= 2) {
        goto &Encode::decode;       # not a real GOTO - more like a magic
    }                               # subroutine call - see perldoc -f goto
    else {
        my $data  = shift;
        my $count = 0;
        
        # try all the BOMs in order looking for one (order is important:
        # the UTF-32LE BOM starts with the same bytes as the UTF-16LE BOM)
        while ($count < @$UTFBOMS) {
            my $enc = $UTFBOMS->[$count++];
            my $bom = $UTFBOMS->[$count++];
        
            # does the string start with the bom?
            if ($bom eq substr($data, 0, length($bom))) {
                # decode it and hand it back
#                return Encode::decode($enc, $data);
                return Encode::decode($enc, substr($data, length($bom)), 1);
            }
        }
        return $data;
    }

lib/Badger/Codec/Unicode.pm view on Meta::CPAN

    $utf8 = $codec->encode($data);

=head2 decode($encoding, $data)

Method for decoding Unicode data.  If two arguments are provided then 
the first is the encoding and the second the data to decode.

    $decoded = $codec->decode( utf8 => $encoded );

If one argument is provided then the method will look for a Byte Order
Mark (BOM) to determine the encoding.  If a BOM isn't present, or if the
BOM doesn't match a supported Unicode BOM (any of C<UTF-8>, C<UTF-32BE>,
C<UTF-32LE>, C<UTF-16BE> or C<UTF-16LE>), then the data will not be
decoded as Unicode and is returned unchanged.

    $decoded = $codec->decode($encoded);    # use BOM to detect encoding

=head2 encoder()

This method returns a subroutine reference which can be called to encode
Unicode data.  Internally it calls the L<encode()> method.

    my $encoder = $codec->encoder;
    $encoded = $encoder->($data);

=head2 decoder()

t/filesystem/encoding.t view on Meta::CPAN

use Badger
    Filesystem => 'Bin',
    Codecs     => [ codec => 'utf8' ];

# Locate the 'testfiles' directory relative to Bin; must_exist() is
# presumably fatal if the directory is missing -- TODO confirm.
my $dir = Bin->dir('testfiles')->must_exist;


# This is 'moose...' as a character string, with a slashed 'o' (U+00F8) in
# place of each 'o' and the '...' as a single ellipsis character (U+2026).
my $moose = "m\x{f8}\x{f8}se\x{2026}";

# This is the same data as UTF-8 encoded bytes.  The first (commented-out)
# variant is prefixed with the UTF-8 BOM (ef bb bf); the active $data below
# has no BOM.
#my $data  = "\x{ef}\x{bb}\x{bf}m\x{c3}\x{b8}\x{c3}\x{b8}se\x{e2}\x{80}\x{a6}";
my $data  = "m\x{c3}\x{b8}\x{c3}\x{b8}se\x{e2}\x{80}\x{a6}";

# write data to file - we do this as raw so that the bytes (including a
# BOM, when one is present) are passed through as-is
my $file = $dir->file('utf8_data');
$file->raw->print($data);

# read the text back in through the file's utf8 accessor
my $text = $file->utf8->text;

# the text read back should carry Perl's internal UTF-8 flag
ok( utf8::is_utf8($text), 'text is utf8' );

# compare via reasciify() (defined elsewhere in this test file) so that the
# decoded text and the reference string are checked in the same form
is( reasciify($text), reasciify($moose), 'data is unchanged' );

( run in 0.568 second using v1.01-cache-2.11-cpan-e9daa2b36ef )