BOM results from the CPAN

Serge

# Splits a string into a list of lines, trying to keep every line within
# $length characters.
#
# Text is first split after each line break (a real newline or the
# two-character sequence backslash + 'n'), and each piece is wrapped
# separately. Within a piece, breaks are only made between chunks:
# runs of whitespace, hyphenated word prefixes ("well-" in "well-known")
# and runs of two or more dashes between words.
#
# Parameters:
#   $s      - the text to wrap
#   $length - the desired maximum line length; must be greater than zero
#
# Returns the list of lines. Whitespace is kept at the end of the line it
# follows. A chunk that is longer than $length on its own is not broken up;
# it is returned as an over-long line. Empty lines are never returned.
#
# Dies if $length is not greater than zero.
sub wrap {
    my ($s, $length) = @_;
    die "length should be a positive integer" unless $length > 0;

    # Wrap by '\n' explicitly: cut after the first line break and wrap the
    # two parts independently

    if ($s =~ m{^(.*?(?:\\n|\n))(.+)$}s) {
        # The captures are copied into lexicals before recursing
        # (if $1 and $2 are used directly, this won't work)
        my ($head, $tail) = ($1, $2);
        return wrap($head, $length), wrap($tail, $length);
    }

    # The following regexp was taken from the Translate Toolkit, file textwrap.py.
    # The pattern is captured as a whole, so the separators themselves are
    # returned as chunks and no text is lost.

    my @chunks = split(/(\s+|[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))/, $s);

    my @lines;
    my $line = '';
    for my $chunk (@chunks) {

        # Treat whitespace chunks as zero-width to avoid starting the line with whitespace

        my $chunk_length = ($chunk =~ m/^\s*$/) ? 0 : length($chunk);

        if (length($line) + $chunk_length > $length) {

            # Flush the current line, but never emit an empty one: when the
            # very first chunk is already longer than $length there is
            # nothing to flush yet.

            push @lines, $line if $line ne '';

            # We do not handle the situation when chunk by itself is bigger than $length.
            # We can optionally hard-break such chunks into sub-chunks of exact $length
            # (this might be an option later)

            $line = $chunk;
        } else {
            $line .= $chunk;
        }
    }
    push @lines, $line if $line ne '';

    return @lines;
}

# Reads a whole file as raw bytes, decodes it into a character string when
# the encoding can be determined, and normalizes CRLF line endings to LF.
#
# The encoding is determined as follows:
#   1. Encode::Guess is asked to guess it from the data;
#   2. if guessing fails and the data starts with an XML declaration that
#      has an encoding="..." attribute, that encoding is used;
#   3. otherwise the data is left undecoded.
#
# Parameters:
#   $fname - path of the file to read
#
# Returns the file content with every "\r\n" replaced by "\n".
#
# Dies if the file cannot be opened.
sub read_and_normalize_file {
    my ($fname) = @_;

    # Reading the entire file. Three-argument open with an explicit read mode
    # is used so that characters in $fname (leading '>' or '<', trailing '|',
    # surrounding whitespace) are never interpreted as part of the open mode.

    open(my $src, '<', $fname) or die "Can't read [$fname]: $!";
    binmode($src);
    my $data = join('', <$src>);
    close($src);

    # guess() returns an encoding object on success and an error string otherwise

    my $decoder = Encode::Guess->guess($data);
    if (ref($decoder)) {
        my $enc = uc($decoder->name);

        # Remove the BOM, but only for encodings with a fixed byte order
        # (and for UTF-8). The endianness-agnostic 'UTF-16' and 'UTF-32'
        # decoders read the BOM themselves to determine the byte order
        # (older Encode releases die if it is missing), so for these two
        # the BOM must stay in place.
        # BOM removal for UTF-16BE stays disabled, as it was before.
        $data =~ s/^\xFF\xFE//s         if  ($enc eq 'UTF-16LE');
        $data =~ s/^\xFF\xFE\x00\x00//s if  ($enc eq 'UTF-32LE');
        $data =~ s/^\x00\x00\xFE\xFF//s if  ($enc eq 'UTF-32BE');
        $data =~ s/^\xEF\xBB\xBF//s     if (($enc eq 'UTF-8')    || ($enc eq 'UTF8'));

        $data = $decoder->decode($data);
    } elsif ($data =~ m/^<\?xml\s+(.+?)\?>/i) {

        # Guessing failed: fall back to the encoding declared in the XML header

        my $attrs = $1;
        if ($attrs =~ m/encoding=['"](.+?)['"]/i) {
            my $enc = uc($1);

            # remove BOM
            $data =~ s/^\xEF\xBB\xBF//s if (($enc eq 'UTF-8') || ($enc eq 'UTF8'));

            # NOTE(review): decode() is presumably Encode::decode imported
            # at the top of the file; confirm
            $data = decode($enc, $data);
        }
    }

    # If neither branch decoded the data, it is returned as the raw bytes
    # that were read ($decoder holds the guess error string in that case)

    $data =~ s/\r\n/\n/sg; # normalize line-feeds

    return $data;
}

# Returns the modification time of a file as reported by stat().
#
# Parameters:
#   the path of the file to examine
#
# Returns the mtime field of the stat() result, or undef when stat()
# returns an empty list.
sub file_mtime {
    my $path = shift;

    # Field 9 of the list returned by stat() is the mtime. Assigning the
    # slice to a scalar yields undef when stat() returned nothing, so a
    # single value is returned in every context.
    my $modified = (stat($path))[9];

    return $modified;
}

1;
( run in 0.727 second using v1.01-cache-2.11-cpan-d7f47b0818f )