HTML-Encapsulate

 view release on metacpan or  search on metacpan

lib/HTML/Encapsulate.pm  view on Meta::CPAN

    close $fh;
    return $content;
}

sub _spit
{
    my $path = shift;
    my $content = shift;
    confess "failed to open file '$path': $!" unless open my $fh, ">", $path;
    print $fh $content;
    close $fh;
}

# This parses the charset from a HTML doc's HEAD section, if present, 
#
# The code here is adapted from Tatsuhiko Miyagawa's here:
# http://svn.bulknews.net/repos/public/HTTP-Response-Charset/trunk/lib/HTTP/Response/Charset.pm
#
# See also http://use.perl.org/~miyagawa/journal/31250
# HTTP::Response::Charset seems not to be on CPAN, however.
{

    my $boms = [
        'UTF-8'    => "\x{ef}\x{bb}\x{bf}",
        'UTF-32BE' => "\x{0}\x{0}\x{fe}\x{ff}",
        'UTF-32LE' => "\x{ff}\x{fe}\x{0}\x{0}",
        'UTF-16BE' => "\x{fe}\x{ff}",
        'UTF-16LE' => "\x{ff}\x{fe}",
    ];


    sub _detect_encoding
    {
        my $filename = shift;
        
        # 1) We assume the content has been identified as HTML, 
        # and the Content-Type header already checked.

        # Read in a max 4k chunk from the content;
        my $chunk;
        {
            open my $fh, "<", $filename 
                or Carp::confess "Failed to read file '$filename': $!";
            read $fh, $chunk, 4096; # read up to 4k
            close $fh;
        }

        # 2) Look for META head tags
        {
            my $head_parser = HTML::HeadParser->new;              
            $head_parser->parse($chunk);            
            $head_parser->eof;
            
            my $content_type = $head_parser->header('Content-Type');            
            return unless $content_type;
            my ($words) = HTTP::Headers::Util::split_header_words($content_type);
            my %param = @$words;
            return $param{charset};
        }

        # 3) If there's a UTF BOM set, look for it
        my $count = 0;
        while (my ($enc, $bom) = $boms->[$count++, $count++])
        {
            return $enc 
                if $bom eq substr($chunk, 0, length $bom);
        }
    
        # 4) If it looks like an XML document, look for XML declaration
        if ($chunk =~ m!^<\?xml\s+version="1.0"\s+encoding="([\w\-]+)"\?>!) {
            return $1;
        }

        # 5) If there's Encode::Detect module installed, try it
        if ( eval "use Encode::Detect::Detector" ) {
            my $charset = Encode::Detect::Detector::detect($chunk);
            return $charset if $charset;
        }
        
        return;
    }
}


# Constructor

sub new
{
    my $class = shift;
    croak "You must supply a matched set of key => value paramters"
        if @_ % 2;

    my %options = @_;

    unless (defined $options{ua}) 
    {
        # the default user agent should follow redirects
        my $ua = LWP::UserAgent->new(
            requests_redirectable => [qw(GET POST HEAD)]
        );
        $options{ua} = $ua;
    }

    my $self = bless \%options, $class;
    
    return $self;
}

sub ua { @_>1 ? shift->{ua} = shift : shift->{ua} }

our $DEFAULT_INSTANCE; # lazily assigned within download

sub download
{
    my $self = shift;

    # An URI or HTTP::Request for the page we want
    my $request = shift;

    # Where to save things. A directory - the main file will be called
    # 'index.html'



( run in 1.386 second using v1.01-cache-2.11-cpan-39bf76dae61 )