HTML-Encapsulate
view release on metacpan or search on metacpan
lib/HTML/Encapsulate.pm view on Meta::CPAN
close $fh;
return $content;
}
sub _spit
{
my $path = shift;
my $content = shift;
confess "failed to open file '$path': $!" unless open my $fh, ">", $path;
print $fh $content;
close $fh;
}
# This parses the charset from a HTML doc's HEAD section, if present,
#
# The code here is adapted from Tatsuhiko Miyagawa's here:
# http://svn.bulknews.net/repos/public/HTTP-Response-Charset/trunk/lib/HTTP/Response/Charset.pm
#
# See also http://use.perl.org/~miyagawa/journal/31250
# HTTP::Response::Charset seems not to be on CPAN, however.
{
my $boms = [
'UTF-8' => "\x{ef}\x{bb}\x{bf}",
'UTF-32BE' => "\x{0}\x{0}\x{fe}\x{ff}",
'UTF-32LE' => "\x{ff}\x{fe}\x{0}\x{0}",
'UTF-16BE' => "\x{fe}\x{ff}",
'UTF-16LE' => "\x{ff}\x{fe}",
];
sub _detect_encoding
{
my $filename = shift;
# 1) We assume the content has been identified as HTML,
# and the Content-Type header already checked.
# Read in a max 4k chunk from the content;
my $chunk;
{
open my $fh, "<", $filename
or Carp::confess "Failed to read file '$filename': $!";
read $fh, $chunk, 4096; # read up to 4k
close $fh;
}
# 2) Look for META head tags
{
my $head_parser = HTML::HeadParser->new;
$head_parser->parse($chunk);
$head_parser->eof;
my $content_type = $head_parser->header('Content-Type');
return unless $content_type;
my ($words) = HTTP::Headers::Util::split_header_words($content_type);
my %param = @$words;
return $param{charset};
}
# 3) If there's a UTF BOM set, look for it
my $count = 0;
while (my ($enc, $bom) = $boms->[$count++, $count++])
{
return $enc
if $bom eq substr($chunk, 0, length $bom);
}
# 4) If it looks like an XML document, look for XML declaration
if ($chunk =~ m!^<\?xml\s+version="1.0"\s+encoding="([\w\-]+)"\?>!) {
return $1;
}
# 5) If there's Encode::Detect module installed, try it
if ( eval "use Encode::Detect::Detector" ) {
my $charset = Encode::Detect::Detector::detect($chunk);
return $charset if $charset;
}
return;
}
}
# Constructor
sub new
{
my $class = shift;
croak "You must supply a matched set of key => value paramters"
if @_ % 2;
my %options = @_;
unless (defined $options{ua})
{
# the default user agent should follow redirects
my $ua = LWP::UserAgent->new(
requests_redirectable => [qw(GET POST HEAD)]
);
$options{ua} = $ua;
}
my $self = bless \%options, $class;
return $self;
}
sub ua { @_>1 ? shift->{ua} = shift : shift->{ua} }
our $DEFAULT_INSTANCE; # lazily assigned within download
sub download
{
my $self = shift;
# An URI or HTTP::Request for the page we want
my $request = shift;
# Where to save things. A directory - the main file will be called
# 'index.html'
( run in 1.386 second using v1.01-cache-2.11-cpan-39bf76dae61 )