Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/Document/Type.pm view on Meta::CPAN
$self->{fileType}=File::Type->new();
if (!defined($self->{fileType}))
{
$self->_set_err_state($ERR_FILE_TYPE);
return undef;
}
return $self;
}
sub _init
{
my $self=shift;
$self->{defaultType}='text';
$self->{defaultSubType}='plain';
if (defined(@_))
{
my %args=@_;
@$self{ keys %args }=values(%args);
}
}
#
# Returns similarly to MIME ($type,$sub_type), but is broader
#
sub guess
{
my $self=shift;
my $text=shift;
$self->_set_err_state($ERR_OK); # clean the slate
if (!defined($text))
{
$self->_set_err_state($ERR_DOC);
return undef;
}
my ($type,$sub_type);
# Try File::Type first
my $mime_type=$self->{fileType}->mime_type($text);
if (!defined($mime_type))
{
$type=$self->{defaultType};
$sub_type=$self->{defaultSubType};
}
else
{
($type,$sub_type)=split(/\//,$mime_type,-1);
}
# If the result is a generic one, check for our types of interest
# by other means
# BTW, File::Type should make it clear and checkable what its
# "I dunno" reply is
if ($type eq 'application' && $sub_type eq 'octet-stream')
{
if ($self->_looks_like_HTML($text))
{
($type,$sub_type)=('text','html');
}
elsif ($self->_looks_like_RSS($text))
{
# not a MIME type
($type,$sub_type)=('text','rss')
}
}
return ($type,$sub_type);
}
sub _looks_like_HTML
{
my $self=shift;
my $text=shift;
#
# If we're lucky...
#
if ($text=~/<!DOCTYPE\s+(\S+)/isgo)
{
my $type=$1;
if ($type=~/(?:html|wml)/igo)
{
return 1;
}
}
# Otherwise, use a weaker way of checking... a single
# signature start tag will do.
#
if ($text=~/<(?:(?i)html|body)\W/sgo)
{
return 1;
}
return 0;
}
sub _looks_like_RSS
{
my $self=shift;
my $text=shift;
#
# If we're lucky...
#
if ($text=~/<!DOCTYPE\s+(\S+)/isgo)
{
my $type=$1;
if ($type=~/(?:rss)/igo)
{
return 1;
}
}
# Otherwise, use a weaker way of checking... a single
# signature start tag will do.
#
if ($text=~/<(?:(?i)rss|channel)\W/sgo)
{
return 1;
}
return 0;
}
1;
1;
__END__
=head1 NAME
Alvis::Document::Type - Perl extension for guessing and checking the type
of a document (an extension of MIME types).
=head1 SYNOPSIS
use Alvis::Document::Type;
# Create a new instance
my $t=Alvis::Document::Type->new(defaultType=>'text',
defaultSubType=>'html');
if (!defined($t))
{
die('Ugh!');
}
my ($doc_type,$doc_sub_type)=$t->guess($doc_text);
if (!(defined($doc_type) && defined($doc_sub_type)))
{
die("Guess what? " . $t->errmsg());
}
=head1 DESCRIPTION
Tries to guess the type of a document similarly to MIME types
( run in 1.869 second using v1.01-cache-2.11-cpan-39bf76dae61 )