Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/Document/Type.pm  view on Meta::CPAN

    $self->{fileType}=File::Type->new();
    if (!defined($self->{fileType}))
    {
	$self->_set_err_state($ERR_FILE_TYPE);
	return undef;
    }

    return $self;
}

sub _init
{
    my $self=shift;

    $self->{defaultType}='text';
    $self->{defaultSubType}='plain';

    if (defined(@_))
    {
        my %args=@_;
        @$self{ keys %args }=values(%args);
    }
}

#
# Returns similarly to MIME ($type,$sub_type), but is broader
#
sub guess
{
    my $self=shift;
    my $text=shift;

    $self->_set_err_state($ERR_OK);  # clean the slate

    if (!defined($text))
    {
	$self->_set_err_state($ERR_DOC);
	return undef;
    }
    
    my ($type,$sub_type);

    # Try File::Type first
    my $mime_type=$self->{fileType}->mime_type($text);
    if (!defined($mime_type))
    {
	$type=$self->{defaultType};
	$sub_type=$self->{defaultSubType};
    }
    else
    {
	($type,$sub_type)=split(/\//,$mime_type,-1);
    }

    # If the result is a generic one, check for our types of interest
    # by other means
    # BTW, File::Type should make it clear and checkable what its
    # "I dunno" reply is
    if ($type eq 'application' && $sub_type eq 'octet-stream')
    {
	if ($self->_looks_like_HTML($text))
	{
	    ($type,$sub_type)=('text','html');
	}
	elsif ($self->_looks_like_RSS($text))
	{
	    # not a MIME type
	    ($type,$sub_type)=('text','rss')
	}
    }

    return ($type,$sub_type);
}


sub _looks_like_HTML
{
    my $self=shift;
    my $text=shift;

    #
    # If we're lucky...
    #
    if ($text=~/<!DOCTYPE\s+(\S+)/isgo)
    {
	my $type=$1;
	if ($type=~/(?:html|wml)/igo)
	{
	    return 1;
	}
    }
    # Otherwise, use a weaker way of checking... a single 
    # signature start tag will do. 
    #
    if ($text=~/<(?:(?i)html|body)\W/sgo)
    {
	return 1;
    }

    return 0;
}

sub _looks_like_RSS
{
    my $self=shift;
    my $text=shift;

    #
    # If we're lucky...
    #
    if ($text=~/<!DOCTYPE\s+(\S+)/isgo)
    {
	my $type=$1;
	if ($type=~/(?:rss)/igo)
	{
	    return 1;
	}
    }
    # Otherwise, use a weaker way of checking... a single 
    # signature start tag will do. 
    #
    if ($text=~/<(?:(?i)rss|channel)\W/sgo)
    {
	return 1;
    }

    return 0;
}

1;




1;
__END__

=head1 NAME

Alvis::Document::Type - Perl extension for guessing and checking the type
of a document (an extension of MIME types).

=head1 SYNOPSIS

 use Alvis::Document::Type;

 # Create a new instance
 my $t=Alvis::Document::Type->new(defaultType=>'text',
                                  defaultSubType=>'html');
 if (!defined($t))
 {
    die('Ugh!');
 }

 my ($doc_type,$doc_sub_type)=$t->guess($doc_text);
 if (!(defined($doc_type) && defined($doc_sub_type)))
 {
    die("Guess what? " . $t->errmsg()); 
 }

=head1 DESCRIPTION

Tries to guess the type of a document similarly to MIME types



( run in 1.869 second using v1.01-cache-2.11-cpan-39bf76dae61 )