Solstice

 view release on metacpan or  search on metacpan

lib/Solstice/StringLibrary.pm  view on Meta::CPAN

package Solstice::StringLibrary;

# $Id: StringLibrary.pm 2418 2005-07-28 23:28:31Z mcrawfor $

=head1 NAME

Solstice::StringLibrary - A library of generic string manipulation functions

=head1 SYNOPSIS

  use StringLibrary qw(truncstr);

  my $str = truncstr("This is a line of text that needs truncating.");

=head1 DESCRIPTION

Functions in this library make no assumptions about the content 
of the string being modified.

=cut

use 5.006_000;
use strict;
use warnings;
use HTML::Entities;
use HTML::TreeBuilder;
use HTML::FormatText;
use Solstice::StripScripts::Parser;
use Exporter;

our @ISA = qw(Exporter);
our ($VERSION) = ('$Revision: 2418 $' =~ /^\$Revision:\s*([\d.]*)/);

our @EXPORT = qw|htmltounicode truncstr truncemail fixstrlen encode decode unrender scrubhtml convertspaces strtoascii strtourl strtofilename strtojavascript trimstr htmltotext extracttext scrubcdata urlclean fixlinewidth|;
our %EXPORT_TAGS = ( all => [ qw|
    htmltounicode
    truncstr
    truncemail
    fixstrlen
    encode
    decode
    unrender
    scrubhtml
    convertspaces
    strtoascii
    strtourl
    strtofilename
    strtojavascript
    trimstr
    htmltotext
    extracttext
    scrubcdata
    urlclean
    fixlinewidth
| ] );

our @EXPORT_OK = ( @{ $EXPORT_TAGS{all} } );

=head2 Superclass

L<Exporter|Exporter>

=head2 Export

No symbols exported.

=head2 Functions

=over 4

=cut


=item htmltounicode($string)

Returns C<$string> with all E<amp>#234;-like unicode entities packed into perl
unicode.

=cut

sub htmltounicode {
    my ($string) = @_;
    return undef unless defined $string;
    $string =~ s/&#(\d*?);/pack('U*', $1)/ge;
    return $string;
}


=item scrubhtml ($string)

Returns $string with all malicious scripts, broken tags, relative links, dynamic css, etc removed.
=cut

sub scrubhtml {
    my ($string) = @_;
    return undef unless defined $string;

    my $parser = Solstice::StripScripts::Parser->new({
        AllowSrc     => 1,
        AllowHref    => 1,
        AllowNonHTTP => 1,
    });
    $parser->parse($string);
    $parser->eof;
    return $parser->filtered_document;
}

=item truncstr($string, $cutoff, $marker)

Returns $string truncated to $cutoff, and appended with an optional 
cutoff marker (defaults to '...').

lib/Solstice/StringLibrary.pm  view on Meta::CPAN

=item strtofilename($string, $preserve_whitespace)

Returns $string transformed into a safe file name, by converting
spaces to underscores and removing forward slashes. $preserve_whitespace
specifies that whitespace should be escaped rather than translated.

=cut

sub strtofilename {
    my ($string, $preserve_whitespace) = @_;
    return undef unless defined $string;

    my $replace = ($preserve_whitespace) ? "\\ " : '_';    

    for ($string) {
        s/\s/$replace/g;
        s/[\/\?\<\>\\\:\*\|\)\(\']//g;
    }
    return $string;
}

=item strtojavascript($string)

Returns $string transformed into a javascript-safe string, by 
escaping single- and double-quote characters.

=cut

sub strtojavascript {
    my $string = shift;
    return undef unless defined $string;

    for ($string) {
        s/&#39;/'/g;
        #XXX well - removing this seems to clear up a lot of double-escaping we're seeing. hope it doesn't break anything.
#        s/\\/\\\\/g;
        s/"/\\"/g;
        s/'/\\'/g;
        s/[\n\r]//g;
    }
    return $string;
}

=item trimstr($string)

Remove leading and trailing whitespace from $string.

=cut

sub trimstr {
    my $string = shift;
    return undef unless defined $string;

    for ($string) {
        s/^(?:\s|&#09;|&#10;|&#13;|&#32;)+//;
        s/(?:\s|&#09;|&#10;|&#13;|&#32;)+$//;
    }
    return $string;
}

=item scrubcdata($string)

This will return a string with ]]> escaped, so it will be cdata safe.

=cut

sub scrubcdata {
    my $string = shift;
    return undef unless defined $string;

    $string =~ s/]]>/]]&gt;/g;
    return $string;
}


package Solstice::StringLibrary::ExtractText;

use base qw(HTML::Formatter);

## no critic 
#this little section is determined by a superclass, doesn't fit our style guidlines

sub pre_out {
    my $self = shift;
    my $text = shift;
    $self->collect($text);
}

sub out {
    my $self = shift;
    my $text = shift;
    unless ($text =~ /^\s*$/) {
        $self->collect($text.' ');
    }
}

sub img_start {
    my ($self, $node) = @_;
    my $alt = $node->attr('alt');
    $alt = (defined $alt && $alt ne '') ? ": $alt" : '';
    $self->collect('[IMAGE'.$alt.'] ');
}

sub adjust_lm {} 
sub adjust_rm {}

## use critic

#this exists just to remove the line that corrupts some text for us
package Solstice::StringLibrary::FormatText;

use base qw(HTML::FormatText);

sub out
{
    my $self = shift;
    my $text = shift;

    #here's the culprit
#    $text =~ tr/\xA0\xAD/ /d;

    if ($text =~ /^\s*$/) {
    $self->{hspace} = 1;
    return;
    }

    if (defined $self->{vspace}) {



( run in 0.515 second using v1.01-cache-2.11-cpan-39bf76dae61 )