Solstice
view release on metacpan or search on metacpan
lib/Solstice/StringLibrary.pm view on Meta::CPAN
package Solstice::StringLibrary;
# $Id: StringLibrary.pm 2418 2005-07-28 23:28:31Z mcrawfor $
=head1 NAME
Solstice::StringLibrary - A library of generic string manipulation functions
=head1 SYNOPSIS
use StringLibrary qw(truncstr);
my $str = truncstr("This is a line of text that needs truncating.");
=head1 DESCRIPTION
Functions in this library make no assumptions about the content
of the string being modified.
=cut
use 5.006_000;
use strict;
use warnings;
use HTML::Entities;
use HTML::TreeBuilder;
use HTML::FormatText;
use Solstice::StripScripts::Parser;
use Exporter;
our @ISA = qw(Exporter);
our ($VERSION) = ('$Revision: 2418 $' =~ /^\$Revision:\s*([\d.]*)/);
our @EXPORT = qw|htmltounicode truncstr truncemail fixstrlen encode decode unrender scrubhtml convertspaces strtoascii strtourl strtofilename strtojavascript trimstr htmltotext extracttext scrubcdata urlclean fixlinewidth|;
our %EXPORT_TAGS = ( all => [ qw|
htmltounicode
truncstr
truncemail
fixstrlen
encode
decode
unrender
scrubhtml
convertspaces
strtoascii
strtourl
strtofilename
strtojavascript
trimstr
htmltotext
extracttext
scrubcdata
urlclean
fixlinewidth
| ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{all} } );
=head2 Superclass
L<Exporter|Exporter>
=head2 Export
No symbols exported.
=head2 Functions
=over 4
=cut
=item htmltounicode($string)
Returns C<$string> with all E<amp>#234;-like unicode entities packed into perl
unicode.
=cut
sub htmltounicode {
my ($string) = @_;
return undef unless defined $string;
$string =~ s/&#(\d*?);/pack('U*', $1)/ge;
return $string;
}
=item scrubhtml ($string)
Returns $string with all malicious scripts, broken tags, relative links, dynamic css, etc removed.
=cut
sub scrubhtml {
my ($string) = @_;
return undef unless defined $string;
my $parser = Solstice::StripScripts::Parser->new({
AllowSrc => 1,
AllowHref => 1,
AllowNonHTTP => 1,
});
$parser->parse($string);
$parser->eof;
return $parser->filtered_document;
}
=item truncstr($string, $cutoff, $marker)
Returns $string truncated to $cutoff, and appended with an optional
cutoff marker (defaults to '...').
lib/Solstice/StringLibrary.pm view on Meta::CPAN
=item strtofilename($string, $preserve_whitespace)
Returns $string transformed into a safe file name, by converting
spaces to underscores and removing forward slashes. $preserve_whitespace
specifies that whitespace should be escaped rather than translated.
=cut
sub strtofilename {
my ($string, $preserve_whitespace) = @_;
return undef unless defined $string;
my $replace = ($preserve_whitespace) ? "\\ " : '_';
for ($string) {
s/\s/$replace/g;
s/[\/\?\<\>\\\:\*\|\)\(\']//g;
}
return $string;
}
=item strtojavascript($string)
Returns $string transformed into a javascript-safe string, by
escaping single- and double-quote characters.
=cut
sub strtojavascript {
my $string = shift;
return undef unless defined $string;
for ($string) {
s/'/'/g;
#XXX well - removing this seems to clear up a lot of double-escaping we're seeing. hope it doesn't break anything.
# s/\\/\\\\/g;
s/"/\\"/g;
s/'/\\'/g;
s/[\n\r]//g;
}
return $string;
}
=item trimstr($string)
Remove leading and trailing whitespace from $string.
=cut
sub trimstr {
my $string = shift;
return undef unless defined $string;
for ($string) {
s/^(?:\s|	| | | )+//;
s/(?:\s|	| | | )+$//;
}
return $string;
}
=item scrubcdata($string)
This will return a string with ]]> escaped, so it will be cdata safe.
=cut
sub scrubcdata {
my $string = shift;
return undef unless defined $string;
$string =~ s/]]>/]]>/g;
return $string;
}
package Solstice::StringLibrary::ExtractText;
use base qw(HTML::Formatter);
## no critic
#this little section is determined by a superclass, doesn't fit our style guidlines
sub pre_out {
my $self = shift;
my $text = shift;
$self->collect($text);
}
sub out {
my $self = shift;
my $text = shift;
unless ($text =~ /^\s*$/) {
$self->collect($text.' ');
}
}
sub img_start {
my ($self, $node) = @_;
my $alt = $node->attr('alt');
$alt = (defined $alt && $alt ne '') ? ": $alt" : '';
$self->collect('[IMAGE'.$alt.'] ');
}
sub adjust_lm {}
sub adjust_rm {}
## use critic
#this exists just to remove the line that corrupts some text for us
package Solstice::StringLibrary::FormatText;
use base qw(HTML::FormatText);
sub out
{
my $self = shift;
my $text = shift;
#here's the culprit
# $text =~ tr/\xA0\xAD/ /d;
if ($text =~ /^\s*$/) {
$self->{hspace} = 1;
return;
}
if (defined $self->{vspace}) {
( run in 0.515 second using v1.01-cache-2.11-cpan-39bf76dae61 )