utf8 results from the CPAN

utf8

Apache-Wyrd

view release on metacpan or search on metacpan

Wyrd/Services/Index.pm view on Meta::CPAN

package Apache::Wyrd::Services::Index;
use 5.006;
use strict;
use warnings;
no warnings qw(uninitialized);
our $VERSION = '0.98';
use Apache::Wyrd::Services::SAK qw(token_parse strip_html utf8_force utf8_to_entities);
use Apache::Wyrd::Services::SearchParser;
use BerkeleyDB;
use BerkeleyDB::Btree;
use Digest::SHA qw(sha1_hex);
use Carp;

=pod

=head1 NAME

Wyrd/Services/Index.pm view on Meta::CPAN

	$self->db->db_get("\x03\%$id", my $data);
	if ($data =~ s/^\x00:(.+)//) {
		$self->db_big->db_get($1, $data);
	}
	return $data;
}

sub index_map {
	my ($self, $attribute_name, $id, $data) = @_;

	use Encode qw(_utf8_off);
	_utf8_off($data);
	#warn "mapping $id - $attribute : " . join (':', @$data);
	my $attribute = $self->attributes->{$attribute_name};
	my (%unique, $item, @items) = (); # for unique-ifying word list
	#remove duplicates if necessary
	if (ref($data) eq 'ARRAY') {
		@items = grep { $unique{$_}++ == 0 } @$data;
	} elsif (ref($data) eq 'HASH') {
		%unique = %$data;
		@items = keys(%unique);
	} else {

Wyrd/Services/Index.pm view on Meta::CPAN

=item (scalar) C<clean_html> (scalar)

Given a string of HTML, this method strips out all tags, comments, etc., and
returns only clean lowercase text for breaking down into tokens.

=cut

sub clean_html {
	my ($self, $data) = @_;
	$data = strip_html($data);
	$data = utf8_force($data);
	$data = lc($data);
	$data =~ s/\p{IsM}/ /gs; # Strip M_arks
	$data =~ s/\p{IsP}/ /gs; # Strip P_unct
	$data =~ s/\p{IsZ}/ /gs; # Strip S(Z_)eparators
	$data =~ s/\p{IsC}+/ /sg; # Flatten all whitespace & C_ontrol characters
	$data =~ s/^[\p{IsC} ]+//s; #Remove leading whitespace
	$data =~ s/[\p{IsC} ]+$//s; #Remove trailing whitespace
	$data =~ s/\+//g;
	$data = utf8_to_entities($data); #Encode all multibyte sequences to entities
	$data =~ s/([\x00-\x08\x0B\x0C\x0E-\x1F\x80-\xFF])/'&#x' . sprintf('%X', ord($1)) . ';'/gexs; #Encode all single-byte "unusual" characters
	return $data;
}

sub clean_searchstring {
	goto &clean_html;
}

=pod

Wyrd/Services/SAK.pm view on Meta::CPAN

use 5.008;
use strict;
use warnings;
no warnings qw(uninitialized);

package Apache::Wyrd::Services::SAK;
use Carp;
use Exporter;
use Apache::Util;
use HTML::Entities;
use Encode qw(from_to _utf8_off);

=pod

=head1 NAME

Apache::Wyrd::Services::SAK - Swiss Army Knife of common subs

=head1 SYNOPSIS

	use Apache::Wyrd::Services::SAK qw(:hashes spit_file);

Wyrd/Services/SAK.pm view on Meta::CPAN

	slurp_file
	sort_by_ikey
	sort_by_key
	spit_file
	strip_html
	token_hash
	token_parse
	uri_escape
	uniquify_by_key
	uniquify_by_ikey
	utf8_force
	utf8_to_entities
);

our %EXPORT_TAGS = (
	all			=>	\@EXPORT_OK,
	db			=>	[qw(cgi_query do_query set_clause _exists_in_table)],
	file		=>	[qw(file_attribute slurp_file spit_file)],
	hash		=>	[qw(array_4_get data_clean env_4_get lc_hash sort_by_ikey sort_by_key token_hash token_parse uniquify_by_ikey uniquify_by_key)],
	mail		=>	[qw(send_mail)],
	string		=>	[qw(commify strip_html utf8_force utf8_to_entities)],
	tag			=>	[qw(attopts_template)],
	uri			=>	[qw(normalize_href uri_escape)],
);

=pod

=head2 DATABASE (:db)

Functions for working with databases.  Designed for use with a
combination of C<Apache::Wyrd::Interfaces::Setter> and the DBI-compatible

Wyrd/Services/SAK.pm view on Meta::CPAN

	my ($data) = @_;
	$data = decode_entities($data);
	$data =~ s/<>//g; # Strip out all empty tags
	$data =~ s/<--.*?-->/ /g; # Strip out all comments
	$data =~ s/<[^>]*?>/ /g; # Strip out all HTML tags
	return $data;
}

=pod

=item (scalar) C<utf8_force>(scalar)

Attempt to decode the text into UTF-8 by trying different common encodings
until one returns valid UTF-8.

=cut

sub utf8_force {
	my ($text) = @_;
	my $success = 0;
	if (utf8::valid($text)) {
		utf8::upgrade($text);
		return $text;
	}
	for my $encoding (qw(windows-1252 MacRoman Latin-1 Latin-9)) {
		my $trial_data = $text;
		eval {
			from_to($encoding, 'utf8', $trial_data, Encode::FB_HTMLCREF);
		};
		if (not($@) && utf8::valid($trial_data)) {
			$text = $trial_data;
			$success = 1;
			last;
		}
	}
	unless ($success) {
		carp "Unable to encode as UTF8";
	}
	return $text;
}

=pod

=item (scalar) C<utf8_to_entities>(scalar)

Seek through the given text for Unicode byte sequences and replace them with
numbered entities for that unicode character.  Assumes the text is properly-
formatted UTF8.

=cut


sub utf8_to_entities {
	my ($text) = @_;
	use Encode qw(_utf8_off);
	_utf8_off($text);
	while ($text =~ /(([\xC0-\xFF])([\x80-\xFF]{1,5}))/) {

		#store the sequence for later;
		my $unicode_sequence = $1;

		#separate the first byte from the others
		my ($first, $second) = ($2, $3);

		#split remaining bytes and count them
		my @parts = split '', $second;

t/7_mysqlindex.t view on Meta::CPAN

name varchar(255) unique not null,
timestamp long,
digest char(40),
data blob,
wordcount integer,
title varchar(255),
keywords varchar(255),
description text,
regular varchar(255),
map varchar(255)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;

drop table if exists _wyrd_index_data;
create table _wyrd_index_data (
item varchar(255) not null,
id integer,
tally integer
) ENGINE=MyISAM DEFAULT CHARSET=utf8;

drop table if exists _wyrd_index_regular;
create table _wyrd_index_regular (
item varchar(255) not null,
id integer,
tally integer
) ENGINE=MyISAM DEFAULT CHARSET=utf8;

drop table if exists _wyrd_index_map;
create table _wyrd_index_map (
item varchar(255) not null,
id integer,
tally integer
) ENGINE=MyISAM DEFAULT CHARSET=utf8
CREATE

for my $query (split ';', $create_routine) {
	last if ($count == 0);
	my $sh = $dbh->prepare($query);
	$sh->execute;
	if ($sh->err) {
		warn $sh->errstr;
		$count = 0;
	}

( run in 1.044 second using v1.01-cache-2.11-cpan-49f99fa48dc )