Apache-Wyrd
view release on metacpan or search on metacpan
Wyrd/Services/Index.pm view on Meta::CPAN
package Apache::Wyrd::Services::Index;
use 5.006;
use strict;
use warnings;
no warnings qw(uninitialized);
our $VERSION = '0.98';
use Apache::Wyrd::Services::SAK qw(token_parse strip_html utf8_force utf8_to_entities);
use Apache::Wyrd::Services::SearchParser;
use BerkeleyDB;
use BerkeleyDB::Btree;
use Digest::SHA qw(sha1_hex);
use Carp;
=pod
=head1 NAME
Wyrd/Services/Index.pm view on Meta::CPAN
$self->db->db_get("\x03\%$id", my $data);
if ($data =~ s/^\x00:(.+)//) {
$self->db_big->db_get($1, $data);
}
return $data;
}
sub index_map {
my ($self, $attribute_name, $id, $data) = @_;
use Encode qw(_utf8_off);
_utf8_off($data);
#warn "mapping $id - $attribute : " . join (':', @$data);
my $attribute = $self->attributes->{$attribute_name};
my (%unique, $item, @items) = (); # for unique-ifying word list
#remove duplicates if necessary
if (ref($data) eq 'ARRAY') {
@items = grep { $unique{$_}++ == 0 } @$data;
} elsif (ref($data) eq 'HASH') {
%unique = %$data;
@items = keys(%unique);
} else {
Wyrd/Services/Index.pm view on Meta::CPAN
=item (scalar) C<clean_html> (scalar)
Given a string of HTML, this method strips out all tags, comments, etc., and
returns only clean lowercase text for breaking down into tokens.
=cut
sub clean_html {
my ($self, $data) = @_;
$data = strip_html($data);
$data = utf8_force($data);
$data = lc($data);
$data =~ s/\p{IsM}/ /gs; # Strip M_arks
$data =~ s/\p{IsP}/ /gs; # Strip P_unct
$data =~ s/\p{IsZ}/ /gs; # Strip S(Z_)eparators
$data =~ s/\p{IsC}+/ /sg; # Flatten all whitespace & C_ontrol characters
$data =~ s/^[\p{IsC} ]+//s; #Remove leading whitespace
$data =~ s/[\p{IsC} ]+$//s; #Remove trailing whitespace
$data =~ s/\+//g;
$data = utf8_to_entities($data); #Encode all multibyte sequences to entities
$data =~ s/([\x00-\x08\x0B\x0C\x0E-\x1F\x80-\xFF])/'&#x' . sprintf('%X', ord($1)) . ';'/gexs; #Encode all single-byte "unusual" characters
return $data;
}
sub clean_searchstring {
goto &clean_html;
}
=pod
Wyrd/Services/SAK.pm view on Meta::CPAN
use 5.008;
use strict;
use warnings;
no warnings qw(uninitialized);
package Apache::Wyrd::Services::SAK;
use Carp;
use Exporter;
use Apache::Util;
use HTML::Entities;
use Encode qw(from_to _utf8_off);
=pod
=head1 NAME
Apache::Wyrd::Services::SAK - Swiss Army Knife of common subs
=head1 SYNOPSIS
use Apache::Wyrd::Services::SAK qw(:hashes spit_file);
Wyrd/Services/SAK.pm view on Meta::CPAN
slurp_file
sort_by_ikey
sort_by_key
spit_file
strip_html
token_hash
token_parse
uri_escape
uniquify_by_key
uniquify_by_ikey
utf8_force
utf8_to_entities
);
our %EXPORT_TAGS = (
all => \@EXPORT_OK,
db => [qw(cgi_query do_query set_clause _exists_in_table)],
file => [qw(file_attribute slurp_file spit_file)],
hash => [qw(array_4_get data_clean env_4_get lc_hash sort_by_ikey sort_by_key token_hash token_parse uniquify_by_ikey uniquify_by_key)],
mail => [qw(send_mail)],
string => [qw(commify strip_html utf8_force utf8_to_entities)],
tag => [qw(attopts_template)],
uri => [qw(normalize_href uri_escape)],
);
=pod
=head2 DATABASE (:db)
Functions for working with databases. Designed for use with a
combination of C<Apache::Wyrd::Interfaces::Setter> and the DBI-compatible
Wyrd/Services/SAK.pm view on Meta::CPAN
my ($data) = @_;
$data = decode_entities($data);
$data =~ s/<>//g; # Strip out all empty tags
$data =~ s/<--.*?-->/ /g; # Strip out all comments
$data =~ s/<[^>]*?>/ /g; # Strip out all HTML tags
return $data;
}
=pod
=item (scalar) C<utf8_force>(scalar)
Attempt to decode the text into UTF-8 by trying different common encodings
until one returns valid UTF-8.
=cut
sub utf8_force {
my ($text) = @_;
my $success = 0;
if (utf8::valid($text)) {
utf8::upgrade($text);
return $text;
}
for my $encoding (qw(windows-1252 MacRoman Latin-1 Latin-9)) {
my $trial_data = $text;
eval {
from_to($encoding, 'utf8', $trial_data, Encode::FB_HTMLCREF);
};
if (not($@) && utf8::valid($trial_data)) {
$text = $trial_data;
$success = 1;
last;
}
}
unless ($success) {
carp "Unable to encode as UTF8";
}
return $text;
}
=pod
=item (scalar) C<utf8_to_entities>(scalar)
Seek through the given text for Unicode byte sequences and replace them with
numbered entities for that unicode character. Assumes the text is properly-
formatted UTF8.
=cut
sub utf8_to_entities {
my ($text) = @_;
use Encode qw(_utf8_off);
_utf8_off($text);
while ($text =~ /(([\xC0-\xFF])([\x80-\xFF]{1,5}))/) {
#store the sequence for later;
my $unicode_sequence = $1;
#separate the first byte from the others
my ($first, $second) = ($2, $3);
#split remaining bytes and count them
my @parts = split '', $second;
t/7_mysqlindex.t view on Meta::CPAN
name varchar(255) unique not null,
timestamp long,
digest char(40),
data blob,
wordcount integer,
title varchar(255),
keywords varchar(255),
description text,
regular varchar(255),
map varchar(255)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
drop table if exists _wyrd_index_data;
create table _wyrd_index_data (
item varchar(255) not null,
id integer,
tally integer
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
drop table if exists _wyrd_index_regular;
create table _wyrd_index_regular (
item varchar(255) not null,
id integer,
tally integer
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
drop table if exists _wyrd_index_map;
create table _wyrd_index_map (
item varchar(255) not null,
id integer,
tally integer
) ENGINE=MyISAM DEFAULT CHARSET=utf8
CREATE
for my $query (split ';', $create_routine) {
last if ($count == 0);
my $sh = $dbh->prepare($query);
$sh->execute;
if ($sh->err) {
warn $sh->errstr;
$count = 0;
}
( run in 1.044 second using v1.01-cache-2.11-cpan-49f99fa48dc )