Dancer-SearchApp
view release on metacpan or search on metacpan
bin/index-url.pl view on Meta::CPAN
#trace_to => 'Stderr',
);
# Locate the Tika server jar to launch.
# All jars matching $tika_glob are ranked by the minor version of their
# "server-1.<minor>" name part; the highest one wins.
my $tika_glob = 'jar/tika-server-*.jar';

# Minor version of a Tika 1.x server jar. File names that do not carry a
# "server-1.<digits>" part rank as 0, so they sort last instead of feeding
# undef into the numeric comparison.
my $tika_minor_version = sub {
    my( $jar ) = @_;
    return $jar =~ /server-1\.(\d+)/ ? $1 : 0;
};

# Descending sort, so the first element is the newest jar
my( $tika_path ) = sort {
    $tika_minor_version->( $b ) <=> $tika_minor_version->( $a )
} glob( $tika_glob );

# $tika_path is undef if the glob matched nothing; check that before -f
die "Tika not found in '$tika_glob'"
    unless defined $tika_path and -f $tika_path;
#warn "Using '$tika_path'";
# Create the Tika server wrapper for the jar found above and launch it
my $tika= Apache::Tika::Server->new(
jarfile => $tika_path,
);
$tika->launch;
# AnyEvent condition variable; it is not used within this part of the file
my $ok = AnyEvent->condvar;
# Ask for the plugin listing through $e
# NOTE(review): $e is presumably the Elasticsearch client and await a helper
# defined earlier in the file - confirm
my $info = await $e->cat->plugins;
# Can we use the ElasticSearch langdetect plugin as a fallback?
# The plugin counts as available if "langdetect" appears anywhere in the
# listing (case-insensitive).
# NOTE(review): assumes $info stringifies to the plain-text listing - confirm
my $have_langdetect = $info =~ /langdetect/i;
if( ! $have_langdetect ) {
warn "Language detection disabled";
};
# Maps a two-letter language code to the name of the Elasticsearch language
# analyzer to use for it. For the list of analyzers, see
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html
# Declared as a package variable, so it is visible throughout the package.
use vars qw(%analyzers);
%analyzers = (
'de' => 'german',
'en' => 'english',
'no' => 'norwegian',
'it' => 'italian',
'lt' => 'lithuanian',
# The next two languages are deliberately mapped to the English analyzer.
# NOTE(review): Elasticsearch appears to ship a "romanian" analyzer - confirm
# whether 'ro' should use it instead
'ro' => 'english', # Romanian, analyzed as English
'sk' => 'english', # ISO 639-1 "sk" is Slovak (not Serbo-Croatian), analyzed as English
);
# With $force_rebuild set, delete every index whose name starts with
# $index_name before the index list is read again below
if( $force_rebuild ) {
print "Dropping indices\n";
my @list;
# Fetch all indices and keep the sorted names that begin with $index_name
# (\Q quotes the name, so it is matched literally as a prefix)
await $e->indices->get({index => ['*']})->then(sub{
@list = grep { /^\Q$index_name/ } sort keys %{ $_[0]};
});
# Issue one delete request per index; $n is a per-index copy of the name so
# that each callback reports the index it belongs to. Once the collected
# requests are done, the cached %indices is emptied.
await collect( map { my $n=$_; $e->indices->delete( index => $n )->then(sub{warn "$n dropped" }) } @list )->then(sub{
warn "Index cleanup complete";
%indices = ();
});
};
# (Re)load the index information into %indices, keyed by index name
print "Reading ES indices\n";
await $e->indices->get({index => ['*']})->then(sub{
%indices = %{ $_[0]};
});
# Report the indices whose name starts with $index_name
warn "Index: $_\n" for grep { /^\Q$index_name/ } keys %indices;
# Connect to cluster at search1:9200, sniff all nodes and round-robin between them:
# Simple config cascade
# Reads from %ENV, $config and hard defaults, each under a different name,
# and writes the result under yet another name.
# TODO: Should merge with other config cascade
#
# get_defaults( %options )
#
# Options:
#   names    - arrayref of entries, each being
#              [ $result_name, $config_name, $env_name, $hard_default ]
#              A missing list is treated as empty.
#   defaults - hashref of premade values; created if missing, filled in place
#   env      - hashref used as the environment; \%ENV if the key is absent
#   config   - hashref with the configuration values
#
# For every entry, the first source that supplies a value wins:
#   1. a defined value already in "defaults"
#   2. the environment entry $env_name
#   3. the config entry $config_name
#   4. $hard_default, which is only used if the result key does not exist
#      at all; this case is reported on the currently selected output handle
#
# Returns the "defaults" hashref.
sub get_defaults {
    my( %options ) = @_;
    $options{ defaults } ||= {}; # premade defaults
    if( ! exists $options{ env }) {
        $options{ env } = \%ENV;
    };
    my $defaults = $options{ defaults };
    my $env      = $options{ env };
    my $config   = $options{ config };

    for my $entry (@{ $options{ names } || [] }) {
        my ($result_name, $config_name, $env_name, $hard_default) = @$entry;

        if( defined $env_name and exists $env->{ $env_name } ) {
            #print "Using $env_name from environment\n";
            $defaults->{ $result_name } //= $env->{ $env_name };
        };
        if( defined $config_name and exists $config->{ $config_name } ) {
            #print "Using $config_name from config\n";
            $defaults->{ $result_name } //= $config->{ $config_name };
        };
        if( ! exists $defaults->{ $result_name } ) {
            # Any part of an entry may be undef; print those as empty
            # strings instead of interpolating undef into the message
            my( $config_label, $env_label, $default_label )
                = map { $_ // '' } ( $config_name, $env_name, $hard_default );
            print "No $config_label from config, using hardcoded default\n";
            print "Using $env_label from hard defaults ($default_label)\n";
            $defaults->{ $result_name } = $hard_default;
        };
    };

    return $defaults;
};
# in_exclude_list( $item, $list )
#
# Counts how many entries of the arrayref $list match $item, with every
# entry being used as a regular expression.
# Returns that count, so the result is false if nothing matched.
sub in_exclude_list {
    my( $item, $list ) = @_;
    my $hits = 0;
    for my $pattern (@$list) {
        $hits++
            if $item =~ /$pattern/;
    };
    return $hits;
};
# TODO: This should go into crawler::imap
# TODO: make folders a parameter
# TODO: This needs far more work for HTTP: duplicate detection
#       HTML is not a directed graph
#       (presumably: pages can link back to each other - confirm)
#
# http_recurse( $x, $config )
#
# Placeholder: unpacks its two arguments and does nothing else.
sub http_recurse {
my( $x, $config ) = @_;
};
# get_entries_from_folder( $folder )
#
# Currently a pass-through: hands back the folder it was given.
sub get_entries_from_folder {
    my $folder = shift;
    # TODO: Add rate-limiting counter here, so we don't flood
    return $folder;
};
# get_selector( $tree, $sel )
#
# Runs $tree->findnodes( $sel ) and returns the ->text of the first node
# found. Returns undef if the selector matches nothing.
sub get_selector {
    my( $tree, $sel ) = @_;
    my @nodes = $tree->findnodes( $sel );

    # Explicit "no match" result. A single undef (instead of an empty list)
    # keeps the result one element long in list context, so calls inside a
    # hash or argument list do not shift the following elements.
    return undef
        unless @nodes;

    return $nodes[0]->text
}
sub abs_url {
my( $new, $base ) = @_;
"" . URI::URL->new( $new, $base )->abs
( run in 1.613 second using v1.01-cache-2.11-cpan-2398b32b56e )