Dancer-SearchApp
view release on metacpan or search on metacpan
bin/index-filesystem.pl view on Meta::CPAN
$config->{fs}->{directories} = [@ARGV];
};
# Fall back to scanning the user's home directory when neither the command
# line nor the configuration names any directories.
# The "|| []" guard keeps the check from dying under "strict refs" when the
# configuration has no "directories" entry at all - which is exactly the
# situation this fallback is meant to handle.
if( ! @ARGV and ! @{ $config->{fs}->{directories} || [] }) {
    # If we don't know better, scan the (complete) profile
    # USERPROFILE is consulted first, then HOME
    my $userhome = $ENV{USERPROFILE} || $ENV{HOME};
    defined $userhome
        or die "No directories given, and neither USERPROFILE nor HOME is set\n";
    $config->{fs}->{directories} = [{ folder => $userhome, recurse => 1 }];
}
# Main indexing loop: for every folder returned by fs_recurse(), read its
# entries, extract the file information for each entry and hand the resulting
# documents to a bulk helper obtained from $e.
# NOTE(review): $e is presumably the Elasticsearch client and await()/collect()
# the promise helpers set up earlier in this file - confirm there.
my @folders = fs_recurse(undef, $config->{fs});
for my $folder (@folders) {
    print "Reading $folder\n";
    # We need to make this promises-based/asynchronous too so
    # that we don't accumulate a lot of data client-side
    my @entries = get_entries_from_folder( $folder );

    # One bulk helper per folder; it is flushed explicitly after the folder
    # has been processed
    my $bulk = $e->bulk_helper(
        max_count => 10,
        on_error  => sub {
            my($name,$data,$code) = @_;
            warn "ES Error: $name ($code): " . Dumper $data;
        }
    );

    # Import
    printf "Importing %d files\n", 0+@entries;

    # Process in batches of 100 entries, to keep memory consumption in check
    while( my @batch = splice @entries, 0, 100 ) {
        await collect(
            map {
                # One day, this will be a Promise too
                my $msg = get_file_info($_);
                my $body = $msg->{content};

                # Stringify some fields that are prone to be objects:
                for my $field (qw(file url)) {
                    if( $msg->{$field} ) {
                        $msg->{$field} = "$msg->{$field}";
                    };
                };

                my $lang = detect_language($body, $msg);
                $lang->then(sub{
                    my $found_lang = $_[0]; #'en';
                    #warn "Have language '$found_lang'";
                    return find_or_create_index($e, $index_name,$found_lang, 'file')
                })->then( sub {
                    my( $full_name ) = @_;
                    #warn $msg->{mime_type};

                    # munge the title so we get magic completion for document titles:
                    # This should be mostly done in an Elasticsearch filter+analyzer combo
                    # Except for bands/song titles, which we want to manually munge
                    #
                    # Splitting a path or URL on its separators yields empty
                    # strings (leading "/" or the "//" after a scheme), and a
                    # title with leading whitespace does the same. Empty
                    # suggestion inputs are useless, so they are dropped.
                    # Missing title/url values are treated as empty strings
                    # to avoid "uninitialized" warnings.
                    my $title = defined $msg->{title} ? $msg->{title} : '';
                    my $url   = defined $msg->{url}   ? $msg->{url}   : '';
                    my @parts = grep { length $_ }
                                map  { lc $_ }
                                ((split /\s+/, $title),
                                 (split m![\\/]!, $url));
                    $msg->{title_suggest} = {
                        input => \@parts,
                        #output => $msg->{title},
                        # Maybe some payload to directly link to the document. Later
                        #payload => {
                        #    url => $msg->{url}
                        #    # , $msg->{mime_type}
                        # },
                    };

                    # https://www.elastic.co/guide/en/elasticsearch/guide/current/one-lang-docs.html
                    #warn "Storing document into $full_name";
                    # Switch this to a bulk converter
                    #$e->index({
                    $bulk->index({
                        index => $full_name,
                        type  => 'file', # or 'attachment' ?!
                        id    => $msg->{url}, # we want to overwrite
                        # index bcc, cc, to, from
                        #body => $msg # "body" for non-bulk, "source" for bulk ...
                        source => $msg # "body" for non-bulk, "source" for bulk ...
                    });
                })->then(sub{
                    # Also add the document to the potential keywords for suggestion
                    #warn "Done."
                    return ()
                })->catch(sub {
                    # Release the document and report every error we were handed
                    undef $msg;
                    warn $_ for @_
                });
            } @batch
        );
    };

    # Push out whatever is still buffered in the bulk helper
    await $bulk->flush;
    # NOTE(review): the one-second pause presumably gives the server some
    # breathing room between folders - confirm whether it is still needed
    sleep 1;
    print "$folder done\n";
};
( run in 1.414 second using v1.01-cache-2.11-cpan-71847e10f99 )