App-CekBpom
view release on metacpan or search on metacpan
lib/App/CekBpom.pm view on Meta::CPAN
query_log_file => {
summary => 'Log queries to log file',
schema => 'filename*',
description => <<'_',
If specified, each invocation of this utility will be logged into a line in the
specified file path, in TSV format. Tab character in the query will be converted
into 4 spaces, to avoid clash with the use of Tab as field separator.
For example, this invocation:
% cek-bpom-products "minuman susu fermentasi" yakult --query-log-file /some/path.txt
will append a log line like this sample:
time:2020-10-22T01:02:03.000Z queries:minuman susu fermentasi,yakult search_types:merk,nama_produk num_results:51 duration:3.402
_
tags => ['category:logging'],
},
result_dump_dir => {
summary => 'Dump result to directory',
schema => 'dirname*',
description => <<'_',
If specified, will dump full enveloped result to a file in specified directory
path, in JSON format. The JSON formatting makes it easy to grep each row. The
file will be named
`cek-bpom-products-result.<encoded-timestamp>.<search-types-encoded>.<queries-encoded>(.<note-encoded>)?.json`.
The encoded timestamp is ISO 8601 format with colon replaced by underscore. The
encoded query will replace every group of "unsafe" characters in query with
a single dash. The same goes with encoded note, which comes from the `note`
argument. For example, this invocation:
% cek-bpom-products "minuman susu fermentasi" yakult --note "some note"
will result in a result dump file name like:
`cek-bpom-products-result.2020-10-22T01_02_03.000Z.merk-nama_produk.minuman-susu-fermentasi-yakult.some-note.json`.
_
tags => ['category:logging'],
},
},
examples => [
{
summary => 'By default search against name (nama_produk) and brand (merk)',
argv => ["hichew", "hi-chew", "hi chew"],
test => 0,
'x.doc.show_result' => 0,
},
{
summary => 'Adding --trace will show query details, --format html+datatables is convenient to see/search/sort results in browser',
src => "[[prog]] hichew hi-chew 'hi chew' --trace --format html+datatables",
src_plang => "bash",
test => 0,
'x.doc.show_result' => 0,
},
],
};
sub cek_bpom_products {
require HTTP::CookieJar::LWP;
require LWP::UserAgent::Plugin;
my $time_start = time();
my %args = @_;
defined(my $queries = $args{queries}) or return [400, "Please specify queries"];
my $search_types = $args{search_types} // ['nama_produk', 'merk'];
my $jar = HTTP::CookieJar::LWP->new;
my $ua = LWP::UserAgent::Plugin->new(
cookie_jar => $jar,
);
# first get the front page so we get the session ID
log_trace "Requesting cekbpom front page ...";
my $res = $ua->get($url_prefix);
unless ($res->is_success) {
return [$res->code, "Can't get front page ($url_prefix): ".$res->message];
}
my $ct = $res->content;
unless ($ct =~ m!/home/produk/(\w{26})"!) {
return [543, "Can't extract session ID from front page"];
}
my $session_id = $1;
my %reg_ids;
my @all_rows;
my $time_before_query = time();
QUERY:
for my $query (@$queries) {
SEARCH_TYPE:
for my $search_type (@$search_types) {
my $search_type_num = $known_search_types{$search_type}[0];
unless (defined $search_type_num) {
return [400, "Unknown search_type '$search_type'"];
}
require URI::Escape;
my $query_enc = URI::Escape::uri_escape($query);
my @rows;
my $page_num = 0;
my $num_results = 100;
my ($result_start, $result_end);
while (1) {
log_trace "Querying cekbpom ($search_type=$query, $num_results result(s)) ...";
$res = $ua->get("$url_prefix/home/produk/$session_id/all/row/$num_results/page/$page_num/order/4/DESC/search/$search_type_num/$query_enc");
unless ($res->is_success) {
return [$res->code, "Can't get result page: ".$res->message];
}
my $ct = $res->content;
unless ($ct =~ m!(\d+) - (\d+) Dari (\d+)!) {
return [543, "Can't find signature in result page"];
}
($result_start, $result_end, $num_results) = ($1, $2, $3);
if ($result_end < $num_results && $result_end < 5000) {
redo;
}
if ($ENV{CEK_BPOM_TRACE}) {
log_trace $ct;
}
while ($ct =~ m!
<tr\stitle.+?\surldetil="/(?P<reg_id>[^"]+)">
<td[^>]*>\s* (?P<nomor_registrasi>[^<]+?)\s* (?:<div>Terbit:(?P<tanggal_terbit>[^<]+?))?\s* </div></td>
<td[^>]*>\s* (?P<nama>[^<]+?)\s*<div>Merk:\s* (?P<merk>[^<]+)<br>Kemasan:(?P<kemasan>[^<]+?)\s* </div></td>
( run in 0.691 second using v1.01-cache-2.11-cpan-d7f47b0818f )