App-CekBpom

 view release on metacpan or  search on metacpan

lib/App/CekBpom.pm  view on Meta::CPAN

        query_log_file => {
            summary => 'Log queries to log file',
            schema => 'filename*',
            description => <<'_',

If specified, each invocation of this utility will be logged as a single line in
the specified file, in TSV format. Tab characters in the queries are converted
into 4 spaces, to avoid clashing with the use of Tab as the field separator.

For example, this invocation:

    % cek-bpom-products "minuman susu fermentasi" yakult --query-log-file /some/path.txt

will produce a log line like:

    time:2020-10-22T01:02:03.000Z    queries:minuman susu fermentasi,yakult    search_types:merk,nama_produk    num_results:51    duration:3.402

_
            tags => ['category:logging'],
        },
        result_dump_dir => {
            summary => 'Dump result to directory',
            schema => 'dirname*',
            description => <<'_',

If specified, will dump full enveloped result to a file in specified directory
path, in JSON format. The JSON formatting makes it easy to grep each row. The
file will be named
`cek-bpom-products-result.<encoded-timestamp>.<search-types-encoded>.<queries-encoded>(.<note-encoded>)?.json`.
The encoded timestamp is ISO 8601 format with colon replaced by underscore. The
encoded query replaces every group of "unsafe" characters in the query with
a single dash. The same goes for the encoded note, which comes from the `note`
argument. For example, this invocation:

    % cek-bpom-products "minuman susu fermentasi" yakult --note "some note" --result-dump-dir /some/dir

will result in a result dump file name like:
`cek-bpom-products-result.2020-10-22T01_02_03.000Z.merk-nama_produk.minuman-susu-fermentasi-yakult.some-note.json`.

_
            tags => ['category:logging'],
        },
    },
    examples => [
        {
            summary => 'By default search against name (nama_produk) and brand (merk)',
            argv => ["hichew", "hi-chew", "hi chew"],
            test => 0,
            'x.doc.show_result' => 0,
        },
        {
            summary => 'Adding --trace will show query details, --format html+datatables is convenient to see/search/sort results in browser',
            src => "[[prog]] hichew hi-chew 'hi chew' --trace --format html+datatables",
            src_plang => "bash",
            test => 0,
            'x.doc.show_result' => 0,
        },
    ],
};
sub cek_bpom_products {
    require HTTP::CookieJar::LWP;
    require LWP::UserAgent::Plugin;

    my $time_start = time();

    my %args = @_;
    defined(my $queries = $args{queries}) or return [400, "Please specify queries"];
    my $search_types = $args{search_types} // ['nama_produk', 'merk'];

    my $jar = HTTP::CookieJar::LWP->new;
    my $ua = LWP::UserAgent::Plugin->new(
        cookie_jar => $jar,
    );

    # first get the front page so we get the session ID
    log_trace "Requesting cekbpom front page ...";
    my $res = $ua->get($url_prefix);
    unless ($res->is_success) {
        return [$res->code, "Can't get front page ($url_prefix): ".$res->message];
    }
    my $ct = $res->content;
    unless ($ct =~ m!/home/produk/(\w{26})"!) {
        return [543, "Can't extract session ID from front page"];
    }
    my $session_id = $1;

    my %reg_ids;
    my @all_rows;

    my $time_before_query = time();
  QUERY:
    for my $query (@$queries) {
      SEARCH_TYPE:
        for my $search_type (@$search_types) {
            my $search_type_num = $known_search_types{$search_type}[0];
            unless (defined $search_type_num) {
                return [400, "Unknown search_type '$search_type'"];
            }

            require URI::Escape;
            my $query_enc = URI::Escape::uri_escape($query);

            my @rows;
            my $page_num = 0;
            my $num_results = 100;
            my ($result_start, $result_end);
            while (1) {
                log_trace "Querying cekbpom ($search_type=$query, $num_results result(s)) ...";
                $res = $ua->get("$url_prefix/home/produk/$session_id/all/row/$num_results/page/$page_num/order/4/DESC/search/$search_type_num/$query_enc");
                unless ($res->is_success) {
                    return [$res->code, "Can't get result page: ".$res->message];
                }
                my $ct = $res->content;
                unless ($ct =~ m!(\d+) - (\d+) Dari (\d+)!) {
                    return [543, "Can't find signature in result page"];
                }
                ($result_start, $result_end, $num_results) = ($1, $2, $3);

                if ($result_end < $num_results && $result_end < 5000) {
                    redo;
                }

                if ($ENV{CEK_BPOM_TRACE}) {
                    log_trace $ct;
                }

                while ($ct =~ m!
                                   <tr\stitle.+?\surldetil="/(?P<reg_id>[^"]+)">
                                   <td[^>]*>\s* (?P<nomor_registrasi>[^<]+?)\s*   (?:<div>Terbit:(?P<tanggal_terbit>[^<]+?))?\s*    </div></td>
                                   <td[^>]*>\s* (?P<nama>[^<]+?)\s*<div>Merk:\s*  (?P<merk>[^<]+)<br>Kemasan:(?P<kemasan>[^<]+?)\s* </div></td>



( run in 0.691 second using v1.01-cache-2.11-cpan-d7f47b0818f )