App-grep-url
view release on metacpan or search on metacpan
lib/App/grep/url.pm view on Meta::CPAN
package App::grep::url;
our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
our $DATE = '2021-08-01'; # DATE
our $DIST = 'App-grep-url'; # DIST
our $VERSION = '0.004'; # VERSION
use 5.010001;
use strict;
use warnings;
use Log::ger;
use AppBase::Grep;
use Perinci::Sub::Util qw(gen_modified_sub);
our %SPEC;
gen_modified_sub(
output_name => 'grep_url',
base_name => 'AppBase::Grep::grep',
summary => 'Print lines having URL(s) (optionally of certain criteria) in them',
description => <<'_',
This is a grep-like utility that greps for URLs of certain criteria.
_
remove_args => [
'regexps',
'pattern',
'dash_prefix_inverts',
'all',
],
add_args => {
min_urls => {
schema => 'uint*',
default => 1,
tags => ['category:filtering'],
},
max_urls => {
schema => 'int*',
default => -1,
tags => ['category:filtering'],
},
schemes => {
'x.name.is_plural' => 1,
'x.name.singular' => 'scheme',
schema => ['array*', of=>['str*',in=>[qw/http ftp file ssh/]]],
default => ['http', 'file'],
tags => ['category:url-criteria'],
},
scheme_contains => {
schema => 'str*',
tags => ['category:url-criteria'],
},
scheme_not_contains => {
schema => 'str*',
tags => ['category:url-criteria'],
},
scheme_matches => {
schema => 're*',
tags => ['category:url-criteria'],
},
host_contains => {
schema => 'str*',
tags => ['category:url-criteria'],
},
host_not_contains => {
schema => 'str*',
tags => ['category:url-criteria'],
},
host_matches => {
schema => 're*',
tags => ['category:url-criteria'],
},
path_contains => {
schema => 'str*',
tags => ['category:url-criteria'],
},
path_not_contains => {
schema => 'str*',
tags => ['category:url-criteria'],
},
path_matches => {
schema => 're*',
tags => ['category:url-criteria'],
},
query_param_contains => {
schema => ['hash*', of=>'str*'],
tags => ['category:url-criteria'],
},
query_param_not_contains => {
schema => ['hash*', of=>'str*'],
tags => ['category:url-criteria'],
},
query_param_matches => {
schema => ['hash*', of=>'str*'], # XXX of re
tags => ['category:url-criteria'],
},
files => {
'x.name.is_plural' => 1,
'x.name.singular' => 'file',
schema => ['array*', of=>'filename*'],
pos => 0,
slurpy => 1,
},
# XXX recursive (-r)
},
modify_meta => sub {
my $meta = shift;
$meta->{examples} = [
{
summary => 'Show lines that contain at least 2 URLs',
'src' => q([[prog]] --min-urls 2 file.txt),
'src_plang' => 'bash',
'test' => 0,
'x.doc.show_result' => 0,
},
{
summary => 'Show lines that contain URLs from google',
'src' => q([[prog]] --host-contains google file.txt),
'src_plang' => 'bash',
'test' => 0,
'x.doc.show_result' => 0,
},
{
summary => 'Show lines that contain search URLs from google where query contains the keyword "mortal"',
'src' => q([[prog]] --host-contains google --query-param-contains q=mortal file.txt),
'src_plang' => 'bash',
'test' => 0,
'x.doc.show_result' => 0,
},
];
$meta->{links} = [
{url=>'prog:grep-email'},
];
},
output_code => sub {
my %args = @_;
my ($fh, $file);
my @files = @{ delete($args{files}) // [] };
my $show_label = 0;
if (!@files) {
$fh = \*STDIN;
} elsif (@files > 1) {
$show_label = 1;
}
lib/App/grep/url.pm view on Meta::CPAN
open $fh, "<", $file or do {
warn "abgrep: Can't open '$file': $!, skipped\n";
undef $fh;
};
redo READ_LINE;
}
my $line = <$fh>;
if (defined $line) {
return ($line, $show_label ? $file : undef);
} else {
undef $fh;
redo READ_LINE;
}
}
};
require Regexp::Pattern::URI;
require URI;
require URI::QueryParam;
my @re;
for my $scheme (@{ $args{schemes} // [] }) {
if ($scheme eq 'ftp') { push @re, $Regexp::Pattern::URI::RE{ftp}{pat} }
elsif ($scheme eq 'http') { push @re, $Regexp::Pattern::URI::RE{http}{pat} }
elsif ($scheme eq 'ssh') { push @re, $Regexp::Pattern::URI::RE{ssh}{pat} }
elsif ($scheme eq 'file') { push @re, $Regexp::Pattern::URI::RE{file}{pat} }
else { die "grep-url: Unknown URL scheme '$scheme'\n" }
}
die "grep-url: Please add one or more schemes\n" unless @re;
my $re = join('|', @re);
$re = qr/$re/;
$args{_highlight_regexp} = $re;
$args{_filter_code} = sub {
my ($line, $fargs) = @_;
my @urls;
while ($line =~ /($re)/g) {
push @urls, $1;
}
return 0 if $fargs->{min_urls} >= 0 && @urls < $fargs->{min_urls};
return 0 if $fargs->{max_urls} >= 0 && @urls > $fargs->{max_urls};
return 1 unless @urls;
for (@urls) { $_ = URI->new($_) }
my $match = 0;
URL:
for my $url (@urls) {
# scheme criteria
if (defined $fargs->{scheme_contains}) {
if ($fargs->{ignore_case} ?
index(lc($url->scheme), lc($fargs->{scheme_contains})) >= 0 :
index($url->scheme , $fargs->{scheme_contains}) >= 0) {
} else {
next;
}
}
if (defined $fargs->{scheme_not_contains}) {
if ($fargs->{ignore_case} ?
index(lc($url->scheme), lc($fargs->{scheme_not_contains})) < 0 :
index($url->scheme , $fargs->{scheme_not_contains}) < 0) {
} else {
next;
}
}
if (defined $fargs->{scheme_matches}) {
if ($fargs->{ignore_case} ?
$url->scheme =~ qr/$fargs->{scheme_matches}/i :
$url->scheme =~ qr/$fargs->{scheme_matches}/) {
} else {
next;
}
}
# host criteria
if (defined $fargs->{host_contains}) {
if ($fargs->{ignore_case} ?
index(lc($url->host), lc($fargs->{host_contains})) >= 0 :
index($url->host , $fargs->{host_contains}) >= 0) {
} else {
next;
}
}
if (defined $fargs->{host_not_contains}) {
if ($fargs->{ignore_case} ?
index(lc($url->host), lc($fargs->{host_not_contains})) < 0 :
index($url->host , $fargs->{host_not_contains}) < 0) {
} else {
next;
}
}
if (defined $fargs->{host_matches}) {
if ($fargs->{ignore_case} ?
$url->host =~ qr/$fargs->{host_matches}/i :
$url->host =~ qr/$fargs->{host_matches}/) {
} else {
next;
}
}
# path criteria
if (defined $fargs->{path_contains}) {
if ($fargs->{ignore_case} ?
index(lc($url->path), lc($fargs->{path_contains})) >= 0 :
index($url->path , $fargs->{path_contains}) >= 0) {
} else {
next;
}
}
if (defined $fargs->{path_not_contains}) {
if ($fargs->{ignore_case} ?
index(lc($url->path), lc($fargs->{path_not_contains})) < 0 :
index($url->path , $fargs->{path_not_contains}) < 0) {
} else {
next;
}
}
if (defined $fargs->{path_matches}) {
if ($fargs->{ignore_case} ?
$url->path =~ qr/$fargs->{path_matches}/i :
$url->path =~ qr/$fargs->{path_matches}/) {
} else {
next;
}
}
# query param criteria
if (defined $fargs->{query_param_contains}) {
for my $param (keys %{ $fargs->{query_param_contains} }) {
if ($fargs->{ignore_case} ?
index((lc($url->query_param($param)) // ''), lc($fargs->{query_param_contains}{$param})) >= 0 :
index(($url->query_param($param) // '') , $fargs->{query_param_contains}{$param}) >= 0) {
} else {
next URL;
}
}
}
if (defined $fargs->{query_param_not_contains}) {
for my $param (keys %{ $fargs->{query_param_not_contains} }) {
if ($fargs->{ignore_case} ?
index((lc($url->query_param($param)) // ''), lc($fargs->{query_param_not_contains}{$param})) < 0 :
index(($url->query_param($param) // '') , $fargs->{query_param_not_contains}{$param}) < 0) {
} else {
next URL;
}
}
}
if (defined $fargs->{query_param_matches}) {
for my $param (keys %{ $fargs->{query_param_matches} }) {
if ($fargs->{ignore_case} ?
($url->query_param($param) // '') =~ qr/$fargs->{query_param_matches}{$param}/i :
($url->query_param($param) // '') =~ qr/$fargs->{query_param_matches}{$param}/) {
} else {
next URL;
}
}
}
$match++; last;
}
$match;
};
AppBase::Grep::grep(%args);
},
);
1;
# ABSTRACT: Print lines having URL(s) (optionally of certain criteria) in them
__END__
=pod
=encoding UTF-8
=head1 NAME
App::grep::url - Print lines having URL(s) (optionally of certain criteria) in them
=head1 VERSION
This document describes version 0.004 of App::grep::url (from Perl distribution App-grep-url), released on 2021-08-01.
=head1 FUNCTIONS
=head2 grep_url
Usage:
grep_url(%args) -> [$status_code, $reason, $payload, \%result_meta]
Print lines having URL(s) (optionally of certain criteria) in them.
This is a grep-like utility that greps for URLs of certain criteria.
This function is not exported.
Arguments ('*' denotes required arguments):
=over 4
=item * B<color> => I<str> (default: "auto")
=item * B<count> => I<true>
Supress normal output, return a count of matching lines.
=item * B<files> => I<array[filename]>
=item * B<host_contains> => I<str>
=item * B<host_matches> => I<re>
=item * B<host_not_contains> => I<str>
=item * B<ignore_case> => I<bool>
=item * B<invert_match> => I<bool>
Invert the sense of matching.
=item * B<line_number> => I<true>
=item * B<max_urls> => I<int> (default: -1)
=item * B<min_urls> => I<uint> (default: 1)
=item * B<path_contains> => I<str>
=item * B<path_matches> => I<re>
=item * B<path_not_contains> => I<str>
=item * B<query_param_contains> => I<hash>
=item * B<query_param_matches> => I<hash>
=item * B<query_param_not_contains> => I<hash>
=item * B<quiet> => I<true>
=item * B<scheme_contains> => I<str>
=item * B<scheme_matches> => I<re>
=item * B<scheme_not_contains> => I<str>
=item * B<schemes> => I<array[str]> (default: ["http","file"])
=back
Returns an enveloped result (an array).
First element ($status_code) is an integer containing HTTP-like status code
(200 means OK, 4xx caller error, 5xx function error). Second element
($reason) is a string containing error message, or something like "OK" if status is
200. Third element ($payload) is the actual result, but usually not present when enveloped result is an error response ($status_code is not 2xx). Fourth
element (%result_meta) is called result metadata and is optional, a hash
that contains extra information, much like how HTTP response headers provide additional metadata.
Return value: (any)
=head1 HOMEPAGE
Please visit the project's homepage at L<https://metacpan.org/release/App-grep-url>.
=head1 SOURCE
Source repository is at L<https://github.com/perlancar/perl-App-grep-url>.
=head1 BUGS
Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-grep-url>
When submitting a bug or request, please include a test-file or a
patch to an existing test-file that illustrates the bug or desired
feature.
=head1 AUTHOR
perlancar <perlancar@cpan.org>
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2021 by perlancar@cpan.org.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut
( run in 2.392 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )