JSONL-Subset
view release on metacpan or search on metacpan
lib/JSONL/Subset.pm view on Meta::CPAN
package JSONL::Subset;
use strict;
use warnings;
use Exporter 'import';
use IO::File;
use List::Util qw(shuffle);
our @EXPORT_OK = qw(subset_jsonl);
sub subset_jsonl {
my %args = @_;
my ($infile, $outfile, $percent, $lines, $mode, $seed, $streaming) =
@args{qw/infile outfile percent lines mode seed streaming/};
die "infile, outfile, and percent or lines are required" unless $infile && $outfile && (defined $percent || defined $lines);
die "cannot specify percent and lines, must choose one or the other" if (defined $percent && defined $lines);
die "percent must be between 0 and 100" if (defined $percent && ($percent < 0 || $percent > 100));
die "Invalid mode: $mode" unless $mode =~ /^(random|start|end)$/;
$mode ||= 'random';
if (!defined $streaming || $streaming == 0) {
_subset_jsonl_inplace(
infile => $infile,
outfile => $outfile,
percent => $percent,
lines => $lines,
mode => $mode,
seed => $seed
);
} else {
_subset_jsonl_streaming(
infile => $infile,
outfile => $outfile,
percent => $percent,
lines => $lines,
mode => $mode,
seed => $seed
);
}
}
sub _subset_jsonl_inplace {
my %args = @_;
my ($infile, $outfile, $percent, $lines, $mode, $seed) =
@args{qw/infile outfile percent lines mode seed/};
my $in = IO::File->new($infile, "<:raw") or die "Can't read $infile: $!";
my @lines = grep { /^\s*[\{\[]/ } map { $_ } <$in>;
die "requested more lines ($lines) than infile contains (${scalar(@lines)})" if (defined $lines && $lines > scalar(@lines));
if ($mode eq 'random') {
srand($seed) if defined $seed;
@lines = shuffle(@lines);
}
my $count = 0;
if (defined $percent) {
$count = int(@lines * $percent / 100);
} else {
$count = $lines;
}
my @subset = $mode eq 'end'
? @lines[-$count..-1]
: @lines[0..$count-1];
my $out = IO::File->new($outfile, ">:raw") or die $!;
for my $line (@subset) {
print $out $line;
}
$out->close;
$in->close;
}
sub _subset_jsonl_streaming {
my %args = @_;
my ($infile, $outfile, $percent, $lines, $mode, $seed) =
@args{qw/infile outfile percent lines mode seed/};
my $in = IO::File->new($infile, "<:encoding(UTF-8)") or die "Can't read $infile: $!";
my $total = 0;
while (my $line = <$in>) {
$total++ if $line =~ /^\s*[\{\[]/;;
}
close $in;
die "requested more lines ($lines) than infile contains ($total)" if (defined $lines && $lines > $total);
my $count = 0;
if (defined $percent) {
$count = int($total * $percent / 100);
} else {
$count = $lines;
}
my %picked = ();
if ($mode eq 'start') {
%picked = map { $_ => 1 } 0 .. $count-1;
} elsif ($mode eq 'end') {
%picked = map { $_ => 1 } ($total-$count) .. ($total-1);
} else { # Random
srand($seed) if defined $seed;
for (my $i = 0; $i < $total; $i++) {
if (rand($total - $i) < $count) {
$picked{$i} = 1;
$count--;
last if $count == 0;
}
}
}
open $in, "<:raw", $infile or die $!;
open my $out, ">:raw", $outfile or die $!;
my $real = 0;
while (my $line = <$in>) {
next unless $line =~ /^\s*[\{\[]/;;
print $out $line if $picked{$real};
$real++;
if ($mode eq 'start' && $real >= $count) {
last;
}
}
close $in;
close $out;
}
1;
__END__
=head1 NAME
JSONL::Subset - Extract a percentage of lines from a JSONL file
=head1 SYNOPSIS
use JSONL::Subset qw(subset_jsonl);
subset_jsonl(
infile => "data.jsonl",
outfile => "subset.jsonl",
percent => 10,
mode => "random", # or "start", "end"
seed => 42,
streaming => 1
);
=head1 DESCRIPTION
This module helps you extract a subset of lines from a JSONL file, for sampling or inspection.
=head1 OPTIONS
=head2 infile
Path to the file you want to import from.
=head2 outfile
Path to where you want to save the export.
=head2 percent
Percentage of lines to retain.
=head2 lines
Number of lines to retain.
=head2 mode
- random returns random lines
- start returns lines from the start
- end returns lines from the end
=head2 seed
Only used with random, for reproducability. (optional)
=head2 streaming
If set, infile will be streamed line by line. This makes the process take less RAM, but more wall time.
Recommended for large JSONL files.
=cut
( run in 1.876 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )