JSONL-Subset
view release on metacpan or search on metacpan
CHANGELOG.md view on Meta::CPAN
# Changelog
## v0.05
- Read and write files in raw mode, preserving line endings
- Use more efficient blank line regexp for processing JSONL files
- Improve memory efficiency of picking lines in streaming mode (S-Algorithm): we now allocate only one integer per picked line rather than one per line in the dataset
- Add tests for Windows line endings
- Add `CHANGELOG.md`
## v0.04
- Add ability to select `lines` as well as `percent`
## v0.03
- Add MIT license
- Fix some poorly written tests
- Document `streaming` mode
```perl
use JSONL::Subset qw(subset_jsonl);
subset_jsonl(
infile => "data.jsonl",
outfile => "subset.jsonl",
percent => 10,
mode => "random", # or "start", "end"
seed => 42,
streaming => 1
);
```
Or from the command line:
```
jsonl-subset --in data.jsonl --out sample.jsonl --percent 5 --mode random --seed 42 --streaming
```
## Options
### infile
Path to the file you want to import from.
### outfile
### mode
- random returns random lines
- start returns lines from the start
- end returns lines from the end
### seed
Only used with random, for reproducibility. (optional)
### streaming
If set, infile will be streamed line by line. This makes the process take less RAM, but more wall time.
Recommended for large JSONL files.
bin/jsonl-subset view on Meta::CPAN
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use JSONL::Subset qw(subset_jsonl);
# Command-line front end for JSONL::Subset::subset_jsonl.
# Every switch lands in %opt under its own name; switches that were not
# given are simply absent from the hash and therefore read back as undef.
my %opt;
my $usage = "Usage: $0 --in FILE --out FILE [--percent INT] [--lines INT] [--mode random|start|end] [--seed INT] [--streaming]\n";

GetOptions(
    \%opt,
    "in=s",
    "out=s",
    "percent=i",
    "lines=i",
    "mode=s",
    "seed=i",
    "streaming",
) or die $usage;

# Map the short switch names onto the argument names subset_jsonl expects;
# argument validation is left to the library.
subset_jsonl(
    infile    => $opt{in},
    outfile   => $opt{out},
    percent   => $opt{percent},
    lines     => $opt{lines},
    mode      => $opt{mode},
    seed      => $opt{seed},
    streaming => $opt{streaming},
);
lib/JSONL/Subset.pm view on Meta::CPAN
use warnings;
use Exporter 'import';
use IO::File;
use List::Util qw(shuffle);
our @EXPORT_OK = qw(subset_jsonl);
# subset_jsonl(%args)
#
# Public entry point: validates the named arguments, then hands them to the
# in-memory implementation (_subset_jsonl_inplace) or to the streaming
# implementation (_subset_jsonl_streaming).
#
# Named arguments:
#   infile    - path of the file to read (required)
#   outfile   - path of the file to write (required)
#   percent   - number between 0 and 100; mutually exclusive with lines
#   lines     - number of lines to pick; mutually exclusive with percent
#   mode      - "random", "start" or "end"; defaults to "random"
#   seed      - forwarded unchanged to the implementation (optional)
#   streaming - any true value selects the streaming implementation
#
# Dies with a descriptive message when a required argument is missing, when
# both percent and lines are given, when percent is out of range, or when
# mode is not one of the three accepted values.
#
# Returns whatever the selected implementation returns.
sub subset_jsonl {
    my %args = @_;
    my ($infile, $outfile, $percent, $lines, $mode, $seed, $streaming) =
        @args{qw/infile outfile percent lines mode seed streaming/};

    die "infile, outfile, and percent or lines are required"
        unless $infile && $outfile && (defined $percent || defined $lines);
    die "cannot specify percent and lines, must choose one or the other"
        if defined $percent && defined $lines;
    die "percent must be between 0 and 100"
        if defined $percent && ($percent < 0 || $percent > 100);

    # Apply the default BEFORE validating: matching an undefined mode against
    # the pattern would warn and then reject the call, so the default could
    # never take effect.
    $mode ||= 'random';

    # \A ... \z instead of ^ ... $ so that a value with a trailing newline
    # (e.g. "random\n") is rejected rather than silently accepted.
    die "Invalid mode: $mode" unless $mode =~ /\A(?:random|start|end)\z/;

    my %forward = (
        infile  => $infile,
        outfile => $outfile,
        percent => $percent,
        lines   => $lines,
        mode    => $mode,
        seed    => $seed,
    );

    # Plain truthiness: a numeric comparison would warn on a non-numeric
    # value such as "yes" and treat it as 0, i.e. as "not streaming".
    return $streaming
        ? _subset_jsonl_streaming(%forward)
        : _subset_jsonl_inplace(%forward);
}
lib/JSONL/Subset.pm view on Meta::CPAN
my $out = IO::File->new($outfile, ">:raw") or die $!;
for my $line (@subset) {
print $out $line;
}
$out->close;
$in->close;
}
sub _subset_jsonl_streaming {
my %args = @_;
my ($infile, $outfile, $percent, $lines, $mode, $seed) =
@args{qw/infile outfile percent lines mode seed/};
my $in = IO::File->new($infile, "<:encoding(UTF-8)") or die "Can't read $infile: $!";
my $total = 0;
while (my $line = <$in>) {
$total++ if $line =~ /^\s*[\{\[]/;;
}
lib/JSONL/Subset.pm view on Meta::CPAN
=head1 SYNOPSIS
use JSONL::Subset qw(subset_jsonl);
subset_jsonl(
infile => "data.jsonl",
outfile => "subset.jsonl",
percent => 10,
mode => "random", # or "start", "end"
seed => 42,
streaming => 1
);
=head1 DESCRIPTION
This module helps you extract a subset of lines from a JSONL file, for sampling or inspection.
=head1 OPTIONS
=head2 infile
lib/JSONL/Subset.pm view on Meta::CPAN
=head2 mode
- random returns random lines
- start returns lines from the start
- end returns lines from the end
=head2 seed
Only used with random, for reproducibility. (optional)
=head2 streaming
If set, infile will be streamed line by line. This makes the process take less RAM, but more wall time.
Recommended for large JSONL files.
=cut
percent => 30,
mode => "random",
seed => 1337
);
open my $r, "<", $filename_out_r or die $!;
my @rand_out = <$r>;
close $r;
is(scalar(@rand_out), 3, "random: got exactly 3 lines");
is_deeply(\@rand_out, ["{ \"id\": 9 }\n", "{ \"id\": 6 }\n", "{ \"id\": 7 }\n"], "random: got the right lines");
# Start mode (streaming)
# Requests 30 percent of $FIXTURE from the start with streaming enabled and
# expects the output file to hold exactly the records with ids 1, 2 and 3.
# NOTE(review): presumably $FIXTURE holds 10 records (ids 1..10) - confirm
# against the fixture file.
my ($fh_out_ss, $filename_out_ss) = tempfile();
subset_jsonl(
infile => $FIXTURE,
outfile => $filename_out_ss,
percent => 30,
mode => "start",
streaming => 1
);
# Read the generated file back, one array element per line.
open my $ss, "<", $filename_out_ss or die $!;
my @start_out_s = <$ss>;
close $ss;
is(scalar(@start_out_s), 3, "start (streaming): got exactly 3 lines");
is_deeply(\@start_out_s, ["{ \"id\": 1 }\n", "{ \"id\": 2 }\n", "{ \"id\": 3 }\n"], "start (streaming): got the right lines");
# End mode (streaming)
# Same request taken from the end of the file; expects ids 8, 9 and 10 in
# their original order.
my ($fh_out_es, $filename_out_es) = tempfile();
subset_jsonl(
infile => $FIXTURE,
outfile => $filename_out_es,
percent => 30,
mode => "end",
streaming => 1
);
open my $es, "<", $filename_out_es or die $!;
my @end_out_s = <$es>;
close $es;
is(scalar(@end_out_s), 3, "end (streaming): got exactly 3 lines");
is_deeply(\@end_out_s, ["{ \"id\": 8 }\n", "{ \"id\": 9 }\n", "{ \"id\": 10 }\n"], "end (streaming): got the right lines");
# Random mode (streaming)
# Random pick with a fixed seed so the expected ids (4, 7, 10) can be pinned.
# NOTE(review): the expected ids presumably depend on the RNG sequence that
# seed 1337 produces - confirm this is stable across the supported perls.
my ($fh_out_rs, $filename_out_rs) = tempfile();
subset_jsonl(
infile => $FIXTURE,
outfile => $filename_out_rs,
percent => 30,
mode => "random",
seed => 1337,
streaming => 1
);
open my $rs, "<", $filename_out_rs or die $!;
my @rand_out_s = <$rs>;
close $rs;
is(scalar(@rand_out_s), 3, "random (streaming): got exactly 3 lines");
is_deeply(\@rand_out_s, ["{ \"id\": 4 }\n", "{ \"id\": 7 }\n", "{ \"id\": 10 }\n"], "random (streaming): got the right lines");
# Start mode (lines)
my ($fh_out_sl, $filename_out_sl) = tempfile();
subset_jsonl(
infile => $FIXTURE,
outfile => $filename_out_sl,
lines => 3,
mode => "start"
);
open my $sl, "<", $filename_out_sl or die $!;
lines => 3,
mode => "random",
seed => 1337
);
open my $rl, "<", $filename_out_rl or die $!;
my @rand_out_l = <$rl>;
close $rl;
is(scalar(@rand_out_l), 3, "random (lines): got exactly 3 lines");
is_deeply(\@rand_out_l, ["{ \"id\": 9 }\n", "{ \"id\": 6 }\n", "{ \"id\": 7 }\n"], "random (lines): got the right lines");
# Start mode (streaming & lines)
# Requests an absolute count (lines => 3) instead of a percentage, from the
# start and with streaming enabled; expects ids 1, 2 and 3.
my ($fh_out_ss_l, $filename_out_ss_l) = tempfile();
subset_jsonl(
infile => $FIXTURE,
outfile => $filename_out_ss_l,
lines => 3,
mode => "start",
streaming => 1
);
# Read the generated file back, one array element per line.
open my $ss_l, "<", $filename_out_ss_l or die $!;
my @start_out_sl = <$ss_l>;
close $ss_l;
is(scalar(@start_out_sl), 3, "start (streaming & lines): got exactly 3 lines");
is_deeply(\@start_out_sl, ["{ \"id\": 1 }\n", "{ \"id\": 2 }\n", "{ \"id\": 3 }\n"], "start (streaming & lines): got the right lines");
# End mode (streaming & lines)
# Three lines taken from the end of the file; expects ids 8, 9 and 10 in
# their original order.
my ($fh_out_es_l, $filename_out_es_l) = tempfile();
subset_jsonl(
infile => $FIXTURE,
outfile => $filename_out_es_l,
lines => 3,
mode => "end",
streaming => 1
);
open my $es_l, "<", $filename_out_es_l or die $!;
my @end_out_sl = <$es_l>;
close $es_l;
is(scalar(@end_out_sl), 3, "end (streaming & lines): got exactly 3 lines");
is_deeply(\@end_out_sl, ["{ \"id\": 8 }\n", "{ \"id\": 9 }\n", "{ \"id\": 10 }\n"], "end (streaming & lines): got the right lines");
# Random mode (streaming & lines)
# Three random lines with a fixed seed; the expected ids (4, 7, 10) are the
# same ones the percent-based streaming test in this file expects.
# NOTE(review): the expected ids presumably depend on the RNG sequence that
# seed 1337 produces - confirm this is stable across the supported perls.
my ($fh_out_rs_l, $filename_out_rs_l) = tempfile();
subset_jsonl(
infile => $FIXTURE,
outfile => $filename_out_rs_l,
lines => 3,
mode => "random",
seed => 1337,
streaming => 1
);
open my $rs_l, "<", $filename_out_rs_l or die $!;
my @rand_out_sl = <$rs_l>;
close $rs_l;
is(scalar(@rand_out_sl), 3, "random (streaming & lines): got exactly 3 lines");
is_deeply(\@rand_out_sl, ["{ \"id\": 4 }\n", "{ \"id\": 7 }\n", "{ \"id\": 10 }\n"], "random (streaming & lines): got the right lines");
# Random mode (Windows line endings)
# Picks 30 percent of $WINDOWS_FIXTURE at random with a fixed seed and checks
# that each selected record still ends in "\r\n".
my ($fh_out_r_win, $filename_out_r_win) = tempfile();
subset_jsonl(
    infile  => $WINDOWS_FIXTURE,
    outfile => $filename_out_r_win,
    percent => 30,
    mode    => "random",
    seed    => 1337
);
# Read back with the :raw layer. On platforms whose default layer translates
# CRLF to LF on input, a plain "<" would strip the "\r" this test is checking
# for, so the comparison below could never match there.
open my $r_win, "<:raw", $filename_out_r_win or die $!;
my @rand_out_win = <$r_win>;
close $r_win;
# Descriptions carry "Windows line endings" so a failure here cannot be
# mistaken for the plain random-mode test, which otherwise has the same name.
is(scalar(@rand_out_win), 3, "random (Windows line endings): got exactly 3 lines");
is_deeply(\@rand_out_win, ["{ \"id\": 9 }\r\n", "{ \"id\": 6 }\r\n", "{ \"id\": 7 }\r\n"], "random (Windows line endings): got the right lines");
# Random mode (streaming, Windows line endings)
# Same check through the streaming implementation.
my ($fh_out_rs_win, $filename_out_rs_win) = tempfile();
subset_jsonl(
    infile    => $WINDOWS_FIXTURE,
    outfile   => $filename_out_rs_win,
    percent   => 30,
    mode      => "random",
    seed      => 1337,
    streaming => 1
);
# :raw for the same reason as above - keep the "\r" intact on every platform.
open my $rs_win, "<:raw", $filename_out_rs_win or die $!;
my @rand_out_s_win = <$rs_win>;
close $rs_win;
is(scalar(@rand_out_s_win), 3, "random (streaming, Windows line endings): got exactly 3 lines");
is_deeply(\@rand_out_s_win, ["{ \"id\": 4 }\r\n", "{ \"id\": 7 }\r\n", "{ \"id\": 10 }\r\n"], "random (streaming, Windows line endings): got the right lines");
( run in 0.268 second using v1.01-cache-2.11-cpan-4face438c0f )