JSONL-Subset

 view release on metacpan or  search on metacpan

CHANGELOG.md  view on Meta::CPAN

# Changelog

## v0.05

- Read and write files in raw mode, preserving line endings
- Use more efficient blank line regexp for processing JSONL files
- Improve memory efficiency of picking lines in streaming mode (S-Algorithm): we now allocate only one integer per picked line, rather than one per line in the dataset
- Add tests for Windows line endings
- Add `CHANGELOG.md`

## v0.04

- Add ability to select `lines` as well as `percent`

## v0.03

- Add MIT license
- Fix some poorly written tests
- Document `streaming` mode

README.md  view on Meta::CPAN


```perl
use JSONL::Subset qw(subset_jsonl);

subset_jsonl(
    infile    => "data.jsonl",
    outfile   => "subset.jsonl",
    percent   => 10,
    mode      => "random", # or "start", "end"
    seed      => 42,
    streaming => 1
);
```

Or from the command line:

```
jsonl-subset --in data.jsonl --out sample.jsonl --percent 5 --mode random --seed 42 --streaming
```

## Options

### infile

Path to the file you want to import from.

### outfile

README.md  view on Meta::CPAN

### mode

- random returns random lines
- start returns lines from the start
- end returns lines from the end

### seed

Only used with random, for reproducibility. (optional)

### streaming

If set, infile will be streamed line by line. This makes the process take less RAM, but more wall time.

Recommended for large JSONL files.

bin/jsonl-subset  view on Meta::CPAN

#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long;
use JSONL::Subset qw(subset_jsonl);

# Command-line front end for JSONL::Subset::subset_jsonl.
#
# Options:
#   --in FILE        input JSONL file (required)
#   --out FILE       output file (required)
#   --percent INT    share of lines to keep (one of --percent / --lines is required)
#   --lines INT      number of lines to keep (one of --percent / --lines is required)
#   --mode MODE      random, start or end (defaults to random)
#   --seed INT       seed passed through to subset_jsonl
#   --streaming      ask subset_jsonl to use its streaming mode

# Usage text shared by every argument-error path.
my $usage = "Usage: $0 --in FILE --out FILE (--percent INT | --lines INT) [--mode random|start|end] [--seed INT] [--streaming]\n";

my ($infile, $outfile, $percent, $lines, $seed, $streaming);

# Default the mode here: GetOptions leaves the variable untouched when --mode
# is absent, so subset_jsonl never receives an undefined mode to validate.
my $mode = 'random';

GetOptions(
    "in=s"      => \$infile,
    "out=s"     => \$outfile,
    "percent=i" => \$percent,
    "lines=i"   => \$lines,
    "mode=s"    => \$mode,
    "seed=i"    => \$seed,
    "streaming" => \$streaming,
) or die $usage;

# Report missing mandatory options with the usage text. Conflicting or
# out-of-range values are still reported by subset_jsonl itself.
die $usage
    unless defined $infile
        && defined $outfile
        && (defined $percent || defined $lines);

subset_jsonl(
    infile    => $infile,
    outfile   => $outfile,
    percent   => $percent,
    lines     => $lines,
    mode      => $mode,
    seed      => $seed,
    streaming => $streaming,
);

lib/JSONL/Subset.pm  view on Meta::CPAN

use warnings;

use Exporter 'import';
use IO::File;
use List::Util qw(shuffle);

our @EXPORT_OK = qw(subset_jsonl);

# Write a subset of the lines of a JSONL file to another file.
#
# Named arguments:
#   infile    - path of the file to read (required)
#   outfile   - path of the file to write (required)
#   percent   - share of lines to keep, between 0 and 100
#   lines     - number of lines to keep
#               (exactly one of percent / lines must be given)
#   mode      - "random", "start" or "end"; defaults to "random"
#   seed      - passed through to the worker routine (optional)
#   streaming - when defined and not numerically zero, the streaming worker
#               is used; otherwise the non-streaming worker is used
#
# Dies when required arguments are missing, when both percent and lines are
# given, when percent is out of range, or when mode is not a known value.
# Returns whatever the selected worker routine returns.
sub subset_jsonl {
    my %args = @_;
    my ($infile, $outfile, $percent, $lines, $mode, $seed, $streaming) =
        @args{qw/infile outfile percent lines mode seed streaming/};

    die "infile, outfile, and percent or lines are required" unless $infile && $outfile && (defined $percent || defined $lines);
    die "cannot specify percent and lines, must choose one or the other" if (defined $percent && defined $lines);
    die "percent must be between 0 and 100" if (defined $percent && ($percent < 0 || $percent > 100));

    # Apply the default BEFORE validating. Matching an omitted (undefined)
    # mode against the pattern would always fail, making the default
    # unreachable and rejecting callers that simply left mode out.
    $mode ||= 'random';
    die "Invalid mode: $mode" unless $mode =~ /^(random|start|end)$/;

    # Both workers take the same named arguments.
    my %worker_args = (
        infile  => $infile,
        outfile => $outfile,
        percent => $percent,
        lines   => $lines,
        mode    => $mode,
        seed    => $seed,
    );

    if (!defined $streaming || $streaming == 0) {
        return _subset_jsonl_inplace(%worker_args);
    }

    return _subset_jsonl_streaming(%worker_args);
}

lib/JSONL/Subset.pm  view on Meta::CPAN

    my $out = IO::File->new($outfile, ">:raw") or die $!;

    for my $line (@subset) {
        print $out $line;
    }
    $out->close;

    $in->close;
}

sub _subset_jsonl_streaming {
    my %args = @_;
    my ($infile, $outfile, $percent, $lines, $mode, $seed) =
        @args{qw/infile outfile percent lines mode seed/};

    my $in = IO::File->new($infile, "<:encoding(UTF-8)") or die "Can't read $infile: $!";
    my $total = 0;

    while (my $line = <$in>) {
        $total++ if $line =~ /^\s*[\{\[]/;;
    }

lib/JSONL/Subset.pm  view on Meta::CPAN

=head1 SYNOPSIS

  use JSONL::Subset qw(subset_jsonl);

  subset_jsonl(
      infile    => "data.jsonl",
      outfile   => "subset.jsonl",
      percent   => 10,
      mode      => "random",  # or "start", "end"
      seed      => 42,
      streaming => 1
  );

=head1 DESCRIPTION

This module helps you extract a subset of lines from a JSONL file, for sampling or inspection.

=head1 OPTIONS

=head2 infile

lib/JSONL/Subset.pm  view on Meta::CPAN

=head2 mode

=over 4

=item * random returns random lines

=item * start returns lines from the start

=item * end returns lines from the end

=back

=head2 seed

Only used with random, for reproducibility. (optional)

=head2 streaming

If set, infile will be streamed line by line. This makes the process take less RAM, but more wall time.

Recommended for large JSONL files.

=cut

t/basic.t  view on Meta::CPAN

	percent => 30,
	mode => "random",
	seed => 1337
);
open my $r, "<", $filename_out_r or die $!;
my @rand_out = <$r>;
close $r;
is(scalar(@rand_out), 3, "random: got exactly 3 lines");
is_deeply(\@rand_out, ["{ \"id\": 9 }\n", "{ \"id\": 6 }\n", "{ \"id\": 7 }\n"], "random: got the right lines");

# Streaming run, "start" mode, 30 percent: the records with ids 1, 2 and 3
# are expected, in that order.
my ($fh_out_ss, $filename_out_ss) = tempfile();
subset_jsonl(
	streaming => 1,
	mode      => "start",
	percent   => 30,
	infile    => $FIXTURE,
	outfile   => $filename_out_ss,
);
open(my $ss, "<", $filename_out_ss) or die $!;
my @start_out_s = readline($ss);
close($ss);
is(scalar(@start_out_s), 3, "start (streaming): got exactly 3 lines");
is_deeply(
	\@start_out_s,
	[ map("{ \"id\": $_ }\n", 1 .. 3) ],
	"start (streaming): got the right lines",
);

# Streaming run, "end" mode, 30 percent: the records with ids 8, 9 and 10
# are expected, in that order.
my ($fh_out_es, $filename_out_es) = tempfile();
subset_jsonl(
	streaming => 1,
	mode      => "end",
	percent   => 30,
	infile    => $FIXTURE,
	outfile   => $filename_out_es,
);
open(my $es, "<", $filename_out_es) or die $!;
my @end_out_s = readline($es);
close($es);
is(scalar(@end_out_s), 3, "end (streaming): got exactly 3 lines");
is_deeply(
	\@end_out_s,
	[ map("{ \"id\": $_ }\n", 8 .. 10) ],
	"end (streaming): got the right lines",
);

# Streaming run, "random" mode with a fixed seed, 30 percent: the records
# with ids 4, 7 and 10 are expected, in that order.
my ($fh_out_rs, $filename_out_rs) = tempfile();
subset_jsonl(
	streaming => 1,
	mode      => "random",
	seed      => 1337,
	percent   => 30,
	infile    => $FIXTURE,
	outfile   => $filename_out_rs,
);
open(my $rs, "<", $filename_out_rs) or die $!;
my @rand_out_s = readline($rs);
close($rs);
is(scalar(@rand_out_s), 3, "random (streaming): got exactly 3 lines");
is_deeply(
	\@rand_out_s,
	[ map("{ \"id\": $_ }\n", 4, 7, 10) ],
	"random (streaming): got the right lines",
);

# Start mode (lines)
my ($fh_out_sl, $filename_out_sl) = tempfile();
subset_jsonl(
	infile => $FIXTURE,
	outfile => $filename_out_sl,
	lines => 3,
	mode => "start"
);
open my $sl, "<", $filename_out_sl or die $!;

t/basic.t  view on Meta::CPAN

	lines => 3,
	mode => "random",
	seed => 1337
);
open my $rl, "<", $filename_out_rl or die $!;
my @rand_out_l = <$rl>;
close $rl;
is(scalar(@rand_out_l), 3, "random (lines): got exactly 3 lines");
is_deeply(\@rand_out_l, ["{ \"id\": 9 }\n", "{ \"id\": 6 }\n", "{ \"id\": 7 }\n"], "random (lines): got the right lines");

# Streaming run, "start" mode, explicit count of 3 lines: the records with
# ids 1, 2 and 3 are expected, in that order.
my ($fh_out_ss_l, $filename_out_ss_l) = tempfile();
subset_jsonl(
	streaming => 1,
	mode      => "start",
	lines     => 3,
	infile    => $FIXTURE,
	outfile   => $filename_out_ss_l,
);
open(my $ss_l, "<", $filename_out_ss_l) or die $!;
my @start_out_sl = readline($ss_l);
close($ss_l);
is(scalar(@start_out_sl), 3, "start (streaming & lines): got exactly 3 lines");
is_deeply(
	\@start_out_sl,
	[ map("{ \"id\": $_ }\n", 1 .. 3) ],
	"start (streaming & lines): got the right lines",
);

# Streaming run, "end" mode, explicit count of 3 lines: the records with
# ids 8, 9 and 10 are expected, in that order.
my ($fh_out_es_l, $filename_out_es_l) = tempfile();
subset_jsonl(
	streaming => 1,
	mode      => "end",
	lines     => 3,
	infile    => $FIXTURE,
	outfile   => $filename_out_es_l,
);
open(my $es_l, "<", $filename_out_es_l) or die $!;
my @end_out_sl = readline($es_l);
close($es_l);
is(scalar(@end_out_sl), 3, "end (streaming & lines): got exactly 3 lines");
is_deeply(
	\@end_out_sl,
	[ map("{ \"id\": $_ }\n", 8 .. 10) ],
	"end (streaming & lines): got the right lines",
);

# Streaming run, "random" mode with a fixed seed, explicit count of 3 lines:
# the records with ids 4, 7 and 10 are expected, in that order.
my ($fh_out_rs_l, $filename_out_rs_l) = tempfile();
subset_jsonl(
	streaming => 1,
	mode      => "random",
	seed      => 1337,
	lines     => 3,
	infile    => $FIXTURE,
	outfile   => $filename_out_rs_l,
);
open(my $rs_l, "<", $filename_out_rs_l) or die $!;
my @rand_out_sl = readline($rs_l);
close($rs_l);
is(scalar(@rand_out_sl), 3, "random (streaming & lines): got exactly 3 lines");
is_deeply(
	\@rand_out_sl,
	[ map("{ \"id\": $_ }\n", 4, 7, 10) ],
	"random (streaming & lines): got the right lines",
);

# Random mode (Windows line endings)
# The expectations below contain literal "\r\n", so the output is read back
# in raw mode: a CRLF-translating default layer (as used by perl on Windows)
# would otherwise turn "\r\n" into "\n" before the comparison.
# The descriptions name the Windows case so failures are distinguishable
# from the plain "random" test that uses the same seed.
my ($fh_out_r_win, $filename_out_r_win) = tempfile();
subset_jsonl(
	infile => $WINDOWS_FIXTURE,
	outfile => $filename_out_r_win,
	percent => 30,
	mode => "random",
	seed => 1337
);
open my $r_win, "<:raw", $filename_out_r_win or die $!;
my @rand_out_win = <$r_win>;
close $r_win;
is(scalar(@rand_out_win), 3, "random (Windows line endings): got exactly 3 lines");
is_deeply(\@rand_out_win, ["{ \"id\": 9 }\r\n", "{ \"id\": 6 }\r\n", "{ \"id\": 7 }\r\n"], "random (Windows line endings): got the right lines");

# Random mode (streaming, Windows line endings)
# The expectations below contain literal "\r\n", so the output is read back
# in raw mode: a CRLF-translating default layer (as used by perl on Windows)
# would otherwise turn "\r\n" into "\n" before the comparison.
# The descriptions name the Windows case so failures are distinguishable
# from the plain "random (streaming)" test that uses the same seed.
my ($fh_out_rs_win, $filename_out_rs_win) = tempfile();
subset_jsonl(
	infile => $WINDOWS_FIXTURE,
	outfile => $filename_out_rs_win,
	percent => 30,
	mode => "random",
	seed => 1337,
	streaming => 1
);
open my $rs_win, "<:raw", $filename_out_rs_win or die $!;
my @rand_out_s_win = <$rs_win>;
close $rs_win;
is(scalar(@rand_out_s_win), 3, "random (streaming, Windows line endings): got exactly 3 lines");
is_deeply(\@rand_out_s_win, ["{ \"id\": 4 }\r\n", "{ \"id\": 7 }\r\n", "{ \"id\": 10 }\r\n"], "random (streaming, Windows line endings): got the right lines");



( run in 0.268 second using v1.01-cache-2.11-cpan-4face438c0f )