streaming results from the CPAN

Lugh

view release on metacpan or search on metacpan

        - Full documentation for new APIs Lugh/Lugh::Inference

0.04    2026-01-18
        - Added KV Cache support for efficient incremental decoding - Lugh::KVCache
        - Lugh::Inference - New create_kvcache() and forward_with_cache() methods
        - New test file t/08-kvcache.t

0.03    2026-01-18
        - Added generate() method for multi-token autoregressive generation
        - Added sample_top_k() method for top-k sampling
        - Generation supports: greedy, top_p, top_k, temperature, streaming callbacks
        - Added EOS token stopping and callback-based early stopping
        - New test suite t/07-generate.t with 22 tests including exact output validation

0.02    2026-01-17
        - Added Flash Attention support via ggml_flash_attn_ext()
        - Added support for tied embeddings (output.weight = token_embd.weight)
        - Bundled TinyStories-656K test model (749KB) for self-contained tests

0.01    Date/time
        First version, released on an unsuspecting world.

lib/Lugh.xs view on Meta::CPAN

            }
            
            SvREFCNT_dec((SV*)logits_av);
            
            /* Add to results */
            av_push(result_av, newSViv(next_token));
            tokens[current_len] = next_token;
            current_len++;
            gen_count++;
            
            /* Call streaming callback if provided */
            if (callback) {
                dSP;
                int should_stop;
                
                ENTER;
                SAVETMPS;
                
                PUSHMARK(SP);
                XPUSHs(sv_2mortal(newSViv(next_token)));
                XPUSHs(sv_2mortal(newSViv(gen_count)));

lib/Lugh/Inference.pm view on Meta::CPAN

    
    my @prompt = $tokenizer->encode("Once upon a time");
    
    # Greedy generation
    my @tokens = $inference->generate(\@prompt,
        max_tokens => 50,
        greedy     => 1,
    );
    print $tokenizer->decode(\@tokens);
    
    # Creative generation with streaming
    @tokens = $inference->generate(\@prompt,
        max_tokens  => 100,
        temperature => 1.0,
        top_p       => 0.95,
        callback    => sub {
            my ($tok, $n) = @_;
            print $tokenizer->decode([$tok]);
            STDOUT->flush();
            return 0;
        },

lib/Lugh/Inference.pm view on Meta::CPAN

    my $inference = Lugh::Inference->new(model => $model);
    
    my @prompt = $tokenizer->encode("Once upon a time");
    my @generated = $inference->generate(\@prompt,
        max_tokens  => 100,
        temperature => 0.8,
        top_p       => 0.95,
    );
    print $tokenizer->decode(\@generated);

For streaming output:

    my @generated = $inference->generate(\@prompt,
        max_tokens  => 100,
        temperature => 0.8,
        callback    => sub {
            my ($token, $count) = @_;
            print $tokenizer->decode([$token]);
            STDOUT->flush();
            return 0;  # Continue
        },

lib/Lugh/Tokenizer.pm view on Meta::CPAN

    my $prompt = "<s>[INST] <<SYS>>
    You are a helpful assistant.
    <</SYS>>
    
    What is the capital of France? [/INST]";
    
    my @tokens = $tokenizer->encode($prompt, add_bos => 0);

=head2 Streaming Decode

    # Decode one token at a time (for streaming output)
    for my $token (@generated_tokens) {
        my $text = $tokenizer->decode([$token]);
        print $text;
        STDOUT->flush();
    }

=head1 LIMITATIONS

=over 4

t/0007-generate.t view on Meta::CPAN

# Test EOS token stops generation
# Generate with a single token and high max to see if EOS is hit
my @long_gen = $inference->generate(
    \@prompt_tokens,
    max_tokens => 50,
    greedy     => 1,
);

# Either outcome is acceptable for this model: the run ends early / on the
# EOS token, or it uses the whole max_tokens budget.  Record which one
# happened for diagnostics only.
my $stopped_at_eos = (scalar(@long_gen) < 50) || ($long_gen[-1] == $eos_id);
note($stopped_at_eos
    ? 'generation ended before max_tokens or on the EOS token'
    : 'generation ran to max_tokens without EOS');

# The invariant that must hold in both cases is that the max_tokens budget
# is never exceeded.  (The previous `ok($stopped_at_eos || 1, ...)` was
# always true and therefore could never detect a regression.)
# NOTE(review): assumes generate() returns only the newly generated tokens,
# not the prompt -- confirm against Lugh::Inference.
cmp_ok(scalar(@long_gen), '<=', 50, 'Generation may stop at EOS or max_tokens');

# Test streaming callback
my @callback_tokens;
my $callback_count = 0;
my @stream_gen = $inference->generate(
    \@prompt_tokens,
    max_tokens => 5,
    greedy     => 1,
    callback   => sub {
        my ($token, $count) = @_;
        push @callback_tokens, $token;
        $callback_count = $count;

( run in 1.003 second using v1.01-cache-2.11-cpan-df04353d9ac )