AI-Embedding

 view release on metacpan or  search on metacpan

lib/AI/Embedding.pm  view on Meta::CPAN

package AI::Embedding;

use strict;
use warnings;

use HTTP::Tiny;
use JSON::PP;
use Data::CosineSimilarity;

# Module version, declared with `our` so it lives in the AI::Embedding package.
our $VERSION = '1.11';
# Replace the quoted version string with the result of evaluating it.
$VERSION = eval $VERSION;

# One HTTP::Tiny client, created when the file is loaded and used for every
# request made by _get_embedding.
my $http = HTTP::Tiny->new;

# Constructor: build an Embedding object from key/value attributes.
#
# Recognised attributes:
#   api   - API to use; defaults to 'OpenAI', the only accepted value
#   key   - API key
#   model - embedding model; defaults to 'text-embedding-ada-002'
#
# Invalid attributes do not raise an exception.  Instead the 'error'
# attribute is set to 'Invalid API' or 'API Key missing' (the latter wins
# when both apply).  Any 'error' value passed by the caller is discarded.
# Returns the attribute hash blessed into $class.
sub new {
    my ($class, %args) = @_;

    # Fill in defaults for anything the caller left out (or passed as false)
    $args{'api'}   ||= 'OpenAI';
    $args{'model'} ||= 'text-embedding-ada-002';

    # Later checks deliberately overwrite earlier ones
    my $problem = '';
    $problem = 'Invalid API'     if $args{'api'} ne 'OpenAI';
    $problem = 'API Key missing' if !$args{'key'};
    $args{'error'} = $problem;

    my $self = \%args;
    return bless $self, $class;
}

# Endpoint URL for each supported API, keyed by the object's 'api' attribute.
my %url = (
    'OpenAI' => 'https://api.openai.com/v1/embeddings',
);

# Placeholder HTTP headers for each supported API, built once when the file
# is loaded.  No object exists yet at that point, so the helper is given an
# explicit empty hashref and the stored Authorization value is 'Bearer '
# with no key.
#
# The helper is called with an explicit argument list on purpose: the
# `&name` form without parentheses makes the caller's current @_ visible to
# the helper, whose `shift` would then consume (and possibly modify)
# whatever argument happened to be there.
my %header = (
    'OpenAI' => _get_header_openai({}),
);

# Report whether the last operation succeeded.
# Returns true when the object's 'error' attribute is false (empty or
# unset), and false when an error message is stored.
sub success {
    my ($self) = @_;
    my $last_error = $self->{'error'};
    return !$last_error;
}

# Accessor for the message recorded by the last failed operation.
# Returns the object's 'error' attribute exactly as stored.
sub error {
    my ($self) = @_;
    my $message = $self->{'error'};
    return $message;
}

# Build the HTTP request headers for the OpenAI API.
#
# Takes the object (any hashref with a 'key' entry) as its only argument.
# If 'key' is undefined it is set to the empty string on the object itself,
# so the Authorization value is never built from undef.
# Returns a hashref with the 'Authorization' and 'Content-type' headers.
sub _get_header_openai {
    my $self = shift @_;

    if (!defined $self->{'key'}) {
        $self->{'key'} = '';
    }

    my %headers = (
        'Content-type'  => 'application/json',
        'Authorization' => 'Bearer ' . $self->{'key'},
    );
    return \%headers;
}

 # Fetch the raw Embedding response from the API.
 #
 # Parameters:
 #   $text - text to embed; sent as the 'input' field of the JSON body
 #           together with the object's 'model'
 # Returns the response hashref from the HTTP client unchanged; callers
 # inspect its 'success', 'reason' and 'content' entries.
 # Dies when the response content mentions 'invalid_api_key'.
 sub _get_embedding {
     my ($self, $text) = @_;

     my $response = $http->post($url{$self->{'api'}}, {
         # Built by the shared helper so the header definition lives in one
         # place instead of being repeated here by hand.
         'headers' => _get_header_openai($self),
         'content' => encode_json({
             input => $text,
             model => $self->{'model'},
         }),
     });

     # Defensive: only pattern-match content that is actually present
     my $content = $response->{'content'};
     if (defined $content && $content =~ /invalid_api_key/) {
         die 'Incorrect API Key - check your API Key is correct';
     }
     return $response;
 }

 # TODO:
 # Choose the header builder per API through %header and $self->{'api'}
 # Currently the OpenAI header helper is called directly

 # Added purely for testing - IGNORE!
 # Returns the %header entry stored for this object's 'api' attribute.
 sub _test {
     my ($self) = @_;
     my $api = $self->{'api'};
     return $header{$api};
 }

 # Return the Embedding for $text as a CSV string.
 #
 # Parameters:
 #   $text    - text to embed
 #   $verbose - optional; when defined, a failed request returns the
 #              response hashref instead of undef
 # Returns the embedding values joined with commas on success.  On failure
 # the 'error' attribute is set to 'HTTP Error - <reason>' and undef (or
 # the response, see $verbose) is returned.
 sub embedding {
     my ($self, $text, $verbose) = @_;

     # Clear any message left by an earlier call so that success() and
     # error() describe this call only.
     $self->{'error'} = '';

     my $response = $self->_get_embedding($text);
     if ($response->{'success'}) {
         my $embedding = decode_json($response->{'content'});
         return join(',', @{ $embedding->{'data'}[0]->{'embedding'} });
     }

     $self->{'error'} = 'HTTP Error - ' . $response->{'reason'};
     return $response if defined $verbose;

     # Explicit undef: this method always hands back exactly one scalar
     return undef;
 }

 # Return the Embedding for $text as a list of values.
 #
 # Parameters:
 #   $text    - text to embed
 #   $verbose - optional; when defined, a failed request returns the
 #              response hashref instead of nothing
 # Returns the embedding values as a list on success.  On failure the
 # 'error' attribute is set to 'HTTP Error - <reason>' and an empty list
 # (undef in scalar context) is returned, unless $verbose is defined.
 sub raw_embedding {
     my ($self, $text, $verbose) = @_;

     # Clear any message left by an earlier call so that success() and
     # error() describe this call only.
     $self->{'error'} = '';

     my $response = $self->_get_embedding($text);
     if ($response->{'success'}) {
         my $embedding = decode_json($response->{'content'});
         return @{ $embedding->{'data'}[0]->{'embedding'} };
     }

     $self->{'error'} = 'HTTP Error - ' . $response->{'reason'};
     return $response if defined $verbose;

     # Bare return: a caller assigning to an array gets an empty array on
     # failure rather than a one-element list holding undef.
     return;
 }

 # Return a pseudo-random test Embedding as a CSV string, without calling
 # any API.
 #
 # Parameters:
 #   $text      - optional; when true, the random number generator is
 #                seeded with the number of fields obtained by splitting
 #                $text on whitespace
 #   $dimension - optional number of values to generate (default 1536)
 # Clears the object's 'error' attribute.  Each generated value is
 # rand(2) - 1; the values are joined with commas.
 sub test_embedding {
     my ($self, $text, $dimension) = @_;

     $self->{'error'} = '';
     $dimension = 1536 if !defined $dimension;

     if ($text) {
         my @words = split /\s+/, $text;
         srand(scalar @words);
     }

     my @vector = map { rand(2) - 1 } 1 .. $dimension;
     return join ',', @vector;
 }

# Convert a CSV Embedding into a hashref.
#
# Parameters:
#   $embed_string - Embedding as a comma-separated string
# Returns a hashref whose keys are 'feature0', 'feature1', ... in the order
# the values appear in the string.  When $embed_string is undefined, sets
# the 'error' attribute to 'Nothing to compare!' and returns nothing.
sub _make_vector {
    my ($self, $embed_string) = @_;

    unless (defined $embed_string) {
        $self->{'error'} = 'Nothing to compare!';
        return;
    }

    my @values = split /,/, $embed_string;

    # Fill every featureN slot in one hash-slice assignment
    my %vector;
    @vector{ map { 'feature' . $_ } 0 .. $#values } = @values;
    return \%vector;
}

# Return a comparator bound to one fixed Embedding.
#
# Parameters:
#   $embed - reference Embedding as a CSV string
# Clears the 'error' attribute, converts $embed to a vector once, and
# returns a code reference.  The code reference takes another CSV Embedding,
# converts it, and returns the result of _compare_vector for the pair.
sub comparator {
    my ($self, $embed) = @_;
    $self->{'error'} = '';

    # Converted a single time and captured by the closure below
    my $reference = $self->_make_vector($embed);

    my $compare_to_reference = sub {
        my ($candidate_csv) = @_;
        my $candidate = $self->_make_vector($candidate_csv);
        return $self->_compare_vector($reference, $candidate);
    };
    return $compare_to_reference;
}

# Compare 2 Embeddings
#
# Parameters:
#   $embed1 - first Embedding as a CSV string
#   $embed2 - optional second Embedding as a CSV string; when undefined,
#             the object's 'comparator' attribute is used as the second
#             vector instead
# Returns the result of _compare_vector for the two vectors.  Returns
# nothing and sets the 'error' attribute when either Embedding is missing
# ('Nothing to compare!') or the two have a different number of values
# ('Embeds are unequal length').
sub compare {
    my ($self, $embed1, $embed2) = @_;

    # Start clean so error() and success() describe this call only
    $self->{'error'} = '';

    # An undefined first Embedding yields no vector; stop here instead of
    # dereferencing undef in the length check further down.
    my $vector1 = $self->_make_vector($embed1);
    if (!defined $vector1) {
        $self->{'error'} = 'Nothing to compare!';
        return;
    }

    my $vector2;
    if (defined $embed2) {
        $vector2 = $self->_make_vector($embed2);
    } else {
        $vector2 = $self->{'comparator'};
    }

    if (!defined $vector2) {
        $self->{'error'} = 'Nothing to compare!';
        return;
    }

    if (scalar(keys %$vector1) != scalar(keys %$vector2)) {
        $self->{'error'} = 'Embeds are unequal length';
        return;
    }

    return $self->_compare_vector($vector1, $vector2);
}

# Compare 2 Vectors
#
# Adds both vectors to a fresh Data::CosineSimilarity object under the
# labels 'label1' and 'label2', then returns the cosine reported for the
# similarity of those two labels.
sub _compare_vector {
    my ($self, $vector1, $vector2) = @_;

    my %vector_for = (
        'label1' => $vector1,
        'label2' => $vector2,
    );

    my $cs = Data::CosineSimilarity->new;
    for my $label ('label1', 'label2') {
        $cs->add($label => $vector_for{$label});
    }
    return $cs->similarity('label1', 'label2')->cosine;
}

1;

__END__

=encoding utf8

=head1 NAME

AI::Embedding - Perl module for working with text embeddings using various APIs

=head1 VERSION

Version 1.11

=head1 SYNOPSIS

    use AI::Embedding;

    my $embedding = AI::Embedding->new(
        api => 'OpenAI',
        key => 'your-api-key'
    );

    my $csv_embedding  = $embedding->embedding('Some sample text');
    my $test_embedding = $embedding->test_embedding('Some sample text');
    my @raw_embedding  = $embedding->raw_embedding('Some sample text');

    my $cmp = $embedding->comparator($csv_embedding2);

    my $similarity = $cmp->($csv_embedding1);
    my $similarity_with_other_embedding = $embedding->compare($csv_embedding1, $csv_embedding2);

=head1 DESCRIPTION

The L<AI::Embedding> module provides an interface for working with text embeddings using various APIs. It currently supports the L<OpenAI|https://www.openai.com> L<Embeddings API|https://platform.openai.com/docs/guides/embeddings/what-are-embeddings>...

Embeddings allow the meaning of passages of text to be compared for similarity.  This is more natural and useful to humans than using traditional keyword-based comparisons.

An Embedding is a multi-dimensional vector representing the meaning of a piece of text.  The Embedding vector is created by an AI Model.  The default model (OpenAI's C<text-embedding-ada-002>) produces a 1536 dimensional vector.  The resulting vector...

=head2 Comparator

Embeddings are used to compare similarity of meaning between two passages of text.  A typical work case is to store a number of pieces of text (e.g. articles or blogs) in a database and compare each one to some user supplied search text.  L<AI::Embed...

Alternatively, the C<comparator> method can be called with one Embedding.  It returns a code reference that takes a single Embedding and compares it to the Embedding from which the comparator was created.

When comparing multiple Embeddings to the same Embedding (such as search text) it is faster to use a C<comparator>.

=head1 CONSTRUCTOR

=head2 new

    my $embedding = AI::Embedding->new(



( run in 2.677 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )