AI-Embedding
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
Revision history for AI-Embedding
1.11 20th December 2023
Corrected minimum Perl version
1.10 18th December 2023
Die if an incorrect API key is provided instead of failing with a random error
1.01 10th June 2023
Corrected minor errors in POD including version number
1.0 9th June 2023
First release version
Added acknowledgments to Ken Cotterill and Hugo van der Sanden
0.1_4 8th June 2023
NOTE - This is still a development release - DO NOT USE IN PRODUCTION
Corrected badly declared dependency and opted for the core JSON::PP module
0.1_3 8th June 2023
NOTE - This is still a development release - DO NOT USE IN PRODUCTION
Changed the implementation of the 'comparator' method so it can be used multiple times
Corrected typos in POD
Updated POD to reflect changes
Added new tests to test updated 'comparator' method
0.1_2 2nd June 2023
NOTE - This is still a development release - DO NOT USE IN PRODUCTION
Added 'test_embedding' method to allow code testing without calling the chargeable API
Corrected authentication issues with OpenAI
Corrected issues causing CPAN Tests to fail
0.1_1 30th May 2023
First development version, released on an unsuspecting world.
Changes
lib/AI/Embedding.pm
Makefile.PL
MANIFEST This list of files
README
t/00-load.t
t/01-openai.t
t/02-test.t
t/manifest.t
t/pod-coverage.t
t/pod.t
t/version.t
META.yml Module YAML meta-data (added by MakeMaker)
META.json Module JSON meta-data (added by MakeMaker)
{
"abstract" : "Perl module for working with text embeddings using various APIs",
"author" : [
"Ian Boddison <bod@cpan.org>"
],
"dynamic_config" : 1,
"generated_by" : "ExtUtils::MakeMaker version 7.58, CPAN::Meta::Converter version 2.150010",
"license" : [
"perl_5"
],
"meta-spec" : {
"url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
"version" : 2
},
"name" : "AI-Embedding",
"no_index" : {
"directory" : [
"t",
"inc"
]
},
"prereqs" : {
"build" : {
"requires" : {
"ExtUtils::MakeMaker" : "0"
}
},
"configure" : {
"requires" : {
"ExtUtils::MakeMaker" : "0"
}
},
"runtime" : {
"requires" : {
"Data::CosineSimilarity" : "0.02",
"HTTP::Tiny" : "0.014",
"JSON::PP" : "2.00",
"perl" : "5.010"
}
},
"test" : {
"requires" : {
"Test::More" : "0"
}
}
},
"release_status" : "stable",
"version" : "1.11",
"x_serialization_backend" : "JSON::PP version 4.06"
}
---
abstract: 'Perl module for working with text embeddings using various APIs'
author:
- 'Ian Boddison <bod@cpan.org>'
build_requires:
ExtUtils::MakeMaker: '0'
Test::More: '0'
configure_requires:
ExtUtils::MakeMaker: '0'
dynamic_config: 1
generated_by: 'ExtUtils::MakeMaker version 7.58, CPAN::Meta::Converter version 2.150010'
license: perl
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
version: '1.4'
name: AI-Embedding
no_index:
directory:
- t
- inc
requires:
Data::CosineSimilarity: '0.02'
HTTP::Tiny: '0.014'
JSON::PP: '2.00'
perl: '5.010'
version: '1.11'
x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
Makefile.PL view on Meta::CPAN
# Build script: describes the AI-Embedding distribution to ExtUtils::MakeMaker
# and writes the Makefile.
use 5.006;
use strict;
use warnings;
use ExtUtils::MakeMaker;
# Everything handed to WriteMakefile() at the bottom of this script
my %WriteMakefileArgs = (
NAME => 'AI::Embedding',
AUTHOR => q{Ian Boddison <bod@cpan.org>},
# Version and abstract are read from the module file itself
VERSION_FROM => 'lib/AI/Embedding.pm',
ABSTRACT_FROM => 'lib/AI/Embedding.pm',
LICENSE => 'perl_5',
MIN_PERL_VERSION => '5.010',
CONFIGURE_REQUIRES => {
'ExtUtils::MakeMaker' => '0',
},
TEST_REQUIRES => {
'Test::More' => '0',
},
# Runtime prerequisites
PREREQ_PM => {
'JSON::PP' => '2.00',
'HTTP::Tiny' => '0.014',
'Data::CosineSimilarity' => '0.02',
},
# Compress the release archive with IO::Compress::Gzip through a perl
# one-liner; each "$$" is Makefile escaping for a single literal "$"
dist => { COMPRESS => q{perl -MIO::Compress::Gzip=gzip,:constants -e"my $$in = $$ARGV[0]; gzip($$in => qq($$in.gz), q(Level) => Z_BEST_COMPRESSION, q(BinModeIn) => 1) or die q(gzip failed); unlink $$in;"}, SUFFIX => 'gz', },
clean => { FILES => 'AI-Embedding-*' },
);
# Compatibility with old versions of ExtUtils::MakeMaker
# Before 6.64 there is no TEST_REQUIRES: fold those modules into PREREQ_PM
unless (eval { ExtUtils::MakeMaker->VERSION('6.64'); 1 }) {
my $test_requires = delete $WriteMakefileArgs{TEST_REQUIRES} || {};
@{$WriteMakefileArgs{PREREQ_PM}}{keys %$test_requires} = values %$test_requires;
}
# Before 6.55_03 there is no BUILD_REQUIRES: fold it into PREREQ_PM as well
# (no BUILD_REQUIRES key is set above, so this currently merges nothing)
unless (eval { ExtUtils::MakeMaker->VERSION('6.55_03'); 1 }) {
my $build_requires = delete $WriteMakefileArgs{BUILD_REQUIRES} || {};
@{$WriteMakefileArgs{PREREQ_PM}}{keys %$build_requires} = values %$build_requires;
}
# Drop each remaining key that the installed MakeMaker is too old to accept
delete $WriteMakefileArgs{CONFIGURE_REQUIRES}
unless eval { ExtUtils::MakeMaker->VERSION('6.52'); 1 };
delete $WriteMakefileArgs{MIN_PERL_VERSION}
unless eval { ExtUtils::MakeMaker->VERSION('6.48'); 1 };
delete $WriteMakefileArgs{LICENSE}
unless eval { ExtUtils::MakeMaker->VERSION('6.31'); 1 };
WriteMakefile(%WriteMakefileArgs);
AI-Embedding
The README is used to introduce the module and provide instructions on
how to install the module, any machine dependencies it may have (for
example C compilers and installed libraries) and any other information
that should be provided before the module is installed.
A README file is required for CPAN modules since CPAN extracts the README
file from a module distribution so that people browsing the archive
can use it to get an idea of the module's uses. It is usually a good idea
to provide version information here so that people can decide whether
fixes for the module are worth downloading.
INSTALLATION
To install this module, run the following commands:
perl Makefile.PL
make
make test
make install
SUPPORT AND DOCUMENTATION
After installing, you can find documentation for this module with the
perldoc command.
perldoc AI::Embedding
You can also look for information at:
RT, CPAN's request tracker (report bugs here)
https://rt.cpan.org/NoAuth/Bugs.html?Dist=AI-Embedding
CPAN Ratings
https://cpanratings.perl.org/d/AI-Embedding
Search CPAN
https://metacpan.org/release/AI-Embedding
LICENSE AND COPYRIGHT
This software is Copyright (c) 2023 by Ian Boddison.
This program is released under the following license:
Perl
lib/AI/Embedding.pm view on Meta::CPAN
package AI::Embedding;
use strict;
use warnings;
use HTTP::Tiny;
use JSON::PP;
use Data::CosineSimilarity;
# Distribution version; t/version.t compares it with the "Version" line in the POD
our $VERSION = '1.11';
# Re-evaluate the version string so $VERSION ends up holding the evaluated value
$VERSION = eval $VERSION;
# One HTTP::Tiny client, created when the module loads and used for every request
my $http = HTTP::Tiny->new;
# Constructor.
# Accepts key/value pairs: 'key' (the API key), 'api' (defaults to 'OpenAI')
# and 'model' (defaults to 'text-embedding-ada-002'); any other pairs are
# stored on the object untouched.
# Never dies: a problem is recorded in the 'error' field, with a missing key
# taking precedence over an unsupported API.
sub new {
    my ($class, @pairs) = @_;

    my %self = @pairs;
    $self{'error'} = '';
    $self{'api'} ||= 'OpenAI';

    if (!$self{'key'}) {
        $self{'error'} = 'API Key missing';
    } elsif ($self{'api'} ne 'OpenAI') {
        $self{'error'} = 'Invalid API';
    }

    $self{'model'} ||= 'text-embedding-ada-002';
    return bless \%self, $class;
}
# Endpoint URL for each supported API, keyed by the object's 'api' value
my %url = (
    'OpenAI' => 'https://api.openai.com/v1/embeddings',
);

# HTTP headers for each supported API, keyed by the same API name.
# The builder is called with an explicit, empty argument list. The previous
# '&_get_header_openai' form (no parentheses) handed the builder whatever @_
# was current while this file was being loaded, and the builder's 'shift'
# then removed an element from that caller's argument list.
# Built without an object, the Authorization value is just 'Bearer ';
# _get_embedding assembles the real per-object headers itself.
my %header = (
    'OpenAI' => _get_header_openai(),
);
# True when the object's 'error' field is empty or unset, false otherwise
sub success {
    my ($self) = @_;
    my $message = $self->{'error'};
    return !$message;
}
# Accessor for the message currently stored in the object's 'error' field
sub error {
    my ($self) = @_;
    return $self->{'error'};
}
# Build the request headers for OpenAI from the object's API key.
# Side effect: an undefined 'key' is replaced by an empty string on the
# object, so the concatenation below never sees undef.
# Returns a hash reference with 'Authorization' and 'Content-type'.
sub _get_header_openai {
    my ($self) = @_;

    if (!defined $self->{'key'}) {
        $self->{'key'} = '';
    }

    my %headers = (
        'Authorization' => 'Bearer ' . $self->{'key'},
        'Content-type'  => 'application/json',
    );
    return \%headers;
}
# POST $text and the object's model to the endpoint registered for the
# object's API and return the HTTP::Tiny response hash reference.
# Dies when the response body mentions 'invalid_api_key'; every other
# outcome, successful or not, is returned to the caller to interpret.
sub _get_embedding {
    my ($self, $text) = @_;

    my $endpoint = $url{ $self->{'api'} };

    my %request_headers = (
        'Authorization' => 'Bearer ' . $self->{'key'},
        'Content-type'  => 'application/json',
    );
    my $payload = encode_json({
        input => $text,
        model => $self->{'model'},
    });

    my $response = $http->post($endpoint, {
        'headers' => \%request_headers,
        content   => $payload,
    });

    die 'Incorrect API Key - check your API Key is correct'
        if $response->{'content'} =~ 'invalid_api_key';

    return $response;
}
# TODO:
# Make 'headers' use $header{$self->{'api'}}
# Currently hard coded to OpenAI
# Internal testing hook - not part of the public interface.
# Returns the %header entry registered for this object's API.
sub _test {
    my ($self) = @_;
    my $api = $self->{'api'};
    return $header{$api};
}
# Return the embedding for $text as a comma-separated string.
#
# Parameters:
#   $text    - the text to embed
#   $verbose - optional; when defined, a failed call returns the HTTP::Tiny
#              response hash reference instead of undef
# Returns the CSV string on success. On failure the reason is stored in the
# object's 'error' field and undef (or the response, see $verbose) is returned.
# Dies if _get_embedding dies (it does so for a rejected API key).
sub embedding {
    my ($self, $text, $verbose) = @_;

    # Start clean so success() describes this call rather than an earlier failure
    $self->{'error'} = '';

    my $response = $self->_get_embedding($text);
    if ($response->{'success'}) {
        # A successful status with an unparsable or unexpectedly shaped body
        # is reported through 'error' instead of dying inside decode_json
        my $vector = do {
            local $@;
            eval { decode_json($response->{'content'})->{'data'}[0]{'embedding'} };
        };
        return join(',', @$vector) if ref $vector eq 'ARRAY';
        $self->{'error'} = 'Invalid response - no embedding found';
    } else {
        $self->{'error'} = 'HTTP Error - ' . $response->{'reason'};
    }

    return $response if defined $verbose;
    # Explicit undef kept so list-context callers still receive one value
    return undef;
}
# Return the embedding for $text as a list of numbers.
#
# Parameters:
#   $text    - the text to embed
#   $verbose - optional; when defined, a failed call returns the HTTP::Tiny
#              response hash reference instead of undef
# Returns the embedding values on success. On failure the reason is stored in
# the object's 'error' field and undef (or the response, see $verbose) is returned.
# Dies if _get_embedding dies (it does so for a rejected API key).
sub raw_embedding {
    my ($self, $text, $verbose) = @_;

    # Start clean so success() describes this call rather than an earlier failure
    $self->{'error'} = '';

    my $response = $self->_get_embedding($text);
    if ($response->{'success'}) {
        # A successful status with an unparsable or unexpectedly shaped body
        # is reported through 'error' instead of dying inside decode_json
        my $vector = do {
            local $@;
            eval { decode_json($response->{'content'})->{'data'}[0]{'embedding'} };
        };
        return @$vector if ref $vector eq 'ARRAY';
        $self->{'error'} = 'Invalid response - no embedding found';
    } else {
        $self->{'error'} = 'HTTP Error - ' . $response->{'reason'};
    }

    return $response if defined $verbose;
    # Explicit undef kept so list-context callers still receive one value
    return undef;
}
# Return a stand-in embedding without calling the API.
#
# Parameters:
#   $text      - optional; when true, the result is repeatable for that text
#   $dimension - optional number of values, 1536 when not given
# Returns a comma-separated string of $dimension random numbers, each at
# least -1 and below 1. Clears the object's 'error' field. Note that this
# seeds Perl's global random number generator.
sub test_embedding {
    my ($self, $text, $dimension) = @_;
    $self->{'error'} = '';
    $dimension = 1536 unless defined $dimension;

    if ($text) {
        # Seed with the number of whitespace-separated fields, exactly as
        # before, so previously generated test embeddings are reproduced.
        # The fields go through an array because split in scalar context is
        # the deprecated "implicit split to @_" on Perl 5.10.
        my @words = split /\s+/, $text;
        srand scalar @words;
    } else {
        # Pick a fresh seed: without this, a call made after a seeded call
        # simply continues that seeded, predictable sequence
        srand;
    }

    my @vector = map { rand(2) - 1 } 1 .. $dimension;
    return join ',', @vector;
}
# Turn a comma-separated embedding string into a hash reference whose keys
# are 'feature0', 'feature1', ... in the order the values appear.
# For an undefined string, stores 'Nothing to compare!' in the object's
# 'error' field and returns nothing.
sub _make_vector {
    my ($self, $embed_string) = @_;

    unless (defined $embed_string) {
        $self->{'error'} = 'Nothing to compare!';
        return;
    }

    my @values = split /,/, $embed_string;
    my %vector;
    for my $position (0 .. $#values) {
        $vector{'feature' . $position} = $values[$position];
    }
    return \%vector;
}
# Build a reusable comparison against one fixed embedding.
# $embed (a CSV embedding string) is converted to a vector once, and the
# object's 'error' field is cleared. The returned code reference takes
# another CSV embedding string and returns what _compare_vector yields for
# the fixed vector and the newly converted one.
sub comparator {
    my ($self, $embed) = @_;
    $self->{'error'} = '';

    my $reference = $self->_make_vector($embed);

    my $compare_to_reference = sub {
        my $candidate = $self->_make_vector($_[0]);
        return $self->_compare_vector($reference, $candidate);
    };
    return $compare_to_reference;
}
# Compare two CSV embedding strings.
#
# Parameters:
#   $embed1 - first embedding, as a comma-separated string
#   $embed2 - second embedding; when undefined, the value stored under the
#             object's 'comparator' key is used instead (presumably a hash
#             reference shaped like _make_vector's result - nothing in this
#             file sets it, so confirm before relying on it)
# Returns what _compare_vector yields for the two vectors. On failure stores
# a message in the object's 'error' field and returns nothing.
sub compare {
    my ($self, $embed1, $embed2) = @_;

    # Start clean so success() describes this call rather than an earlier failure
    $self->{'error'} = '';

    # _make_vector records 'Nothing to compare!' itself for an undefined
    # string; stop here instead of going on to count the keys of an
    # undefined vector and reporting a misleading length mismatch
    my $vector1 = $self->_make_vector($embed1);
    return unless defined $vector1;

    my $vector2 = defined $embed2
        ? $self->_make_vector($embed2)
        : $self->{'comparator'};
    if (!defined $vector2) {
        $self->{'error'} = 'Nothing to compare!';
        return;
    }

    if (scalar keys %$vector1 != scalar keys %$vector2) {
        $self->{'error'} = 'Embeds are unequal length';
        return;
    }
    return $self->_compare_vector($vector1, $vector2);
}
# Cosine similarity of two feature hashes, computed by Data::CosineSimilarity.
# 'label1' and 'label2' are only the names the two vectors are registered
# under for the duration of this call.
sub _compare_vector {
    my $self = shift;
    my ($first, $second) = @_;

    my $calculator = Data::CosineSimilarity->new;
    $calculator->add( label1 => $first );
    $calculator->add( label2 => $second );

    my $result = $calculator->similarity('label1', 'label2');
    return $result->cosine;
}
1;
__END__
=encoding utf8
=head1 NAME
AI::Embedding - Perl module for working with text embeddings using various APIs
=head1 VERSION
Version 1.11
=head1 SYNOPSIS
use AI::Embedding;
my $embedding = AI::Embedding->new(
api => 'OpenAI',
key => 'your-api-key'
);
my $csv_embedding = $embedding->embedding('Some sample text');
my $test_embedding = $embedding->test_embedding('Some sample text');
my @raw_embedding = $embedding->raw_embedding('Some sample text');
my $cmp = $embedding->comparator($csv_embedding2);
my $similarity = $cmp->($csv_embedding1);
my $similarity_with_other_embedding = $embedding->compare($csv_embedding1, $csv_embedding2);
=head1 DESCRIPTION
The L<AI::Embedding> module provides an interface for working with text embeddings using various APIs. It currently supports the L<OpenAI|https://www.openai.com> L<Embeddings API|https://platform.openai.com/docs/guides/embeddings/what-are-embeddings>...
Embeddings allow the meaning of passages of text to be compared for similarity. This is more natural and useful to humans than using traditional keyword based comparisons.
An Embedding is a multi-dimensional vector representing the meaning of a piece of text. The Embedding vector is created by an AI Model. The default model (OpenAI's C<text-embedding-ada-002>) produces a 1536 dimensional vector. The resulting vector...
=head2 Comparator
Embeddings are used to compare similarity of meaning between two passages of text. A typical work case is to store a number of pieces of text (e.g. articles or blogs) in a database and compare each one to some user supplied search text. L<AI::Embed...
Alternatively, the C<comparator> method can be called with one Embedding. The C<comparator> returns a reference to a method that takes a single Embedding to be compared to the Embedding from which the Comparator was created.
When comparing multiple Embeddings to the same Embedding (such as search text) it is faster to use a C<comparator>.
=head1 CONSTRUCTOR
=head2 new
my $embedding = AI::Embedding->new(
api => 'OpenAI',
key => 'your-api-key',
model => 'text-embedding-ada-002',
);
Creates a new AI::Embedding object. The 'key' parameter, the API key issued by the service provider, is required.
Parameters:
=over
=item *
C<key> - B<required> The API Key
=item *
C<api> - The API to use. Currently only 'OpenAI' is supported and this is the default.
=item *
C<model> - The language model to use. Defaults to C<text-embedding-ada-002> - see L<OpenAI docs|https://platform.openai.com/docs/guides/embeddings/what-are-embeddings>
=back
=head1 METHODS
=head2 success
Returns true if the last method call was successful
=head2 error
Returns the last error message or an empty string if B<success> returned true
=head2 embedding
my $csv_embedding = $embedding->embedding('Some text passage', [$verbose]);
Generates an embedding for the given text and returns it as a comma-separated string. The C<embedding> method takes the text to generate the embedding for and an optional C<verbose> flag.
Returns a (rather long) string that can be stored in a C<TEXT> database field.
If the method call fails it sets the L</"error"> message and returns C<undef>. If the optional C<verbose> parameter is defined, the complete L<HTTP::Tiny> response (a hash reference) is returned instead of C<undef> to aid with debugging issues when using this module.
=head2 raw_embedding
my @raw_embedding = $embedding->raw_embedding('Some text passage', [$verbose]);
Generates an embedding for the given text and returns it as an array. The C<raw_embedding> method takes the text to generate the embedding for and an optional C<verbose> flag.
It is not normally necessary to use this method as the Embedding will almost always be used as a single homogeneous unit.
If the method call fails it sets the L</"error"> message and returns C<undef>. If the optional C<verbose> parameter is defined, the complete L<HTTP::Tiny> response (a hash reference) is returned instead of C<undef> to aid with debugging issues when using this module.
=head2 test_embedding
my $test_embedding = $embedding->test_embedding('Some text passage', $dimensions);
Used for testing code without making a chargeable call to the API.
Provides a CSV string of the same size and format as L</embedding> but with meaningless random data.
Returns a random embedding. Both parameters are optional. If a text string is provided, the returned embedding will always be the same random embedding otherwise it will be random and different every time. The C<dimension> parameter controls the n...
=head2 comparator
$embedding->comparator($csv_embedding2);
Sets a vector as a C<comparator> for future comparisons and returns a reference to a method for using the C<comparator>.
The B<comparator> method takes a single parameter, the comma-separated Embedding string to use as the comparator.
The following two are functionally equivalent. However, where multiple Embeddings are to be compared to a single Embedding, using a L</Comparator> is significantly faster.
my $similarity = $embedding->compare($csv_embedding1, $csv_embedding2);
my $cmp = $embedding->comparator($csv_embedding2);
my $similarity = $cmp->($csv_embedding1);
See L</"Comparator">
The returned method reference returns the cosine similarity between the Embedding used to call the C<comparator> method and the Embedding supplied to the method reference. See L</compare> for an explanation of the cosine similarity.
=head2 compare
my $similarity_with_other_embedding = $embedding->compare($csv_embedding1, $csv_embedding2);
Compares two embeddings and returns the cosine similarity between them. The B<compare> method takes two parameters: $csv_embedding1 and $csv_embedding2 (both comma-separated embedding strings).
Returns the cosine similarity as a floating-point number between -1 and 1, where 1 represents identical embeddings, 0 represents no similarity, and -1 represents opposite embeddings.
The absolute number is not usually relevant for text comparison. It is usually sufficient to rank the comparison results from high to low, so that they run from the best match to the worst match.
=head1 SEE ALSO
L<https://openai.com> - OpenAI official website
=head1 AUTHOR
Ian Boddison <ian at boddison.com>
=head1 BUGS
Please report any bugs or feature requests to C<bug-ai-embedding at rt.cpan.org>, or through
the web interface at L<https://rt.cpan.org/NoAuth/ReportBug.html?Queue=AI-Embedding>. I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc AI::Embedding
You can also look for information at:
=over 4
=item * RT: CPAN's request tracker (report bugs here)
L<https://rt.cpan.org/NoAuth/Bugs.html?Dist=AI-Embedding>
=item * Search CPAN
L<https://metacpan.org/release/AI-Embedding>
=back
=head1 ACKNOWLEDGEMENTS
Thanks to the help and support provided by members of Perl Monks L<https://perlmonks.org/>.
Especially L<Ken Cotterill (KCOTT)|https://metacpan.org/author/KCOTT> for assistance with unit tests and L<Hugo van der Sanden (HVDS)|https://metacpan.org/author/HVDS> for suggesting the current C<comparator> implementation.
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2023 by Ian Boddison.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut
t/00-load.t view on Meta::CPAN
#!perl
# Smoke test: checks that AI::Embedding can be loaded.
use 5.006;
use strict;
use warnings;
# Plans exactly one test - the use_ok call below
use Test::More tests => 1;
#plan tests => 2;
# use_ok runs inside BEGIN so the module is loaded during compilation;
# if it fails, "Bail out!" is printed to standard output
BEGIN {
use_ok( 'AI::Embedding' ) || print "Bail out!\n";
}
# Report the module version, the Perl version ($]) and the interpreter path ($^X)
diag( "Testing AI::Embedding $AI::Embedding::VERSION, Perl $], $^X" );
t/01-openai.t view on Meta::CPAN
#!perl
# Construction and comparison checks. The API key used here is a dummy value
# and none of the calls below sends a request.
use 5.006;
use strict;
use warnings;
use Test::More;
use AI::Embedding;

# Largest accepted gap between a computed similarity and the exact value 1.
# The similarity is a floating-point result, so it is checked against a
# tolerance rather than compared as a string.
my $tolerance = 1e-9;

# A ten-value embedding and a deliberately shorter one
my $vector       = '-0.6,-0.5,-0.4,-0.3,-0.2,0.0,0.2,0.3,0.4,0.5';
my $short_vector = '-0.6,-0.5,-0.4,-0.3,-0.2';

# No key at all
my $embed_fail1 = AI::Embedding->new();
ok( $embed_fail1->isa( 'AI::Embedding' ), 'Instantiation' );
ok( !$embed_fail1->success, 'Key Error during object creation' );

# A key, but an API that is not supported
my $embed_fail2 = AI::Embedding->new(
    'key' => '0123456789',
    'api' => 'Not Allowed',
);
ok( $embed_fail2->isa( 'AI::Embedding' ), 'Instantiation' );
ok( !$embed_fail2->success, 'API Error during object creation' );

# Valid arguments
my $embed_pass = AI::Embedding->new(
    'key' => '0123456789',
    'api' => 'OpenAI',
);
ok( $embed_pass->isa( 'AI::Embedding' ), 'Instantiation' );
ok( $embed_pass->success, 'Successful object creation' );

# Embeddings of different lengths cannot be compared
my $comp_fail = $embed_pass->compare($vector, $short_vector);
ok( !defined $comp_fail, 'Compare mismatch returns undef' );
ok( !$embed_pass->success, 'Compare mismatch' );
is( $embed_pass->error, 'Embeds are unequal length', 'Correct error message' );

# An embedding compared with itself has similarity 1
my $comp_pass1 = $embed_pass->compare($vector, $vector);
cmp_ok( abs($comp_pass1 - 1), '<', $tolerance, "Compare got $comp_pass1" );

# The same comparison through a comparator
my $cmp = $embed_pass->comparator($vector);
ok( $embed_pass->success, "Comparator created" );
ok( defined $cmp, "Comparator exists" );
my $comp_pass2 = $cmp->($vector);
cmp_ok( abs($comp_pass2 - 1), '<', $tolerance, "Compare to comparator got $comp_pass2" );

done_testing(13);
t/02-test.t view on Meta::CPAN
#!perl
# Checks test_embedding, which produces stand-in embeddings without calling the API.
use 5.006;
use strict;
use warnings;
use Test::More;
use AI::Embedding;

my $embed_pass = AI::Embedding->new(
    'key' => '0123456789',
    'api' => 'OpenAI',
);
ok( $embed_pass->isa( 'AI::Embedding' ), 'Instantiation' );
ok( $embed_pass->success, 'Successful object creation' );

my $test_string1 = 'The cat sat on the mat';
my $test_string2 = 'Hickory dickory dock';

# The values are counted through an array: split in scalar context is the
# deprecated "implicit split to @_" on Perl 5.10
my $embed1  = $embed_pass->test_embedding($test_string1);
my @values1 = split /,/, $embed1;
is( scalar @values1, 1536, "Correct first embed length" );

my $embed2  = $embed_pass->test_embedding($test_string2);
my @values2 = split /,/, $embed2;
is( scalar @values2, 1536, "Correct second embed length" );

# ok() rather than is()/isnt() here: a failure would otherwise dump two
# 1536-value strings into the diagnostics
my $embed3 = $embed_pass->test_embedding($test_string2);
ok( $embed2 eq $embed3, "Same text - same test embedding" );
ok( $embed2 ne $embed1, "Different text - different test embedding" );

done_testing(6);
t/manifest.t view on Meta::CPAN
#!perl
# Author test: checks the MANIFEST against the files in the distribution.
use 5.006;
use strict;
use warnings;
use Test::More;

# Only run when the author asks for release tests
plan( skip_all => "Author tests not required for installation" )
    if !$ENV{RELEASE_TESTING};

# Lowest Test::CheckManifest version this test will run with
my $min_tcm = 0.9;
eval "use Test::CheckManifest $min_tcm";
if ($@) {
    plan skip_all => "Test::CheckManifest $min_tcm required";
}

ok_manifest();
t/pod-coverage.t view on Meta::CPAN
#!perl
# Author test: checks that the module's subroutines are covered by POD.
use 5.006;
use strict;
use warnings;
use Test::More;

# Only run when the author asks for release tests
plan( skip_all => "Author tests not required for installation" )
    if !$ENV{RELEASE_TESTING};

# Modules this test needs, each with the lowest version it will run with.
# Test::Pod::Coverage doesn't require a minimum Pod::Coverage version,
# but older versions don't recognize some common documentation styles,
# so Pod::Coverage is checked separately.
my @requirements = (
    [ 'Test::Pod::Coverage', 1.08 ],
    [ 'Pod::Coverage',       0.18 ],
);

for my $requirement (@requirements) {
    my ($module, $minimum) = @$requirement;
    eval "use $module $minimum";
    if ($@) {
        plan skip_all => "$module $minimum required for testing POD coverage";
    }
}

all_pod_coverage_ok();
#!perl
# Author test: checks the POD in the distribution with Test::Pod.
use 5.006;
use strict;
use warnings;
use Test::More;

# Only run when the author asks for release tests
plan( skip_all => "Author tests not required for installation" )
    if !$ENV{RELEASE_TESTING};

# Lowest Test::Pod version this test will run with
my $min_tp = 1.22;
eval "use Test::Pod $min_tp";
if ($@) {
    plan skip_all => "Test::Pod $min_tp required for testing POD";
}

all_pod_files_ok();
t/version.t view on Meta::CPAN
#!/usr/bin/perl
# Author test: the version in the POD's VERSION section must agree with $VERSION.
use warnings;
use strict;
use Test::More;
use AI::Embedding;

unless ( $ENV{RELEASE_TESTING} ) {
    plan( skip_all => "Author tests not required for installation" );
}

my $code_version = $AI::Embedding::VERSION;
ok($code_version, 'version set');

# Read the module file that was actually loaded
my $module_path = $INC{'AI/Embedding.pm'};
my $opened      = open(my $source, '<', $module_path);
my $open_error  = $!;
ok($opened, 'open the source') or diag("Cannot open $module_path: $open_error");

my $in_version   = 0;
my $pod_versions = 0;
if ($opened) {
    while (my $line = <$source>) {
        # Track whether the current line lies inside the "=head1 VERSION" section
        if ($line =~ /^=head1 VERSION/) {
            $in_version = 1;
        } elsif ($line =~ /^=head1/) {
            $in_version = 0;
        }

        if ($in_version && $line =~ /^Version ([0-9.]+)/) {
            # Copy the capture before calling anything that may run a regex
            my $pod_version = $1;
            # Numeric comparison: the module evaluates $VERSION, so a string
            # comparison would reject "1.10" in the POD against 1.1 in the code
            cmp_ok($code_version, '==', $pod_version, 'pod version');
            $pod_versions++;
        }
    }
    close $source;
}

# Without this the test would pass silently if the POD had no version line
ok($pod_versions, 'POD contains a version line');

done_testing();
view all matches for this distributionview release on metacpan - search on metacpan
( run in 1.769 second using v1.00-cache-2.02-grep-82fe00e-cpan-72ae3ad1e6da )