Simd-Avx512

 view release on metacpan or  search on metacpan

lib/Simd/Avx512.pm  view on Meta::CPAN

 }

sub require32($)                                                                # Check that we have a string of 32 bits
 {my ($xmm) = @_;                                                               # DWord
  requireN(32, $xmm);
 }

sub require64($)                                                                # Check that we have a string of 64 bits
 {my ($xmm) = @_;                                                               # Bytes
  requireN(64, $xmm);
 }

sub require128($)                                                               # Check that we have a string of 128 bits
 {my ($xmm) = @_;                                                               # Word
  requireN(128, $xmm);
 }

sub require256($)                                                               # Check that we have a string of 256 bits
 {my ($xmm) = @_;                                                               # Dword
  requireN(256, $xmm);
 }

sub require512($)                                                               # Check that we have a string of 512 bits
 {my ($xmm) = @_;                                                               # Qword
  requireN(128, $xmm);
 }

sub require128or256or512($;$)                                                   # Check that we have a string of 128|256|512 bits in the first operand and optionally the same in the second operand
 {my ($xmm1, $xmm2) = @_;                                                       # Bytes, optional bytes
  my $l = length $xmm1;
  confess "128|256|512 bits required for first operand ($l)"    unless $l == 128 or $l == 256 or $l == 512;
  if (defined $xmm2)
   {my $m = length $xmm2;
    confess "128|256|512 bits required for second operand ($m)" unless $m == 128 or $m == 256 or $m == 512;
    confess "Operands must have same length($l,$m)" unless $l == $m;
   }
  $l
 }

sub require64or128or256or512($)                                                 # Check that we have a string of 64|128|256|512 bits
 {my ($xmm) = @_;                                                               # Bytes
  my $l = length $xmm;
  confess "64|128|256|512 bits required for operand"  unless $l == 64 or $l == 128 or $l == 256 or $l == 512;
  confess "Only zeros and ones allowed in operand"    unless $xmm =~ m(\A[01]+\Z);
 }

sub requireSameLength($$)                                                       # Check that the two operands have the same length
 {my ($xmm1, $xmm2) = @_;                                                       # Bytes, bytes
  my ($l, $L) = (length($xmm1), length($xmm2));
  confess "Operands have different lengths($l, $L)" unless $l == $L;
  $l
 }

sub requireNumber128or256or512($)                                               # Check that we have a number with a value of 128|256|512
 {my ($n) = @_;                                                                 # Number
  confess "128|256|512 required for operand" unless $n == 128 or $n == 256 or $n == 512;
 }

sub flipBitsUnderMask($$)                                                       # Flip the bits in a string where the corresponding  mask bit is 1 else leave the bit as is
 {my ($string, $mask) = @_;                                                     # Bit string, mask
  my $l = requireSameLength $string, $mask;
  my $f = '';
  for my $i(0..$l-1)                                                            # Each character in the string and mask
   {my $s = substr($string, $i, 1);
    $f .= substr($mask, $i, 1) eq '0' ? $s : $s eq '0' ? '1' : '0'
   }
  $f
 }

sub compareTwosComplement($$)                                                   # Compare two numbers in two's complement formats and return -1 if the first number is less than the second, 0 if they are equal, else +1
 {my ($a, $b) = @_;                                                             # First, second
  my $n = requireSameLength $a, $b;

  return -1 if substr($a, 0, 1) eq '1' and substr($b, 0, 1) eq '0';             # Leading sign bit
  return +1 if substr($a, 0, 1) eq '0' and substr($b, 0, 1) eq '1';

  for(1..$n)                                                                    # Non sign bits
   {return -1 if substr($a, $_, 1) eq '0' and substr($b, $_, 1) eq '1';
    return +1 if substr($a, $_, 1) eq '1' and substr($b, $_, 1) eq '0';
   }
  0                                                                             # Equal
 }

#D1 Instructions                                                                # Emulation of Avx512 instructions

sub PSLLDQ($$)                                                                  # Packed Shift Left Logical DoubleQword
 {my ($xmm1, $imm8) = @_;                                                       # Bytes, length of shift in bytes
  require128 $xmm1;                                                             # Check that we have a string of 128 bits
  substr($xmm1, $imm8 * 8).zBytes($imm8)
 }

sub VPSLLDQ($$)                                                                 # Packed Shift Left Logical DoubleQword
 {my ($xmm1, $imm8) = @_;                                                       # Bytes, length of shift in bytes
  require128or256or512 $xmm1;                                                   # Check that we have a string of 128 bits
  confess "0 - 15 for shift amount required" unless $imm8 >= 0 and $imm8 < 16;

  return PSLLDQ($xmm1, $imm8) if length($xmm1)                   == 128;

  return PSLLDQ(substr($xmm1,   0, 128), $imm8).
         PSLLDQ(substr($xmm1, 128, 128), $imm8) if length($xmm1) == 256;

  return PSLLDQ(substr($xmm1,   0, 128), $imm8).
         PSLLDQ(substr($xmm1, 128, 128), $imm8).
         PSLLDQ(substr($xmm1, 256, 128), $imm8).
         PSLLDQ(substr($xmm1, 384, 128), $imm8)
 }

sub PSRLDQ($$)                                                                  # Packed Shift Right Logical DoubleQword
 {my ($xmm1, $imm8) = @_;                                                       # Bytes, length of shift
  require128 $xmm1;                                                             # Check that we have a string of 128 bits
  zBytes($imm8).substr($xmm1, 0, 128 - $imm8 * 8)
 }

sub VPSRLDQ($$)                                                                 # Packed Shift Right Logical DoubleQword
 {my ($xmm1, $imm8) = @_;                                                       # Bytes, length of shift
  require128or256or512 $xmm1;                                                   # Check that we have a string of 128 bits
  confess "0 - 15 for shift amount required" unless $imm8 >= 0 and $imm8 < 16;

  return PSRLDQ($xmm1, $imm8) if length($xmm1)                   == 128;

  return PSRLDQ(substr($xmm1,   0, 128), $imm8).
         PSRLDQ(substr($xmm1, 128, 128), $imm8) if length($xmm1) == 256;

  return PSRLDQ(substr($xmm1,   0, 128), $imm8).
         PSRLDQ(substr($xmm1, 128, 128), $imm8).
         PSRLDQ(substr($xmm1, 256, 128), $imm8).
         PSRLDQ(substr($xmm1, 384, 128), $imm8)
 }

#D1 PCMP                                                                        # Packed CoMPare
#D2 PCMPEQ                                                                      # Packed CoMPare EQual

lib/Simd/Avx512.pm  view on Meta::CPAN

 }

#D2 VPCMPU                                                                      # Packed CoMPare Unsigned

sub vpcmpu($$$$$)                                                               #P Packed CoMPare Unsigned
 {my ($size, $k2, $xmm1, $xmm2, $op) = @_;                                      # Size of element in bits, input mask, bytes, bytes, test code

  require8or16or32or64 $size if $develop;                                       # We supply this parameter so we ought to get it right
  require64 $k2;                                                                # Mask
  require128or256or512 $xmm1, $xmm2;                                            # Check that we have a string of 128 bits in the first operand
  confess "Invalid op code $op" unless $op =~ m(\A(0|1|2|4|5|6)\Z);             # Test code

  my $T  =                                                                      # String tests
   [sub {return 1 if $_[0] eq $_[1]; 0},                                        # eq 0
    sub {return 1 if $_[0] lt $_[1]; 0},                                        # lt 1
    sub {return 1 if $_[0] le $_[1]; 0},                                        # le 2
    undef,
    sub {return 1 if $_[0] ne $_[1]; 0},                                        # ne 4
    sub {return 1 if $_[0] ge $_[1]; 0},                                        # ge 5
    sub {return 1 if $_[0] gt $_[1]; 0},                                        # gt 6
   ];

  my $N  = length($xmm1) / $size;                                               # Number of elements
  my $k1 = maskRegister;
     $k2 = substr($k2, -$N);                                                    # Relevant portion of mask
  for(0..$N-1)
   {next unless substr($k2, $_, 1) eq '1';                                      # Mask
    my $o = $_ * $size;
    substr($k1, $_, 1) = &{$$T[$op]}(substr($xmm1, $o, $size),                  # Compare according to code
                                     substr($xmm2, $o, $size)) ? '1' : '0';
   }

  substr(zBytes(8).substr($k1, 0, $N), -64)
 }

sub VPCMPUB($$$$)                                                               # Packed CoMPare Unsigned Byte
 {my ($k2, $xmm1, $xmm2, $op) = @_;                                             # Input mask, bytes, bytes, test code
  vpcmpu 8, $k2, $xmm1, $xmm2, $op
 }

sub VPCMPUW($$$$)                                                               # Packed CoMPare Unsigned Word
 {my ($k2, $xmm1, $xmm2, $op) = @_;                                             # Input mask, words, words, test code
  vpcmpu 16, $k2, $xmm1, $xmm2, $op
 }

sub VPCMPUD($$$$)                                                               # Packed CoMPare Unsigned Dword
 {my ($k2, $xmm1, $xmm2, $op) = @_;                                             # Input mask, dwords, dwords, test code
  vpcmpu 32, $k2, $xmm1, $xmm2, $op
 }

sub VPCMPUQ($$$$)                                                               # Packed CoMPare Unsigned Qword
 {my ($k2, $xmm1, $xmm2, $op) = @_;                                             # Input mask, qwords, qwords, test code
  vpcmpu 64, $k2, $xmm1, $xmm2, $op
 }

#D1 VPTEST                                                                      # Packed TEST
#D2 VPTESTM                                                                     # Packed TEST MASK

sub andAndTest($$)                                                              #P And two bit strings of the same length and return 0 if the result is 0 else 1
 {my ($a, $b) = @_;                                                             # Element, element
  my $N = requireSameLength $a, $b;                                             # Check that the two elements have the same length
  for(0..$N-1)                                                                  # Look for match
   {return 1 if substr($a, $_, 1) eq '1' and substr($b, $_, 1) eq '1';
   }
  0
 }

sub vptest($$$)                                                                 #P Packed TEST
 {my ($size, $xmm1, $xmm2) = @_;                                                # Size of element in bits, element, element

  require8or16or32or64 $size if $develop;                                       # We supply this parameter so we ought to get it right
  require128or256or512 $xmm1, $xmm2;                                            # Check that we have a string of 128 bits in the first operand

  my $N  = length($xmm1) / $size;                                               # Number of elements
  my $k1 = maskRegister;
  for(0..$N-1)
   {my $o = $_ * $size;
    substr($k1, $_, 1) = andAndTest(substr($xmm1, $o, $size),                   # Test two elements
                                    substr($xmm2, $o, $size)) ? '1' : '0';
   }

  substr(zBytes(8).substr($k1, 0, $N), -64)
 }

sub VPTESTMB($$)                                                                # Packed TEST Mask Byte
 {my ($xmm1, $xmm2) = @_;                                                       # Bytes, bytes
  vptest 8, $xmm1, $xmm2
 }

sub VPTESTMW($$)                                                                # Packed TEST Mask Word
 {my ($xmm1, $xmm2) = @_;                                                       # Words, words
  vptest 16, $xmm1, $xmm2
 }

sub VPTESTMD($$)                                                                # Packed TEST Mask Dword
 {my ($xmm1, $xmm2) = @_;                                                       # Dwords, dwords
  vptest 32, $xmm1, $xmm2
 }

sub VPTESTMQ($$)                                                                # Packed TEST Mask Quad
 {my ($xmm1, $xmm2) = @_;                                                       # Quads, quads
  vptest 64, $xmm1, $xmm2
 }

#D1 VPBROADCAST                                                                 # VPBROADCASTB - Packed BROADCAST Byte

sub VPBROADCASTB($$)                                                            # Packed TEST Mask Byte
 {my ($size, $b) = @_;                                                          # Size of target in bits, byte
  requireNumber128or256or512 $size;
  require8 $b;
  repeat($b, $size / 8)
 }

sub VPBROADCASTW($$)                                                            # Packed TEST Mask Word
 {my ($size, $w) = @_;                                                          # Size of target in bits, word
  requireNumber128or256or512 $size;
  require16 $w;
  repeat($w, $size / 16)
 }

sub VPBROADCASTD($$)                                                            # Packed TEST Mask Dword

lib/Simd/Avx512.pm  view on Meta::CPAN


sub VPINSRW($$$)                                                                # Packed INSeRt Word
 {my ($target, $word, $pos) = @_;                                               # Target element, word, position to insert byte expressed as number of words from lowest order word numbered 0
  require128or256or512 $target;
  require16 $word;
  confess "Invalid position $pos" if $pos < 0 or $pos > length($target) / 2;
  substr($target, -($pos+1)*16, 16) = $word;
  $target
 }

sub VPINSRD($$$)                                                                # Packed INSeRt Dword
 {my ($target, $dword, $pos) = @_;                                              # Target element, dword, position to insert byte expressed as number of dwords from lowest order dword numbered 0
  require128or256or512 $target;
  require32 $dword;
  confess "Invalid position $pos" if $pos < 0 or $pos > length($target) / 4;
  substr($target, -($pos+1)*32, 32) = $dword;
  $target
 }

sub VPINSRQ($$$)                                                                # Packed INSeRt Quad
 {my ($target, $qword, $pos) = @_;                                              # Target element, qword, position to insert byte expressed as number of dwords from lowest order qword numbered 0
  require128or256or512 $target;
  require64 $qword;
  confess "Invalid position $pos" if $pos < 0 or $pos > length($target) / 8;
  substr($target, -($pos+1)*64, 64) = $qword;
  $target
 }

#D1 VPLZCNT                                                                     # Packed Leading Zero CouNT

sub VPLZCNTD($)                                                                 # Packed Leading Zero CouNT Dword
 {my ($target) = @_;                                                            # Target element
  require128or256or512 $target;
  my $r = '';
  my $n = length($target) / 32;
  for(0..$n-1)
   {my $b = substr($target, $_*32, 32) =~ s(1.*\Z) ()sr;
    $r .= sprintf("%032b", length $b);
   }
  $r
 }

sub VPLZCNTQ($)                                                                 # Packed Leading Zero CouNT Qword
 {my ($target) = @_;                                                            # Target element
  require128or256or512 $target;
  my $r = '';
  my $n = length($target) / 64;
  for(0..$n-1)
   {my $b = substr($target, $_*64, 64) =~ s(1.*\Z) ()sr;
    $r .= sprintf("%064b", length $b);
   }
  $r
 }

#D1 Compress and Expand                                                         # Compress or expand
#D2 VPCOMPRESS                                                                  # Packed COMPRESS

sub vpcompress($$$$$)                                                           #P Packed COMPRESS
 {my ($size, $xmm1, $k2, $z, $xmm2) = @_;                                       # Size of each element in bits, Compression target, compression mask, clear upper elements, source to compress
  require64 $k2;
  my $n = require128or256or512 $xmm1, $xmm2;
  my $N = $n / $size;                                                           # Number of elements
  $xmm1 = '0' x length $xmm1 if $z;                                             # Clear target if requested
  my $p = 0;                                                                    # Position in target
  for(1..$N)                                                                    # Compress selected elements
   {if (substr($k2, -$_, 1) eq '1')
     {substr($xmm1, --$p * $size, $size) = substr($xmm2, -$_ * $size, $size)
     }
   }
  $xmm1
 }

sub VPCOMPRESSD($$$$)                                                           # Packed COMPRESS Dword
 {my ($xmm1, $k2, $z, $xmm2) = @_;                                              # Compression target, compression mask, clear upper elements, source to compress
  vpcompress 32, $xmm1, $k2, $z, $xmm2
 }

sub VPCOMPRESSQ($$$$)                                                           # Packed COMPRESS Qword
 {my ($xmm1, $k2, $z, $xmm2) = @_;                                              # Compression target, compression mask, clear upper elements, source to compress
  vpcompress 64, $xmm1, $k2, $z, $xmm2
 }

#D2 VPEXPAND                                                                    # Packed EXPAND

sub vpexpand($$$$$)                                                             #P Packed EXPAND
 {my ($size, $xmm1, $k2, $z, $xmm2) = @_;                                       # Size of each element in bits, Compression target, expansion mask, clear upper elements, source to expand
  require64 $k2;
  my $n = require128or256or512 $xmm1, $xmm2;
  my $N = $n / $size;                                                           # Number of elements
  $xmm1 = '0' x length $xmm1 if $z;                                             # Clear target if requested
  my $p = 0;                                                                    # Position in target
  for(1..$N)                                                                    # Compress selected elements
   {if (substr($k2, -$_, 1) eq '1')
     {substr($xmm2, -$_ * $size, $size) = substr($xmm1, --$p * $size, $size)
     }
   }
  $xmm1
 }

sub VPEXPANDD($$$$)                                                             # Packed EXPAND Dword
 {my ($xmm1, $k2, $z, $xmm2) = @_;                                              # Compression target, expansion mask, clear upper elements, source to expand
  vpexpand 32, $xmm1, $k2, $z, $xmm2
 }

sub VPEXPANDQ($$$$)                                                             # Packed EXPAND Qword
 {my ($xmm1, $k2, $z, $xmm2) = @_;                                              # Compression target, expansion mask, clear upper elements, source to expand
  vpexpand 64, $xmm1, $k2, $z, $xmm2
 }

#D0
#-------------------------------------------------------------------------------
# Export
#-------------------------------------------------------------------------------

use Exporter qw(import);

use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

@ISA          = qw(Exporter);
@EXPORT_OK    = qw(
);
%EXPORT_TAGS  = (all=>[@EXPORT, @EXPORT_OK]);

# podDocumentation

=pod

=encoding utf-8

=head1 Name

Simd::Avx512 - Emulate SIMD Avx512 instructions

=head1 Synopsis

Help needed please!

The instructions being emulated are illustrated at: L<https://www.officedaytime.com/simd512e/>
The instructions being emulated are described at: L<https://hjlebbink.github.io/x86doc/>

=head2 Example

Find the number of leading zeros in each of 8 quad words.

if (1) {
  my ($i, $od, $oq) = (
#Q0                                                               1                                                               2                                                               3                                                       ...
#D0                               1                               2                               3                               4                               5                               6                               7                       ...



( run in 1.569 second using v1.01-cache-2.11-cpan-39bf76dae61 )