Alien-FreeImage
view release on metacpan or search on metacpan
src/Source/OpenEXR/IlmImf/ImfDwaCompressorSimd.h view on Meta::CPAN
dst[27] = (float)srcHalf[24];
dst[28] = (float)srcHalf[31];
dst[29] = (float)srcHalf[40];
dst[30] = (float)srcHalf[44];
dst[31] = (float)srcHalf[53];
dst[32] = (float)srcHalf[10];
dst[33] = (float)srcHalf[19];
dst[34] = (float)srcHalf[23];
dst[35] = (float)srcHalf[32];
dst[36] = (float)srcHalf[39];
dst[37] = (float)srcHalf[45];
dst[38] = (float)srcHalf[52];
dst[39] = (float)srcHalf[54];
dst[40] = (float)srcHalf[20];
dst[41] = (float)srcHalf[22];
dst[42] = (float)srcHalf[33];
dst[43] = (float)srcHalf[38];
dst[44] = (float)srcHalf[46];
dst[45] = (float)srcHalf[51];
dst[46] = (float)srcHalf[55];
dst[47] = (float)srcHalf[60];
dst[48] = (float)srcHalf[21];
dst[49] = (float)srcHalf[34];
dst[50] = (float)srcHalf[37];
dst[51] = (float)srcHalf[47];
dst[52] = (float)srcHalf[50];
dst[53] = (float)srcHalf[56];
dst[54] = (float)srcHalf[59];
dst[55] = (float)srcHalf[61];
dst[56] = (float)srcHalf[35];
dst[57] = (float)srcHalf[36];
dst[58] = (float)srcHalf[48];
dst[59] = (float)srcHalf[49];
dst[60] = (float)srcHalf[57];
dst[61] = (float)srcHalf[58];
dst[62] = (float)srcHalf[62];
dst[63] = (float)srcHalf[63];
}
//
// If we can form the correct ordering in xmm registers,
// we can use F16C to convert from HALF -> FLOAT. However,
// making the correct order isn't trivial.
//
// We want to re-order a source 8x8 matrix from:
//
//    0  1  2  3  4  5  6  7           0  1  5  6 14 15 27 28
//    8  9 10 11 12 13 14 15           2  4  7 13 16 26 29 42
//   16 17 18 19 20 21 22 23           3  8 12 17 25 30 41 43
//   24 25 26 27 28 29 30 31           9 11 18 24 31 40 44 53   (A)
//   32 33 34 35 36 37 38 39   -->    10 19 23 32 39 45 52 54
//   40 41 42 43 44 45 46 47          20 22 33 38 46 51 55 60
//   48 49 50 51 52 53 54 55          21 34 37 47 50 56 59 61
//   56 57 58 59 60 61 62 63          35 36 48 49 57 58 62 63
//
// Which looks like a mess, right?
//
// Now, check out the NE/SW diagonals of (A). Along those lines,
// we have runs of contiguous values! If we rewrite (A) a bit, we get:
//
//    0
//    1  2
//    5  4  3
//    6  7  8  9
//   14 13 12 11 10
//   15 16 17 18 19 20
//   27 26 25 24 23 22 21            (B)
//   28 29 30 31 32 33 34 35
//   42 41 40 39 38 37 36
//   43 44 45 46 47 48
//   53 52 51 50 49
//   54 55 56 57
//   60 59 58
//   61 62
//   63
//
// In this ordering, the columns are the rows of (A). If we can 'transpose'
// (B), we'll achieve our goal. But we want this to fit nicely into
// xmm registers and still be able to load large runs efficiently.
// Also, notice that the odd rows are in ascending order, while
// the even rows are in descending order.
//
// If we 'fold' the bottom half up into the top, we can preserve ordered
// runs across rows, and still keep all the correct values in columns.
// After transposing, we'll need to rotate things back into place.
// This gives us:
//
//    0 | 42   41   40   39   38   37   36
//    1    2 | 43   44   45   46   47   48
//    5    4    3 | 53   52   51   50   49
//    6    7    8    9 | 54   55   56   57   (C)
//   14   13   12   11   10 | 60   59   58
//   15   16   17   18   19   20 | 61   62
//   27   26   25   24   23   22   21 | 63
//   28   29   30   31   32   33   34   35
//
// But hang on. We still have the backwards descending rows to deal with.
// Let's reverse the even rows so that all values are in ascending order:
//
//   36   37   38   39   40   41   42 |  0
//    1    2 | 43   44   45   46   47   48
//   49   50   51   52   53 |  3    4    5
//    6    7    8    9 | 54   55   56   57   (D)
//   58   59   60 | 10   11   12   13   14
//   15   16   17   18   19   20 | 61   62
//   63 | 21   22   23   24   25   26   27
//   28   29   30   31   32   33   34   35
//
// If we can form (D), we will then:
// 1) Reverse the even rows
// 2) Transpose
// 3) Rotate the rows
//
// and we'll have (A).
//
src/Source/OpenEXR/IlmImf/ImfDwaCompressorSimd.h view on Meta::CPAN
#endif /* IMF_HAVE_SSE2 */
//
// Full 8x8 Inverse DCT:
//
// Simple inverse DCT on an 8x8 block, with scalar ops only.
// Operates on data in-place.
//
// This is based on the iDCT formulation (y = frequency domain,
// x = spatial domain)
//
//   [x0]   [    ][y0]   [    ][y1]
//   [x1] = [ M1 ][y2] + [ M2 ][y3]
//   [x2]   [    ][y4]   [    ][y5]
//   [x3]   [    ][y6]   [    ][y7]
//
//   [x7]   [    ][y0]   [    ][y1]
//   [x6] = [ M1 ][y2] - [ M2 ][y3]
//   [x5]   [    ][y4]   [    ][y5]
//   [x4]   [    ][y6]   [    ][y7]
//
// where M1:               M2:
//
//   [a  c  a  f]          [b  d  e  g]
//   [a  f -a -c]          [d -g -b -e]
//   [a -f -a  c]          [e -b  g  d]
//   [a -c  a -f]          [g -e  d -b]
//
// and the constants are as defined below.
//
// If you know how many of the lower rows are zero, that can
// be passed in to help speed things up. If you don't know,
// just set zeroedRows=0.
//
//
// Default implementation
//
template <int zeroedRows>
void
dctInverse8x8_scalar (float *data)
{
const float a = .5f * cosf (3.14159f / 4.0f);
const float b = .5f * cosf (3.14159f / 16.0f);
const float c = .5f * cosf (3.14159f / 8.0f);
const float d = .5f * cosf (3.f*3.14159f / 16.0f);
const float e = .5f * cosf (5.f*3.14159f / 16.0f);
const float f = .5f * cosf (3.f*3.14159f / 8.0f);
const float g = .5f * cosf (7.f*3.14159f / 16.0f);
float alpha[4], beta[4], theta[4], gamma[4];
float *rowPtr = NULL;
//
// First pass - row wise.
//
// This looks less-compact than the description above in
// an attempt to fold together common sub-expressions.
//
for (int row = 0; row < 8 - zeroedRows; ++row)
{
rowPtr = data + row * 8;
alpha[0] = c * rowPtr[2];
alpha[1] = f * rowPtr[2];
alpha[2] = c * rowPtr[6];
alpha[3] = f * rowPtr[6];
beta[0] = b * rowPtr[1] + d * rowPtr[3] + e * rowPtr[5] + g * rowPtr[7];
beta[1] = d * rowPtr[1] - g * rowPtr[3] - b * rowPtr[5] - e * rowPtr[7];
beta[2] = e * rowPtr[1] - b * rowPtr[3] + g * rowPtr[5] + d * rowPtr[7];
beta[3] = g * rowPtr[1] - e * rowPtr[3] + d * rowPtr[5] - b * rowPtr[7];
theta[0] = a * (rowPtr[0] + rowPtr[4]);
theta[3] = a * (rowPtr[0] - rowPtr[4]);
theta[1] = alpha[0] + alpha[3];
theta[2] = alpha[1] - alpha[2];
gamma[0] = theta[0] + theta[1];
gamma[1] = theta[3] + theta[2];
gamma[2] = theta[3] - theta[2];
gamma[3] = theta[0] - theta[1];
rowPtr[0] = gamma[0] + beta[0];
rowPtr[1] = gamma[1] + beta[1];
rowPtr[2] = gamma[2] + beta[2];
rowPtr[3] = gamma[3] + beta[3];
rowPtr[4] = gamma[3] - beta[3];
rowPtr[5] = gamma[2] - beta[2];
rowPtr[6] = gamma[1] - beta[1];
rowPtr[7] = gamma[0] - beta[0];
}
//
// Second pass - column wise.
//
for (int column = 0; column < 8; ++column)
{
alpha[0] = c * data[16+column];
alpha[1] = f * data[16+column];
alpha[2] = c * data[48+column];
alpha[3] = f * data[48+column];
beta[0] = b * data[8+column] + d * data[24+column] +
e * data[40+column] + g * data[56+column];
beta[1] = d * data[8+column] - g * data[24+column] -
b * data[40+column] - e * data[56+column];
beta[2] = e * data[8+column] - b * data[24+column] +
g * data[40+column] + d * data[56+column];
( run in 0.865 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )