Alien-FreeImage

 view release on metacpan or  search on metacpan

src/Source/OpenEXR/IlmImf/ImfDwaCompressorSimd.h  view on Meta::CPAN

    dst[27] = (float)srcHalf[24];
    dst[28] = (float)srcHalf[31];
    dst[29] = (float)srcHalf[40];

    dst[30] = (float)srcHalf[44];
    dst[31] = (float)srcHalf[53];
    dst[32] = (float)srcHalf[10];
    dst[33] = (float)srcHalf[19];
    dst[34] = (float)srcHalf[23];
    dst[35] = (float)srcHalf[32];
    dst[36] = (float)srcHalf[39];
    dst[37] = (float)srcHalf[45];
    dst[38] = (float)srcHalf[52];
    dst[39] = (float)srcHalf[54];

    dst[40] = (float)srcHalf[20];
    dst[41] = (float)srcHalf[22];
    dst[42] = (float)srcHalf[33];
    dst[43] = (float)srcHalf[38];
    dst[44] = (float)srcHalf[46];
    dst[45] = (float)srcHalf[51];
    dst[46] = (float)srcHalf[55];
    dst[47] = (float)srcHalf[60];
    dst[48] = (float)srcHalf[21];
    dst[49] = (float)srcHalf[34];

    dst[50] = (float)srcHalf[37];
    dst[51] = (float)srcHalf[47];
    dst[52] = (float)srcHalf[50];
    dst[53] = (float)srcHalf[56];
    dst[54] = (float)srcHalf[59];
    dst[55] = (float)srcHalf[61];
    dst[56] = (float)srcHalf[35];
    dst[57] = (float)srcHalf[36];
    dst[58] = (float)srcHalf[48];
    dst[59] = (float)srcHalf[49];

    dst[60] = (float)srcHalf[57];
    dst[61] = (float)srcHalf[58];
    dst[62] = (float)srcHalf[62];
    dst[63] = (float)srcHalf[63];
}


//
// If we can form the correct ordering in xmm registers,
// we can use F16C to convert from HALF -> FLOAT. However,
// making the correct order isn't trivial. 
// 
// We want to re-order a source 8x8 matrix from:
//
//  0  1  2  3  4  5  6  7       0  1  5  6 14 15 27 28
//  8  9 10 11 12 13 14 15       2  4  7 13 16 26 29 42
// 16 17 18 19 20 21 22 23       3  8 12 17 25 30 41 43
// 24 25 26 27 28 29 30 31       9 11 18 24 31 40 44 53   (A)
// 32 33 34 35 36 37 38 39  --> 10 19 23 32 39 45 52 54
// 40 41 42 43 44 45 46 47      20 22 33 38 46 51 55 60
// 48 49 50 51 52 53 54 55      21 34 37 47 50 56 59 61
// 56 57 58 59 60 61 62 63      35 36 48 49 57 58 62 63
//
// Which looks like a mess, right? 
//
// Now, check out the NE/SW diagonals of (A). Along those lines, 
// we have runs of contiguous values! If we rewrite (A) a bit, we get:
//
//  0
//  1  2
//  5  4  3
//  6  7  8  9
// 14 13 12 11 10
// 15 16 17 18 19 20
// 27 26 25 24 23 22 21            (B)
// 28 29 30 31 32 33 34 35
//    42 41 40 39 38 37 36
//       43 44 45 46 47 48
//          53 52 51 50 49
//             54 55 56 57
//                60 59 58
//                   61 62
//                      63
//
// In this ordering, the columns are the rows of (A). If we can 'transpose' 
// (B), we'll achieve our goal. But we want this to fit nicely into 
// xmm registers and still be able to load large runs efficiently.  
// Also, notice that the odd rows are in ascending order, while 
// the even rows are in descending order. 
//
// If we 'fold' the bottom half up into the top, we can preserve ordered
// runs across rows, and still keep all the correct values in columns. 
// After transposing, we'll need to rotate things back into place. 
// This gives us:
//
//  0 | 42   41   40   39   38   37   36
//  1    2 | 43   44   45   46   47   48
//  5    4    3 | 53   52   51   50   49
//  6    7    8    9 | 54   55   56   57      (C)
// 14   13   12   11   10 | 60   59   58
// 15   16   17   18   19   20 | 61   62
// 27   26   25   24   23   22   21 | 63
// 28   29   30   31   32   33   34   35
//
// But hang on. We still have the backwards descending rows to deal with.
// Let's reverse the even rows so that all values are in ascending order:
//
//  36   37  38   39   40   41   42 | 0
//  1    2 | 43   44   45   46   47   48
//  49   50  51   52   53 |  3    4    5  
//  6    7    8    9 | 54   55   56   57      (D)
// 58   59   60 | 10   11   12   13   14  
// 15   16   17   18   19   20 | 61   62
// 63 | 21   22   23   24   25   26   27 
// 28   29   30   31   32   33   34   35
//
// If we can form (D),  we will then:
//   1) Reverse the even rows
//   2) Transpose
//   3) Rotate the rows 
//
// and we'll have (A).
//

src/Source/OpenEXR/IlmImf/ImfDwaCompressorSimd.h  view on Meta::CPAN


#endif /* IMF_HAVE_SSE2 */


//
// Full 8x8 Inverse DCT:
//
// Simple inverse DCT on an 8x8 block, with scalar ops only.
//  Operates on data in-place.
//
// This is based on the iDCT formulation (y = frequency domain,
//                                        x = spatial domain)
//
//    [x0]    [        ][y0]    [        ][y1] 
//    [x1] =  [  M1    ][y2]  + [  M2    ][y3] 
//    [x2]    [        ][y4]    [        ][y5] 
//    [x3]    [        ][y6]    [        ][y7]
//
//    [x7]    [        ][y0]    [        ][y1] 
//    [x6] =  [  M1    ][y2]  - [  M2    ][y3] 
//    [x5]    [        ][y4]    [        ][y5] 
//    [x4]    [        ][y6]    [        ][y7]
//
// where M1:             M2:
//
//   [a  c  a   f]     [b  d  e  g]
//   [a  f -a  -c]     [d -g -b -e]
//   [a -f -a   c]     [e -b  g  d]
//   [a -c  a  -f]     [g -e  d -b]
//
// and the constants are as defined below.
//
// If you know how many of the lower rows are zero, that can
// be passed in to help speed things up. If you don't know, 
// just set zeroedRows=0.
//

//
// Default implementation
//

template <int zeroedRows>
void
dctInverse8x8_scalar (float *data)
{
    const float a = .5f * cosf (3.14159f / 4.0f);
    const float b = .5f * cosf (3.14159f / 16.0f);
    const float c = .5f * cosf (3.14159f / 8.0f);
    const float d = .5f * cosf (3.f*3.14159f / 16.0f);
    const float e = .5f * cosf (5.f*3.14159f / 16.0f);
    const float f = .5f * cosf (3.f*3.14159f / 8.0f);
    const float g = .5f * cosf (7.f*3.14159f / 16.0f);

    float alpha[4], beta[4], theta[4], gamma[4];

    float *rowPtr = NULL;

    //
    // First pass - row wise.
    //
    // This looks less-compact than the description above in
    // an attempt to fold together common sub-expressions.
    //

    for (int row = 0; row < 8 - zeroedRows; ++row)
    {
        rowPtr = data + row * 8;

        alpha[0] = c * rowPtr[2]; 
        alpha[1] = f * rowPtr[2]; 
        alpha[2] = c * rowPtr[6]; 
        alpha[3] = f * rowPtr[6]; 

        beta[0] = b * rowPtr[1] + d * rowPtr[3] + e * rowPtr[5] + g * rowPtr[7];
        beta[1] = d * rowPtr[1] - g * rowPtr[3] - b * rowPtr[5] - e * rowPtr[7];
        beta[2] = e * rowPtr[1] - b * rowPtr[3] + g * rowPtr[5] + d * rowPtr[7];
        beta[3] = g * rowPtr[1] - e * rowPtr[3] + d * rowPtr[5] - b * rowPtr[7];

        theta[0] = a * (rowPtr[0] + rowPtr[4]);
        theta[3] = a * (rowPtr[0] - rowPtr[4]);

        theta[1] = alpha[0] + alpha[3]; 
        theta[2] = alpha[1] - alpha[2]; 


        gamma[0] = theta[0] + theta[1];
        gamma[1] = theta[3] + theta[2];
        gamma[2] = theta[3] - theta[2];
        gamma[3] = theta[0] - theta[1];


        rowPtr[0] = gamma[0] + beta[0];
        rowPtr[1] = gamma[1] + beta[1];
        rowPtr[2] = gamma[2] + beta[2];
        rowPtr[3] = gamma[3] + beta[3];

        rowPtr[4] = gamma[3] - beta[3];
        rowPtr[5] = gamma[2] - beta[2];
        rowPtr[6] = gamma[1] - beta[1];
        rowPtr[7] = gamma[0] - beta[0];
    }

    //
    // Second pass - column wise.
    //

    for (int column = 0; column < 8; ++column)
    {
        alpha[0] = c * data[16+column]; 
        alpha[1] = f * data[16+column]; 
        alpha[2] = c * data[48+column]; 
        alpha[3] = f * data[48+column]; 

        beta[0] = b * data[8+column]  + d * data[24+column] +
                  e * data[40+column] + g * data[56+column];

        beta[1] = d * data[8+column]  - g * data[24+column] -
                  b * data[40+column] - e * data[56+column];

        beta[2] = e * data[8+column]  - b * data[24+column] + 
                  g * data[40+column] + d * data[56+column];



( run in 0.865 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )