Alien-FreeImage

 view release on metacpan or  search on metacpan

src/Source/OpenEXR/IlmImf/ImfDwaCompressor.cpp  view on Meta::CPAN

                    csc709Inverse64 (_dctData[0]._buffer, 
                                     _dctData[1]._buffer, 
                                     _dctData[2]._buffer);

                }
                else
                {
                    csc709Inverse (_dctData[0]._buffer[0], 
                                   _dctData[1]._buffer[0], 
                                   _dctData[2]._buffer[0]);
                }
            }

            //
            // Float -> Half conversion. 
            //
            // If the block has a constant value, just convert the first pixel
            // and replicate it across all 64 entries of the block.
            //

            for (unsigned int comp = 0; comp < numComp; ++comp)
            {
                if (!blockIsConstant)
                {
                    (*convertFloatToHalf64)
                        (&rowBlock[comp][blockx*64], _dctData[comp]._buffer);
                }
                else
                {
                    #if IMF_HAVE_SSE2

                        __m128i *dst = (__m128i*)&rowBlock[comp][blockx*64];

                        dst[0] = _mm_set1_epi16
                            (((half)_dctData[comp]._buffer[0]).bits());

                        dst[1] = dst[0];
                        dst[2] = dst[0];
                        dst[3] = dst[0];
                        dst[4] = dst[0];
                        dst[5] = dst[0];
                        dst[6] = dst[0];
                        dst[7] = dst[0];

                    #else  /* IMF_HAVE_SSE2 */

                        unsigned short *dst = &rowBlock[comp][blockx*64];

                        dst[0] = ((half)_dctData[comp]._buffer[0]).bits();

                        for (int i = 1; i < 64; ++i)
                        {
                            dst[i] = dst[0];
                        }

                    #endif /* IMF_HAVE_SSE2 */
                } // blockIsConstant
            } // comp
        } // blockx

        //
        // At this point, rowBlock[][] holds the half-float nonlinear
        // values in blocked order. We need to unblock the data, convert
        // it back to linear, and write the results to _rowPtrs[].
        //
        // There is a fast-path for aligned rows, which helps
        // things a little. Since this fast path is only valid
        // for full 8-element wide blocks, the partial x blocks
        // are broken into a separate loop below.
        //
        // At the moment, the fast path requires:
        //   * SSE2 support (IMF_HAVE_SSE2)
        //   * aligned row pointers
        //   * full 8-element wide blocks
        //

        for (int comp = 0; comp < numComp; ++comp)
        {
            //
            // Test if we can use the fast path
            //

        #ifdef IMF_HAVE_SSE2

            bool fastPath = true;

            for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
            {
                if ((size_t)_rowPtrs[comp][y] & _SSE_ALIGNMENT_MASK)
                    fastPath = false;
            }

            if (fastPath)
            {
                //
                // Handle all the full X blocks, in a fast path with sse2 and
                // aligned row pointers
                //

                for (int y=8*blocky; y<8*blocky+maxY; ++y)
                {
                    __m128i *dst = (__m128i *)_rowPtrs[comp][y];
                    __m128i *src = (__m128i *)&rowBlock[comp][(y & 0x7) * 8];


                    for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
                    {
                        //
                        // These may need some twiddling.
                        // Run with multiples of 8
                        //

                        _mm_prefetch ((char *)(src + 16), _MM_HINT_NTA); 

                        unsigned short i0  = _mm_extract_epi16 (*src, 0);
                        unsigned short i1  = _mm_extract_epi16 (*src, 1);
                        unsigned short i2  = _mm_extract_epi16 (*src, 2);
                        unsigned short i3  = _mm_extract_epi16 (*src, 3);

                        unsigned short i4  = _mm_extract_epi16 (*src, 4);
                        unsigned short i5  = _mm_extract_epi16 (*src, 5);
                        unsigned short i6  = _mm_extract_epi16 (*src, 6);



( run in 0.332 second using v1.01-cache-2.11-cpan-9bca49b1385 )