Alien-FreeImage
view release on metacpan or search on metacpan
src/Source/OpenEXR/IlmImf/ImfDwaCompressor.cpp view on Meta::CPAN
csc709Inverse64 (_dctData[0]._buffer,
_dctData[1]._buffer,
_dctData[2]._buffer);
}
else
{
csc709Inverse (_dctData[0]._buffer[0],
_dctData[1]._buffer[0],
_dctData[2]._buffer[0]);
}
}
//
// Float -> Half conversion.
//
// If the block has a constant value, convert only the first pixel
// and replicate the result across all 64 entries of the block.
//
for (unsigned int comp = 0; comp < numComp; ++comp)
{
if (!blockIsConstant)
{
(*convertFloatToHalf64)
(&rowBlock[comp][blockx*64], _dctData[comp]._buffer);
}
else
{
#if IMF_HAVE_SSE2
__m128i *dst = (__m128i*)&rowBlock[comp][blockx*64];
dst[0] = _mm_set1_epi16
(((half)_dctData[comp]._buffer[0]).bits());
dst[1] = dst[0];
dst[2] = dst[0];
dst[3] = dst[0];
dst[4] = dst[0];
dst[5] = dst[0];
dst[6] = dst[0];
dst[7] = dst[0];
#else /* IMF_HAVE_SSE2 */
unsigned short *dst = &rowBlock[comp][blockx*64];
dst[0] = ((half)_dctData[comp]._buffer[0]).bits();
for (int i = 1; i < 64; ++i)
{
dst[i] = dst[0];
}
#endif /* IMF_HAVE_SSE2 */
} // blockIsConstant
} // comp
} // blockx
//
// At this point, rowBlock[][] holds the half-float nonlinear
// values in blocked order. We need to unblock the data, convert
// it back to linear, and write the results to _rowPtrs[].
//
// There is a fast-path for aligned rows, which helps
// things a little. Since this fast path is only valid
// for full 8-element wide blocks, the partial x blocks
// are broken into a separate loop below.
//
// At the moment, the fast path requires:
// * sse support
// * aligned row pointers
// * full 8-element wide blocks
//
for (int comp = 0; comp < numComp; ++comp)
{
//
// Test if we can use the fast path
//
#ifdef IMF_HAVE_SSE2
bool fastPath = true;
for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
{
if ((size_t)_rowPtrs[comp][y] & _SSE_ALIGNMENT_MASK)
fastPath = false;
}
if (fastPath)
{
//
// Handle all the full X blocks, in a fast path with sse2 and
// aligned row pointers
//
for (int y=8*blocky; y<8*blocky+maxY; ++y)
{
__m128i *dst = (__m128i *)_rowPtrs[comp][y];
__m128i *src = (__m128i *)&rowBlock[comp][(y & 0x7) * 8];
for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
{
//
// These may need some twiddling.
// Run with multiples of 8
//
_mm_prefetch ((char *)(src + 16), _MM_HINT_NTA);
unsigned short i0 = _mm_extract_epi16 (*src, 0);
unsigned short i1 = _mm_extract_epi16 (*src, 1);
unsigned short i2 = _mm_extract_epi16 (*src, 2);
unsigned short i3 = _mm_extract_epi16 (*src, 3);
unsigned short i4 = _mm_extract_epi16 (*src, 4);
unsigned short i5 = _mm_extract_epi16 (*src, 5);
unsigned short i6 = _mm_extract_epi16 (*src, 6);
( run in 0.332 second using v1.01-cache-2.11-cpan-9bca49b1385 )