flag results from the CPAN

Alien-XGBoost

    // Load partially-full, aligned tile using the pixel iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<true>  is_aligned)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        int valid_pixels = valid_samples / NUM_CHANNELS;

        // Load using a wrapped pixel iterator
        BlockLoadPixelT(temp_storage.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples),
            valid_pixels);
    }

    // Load partially-full, mis-aligned tile using sample iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        BlockLoadSampleT(temp_storage.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples),
            valid_samples);
    }


    //---------------------------------------------------------------------
    // Tile processing
    //---------------------------------------------------------------------

    // Consume a tile of data samples
    template <
        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
        bool IS_FULL_TILE>      // Whether the tile is full
    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
    {
        SampleT     samples[PIXELS_PER_THREAD][NUM_CHANNELS];
        bool        is_valid[PIXELS_PER_THREAD];

        // Load tile
        LoadTile(
            block_offset,
            valid_samples,
            samples,
            Int2Type<IS_FULL_TILE>(),
            Int2Type<IS_ALIGNED>());

        // Set valid flags
        #pragma unroll
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

        // Accumulate samples
#if CUB_PTX_ARCH >= 120
        if (prefer_smem)
            AccumulateSmemPixels(samples, is_valid);
        else
            AccumulateGmemPixels(samples, is_valid);
#else
        AccumulateGmemPixels(samples, is_valid);
#endif

    }


    // Consume row tiles.  Specialized for work-stealing from queue
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                   ///< The number of rows in the region of interest
        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
        int                 tiles_per_row,              ///< Number of image tiles per row
        GridQueue<int>      tile_queue,
        Int2Type<true>      is_work_stealing)
    {

        int         num_tiles                   = num_rows * tiles_per_row;
        int         tile_idx                    = (blockIdx.y  * gridDim.x) + blockIdx.x;
        OffsetT     num_even_share_tiles        = gridDim.x * gridDim.y;

        while (tile_idx < num_tiles)
        {
            int     row             = tile_idx / tiles_per_row;
            int     col             = tile_idx - (row * tiles_per_row);
            OffsetT row_offset      = row * row_stride_samples;
            OffsetT col_offset      = (col * TILE_SAMPLES);
            OffsetT tile_offset     = row_offset + col_offset;

            if (col == tiles_per_row - 1)
            {
                // Consume a partially-full tile at the end of the row
                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
            } 
            else
            {
                // Consume full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
            }

            CTA_SYNC();

            // Get next tile
            if (threadIdx.x == 0)
                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;

            CTA_SYNC();
( run in 0.898 second using v1.01-cache-2.11-cpan-140bd7fdf52 )