locked results from the CPAN

Alien-XGBoost
                ValueT  vector_value        = wd_vector_x[column_idx];
                row_total                   += value * vector_value;
            }

            // Warp reduce
            row_total = WarpReduceT(temp_storage.warp_reduce[warp_idx]).Sum(row_total);

            // Output
            if (lane_idx == 0)
            {
                spmv_params.d_vector_y[tile_start_coord.x + warp_coord.x] = row_total;
            }
        }

        // Return the tile's running carry-out
        KeyValuePairT tile_carry(tile_num_rows, 0.0);
        return tile_carry;
    }
*/







    /**
     * Consume a merge tile, specialized for indirect load of nonzeros
     * /
    __device__ __forceinline__ KeyValuePairT ConsumeTile2(
        int             tile_idx,
        CoordinateT     tile_start_coord,
        CoordinateT     tile_end_coord,
        Int2Type<false> is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
    {
        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;

        ValueT*     s_tile_nonzeros         = &temp_storage.merge_items[0].nonzero;

        ValueT      nonzeros[ITEMS_PER_THREAD];

        // Gather the nonzeros for the merge tile into shared memory
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            int     nonzero_idx         = threadIdx.x + (ITEM * BLOCK_THREADS);
            nonzero_idx                 = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);

            OffsetT column_idx          = wd_column_indices[tile_start_coord.y + nonzero_idx];
            ValueT  value               = wd_values[tile_start_coord.y + nonzero_idx];

            ValueT  vector_value        = spmv_params.t_vector_x[column_idx];
#if (CUB_PTX_ARCH >= 350)
            vector_value                = wd_vector_x[column_idx];
#endif

            nonzeros[ITEM]              = value * vector_value;
        }

        // Exchange striped->blocked
        BlockExchangeT(temp_storage.exchange).StripedToBlocked(nonzeros);

        CTA_SYNC();

        // Compute an inclusive prefix sum
        BlockPrefixSumT(temp_storage.prefix_sum).InclusiveSum(nonzeros, nonzeros);

        CTA_SYNC();

        if (threadIdx.x == 0)
            s_tile_nonzeros[0] = 0.0;

        // Scatter back to smem
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM + 1;
            s_tile_nonzeros[item_idx] = nonzeros[ITEM];
        }

        CTA_SYNC();

        // Gather the row end-offsets for the merge tile into shared memory
        #pragma unroll 1
        for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
        {
            OffsetT start = CUB_MAX(wd_row_end_offsets[tile_start_coord.x + item - 1], tile_start_coord.y);
            OffsetT end = wd_row_end_offsets[tile_start_coord.x + item];

            start -= tile_start_coord.y;
            end -= tile_start_coord.y;

            ValueT row_partial = s_tile_nonzeros[end] - s_tile_nonzeros[start];

            spmv_params.d_vector_y[tile_start_coord.x + item] = row_partial;
        }

        // Get the tile's carry-out
        KeyValuePairT tile_carry;
        if (threadIdx.x == 0)
        {
            tile_carry.key = tile_num_rows;

            OffsetT start = CUB_MAX(wd_row_end_offsets[tile_end_coord.x - 1], tile_start_coord.y);
            start -= tile_start_coord.y;
            OffsetT end = tile_num_nonzeros;

            tile_carry.value = s_tile_nonzeros[end] - s_tile_nonzeros[start];
        }

        // Return the tile's running carry-out
        return tile_carry;
    }
*/


    /**
     * Consume input tile
     */
    __device__ __forceinline__ void ConsumeTile(
        CoordinateT*    d_tile_coordinates,     ///< [in] Pointer to the temporary array of tile starting coordinates
( run in 0.792 second using v1.01-cache-2.11-cpan-a9496e3eb41 )