PUT results from the CPAN

Alien-XGBoost
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};
#endif

template<class FUNC>
struct MULTI<FUNC, float> {
  static_assert(sizeof(PackType) == 2 * sizeof(float),
      "PackType must be twice the size of float.");
  union converter {
    PackType storage;
    struct {
      float a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};

template<class FUNC>
struct MULTI<FUNC, double> {
  static_assert(sizeof(PackType) == sizeof(double),
      "PackType must be the same size as double.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
    return __double_as_longlong(rv);
  }
};

template<class FUNC>
struct MULTI<FUNC, unsigned long long> {
  static_assert(sizeof(PackType) == sizeof(unsigned long long),
      "PackType must be the same size as unsigned long long.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    unsigned long long rv = FUNC()(x, y);
    return rv;
  }
};

template<class FUNC>
struct MULTI<FUNC, long long> {
  static_assert(sizeof(PackType) == sizeof(long long),
      "PackType must be the same size as long long.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    long long rv = FUNC()((long long)x, (long long)y);
    return rv;
  }
};

template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
__device__ inline void ReduceCopy(
    const volatile T * __restrict__ const src0,
    const volatile T * __restrict__ const src1,
    volatile T * __restrict__ const dest0,
    volatile T * __restrict__ const dest1, const int idx) {
  T val = vFetch(src0+idx);
  if (TWO_INPUTS) {
    val = FUNC()(val, vFetch(src1+idx));
  }
  vStore(dest0+idx, val);
  if (TWO_OUTPUTS) {
    vStore(dest1+idx, val);
  }
}

template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL, int THREADS>
__device__ inline void ReduceCopy64b(
    const volatile T * __restrict__ const src0,
    const volatile T * __restrict__ const src1,
    volatile T * __restrict__ const dest0,
    volatile T * __restrict__ const dest1, const int offset) {
  PackType t0[UNROLL];
  PackType t1[UNROLL];
  #pragma unroll
  for (int u = 0; u < UNROLL; ++u) {
    int idx = offset + u*THREADS;
    t0[u] = (reinterpret_cast<const volatile PackType *>(src0))[idx];
    if (TWO_INPUTS) {
      t1[u] = (reinterpret_cast<const volatile PackType *>(src1))[idx];
    }
  }
  #pragma unroll
  for (int u = 0; u < UNROLL; ++u) {
    int idx = offset + u*THREADS;
    PackType val = TWO_INPUTS ? MULTI<FUNC, T>()(t0[u], t1[u]) : t0[u];
    (reinterpret_cast<volatile PackType *>(dest0))[idx] = val;
    if (TWO_OUTPUTS) {
      (reinterpret_cast<volatile PackType *>(dest1))[idx] = val;
    }
  }
}

#define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a))

template<typename T>
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
  size_t ptrval = reinterpret_cast<size_t>(ptr);
  return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
}

// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
template<int UNROLL, int THREADS, class FUNC, typename T, bool HAS_DEST1,
    bool HAS_SRC1>
__device__ inline void ReduceOrCopy(const int tid,
    volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
    const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
    int N) {
  if (N<=0) {
    return;
  }

  int Npreamble = (N<alignof(PackType)) ? N : AlignUp(dest0, alignof(PackType)) - dest0;

  // stage 0: check if we'll be able to use the fast, 64-bit aligned path.
  // If not, we'll just use the slow preamble path for the whole operation
  bool alignable = (((AlignUp(src0,  alignof(PackType)) == src0  + Npreamble)) &&
      (!HAS_DEST1 || (AlignUp(dest1, alignof(PackType)) == dest1 + Npreamble)) &&
      (!HAS_SRC1  || (AlignUp(src1,  alignof(PackType)) == src1  + Npreamble)));

  if (!alignable) {
    Npreamble = N;
  }

  // stage 1: preamble: handle any elements up to the point of everything coming
  // into alignment
  for (int idx = tid; idx < Npreamble; idx += THREADS) {
    // ought to be no way this is ever more than one iteration, except when
    // alignable is false
    ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(src0, src1, dest0, dest1, idx);
  }

  // stage 2: fast path: use 64b loads/stores to do the bulk of the work,
  // assuming the pointers we have are all 64-bit alignable.
  if (alignable) {
    const int PackFactor = sizeof(PackType) / sizeof(T);
    int Nrem = N - Npreamble;
    dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
    src0  += Npreamble; if (HAS_SRC1)  { src1  += Npreamble; }

    // stage 2a: main loop
    int Nalign2a = (Nrem / (PackFactor * UNROLL * THREADS))
        * (UNROLL * THREADS); // round down

    #pragma unroll 1 // don't unroll this loop
( run in 0.729 second using v1.01-cache-2.11-cpan-39bf76dae61 )