From 403a7cb6c34d163e4f120387b5dc5487d30bb1d5 Mon Sep 17 00:00:00 2001
From: Jeremy Barnes <jeremy@barneso.com>
Date: Sun, 10 Jan 2016 22:39:13 -0500
Subject: Alternative way of forcing instantiation of device kernels without
 causing warnings or requiring device-to-device kernel invocations.

This allows TensorFlow to work on SM 3.0 (e.g., Amazon EC2) machines.
---
 unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h')
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 558d0c83d..374edb605 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -116,7 +116,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
 
   template <typename OutputType>
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-    assert(false && "Should only be called on floats");
+    eigen_assert(false && "Should only be called on floats");
   }
 
   static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
@@ -126,7 +126,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
     const int block_size = 256;
     const int num_per_thread = 128;
     const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
-    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
+    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                        num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
   }
 };
-- 
cgit v1.2.3