3 files changed, 11 insertions, 4 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 90ee50678..126a9e4ad 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -1261,7 +1261,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       Base(op, device) {}
 
   // We need to redefine this method to make nvcc happy
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
     this->m_leftImpl.evalSubExprsIfNeeded(NULL);
     this->m_rightImpl.evalSubExprsIfNeeded(NULL);
     if (data) {
@@ -1274,7 +1274,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     }
   }
 
-  void evalTo(Scalar* buffer) const {
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
     if (this->m_lhs_inner_dim_contiguous) {
       if (this->m_rhs_inner_dim_contiguous) {
         if (this->m_rhs_inner_dim_reordered) {
@@ -1313,10 +1313,11 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     }
   }
 
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> EIGEN_DEVICE_FUNC
   void evalTyped(Scalar* buffer) const {
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
+    EIGEN_UNUSED_VARIABLE(k)
 
     // rows in left side
     const Index m = this->m_i_size;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 0f67f0f57..5abdc489b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -275,10 +275,14 @@ struct GpuDevice {
 
 // FIXME: Should be device and kernel specific.
 #ifdef __CUDACC__
-static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+#ifndef __CUDA_ARCH__
   cudaError_t status = cudaDeviceSetSharedMemConfig(config);
   EIGEN_UNUSED_VARIABLE(status)
   assert(status == cudaSuccess);
+#else
+  EIGEN_UNUSED_VARIABLE(config)
+#endif
 }
 #endif
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 59404dba5..89f055134 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -220,6 +220,8 @@ struct InnerReducer<Self, Op, GpuDevice> {
     const int block_size = 256;
     const int num_per_thread = 128;
     const int num_blocks = 32;
+    EIGEN_UNUSED_VARIABLE(block_size)
+    EIGEN_UNUSED_VARIABLE(num_blocks)
 
     LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                        num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);