merge

author: Gael Guennebaud <g.gael@free.fr> 2016-02-23 11:09:05 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2016-02-23 11:09:05 +0100
commit: 91e1375ba97284d1a11068d27c039800ec7900f1 (patch)
tree: ce96e6c443a28e1ea6d189b56cc08460c475b77c
parent: 055000a42466670d7fd0162f026cde9ab90f9b25 (diff)
parent: 1d9256f7db5db6c9f7fa915b4af868625f53502f (diff)
13 files changed, 273 insertions, 346 deletions
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 7af0bdc60..1a1b4ec3d 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -52,14 +52,19 @@ __device__ half operator /= (half& a, const half& b) {
   a = a / b;
   return a;
 }
-__device__ half __shfl_xor(half a, int) {
-  assert(false && "tbd");
-  return a;
+
+namespace std {
+__device__ half abs(const half& a) {
+  half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+}
 }
 
 namespace Eigen {
 namespace internal {
 
+template<> struct is_arithmetic<half> { enum { value = true }; };
 template<> struct is_arithmetic<half2> { enum { value = true }; };
 
 template<> struct packet_traits<half> : default_packet_traits
@@ -214,17 +219,20 @@ template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
 }
 
 template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
-  assert(false && "tbd");
-  return half2();
+  half2 result;
+  result.x = a.x & 0x7FFF7FFF; 
+  return result;
 }
 
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<half2,2>& kernel) {
-  assert(false && "tbd");
-  //  half tmp = kernel.packet[0].y;
-  //  kernel.packet[0].y = kernel.packet[1].x;
-  //  kernel.packet[1].x = tmp;
+  half a1 = __low2half(kernel.packet[0]);
+  half a2 = __high2half(kernel.packet[0]);
+  half b1 = __low2half(kernel.packet[1]);
+  half b2 = __high2half(kernel.packet[1]);
+  kernel.packet[0] = __halves2half2(a1, b1);
+  kernel.packet[1] = __halves2half2(a2, b2);
 }
 
 } // end namespace internal
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index f3ec70a9e..b208a401a 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -15,7 +15,7 @@ using Eigen::TensorMap;
 
 // TODO(bsteiner): also templatize on the input type since we have users
 // for int8 as well as floats.
-template <typename Device> class BenchmarkSuite {
+template <typename Device, typename T> class BenchmarkSuite {
  public:
   BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
       : m_(m), k_(k), n_(n), device_(device) {
@@ -37,7 +37,7 @@ template <typename Device> class BenchmarkSuite {
     eigen_assert(m_ == k_ && k_ == n_);
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
-      device_.memcpy(c_, a_, m_ * m_ * sizeof(float));
+      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
     }
     // Record the number of values copied per second
     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
@@ -45,13 +45,15 @@ template <typename Device> class BenchmarkSuite {
 
   void typeCasting(int num_iters) {
     eigen_assert(m_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, k_}};
-    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = k_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
     TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> B((int*)b_, sizes);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
-      B.device(device_) = A.cast<int>();
+      B.device(device_) = A.template cast<int>();
     }
     // Record the number of values copied per second
     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
@@ -59,8 +61,10 @@ template <typename Device> class BenchmarkSuite {
 
   void random(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -72,10 +76,12 @@ template <typename Device> class BenchmarkSuite {
 
   void slicing(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
     const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
     const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
@@ -100,10 +106,13 @@ template <typename Device> class BenchmarkSuite {
   }
 
   void rowChip(int num_iters) {
-    const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
-    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
-    const Eigen::array<TensorIndex, 1> output_size = {{n_}};
-    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = n_;
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -114,10 +123,13 @@ template <typename Device> class BenchmarkSuite {
   }
 
   void colChip(int num_iters) {
-    const Eigen::array<TensorIndex, 2> input_size= {{k_, n_}};
-    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
-    const Eigen::array<TensorIndex, 1> output_size = {{n_}};
-    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = n_;
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -129,12 +141,18 @@ template <typename Device> class BenchmarkSuite {
 
   void shuffling(int num_iters) {
     eigen_assert(m_ == n_);
-    const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
-    const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
-
-    const Eigen::array<int, 2> shuffle = {{1, 0}};
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = k_;
+    size_b[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+
+    Eigen::array<int, 2> shuffle;
+    shuffle[0] = 1;
+    shuffle[1] = 0;
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -146,10 +164,14 @@ template <typename Device> class BenchmarkSuite {
 
  void padding(int num_iters) {
     eigen_assert(m_ == k_);
-    const Eigen::array<TensorIndex, 2> size_a = {{m_, k_-3}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
-    const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_-3;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = k_;
+    size_b[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
 
     Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
     paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
@@ -165,12 +187,18 @@ template <typename Device> class BenchmarkSuite {
 
  void striding(int num_iters) {
     eigen_assert(m_ == k_);
-    const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
-    const Eigen::array<TensorIndex, 2> size_b = {{m_, k_ / 2}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
-
-    const Eigen::array<TensorIndex, 2> strides = {{1, 2}};
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = m_;
+    size_b[1] = k_/2;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+
+    Eigen::array<TensorIndex, 2> strides;
+    strides[0] = 1;
+    strides[1] = 2;
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -181,13 +209,19 @@ template <typename Device> class BenchmarkSuite {
   }
 
   void broadcasting(int num_iters) {
-    const Eigen::array<TensorIndex, 2> size_a = {{m_, 1}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
-    const Eigen::array<TensorIndex, 2> size_c = {{m_, n_}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = 1;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_c;
+    size_c[0] = m_;
+    size_c[1] = n_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
 
 #ifndef EIGEN_HAS_INDEX_LIST
-    const Eigen::array<int, 2> broadcast = {{1, n_}};
+    Eigen::array<int, 2> broadcast;
+    broadcast[0] = 1;
+    broadcast[1] = n_;
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
@@ -205,10 +239,12 @@ template <typename Device> class BenchmarkSuite {
 
   void coeffWiseOp(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -221,10 +257,12 @@ template <typename Device> class BenchmarkSuite {
 
   void algebraicFunc(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -237,10 +275,12 @@ template <typename Device> class BenchmarkSuite {
 
   void transcendentalFunc(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -253,13 +293,16 @@ template <typename Device> class BenchmarkSuite {
 
  // Row reduction
   void rowReduction(int num_iters) {
-    const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
-    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
     const Eigen::array<TensorIndex, 1> output_size = {{n_}};
-    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
 
 #ifndef EIGEN_HAS_INDEX_LIST
-    const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
+    Eigen::array<TensorIndex, 1> sum_along_dim;
+    sum_along_dim[0] = 0;
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
@@ -277,15 +320,18 @@ template <typename Device> class BenchmarkSuite {
 
   // Column reduction
   void colReduction(int num_iters) {
-    const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
-    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
         b_, input_size);
     const Eigen::array<TensorIndex, 1> output_size = {{k_}};
-    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(
         c_, output_size);
 
 #ifndef EIGEN_HAS_INDEX_LIST
-    const Eigen::array<TensorIndex, 1> sum_along_dim = {{1}};
+    Eigen::array<TensorIndex, 1> sum_along_dim;
+    sum_along_dim = 1;
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
@@ -303,16 +349,23 @@ template <typename Device> class BenchmarkSuite {
 
   // do a contraction which is equivalent to a matrix multiplication
   void contraction(int num_iters) {
-    const Eigen::array<TensorIndex, 2> sizeA = {{m_, k_}};
-    const Eigen::array<TensorIndex, 2> sizeB = {{k_, n_}};
-    const Eigen::array<TensorIndex, 2> sizeC = {{m_, n_}};
-
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
-
-    typedef typename Tensor<float, 2>::DimensionPair DimPair;
-    const Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+    Eigen::array<TensorIndex, 2> sizeA;
+    sizeA[0] = m_;
+    sizeA[1] = k_;
+    Eigen::array<TensorIndex, 2> sizeB;
+    sizeB[0] = k_;
+    sizeB[1] = n_;
+    Eigen::array<TensorIndex, 2> sizeC;
+    sizeC[0] = m_;
+    sizeC[1] = n_;
+
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+
+    typedef typename Tensor<T, 2>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -324,14 +377,21 @@ template <typename Device> class BenchmarkSuite {
   }
 
   void convolution(int num_iters, int kernel_x, int kernel_y) {
-    const Eigen::array<TensorIndex, 2> input_sizes = {{m_, n_}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
-    const Eigen::array<TensorIndex, 2> kernel_sizes = {{kernel_x, kernel_y}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
-    const Eigen::array<TensorIndex, 2> result_sizes =
-        {{m_ - kernel_x + 1, n_ - kernel_y + 1}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
-    Eigen::array<Tensor<float, 2>::Index, 2> dims = {{0, 1}};
+    Eigen::array<TensorIndex, 2> input_sizes;
+    input_sizes[0] = m_;
+    input_sizes[1] = n_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
+    Eigen::array<TensorIndex, 2> kernel_sizes;
+    kernel_sizes[0] = kernel_x;
+    kernel_sizes[1] = kernel_y;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
+    Eigen::array<TensorIndex, 2> result_sizes;
+    result_sizes[0] = m_ - kernel_x + 1;
+    result_sizes[1] = n_ - kernel_y + 1;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
+    Eigen::array<TensorIndex, 2> dims;
+    dims[0] = 0;
+    dims[1] = 1;
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -345,15 +405,15 @@ template <typename Device> class BenchmarkSuite {
 
  private:
   void initialize() {
-    a_ = (float *) device_.allocate(m_ * k_ * sizeof(float));
-    b_ = (float *) device_.allocate(k_ * n_ * sizeof(float));
-    c_ = (float *) device_.allocate(m_ * n_ * sizeof(float));
+    a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+    b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+    c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
 
     // Initialize the content of the memory pools to prevent asan from
     // complaining.
-    device_.memset(a_, 12, m_ * k_ * sizeof(float));
-    device_.memset(b_, 23, k_ * n_ * sizeof(float));
-    device_.memset(c_, 31, m_ * n_ * sizeof(float));
+    device_.memset(a_, 12, m_ * k_ * sizeof(T));
+    device_.memset(b_, 23, k_ * n_ * sizeof(T));
+    device_.memset(c_, 31, m_ * n_ * sizeof(T));
 
     //BenchmarkUseRealTime();
   }
@@ -372,9 +432,9 @@ template <typename Device> class BenchmarkSuite {
   TensorIndex m_;
   TensorIndex k_;
   TensorIndex n_;
-  float* a_;
-  float* b_;
-  float* c_;
+  T* a_;
+  T* b_;
+  T* c_;
   Device device_;
 };
 #endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc
index 6754e1a32..8947f4b7f 100644
--- a/bench/tensors/tensor_benchmarks_cpu.cc
+++ b/bench/tensors/tensor_benchmarks_cpu.cc
@@ -9,13 +9,13 @@ Eigen::ThreadPool pool(threads);                \
 Eigen::ThreadPoolDevice device(&pool, threads);
 
 // Simple functions
-#define BM_FuncCPU(FUNC, THREADS)                                \
-  static void BM_##FUNC##_##THREADS##T(int iters, int N) {       \
-    StopBenchmarkTiming();                                       \
-    CREATE_THREAD_POOL(THREADS);                                 \
-    BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N);    \
-    suite.FUNC(iters);                                           \
-  }                                                              \
+#define BM_FuncCPU(FUNC, THREADS)                                    \
+  static void BM_##FUNC##_##THREADS##T(int iters, int N) {           \
+    StopBenchmarkTiming();                                           \
+    CREATE_THREAD_POOL(THREADS);                                     \
+    BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
+    suite.FUNC(iters);                                               \
+  }                                                                  \
   BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
 
 BM_FuncCPU(memcpy, 4);
@@ -80,19 +80,19 @@ BM_FuncCPU(colReduction, 12);
 
 
 // Contractions
-#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS)                     \
-  static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\
-    StopBenchmarkTiming();                                                     \
-    if (THREADS == 1) {                                                        \
-      Eigen::DefaultDevice device;                                             \
-      BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3);          \
-      suite.FUNC(iters);                                                       \
-    } else {                                                                   \
-      CREATE_THREAD_POOL(THREADS);                                             \
-      BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3);       \
-      suite.FUNC(iters);                                                       \
-    }                                                                          \
-  }                                                                            \
+#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS)                      \
+  static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \
+    StopBenchmarkTiming();                                                      \
+    if (THREADS == 1) {                                                         \
+      Eigen::DefaultDevice device;                                              \
+      BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3);    \
+      suite.FUNC(iters);                                                        \
+    } else {                                                                    \
+      CREATE_THREAD_POOL(THREADS);                                              \
+      BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
+      suite.FUNC(iters);                                                        \
+    }                                                                           \
+  }                                                                             \
   BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
 
 
@@ -138,7 +138,7 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
   static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) {   \
     StopBenchmarkTiming();                                                     \
     CREATE_THREAD_POOL(THREADS);                                               \
-    BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N);                  \
+    BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N);	       \
     suite.FUNC(iters, DIM1, DIM2);                                             \
   }                                                                            \
   BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu
index 611e8197b..a6f594382 100644
--- a/bench/tensors/tensor_benchmarks_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_gpu.cu
@@ -12,7 +12,7 @@
     StopBenchmarkTiming();                                                     \
     Eigen::CudaStreamDevice stream;                                            \
     Eigen::GpuDevice device(&stream);                                          \
-    BenchmarkSuite<Eigen::GpuDevice> suite(device, N);                         \
+    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);                  \
     cudaDeviceSynchronize();                                                   \
     suite.FUNC(iters);                                                         \
   }                                                                            \
@@ -41,7 +41,7 @@ BM_FuncGPU(colReduction);
     StopBenchmarkTiming();                                                     \
     Eigen::CudaStreamDevice stream;                                            \
     Eigen::GpuDevice device(&stream);                                          \
-    BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3);                \
+    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3);         \
     cudaDeviceSynchronize();                                                   \
     suite.FUNC(iters);                                                         \
   }                                                                            \
@@ -60,7 +60,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64);
     StopBenchmarkTiming();                                                     \
     Eigen::CudaStreamDevice stream;                                            \
     Eigen::GpuDevice device(&stream);                                          \
-    BenchmarkSuite<Eigen::GpuDevice> suite(device, N);                         \
+    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);                  \
     cudaDeviceSynchronize();                                                   \
     suite.FUNC(iters, DIM1, DIM2);                                             \
   }                                                                            \
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index b4f860c41..3b5be4426 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -58,6 +58,7 @@ typedef unsigned __int64 uint64_t;
 #endif
 
 #ifdef EIGEN_USE_GPU
+#include <iostream>
 #include <cuda_runtime.h>
 #if defined(__CUDACC__)
 #include <curand_kernel.h>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index e254c0b7b..4e87813a9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -165,6 +165,18 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT
     typename XprType::Nested m_xpr;
 };
 
+template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar*) {
+    impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+};
+
+template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar* data) {
+    return impl.evalSubExprsIfNeeded(data);
+  }
+};
 
 
 
@@ -195,10 +207,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
   {
-    m_impl.evalSubExprsIfNeeded(NULL);
-    return true;
+    return ConversionSubExprEval<internal::is_same<TargetType, SrcType>::value, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index aec5f4c8e..a5aa05da4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -230,10 +230,10 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
           pos_j_base_powered[1] = pos_j_base;
           if (line_len > 2) {
             const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
-            for (int i = 2; i < line_len + 1; ++i) {
-              pos_j_base_powered[i] = pos_j_base_powered[i - 1] *
-                                      pos_j_base_powered[i - 1] /
-                                      pos_j_base_powered[i - 2] * pos_j_base_sq;
+            for (int j = 2; j < line_len + 1; ++j) {
+              pos_j_base_powered[j] = pos_j_base_powered[j - 1] *
+                                      pos_j_base_powered[j - 1] /
+                                      pos_j_base_powered[j - 2] * pos_j_base_sq;
             }
           }
         }
@@ -468,7 +468,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
   template <int Dir>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(
-      ComplexScalar* data, int n, int n_power_of_2) {
+      ComplexScalar* data, Index n, Index n_power_of_2) {
     // Original code:
     // RealScalar wtemp = std::sin(M_PI/n);
     // RealScalar wpi =  -std::sin(2 * M_PI/n);
@@ -482,9 +482,9 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
     const ComplexScalar wp_one_2 = wp_one * wp_one;
     const ComplexScalar wp_one_3 = wp_one_2 * wp_one;
     const ComplexScalar wp_one_4 = wp_one_3 * wp_one;
-    const int n2 = n / 2;
+    const Index n2 = n / 2;
     ComplexScalar w(1.0, 0.0);
-    for (int i = 0; i < n2; i += 4) {
+    for (Index i = 0; i < n2; i += 4) {
        ComplexScalar temp0(data[i + n2] * w);
        ComplexScalar temp1(data[i + 1 + n2] * w * wp_one);
        ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2);
@@ -507,7 +507,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
  template <int Dir>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(
-      ComplexScalar* data, int n, int n_power_of_2) {
+      ComplexScalar* data, Index n, Index n_power_of_2) {
     eigen_assert(isPowerOfTwo(n));
     if (n > 8) {
       compute_1D_Butterfly<Dir>(data, n / 2, n_power_of_2 - 1);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 2ab332add..bc6021c9e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -167,7 +167,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = NumDims == 5,
+    CoordAccess = false,
     RawAccess = false
   };
 
@@ -437,59 +437,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   Index rowInflateStride() const { return m_row_inflate_strides; }
   Index colInflateStride() const { return m_col_inflate_strides; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
-  {
-    // Location of the first element of the patch.
-    // ColMajor
-    // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches
-    // RowMajor
-    // 0: number of batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: d
-    const Index patch2DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 3 : 1];
-
-    array<Index, NumDims-1> inputCoords;
-    Index input_col_idx = patch2DIndex / m_fastInputColsEff;
-    Index inputCol = input_col_idx  + coords[1] * m_in_row_strides - m_rowPaddingTop;
-    Index inputRow = patch2DIndex - input_col_idx * m_input_cols_eff + coords[2] * m_in_col_strides - m_colPaddingLeft;
-    const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
-    const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      inputCoords[0] = coords[0];  // depth
-      inputCoords[1] = origInputCol;
-      inputCoords[2] = origInputRow;
-      inputCoords[3] = coords[4];  // batch
-    } else {
-      inputCoords[3] = coords[4];  // depth
-      inputCoords[2] = origInputCol;
-      inputCoords[1] = origInputRow;
-      inputCoords[0] = coords[0];  // batch
-    }
-    // If the computed coordinates are outside the original image perimeter, return 0.
-    if (inputCol < 0 || inputCol >= m_input_cols_eff || inputRow < 0 || inputRow >= m_input_rows_eff ||
-        ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides)) ||
-        ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-    if (TensorEvaluator<ArgType, Device>::CoordAccess) {
-      return m_impl.coeff(inputCoords);
-    } else {
-      Index inputIndex;
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        inputIndex =
-          inputCoords[3] * m_patchInputStride +
-          inputCoords[2] * m_colInputStride +
-          inputCoords[1] * m_rowInputStride +
-          inputCoords[0];
-      } else {
-        inputIndex =
-          inputCoords[1] * m_patchInputStride +
-          inputCoords[2] * m_colInputStride +
-          inputCoords[3] * m_rowInputStride +
-          inputCoords[4];
-      }
-      return m_impl.coeff(inputIndex);
-    }
-  }
-
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 11284315c..e867e450e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -318,7 +318,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+    CoordAccess = false,
     RawAccess = false
   };
 
@@ -457,15 +457,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords)
-  {
-    array<Index, NumDims> inputCoords;
-    for (int i = 0; i < NumDims; ++i) {
-      inputCoords = coords[i] + this->m_offsets[i];
-    }
-    return m_impl.coeff(inputCoords);
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
     Scalar* result = m_impl.data();
     if (result) {
@@ -547,7 +538,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+    CoordAccess = false,
     RawAccess = false
   };
 
@@ -608,15 +599,6 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       }
     }
   }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array<Index, NumDims>& coords)
-  {
-    array<Index, NumDims> inputCoords;
-    for (int i = 0; i < NumDims; ++i) {
-      inputCoords = coords[i] + this->m_offsets[i];
-    }
-    return this->m_impl.coeffRef(inputCoords);
-  }
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 39a305a93..c3f25f0df 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -151,27 +151,27 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
-          return Scalar(0);
+          return internal::scalar_cast_op<int, Scalar>()(0);
         }
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
       if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) {
-        return Scalar(0);
+        return internal::scalar_cast_op<int, Scalar>()(0);
       }
       inputIndex += (index - m_padding[0].first);
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i+1];
         if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
-          return Scalar(0);
+          return internal::scalar_cast_op<int, Scalar>()(0);
         }
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
         index -= idx * m_outputStrides[i+1];
       }
       if (index < m_padding[NumDims-1].first ||
           index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
-        return Scalar(0);
+        return internal::scalar_cast_op<int, Scalar>()(0);
       }
       inputIndex += (index - m_padding[NumDims-1].first);
     }
@@ -194,14 +194,14 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       {
         const Index idx = coords[0];
         if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) {
-          return Scalar(0);
+          return internal::scalar_cast_op<int, Scalar>()(0);
         }
         inputIndex = idx - m_padding[0].first;
       }
       for (int i = 1; i < NumDims; ++i) {
         const Index idx = coords[i];
         if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
-          return Scalar(0);
+          return internal::scalar_cast_op<int, Scalar>()(0);
         }
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
       }
@@ -209,14 +209,14 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       {
         const Index idx = coords[NumDims-1];
         if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
-          return Scalar(0);
+          return internal::scalar_cast_op<int, Scalar>()(0);
         }
         inputIndex = idx - m_padding[NumDims-1].first;
       }
       for (int i = NumDims - 2; i >= 0; --i) {
         const Index idx = coords[i];
         if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
-          return Scalar(0);
+          return internal::scalar_cast_op<int, Scalar>()(0);
         }
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
       }
@@ -245,11 +245,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
       if (last < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
-        return internal::pset1<PacketReturnType>(Scalar(0));
+        return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
       }
       else if (first >= firstPaddedRight && last < lastPaddedRight) {
         // all the coefficient are in the padding zone.
-        return internal::pset1<PacketReturnType>(Scalar(0));
+        return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
       }
       else if (first >= lastPaddedLeft && last < firstPaddedRight) {
         // all the coefficient are between the 2 padding zones.
@@ -271,11 +271,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     if (last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= firstPaddedRight && last < lastPaddedRight) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= lastPaddedLeft && last < firstPaddedRight) {
       // all the coefficient are between the 2 padding zones.
@@ -304,11 +304,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
       if (last < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
-        return internal::pset1<PacketReturnType>(Scalar(0));
+        return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
       }
       else if (first >= firstPaddedRight && last < lastPaddedRight) {
         // all the coefficient are in the padding zone.
-        return internal::pset1<PacketReturnType>(Scalar(0));
+        return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
       }
       else if (first >= lastPaddedLeft && last < firstPaddedRight) {
         // all the coefficient are between the 2 padding zones.
@@ -330,11 +330,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     if (last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= firstPaddedRight && last < lastPaddedRight) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= lastPaddedLeft && last < firstPaddedRight) {
       // all the coefficient are between the 2 padding zones.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 2cbb820b1..57b716fd6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -93,7 +93,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = true,
+    CoordAccess = false,
     RawAccess = false
  };
 
@@ -248,56 +248,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
-  {
-    Index patch_coord_idx = Layout == ColMajor ? NumDims - 1 : 0;
-    // Location of the first element of the patch.
-    const Index patchIndex = coords[patch_coord_idx];
-
-    if (TensorEvaluator<ArgType, Device>::CoordAccess) {
-      array<Index, NumDims-1> inputCoords;
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        for (int i = NumDims - 2; i > 0; --i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i];
-          inputCoords[i] = coords[i] + patchIdx;
-        }
-      } else {
-        for (int i = 0; i < NumDims - 2; ++i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i+1];
-          inputCoords[i] = coords[i+1] + patchIdx;
-        }
-      }
-      Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1;
-      inputCoords[0] = (patchIndex + coords[coords_idx]);
-      return m_impl.coeff(inputCoords);
-    }
-    else {
-      Index inputIndex = 0;
-      if (Layout == ColMajor) {
-        for (int i = NumDims - 2; i > 0; --i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i];
-          inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
-        }
-      } else {
-        for (int i = 0; i < NumDims - 2; ++i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i+1];
-          inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
-        }
-      }
-      Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1;
-      inputIndex += (patchIndex + coords[coords_idx]);
-      return m_impl.coeff(inputIndex);
-    }
-  }
-
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
  protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 52b78b261..04f4f8ffc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -180,7 +180,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = NumDims == 6,
+    CoordAccess = false,
     RawAccess = false
   };
 
@@ -518,79 +518,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   Index rowInflateStride() const { return m_row_inflate_strides; }
   Index colInflateStride() const { return m_col_inflate_strides; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
-  {
-    // ColMajor
-    //   0: depth, 1: patch_planes, 2: patch_rows, 3: patch_cols, 4: number of patches, 5: batches
-    // RowMajor
-    //   0: batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: patch_planes, 5: depth
-    const Index patch3DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 4 : 1];
-    const Index colOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 3 : 2];
-    const Index rowOffset= coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 3];
-    const Index planeOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 4];
-
-    array<Index, NumDims-1> inputCoords;
-
-    const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
-    const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
-    const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
-    if (inputCol < 0 || inputCol >= m_input_cols_eff ||
-        ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-
-    const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
-    const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
-    const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
-    if (inputRow < 0 || inputRow >= m_input_rows_eff ||
-        ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-
-    const Index planeIndex = patch3DIndex - colIndex * m_outputPlanesRows - rowIndex * m_outputRows;
-    const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop;
-    const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0);
-    if (inputPlane < 0 || inputPlane >= m_input_planes_eff ||
-        ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      inputCoords[0] = coords[0];  // depth
-      inputCoords[1] = origInputPlane;
-      inputCoords[2] = origInputRow;
-      inputCoords[3] = origInputCol;
-      inputCoords[4] = coords[5];  // batch
-    } else {
-      inputCoords[4] = coords[5];  // depth
-      inputCoords[3] = origInputPlane;
-      inputCoords[2] = origInputRow;
-      inputCoords[1] = origInputCol;
-      inputCoords[0] = coords[0];  // batch
-    }
-    if (TensorEvaluator<ArgType, Device>::CoordAccess) {
-      return m_impl.coeff(inputCoords);
-    } else {
-      Index inputIndex;
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        inputIndex =
-          inputCoords[4] * m_otherInputStride +
-          inputCoords[3] * m_colInputStride +
-          inputCoords[2] * m_rowInputStride +
-          inputCoords[1] * m_planeInputStride +
-          inputCoords[0];
-      } else {
-        inputIndex =
-          inputCoords[0] * m_otherInputStride +
-          inputCoords[1] * m_colInputStride +
-          inputCoords[2] * m_rowInputStride +
-          inputCoords[3] * m_planeInputStride +
-          inputCoords[4];
-      }
-      return m_impl.coeff(inputIndex);
-    }
-  }
-
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 7449d6f8c..9b9fd843c 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -55,6 +55,44 @@ void test_cuda_conversion() {
   gpu_device.deallocate(d_conv);
 }
 
+
+void test_cuda_unary() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float.device(gpu_device) = gpu_float.random();
+  gpu_res_float.device(gpu_device) = gpu_float.abs();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<half>().abs().cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking unary " << i << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+
 void test_cuda_elementwise() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -201,7 +239,10 @@ void test_cxx11_tensor_of_float16_cuda()
   Eigen::GpuDevice device(&stream);
   if (device.majorDeviceVersion() > 5 ||
       (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) {
+    std::cout << "Running test on device with capability " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << std::endl;
+
     CALL_SUBTEST_1(test_cuda_conversion());
+    CALL_SUBTEST_1(test_cuda_unary());
     CALL_SUBTEST_1(test_cuda_elementwise());
 //    CALL_SUBTEST_2(test_cuda_contractions());
     CALL_SUBTEST_3(test_cuda_reductions());
author	Gael Guennebaud <g.gael@free.fr>	2016-02-23 11:09:05 +0100
committer	Gael Guennebaud <g.gael@free.fr>	2016-02-23 11:09:05 +0100
commit	91e1375ba97284d1a11068d27c039800ec7900f1 (patch)
tree	ce96e6c443a28e1ea6d189b56cc08460c475b77c
parent	055000a42466670d7fd0162f026cde9ab90f9b25 (diff)
parent	1d9256f7db5db6c9f7fa915b4af868625f53502f (diff)