From 7402fea0a8e63e3ea248257047c584afee8f8bde Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 16 May 2014 15:08:05 -0700
Subject: Vectorized the evaluation of tensor expression (using SSE, AVX, NEON,
 ...) Added the ability to parallelize the evaluation of a tensor expression
 over multiple cpu cores. Added the ability to offload the evaluation of a
 tensor expression to a GPU.

---
 unsupported/test/cxx11_tensor_thread_pool.cpp | 37 +++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 unsupported/test/cxx11_tensor_thread_pool.cpp

(limited to 'unsupported/test/cxx11_tensor_thread_pool.cpp')

diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
new file mode 100644
index 000000000..c9de71da3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -0,0 +1,37 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cxx11_tensor_thread_pool()
+{
+  Eigen::Tensor<float, 3> in1(Eigen::array<ptrdiff_t, 3>(2,3,7));
+  Eigen::Tensor<float, 3> in2(Eigen::array<ptrdiff_t, 3>(2,3,7));
+  Eigen::Tensor<float, 3> out(Eigen::array<ptrdiff_t, 3>(2,3,7));
+
+  in1.setRandom();
+  in2.setRandom();
+
+  Eigen::ThreadPoolDevice thread_pool_device(3);
+  out.device(thread_pool_device) = in1 + in2 * 3.14;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<ptrdiff_t, 3>(i,j,k)), in1(Eigen::array<ptrdiff_t, 3>(i,j,k)) + in2(Eigen::array<ptrdiff_t, 3>(i,j,k)) * 3.14f);
+      }
+    }
+  }
+}
-- 
cgit v1.2.3


From 8998f4099e20ebc80db0aba2582301cd48d31c5a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 5 Jun 2014 10:49:34 -0700
Subject: Created additional tests for the tensor code.

---
 unsupported/test/CMakeLists.txt               |   2 +
 unsupported/test/cxx11_tensor_comparisons.cpp |  84 +++++++++++++
 unsupported/test/cxx11_tensor_contraction.cpp | 163 ++++++++++++++++++++++++++
 unsupported/test/cxx11_tensor_device.cpp      |  17 ++-
 unsupported/test/cxx11_tensor_expr.cpp        | 149 ++++++++++++++++++++---
 unsupported/test/cxx11_tensor_fixed_size.cpp  |  14 +--
 unsupported/test/cxx11_tensor_thread_pool.cpp |   7 +-
 7 files changed, 406 insertions(+), 30 deletions(-)
 create mode 100644 unsupported/test/cxx11_tensor_comparisons.cpp
 create mode 100644 unsupported/test/cxx11_tensor_contraction.cpp

(limited to 'unsupported/test/cxx11_tensor_thread_pool.cpp')

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index abc3375e5..d6072c9f3 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -102,6 +102,8 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_simple "-std=c++0x")
   ei_add_test(cxx11_tensor_symmetry "-std=c++0x")
   ei_add_test(cxx11_tensor_assign "-std=c++0x")
+  ei_add_test(cxx11_tensor_comparisons "-std=c++0x")  # name matches cxx11_tensor_comparisons.cpp added by this patch
+  ei_add_test(cxx11_tensor_contraction "-std=c++0x")
   ei_add_test(cxx11_tensor_expr "-std=c++0x")
   ei_add_test(cxx11_tensor_map "-std=c++0x")
   ei_add_test(cxx11_tensor_device  "-std=c++0x")
diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp
new file mode 100644
index 000000000..186f56ac3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_comparisons.cpp
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_orderings()  // checks coefficient-wise <, <=, >, >= on two random 2x3x7 float tensors
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<bool, 3> lt(2,3,7);  // one bool result tensor per ordering operator
+  Tensor<bool, 3> le(2,3,7);
+  Tensor<bool, 3> gt(2,3,7);
+  Tensor<bool, 3> ge(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+
+  lt = mat1 < mat2;  // comparison expressions assigned into the bool tensors
+  le = mat1 <= mat2;
+  gt = mat1 > mat2;
+  ge = mat1 >= mat2;
+
+  for (int i = 0; i < 2; ++i) {  // every coefficient must agree with the scalar operator
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(lt(i,j,k), mat1(i,j,k) < mat2(i,j,k));
+        VERIFY_IS_EQUAL(le(i,j,k), mat1(i,j,k) <= mat2(i,j,k));
+        VERIFY_IS_EQUAL(gt(i,j,k), mat1(i,j,k) > mat2(i,j,k));
+        VERIFY_IS_EQUAL(ge(i,j,k), mat1(i,j,k) >= mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_equality()  // checks coefficient-wise == and != on tensors that share some of their entries
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+  for (int i = 0; i < 2; ++i) {  // copy a random subset of mat1 into mat2 so both outcomes occur
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        if (Eigen::internal::random<bool>()) {  // coin flip; libc random() yields a long, so `random() < 0.5` only fired on 0
+          mat2(i,j,k) = mat1(i,j,k);
+        }
+      }
+    }
+  }
+
+  Tensor<bool, 3> eq(2,3,7);
+  Tensor<bool, 3> ne(2,3,7);
+  eq = (mat1 == mat2);
+  ne = (mat1 != mat2);
+
+  for (int i = 0; i < 2; ++i) {  // every coefficient must agree with the scalar operator
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(eq(i,j,k), mat1(i,j,k) == mat2(i,j,k));
+        VERIFY_IS_EQUAL(ne(i,j,k), mat1(i,j,k) != mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_comparisons()  // test entry point: runs the ordering and equality subtests
+{
+  CALL_SUBTEST(test_orderings());
+  CALL_SUBTEST(test_equality());
+}
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
new file mode 100644
index 000000000..1c89dfdd1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -0,0 +1,163 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+
+static void test_evals()  // drives TensorEvaluator directly for three single-pair contractions of 2-D tensors
+{
+  Tensor<float, 2> mat1(2, 3);
+  Tensor<float, 2> mat2(2, 3);
+  Tensor<float, 2> mat3(3, 2);
+
+  mat1.setRandom();
+  mat2.setRandom();
+  mat3.setRandom();
+
+  Tensor<float, 2> mat4(3,3);
+  mat4.setZero();
+  Eigen::array<DimPair, 1> dims3({{DimPair(0, 0)}});  // sum over index 0 of mat1 and index 0 of mat2 -> 3x3 result
+  TensorEvaluator<decltype(mat1.contract(mat2, dims3))> eval(mat1.contract(mat2, dims3));
+  eval.evalTo(mat4.data());  // result is written into mat4's buffer
+  EIGEN_STATIC_ASSERT(TensorEvaluator<decltype(mat1.contract(mat2, dims3))>::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval.dimensions()[0], 3);
+  VERIFY_IS_EQUAL(eval.dimensions()[1], 3);
+
+  VERIFY_IS_APPROX(mat4(0,0), mat1(0,0)*mat2(0,0) + mat1(1,0)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(0,1), mat1(0,0)*mat2(0,1) + mat1(1,0)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(0,2), mat1(0,0)*mat2(0,2) + mat1(1,0)*mat2(1,2));
+  VERIFY_IS_APPROX(mat4(1,0), mat1(0,1)*mat2(0,0) + mat1(1,1)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(1,1), mat1(0,1)*mat2(0,1) + mat1(1,1)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(1,2), mat1(0,1)*mat2(0,2) + mat1(1,1)*mat2(1,2));
+  VERIFY_IS_APPROX(mat4(2,0), mat1(0,2)*mat2(0,0) + mat1(1,2)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2));
+
+  Tensor<float, 2> mat5(2,2);
+  mat5.setZero();
+  Eigen::array<DimPair, 1> dims4({{DimPair(1, 1)}});  // sum over index 1 of mat1 and index 1 of mat2 -> 2x2 result
+  TensorEvaluator<decltype(mat1.contract(mat2, dims4))> eval2(mat1.contract(mat2, dims4));
+  eval2.evalTo(mat5.data());
+  EIGEN_STATIC_ASSERT(TensorEvaluator<decltype(mat1.contract(mat2, dims4))>::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval2.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval2.dimensions()[1], 2);
+
+  VERIFY_IS_APPROX(mat5(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(0,1) + mat1(0,2)*mat2(0,2));
+  VERIFY_IS_APPROX(mat5(0,1), mat1(0,0)*mat2(1,0) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(1,2));
+  VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2));
+  VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2));
+
+  Tensor<float, 2> mat6(2,2);
+  mat6.setZero();
+  Eigen::array<DimPair, 1> dims6({{DimPair(1, 0)}});  // sum over index 1 of mat1 and index 0 of mat3 -> 2x2 result
+  TensorEvaluator<decltype(mat1.contract(mat3, dims6))> eval3(mat1.contract(mat3, dims6));
+  eval3.evalTo(mat6.data());
+  EIGEN_STATIC_ASSERT(TensorEvaluator<decltype(mat1.contract(mat3, dims6))>::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval3.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval3.dimensions()[1], 2);
+
+  VERIFY_IS_APPROX(mat6(0,0), mat1(0,0)*mat3(0,0) + mat1(0,1)*mat3(1,0) + mat1(0,2)*mat3(2,0));
+  VERIFY_IS_APPROX(mat6(0,1), mat1(0,0)*mat3(0,1) + mat1(0,1)*mat3(1,1) + mat1(0,2)*mat3(2,1));
+  VERIFY_IS_APPROX(mat6(1,0), mat1(1,0)*mat3(0,0) + mat1(1,1)*mat3(1,0) + mat1(1,2)*mat3(2,0));
+  VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1));
+}
+
+
+static void test_scalar()  // contracts two 6-element vectors over their only index and compares with a hand-computed sum
+{
+  Tensor<float, 1> vec1({6});
+  Tensor<float, 1> vec2({6});
+
+  vec1.setRandom();
+  vec2.setRandom();
+
+  Tensor<float, 1> scalar(1);  // one-element tensor that receives the contraction result
+  scalar.setZero();
+  Eigen::array<DimPair, 1> dims({{DimPair(0, 0)}});
+  TensorEvaluator<decltype(vec1.contract(vec2, dims))> eval(vec1.contract(vec2, dims));
+  eval.evalTo(scalar.data());
+  EIGEN_STATIC_ASSERT(TensorEvaluator<decltype(vec1.contract(vec2, dims))>::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  float expected = 0.0f;
+  for (int i = 0; i < 6; ++i) {  // reference value: sum of element-wise products
+    expected += vec1(i) * vec2(i);
+  }
+  VERIFY_IS_APPROX(scalar(0), expected);
+}
+
+
+static void test_multidims()  // contracts a rank-3 tensor with a rank-4 tensor over two index pairs at once
+{
+  Tensor<float, 3> mat1(2, 2, 2);
+  Tensor<float, 4> mat2(2, 2, 2, 2);
+
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 3> mat3(2, 2, 2);
+  mat3.setZero();
+  Eigen::array<DimPair, 2> dims({{DimPair(1, 2), DimPair(2, 3)}});  // mat1 indices 1,2 are summed against mat2 indices 2,3
+  TensorEvaluator<decltype(mat1.contract(mat2, dims))> eval(mat1.contract(mat2, dims));
+  eval.evalTo(mat3.data());
+  EIGEN_STATIC_ASSERT(TensorEvaluator<decltype(mat1.contract(mat2, dims))>::NumDims==3ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval.dimensions()[1], 2);
+  VERIFY_IS_EQUAL(eval.dimensions()[2], 2);
+
+  VERIFY_IS_APPROX(mat3(0,0,0), mat1(0,0,0)*mat2(0,0,0,0) + mat1(0,1,0)*mat2(0,0,1,0) +  // mat3(a,b,c) = sum over p,q of mat1(a,p,q)*mat2(b,c,p,q)
+                                mat1(0,0,1)*mat2(0,0,0,1) + mat1(0,1,1)*mat2(0,0,1,1));
+  VERIFY_IS_APPROX(mat3(0,0,1), mat1(0,0,0)*mat2(0,1,0,0) + mat1(0,1,0)*mat2(0,1,1,0) +
+                                mat1(0,0,1)*mat2(0,1,0,1) + mat1(0,1,1)*mat2(0,1,1,1));
+  VERIFY_IS_APPROX(mat3(0,1,0), mat1(0,0,0)*mat2(1,0,0,0) + mat1(0,1,0)*mat2(1,0,1,0) +
+                                mat1(0,0,1)*mat2(1,0,0,1) + mat1(0,1,1)*mat2(1,0,1,1));
+  VERIFY_IS_APPROX(mat3(0,1,1), mat1(0,0,0)*mat2(1,1,0,0) + mat1(0,1,0)*mat2(1,1,1,0) +
+                                mat1(0,0,1)*mat2(1,1,0,1) + mat1(0,1,1)*mat2(1,1,1,1));
+  VERIFY_IS_APPROX(mat3(1,0,0), mat1(1,0,0)*mat2(0,0,0,0) + mat1(1,1,0)*mat2(0,0,1,0) +
+                                mat1(1,0,1)*mat2(0,0,0,1) + mat1(1,1,1)*mat2(0,0,1,1));
+  VERIFY_IS_APPROX(mat3(1,0,1), mat1(1,0,0)*mat2(0,1,0,0) + mat1(1,1,0)*mat2(0,1,1,0) +
+                                mat1(1,0,1)*mat2(0,1,0,1) + mat1(1,1,1)*mat2(0,1,1,1));
+  VERIFY_IS_APPROX(mat3(1,1,0), mat1(1,0,0)*mat2(1,0,0,0) + mat1(1,1,0)*mat2(1,0,1,0) +
+                                mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1));
+  VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) +
+                                mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1));
+}
+
+
+static void test_expr()  // same 2x3 by 3x2 contraction as in test_evals, but evaluated through plain tensor assignment
+{
+  Tensor<float, 2> mat1(2, 3);
+  Tensor<float, 2> mat2(3, 2);
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 2> mat3(2,2);
+
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});  // sum over index 1 of mat1 and index 0 of mat2
+  mat3 = mat1.contract(mat2, dims);  // no explicit TensorEvaluator here: the assignment evaluates the expression
+
+  VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1));
+  VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
+}
+
+
+void test_cxx11_tensor_contraction()  // test entry point: runs every contraction subtest defined above
+{
+  CALL_SUBTEST(test_evals());
+  CALL_SUBTEST(test_scalar());
+  CALL_SUBTEST(test_multidims());
+  CALL_SUBTEST(test_expr());
+}
diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp
index 9eb1d0420..365b109c7 100644
--- a/unsupported/test/cxx11_tensor_device.cpp
+++ b/unsupported/test/cxx11_tensor_device.cpp
@@ -15,7 +15,7 @@
 
 
 #include "main.h"
-#include <Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
 using Eigen::RowMajor;
@@ -39,8 +39,12 @@ struct CPUContext {
 
 // Context for evaluation on GPU
 struct GPUContext {
-  GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out) { }
-
+  GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
+    cudaStreamCreate(&stream_);
+  }
+  ~GPUContext() {
+    cudaStreamDestroy(stream_);
+  }
   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
   Eigen::TensorDevice<Eigen::TensorMap<Eigen::Tensor<float, 3> >, Eigen::GpuDevice> out() { return TensorDevice<Eigen::TensorMap<Eigen::Tensor<float, 3> >, Eigen::GpuDevice>(gpu_device_, out_); }
@@ -49,6 +53,7 @@ struct GPUContext {
   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
   Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
+  cudaStream_t stream_;
   Eigen::GpuDevice gpu_device_;
 };
 
@@ -57,7 +62,7 @@ struct GPUContext {
 template <typename Context>
 static void test_contextual_eval(Context* context)
 {
-  context->out() = context->in1() + context->in2() * 3.14f;
+  context->out() = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
 }
 
 static void test_cpu() {
@@ -73,7 +78,7 @@ static void test_cpu() {
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f);
+        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
       }
     }
   }
@@ -111,7 +116,7 @@ static void test_gpu() {
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f);
+        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
       }
     }
   }
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index e0124da8c..e85fcbfa9 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -28,10 +28,10 @@ static void test_1d()
 
   float data3[6];
   TensorMap<Tensor<float, 1>> vec3(data3, 6);
-  vec3 = vec1.cwiseSqrt();
+  vec3 = vec1.sqrt();
   float data4[6];
   TensorMap<Tensor<float, 1, RowMajor>> vec4(data4, 6);
-  vec4 = vec2.cwiseSqrt();
+  vec4 = vec2.square();
 
   VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
   VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
@@ -40,12 +40,12 @@ static void test_1d()
   VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
   VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
 
-  VERIFY_IS_APPROX(vec4(0), sqrtf(0.0));
-  VERIFY_IS_APPROX(vec4(1), sqrtf(1.0));
-  VERIFY_IS_APPROX(vec4(2), sqrtf(2.0));
-  VERIFY_IS_APPROX(vec4(3), sqrtf(3.0));
-  VERIFY_IS_APPROX(vec4(4), sqrtf(4.0));
-  VERIFY_IS_APPROX(vec4(5), sqrtf(5.0));
+  VERIFY_IS_APPROX(vec4(0), 0.0f);
+  VERIFY_IS_APPROX(vec4(1), 1.0f);
+  VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f);
+  VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f);
+  VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f);
+  VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f);
 
   vec3 = vec1 + vec2;
   VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f);
@@ -79,8 +79,8 @@ static void test_2d()
 
   Tensor<float, 2> mat3(2,3);
   Tensor<float, 2, RowMajor> mat4(2,3);
-  mat3 = mat1.cwiseAbs();
-  mat4 = mat2.cwiseAbs();
+  mat3 = mat1.abs();
+  mat4 = mat2.abs();
 
   VERIFY_IS_APPROX(mat3(0,0), 0.0f);
   VERIFY_IS_APPROX(mat3(0,1), 1.0f);
@@ -102,7 +102,7 @@ static void test_3d()
   Tensor<float, 3> mat1(2,3,7);
   Tensor<float, 3, RowMajor> mat2(2,3,7);
 
-  float val = 0.0;
+  float val = 1.0;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
@@ -118,28 +118,147 @@ static void test_3d()
   Tensor<float, 3, RowMajor> mat4(2,3,7);
   mat4 = mat2 * 3.14f;
   Tensor<float, 3> mat5(2,3,7);
-  mat5 = mat1.cwiseSqrt().cwiseSqrt();
+  mat5 = mat1.inverse().log();
   Tensor<float, 3, RowMajor> mat6(2,3,7);
-  mat6 = mat2.cwiseSqrt() * 3.14f;
+  mat6 = mat2.pow(0.5f) * 3.14f;
+  Tensor<float, 3> mat7(2,3,7);
+  mat7 = mat1.cwiseMax(mat5 * 2.0f).exp();
+  Tensor<float, 3, RowMajor> mat8(2,3,7);
+  mat8 = (-mat2).exp() * 3.14f;
 
-  val = 0.0;
+  val = 1.0;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
         VERIFY_IS_APPROX(mat3(i,j,k), val + val);
         VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f);
-        VERIFY_IS_APPROX(mat5(i,j,k), sqrtf(sqrtf(val)));
+        VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/val));
         VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f);
+        VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f)));
+        VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f);
         val += 1.0;
       }
     }
   }
 }
 
+static void test_constants()  // checks constant() and cwiseMax() with a scalar argument
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> mat3(2,3,7);
+
+  float val = 1.0;
+  for (int i = 0; i < 2; ++i) {  // fill mat1 with 1, 2, 3, ... in loop order
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        val += 1.0;
+      }
+    }
+  }
+  mat2 = mat1.constant(3.14f);  // expected below to be 3.14f in every coefficient
+  mat3 = mat1.cwiseMax(7.3f).exp();  // expected below to be exp(max(mat1, 7.3f)) per coefficient
+
+  val = 1.0;  // replay the fill sequence to recompute the expected values
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat2(i,j,k), 3.14f);
+        VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f)));
+        val += 1.0;
+      }
+    }
+  }
+}
+
+
+static void test_functors()  // checks unaryExpr() with plain C function pointers (asinf, tanhf)
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> mat3(2,3,7);
+
+  float val = 1.0;
+  for (int i = 0; i < 2; ++i) {  // fill mat1 with 1, 2, 3, ... in loop order
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        val += 1.0;
+      }
+    }
+  }
+  mat2 = mat1.inverse().unaryExpr(&asinf);  // 1/x stays within asinf's domain because every input is >= 1
+  mat3 = mat1.unaryExpr(&tanhf);
+
+  val = 1.0;  // replay the fill sequence so the expected values do not depend on reading mat1 back
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / val));
+        VERIFY_IS_APPROX(mat3(i,j,k), tanhf(val));
+        val += 1.0;
+      }
+    }
+  }
+}
+
+static void test_type_casting()  // checks cast<double>() from a bool tensor and from a float tensor
+{
+  Tensor<bool, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<double, 3> mat3(2,3,7);
+  mat1.setRandom();
+  mat2.setRandom();
+
+  mat3 = mat1.template cast<double>();  // bool -> double: true must become 1.0 and false 0.0
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) ? 1.0 : 0.0);
+      }
+    }
+  }
+
+  mat3 = mat2.template cast<double>();  // float -> double: must match a scalar static_cast
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), static_cast<double>(mat2(i,j,k)));
+      }
+    }
+  }
+}
+
+static void test_select()  // checks select(): a coefficient-wise choice between two tensors driven by a comparison
+{
+  Tensor<float, 3> selector(2,3,7);
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> result(2,3,7);
+
+  selector.setRandom();
+  mat1.setRandom();
+  mat2.setRandom();
+  result = (selector > selector.constant(0.5f)).select(mat1, mat2);  // mat1 where selector > 0.5f, mat2 elsewhere
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(result(i,j,k), (selector(i,j,k) > 0.5f) ? mat1(i,j,k) : mat2(i,j,k));
+      }
+    }
+  }
+}
+
 
 void test_cxx11_tensor_expr()
 {
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
   CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_constants());
+  CALL_SUBTEST(test_functors());
+  CALL_SUBTEST(test_type_casting());
+  CALL_SUBTEST(test_select());
 }
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index 214f6951d..d270486f2 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -33,10 +33,10 @@ static void test_1d()
 
   float data3[6];
   TensorMap<TensorFixedSize<float, Sizes<6> > > vec3(data3, 6);
-  vec3 = vec1.cwiseSqrt();
+  vec3 = vec1.sqrt();
   float data4[6];
   TensorMap<TensorFixedSize<float, Sizes<6>, RowMajor> > vec4(data4, 6);
-  vec4 = vec2.cwiseSqrt();
+  vec4 = vec2.sqrt();
 
   VERIFY_IS_EQUAL((vec3.size()), 6);
   //  VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6);
@@ -92,8 +92,8 @@ static void test_2d()
 
   TensorFixedSize<float, Sizes<2, 3>> mat3;
   TensorFixedSize<float, Sizes<2, 3>, RowMajor> mat4;
-  mat3 = mat1.cwiseAbs();
-  mat4 = mat2.cwiseAbs();
+  mat3 = mat1.abs();
+  mat4 = mat2.abs();
 
   VERIFY_IS_EQUAL((mat3.size()), 2*3);
     //  VERIFY_IS_EQUAL((mat3.dimension(0)), 2);
@@ -136,9 +136,9 @@ static void test_3d()
   }
 
   TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
-  mat3 = mat1.cwiseSqrt();
+  mat3 = mat1.sqrt();
   TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat4;
-  mat4 = mat2.cwiseSqrt();
+  mat4 = mat2.sqrt();
 
   VERIFY_IS_EQUAL((mat3.size()), 2*3*7);
   //  VERIFY_IS_EQUAL((mat3.dimension(0)), 2);
@@ -173,7 +173,7 @@ static void test_array()
   }
 
   TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
-  mat3 = mat1.cwisePow(3.5f);
+  mat3 = mat1.pow(3.5f);
 
   val = 0.0;
   for (int i = 0; i < 2; ++i) {
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index c9de71da3..b371e8a71 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -12,6 +12,7 @@
 
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
+#include "thread/threadpool.h"
 
 using Eigen::Tensor;
 
@@ -24,8 +25,10 @@ void test_cxx11_tensor_thread_pool()
   in1.setRandom();
   in2.setRandom();
 
-  Eigen::ThreadPoolDevice thread_pool_device(3);
-  out.device(thread_pool_device) = in1 + in2 * 3.14;
+  ThreadPool thread_pool(2);
+  thread_pool.StartWorkers();
+  Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, 3);
+  out.device(thread_pool_device) = in1 + in2 * 3.14f;
 
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
-- 
cgit v1.2.3


From fe102248ac8f78e33064caeb5cdea6fc41af637c Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 9 Jun 2014 09:19:21 -0700
Subject: Fixed the threadpool test

---
 unsupported/test/cxx11_tensor_thread_pool.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'unsupported/test/cxx11_tensor_thread_pool.cpp')

diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index b371e8a71..2e67b2064 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -12,7 +12,6 @@
 
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
-#include "thread/threadpool.h"
 
 using Eigen::Tensor;
 
@@ -25,9 +24,7 @@ void test_cxx11_tensor_thread_pool()
   in1.setRandom();
   in2.setRandom();
 
-  ThreadPool thread_pool(2);
-  thread_pool.StartWorkers();
-  Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, 3);
+  Eigen::ThreadPoolDevice thread_pool_device(3);
   out.device(thread_pool_device) = in1 + in2 * 3.14f;
 
   for (int i = 0; i < 2; ++i) {
-- 
cgit v1.2.3


From a991f94c0e5c51555875564ce58681a82d07cd69 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 10 Oct 2014 15:20:37 -0700
Subject: Fixed the thread pool test

---
 test/main.h                                         | 4 ++--
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +-
 unsupported/test/CMakeLists.txt                     | 2 +-
 unsupported/test/cxx11_tensor_thread_pool.cpp       | 8 ++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'unsupported/test/cxx11_tensor_thread_pool.cpp')

diff --git a/test/main.h b/test/main.h
index b504970f3..9cb41c828 100644
--- a/test/main.h
+++ b/test/main.h
@@ -47,8 +47,8 @@
 // protected by parenthesis against macro expansion, the min()/max() macros
 // are defined here and any not-parenthesized min/max call will cause a
 // compiler error.
-#define min(A,B) please_protect_your_min_with_parentheses
-#define max(A,B) please_protect_your_max_with_parentheses
+//#define min(A,B) please_protect_your_min_with_parentheses
+//#define max(A,B) please_protect_your_max_with_parentheses
 
 #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
 // B0 is defined in POSIX header termios.h
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index faf965df8..84768ca09 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -131,7 +131,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
       const Index numblocks = size / blocksize;
 
       Index i = 0;
-      vector<std::future<void> > results;
+      std::vector<std::future<void> > results;
       results.reserve(numblocks);
       for (int i = 0; i < numblocks; ++i) {
          results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 75423f516..1c4d0838a 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -122,5 +122,5 @@ if(EIGEN_TEST_CXX11)
 #  ei_add_test(cxx11_tensor_shuffling "-std=c++0x")
   ei_add_test(cxx11_tensor_striding "-std=c++0x")
 #  ei_add_test(cxx11_tensor_device  "-std=c++0x")
-#  ei_add_test(cxx11_tensor_thread_pool "-std=c++0x")
+  ei_add_test(cxx11_tensor_thread_pool "-std=c++0x")
 endif()
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index 2e67b2064..e02d8e4be 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -17,9 +17,9 @@ using Eigen::Tensor;
 
 void test_cxx11_tensor_thread_pool()
 {
-  Eigen::Tensor<float, 3> in1(Eigen::array<ptrdiff_t, 3>(2,3,7));
-  Eigen::Tensor<float, 3> in2(Eigen::array<ptrdiff_t, 3>(2,3,7));
-  Eigen::Tensor<float, 3> out(Eigen::array<ptrdiff_t, 3>(2,3,7));
+  Eigen::Tensor<float, 3> in1(2,3,7);
+  Eigen::Tensor<float, 3> in2(2,3,7);
+  Eigen::Tensor<float, 3> out(2,3,7);
 
   in1.setRandom();
   in2.setRandom();
@@ -30,7 +30,7 @@ void test_cxx11_tensor_thread_pool()
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<ptrdiff_t, 3>(i,j,k)), in1(Eigen::array<ptrdiff_t, 3>(i,j,k)) + in2(Eigen::array<ptrdiff_t, 3>(i,j,k)) * 3.14f);
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
       }
     }
   }
-- 
cgit v1.2.3


From 99d75235a9567865d2c070a2840d54c8a5ad0f43 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 13 Oct 2014 17:02:09 -0700
Subject: Misc improvements and cleanups

---
 Eigen/src/Core/GenericPacketMath.h                 |  15 +-
 unsupported/Eigen/CXX11/Tensor                     |   4 +
 .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h   |   5 +
 .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h   | 101 ++++++++-
 unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h  |   4 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h    |   8 +-
 .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h    |   8 +-
 .../Eigen/CXX11/src/Tensor/TensorConvolution.h     |  12 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h  |  35 ++++
 .../Eigen/CXX11/src/Tensor/TensorDeviceType.h      |  73 ++++---
 .../Eigen/CXX11/src/Tensor/TensorDimensions.h      |   2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h  |   2 +-
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h       |  20 +-
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h        |  36 +++-
 .../Eigen/CXX11/src/Tensor/TensorFixedSize.h       |   4 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h  |  26 ++-
 unsupported/Eigen/CXX11/src/Tensor/TensorMap.h     |  22 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h |   4 +-
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h       |   4 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h |   9 +-
 .../Eigen/CXX11/src/Tensor/TensorStriding.h        |  61 ++++--
 unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h  |  32 +--
 unsupported/test/CMakeLists.txt                    |   1 +
 unsupported/test/cxx11_tensor_assign.cpp           |  35 +++-
 unsupported/test/cxx11_tensor_convolution.cpp      |  70 +++++++
 unsupported/test/cxx11_tensor_device.cpp           |  27 +++
 unsupported/test/cxx11_tensor_morphing.cpp         |   5 +-
 unsupported/test/cxx11_tensor_of_complex.cpp       |  64 ++++++
 unsupported/test/cxx11_tensor_thread_pool.cpp      | 232 ++++++++++++++++++++-
 29 files changed, 780 insertions(+), 141 deletions(-)
 create mode 100644 unsupported/test/cxx11_tensor_of_complex.cpp

(limited to 'unsupported/test/cxx11_tensor_thread_pool.cpp')

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index e6fea5bba..3ef3475c7 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -359,7 +359,7 @@ pmadd(const Packet&  a,
 /** \internal \returns a packet version of \a *from.
   * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
 template<typename Packet, int LoadMode>
-inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
 {
   if(LoadMode == Aligned)
     return pload<Packet>(from);
@@ -370,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
 /** \internal copy the packet \a from to \a *to.
   * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
 template<typename Scalar, typename Packet, int LoadMode>
-inline void pstoret(Scalar* to, const Packet& from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
 {
   if(LoadMode == Aligned)
     pstore(to, from);
@@ -378,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from)
     pstoreu(to, from);
 }
 
+/** \internal \returns a packet version of \a *from.
+  * Meant for data that the current computation does not modify, so that
+  * specializations can speed up the load through a read-only memory path where
+  * the hardware has one. This generic version simply forwards to ploadt.
+  */
+template<typename Packet, int LoadMode>
+EIGEN_DEVICE_FUNC inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+{
+  return ploadt<Packet, LoadMode>(from);
+}
+
 /** \internal default implementation of palign() allowing partial specialization */
 template<int Offset,typename PacketType>
 struct palign_impl
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 0dac95e45..2137f4276 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -30,6 +30,10 @@
 #include <cstring>
 #include <stdint.h>
 
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
+
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 #include <curand_kernel.h>
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
index 227522ecb..e30eb6ad8 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
@@ -66,6 +66,11 @@ template<std::size_t I, class T, std::size_t N> constexpr inline T&       array_
 template<std::size_t I, class T, std::size_t N> constexpr inline T&&      array_get(std::array<T,N>&&      a) { return (T&&)      STD_GET_ARR_HACK; }
 template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
 
+template<std::size_t I, class T> constexpr inline T&       array_get(std::vector<T>&       a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T&&      array_get(std::vector<T>&&      a) { return static_cast<T&&>(a[I]); }  // a[I] is an lvalue; cast so it binds to T&&
+template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; }
+
+
 #undef STD_GET_ARR_HACK
 
 template <typename T> struct array_size;
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
index 4c6b95773..e45d0a3b1 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
@@ -48,7 +48,8 @@ template <typename T, size_t n> class array {
     values[2] = v3;
   }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4) {
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
+                            const T& v4) {
     EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
     values[0] = v1;
     values[1] = v2;
@@ -56,7 +57,8 @@ template <typename T, size_t n> class array {
     values[3] = v4;
   }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5) {
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5) {
     EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
     values[0] = v1;
     values[1] = v2;
@@ -64,6 +66,43 @@ template <typename T, size_t n> class array {
     values[3] = v4;
     values[4] = v5;
   }
+  // Six-coefficient constructor: values[i] is taken from the (i+1)-th
+  // argument. The static assertion restricts it to arrays declared with
+  // n == 6.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array(const T& v1, const T& v2, const T& v3,
+        const T& v4, const T& v5, const T& v6) {
+    EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;  values[1] = v2;
+    values[2] = v3;  values[3] = v4;
+    values[4] = v5;  values[5] = v6;
+  }
+  // Seven-coefficient constructor: values[i] is taken from the (i+1)-th
+  // argument. The static assertion restricts it to arrays declared with
+  // n == 7.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array(const T& v1, const T& v2, const T& v3, const T& v4,
+        const T& v5, const T& v6, const T& v7) {
+    EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;  values[1] = v2;
+    values[2] = v3;  values[3] = v4;
+    values[4] = v5;  values[5] = v6;
+    values[6] = v7;
+  }
+  // Eight-coefficient constructor: values[i] is taken from the (i+1)-th
+  // argument. The static assertion restricts it to arrays declared with
+  // n == 8.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array(const T& v1, const T& v2,
+        const T& v3, const T& v4,
+        const T& v5, const T& v6,
+        const T& v7, const T& v8) {
+    EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;  values[1] = v2;
+    values[2] = v3;  values[3] = v4;
+    values[4] = v5;  values[5] = v6;
+    values[6] = v7;  values[7] = v8;
+  }
 
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
   array(std::initializer_list<T> l) {
@@ -93,9 +132,11 @@ template<typename T, typename Tail=empty_list> struct type_list {
 
 struct null_type { };
 
-template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type, typename T4 = null_type, typename T5 = null_type>
+template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type,
+         typename T4 = null_type, typename T5 = null_type, typename T6 = null_type,
+         typename T7 = null_type, typename T8 = null_type>
 struct make_type_list {
-  typedef typename make_type_list<T2, T3, T4, T5>::type tailresult;
+  typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult;
 
   typedef type_list<T1, tailresult> type;
 };
@@ -150,6 +191,23 @@ template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> {
   typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
 };
 
+template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> {  // type list of six type2val<T, V> entries
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> {  // type list of seven type2val<T, V> entries
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> {  // type list of eight type2val<T, V> entries
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V> >::type type;
+};
+
 
 template <std::size_t index, class NList> struct get;
 
@@ -174,6 +232,7 @@ template <> struct arg_prod<empty_list> {
   static const int value = 1;
 };
 
+
 template<int n, typename t>
 array<t, n> repeat(t v) {
   array<t, n> array;
@@ -190,6 +249,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_l
   return get<I, type_list<Head, Tail> >::value;
 }
 
+template <class NList>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& /*l*/) {
+  return arg_prod<NList>::value;  // result comes from the list type alone; the argument only drives deduction
+}
+
 template<std::size_t n, typename t>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
   t prod = 1;
@@ -201,6 +265,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) {
   return 0;
 }
 
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+  eigen_assert(a.size() > 0);  // callers must not pass an empty vector
+  t result = 1;
+  for (typename std::vector<t>::const_iterator it = a.begin(); it != a.end(); ++it) { result *= *it; }
+  return result;  // product of every element of a
+}
+
 template<std::size_t I, class T, std::size_t N>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
   return a[I];
@@ -210,12 +282,31 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
   return a[I];
 }
 
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) {
+  return *(a.begin() + I);  // mutable reference to element I; no bounds check
+}
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) {
+  return *(a.begin() + I);  // read-only reference to element I; no bounds check
+}
 
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<array<T,N> > {
+  static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<array<T,N>& > {
+  static const size_t value = N;
+};
 template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<const array<T,N> > {
   static const size_t value = N;
 };
-
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const array<T,N>& > {
+  static const size_t value = N;
+};
 
 struct sum_op {
   template<typename A, typename B> static inline bool run(A a, B b) { return a + b; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 3bfe80c9e..e973c00d3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -131,8 +131,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
-    static const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
-    static const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
+    const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
+    const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
     m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
   }
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 27c10f64f..6018ecc66 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -30,6 +30,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
     typedef Scalar CoeffReturnType;
     typedef typename internal::packet_traits<Scalar>::type PacketReturnType;
 
+    // Dimensions
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const  // n-th entry of derived().dimensions()
+    { return this->derived().dimensions()[n]; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const  // array_prod over derived().dimensions()
+    { return internal::array_prod(this->derived().dimensions()); }
+
     // Nullary operators
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived>
@@ -187,7 +193,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     // Contractions.
-    typedef std::pair<Index, Index> DimensionPair;
+    typedef Eigen::IndexPair<Index> DimensionPair;
 
     template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorContractionOp<const Dimensions, const Derived, const OtherDerived>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 3b2a9c8b9..0e55d4de1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -48,7 +48,7 @@ struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorB
 
 
 template<typename Broadcast, typename XprType>
-class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, WriteAccessors>
+class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
@@ -91,7 +91,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
   };
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_impl(op.expression(), device)
   {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -141,7 +141,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
 
@@ -161,7 +161,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D
     if (innermostLoc + packetSize <= m_impl.dimensions()[0]) {
       return m_impl.template packet<Unaligned>(inputIndex);
     } else {
-      EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
       values[0] = m_impl.coeff(inputIndex);
       for (int i = 1; i < packetSize; ++i) {
         values[i] = coeff(originalIndex+i);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 4a5fd9c79..34bdd5309 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -872,11 +872,19 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    assert(m_buf);
-    assert(index < m_dimensions.TotalSize());
+    eigen_assert(m_buf);
+    eigen_assert(index < m_dimensions.TotalSize());
     return m_buf[index];
   }
 
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
+  {
+    eigen_assert(m_buf);
+    eigen_assert(index + internal::unpacket_traits<PacketReturnType>::size - 1 < m_dimensions.TotalSize());  // the whole packet, not just its first coefficient, must lie inside the buffer
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
+  }
+
  private:
   // No assignment (copies are needed by the kernels)
   TensorEvaluator& operator = (const TensorEvaluator&);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 75519c9f5..649bdb308 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -38,6 +38,18 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {  // assigns m_expression + other back to m_expression on m_device
+      typedef internal::scalar_sum_op<typename OtherDerived::Scalar> AddOp;
+      typedef TensorCwiseBinaryOp<AddOp, const ExpressionType, const OtherDerived> SumExpr;
+      typedef TensorAssignOp<ExpressionType, const SumExpr> AssignExpr;
+      SumExpr rhs(m_expression, other);
+      AssignExpr update(m_expression, rhs);
+      static const bool UsePackets = TensorEvaluator<const AssignExpr, DeviceType>::PacketAccess;
+      internal::TensorExecutor<const AssignExpr, DeviceType, UsePackets>::run(update, m_device);
+      return *this;
+    }
+
   protected:
     const DeviceType& m_device;
     ExpressionType& m_expression;
@@ -58,6 +70,18 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {  // assigns m_expression + other back to m_expression on the thread-pool device
+      typedef internal::scalar_sum_op<typename OtherDerived::Scalar> AddOp;
+      typedef TensorCwiseBinaryOp<AddOp, const ExpressionType, const OtherDerived> SumExpr;
+      typedef TensorAssignOp<ExpressionType, const SumExpr> AssignExpr;
+      SumExpr rhs(m_expression, other);
+      AssignExpr update(m_expression, rhs);
+      static const bool UsePackets = TensorEvaluator<const AssignExpr, ThreadPoolDevice>::PacketAccess;
+      internal::TensorExecutor<const AssignExpr, ThreadPoolDevice, UsePackets>::run(update, m_device);
+      return *this;
+    }
+
   protected:
     const ThreadPoolDevice& m_device;
     ExpressionType& m_expression;
@@ -79,6 +103,17 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {  // assigns m_expression + other back to m_expression on the GPU device
+      typedef internal::scalar_sum_op<typename OtherDerived::Scalar> AddOp;
+      typedef TensorCwiseBinaryOp<AddOp, const ExpressionType, const OtherDerived> SumExpr;
+      typedef TensorAssignOp<ExpressionType, const SumExpr> AssignExpr;
+      SumExpr rhs(m_expression, other);
+      AssignExpr update(m_expression, rhs);
+      internal::TensorExecutor<const AssignExpr, GpuDevice, false>::run(update, m_device);  // third argument false: packet path not requested
+      return *this;
+    }
+
   protected:
     const GpuDevice& m_device;
     ExpressionType m_expression;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index fad342eab..5a6ff70e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -37,23 +37,41 @@ struct DefaultDevice {
 // Multiple cpu cores
 // We should really use a thread pool here but first we need to find a portable thread pool library.
 #ifdef EIGEN_USE_THREADS
+
+typedef std::future<void> Future;
+
 struct ThreadPoolDevice {
-  ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { }
-  size_t numThreads() const { return num_threads_; }
+  ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
   }
+
   EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     internal::aligned_free(buffer);
   }
+
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
     ::memcpy(dst, src, n);
   }
+
   EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
     ::memset(buffer, c, n);
   }
 
+  EIGEN_STRONG_INLINE size_t numThreads() const {
+    return num_threads_;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const {  // launches f(args...) asynchronously; the caller waits on the returned future
+    return std::async(std::launch::async, static_cast<Function&&>(f), static_cast<Args&&>(args)...);  // static_cast<X&&> == std::forward<X>: rvalues are moved, not copied
+  }
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... args) const {  // NOTE: the discarded future's destructor waits for the task, so this call blocks until f returns
+    std::async(std::launch::async, static_cast<Function&&>(f), static_cast<Args&&>(args)...);  // static_cast<X&&> == std::forward<X>: rvalues are moved, not copied
+  }
+
  private:
   // todo: NUMA, ...
   size_t num_threads_;
@@ -63,41 +81,34 @@ struct ThreadPoolDevice {
 
 // GPU offloading
 #ifdef EIGEN_USE_GPU
-static int m_numMultiProcessors = 0;
-static int m_maxThreadsPerBlock = 0;
-static int m_maxThreadsPerMultiProcessor = 0;
+static cudaDeviceProp m_deviceProperties;
+static bool m_devicePropInitialized = false;
+
+static void initializeDeviceProp() {  // fills m_deviceProperties for device 0 on first use
+  if (!m_devicePropInitialized) {
+    m_devicePropInitialized = (cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess);  // kept outside assert(): NDEBUG would remove the call itself
+    assert(m_devicePropInitialized);  // a failed query leaves the flag false, so the next call retries
+  }
+}
 
 static inline int getNumCudaMultiProcessors() {
-  if (m_numMultiProcessors == 0) {
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, 0);
-    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
-    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
-    m_numMultiProcessors = deviceProp.multiProcessorCount;
-  }
-  return m_numMultiProcessors;
+  initializeDeviceProp();
+  return m_deviceProperties.multiProcessorCount;
 }
 static inline int maxCudaThreadsPerBlock() {
-  if (m_maxThreadsPerBlock == 0) {
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, 0);
-    m_numMultiProcessors = deviceProp.multiProcessorCount;
-    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
-    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
-  }
-  return m_maxThreadsPerBlock;
+  initializeDeviceProp();
+  return m_deviceProperties.maxThreadsPerBlock;
 }
 static inline int maxCudaThreadsPerMultiProcessor() {
-  if (m_maxThreadsPerBlock == 0) {
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, 0);
-    m_numMultiProcessors = deviceProp.multiProcessorCount;
-    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
-    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
-  }
-  return m_maxThreadsPerMultiProcessor;
+  initializeDeviceProp();
+  return m_deviceProperties.maxThreadsPerMultiProcessor;
+}
+static inline int sharedMemPerBlock() {  // cached sharedMemPerBlock property, converted to int
+  initializeDeviceProp();
+  return static_cast<int>(m_deviceProperties.sharedMemPerBlock);
 }
 
+
 struct GpuDevice {
   // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction.
   GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); }
@@ -141,8 +152,8 @@ struct GpuDevice {
 #endif
   }
 
-  EIGEN_STRONG_INLINE size_t numThreads() const {
-    // Fixme:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+    // FIXME
     return 32;
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 732c6b344..2dd8e274b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -29,7 +29,7 @@ namespace Eigen {
   * \sa Tensor
   */
 
-// Can't use std::pairs on cuda devices
+// Can't use std::pair on cuda devices
 template <typename Index> struct IndexPair {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { }
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 587cbd5ca..ce9d73578 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -116,7 +116,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
     m_buffer[i] = m_impl.coeff(i);
   }
-  EIGEN_STRONG_INLINE void evalPacket(Index i) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
     internal::pstoret<Scalar, Packet, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 0f969036c..e324ba8d2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -65,13 +65,13 @@ struct TensorEvaluator
     return m_data[index];
   }
 
-  template<int LoadMode> EIGEN_STRONG_INLINE
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketReturnType packet(Index index) const
   {
     return internal::ploadt<Packet, LoadMode>(m_data + index);
   }
 
-  template <int StoreMode> EIGEN_STRONG_INLINE
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const Packet& x)
   {
     return internal::pstoret<Scalar, Packet, StoreMode>(m_data + index, x);
@@ -113,13 +113,17 @@ struct TensorEvaluator<const Derived, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     eigen_assert(m_data);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350  // __ldg() requires compute capability 3.5 or higher
+    return __ldg(m_data+index);
+#else
     return m_data[index];
+#endif
   }
 
-  template<int LoadMode> EIGEN_STRONG_INLINE
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketReturnType packet(Index index) const
   {
-    return internal::ploadt<Packet, LoadMode>(m_data + index);
+    return internal::ploadt_ro<Packet, LoadMode>(m_data + index);
   }
 
   const Scalar* data() const { return m_data; }
@@ -166,7 +170,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   }
 
   template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     return m_functor.packetOp(index);
   }
@@ -219,7 +223,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
   }
 
   template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
   }
@@ -278,7 +282,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
     return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index));
   }
   template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
   }
@@ -340,7 +344,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
     return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
   }
   template<int LoadMode>
-  PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
   {
     static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
     internal::Selector<PacketSize> select;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 10f5a5ee7..01fa04c64 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -10,10 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
 
-#ifdef EIGEN_USE_THREADS
-#include <future>
-#endif
-
 namespace Eigen {
 
 /** \class TensorExecutor
@@ -62,7 +58,7 @@ class TensorExecutor<Expression, DefaultDevice, true>
     {
       const Index size = array_prod(evaluator.dimensions());
       static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-      const int VectorizedSize = (size / PacketSize) * PacketSize;
+      const Index VectorizedSize = (size / PacketSize) * PacketSize;
 
       for (Index i = 0; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
@@ -131,10 +127,10 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
       const Index numblocks = size / blocksize;
 
       Index i = 0;
-      std::vector<std::future<void> > results;
+      std::vector<Future> results;
       results.reserve(numblocks);
       for (int i = 0; i < numblocks; ++i) {
-         results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
+        results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
       }
 
       for (int i = 0; i < numblocks; ++i) {
@@ -154,11 +150,31 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 template <typename Evaluator>
-__global__ void EigenMetaKernel(Evaluator eval, unsigned int size) {
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel(Evaluator eval, unsigned int size) {
+  // Each thread starts at its global index and advances by the total number of launched threads.
   const int first_index = blockIdx.x * blockDim.x + threadIdx.x;
   const int step_size = blockDim.x * gridDim.x;
-  for (int i = first_index; i < size; i += step_size) {
-    eval.evalScalar(i);
+
+  if (!Evaluator::PacketAccess || !Evaluator::IsAligned) {
+    // Use the scalar path
+    for (int i = first_index; i < size; i += step_size) {
+      eval.evalScalar(i);
+    }
+  }
+  else {
+    // Use the vector path
+    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const int vectorized_step_size = step_size * PacketSize;
+    const int vectorized_size = (size / PacketSize) * PacketSize;
+    int i = first_index * PacketSize;
+    for ( ; i < vectorized_size; i += vectorized_step_size) {
+      eval.evalPacket(i);
+    }
+    for (i = vectorized_size + first_index; i < size; i += step_size) {  // restart at the tail: the packet loop leaves i on a packet boundary past it, which skipped coefficients
+      eval.evalScalar(i);
+    }
   }
 }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index 4d7f9e1fd..a753c5a48 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -17,7 +17,7 @@ namespace Eigen {
   *
   * \brief The fixed sized version of the tensor class.
   *
-  * The fixes sized equivalent of 
+  * The fixed sized equivalent of
   * Eigen::Tensor<float, 3> t(3, 5, 7);
   * is
   * Eigen::TensorFixedSize<float, Size<3,5,7>> t;
@@ -41,7 +41,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
 
     enum {
       IsAligned = bool(EIGEN_ALIGN),
-      PacketAccess = true,
+      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
     };
 
   typedef Dimensions_ Dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index cf97031be..2714117ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -31,30 +31,34 @@ namespace internal {
 template <typename T>
 struct TensorIntDivisor {
  public:
-  TensorIntDivisor() {
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
     multiplier = 0;
     shift1 = 0;
     shift2 = 0;
   }
 
   // Must have 1 <= divider <= 2^31-1
-  TensorIntDivisor(const T divider) {
-    static const int N = 32;
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
+    const int N = 32;
     eigen_assert(divider > 0);
     eigen_assert(divider <= (1<<(N-1)) - 1);
 
     // fast ln2
+#ifndef __CUDA_ARCH__
     const int leading_zeros = __builtin_clz(divider);
-    const int l = N - (leading_zeros+1);
-
-    multiplier = (static_cast<uint64_t>(1) << (N+l)) / divider - (static_cast<uint64_t>(1) << N) + 1;
-    shift1 = (std::min)(1, l);
-    shift2 = (std::max)(0, l-1);
+#else
+    const int leading_zeros = __clz(divider);
+#endif
+    const int log_div = N - (leading_zeros+1);
+
+    multiplier = (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1;
+    shift1 = log_div > 1 ? 1 : log_div;
+    shift2 = log_div > 1 ? log_div-1 : 0;
   }
 
   // Must have 0 <= numerator <= 2^32-1
-  T divide(const T numerator) const {
-    static const int N = 32;
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
+    const int N = 32;
     eigen_assert(numerator >= 0);
     eigen_assert(numerator <= (1ull<<N) - 1);
 
@@ -71,7 +75,7 @@ struct TensorIntDivisor {
 
 
 template <typename T>
-static T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
   return divisor.divide(numerator);
 }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index 04849dd9f..2c0d2cd0f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -42,26 +42,25 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
 
     static const int Options = Options_;
 
-    static const std::size_t NumIndices = PlainObjectType::NumIndices;
+    static const Index NumIndices = PlainObjectType::NumIndices;
     typedef typename PlainObjectType::Dimensions Dimensions;
 
-
     enum {
-      IsAligned = bool(EIGEN_ALIGN) && ((int(Options_)&Aligned)==Aligned),
-      PacketAccess = true,
+      IsAligned = ((int(Options_)&Aligned)==Aligned),
+      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
     };
 
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array<DenseIndex, NumIndices>({{firstDimension, otherDimensions...}})) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
-      EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array<DenseIndex, NumIndices>(firstDimension)) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
-      EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #endif
 
@@ -176,12 +175,13 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
     {
-      static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      const std::size_t NumDims = sizeof...(otherIndices) + 1;
       if (PlainObjectType::Options&RowMajor) {
-        const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+        const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
         return m_data[index];
       } else {
-        const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+        const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
         return m_data[index];
       }
     }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 7da89458f..8da6e0f26 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -144,7 +144,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
 
@@ -206,7 +206,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
     for (int i = 0; i < packetSize; ++i) {
       values[i] = coeff(index+i);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index f7e7fc107..7e0063626 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -97,7 +97,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
 
   enum {
-    IsAligned = true,
+    IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
   };
 
@@ -194,7 +194,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
 
   enum {
-    IsAligned = true,
+    IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
   };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 0c4f8a3d6..aaec39756 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -30,11 +30,11 @@ namespace Eigen {
   *
   * \sa Tensor
   */
-template<typename T, std::size_t NumIndices_, DenseIndex Size, int Options_, typename Dimensions = void> class TensorStorage;
+template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename Dimensions = void> class TensorStorage;
 
 
 // Pure fixed-size storage
-template<typename T, std::size_t NumIndices_, DenseIndex Size, int Options_, typename FixedDimensions>
+template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename FixedDimensions>
 class TensorStorage
 {
  private:
@@ -62,7 +62,7 @@ class TensorStorage
 
 
 // pure-dynamic, but without specification of all dimensions explicitly
-template<typename T, std::size_t NumIndices_, int Options_>
+template<typename T, DenseIndex NumIndices_, int Options_>
 class TensorStorage<T, NumIndices_, Dynamic, Options_, void>
   : public TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type>
 {
@@ -79,7 +79,7 @@ class TensorStorage<T, NumIndices_, Dynamic, Options_, void>
 };
 
 // pure dynamic
-template<typename T, std::size_t NumIndices_, int Options_>
+template<typename T, DenseIndex NumIndices_, int Options_>
 class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type>
 {
     T *m_data;
@@ -140,6 +140,7 @@ class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_nu
 };
 
 
+
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 7acdbfc72..ecfdb762c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -48,7 +48,7 @@ struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridin
 
 
 template<typename Strides, typename XprType>
-class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType>, WriteAccessors>
+class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
 {
   public:
   typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
@@ -97,7 +97,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
-    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -109,28 +109,23 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     }
 
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
-    for (int i = 0; i < NumDims; ++i) {
-      if (i > 0) {
-        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
-        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
-      } else {
-        m_inputStrides[0] = 1;
-        m_outputStrides[0] = 1;
-      }
-    }
-    for (int i = 0; i < NumDims; ++i) {
-        m_inputStrides[i] *= op.strides()[i];
+    m_outputStrides[0] = 1;
+    m_inputStrides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+      m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+      m_inputStrides[i-1] *= op.strides()[i-1];
     }
+    m_inputStrides[NumDims-1] *= op.strides()[NumDims-1];
   }
 
-  //  typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketReturnType PacketReturnType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -150,16 +145,44 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     return m_impl.coeff(inputIndex);
   }
 
-  /*  template<int LoadMode>
+  template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    return m_impl.template packet<LoadMode>(index);
-    }*/
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+    Index inputIndices[] = {0, 0};
+    Index indices[] = {index, index + packetSize - 1};
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx0 = indices[0] / m_outputStrides[i];
+      const Index idx1 = indices[1] / m_outputStrides[i];
+      inputIndices[0] += idx0 * m_inputStrides[i];
+      inputIndices[1] += idx1 * m_inputStrides[i];
+      indices[0] -= idx0 * m_outputStrides[i];
+      indices[1] -= idx1 * m_outputStrides[i];
+    }
+    inputIndices[0] += indices[0] * m_inputStrides[0];
+    inputIndices[1] += indices[1] * m_inputStrides[0];
+    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+      PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+      return rslt;
+    }
+    else {
+      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+      values[0] = m_impl.coeff(inputIndices[0]);
+      values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+      for (int i = 1; i < packetSize-1; ++i) {
+        values[i] = coeff(index+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
 
   Scalar* data() const { return NULL; }
 
  protected:
-    //  Strides m_strides;
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
   array<Index, NumDims> m_inputStrides;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 40f805741..5940a8cf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -70,14 +70,18 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_> >
 };
 
 
-template<typename PlainObjectType>
-struct traits<TensorMap<PlainObjectType> >
+template<typename PlainObjectType, int Options_>
+struct traits<TensorMap<PlainObjectType, Options_> >
   : public traits<PlainObjectType>
 {
   typedef traits<PlainObjectType> BaseTraits;
   typedef typename BaseTraits::Scalar Scalar;
   typedef typename BaseTraits::StorageKind StorageKind;
   typedef typename BaseTraits::Index Index;
+  enum {
+    Options = Options_,
+    Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+  };
 };
 
 
@@ -105,16 +109,16 @@ struct eval<const TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense>
   typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
 };
 
-template<typename PlainObjectType>
-struct eval<TensorMap<PlainObjectType>, Eigen::Dense>
+template<typename PlainObjectType, int Options>
+struct eval<TensorMap<PlainObjectType, Options>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
-template<typename PlainObjectType>
-struct eval<const TensorMap<PlainObjectType>, Eigen::Dense>
+template<typename PlainObjectType, int Options>
+struct eval<const TensorMap<PlainObjectType, Options>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
 template <typename Scalar_, std::size_t NumIndices_, int Options_>
@@ -141,16 +145,16 @@ struct nested<const TensorFixedSize<Scalar_, Dimensions, Options>, 1, typename e
   typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
 };
 
-template <typename PlainObjectType>
-struct nested<TensorMap<PlainObjectType>, 1, typename eval<TensorMap<PlainObjectType> >::type>
+template <typename PlainObjectType, int Options>
+struct nested<TensorMap<PlainObjectType, Options>, 1, typename eval<TensorMap<PlainObjectType, Options> >::type>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
-template <typename PlainObjectType>
-struct nested<const TensorMap<PlainObjectType>, 1, typename eval<TensorMap<PlainObjectType> >::type>
+template <typename PlainObjectType, int Options>
+struct nested<const TensorMap<PlainObjectType, Options>, 1, typename eval<TensorMap<PlainObjectType, Options> >::type>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
 }  // end namespace internal
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index d6c435947..a7ef2b402 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -110,6 +110,7 @@ if(EIGEN_TEST_CXX11)
 #  ei_add_test(cxx11_tensor_fixed_size "-std=c++0x")
   ei_add_test(cxx11_tensor_const "-std=c++0x")
   ei_add_test(cxx11_tensor_of_const_values "-std=c++0x")
+  ei_add_test(cxx11_tensor_of_complex "-std=c++0x")
   ei_add_test(cxx11_tensor_of_strings "-std=c++0x")
   ei_add_test(cxx11_tensor_intdiv "-std=c++0x")
   ei_add_test(cxx11_tensor_lvalue "-std=c++0x")
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
index f2b126413..0ac3f9bf9 100644
--- a/unsupported/test/cxx11_tensor_assign.cpp
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -253,6 +253,39 @@ static void test_auto_resize()
 }
 
 
+static void test_compound_assign()
+{
+  // Checks that +=, -=, *= and /= on a tensor give the same elements as the matching binary operator.
+  Tensor<int, 1> start_tensor(10);
+  Tensor<int, 1> offset_tensor(10);
+  start_tensor.setRandom();
+  offset_tensor.setRandom();
+  Tensor<int, 1> tensor = start_tensor;
+  tensor += offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i));
+  }
+  // Subtraction: tensor is reset to start_tensor first so each operator is checked on its own.
+  tensor = start_tensor;
+  tensor -= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i));
+  }
+  // Multiplication, again starting from a fresh copy of start_tensor.
+  tensor = start_tensor;
+  tensor *= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i));
+  }
+  // Division. NOTE(review): offset_tensor is random; a zero element would divide by zero -- confirm the generator's range.
+  tensor = start_tensor;
+  tensor /= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i));
+  }
+}
+
+
 void test_cxx11_tensor_assign()
 {
   CALL_SUBTEST(test_1d());
@@ -260,5 +293,5 @@ void test_cxx11_tensor_assign()
   CALL_SUBTEST(test_3d());
   CALL_SUBTEST(test_same_type());
   CALL_SUBTEST(test_auto_resize());
-
+  CALL_SUBTEST(test_compound_assign());
 }
diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp
index bafe73edd..4672db463 100644
--- a/unsupported/test/cxx11_tensor_convolution.cpp
+++ b/unsupported/test/cxx11_tensor_convolution.cpp
@@ -64,8 +64,78 @@ static void test_expr()
 }
 
 
+static void test_modes() {
+  // Emulates numpy.convolve's VALID, SAME and FULL output sizes by padding the input before convolving.
+  Tensor<float, 1> input(3);
+  Tensor<float, 1> kernel(3);
+  input(0) = 1.0f;
+  input(1) = 2.0f;
+  input(2) = 3.0f;
+  kernel(0) = 0.5f;
+  kernel(1) = 1.0f;
+  kernel(2) = 0.0f;
+  const Eigen::array<ptrdiff_t, 1> dims{{0}};  // convolve along dimension 0, the only one
+  Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding;  // presumably (before, after) amounts; reassigned per mode
+
+  // Emulate VALID mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(0, 0);
+  Tensor<float, 1> valid(1);
+  valid = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(valid.dimension(0), 1);
+  VERIFY_IS_APPROX(valid(0), 2.5f);  // 1*0.5 + 2*1 + 3*0: the expected value matches an unflipped kernel
+
+  // Emulate SAME mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(1, 1);
+  Tensor<float, 1> same(3);
+  same = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(same.dimension(0), 3);
+  VERIFY_IS_APPROX(same(0), 1.0f);  // 0*0.5 + 1*1 + 2*0
+  VERIFY_IS_APPROX(same(1), 2.5f);  // 1*0.5 + 2*1 + 3*0
+  VERIFY_IS_APPROX(same(2), 4.0f);  // 2*0.5 + 3*1 + 0*0
+
+  // Emulate FULL mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(2, 2);
+  Tensor<float, 1> full(5);
+  full = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(full.dimension(0), 5);
+  VERIFY_IS_APPROX(full(0), 0.0f);  // 0*0.5 + 0*1 + 1*0
+  VERIFY_IS_APPROX(full(1), 1.0f);  // 0*0.5 + 1*1 + 2*0
+  VERIFY_IS_APPROX(full(2), 2.5f);  // 1*0.5 + 2*1 + 3*0
+  VERIFY_IS_APPROX(full(3), 4.0f);  // 2*0.5 + 3*1 + 0*0
+  VERIFY_IS_APPROX(full(4), 1.5f);  // 3*0.5 + 0*1 + 0*0
+}
+
+
+static void test_strides() {
+  // Chains stride -> convolve -> stride in a single expression and checks the two resulting values.
+  Tensor<float, 1> input(13);
+  Tensor<float, 1> kernel(3);
+  input.setRandom();
+  kernel.setRandom();
+  const Eigen::array<ptrdiff_t, 1> dims{{0}};
+  const Eigen::array<ptrdiff_t, 1> stride_of_3{{3}};
+  const Eigen::array<ptrdiff_t, 1> stride_of_2{{2}};
+  // result is declared without dimensions; the check below expects the assignment to size it to 2.
+  Tensor<float, 1> result;
+  result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2);
+  // Per the expected values: the first stride keeps inputs 0,3,6,9,12; the second keeps the windows starting at 0 and 6.
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
+                               input(6)*kernel(2)));
+  VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
+                               input(12)*kernel(2)));
+}
+
+
+
+
 void test_cxx11_tensor_convolution()
 {
   CALL_SUBTEST(test_evals());
   CALL_SUBTEST(test_expr());
+  CALL_SUBTEST(test_modes());
+  CALL_SUBTEST(test_strides());
 }
diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp
index f331cb481..26465ee11 100644
--- a/unsupported/test/cxx11_tensor_device.cpp
+++ b/unsupported/test/cxx11_tensor_device.cpp
@@ -123,6 +123,14 @@ static void test_forced_contextual_eval(Context* context)
   context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
 }
 
+template <typename Context>
+static void test_compound_assignment(Context* context)  // leaves out == 2.718 + in1 + in2 * 3.14, computed on context->device()
+{
+  context->out().device(context->device()) = context->in1().constant(2.718f);  // seed out with a constant first
+  context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;  // then accumulate with +=
+}
+
+
 template <typename Context>
 static void test_contraction(Context* context)
 {
@@ -197,6 +205,15 @@ static void test_cpu() {
     }
   }
 
+  test_compound_assignment(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
+      }
+    }
+  }
+
   test_contraction(&context);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 40; ++j) {
@@ -299,6 +316,16 @@ static void test_gpu() {
     }
   }
 
+  test_compound_assignment(&context);
+  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
+      }
+    }
+  }
+
   test_contraction(&context);
   assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
   for (int i = 0; i < 40; ++i) {
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
index 2a6a97856..fd1b1fa32 100644
--- a/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -12,6 +12,7 @@
 #include <Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
+using Eigen::IndexPair;
 
 static void test_simple_reshape()
 {
@@ -52,7 +53,7 @@ static void test_reshape_in_expr() {
   TensorMap<Tensor<float, 5>> tensor2(m2.data(), 3,5,7,11,13);
   Tensor<float, 2>::Dimensions newDims1{{2,3*5*7*11}};
   Tensor<float, 2>::Dimensions newDims2{{3*5*7*11,13}};
-  array<Tensor<float, 1>::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}};
+  Eigen::array<IndexPair<DenseIndex>, 1> contract_along{{IndexPair<DenseIndex>(1, 0)}};
   Tensor<float, 2> tensor3(2,13);
   tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along);
 
@@ -125,7 +126,7 @@ static void test_slice_in_expr() {
   TensorMap<Tensor<float, 2>> tensor1(m1.data(), 7, 7);
   TensorMap<Tensor<float, 2>> tensor2(m2.data(), 3, 3);
   Tensor<float, 2> tensor3(3,1);
-  array<Tensor<float, 1>::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}};
+  array<IndexPair<DenseIndex>, 1> contract_along{{IndexPair<DenseIndex>(1, 0)}};
 
   Eigen::DSizes<ptrdiff_t, 2> indices1(1,2);
   Eigen::DSizes<ptrdiff_t, 2> sizes1(3,3);
diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp
new file mode 100644
index 000000000..b5044b962
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_complex.cpp
@@ -0,0 +1,64 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+
+static void test_additions()
+{
+  // Adds two rank-1 complex tensors element-wise: (i, -i) + (i, 7i) must equal (2i, 6i).
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<float>, 1> data2(3);
+  for (int i = 0; i < 3; ++i) {
+    data1(i) = std::complex<float>(i, -i);
+    data2(i) = std::complex<float>(i, 7 * i);
+  }
+  Tensor<std::complex<float>, 1> sum = data1 + data2;  // the expression is evaluated into a new tensor
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_EQUAL(sum(i),  std::complex<float>(2*i, 6*i));
+  }
+}
+
+
+static void test_contractions()
+{
+  // Contracts two random complex tensors and compares the result with an Eigen matrix product.
+  Tensor<std::complex<float>, 4> t_left(30, 50, 8, 31);
+  Tensor<std::complex<float>, 5> t_right(8, 31, 7, 20, 10);
+  Tensor<std::complex<float>, 5> t_result(30, 50, 7, 20, 10);
+  t_left.setRandom();
+  t_right.setRandom();
+  // Matrix views over the same buffers: 30*50 = 1500 rows, 8*31 = 248 contracted entries, 7*20*10 = 1400 columns.
+  typedef Map<Matrix<std::complex<float>, Dynamic, Dynamic>> MapXcf;
+  MapXcf m_left(t_left.data(), 1500, 248);
+  MapXcf m_right(t_right.data(), 248, 1400);
+  Matrix<std::complex<float>, Dynamic, Dynamic> m_result(1500, 1400);
+
+  // This contraction should be equivalent to a regular matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});  // pairs left dims 2,3 with right dims 0,1
+  t_result = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+  for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {  // compare the two raw buffers entry by entry
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+
+void test_cxx11_tensor_of_complex()  // runs the complex-tensor subtests defined in this file
+{
+  CALL_SUBTEST(test_additions());
+  CALL_SUBTEST(test_contractions());
+}
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index e02d8e4be..f0de61f8b 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -9,22 +9,23 @@
 
 #define EIGEN_USE_THREADS
 
-
+#include <iostream>
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
 
+
 using Eigen::Tensor;
 
-void test_cxx11_tensor_thread_pool()
+static void test_multithread_elementwise()
 {
-  Eigen::Tensor<float, 3> in1(2,3,7);
-  Eigen::Tensor<float, 3> in2(2,3,7);
-  Eigen::Tensor<float, 3> out(2,3,7);
+  Tensor<float, 3> in1(2,3,7);
+  Tensor<float, 3> in2(2,3,7);
+  Tensor<float, 3> out(2,3,7);
 
   in1.setRandom();
   in2.setRandom();
 
-  Eigen::ThreadPoolDevice thread_pool_device(3);
+  Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(3, 11));
   out.device(thread_pool_device) = in1 + in2 * 3.14f;
 
   for (int i = 0; i < 2; ++i) {
@@ -35,3 +36,222 @@ void test_cxx11_tensor_thread_pool()
     }
   }
 }
+
+
+static void test_multithread_compound_assignment()
+{
+  // Checks that = followed by += through a ThreadPoolDevice yields in1 + in2 * 3.14f in every element.
+  Tensor<float, 3> in1(2,3,7);
+  Tensor<float, 3> in2(2,3,7);
+  Tensor<float, 3> out(2,3,7);
+  in1.setRandom();
+  in2.setRandom();
+
+  Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(3, 11));  // randomly chosen pool size
+  out.device(thread_pool_device) = in1;
+  out.device(thread_pool_device) += in2 * 3.14f;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
+      }
+    }
+  }
+}
+
+
+static void test_multithread_contraction()
+{
+  // Contracts two random tensors on a 4-thread pool and compares the result with an Eigen matrix product.
+  Tensor<float, 4> t_left(30, 50, 37, 31);
+  Tensor<float, 5> t_right(37, 31, 70, 2, 10);
+  Tensor<float, 5> t_result(30, 50, 70, 2, 10);
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // this contraction should be equivalent to a single matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+  // Matrix views over the same buffers: 30*50 = 1500 rows, 37*31 = 1147 contracted entries, 70*2*10 = 1400 columns.
+  typedef Map<MatrixXf> MapXf;
+  MapXf m_left(t_left.data(), 1500, 1147);
+  MapXf m_right(t_right.data(), 1147, 1400);
+  MatrixXf m_result(1500, 1400);
+
+  Eigen::ThreadPoolDevice thread_pool_device(4);
+
+  // compute results by separate methods
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      VERIFY(false);  // use the VERIFY macro as above: assert() is a no-op when NDEBUG is defined
+    }
+  }
+}
+
+
+static void test_contraction_corner_cases()
+{
+  // Contracts dimension 0 of both operands (m_left^T * m_right) on a 12-thread pool for several operand shapes.
+  Tensor<float, 2> t_left(32, 500);
+  Tensor<float, 2> t_right(32, 28*28);
+  Tensor<float, 2> t_result(500, 28*28);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result = t_result.constant(NAN);  // pre-fill with NAN so the isnan checks below catch entries never written
+
+  // this contraction should be equivalent to a single matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}};
+
+  typedef Map<MatrixXf> MapXf;
+  MapXf m_left(t_left.data(), 32, 500);
+  MapXf m_right(t_right.data(), 32, 28*28);
+  MatrixXf m_result(500, 28*28);
+
+  Eigen::ThreadPoolDevice thread_pool_device(12);
+
+  // compute results by separate methods
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  m_result = m_left.transpose() * m_right;
+
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(!isnan(t_result.data()[i]));  // VERIFY rather than assert(): assert() is a no-op when NDEBUG is defined
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      VERIFY(false);
+    }
+  }
+  // Same contraction with a left operand of a single column: the result is 1 x 784.
+  t_left.resize(32, 1);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_result.resize(1, 28*28);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 1);  // rebuild the Map in place over t_left's current buffer and shape
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(!isnan(t_result.data()[i]));
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      VERIFY(false);
+    }
+  }
+  // 500-column left operand against a right operand of only 4 columns: the result is 500 x 4.
+  t_left.resize(32, 500);
+  t_right.resize(32, 4);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result.resize(500, 4);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 500);
+  new(&m_right) MapXf(t_right.data(), 32, 4);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(!isnan(t_result.data()[i]));
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      VERIFY(false);
+    }
+  }
+  // Both operands narrow (1 and 4 columns): the result is 1 x 4.
+  t_left.resize(32, 1);
+  t_right.resize(32, 4);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result.resize(1, 4);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 1);
+  new(&m_right) MapXf(t_right.data(), 32, 4);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(!isnan(t_result.data()[i]));
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      VERIFY(false);
+    }
+  }
+}
+
+
+static void test_multithread_contraction_agrees_with_singlethread() {
+  // Evaluates one randomly sized contraction without a device and on a thread pool, then compares the two results.
+  int contract_size = internal::random<int>(1, 5000);
+  Tensor<float, 3> left(internal::random<int>(1, 80),
+                        contract_size,
+                        internal::random<int>(1, 100));
+
+  Tensor<float, 4> right(internal::random<int>(1, 25),
+                         internal::random<int>(1, 37),
+                         contract_size,
+                         internal::random<int>(1, 51));
+
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+  // Contract dimension 1 of left with dimension 2 of right; both were sized contract_size above.
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
+
+  Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(2, 11));
+
+  Tensor<float, 5> st_result;
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 5> tp_result(st_result.dimensions());
+  tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+  VERIFY(internal::dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
+    // Entries whose absolute difference is below 1e-4 are accepted as they are; only
+    // larger differences are additionally required to pass the VERIFY_IS_APPROX check.
+    if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) {
+      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
+    }
+  }
+}
+
+// Copies random tensors into a std::vector with ThreadPoolDevice::memcpy and checks that every element matches.
+static void test_memcpy() {
+  // Five rounds, each drawing a fresh thread count and buffer size.
+  for (int round = 0; round < 5; ++round) {
+    const int num_threads = internal::random<int>(3, 11);
+    Eigen::ThreadPoolDevice thread_pool_device(num_threads);
+
+    const int size = internal::random<int>(13, 7632);
+    Tensor<float, 1> t1(size);
+    t1.setRandom();
+    std::vector<float> result(size);
+    thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float));
+    for (int i = 0; i < size; i++) {  // element index; no longer shadows the round counter
+      VERIFY_IS_EQUAL(t1(i), result[i]);
+    }
+  }
+}
+
+// Test entry point: runs each thread-pool subtest listed below through CALL_SUBTEST.
+void test_cxx11_tensor_thread_pool()
+{
+  CALL_SUBTEST(test_multithread_elementwise());
+  CALL_SUBTEST(test_multithread_compound_assignment());
+
+  CALL_SUBTEST(test_multithread_contraction());
+
+  CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread());
+
+  // Exercise various cases that have been problematic in the past.
+  CALL_SUBTEST(test_contraction_corner_cases());
+
+  CALL_SUBTEST(test_memcpy());
+}
-- 
cgit v1.2.3


From b5124e7cfda27ed99dcfcec8cb1b674efa1ef4a3 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 14 Jan 2015 15:46:04 -0800
Subject: Created many additional tests

---
 unsupported/test/CMakeLists.txt                 |  13 +-
 unsupported/test/cxx11_tensor_assign.cpp        |  73 ++++
 unsupported/test/cxx11_tensor_broadcasting.cpp  |  86 ++++-
 unsupported/test/cxx11_tensor_chipping.cpp      | 183 ++++++---
 unsupported/test/cxx11_tensor_concatenation.cpp |  34 +-
 unsupported/test/cxx11_tensor_contract_cuda.cpp | 121 ++++++
 unsupported/test/cxx11_tensor_contraction.cpp   | 221 +++++++----
 unsupported/test/cxx11_tensor_cuda.cpp          | 474 ++++++++++++++++++++++++
 unsupported/test/cxx11_tensor_device.cpp        | 118 +++---
 unsupported/test/cxx11_tensor_dimension.cpp     |   9 +-
 unsupported/test/cxx11_tensor_expr.cpp          |  40 ++
 unsupported/test/cxx11_tensor_forced_eval.cpp   |  27 ++
 unsupported/test/cxx11_tensor_image_patch.cpp   | 206 +++++++++-
 unsupported/test/cxx11_tensor_map.cpp           |   7 +-
 unsupported/test/cxx11_tensor_morphing.cpp      | 143 +++++--
 unsupported/test/cxx11_tensor_of_strings.cpp    |  54 +--
 unsupported/test/cxx11_tensor_padding.cpp       |  23 +-
 unsupported/test/cxx11_tensor_patch.cpp         |  17 +
 unsupported/test/cxx11_tensor_reduction.cpp     | 287 ++++++++++++--
 unsupported/test/cxx11_tensor_shuffling.cpp     |  28 +-
 unsupported/test/cxx11_tensor_simple.cpp        |   3 +
 unsupported/test/cxx11_tensor_striding.cpp      |  38 +-
 unsupported/test/cxx11_tensor_thread_pool.cpp   |  70 ++--
 23 files changed, 1908 insertions(+), 367 deletions(-)
 create mode 100644 unsupported/test/cxx11_tensor_contract_cuda.cpp
 create mode 100644 unsupported/test/cxx11_tensor_cuda.cpp

(limited to 'unsupported/test/cxx11_tensor_thread_pool.cpp')

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 89c651804..9f44e47f9 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -99,7 +99,7 @@ if(EIGEN_TEST_CXX11)
   # older compiler that don't support cxx11.
   ei_add_test(cxx11_meta "-std=c++0x")
   ei_add_test(cxx11_tensor_simple "-std=c++0x")
-  ei_add_test(cxx11_tensor_symmetry "-std=c++0x")
+#  ei_add_test(cxx11_tensor_symmetry "-std=c++0x")
   ei_add_test(cxx11_tensor_assign "-std=c++0x")
   ei_add_test(cxx11_tensor_dimension "-std=c++0x")
   ei_add_test(cxx11_tensor_index_list "-std=c++0x")
@@ -126,8 +126,17 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_reduction "-std=c++0x")
   ei_add_test(cxx11_tensor_shuffling "-std=c++0x")
   ei_add_test(cxx11_tensor_striding "-std=c++0x")
-#  ei_add_test(cxx11_tensor_device  "-std=c++0x")
   ei_add_test(cxx11_tensor_thread_pool "-std=c++0x")
   ei_add_test(cxx11_tensor_ref "-std=c++0x")
+  ei_add_test(cxx11_tensor_random "-std=c++0x")
+  ei_add_test(cxx11_tensor_casts "-std=c++0x")
+  ei_add_test(cxx11_tensor_reverse "-std=c++0x")
+  ei_add_test(cxx11_tensor_layout_swap "-std=c++0x")
   ei_add_test(cxx11_tensor_io "-std=c++0x")
+
+  # These tests need nvcc
+#  ei_add_test(cxx11_tensor_device "-std=c++0x")
+#  ei_add_test(cxx11_tensor_cuda "-std=c++0x")
+#  ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x")
+
 endif()
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
index 0ac3f9bf9..d16aaf847 100644
--- a/unsupported/test/cxx11_tensor_assign.cpp
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -285,6 +285,78 @@ static void test_compound_assign()
   }
 }
 
+static void test_std_initializers_tensor() {  // setValues() with (nested) brace lists on rank 1, 2 and 3 tensors
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES  // without this macro the test body is empty
+  Tensor<int, 1> a(3);
+  a.setValues({0, 1, 2});
+  VERIFY_IS_EQUAL(a(0), 0);
+  VERIFY_IS_EQUAL(a(1), 1);
+  VERIFY_IS_EQUAL(a(2), 2);
+
+  // A shorter list only overwrites the leading entries; a(2) keeps its old value.
+  a.setValues({10, 20});
+  VERIFY_IS_EQUAL(a(0), 10);
+  VERIFY_IS_EQUAL(a(1), 20);
+  VERIFY_IS_EQUAL(a(2), 2);
+
+  // Chaining: the result of setValues() can be assigned straight to another tensor.
+  Tensor<int, 1> a2(3);
+  a2 = a.setValues({100, 200, 300});
+  VERIFY_IS_EQUAL(a(0), 100);
+  VERIFY_IS_EQUAL(a(1), 200);
+  VERIFY_IS_EQUAL(a(2), 300);
+  VERIFY_IS_EQUAL(a2(0), 100);
+  VERIFY_IS_EQUAL(a2(1), 200);
+  VERIFY_IS_EQUAL(a2(2), 300);
+
+  Tensor<int, 2> b(2, 3);
+  b.setValues({{0, 1, 2}, {3, 4, 5}});
+  VERIFY_IS_EQUAL(b(0, 0), 0);
+  VERIFY_IS_EQUAL(b(0, 1), 1);
+  VERIFY_IS_EQUAL(b(0, 2), 2);
+  VERIFY_IS_EQUAL(b(1, 0), 3);
+  VERIFY_IS_EQUAL(b(1, 1), 4);
+  VERIFY_IS_EQUAL(b(1, 2), 5);
+
+  // Partial nested lists: entries not named here (b(0,2), b(1,1), b(1,2)) keep their old values.
+  b.setValues({{10, 20}, {30}});
+  VERIFY_IS_EQUAL(b(0, 0), 10);
+  VERIFY_IS_EQUAL(b(0, 1), 20);
+  VERIFY_IS_EQUAL(b(0, 2), 2);
+  VERIFY_IS_EQUAL(b(1, 0), 30);
+  VERIFY_IS_EQUAL(b(1, 1), 4);
+  VERIFY_IS_EQUAL(b(1, 2), 5);
+
+  Eigen::Tensor<int, 3> c(3, 2, 4);
+  c.setValues({{{0, 1, 2, 3}, {4, 5, 6, 7}},
+               {{10, 11, 12, 13}, {14, 15, 16, 17}},
+               {{20, 21, 22, 23}, {24, 25, 26, 27}}});
+  VERIFY_IS_EQUAL(c(0, 0, 0), 0);
+  VERIFY_IS_EQUAL(c(0, 0, 1), 1);
+  VERIFY_IS_EQUAL(c(0, 0, 2), 2);
+  VERIFY_IS_EQUAL(c(0, 0, 3), 3);
+  VERIFY_IS_EQUAL(c(0, 1, 0), 4);
+  VERIFY_IS_EQUAL(c(0, 1, 1), 5);
+  VERIFY_IS_EQUAL(c(0, 1, 2), 6);
+  VERIFY_IS_EQUAL(c(0, 1, 3), 7);
+  VERIFY_IS_EQUAL(c(1, 0, 0), 10);
+  VERIFY_IS_EQUAL(c(1, 0, 1), 11);
+  VERIFY_IS_EQUAL(c(1, 0, 2), 12);
+  VERIFY_IS_EQUAL(c(1, 0, 3), 13);
+  VERIFY_IS_EQUAL(c(1, 1, 0), 14);
+  VERIFY_IS_EQUAL(c(1, 1, 1), 15);
+  VERIFY_IS_EQUAL(c(1, 1, 2), 16);
+  VERIFY_IS_EQUAL(c(1, 1, 3), 17);
+  VERIFY_IS_EQUAL(c(2, 0, 0), 20);
+  VERIFY_IS_EQUAL(c(2, 0, 1), 21);
+  VERIFY_IS_EQUAL(c(2, 0, 2), 22);
+  VERIFY_IS_EQUAL(c(2, 0, 3), 23);
+  VERIFY_IS_EQUAL(c(2, 1, 0), 24);
+  VERIFY_IS_EQUAL(c(2, 1, 1), 25);
+  VERIFY_IS_EQUAL(c(2, 1, 2), 26);
+  VERIFY_IS_EQUAL(c(2, 1, 3), 27);
+#endif  // EIGEN_HAS_VARIADIC_TEMPLATES
+}
 
 void test_cxx11_tensor_assign()
 {
@@ -294,4 +366,5 @@ void test_cxx11_tensor_assign()
   CALL_SUBTEST(test_same_type());
   CALL_SUBTEST(test_auto_resize());
   CALL_SUBTEST(test_compound_assign());
+  CALL_SUBTEST(test_std_initializers_tensor());
 }
diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
index 9663912a4..f0792bdcf 100644
--- a/unsupported/test/cxx11_tensor_broadcasting.cpp
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -13,9 +13,10 @@
 
 using Eigen::Tensor;
 
+template <int DataLayout>
 static void test_simple_broadcasting()
 {
-  Tensor<float, 4> tensor(2,3,5,7);
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   tensor.setRandom();
   array<ptrdiff_t, 4> broadcasts;
   broadcasts[0] = 1;
@@ -23,7 +24,7 @@ static void test_simple_broadcasting()
   broadcasts[2] = 1;
   broadcasts[3] = 1;
 
-  Tensor<float, 4> no_broadcast;
+  Tensor<float, 4, DataLayout> no_broadcast;
   no_broadcast = tensor.broadcast(broadcasts);
 
   VERIFY_IS_EQUAL(no_broadcast.dimension(0), 2);
@@ -45,7 +46,7 @@ static void test_simple_broadcasting()
   broadcasts[1] = 3;
   broadcasts[2] = 1;
   broadcasts[3] = 4;
-  Tensor<float, 4> broadcast;
+  Tensor<float, 4, DataLayout> broadcast;
   broadcast = tensor.broadcast(broadcasts);
 
   VERIFY_IS_EQUAL(broadcast.dimension(0), 4);
@@ -65,16 +66,17 @@ static void test_simple_broadcasting()
 }
 
 
+template <int DataLayout>
 static void test_vectorized_broadcasting()
 {
-  Tensor<float, 3> tensor(8,3,5);
+  Tensor<float, 3, DataLayout> tensor(8,3,5);
   tensor.setRandom();
   array<ptrdiff_t, 3> broadcasts;
   broadcasts[0] = 2;
   broadcasts[1] = 3;
   broadcasts[2] = 4;
 
-  Tensor<float, 3> broadcast;
+  Tensor<float, 3, DataLayout> broadcast;
   broadcast = tensor.broadcast(broadcasts);
 
   VERIFY_IS_EQUAL(broadcast.dimension(0), 16);
@@ -107,8 +109,78 @@ static void test_vectorized_broadcasting()
 }
 
 
+template <int DataLayout>
+static void test_static_broadcasting()
+{
+  Tensor<float, 3, DataLayout> tensor(8,3,5);
+  tensor.setRandom();
+  Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts;
+  // The factors (2,3,4) are template arguments of the IndexList; each output element is checked against its source.
+  Tensor<float, 3, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 16);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 16; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+  // Repeat after resizing the input to 11x3x5; the same factors now give 22x9x20.
+  tensor.resize(11,3,5);
+  tensor.setRandom();
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 22);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 22; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_fixed_size_broadcasting()
+{
+  // Placeholder: the checks below stay disabled until the Size class gains a [] operator (presumably Sizes<>; TODO confirm).
+#if 0
+  Tensor<float, 1, DataLayout> t1(10);
+  t1.setRandom();
+  TensorFixedSize<float, Sizes<1>, DataLayout> t2;
+  t2 = t2.constant(20.0f);
+
+  Tensor<float, 1, DataLayout> t3 = t1 + t2.broadcast(Eigen::array<int, 1>{{10}});
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_APPROX(t3(i), t1(i) + t2(0));
+  }
+
+  TensorMap<TensorFixedSize<float, Sizes<1>, DataLayout> > t4(t2.data(), {{1}});
+  Tensor<float, 1, DataLayout> t5 = t1 + t4.broadcast(Eigen::array<int, 1>{{10}});
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_APPROX(t5(i), t1(i) + t2(0));
+  }
+#endif  // disabled fixed-size broadcasting checks; the function body is currently empty
+}
+
+
 void test_cxx11_tensor_broadcasting()
 {
-   CALL_SUBTEST(test_simple_broadcasting());
-   CALL_SUBTEST(test_vectorized_broadcasting());
+  CALL_SUBTEST(test_simple_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_vectorized_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_vectorized_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_static_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_static_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp
index 0027b2888..0de7bbac6 100644
--- a/unsupported/test/cxx11_tensor_chipping.cpp
+++ b/unsupported/test/cxx11_tensor_chipping.cpp
@@ -13,18 +13,20 @@
 
 using Eigen::Tensor;
 
-
+template<int DataLayout>
 static void test_simple_chip()
 {
-  Tensor<float, 5> tensor(2,3,5,7,11);
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
   tensor.setRandom();
 
-  Tensor<float, 4> chip1;
-  chip1 = tensor.chip<0>(1);
+  Tensor<float, 4, DataLayout> chip1;
+  chip1 = tensor.template chip<0>(1);
+
   VERIFY_IS_EQUAL(chip1.dimension(0), 3);
   VERIFY_IS_EQUAL(chip1.dimension(1), 5);
   VERIFY_IS_EQUAL(chip1.dimension(2), 7);
   VERIFY_IS_EQUAL(chip1.dimension(3), 11);
+
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 5; ++j) {
       for (int k = 0; k < 7; ++k) {
@@ -35,7 +37,7 @@ static void test_simple_chip()
     }
   }
 
-  Tensor<float, 4> chip2 = tensor.chip<1>(1);
+  Tensor<float, 4, DataLayout> chip2 = tensor.template chip<1>(1);
   VERIFY_IS_EQUAL(chip2.dimension(0), 2);
   VERIFY_IS_EQUAL(chip2.dimension(1), 5);
   VERIFY_IS_EQUAL(chip2.dimension(2), 7);
@@ -50,7 +52,7 @@ static void test_simple_chip()
     }
   }
 
-  Tensor<float, 4> chip3 = tensor.chip<2>(2);
+  Tensor<float, 4, DataLayout> chip3 = tensor.template chip<2>(2);
   VERIFY_IS_EQUAL(chip3.dimension(0), 2);
   VERIFY_IS_EQUAL(chip3.dimension(1), 3);
   VERIFY_IS_EQUAL(chip3.dimension(2), 7);
@@ -65,7 +67,7 @@ static void test_simple_chip()
     }
   }
 
-  Tensor<float, 4> chip4(tensor.chip<3>(5));
+  Tensor<float, 4, DataLayout> chip4(tensor.template chip<3>(5));
   VERIFY_IS_EQUAL(chip4.dimension(0), 2);
   VERIFY_IS_EQUAL(chip4.dimension(1), 3);
   VERIFY_IS_EQUAL(chip4.dimension(2), 5);
@@ -80,7 +82,7 @@ static void test_simple_chip()
     }
   }
 
-  Tensor<float, 4> chip5(tensor.chip<4>(7));
+  Tensor<float, 4, DataLayout> chip5(tensor.template chip<4>(7));
   VERIFY_IS_EQUAL(chip5.dimension(0), 2);
   VERIFY_IS_EQUAL(chip5.dimension(1), 3);
   VERIFY_IS_EQUAL(chip5.dimension(2), 5);
@@ -96,14 +98,97 @@ static void test_simple_chip()
   }
 }
 
+template<int DataLayout>
+static void test_dynamic_chip()
+{
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  tensor.setRandom();
+  // chip(offset, dim) takes the chipped dimension as a function argument; every chip is compared to its source slice.
+  Tensor<float, 4, DataLayout> chip1;
+  chip1 = tensor.chip(1, 0);
+  VERIFY_IS_EQUAL(chip1.dimension(0), 3);
+  VERIFY_IS_EQUAL(chip1.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip1.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip1.dimension(3), 11);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip2 = tensor.chip(1, 1);
+  VERIFY_IS_EQUAL(chip2.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip2.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip2.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip2.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {  // chip2.dimension(1) is 5; the old bound of 3 left part of the chip unchecked
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip3 = tensor.chip(2, 2);
+  VERIFY_IS_EQUAL(chip3.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip3.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip3.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip3.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip4(tensor.chip(5, 3));
+  VERIFY_IS_EQUAL(chip4.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip4.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip4.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip4.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 11; ++l) {  // chip4.dimension(3) is 11; the old bound of 7 left part of the chip unchecked
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip5(tensor.chip(7, 4));
+  VERIFY_IS_EQUAL(chip5.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip5.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip5.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip5.dimension(3), 7);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7));
+        }
+      }
+    }
+  }
+}
 
+template<int DataLayout>
 static void test_chip_in_expr() {
-  Tensor<float, 5> input1(2,3,5,7,11);
+  Tensor<float, 5, DataLayout> input1(2,3,5,7,11);
   input1.setRandom();
-  Tensor<float, 4> input2(3,5,7,11);
+  Tensor<float, 4, DataLayout> input2(3,5,7,11);
   input2.setRandom();
 
-  Tensor<float, 4> result = input1.chip<0>(0) + input2;
+  Tensor<float, 4, DataLayout> result = input1.template chip<0>(0) + input2;
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 5; ++j) {
       for (int k = 0; k < 7; ++k) {
@@ -115,9 +200,9 @@ static void test_chip_in_expr() {
     }
   }
 
-  Tensor<float, 3> input3(3,7,11);
+  Tensor<float, 3, DataLayout> input3(3,7,11);
   input3.setRandom();
-  Tensor<float, 3> result2 = input1.chip<0>(0).chip<1>(2) + input3;
+  Tensor<float, 3, DataLayout> result2 = input1.template chip<0>(0).template chip<1>(2) + input3;
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 7; ++j) {
       for (int k = 0; k < 11; ++k) {
@@ -128,16 +213,16 @@ static void test_chip_in_expr() {
   }
 }
 
-
+template<int DataLayout>
 static void test_chip_as_lvalue()
 {
-  Tensor<float, 5> input1(2,3,5,7,11);
+  Tensor<float, 5, DataLayout> input1(2,3,5,7,11);
   input1.setRandom();
 
-  Tensor<float, 4> input2(3,5,7,11);
+  Tensor<float, 4, DataLayout> input2(3,5,7,11);
   input2.setRandom();
-  Tensor<float, 5> tensor = input1;
-  tensor.chip<0>(1) = input2;
+  Tensor<float, 5, DataLayout> tensor = input1;
+  tensor.template chip<0>(1) = input2;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
@@ -154,10 +239,10 @@ static void test_chip_as_lvalue()
     }
   }
 
-  Tensor<float, 4> input3(2,5,7,11);
+  Tensor<float, 4, DataLayout> input3(2,5,7,11);
   input3.setRandom();
   tensor = input1;
-  tensor.chip<1>(1) = input3;
+  tensor.template chip<1>(1) = input3;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
@@ -174,10 +259,10 @@ static void test_chip_as_lvalue()
     }
   }
 
-  Tensor<float, 4> input4(2,3,7,11);
+  Tensor<float, 4, DataLayout> input4(2,3,7,11);
   input4.setRandom();
   tensor = input1;
-  tensor.chip<2>(3) = input4;
+  tensor.template chip<2>(3) = input4;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
@@ -194,10 +279,10 @@ static void test_chip_as_lvalue()
     }
   }
 
-  Tensor<float, 4> input5(2,3,5,11);
+  Tensor<float, 4, DataLayout> input5(2,3,5,11);
   input5.setRandom();
   tensor = input1;
-  tensor.chip<3>(4) = input5;
+  tensor.template chip<3>(4) = input5;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
@@ -214,10 +299,10 @@ static void test_chip_as_lvalue()
     }
   }
 
-  Tensor<float, 4> input6(2,3,5,7);
+  Tensor<float, 4, DataLayout> input6(2,3,5,7);
   input6.setRandom();
   tensor = input1;
-  tensor.chip<4>(5) = input6;
+  tensor.template chip<4>(5) = input6;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
@@ -235,47 +320,57 @@ static void test_chip_as_lvalue()
   }
 }
 
-
+template<int DataLayout>
 static void test_chip_raw_data()
 {
-  Tensor<float, 5> tensor(2,3,5,7,11);
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
   tensor.setRandom();
 
-  typedef TensorEvaluator<decltype(tensor.chip<4>(3)), DefaultDevice> Evaluator4;
-  auto chip = Evaluator4(tensor.chip<4>(3), DefaultDevice());
+  typedef TensorEvaluator<decltype(tensor.template chip<4>(3)), DefaultDevice> Evaluator4;
+  auto chip = Evaluator4(tensor.template chip<4>(3), DefaultDevice());
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
         for (int l = 0; l < 7; ++l) {
-          int chip_index = i + 2 * (j + 3 * (k + 5 * l));
+          int chip_index;
+          if (DataLayout == ColMajor) {
+            chip_index = i + 2 * (j + 3 * (k + 5 * l));
+          } else {
+            chip_index = 11 * (l + 7 * (k + 5 * (j + 3 * i)));
+          }
           VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3));
         }
       }
     }
   }
 
-  typedef TensorEvaluator<decltype(tensor.chip<0>(0)), DefaultDevice> Evaluator0;
-  auto chip0 = Evaluator0(tensor.chip<0>(0), DefaultDevice());
+  typedef TensorEvaluator<decltype(tensor.template chip<0>(0)), DefaultDevice> Evaluator0;
+  auto chip0 = Evaluator0(tensor.template chip<0>(0), DefaultDevice());
   VERIFY_IS_EQUAL(chip0.data(), static_cast<float*>(0));
 
-  typedef TensorEvaluator<decltype(tensor.chip<1>(0)), DefaultDevice> Evaluator1;
-  auto chip1 = Evaluator1(tensor.chip<1>(0), DefaultDevice());
+  typedef TensorEvaluator<decltype(tensor.template chip<1>(0)), DefaultDevice> Evaluator1;
+  auto chip1 = Evaluator1(tensor.template chip<1>(0), DefaultDevice());
   VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0));
 
-  typedef TensorEvaluator<decltype(tensor.chip<2>(0)), DefaultDevice> Evaluator2;
-  auto chip2 = Evaluator2(tensor.chip<2>(0), DefaultDevice());
+  typedef TensorEvaluator<decltype(tensor.template chip<2>(0)), DefaultDevice> Evaluator2;
+  auto chip2 = Evaluator2(tensor.template chip<2>(0), DefaultDevice());
   VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0));
 
-  typedef TensorEvaluator<decltype(tensor.chip<3>(0)), DefaultDevice> Evaluator3;
-  auto chip3 = Evaluator3(tensor.chip<3>(0), DefaultDevice());
+  typedef TensorEvaluator<decltype(tensor.template chip<3>(0)), DefaultDevice> Evaluator3;
+  auto chip3 = Evaluator3(tensor.template chip<3>(0), DefaultDevice());
   VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0));
 }
 
-
 void test_cxx11_tensor_chipping()
 {
-  CALL_SUBTEST(test_simple_chip());
-  CALL_SUBTEST(test_chip_in_expr());
-  CALL_SUBTEST(test_chip_as_lvalue());
-  CALL_SUBTEST(test_chip_raw_data());
+  CALL_SUBTEST(test_simple_chip<ColMajor>());
+  CALL_SUBTEST(test_simple_chip<RowMajor>());
+  CALL_SUBTEST(test_dynamic_chip<ColMajor>());
+  CALL_SUBTEST(test_dynamic_chip<RowMajor>());
+  CALL_SUBTEST(test_chip_in_expr<ColMajor>());
+  CALL_SUBTEST(test_chip_in_expr<RowMajor>());
+  CALL_SUBTEST(test_chip_as_lvalue<ColMajor>());
+  CALL_SUBTEST(test_chip_as_lvalue<RowMajor>());
+  CALL_SUBTEST(test_chip_raw_data<ColMajor>());
+  CALL_SUBTEST(test_chip_raw_data<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
index 8fd4f5f80..9fdf33c16 100644
--- a/unsupported/test/cxx11_tensor_concatenation.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -13,15 +13,16 @@
 
 using Eigen::Tensor;
 
+template<int DataLayout>
 static void test_dimension_failures()
 {
-  Tensor<int, 3> left(2, 3, 1);
-  Tensor<int, 3> right(3, 3, 1);
+  Tensor<int, 3, DataLayout> left(2, 3, 1);
+  Tensor<int, 3, DataLayout> right(3, 3, 1);
   left.setRandom();
   right.setRandom();
 
   // Okay; other dimensions are equal.
-  Tensor<int, 3> concatenation = left.concatenate(right, 0);
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
 
   // Dimension mismatches.
   VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 1));
@@ -32,33 +33,35 @@ static void test_dimension_failures()
   VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, -1));
 }
 
+template<int DataLayout>
 static void test_static_dimension_failure()
 {
-  Tensor<int, 2> left(2, 3);
-  Tensor<int, 3> right(2, 3, 1);
+  Tensor<int, 2, DataLayout> left(2, 3);
+  Tensor<int, 3, DataLayout> right(2, 3, 1);
 
 #ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE
   // Technically compatible, but we static assert that the inputs have same
   // NumDims.
-  Tensor<int, 3> concatenation = left.concatenate(right, 0);
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
 #endif
 
   // This can be worked around in this case.
-  Tensor<int, 3> concatenation = left
+  Tensor<int, 3, DataLayout> concatenation = left
       .reshape(Tensor<int, 3>::Dimensions{{2, 3, 1}})
       .concatenate(right, 0);
-  Tensor<int, 2> alternative = left
+  Tensor<int, 2, DataLayout> alternative = left
       .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{2, 3}}), 0);
 }
 
+template<int DataLayout>
 static void test_simple_concatenation()
 {
-  Tensor<int, 3> left(2, 3, 1);
-  Tensor<int, 3> right(2, 3, 1);
+  Tensor<int, 3, DataLayout> left(2, 3, 1);
+  Tensor<int, 3, DataLayout> right(2, 3, 1);
   left.setRandom();
   right.setRandom();
 
-  Tensor<int, 3> concatenation = left.concatenate(right, 0);
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
   VERIFY_IS_EQUAL(concatenation.dimension(0), 4);
   VERIFY_IS_EQUAL(concatenation.dimension(1), 3);
   VERIFY_IS_EQUAL(concatenation.dimension(2), 1);
@@ -103,8 +106,11 @@ static void test_simple_concatenation()
 
 void test_cxx11_tensor_concatenation()
 {
-   CALL_SUBTEST(test_dimension_failures());
-   CALL_SUBTEST(test_static_dimension_failure());
-   CALL_SUBTEST(test_simple_concatenation());
+   CALL_SUBTEST(test_dimension_failures<ColMajor>());
+   CALL_SUBTEST(test_dimension_failures<RowMajor>());
+   CALL_SUBTEST(test_static_dimension_failure<ColMajor>());
+   CALL_SUBTEST(test_static_dimension_failure<RowMajor>());
+   CALL_SUBTEST(test_simple_concatenation<ColMajor>());
+   CALL_SUBTEST(test_simple_concatenation<RowMajor>());
    // CALL_SUBTEST(test_vectorized_concatenation());
 }
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cpp
new file mode 100644
index 000000000..9599607c6
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cpp
@@ -0,0 +1,121 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+static void test_cuda_contraction(int m_size, int k_size, int n_size)
+{
+  std::cout<<"Calling with ("<<m_size<<","<<k_size<<","<<n_size<<")"<<std::endl;
+  // Contracts an (m_size x k_size) tensor with a (k_size x n_size) tensor on the
+  // GPU and on the host, then compares the two results element by element.
+  Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size));
+  Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size));
+  Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size));
+  Tensor<float, 2, DataLayout> t_result_gpu(Eigen::array<int, 2>(m_size, n_size));
+  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  if (cudaStreamCreate(&stream) != cudaSuccess) {  // call kept outside assert() so NDEBUG cannot remove it
+    assert(false);
+  }
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
+
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+  t_result = t_left.contract(t_right, dims);
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i]
+           << " vs " <<  t_result_gpu.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  cudaFree((void*)d_t_left);
+  cudaFree((void*)d_t_right);
+  cudaFree((void*)d_t_result);
+}
+
+// Test entry point: runs test_cuda_contraction(m_size, k_size, n_size) in both layouts over a range of sizes.
+void test_cxx11_tensor_cuda()
+{
+  std::cout<<"Calling contraction tests"<<std::endl;
+  CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128));
+  for (int k = 32; k < 256; k++) {  // sweep k_size
+    CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128));
+    CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128));
+  }
+  for (int k = 32; k < 256; k++) {  // sweep n_size
+    CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k));
+    CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k));
+  }
+  for (int k = 32; k < 256; k++) {  // sweep m_size
+    CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128));
+    CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128));
+  }
+
+  int m_sizes[] = {31,   39,   63,   64,  65,
+                   127, 129,  255,  257, 511,
+                   512, 513, 1023, 1024, 1025 };
+  int n_sizes[] = {31,   39,   63,   64,  65,
+                   127, 129,  255,  257, 511,
+                   512, 513, 1023, 1024, 1025 };
+
+  int k_sizes[] = { 31,  39,  63, 64,    65,
+                    95,  96, 127, 129,  255,
+                   257, 511, 512, 513, 1023,
+                  1024, 1025};
+  // Bounds come from the arrays themselves; each list feeds the parameter it is named after (m, k, n).
+  for (size_t i = 0; i < sizeof(m_sizes) / sizeof(m_sizes[0]); i++)
+    for (size_t j = 0; j < sizeof(n_sizes) / sizeof(n_sizes[0]); j++)
+      for (size_t k = 0; k < sizeof(k_sizes) / sizeof(k_sizes[0]); k++) {
+        CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], k_sizes[k], n_sizes[j]));
+        CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], k_sizes[k], n_sizes[j]));
+      }
+}
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index 17bd335f7..6124818fd 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -16,18 +16,18 @@ using Eigen::Tensor;
 
 typedef Tensor<float, 1>::DimensionPair DimPair;
 
-
+template<int DataLayout>
 static void test_evals()
 {
-  Tensor<float, 2> mat1(2, 3);
-  Tensor<float, 2> mat2(2, 3);
-  Tensor<float, 2> mat3(3, 2);
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(2, 3);
+  Tensor<float, 2, DataLayout> mat3(3, 2);
 
   mat1.setRandom();
   mat2.setRandom();
   mat3.setRandom();
 
-  Tensor<float, 2> mat4(3,3);
+  Tensor<float, 2, DataLayout> mat4(3,3);
   mat4.setZero();
   Eigen::array<DimPair, 1> dims3({{DimPair(0, 0)}});
   typedef TensorEvaluator<decltype(mat1.contract(mat2, dims3)), DefaultDevice> Evaluator;
@@ -47,7 +47,7 @@ static void test_evals()
   VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1));
   VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2));
 
-  Tensor<float, 2> mat5(2,2);
+  Tensor<float, 2, DataLayout> mat5(2,2);
   mat5.setZero();
   Eigen::array<DimPair, 1> dims4({{DimPair(1, 1)}});
   typedef TensorEvaluator<decltype(mat1.contract(mat2, dims4)), DefaultDevice> Evaluator2;
@@ -62,7 +62,7 @@ static void test_evals()
   VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2));
   VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2));
 
-  Tensor<float, 2> mat6(2,2);
+  Tensor<float, 2, DataLayout> mat6(2,2);
   mat6.setZero();
   Eigen::array<DimPair, 1> dims6({{DimPair(1, 0)}});
   typedef TensorEvaluator<decltype(mat1.contract(mat3, dims6)), DefaultDevice> Evaluator3;
@@ -78,16 +78,16 @@ static void test_evals()
   VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1));
 }
 
-
+template<int DataLayout>
 static void test_scalar()
 {
-  Tensor<float, 1> vec1({6});
-  Tensor<float, 1> vec2({6});
+  Tensor<float, 1, DataLayout> vec1({6});
+  Tensor<float, 1, DataLayout> vec2({6});
 
   vec1.setRandom();
   vec2.setRandom();
 
-  Tensor<float, 1> scalar(1);
+  Tensor<float, 1, DataLayout> scalar(1);
   scalar.setZero();
   Eigen::array<DimPair, 1> dims({{DimPair(0, 0)}});
   typedef TensorEvaluator<decltype(vec1.contract(vec2, dims)), DefaultDevice> Evaluator;
@@ -102,16 +102,16 @@ static void test_scalar()
   VERIFY_IS_APPROX(scalar(0), expected);
 }
 
-
+template<int DataLayout>
 static void test_multidims()
 {
-  Tensor<float, 3> mat1(2, 2, 2);
-  Tensor<float, 4> mat2(2, 2, 2, 2);
+  Tensor<float, 3, DataLayout> mat1(2, 2, 2);
+  Tensor<float, 4, DataLayout> mat2(2, 2, 2, 2);
 
   mat1.setRandom();
   mat2.setRandom();
 
-  Tensor<float, 3> mat3(2, 2, 2);
+  Tensor<float, 3, DataLayout> mat3(2, 2, 2);
   mat3.setZero();
   Eigen::array<DimPair, 2> dims({{DimPair(1, 2), DimPair(2, 3)}});
   typedef TensorEvaluator<decltype(mat1.contract(mat2, dims)), DefaultDevice> Evaluator;
@@ -140,15 +140,15 @@ static void test_multidims()
                                 mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1));
 }
 
-
+template<int DataLayout>
 static void test_holes() {
-  Tensor<float, 4> t1(2, 5, 7, 3);
-  Tensor<float, 5> t2(2, 7, 11, 13, 3);
+  Tensor<float, 4, DataLayout> t1(2, 5, 7, 3);
+  Tensor<float, 5, DataLayout> t2(2, 7, 11, 13, 3);
   t1.setRandom();
   t2.setRandom();
 
   Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(3, 4)}});
-  Tensor<float, 5> result = t1.contract(t2, dims);
+  Tensor<float, 5, DataLayout> result = t1.contract(t2, dims);
   VERIFY_IS_EQUAL(result.dimension(0), 5);
   VERIFY_IS_EQUAL(result.dimension(1), 7);
   VERIFY_IS_EQUAL(result.dimension(2), 7);
@@ -174,16 +174,16 @@ static void test_holes() {
   }
 }
 
-
+template<int DataLayout>
 static void test_full_redux()
 {
-  Tensor<float, 2> t1(2, 2);
-  Tensor<float, 3> t2(2, 2, 2);
+  Tensor<float, 2, DataLayout> t1(2, 2);
+  Tensor<float, 3, DataLayout> t2(2, 2, 2);
   t1.setRandom();
   t2.setRandom();
 
   Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
-  Tensor<float, 1> result = t1.contract(t2, dims);
+  Tensor<float, 1, DataLayout> result = t1.contract(t2, dims);
   VERIFY_IS_EQUAL(result.dimension(0), 2);
   VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) +  t1(1, 0) * t2(1, 0, 0)
                             + t1(0, 1) * t2(0, 1, 0) +  t1(1, 1) * t2(1, 1, 0));
@@ -200,13 +200,13 @@ static void test_full_redux()
                             + t1(0, 1) * t2(1, 0, 1) +  t1(1, 1) * t2(1, 1, 1));
 }
 
-
+template<int DataLayout>
 static void test_contraction_of_contraction()
 {
-  Tensor<float, 2> t1(2, 2);
-  Tensor<float, 2> t2(2, 2);
-  Tensor<float, 2> t3(2, 2);
-  Tensor<float, 2> t4(2, 2);
+  Tensor<float, 2, DataLayout> t1(2, 2);
+  Tensor<float, 2, DataLayout> t2(2, 2);
+  Tensor<float, 2, DataLayout> t3(2, 2);
+  Tensor<float, 2, DataLayout> t4(2, 2);
   t1.setRandom();
   t2.setRandom();
   t3.setRandom();
@@ -216,30 +216,32 @@ static void test_contraction_of_contraction()
   auto contract1 = t1.contract(t2, dims);
   auto diff = t3 - contract1;
   auto contract2 = t1.contract(t4, dims);
-  Tensor<float, 2> result = contract2.contract(diff, dims);
+  Tensor<float, 2, DataLayout> result = contract2.contract(diff, dims);
+
   VERIFY_IS_EQUAL(result.dimension(0), 2);
   VERIFY_IS_EQUAL(result.dimension(1), 2);
 
-  Eigen::Map<MatrixXf> m1(t1.data(), 2, 2);
-  Eigen::Map<MatrixXf> m2(t2.data(), 2, 2);
-  Eigen::Map<MatrixXf> m3(t3.data(), 2, 2);
-  Eigen::Map<MatrixXf> m4(t4.data(), 2, 2);
-  Eigen::MatrixXf expected = (m1 * m4) * (m3 - m1 * m2);
+  Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>>
+      m1(t1.data(), 2, 2), m2(t2.data(), 2, 2), m3(t3.data(), 2, 2),
+      m4(t4.data(), 2, 2);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>
+      expected = (m1 * m4) * (m3 - m1 * m2);
+
   VERIFY_IS_APPROX(result(0, 0), expected(0, 0));
   VERIFY_IS_APPROX(result(0, 1), expected(0, 1));
   VERIFY_IS_APPROX(result(1, 0), expected(1, 0));
   VERIFY_IS_APPROX(result(1, 1), expected(1, 1));
 }
 
-
+template<int DataLayout>
 static void test_expr()
 {
-  Tensor<float, 2> mat1(2, 3);
-  Tensor<float, 2> mat2(3, 2);
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(3, 2);
   mat1.setRandom();
   mat2.setRandom();
 
-  Tensor<float, 2> mat3(2,2);
+  Tensor<float, 2, DataLayout> mat3(2,2);
 
   Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
   mat3 = mat1.contract(mat2, dims);
@@ -250,16 +252,16 @@ static void test_expr()
   VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
 }
 
-
+template<int DataLayout>
 static void test_out_of_order_contraction()
 {
-  Tensor<float, 3> mat1(2, 2, 2);
-  Tensor<float, 3> mat2(2, 2, 2);
+  Tensor<float, 3, DataLayout> mat1(2, 2, 2);
+  Tensor<float, 3, DataLayout> mat2(2, 2, 2);
 
   mat1.setRandom();
   mat2.setRandom();
 
-  Tensor<float, 2> mat3(2, 2);
+  Tensor<float, 2, DataLayout> mat3(2, 2);
 
   Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(0, 2)}});
   mat3 = mat1.contract(mat2, dims);
@@ -295,18 +297,18 @@ static void test_out_of_order_contraction()
 
 }
 
-
+template<int DataLayout>
 static void test_consistency()
 {
   // this does something like testing (A*B)^T = (B^T * A^T)
 
-  Tensor<float, 3> mat1(4, 3, 5);
-  Tensor<float, 5> mat2(3, 2, 1, 5, 4);
+  Tensor<float, 3, DataLayout> mat1(4, 3, 5);
+  Tensor<float, 5, DataLayout> mat2(3, 2, 1, 5, 4);
   mat1.setRandom();
   mat2.setRandom();
 
-  Tensor<float, 4> mat3(5, 2, 1, 5);
-  Tensor<float, 4> mat4(2, 1, 5, 5);
+  Tensor<float, 4, DataLayout> mat3(5, 2, 1, 5);
+  Tensor<float, 4, DataLayout> mat4(2, 1, 5, 5);
 
   // contract on dimensions of size 4 and 3
   Eigen::array<DimPair, 2> dims1({{DimPair(0, 4), DimPair(1, 0)}});
@@ -316,27 +318,40 @@ static void test_consistency()
   mat4 = mat2.contract(mat1, dims2);
 
   // check that these are equal except for ordering of dimensions
-  for (size_t i = 0; i < 5; i++) {
-    for (size_t j = 0; j < 10; j++) {
-      VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]);
+  if (DataLayout == ColMajor) {
+    for (size_t i = 0; i < 5; i++) {
+      for (size_t j = 0; j < 10; j++) {
+        VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]);
+      }
+    }
+  } else {
+    // Row major
+    for (size_t i = 0; i < 5; i++) {
+      for (size_t j = 0; j < 10; j++) {
+        VERIFY_IS_APPROX(mat3.data()[10 * i + j], mat4.data()[i + 5 * j]);
+      }
     }
   }
 }
 
-
+template<int DataLayout>
 static void test_large_contraction()
 {
-  Tensor<float, 4> t_left(30, 50, 8, 31);
-  Tensor<float, 5> t_right(8, 31, 7, 20, 10);
-  Tensor<float, 5> t_result(30, 50, 7, 20, 10);
+  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
 
   t_left.setRandom();
   t_right.setRandom();
 
-  typedef Map<MatrixXf> MapXf;
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
   MapXf m_left(t_left.data(), 1500, 248);
   MapXf m_right(t_right.data(), 248, 1400);
-  MatrixXf m_result(1500, 1400);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
 
   // this contraction should be equivalent to a single matrix multiplication
   Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
@@ -351,20 +366,20 @@ static void test_large_contraction()
   }
 }
 
-
+template<int DataLayout>
 static void test_matrix_vector()
 {
-  Tensor<float, 2> t_left(30, 50);
-  Tensor<float, 1> t_right(50);
-  Tensor<float, 1> t_result(30);
+  Tensor<float, 2, DataLayout> t_left(30, 50);
+  Tensor<float, 1, DataLayout> t_right(50);
+  Tensor<float, 1, DataLayout> t_result(30);
 
   t_left.setRandom();
   t_right.setRandom();
 
-  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic>> MapXf;
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
   MapXf m_left(t_left.data(), 30, 50);
   MapXf m_right(t_right.data(), 50, 1);
-  Eigen::Matrix<float, Dynamic, Dynamic> m_result(30, 1);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(30, 1);
 
   // this contraction should be equivalent to a single matrix multiplication
   Eigen::array<DimPair, 1> dims{{DimPair(1, 0)}};
@@ -379,18 +394,19 @@ static void test_matrix_vector()
 }
 
 
+template<int DataLayout>
 static void test_tensor_vector()
 {
-  Tensor<float, 3> t_left(7, 13, 17);
-  Tensor<float, 2> t_right(1, 7);
-  typedef typename Tensor<float, 1>::DimensionPair DimensionPair;
+  Tensor<float, 3, DataLayout> t_left(7, 13, 17);
+  Tensor<float, 2, DataLayout> t_right(1, 7);
+  typedef typename Tensor<float, 1, DataLayout>::DimensionPair DimensionPair;
   Eigen::array<DimensionPair, 1> dim_pair01{{{0, 1}}};
-  Tensor<float, 3> t_result = t_left.contract(t_right, dim_pair01);
+  Tensor<float, 3, DataLayout> t_result = t_left.contract(t_right, dim_pair01);
 
-  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic>> MapXf;
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
   MapXf m_left(t_left.data(), 7, 13*17);
   MapXf m_right(t_right.data(), 1, 7);
-  Eigen::Matrix<float, Dynamic, Dynamic> m_result = m_left.transpose() * m_right.transpose();
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left.transpose() * m_right.transpose();
 
   for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
     VERIFY_IS_APPROX(t_result(i), m_result(i, 0));
@@ -398,18 +414,63 @@ static void test_tensor_vector()
 }
 
 
+template<int DataLayout>
+static void test_small_blocking_factors()
+{
+  // Contracts offset random tensors under forced cache sizes; checks against a matrix product.
+  typedef Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> MatrixType;
+  Tensor<float, 4, DataLayout> lhs(30, 5, 3, 31);
+  Tensor<float, 5, DataLayout> rhs(3, 31, 7, 20, 1);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  // Shift both inputs by one so that the products stay away from zero.
+  lhs += lhs.constant(1.0f);
+  rhs += rhs.constant(1.0f);
+
+  // Forcing these cache sizes results in smaller blocking factors.
+  Eigen::setCpuCacheSizes(896, 1920, 2944);
+
+  // Contracting pairs (2,0) and (3,1) amounts to one 150x93 by 93x140 product.
+  Eigen::array<DimPair, 2> contract_dims({{DimPair(2, 0), DimPair(3, 1)}});
+  Tensor<float, 5, DataLayout> contracted;
+  contracted = lhs.contract(rhs, contract_dims);
+
+  // Reference: the same buffers viewed as matrices and multiplied directly.
+  Map<MatrixType> lhs_mat(lhs.data(), 150, 93), rhs_mat(rhs.data(), 93, 140);
+  MatrixType reference = lhs_mat * rhs_mat;
+  for (size_t idx = 0; idx < contracted.dimensions().TotalSize(); ++idx) {
+    VERIFY_IS_APPROX(contracted.data()[idx], reference.data()[idx]);
+  }
+}
+
+
 void test_cxx11_tensor_contraction()
 {
-  CALL_SUBTEST(test_evals());
-  CALL_SUBTEST(test_scalar());
-  CALL_SUBTEST(test_multidims());
-  CALL_SUBTEST(test_holes());
-  CALL_SUBTEST(test_full_redux());
-  CALL_SUBTEST(test_contraction_of_contraction());
-  CALL_SUBTEST(test_expr());
-  CALL_SUBTEST(test_out_of_order_contraction());
-  CALL_SUBTEST(test_consistency());
-  CALL_SUBTEST(test_large_contraction());
-  CALL_SUBTEST(test_matrix_vector());
-  CALL_SUBTEST(test_tensor_vector());
+  CALL_SUBTEST(test_evals<ColMajor>());  // every subtest below runs once per storage order
+  CALL_SUBTEST(test_evals<RowMajor>());
+  CALL_SUBTEST(test_scalar<ColMajor>());
+  CALL_SUBTEST(test_scalar<RowMajor>());
+  CALL_SUBTEST(test_multidims<ColMajor>());
+  CALL_SUBTEST(test_multidims<RowMajor>());
+  CALL_SUBTEST(test_holes<ColMajor>());
+  CALL_SUBTEST(test_holes<RowMajor>());
+  CALL_SUBTEST(test_full_redux<ColMajor>());
+  CALL_SUBTEST(test_full_redux<RowMajor>());
+  CALL_SUBTEST(test_contraction_of_contraction<ColMajor>());
+  CALL_SUBTEST(test_contraction_of_contraction<RowMajor>());
+  CALL_SUBTEST(test_expr<ColMajor>());
+  CALL_SUBTEST(test_expr<RowMajor>());
+  CALL_SUBTEST(test_out_of_order_contraction<ColMajor>());
+  CALL_SUBTEST(test_out_of_order_contraction<RowMajor>());
+  CALL_SUBTEST(test_consistency<ColMajor>());
+  CALL_SUBTEST(test_consistency<RowMajor>());
+  CALL_SUBTEST(test_large_contraction<ColMajor>());
+  CALL_SUBTEST(test_large_contraction<RowMajor>());
+  CALL_SUBTEST(test_matrix_vector<ColMajor>());
+  CALL_SUBTEST(test_matrix_vector<RowMajor>());
+  CALL_SUBTEST(test_tensor_vector<ColMajor>());
+  CALL_SUBTEST(test_tensor_vector<RowMajor>());
+  CALL_SUBTEST(test_small_blocking_factors<ColMajor>());
+  CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp
new file mode 100644
index 000000000..059d23de1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_cuda.cpp
@@ -0,0 +1,474 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// TODO(mdevin): Free the cuda memory.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_elementwise_small() {
+  // Evaluates in1 + in2 through a GpuDevice and compares with the host-side sum.
+  Tensor<float, 1> in1(Eigen::array<int, 1>(2));
+  Tensor<float, 1> in2(Eigen::array<int, 1>(2));
+  Tensor<float, 1> out(Eigen::array<int, 1>(2));
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  VERIFY(cudaStreamCreate(&stream) == cudaSuccess);  // not assert(): NDEBUG would drop the call
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
+      d_in1, Eigen::array<int, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
+      d_in2, Eigen::array<int, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
+      d_out, Eigen::array<int, 1>(2));
+
+  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(
+        out(Eigen::array<int, 1>(i)),
+        in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i)));
+  }
+}
+
+void test_cuda_elementwise()
+{
+  // Evaluates in1 + in2 * in3 through a GpuDevice and compares every coefficient with the host.
+  Tensor<float, 3> in1(Eigen::array<int, 3>(72,53,97));
+  Tensor<float, 3> in2(Eigen::array<int, 3>(72,53,97));
+  Tensor<float, 3> in3(Eigen::array<int, 3>(72,53,97));
+  Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97));
+  in1.setRandom();
+  in2.setRandom();
+  in3.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t in3_bytes = in3.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_in3;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_in3), in3_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  VERIFY(cudaStreamCreate(&stream) == cudaSuccess);  // not assert(): NDEBUG would drop the call
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97));
+
+  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
+
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      for (int k = 0; k < 97; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * in3(Eigen::array<int, 3>(i,j,k)));
+      }
+    }
+  }
+}
+
+
+void test_cuda_reduction()
+{
+  // Takes the maximum over dimensions 1 and 3 through a GpuDevice; checks it against a host scan.
+  Tensor<float, 4> in1(Eigen::array<int, 4>(72,53,97,113));
+  Tensor<float, 2> out(Eigen::array<int, 2>(72,97));
+  in1.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  VERIFY(cudaStreamCreate(&stream) == cudaSuccess);  // not assert(): NDEBUG would drop the call
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, Eigen::array<int, 4>(72,53,97,113));
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, Eigen::array<int, 2>(72,97));
+
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+
+  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
+
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      float expected = in1(Eigen::array<int, 4>(i, 0, j, 0));  // seed with a real element; 0 assumes a positive max
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected =
+              std::max<float>(expected, in1(Eigen::array<int, 4>(i, k, j, l)));
+        }
+      }
+      VERIFY_IS_APPROX(out(Eigen::array<int, 2>(i,j)), expected);
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_cuda_contraction()
+{
+  // Contracts through a GpuDevice and compares with the equivalent host-side matrix product.
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<float, 4, DataLayout> t_left(Eigen::array<int, 4>(6, 50, 3, 31));
+  Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1));
+  Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  VERIFY(cudaStreamCreate(&stream) == cudaSuccess);  // not assert(): NDEBUG would drop the call
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> >
+      gpu_t_left(d_t_left, Eigen::array<int, 4>(6, 50, 3, 31));
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> >
+      gpu_t_right(d_t_right, Eigen::array<int, 5>(3, 31, 7, 20, 1));
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 5>(6, 50, 7, 20, 1));
+
+  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
+  MapXf m_left(t_left.data(), 300, 93);
+  MapXf m_right(t_right.data(), 93, 140);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims;
+  dims[0] = DimPair(2, 0);
+  dims[1] = DimPair(3, 1);
+
+  m_result = m_left * m_right;
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+
+  cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+
+  for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " <<  m_result.data()[i] << endl;
+      VERIFY(false);  // assert(false) would be a no-op under NDEBUG and hide the mismatch
+    }
+  }
+}
+
+static void test_cuda_convolution_1d()
+{
+  // Convolves dimension 1 with a 4-tap kernel through a GpuDevice; checks against an explicit sum.
+  Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137));
+  Tensor<float, 1> kernel(Eigen::array<int, 1>(4));
+  Tensor<float, 4> out(Eigen::array<int, 4>(74,34,11,137));
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  VERIFY(cudaStreamCreate(&stream) == cudaSuccess);  // not assert(): NDEBUG would drop the call
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137));
+  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_kernel(d_kernel, Eigen::array<int, 1>(4));
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,34,11,137));
+
+  Eigen::array<int, 1> dims(1);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 34; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        for (int l = 0; l < 137; ++l) {
+          const float result = out(Eigen::array<int, 4>(i,j,k,l));
+          const float expected = input(Eigen::array<int, 4>(i,j+0,k,l)) * kernel(Eigen::array<int, 1>(0)) +
+                                 input(Eigen::array<int, 4>(i,j+1,k,l)) * kernel(Eigen::array<int, 1>(1)) +
+                                 input(Eigen::array<int, 4>(i,j+2,k,l)) * kernel(Eigen::array<int, 1>(2)) +
+                                 input(Eigen::array<int, 4>(i,j+3,k,l)) * kernel(Eigen::array<int, 1>(3));
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+}
+
+
+static void test_cuda_convolution_2d()
+{
+  // Convolves dimensions 1 and 2 with a 3x4 kernel through a GpuDevice; checks an explicit sum.
+  Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137));
+  Tensor<float, 2> kernel(Eigen::array<int, 2>(3,4));
+  Tensor<float, 4> out(Eigen::array<int, 4>(74,35,8,137));
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  VERIFY(cudaStreamCreate(&stream) == cudaSuccess);  // not assert(): NDEBUG would drop the call
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137));
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_kernel(d_kernel, Eigen::array<int, 2>(3,4));
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,35,8,137));
+
+  Eigen::array<int, 2> dims(1,2);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 35; ++j) {
+      for (int k = 0; k < 8; ++k) {
+        for (int l = 0; l < 137; ++l) {
+          const float result = out(Eigen::array<int, 4>(i,j,k,l));
+          const float expected = input(Eigen::array<int, 4>(i,j+0,k+0,l)) * kernel(Eigen::array<int, 2>(0,0)) +
+                                 input(Eigen::array<int, 4>(i,j+1,k+0,l)) * kernel(Eigen::array<int, 2>(1,0)) +
+                                 input(Eigen::array<int, 4>(i,j+2,k+0,l)) * kernel(Eigen::array<int, 2>(2,0)) +
+                                 input(Eigen::array<int, 4>(i,j+0,k+1,l)) * kernel(Eigen::array<int, 2>(0,1)) +
+                                 input(Eigen::array<int, 4>(i,j+1,k+1,l)) * kernel(Eigen::array<int, 2>(1,1)) +
+                                 input(Eigen::array<int, 4>(i,j+2,k+1,l)) * kernel(Eigen::array<int, 2>(2,1)) +
+                                 input(Eigen::array<int, 4>(i,j+0,k+2,l)) * kernel(Eigen::array<int, 2>(0,2)) +
+                                 input(Eigen::array<int, 4>(i,j+1,k+2,l)) * kernel(Eigen::array<int, 2>(1,2)) +
+                                 input(Eigen::array<int, 4>(i,j+2,k+2,l)) * kernel(Eigen::array<int, 2>(2,2)) +
+                                 input(Eigen::array<int, 4>(i,j+0,k+3,l)) * kernel(Eigen::array<int, 2>(0,3)) +
+                                 input(Eigen::array<int, 4>(i,j+1,k+3,l)) * kernel(Eigen::array<int, 2>(1,3)) +
+                                 input(Eigen::array<int, 4>(i,j+2,k+3,l)) * kernel(Eigen::array<int, 2>(2,3));
+            VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+}
+
+
+static void test_cuda_convolution_3d()
+{
+  // Convolves dimensions 1, 2 and 3 with a 3x4x2 kernel through a GpuDevice; checks an explicit sum.
+  Tensor<float, 5> input(Eigen::array<int, 5>(74,37,11,137,17));
+  Tensor<float, 3> kernel(Eigen::array<int, 3>(3,4,2));
+  Tensor<float, 5> out(Eigen::array<int, 5>(74,35,8,136,17));
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  VERIFY(cudaStreamCreate(&stream) == cudaSuccess);  // not assert(): NDEBUG would drop the call
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_input(d_input, Eigen::array<int, 5>(74,37,11,137,17));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_kernel(d_kernel, Eigen::array<int, 3>(3,4,2));
+  Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_out(d_out, Eigen::array<int, 5>(74,35,8,136,17));
+
+  Eigen::array<int, 3> dims(1,2,3);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 35; ++j) {
+      for (int k = 0; k < 8; ++k) {
+        for (int l = 0; l < 136; ++l) {
+          for (int m = 0; m < 17; ++m) {
+            const float result = out(Eigen::array<int, 5>(i,j,k,l,m));
+            const float expected = input(Eigen::array<int, 5>(i,j+0,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(0,0,0)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(1,0,0)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(2,0,0)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(0,1,0)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(1,1,0)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(2,1,0)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(0,2,0)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(1,2,0)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(2,2,0)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(0,3,0)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(1,3,0)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(2,3,0)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(0,0,1)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(1,0,1)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(2,0,1)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(0,1,1)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(1,1,1)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(2,1,1)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(0,2,1)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(1,2,1)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(2,2,1)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(0,3,1)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(1,3,1)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(2,3,1));
+            VERIFY_IS_APPROX(result, expected);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Allocates `size` floats on the device and uploads `data` into them when it is non-NULL; NULL on failure.
+static float* CudaCopyFloat(float* data, int size) {
+  const std::size_t nbytes = size * sizeof(float);
+  float* result = NULL;
+  if (cudaMalloc((void**)(&result), nbytes) != cudaSuccess) return NULL;
+  if (data != NULL &&
+      cudaMemcpy(result, data, nbytes, cudaMemcpyHostToDevice) != cudaSuccess) {
+    cudaFree(result);  // never hand back a buffer whose contents were not uploaded
+    return NULL;
+  }
+  return result;
+}
+
+static void test_cuda_constant_broadcast()
+{
+  // Adds a broadcast one-element tensor to a 10-element tensor through a GpuDevice.
+  cudaStream_t stream;
+  VERIFY(cudaStreamCreate(&stream) == cudaSuccess);  // not assert(): NDEBUG would drop the call
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Tensor<float, 1> t1(10);
+  for (int i = 0; i < 10; ++i) {
+    t1(i) = 10.0f * i;
+  }
+  float* t1_cuda = CudaCopyFloat(t1.data(), t1.size());
+  Eigen::TensorMap<Eigen::Tensor<float, 1> > t1_gpu(t1_cuda, 10);
+
+  Tensor<float, 1> t2(1);
+  t2 = t2.constant(20.0f);
+  float* t2_cuda = CudaCopyFloat(t2.data(), t2.size());
+  Eigen::TensorMap<Eigen::TensorFixedSize<float, Sizes<1> > > t2_gpu(t2_cuda, 1);
+  float* t3_cuda = CudaCopyFloat(NULL, 10);
+  Eigen::TensorMap<Eigen::Tensor<float, 1> > t3_gpu(t3_cuda, 10);
+
+  t3_gpu.device(gpu_device) =
+      t1_gpu + t2_gpu.broadcast(Eigen::array<int, 1>(10));
+
+  Eigen::Tensor<float, 1> t3(10);
+  cudaMemcpy(t3.data(), t3_gpu.data(), 10 * sizeof(float),
+             cudaMemcpyDeviceToHost);
+
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_APPROX(t3(i), t1(i) + t2(0));
+  }
+}
+
+void test_cxx11_tensor_cuda()  // runs each CUDA tensor subtest below, in order
+{
+  CALL_SUBTEST(test_cuda_elementwise_small());
+  CALL_SUBTEST(test_cuda_elementwise());
+  CALL_SUBTEST(test_cuda_reduction());
+  CALL_SUBTEST(test_cuda_contraction<ColMajor>());  // contraction is exercised for both storage orders
+  CALL_SUBTEST(test_cuda_contraction<RowMajor>());
+  CALL_SUBTEST(test_cuda_convolution_1d());
+  CALL_SUBTEST(test_cuda_convolution_2d());
+  CALL_SUBTEST(test_cuda_convolution_3d());
+  CALL_SUBTEST(test_cuda_constant_broadcast());
+}
diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp
index 26465ee11..f2d7e4ce6 100644
--- a/unsupported/test/cxx11_tensor_device.cpp
+++ b/unsupported/test/cxx11_tensor_device.cpp
@@ -22,23 +22,23 @@ using Eigen::RowMajor;
 
 // Context for evaluation on cpu
 struct CPUContext {
-  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(Eigen::array<int, 2>(2,2)), kernel_3d_(Eigen::array<int, 3>(2,2,2)) {
+  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
     kernel_1d_(0) = 3.14f;
     kernel_1d_(1) = 2.7f;
 
-    kernel_2d_(Eigen::array<int, 2>(0,0)) = 3.14f;
-    kernel_2d_(Eigen::array<int, 2>(1,0)) = 2.7f;
-    kernel_2d_(Eigen::array<int, 2>(0,1)) = 0.2f;
-    kernel_2d_(Eigen::array<int, 2>(1,1)) = 7.0f;
-
-    kernel_3d_(Eigen::array<int, 3>(0,0,0)) = 3.14f;
-    kernel_3d_(Eigen::array<int, 3>(0,1,0)) = 2.7f;
-    kernel_3d_(Eigen::array<int, 3>(0,0,1)) = 0.2f;
-    kernel_3d_(Eigen::array<int, 3>(0,1,1)) = 7.0f;
-    kernel_3d_(Eigen::array<int, 3>(1,0,0)) = -1.0f;
-    kernel_3d_(Eigen::array<int, 3>(1,1,0)) = -0.3f;
-    kernel_3d_(Eigen::array<int, 3>(1,0,1)) = -0.7f;
-    kernel_3d_(Eigen::array<int, 3>(1,1,1)) = -0.5f;
+    kernel_2d_(0,0) = 3.14f;
+    kernel_2d_(1,0) = 2.7f;
+    kernel_2d_(0,1) = 0.2f;
+    kernel_2d_(1,1) = 7.0f;
+
+    kernel_3d_(0,0,0) = 3.14f;
+    kernel_3d_(0,1,0) = 2.7f;
+    kernel_3d_(0,0,1) = 0.2f;
+    kernel_3d_(0,1,1) = 7.0f;
+    kernel_3d_(1,0,0) = -1.0f;
+    kernel_3d_(1,1,0) = -0.3f;
+    kernel_3d_(1,0,1) = -0.7f;
+    kernel_3d_(1,1,1) = -0.5f;
   }
 
   const Eigen::DefaultDevice& device() const { return cpu_device_; }
@@ -93,8 +93,8 @@ struct GPUContext {
   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
   Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
   Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
-  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, Eigen::array<int, 2>(2, 2)); }
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, Eigen::array<int, 3>(2, 2, 2)); }
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
 
  private:
   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
@@ -150,8 +150,8 @@ static void test_contraction(Context* context)
 template <typename Context>
 static void test_1d_convolution(Context* context)
 {
-  Eigen::DSizes<int, 3> indices(Eigen::array<int, 3>(0,0,0));
-  Eigen::DSizes<int, 3> sizes(Eigen::array<int, 3>(40,49,70));
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(40,49,70);
 
   Eigen::array<int, 1> dims(1);
   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
@@ -160,8 +160,8 @@ static void test_1d_convolution(Context* context)
 template <typename Context>
 static void test_2d_convolution(Context* context)
 {
-  Eigen::DSizes<int, 3> indices(Eigen::array<int, 3>(0,0,0));
-  Eigen::DSizes<int, 3> sizes(Eigen::array<int, 3>(40,49,69));
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(40,49,69);
 
   Eigen::array<int, 2> dims(1,2);
   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
@@ -170,8 +170,8 @@ static void test_2d_convolution(Context* context)
 template <typename Context>
 static void test_3d_convolution(Context* context)
 {
-  Eigen::DSizes<int, 3> indices(Eigen::array<int, 3>(0,0,0));
-  Eigen::DSizes<int, 3> sizes(Eigen::array<int, 3>(39,49,69));
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(39,49,69);
 
   Eigen::array<int, 3> dims(0,1,2);
   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
@@ -179,9 +179,9 @@ static void test_3d_convolution(Context* context)
 
 
 static void test_cpu() {
-  Eigen::Tensor<float, 3> in1(Eigen::array<int, 3>(40,50,70));
-  Eigen::Tensor<float, 3> in2(Eigen::array<int, 3>(40,50,70));
-  Eigen::Tensor<float, 3> out(Eigen::array<int, 3>(40,50,70));
+  Eigen::Tensor<float, 3> in1(40,50,70);
+  Eigen::Tensor<float, 3> in2(40,50,70);
+  Eigen::Tensor<float, 3> out(40,50,70);
 
   in1 = in1.random() + in1.constant(10.0f);
   in2 = in2.random() + in2.constant(10.0f);
@@ -191,7 +191,7 @@ static void test_cpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
       }
     }
   }
@@ -200,7 +200,7 @@ static void test_cpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), (in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k))) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
       }
     }
   }
@@ -209,7 +209,7 @@ static void test_cpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
       }
     }
   }
@@ -217,11 +217,11 @@ static void test_cpu() {
   test_contraction(&context);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 40; ++j) {
-      const float result = out(Eigen::array<int, 3>(i,j,0));
+      const float result = out(i,j,0);
       float expected = 0;
       for (int k = 0; k < 50; ++k) {
         for (int l = 0; l < 70; ++l) {
-          expected += in1(Eigen::array<int, 3>(i, k, l)) * in2(Eigen::array<int, 3>(j, k, l));
+          expected += in1(i, k, l) * in2(j, k, l);
         }
       }
       VERIFY_IS_APPROX(expected, result);
@@ -232,7 +232,7 @@ static void test_cpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f));
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
       }
     }
   }
@@ -241,9 +241,9 @@ static void test_cpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
-        const float result = out(Eigen::array<int, 3>(i,j,k));
-        const float expected = (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f) +
-                               (in1(Eigen::array<int, 3>(i,j,k+1)) * 0.2f + in1(Eigen::array<int, 3>(i,j+1,k+1)) * 7.0f);
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
+                               (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
         if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) {
           continue;
         }
@@ -256,11 +256,11 @@ static void test_cpu() {
   for (int i = 0; i < 39; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
-        const float result = out(Eigen::array<int, 3>(i,j,k));
-        const float expected = (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f +
-                                in1(Eigen::array<int, 3>(i,j,k+1)) * 0.2f + in1(Eigen::array<int, 3>(i,j+1,k+1)) * 7.0f) +
-                               (in1(Eigen::array<int, 3>(i+1,j,k)) * -1.0f + in1(Eigen::array<int, 3>(i+1,j+1,k)) * -0.3f +
-                                in1(Eigen::array<int, 3>(i+1,j,k+1)) * -0.7f + in1(Eigen::array<int, 3>(i+1,j+1,k+1)) * -0.5f);
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
+                               (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
         if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) {
           continue;
         }
@@ -271,9 +271,9 @@ static void test_cpu() {
 }
 
 static void test_gpu() {
-  Eigen::Tensor<float, 3> in1(Eigen::array<int, 3>(40,50,70));
-  Eigen::Tensor<float, 3> in2(Eigen::array<int, 3>(40,50,70));
-  Eigen::Tensor<float, 3> out(Eigen::array<int, 3>(40,50,70));
+  Eigen::Tensor<float, 3> in1(40,50,70);
+  Eigen::Tensor<float, 3> in2(40,50,70);
+  Eigen::Tensor<float, 3> out(40,50,70);
   in1 = in1.random() + in1.constant(10.0f);
   in2 = in2.random() + in2.constant(10.0f);
 
@@ -291,9 +291,9 @@ static void test_gpu() {
   cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
 
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(40,50,70));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(40,50,70));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(40,50,70));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
 
   GPUContext context(gpu_in1, gpu_in2, gpu_out);
   test_contextual_eval(&context);
@@ -301,7 +301,7 @@ static void test_gpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
       }
     }
   }
@@ -311,7 +311,7 @@ static void test_gpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), (in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k))) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
       }
     }
   }
@@ -321,7 +321,7 @@ static void test_gpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
       }
     }
   }
@@ -330,11 +330,11 @@ static void test_gpu() {
   assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 40; ++j) {
-      const float result = out(Eigen::array<int, 3>(i,j,0));
+      const float result = out(i,j,0);
       float expected = 0;
       for (int k = 0; k < 50; ++k) {
         for (int l = 0; l < 70; ++l) {
-          expected += in1(Eigen::array<int, 3>(i, k, l)) * in2(Eigen::array<int, 3>(j, k, l));
+          expected += in1(i, k, l) * in2(j, k, l);
         }
       }
       VERIFY_IS_APPROX(expected, result);
@@ -347,7 +347,7 @@ static void test_gpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f));
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
       }
     }
   }
@@ -358,9 +358,9 @@ static void test_gpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
-        const float result = out(Eigen::array<int, 3>(i,j,k));
-        const float expected = (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f +
-                                in1(Eigen::array<int, 3>(i,j,k+1)) * 0.2f + in1(Eigen::array<int, 3>(i,j+1,k+1)) * 7.0f);
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
         VERIFY_IS_APPROX(expected, result);
       }
     }
@@ -372,11 +372,11 @@ static void test_gpu() {
   for (int i = 0; i < 39; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
-       const float result = out(Eigen::array<int, 3>(i,j,k));
-        const float expected = (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f +
-                                in1(Eigen::array<int, 3>(i,j,k+1)) * 0.2f + in1(Eigen::array<int, 3>(i,j+1,k+1)) * 7.0f +
-                                in1(Eigen::array<int, 3>(i+1,j,k)) * -1.0f + in1(Eigen::array<int, 3>(i+1,j+1,k)) * -0.3f +
-                                in1(Eigen::array<int, 3>(i+1,j,k+1)) * -0.7f + in1(Eigen::array<int, 3>(i+1,j+1,k+1)) * -0.5f);
+       const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
+                                in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
         VERIFY_IS_APPROX(expected, result);
       }
     }
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index c806b623f..0cc4e86f7 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -16,12 +16,15 @@ using Eigen::Tensor;
 
+// Checks that a DSizes built from the extents (2,3,7) reports them through
+// array_get<>, through operator[] and through TotalSize().
 static void test_dynamic_size()
 {
-  Eigen::DSizes<int, 3> dimensions(Eigen::array<int, 3>{{2,3,7}});
+  Eigen::DSizes<int, 3> dimensions(2,3,7);
 
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
   VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7);
+  VERIFY_IS_EQUAL((int)dimensions[0], 2);
+  VERIFY_IS_EQUAL((int)dimensions[1], 3);
+  VERIFY_IS_EQUAL((int)dimensions[2], 7);
 }
 
 static void test_fixed_size()
@@ -37,9 +40,9 @@ static void test_fixed_size()
 
+// Checks that a dynamic DSizes and a static Sizes holding the same extents
+// (2,3,7) are reported as matching by dimensions_match().
 static void test_match()
 {
-  Eigen::DSizes<int, 3> dyn(Eigen::array<int, 3>{{2,3,7}});
+  Eigen::DSizes<int, 3> dyn(2,3,7);
   Eigen::Sizes<2,3,7> stat;
-  VERIFY_IS_EQUAL(Eigen::internal::dimensions_match(dyn, stat), true);
+  VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true);
 }
 
 
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index e85fcbfa9..792fdeade 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -125,6 +125,12 @@ static void test_3d()
   mat7 = mat1.cwiseMax(mat5 * 2.0f).exp();
   Tensor<float, 3, RowMajor> mat8(2,3,7);
   mat8 = (-mat2).exp() * 3.14f;
+  Tensor<float, 3, RowMajor> mat9(2,3,7);
+  mat9 = mat2 + 3.14f;
+  Tensor<float, 3, RowMajor> mat10(2,3,7);
+  mat10 = mat2 - 3.14f;
+  Tensor<float, 3, RowMajor> mat11(2,3,7);
+  mat11 = mat2 / 3.14f;
 
   val = 1.0;
   for (int i = 0; i < 2; ++i) {
@@ -136,6 +142,9 @@ static void test_3d()
         VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f);
         VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f)));
         VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f);
+        VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f);
+        VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f);
+        VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f);
         val += 1.0;
       }
     }
@@ -172,6 +181,36 @@ static void test_constants()
   }
 }
 
+// Exercises the element-wise logical operators on tensors: || and && applied
+// to comparison results and to an int tensor cast to bool.
+static void test_boolean()
+{
+  // vec holds 0, 1, 2, 3, 4, 5.
+  Tensor<int, 1> vec(6);
+  for (int idx = 0; idx < 6; ++idx) {
+    vec(idx) = idx;
+  }
+
+  // Test ||: true only where the value is below 1 or above 4.
+  Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
+  const bool expected_or[6] = {true, false, false, false, false, true};
+  for (int idx = 0; idx < 6; ++idx) {
+    VERIFY_IS_EQUAL(bool1[idx], expected_or[idx]);
+  }
+
+  // Test &&, including cast of operand vec: true where the value is non-zero
+  // and below 4.
+  Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4);
+  const bool expected_and[6] = {false, true, true, true, false, false};
+  for (int idx = 0; idx < 6; ++idx) {
+    VERIFY_IS_EQUAL(bool2[idx], expected_and[idx]);
+  }
+
+  // Compilation tests:
+  // Assigning the results of a cast or a comparison combined with a
+  // Tensor<bool> verifies that CoeffReturnType matches the bool return type of
+  // the Op for both Unary and Binary Ops.
+  Tensor<bool, 1> bool3 = vec.cast<bool>() && bool2;
+  bool3 = vec < vec.constant(4) && bool2;
+}
 
 static void test_functors()
 {
@@ -258,6 +297,7 @@ void test_cxx11_tensor_expr()
   CALL_SUBTEST(test_2d());
   CALL_SUBTEST(test_3d());
   CALL_SUBTEST(test_constants());
+  CALL_SUBTEST(test_boolean());
   CALL_SUBTEST(test_functors());
   CALL_SUBTEST(test_type_casting());
   CALL_SUBTEST(test_select());
diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp
index 529584a7b..ad9de867d 100644
--- a/unsupported/test/cxx11_tensor_forced_eval.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval.cpp
@@ -45,7 +45,34 @@ static void test_simple()
 }
 
 
+// Checks a forced evaluation (.eval()) in the middle of an expression that
+// reads from a TensorMap over const data. The reference result is computed
+// with the Matrix API: every entry of `input` minus the maximum of its column.
+static void test_const()
+{
+  MatrixXf input(3,3);
+  input.setRandom();
+  MatrixXf output = input;
+  output.rowwise() -= input.colwise().maxCoeff();
+
+  // Reduce over dimension 0 ...
+  Eigen::array<int, 1> depth_dim;
+  depth_dim[0] = 0;
+  // ... view the three reduced values as a 1x3 tensor ...
+  Tensor<float, 2>::Dimensions dims2d;
+  dims2d[0] = 1;
+  dims2d[1] = 3;
+  // ... and repeat it 3 times along dimension 0 to get back to 3x3.
+  Eigen::array<int, 2> bcast;
+  bcast[0] = 3;
+  bcast[1] = 1;
+  // The map aliases the matrix storage; no copy of `input` is made here.
+  const TensorMap<Tensor<const float, 2>> input_tensor(input.data(), 3, 3);
+  Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+  // The tensor expression must agree with the matrix reference entry by entry.
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(output(i, j), output_tensor(i, j));
+    }
+  }
+}
+
+
+// Test entry point: runs the forced-evaluation sub-tests.
 void test_cxx11_tensor_forced_eval()
 {
   CALL_SUBTEST(test_simple());
+  CALL_SUBTEST(test_const());
 }
diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp
index 55d35eac0..26854f5a4 100644
--- a/unsupported/test/cxx11_tensor_image_patch.cpp
+++ b/unsupported/test/cxx11_tensor_image_patch.cpp
@@ -28,6 +28,9 @@ static void test_simple_patch()
   VERIFY_IS_EQUAL(single_pixel_patch.dimension(4), 7);
 
   for (int i = 0; i < tensor.size(); ++i) {
+    if (tensor.data()[i] != single_pixel_patch.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl;
+    }
     VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]);
   }
 
@@ -51,6 +54,9 @@ static void test_simple_patch()
               if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
                 expected = tensor(d, r-1+i, c-2+j, b);
               }
+              if (entire_image_patch(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
               VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId, b), expected);
             }
           }
@@ -68,6 +74,11 @@ static void test_simple_patch()
   VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5);
   VERIFY_IS_EQUAL(twod_patch.dimension(4), 7);
 
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  int row_padding = 0;
+  int col_padding = 0;
+  int stride = 1;
+
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 5; ++j) {
       int patchId = i+3*j;
@@ -76,8 +87,13 @@ static void test_simple_patch()
           for (int d = 0; d < 2; ++d) {
             for (int b = 0; b < 7; ++b) {
               float expected = 0.0f;
-              if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 3 && c-1+j < 5) {
-                expected = tensor(d, r-1+i, c-1+j, b);
+              int row_offset = r*stride + i - row_padding;
+              int col_offset = c*stride + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) {
+                expected = tensor(d, row_offset, col_offset, b);
+              }
+              if (twod_patch(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
               }
               VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId, b), expected);
             }
@@ -88,6 +104,156 @@ static void test_simple_patch()
   }
 }
 
+// Verifies VALID padding (no padding) with incrementing values.
+//
+// A 3x3 input with a 2x2 kernel and stride 2 yields a single patch; every
+// entry of that patch is compared with the input entry it should come from.
+static void test_patch_padding_valid()
+{
+  int input_depth = 3;
+  int input_rows = 3;
+  int input_cols = 3;
+  int input_batches = 1;
+  int ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 1);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // No padding is carried out.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  // Number of kernel positions that fit entirely inside the input.
+  const int patch_rows = (input_rows - ksize) / stride + 1;
+  const int patch_cols = (input_cols - ksize) / stride + 1;
+
+  // Iterate over patch indices. The previous bound,
+  // (i+stride+ksize-1) < input_rows, evaluated to 3 < 3 on the first
+  // iteration, so the comparisons below were never executed.
+  for (int pi = 0; pi < patch_rows; ++pi) {  // patch row index
+    for (int pj = 0; pj < patch_cols; ++pj) {  // patch col index
+      const int i = pi * stride;  // first input row covered by the patch
+      const int j = pj * stride;  // first input col covered by the patch
+      // Same numbering as test_simple_patch (row index + rows * col index);
+      // with a single patch this is always 0.
+      int patchId = pi + patch_rows * pj;
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              int row_offset = r + i - row_padding;
+              int col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+              }
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies VALID padding (no padding) with the same value.
+//
+// A 5x5 input with a 3x3 kernel and stride 2 yields 2x2 = 4 patches. Every
+// input entry is 11, so every entry of every patch must be 11 as well.
+static void test_patch_padding_valid_same_value()
+{
+  int input_depth = 1;
+  int input_rows = 5;
+  int input_cols = 5;
+  int input_batches = 2;
+  int ksize = 3;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  tensor = tensor.constant(11.0f);
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 4);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // No padding is carried out.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  // Number of kernel positions that fit entirely inside the input.
+  const int patch_rows = (input_rows - ksize) / stride + 1;
+  const int patch_cols = (input_cols - ksize) / stride + 1;
+
+  // Iterate over patch indices so that all four patches are checked. The
+  // previous bound, (i+stride+ksize-1) <= input_rows, only admitted i == 0 and
+  // j == 0, and patchId = i+input_rows*j was an input position rather than a
+  // patch index.
+  for (int pi = 0; pi < patch_rows; ++pi) {  // patch row index
+    for (int pj = 0; pj < patch_cols; ++pj) {  // patch col index
+      const int i = pi * stride;  // first input row covered by the patch
+      const int j = pj * stride;  // first input col covered by the patch
+      // The order in which patches are numbered does not matter here because
+      // the input is constant; each id in [0, 4) is visited exactly once.
+      int patchId = pi + patch_rows * pj;
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              int row_offset = r + i - row_padding;
+              int col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+              }
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies SAME padding.
+//
+// Extracts 2x2 patches with stride 2 from a depth-3, 4x2, single-batch input
+// filled with incrementing values, checks the dimensions of the result and
+// then compares patch entries against the input.
+//
+// NOTE(review): with input_cols = 2, ksize = 2 and stride = 2 the inner loop
+// condition (j+stride+ksize-1) <= input_cols is 3 <= 2 for j = 0, so the
+// per-entry comparisons below are never reached and only the dimension checks
+// are effective. The offsets here also use r*stride / c*stride whereas the
+// VALID padding tests use r / c; confirm the intended formula before enabling
+// the loop.
+static void test_patch_padding_same()
+{
+  int input_depth = 3;
+  int input_rows = 4;
+  int input_cols = 2;
+  int input_batches = 1;
+  int ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 2);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be
+  // 0.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      int patchId = i+input_rows*j;
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              // Entries that fall outside the input are expected to be zero.
+              float expected = 0.0f;
+              int row_offset = r*stride + i - row_padding;
+              int col_offset = c*stride + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+              }
+              // Print the failing coordinates before the VERIFY below aborts.
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+}
 
 static void test_patch_no_extra_dim()
 {
@@ -103,6 +269,9 @@ static void test_patch_no_extra_dim()
   VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5);
 
   for (int i = 0; i < tensor.size(); ++i) {
+    if (tensor.data()[i] != single_pixel_patch.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl;
+    }
     VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]);
   }
 
@@ -124,6 +293,9 @@ static void test_patch_no_extra_dim()
             if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
               expected = tensor(d, r-1+i, c-2+j);
             }
+            if (entire_image_patch(d, r, c, patchId) != expected) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
             VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId), expected);
           }
         }
@@ -139,6 +311,11 @@ static void test_patch_no_extra_dim()
   VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
   VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5);
 
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  int row_padding = 0;
+  int col_padding = 0;
+  int stride = 1;
+
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 5; ++j) {
       int patchId = i+3*j;
@@ -146,8 +323,13 @@ static void test_patch_no_extra_dim()
         for (int c = 0; c < 2; ++c) {
           for (int d = 0; d < 2; ++d) {
             float expected = 0.0f;
-            if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 3 && c-1+j < 5) {
-              expected = tensor(d, r-1+i, c-1+j);
+            int row_offset = r*stride + i - row_padding;
+            int col_offset = c*stride + j - col_padding;
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) {
+              expected = tensor(d, row_offset, col_offset);
+            }
+            if (twod_patch(d, r, c, patchId) != expected) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
             }
             VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId), expected);
           }
@@ -181,6 +363,9 @@ static void test_imagenet_patches()
               if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {
                 expected = l_in(d, r-5+i, c-5+j, b);
               }
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
               VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
             }
           }
@@ -208,6 +393,9 @@ static void test_imagenet_patches()
               if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {
                 expected = l_in(d, r-4+i, c-4+j, b);
               }
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
               VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
             }
           }
@@ -235,6 +423,9 @@ static void test_imagenet_patches()
               if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {
                 expected = l_in(d, r-3+i, c-3+j, b);
               }
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
               VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
             }
           }
@@ -262,6 +453,9 @@ static void test_imagenet_patches()
               if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
                 expected = l_in(d, r-1+i, c-1+j, b);
               }
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
               VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
             }
           }
@@ -271,10 +465,12 @@ static void test_imagenet_patches()
   }
 }
 
-
 void test_cxx11_tensor_image_patch()
 {
   CALL_SUBTEST(test_simple_patch());
   CALL_SUBTEST(test_patch_no_extra_dim());
+  CALL_SUBTEST(test_patch_padding_valid());  // NOTE(review): names suggest VALID/SAME padding modes; definitions are not in this hunk - confirm
+  CALL_SUBTEST(test_patch_padding_valid_same_value());
+  CALL_SUBTEST(test_patch_padding_same());
   CALL_SUBTEST(test_imagenet_patches());
 }
diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp
index 478c20306..9cf2eb150 100644
--- a/unsupported/test/cxx11_tensor_map.cpp
+++ b/unsupported/test/cxx11_tensor_map.cpp
@@ -29,6 +29,7 @@ static void test_1d()
   vec1(4) = 23; vec2(4) = 4;
   vec1(5) = 42; vec2(5) = 5;
 
+  VERIFY_IS_EQUAL(vec1.rank(), 1);
   VERIFY_IS_EQUAL(vec1.size(), 6);
   VERIFY_IS_EQUAL(vec1.dimension(0), 6);
 
@@ -69,10 +70,12 @@ static void test_2d()
   TensorMap<Tensor<const int, 2>> mat3(mat1.data(), 2, 3);
   TensorMap<Tensor<const int, 2, RowMajor>> mat4(mat2.data(), 2, 3);
 
+  VERIFY_IS_EQUAL(mat3.rank(), 2);
   VERIFY_IS_EQUAL(mat3.size(), 6);
   VERIFY_IS_EQUAL(mat3.dimension(0), 2);
   VERIFY_IS_EQUAL(mat3.dimension(1), 3);
 
+  VERIFY_IS_EQUAL(mat4.rank(), 2);
   VERIFY_IS_EQUAL(mat4.size(), 6);
   VERIFY_IS_EQUAL(mat4.dimension(0), 2);
   VERIFY_IS_EQUAL(mat4.dimension(1), 3);
@@ -109,13 +112,15 @@ static void test_3d()
   }
 
   TensorMap<Tensor<const int, 3>> mat3(mat1.data(), 2, 3, 7);
-  TensorMap<Tensor<const int, 3, RowMajor>> mat4(mat2.data(), 2, 3, 7);
+  TensorMap<Tensor<const int, 3, RowMajor>> mat4(mat2.data(), array<DenseIndex, 3>{{2, 3, 7}});
 
+  VERIFY_IS_EQUAL(mat3.rank(), 3);
   VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
   VERIFY_IS_EQUAL(mat3.dimension(0), 2);
   VERIFY_IS_EQUAL(mat3.dimension(1), 3);
   VERIFY_IS_EQUAL(mat3.dimension(2), 7);
 
+  VERIFY_IS_EQUAL(mat4.rank(), 3);
   VERIFY_IS_EQUAL(mat4.size(), 2*3*7);
   VERIFY_IS_EQUAL(mat4.dimension(0), 2);
   VERIFY_IS_EQUAL(mat4.dimension(1), 3);
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
index 78b0dade0..b4b0a55b6 100644
--- a/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -89,19 +89,19 @@ static void test_reshape_as_lvalue()
   }
 }
 
-
+template<int DataLayout>
 static void test_simple_slice()
 {
-  Tensor<float, 5> tensor(2,3,5,7,11);
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
   tensor.setRandom();
 
-  Tensor<float, 5> slice1(1,1,1,1,1);
+  Tensor<float, 5, DataLayout> slice1(1,1,1,1,1);
   Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
   Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
   slice1 = tensor.slice(indices, sizes);
   VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
 
-  Tensor<float, 5> slice2(1,1,2,2,3);
+  Tensor<float, 5, DataLayout> slice2(1,1,2,2,3);
   Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
   Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
   slice2 = tensor.slice(indices2, sizes2);
@@ -114,7 +114,7 @@ static void test_simple_slice()
   }
 }
 
-
+// TODO(andydavis) Add RowMajor support when TensorContract supports RowMajor.
 static void test_slice_in_expr() {
   MatrixXf m1(7,7);
   MatrixXf m2(3,3);
@@ -141,21 +141,28 @@ static void test_slice_in_expr() {
       VERIFY_IS_APPROX(res(i,j), m3(i,j));
     }
   }
-}
 
+  // Take an arbitrary slice of an arbitrarily sized tensor.
+  TensorMap<Tensor<const float, 2>> tensor4(m1.data(), 7, 7);
+  Tensor<float, 1> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
+  for (int i = 0; i < 35; ++i) {
+    VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i]));
+  }
+}
 
+template<int DataLayout>
 static void test_slice_as_lvalue()
 {
-  Tensor<float, 3> tensor1(2,2,7);
+  Tensor<float, 3, DataLayout> tensor1(2,2,7);
   tensor1.setRandom();
-  Tensor<float, 3> tensor2(2,2,7);
+  Tensor<float, 3, DataLayout> tensor2(2,2,7);
   tensor2.setRandom();
-  Tensor<float, 3> tensor3(4,3,5);
+  Tensor<float, 3, DataLayout> tensor3(4,3,5);
   tensor3.setRandom();
-  Tensor<float, 3> tensor4(4,3,2);
+  Tensor<float, 3, DataLayout> tensor4(4,3,2);
   tensor4.setRandom();
 
-  Tensor<float, 3> result(4,5,7);
+  Tensor<float, 3, DataLayout> result(4,5,7);
   Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7);
   Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0);
   result.slice(first_slice, sizes12) = tensor1;
@@ -190,10 +197,10 @@ static void test_slice_as_lvalue()
   }
 }
 
-
+template<int DataLayout>
 static void test_slice_raw_data()
 {
-  Tensor<float, 4> tensor(3,5,7,11);
+  Tensor<float, 4, DataLayout> tensor(3,5,7,11);
   tensor.setRandom();
 
   Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4);
@@ -203,40 +210,78 @@ static void test_slice_raw_data()
   VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1ul);
   VERIFY_IS_EQUAL(slice1.data()[0], tensor(1,2,3,4));
 
-  extents = Eigen::DSizes<ptrdiff_t, 4>(2,1,1,1);
-  auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
-  VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul);
-  VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4));
-  VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4));
+  if (DataLayout == ColMajor) {
+    extents = Eigen::DSizes<ptrdiff_t, 4>(2,1,1,1);
+    auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul);
+    VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4));
+    VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4));
+  } else {
+    extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,1,2);
+    auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul);
+    VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4));
+    VERIFY_IS_EQUAL(slice2.data()[1], tensor(1,2,3,5));
+  }
 
   extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1);
   auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
   VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2ul);
   VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0));
 
-  offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4);
-  extents = Eigen::DSizes<ptrdiff_t, 4>(3,2,1,1);
-  auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
-  VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6ul);
-  for (int i = 0; i < 3; ++i) {
-    for (int j = 0; j < 2; ++j) {
-      VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4));
+  if (DataLayout == ColMajor) {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(3,2,1,1);
+    auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6ul);
+    for (int i = 0; i < 3; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4));
+      }
+    }
+  } else {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(1,2,3,0);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,2,11);
+    auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 22ul);
+    for (int l = 0; l < 11; ++l) {
+      for (int k = 0; k < 2; ++k) {
+        VERIFY_IS_EQUAL(slice4.data()[l+11*k], tensor(1,2,3+k,l));
+      }
     }
   }
 
-  offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,4);
-  extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,2);
-  auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
-  VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210ul);
-  for (int i = 0; i < 3; ++i) {
-    for (int j = 0; j < 5; ++j) {
+  if (DataLayout == ColMajor) {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,4);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,2);
+    auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210ul);
+    for (int i = 0; i < 3; ++i) {
+      for (int j = 0; j < 5; ++j) {
+        for (int k = 0; k < 7; ++k) {
+          for (int l = 0; l < 2; ++l) {
+            int slice_index = i + 3 * (j + 5 * (k + 7 * l));
+            VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4));
+          }
+        }
+      }
+    }
+  } else {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(1,0,0,0);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(2,5,7,11);
+    auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 770ul);
+    for (int l = 0; l < 11; ++l) {
       for (int k = 0; k < 7; ++k) {
-        for (int l = 0; l < 2; ++l) {
-          int slice_index = i + 3 * (j + 5 * (k + 7 * l));
-          VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4));
+        for (int j = 0; j < 5; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            int slice_index = l + 11 * (k + 7 * (j + 5 * i));
+            VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i+1,j,k,l));
+          }
         }
       }
     }
+
   }
 
   offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,0);
@@ -247,14 +292,38 @@ static void test_slice_raw_data()
 }
 
 
+static void test_composition()  // Checks that slice() and reshape() can be chained in a single expression.
+{
+  Eigen::Tensor<float, 2> matrix(7, 11);
+  matrix.setRandom();
+  // Slice out row 2 of the 7x11 matrix (offsets (2,0), extents (1,11)) and reshape it to 1x1x11.
+  const DSizes<ptrdiff_t, 3> newDims{{1, 1, 11}};
+  Eigen::Tensor<float, 3> tensor =
+      matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims);
+  // The result must have the requested shape and hold the 11 coefficients of row 2, in order.
+  VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11ul);
+  VERIFY_IS_EQUAL(tensor.dimension(0), 1);
+  VERIFY_IS_EQUAL(tensor.dimension(1), 1);
+  VERIFY_IS_EQUAL(tensor.dimension(2), 11);
+  for (int i = 0; i < 11; ++i) {
+    VERIFY_IS_EQUAL(tensor(0,0,i), matrix(2,i));
+  }
+}
+
+
 void test_cxx11_tensor_morphing()
 {
   CALL_SUBTEST(test_simple_reshape());
   CALL_SUBTEST(test_reshape_in_expr());
   CALL_SUBTEST(test_reshape_as_lvalue());
 
-  CALL_SUBTEST(test_simple_slice());
+  CALL_SUBTEST(test_simple_slice<ColMajor>());  // the templated slice tests are instantiated once per storage order
+  CALL_SUBTEST(test_simple_slice<RowMajor>());
   CALL_SUBTEST(test_slice_in_expr());
-  CALL_SUBTEST(test_slice_as_lvalue());
-  CALL_SUBTEST(test_slice_raw_data());
+  CALL_SUBTEST(test_slice_as_lvalue<ColMajor>());
+  CALL_SUBTEST(test_slice_as_lvalue<RowMajor>());
+  CALL_SUBTEST(test_slice_raw_data<ColMajor>());
+  CALL_SUBTEST(test_slice_raw_data<RowMajor>());
+  // slice() followed by reshape() in one expression.
+  CALL_SUBTEST(test_composition());
 }
diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp
index 0ffa341c4..8d05d154e 100644
--- a/unsupported/test/cxx11_tensor_of_strings.cpp
+++ b/unsupported/test/cxx11_tensor_of_strings.cpp
@@ -8,19 +8,18 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "main.h"
-#include <string>
+
 #include <Eigen/CXX11/Tensor>
 
-using std::string;
 using Eigen::Tensor;
 using Eigen::TensorMap;
 
 static void test_assign()
 {
-  string data1[6];
-  TensorMap<Tensor<string, 2>> mat1(data1, 2, 3);
-  string data2[6];
-  const TensorMap<Tensor<const string, 2>> mat2(data2, 2, 3);
+  std::string data1[6];
+  TensorMap<Tensor<std::string, 2>> mat1(data1, 2, 3);
+  std::string data2[6];
+  const TensorMap<Tensor<const std::string, 2>> mat2(data2, 2, 3);
 
   for (int i = 0; i < 6; ++i) {
     std::ostringstream s1;
@@ -31,16 +30,16 @@ static void test_assign()
     data2[i] = s2.str();
   }
 
-  Tensor<string, 2> rslt1;
+  Tensor<std::string, 2> rslt1;
   rslt1 = mat1;
-  Tensor<string, 2> rslt2;
+  Tensor<std::string, 2> rslt2;
   rslt2 = mat2;
 
-  Tensor<string, 2> rslt3 = mat1;
-  Tensor<string, 2> rslt4 = mat2;
+  Tensor<std::string, 2> rslt3 = mat1;
+  Tensor<std::string, 2> rslt4 = mat2;
 
-  Tensor<string, 2> rslt5(mat1);
-  Tensor<string, 2> rslt6(mat2);
+  Tensor<std::string, 2> rslt5(mat1);
+  Tensor<std::string, 2> rslt6(mat2);
 
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
@@ -57,8 +56,8 @@ static void test_assign()
 
 static void test_concat()
 {
-  Tensor<string, 2> t1(2, 3);
-  Tensor<string, 2> t2(2, 3);
+  Tensor<std::string, 2> t1(2, 3);
+  Tensor<std::string, 2> t2(2, 3);
 
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
@@ -71,7 +70,7 @@ static void test_concat()
     }
   }
 
-  Tensor<string, 2> result = t1.concatenate(t2, 1);
+  Tensor<std::string, 2> result = t1.concatenate(t2, 1);
   VERIFY_IS_EQUAL(result.dimension(0), 2);
   VERIFY_IS_EQUAL(result.dimension(1), 6);
 
@@ -86,7 +85,7 @@ static void test_concat()
 
 static void test_slices()
 {
-  Tensor<string, 2> data(2, 6);
+  Tensor<std::string, 2> data(2, 6);
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       std::ostringstream s1;
@@ -99,8 +98,8 @@ static void test_slices()
   const Eigen::DSizes<ptrdiff_t, 2> first_half{{0, 0}};
   const Eigen::DSizes<ptrdiff_t, 2> second_half{{0, 3}};
 
-  Tensor<string, 2> t1 = data.slice(first_half, half_size);
-  Tensor<string, 2> t2 = data.slice(second_half, half_size);
+  Tensor<std::string, 2> t1 = data.slice(first_half, half_size);
+  Tensor<std::string, 2> t2 = data.slice(second_half, half_size);
 
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
@@ -113,8 +112,8 @@ static void test_slices()
 
 static void test_additions()
 {
-  Tensor<string, 1> data1(3);
-  Tensor<string, 1> data2(3);
+  Tensor<std::string, 1> data1(3);
+  Tensor<std::string, 1> data2(3);
   for (int i = 0; i < 3; ++i) {
     data1(i) = "abc";
     std::ostringstream s1;
@@ -122,16 +121,26 @@ static void test_additions()
     data2(i) = s1.str();
   }
 
-  Tensor<string, 1> sum = data1 + data2;
+  Tensor<std::string, 1> sum = data1 + data2;
   for (int i = 0; i < 3; ++i) {
     std::ostringstream concat;
     concat << "abc" << i;
-    string expected = concat.str();
+    std::string expected = concat.str();
     VERIFY_IS_EQUAL(sum(i), expected);
   }
 }
 
 
+static void test_initialization()  // Checks that setConstant() fills every coefficient of a std::string tensor.
+{
+  Tensor<std::string, 2> a(2, 3);
+  a.setConstant(std::string("foo"));
+  for (int i = 0; i < 2*3; ++i) {  // one-index access a(i); presumably a flat index over all 2*3 coefficients
+    VERIFY_IS_EQUAL(a(i), std::string("foo"));
+  }
+}
+
+
 void test_cxx11_tensor_of_strings()
 {
   // Beware: none of this is likely to ever work on a GPU.
@@ -139,4 +148,5 @@ void test_cxx11_tensor_of_strings()
   CALL_SUBTEST(test_concat());
   CALL_SUBTEST(test_slices());
   CALL_SUBTEST(test_additions());
+  CALL_SUBTEST(test_initialization());
 }
diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp
index 6f74216dd..ffa19896e 100644
--- a/unsupported/test/cxx11_tensor_padding.cpp
+++ b/unsupported/test/cxx11_tensor_padding.cpp
@@ -13,9 +13,10 @@
 
 using Eigen::Tensor;
 
+template<int DataLayout>
 static void test_simple_padding()
 {
-  Tensor<float, 4> tensor(2,3,5,7);
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   tensor.setRandom();
 
   array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
@@ -24,7 +25,7 @@ static void test_simple_padding()
   paddings[2] = std::make_pair(3, 4);
   paddings[3] = std::make_pair(0, 0);
 
-  Tensor<float, 4> padded;
+  Tensor<float, 4, DataLayout> padded;
   padded = tensor.pad(paddings);
 
   VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
@@ -47,9 +48,10 @@ static void test_simple_padding()
   }
 }
 
+template<int DataLayout>
 static void test_padded_expr()
 {
-  Tensor<float, 4> tensor(2,3,5,7);
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   tensor.setRandom();
 
   array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
@@ -62,17 +64,19 @@ static void test_padded_expr()
   reshape_dims[0] = 12;
   reshape_dims[1] = 84;
 
-  Tensor<float, 2> result;
+  Tensor<float, 2, DataLayout> result;
   result = tensor.pad(paddings).reshape(reshape_dims);
 
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 6; ++j) {
       for (int k = 0; k < 12; ++k) {
         for (int l = 0; l < 7; ++l) {
+          const float result_value = DataLayout == ColMajor ?
+              result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
           if (j >= 2 && j < 5 && k >= 3 && k < 8) {
-            VERIFY_IS_EQUAL(result(i+2*j,k+12*l), tensor(i,j-2,k-3,l));
+            VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l));
           } else {
-            VERIFY_IS_EQUAL(result(i+2*j,k+12*l), 0.0f);
+            VERIFY_IS_EQUAL(result_value, 0.0f);
           }
         }
       }
@@ -80,9 +84,10 @@ static void test_padded_expr()
   }
 }
 
-
 void test_cxx11_tensor_padding()
 {
-  CALL_SUBTEST(test_simple_padding());
-  CALL_SUBTEST(test_padded_expr());
+  CALL_SUBTEST(test_simple_padding<ColMajor>());  // each padding test is instantiated once per storage order
+  CALL_SUBTEST(test_simple_padding<RowMajor>());
+  CALL_SUBTEST(test_padded_expr<ColMajor>());
+  CALL_SUBTEST(test_padded_expr<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp
index e2ba5bfd8..0ee7b46d4 100644
--- a/unsupported/test/cxx11_tensor_patch.cpp
+++ b/unsupported/test/cxx11_tensor_patch.cpp
@@ -36,6 +36,23 @@ static void test_simple_patch()
     VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]);
   }
 
+  patch_dims[0] = 2;
+  patch_dims[1] = 3;
+  patch_dims[2] = 5;
+  patch_dims[3] = 7;
+  Tensor<float, 5> single_patch;
+  single_patch = tensor.extract_patches(patch_dims);
+
+  VERIFY_IS_EQUAL(single_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(single_patch.dimension(1), 3);
+  VERIFY_IS_EQUAL(single_patch.dimension(2), 5);
+  VERIFY_IS_EQUAL(single_patch.dimension(3), 7);
+  VERIFY_IS_EQUAL(single_patch.dimension(4), 1);
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]);
+  }
+
   patch_dims[0] = 1;
   patch_dims[1] = 2;
   patch_dims[2] = 2;
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index da9885166..99e19eba4 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -13,15 +13,15 @@
 
 using Eigen::Tensor;
 
-static void test_simple_reductions()
-{
-  Tensor<float, 4> tensor(2,3,5,7);
+template <int DataLayout>
+static void test_simple_reductions() {
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
   tensor.setRandom();
   array<ptrdiff_t, 2> reduction_axis;
   reduction_axis[0] = 1;
   reduction_axis[1] = 3;
 
-  Tensor<float, 2> result = tensor.sum(reduction_axis);
+  Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis);
   VERIFY_IS_EQUAL(result.dimension(0), 2);
   VERIFY_IS_EQUAL(result.dimension(1), 5);
   for (int i = 0; i < 2; ++i) {
@@ -36,6 +36,53 @@ static void test_simple_reductions()
     }
   }
 
+  {
+    Tensor<float, 1, DataLayout> sum1 = tensor.sum();
+    VERIFY_IS_EQUAL(sum1.dimension(0), 1);
+
+    array<ptrdiff_t, 4> reduction_axis;
+    reduction_axis[0] = 0;
+    reduction_axis[1] = 1;
+    reduction_axis[2] = 2;
+    reduction_axis[3] = 3;
+    Tensor<float, 1, DataLayout> sum2 = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(sum2.dimension(0), 1);
+
+    VERIFY_IS_APPROX(sum1(0), sum2(0));
+  }
+
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 2;
+  result = tensor.prod(reduction_axis);
+  VERIFY_IS_EQUAL(result.dimension(0), 3);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float prod = 1.0f;
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          prod *= tensor(k, i, l, j);
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), prod);
+    }
+  }
+
+  {
+    Tensor<float, 1, DataLayout> prod1 = tensor.prod();
+    VERIFY_IS_EQUAL(prod1.dimension(0), 1);
+
+    array<ptrdiff_t, 4> reduction_axis;
+    reduction_axis[0] = 0;
+    reduction_axis[1] = 1;
+    reduction_axis[2] = 2;
+    reduction_axis[3] = 3;
+    Tensor<float, 1, DataLayout> prod2 = tensor.prod(reduction_axis);
+    VERIFY_IS_EQUAL(prod2.dimension(0), 1);
+
+    VERIFY_IS_APPROX(prod1(0), prod2(0));
+  }
+
   reduction_axis[0] = 0;
   reduction_axis[1] = 2;
   result = tensor.maximum(reduction_axis);
@@ -53,6 +100,21 @@ static void test_simple_reductions()
     }
   }
 
+  {
+    Tensor<float, 1, DataLayout> max1 = tensor.maximum();
+    VERIFY_IS_EQUAL(max1.dimension(0), 1);
+
+    array<ptrdiff_t, 4> reduction_axis;
+    reduction_axis[0] = 0;
+    reduction_axis[1] = 1;
+    reduction_axis[2] = 2;
+    reduction_axis[3] = 3;
+    Tensor<float, 1, DataLayout> max2 = tensor.maximum(reduction_axis);
+    VERIFY_IS_EQUAL(max2.dimension(0), 1);
+
+    VERIFY_IS_APPROX(max1(0), max2(0));
+  }
+
   reduction_axis[0] = 0;
   reduction_axis[1] = 1;
   result = tensor.minimum(reduction_axis);
@@ -63,24 +125,72 @@ static void test_simple_reductions()
       float min_val = (std::numeric_limits<float>::max)();
       for (int k = 0; k < 2; ++k) {
         for (int l = 0; l < 3; ++l) {
-          min_val = (std::min)(min_val, tensor(k,  l, i, j));
+          min_val = (std::min)(min_val, tensor(k, l, i, j));
         }
       }
       VERIFY_IS_APPROX(result(i, j), min_val);
     }
   }
-}
 
+  {
+    Tensor<float, 1, DataLayout> min1 = tensor.minimum();
+    VERIFY_IS_EQUAL(min1.dimension(0), 1);
+
+    array<ptrdiff_t, 4> reduction_axis;
+    reduction_axis[0] = 0;
+    reduction_axis[1] = 1;
+    reduction_axis[2] = 2;
+    reduction_axis[3] = 3;
+    Tensor<float, 1, DataLayout> min2 = tensor.minimum(reduction_axis);
+    VERIFY_IS_EQUAL(min2.dimension(0), 1);
 
-static void test_full_reductions()
-{
-  Tensor<float, 2> tensor(2,3);
+    VERIFY_IS_APPROX(min1(0), min2(0));
+  }
+
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+  result = tensor.mean(reduction_axis);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float sum = 0.0f;
+      int count = 0;
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          sum += tensor(k, l, i, j);
+          ++count;
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), sum / count);
+    }
+  }
+
+  {
+    Tensor<float, 1, DataLayout> mean1 = tensor.mean();
+    VERIFY_IS_EQUAL(mean1.dimension(0), 1);
+
+    array<ptrdiff_t, 4> reduction_axis;
+    reduction_axis[0] = 0;
+    reduction_axis[1] = 1;
+    reduction_axis[2] = 2;
+    reduction_axis[3] = 3;
+    Tensor<float, 1, DataLayout> mean2 = tensor.mean(reduction_axis);
+    VERIFY_IS_EQUAL(mean2.dimension(0), 1);
+
+    VERIFY_IS_APPROX(mean1(0), mean2(0));
+  }
+}
+
+template <int DataLayout>
+static void test_full_reductions() {
+  Tensor<float, 2, DataLayout> tensor(2, 3);
   tensor.setRandom();
   array<ptrdiff_t, 2> reduction_axis;
   reduction_axis[0] = 0;
   reduction_axis[1] = 1;
 
-  Tensor<float, 1> result = tensor.sum(reduction_axis);
+  Tensor<float, 1, DataLayout> result = tensor.sum(reduction_axis);
   VERIFY_IS_EQUAL(result.dimension(0), 1);
 
   float sum = 0.0f;
@@ -103,30 +213,26 @@ static void test_full_reductions()
   VERIFY_IS_APPROX(result(0), sqrtf(sum));
 }
 
-
 struct UserReducer {
-  UserReducer(float offset) : offset_(offset), sum_(0.0f) {}
-  void reduce(const float val) {
-    sum_ += val * val;
-  }
-  float finalize() const {
-    return 1.0f / (sum_ + offset_);
-  }
+  static const bool PacketAccess = false;  // presumably tells the reduction evaluator there is no packet (vectorized) path - confirm
+  UserReducer(float offset) : offset_(offset) {}  // offset is added to the accumulated sum in finalize()
+  void reduce(const float val, float* accum) { *accum += val * val; }  // folds val*val into the caller-provided accumulator
+  float initialize() const { return 0; }  // returns 0; presumably the accumulator's starting value
+  float finalize(const float accum) const { return 1.0f / (accum + offset_); }  // maps the accumulated sum to 1 / (sum + offset)
 
  private:
-  float offset_;
-  float sum_;
+  const float offset_;  // fixed at construction; the running sum lives in the accumulator passed to reduce()/finalize()
 };
 
-static void test_user_defined_reductions()
-{
-  Tensor<float, 2> tensor(5,7);
+template <int DataLayout>
+static void test_user_defined_reductions() {
+  Tensor<float, 2, DataLayout> tensor(5, 7);
   tensor.setRandom();
   array<ptrdiff_t, 1> reduction_axis;
   reduction_axis[0] = 1;
 
   UserReducer reducer(10.0f);
-  Tensor<float, 1> result = tensor.reduce(reduction_axis, reducer);
+  Tensor<float, 1, DataLayout> result = tensor.reduce(reduction_axis, reducer);
   VERIFY_IS_EQUAL(result.dimension(0), 5);
   for (int i = 0; i < 5; ++i) {
     float expected = 10.0f;
@@ -138,22 +244,24 @@ static void test_user_defined_reductions()
   }
 }
 
-
-static void test_tensor_maps()
-{
-  int inputs[2*3*5*7];
-  TensorMap<Tensor<int, 4> > tensor_map(inputs, 2,3,5,7);
-  TensorMap<Tensor<const int, 4> > tensor_map_const(inputs, 2,3,5,7);
-  const TensorMap<Tensor<const int, 4> > tensor_map_const_const(inputs, 2,3,5,7);
+template <int DataLayout>
+static void test_tensor_maps() {
+  int inputs[2 * 3 * 5 * 7];
+  TensorMap<Tensor<int, 4, DataLayout> > tensor_map(inputs, 2, 3, 5, 7);
+  TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const(inputs, 2, 3, 5,
+                                                                7);
+  const TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const_const(
+      inputs, 2, 3, 5, 7);
 
   tensor_map.setRandom();
   array<ptrdiff_t, 2> reduction_axis;
   reduction_axis[0] = 1;
   reduction_axis[1] = 3;
 
-  Tensor<int, 2> result = tensor_map.sum(reduction_axis);
-  Tensor<int, 2> result2 = tensor_map_const.sum(reduction_axis);
-  Tensor<int, 2> result3 = tensor_map_const_const.sum(reduction_axis);
+  Tensor<int, 2, DataLayout> result = tensor_map.sum(reduction_axis);
+  Tensor<int, 2, DataLayout> result2 = tensor_map_const.sum(reduction_axis);
+  Tensor<int, 2, DataLayout> result3 =
+      tensor_map_const_const.sum(reduction_axis);
 
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 5; ++j) {
@@ -170,11 +278,110 @@ static void test_tensor_maps()
   }
 }
 
+template <int DataLayout>
+static void test_static_dims() {  // Max-reduces dims 1 and 3 of a 72x53x97x113 tensor and checks the 72x97 result.
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 97);
+  in.setRandom();
+  // The reduction axes are a runtime array for C++98/03 builds, and template arguments (type2index) otherwise.
+#if __cplusplus <= 199711L
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+#else
+  Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<3> > reduction_axis;
+#endif
+  // Reduce away dimension 1 (size 53) and dimension 3 (size 113).
+  out = in.maximum(reduction_axis);
+  // Recompute every maximum with plain loops and compare.
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      float expected = -1e10f;  // running max; sentinel assumed to lie below every random input
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, k, j, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_innermost_last_dims() {  // Max-reduces dims 0 and 1; the last two dims (97x113) are kept.
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(97, 113);
+  in.setRandom();
+  // NOTE(review): despite "last" in the name, the reduced axes are 0 and 1; the name presumably refers to the kept dims.
+// Reduce over dimensions 0 and 1 (sizes 72 and 53), the innermost ones in ColMajor order.
+#if __cplusplus <= 199711L
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+#else
+  // With the axes known at compile time, this triggers the use of packets for ColMajor.
+  Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1> > reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+  // Recompute every maximum with plain loops and compare.
+  for (int i = 0; i < 97; ++i) {
+    for (int j = 0; j < 113; ++j) {
+      float expected = -1e10f;  // running max; sentinel assumed to lie below every random input
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 72; ++l) {
+          expected = (std::max)(expected, in(l, k, i, j));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_innermost_first_dims() {  // Max-reduces dims 2 and 3; the first two dims (72x53) are kept.
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 53);
+  in.setRandom();
+  // NOTE(review): despite "first" in the name, the reduced axes are 2 and 3; the name presumably refers to the kept dims.
+// Reduce over dimensions 2 and 3 (sizes 97 and 113), the innermost ones in RowMajor order.
+#if __cplusplus <= 199711L
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 2;
+  reduction_axis[1] = 3;
+#else
+  // With the axes known at compile time, this triggers the use of packets for RowMajor.
+  Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>> reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+  // Recompute every maximum with plain loops and compare.
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      float expected = -1e10f;  // running max; sentinel assumed to lie below every random input
+      for (int k = 0; k < 97; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, j, k, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
 
-void test_cxx11_tensor_reduction()
-{
-   CALL_SUBTEST(test_simple_reductions());
-   CALL_SUBTEST(test_full_reductions());
-   CALL_SUBTEST(test_user_defined_reductions());
-   CALL_SUBTEST(test_tensor_maps());
+void test_cxx11_tensor_reduction() {  // Runs every reduction test once per storage order (ColMajor and RowMajor).
+  CALL_SUBTEST(test_simple_reductions<ColMajor>());
+  CALL_SUBTEST(test_simple_reductions<RowMajor>());
+  CALL_SUBTEST(test_full_reductions<ColMajor>());
+  CALL_SUBTEST(test_full_reductions<RowMajor>());
+  CALL_SUBTEST(test_user_defined_reductions<ColMajor>());
+  CALL_SUBTEST(test_user_defined_reductions<RowMajor>());
+  CALL_SUBTEST(test_tensor_maps<ColMajor>());
+  CALL_SUBTEST(test_tensor_maps<RowMajor>());
+  CALL_SUBTEST(test_static_dims<ColMajor>());  // from here on: large-tensor maximum() tests, axes fixed at compile time under C++11
+  CALL_SUBTEST(test_static_dims<RowMajor>());
+  CALL_SUBTEST(test_innermost_last_dims<RowMajor>());
+  CALL_SUBTEST(test_innermost_last_dims<ColMajor>());
+  CALL_SUBTEST(test_innermost_first_dims<RowMajor>());
+  CALL_SUBTEST(test_innermost_first_dims<ColMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp
index 39c623499..ec623e1f9 100644
--- a/unsupported/test/cxx11_tensor_shuffling.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling.cpp
@@ -14,9 +14,10 @@
 using Eigen::Tensor;
 using Eigen::array;
 
+template <int DataLayout>
 static void test_simple_shuffling()
 {
-  Tensor<float, 4> tensor(2,3,5,7);
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   tensor.setRandom();
   array<ptrdiff_t, 4> shuffles;
   shuffles[0] = 0;
@@ -24,7 +25,7 @@ static void test_simple_shuffling()
   shuffles[2] = 2;
   shuffles[3] = 3;
 
-  Tensor<float, 4> no_shuffle;
+  Tensor<float, 4, DataLayout> no_shuffle;
   no_shuffle = tensor.shuffle(shuffles);
 
   VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2);
@@ -46,7 +47,7 @@ static void test_simple_shuffling()
   shuffles[1] = 3;
   shuffles[2] = 1;
   shuffles[3] = 0;
-  Tensor<float, 4> shuffle;
+  Tensor<float, 4, DataLayout> shuffle;
   shuffle = tensor.shuffle(shuffles);
 
   VERIFY_IS_EQUAL(shuffle.dimension(0), 5);
@@ -66,9 +67,10 @@ static void test_simple_shuffling()
 }
 
 
+template <int DataLayout>
 static void test_expr_shuffling()
 {
-  Tensor<float, 4> tensor(2,3,5,7);
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   tensor.setRandom();
 
   array<ptrdiff_t, 4> shuffles;
@@ -76,10 +78,10 @@ static void test_expr_shuffling()
   shuffles[1] = 3;
   shuffles[2] = 1;
   shuffles[3] = 0;
-  Tensor<float, 4> expected;
+  Tensor<float, 4, DataLayout> expected;
   expected = tensor.shuffle(shuffles);
 
-  Tensor<float, 4> result(5,7,3,2);
+  Tensor<float, 4, DataLayout> result(5,7,3,2);
 
   array<int, 4> src_slice_dim{{2,3,1,7}};
   array<int, 4> src_slice_start{{0,0,0,0}};
@@ -128,16 +130,17 @@ static void test_expr_shuffling()
 }
 
 
+template <int DataLayout>
 static void test_shuffling_as_value()
 {
-  Tensor<float, 4> tensor(2,3,5,7);
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   tensor.setRandom();
   array<ptrdiff_t, 4> shuffles;
   shuffles[2] = 0;
   shuffles[3] = 1;
   shuffles[1] = 2;
   shuffles[0] = 3;
-  Tensor<float, 4> shuffle(5,7,3,2);
+  Tensor<float, 4, DataLayout> shuffle(5,7,3,2);
   shuffle.shuffle(shuffles) = tensor;
 
   VERIFY_IS_EQUAL(shuffle.dimension(0), 5);
@@ -158,7 +161,10 @@ static void test_shuffling_as_value()
 
 void test_cxx11_tensor_shuffling()
 {
-   CALL_SUBTEST(test_simple_shuffling());
-   CALL_SUBTEST(test_expr_shuffling());
-   CALL_SUBTEST(test_shuffling_as_value());
+   CALL_SUBTEST(test_simple_shuffling<ColMajor>());
+   CALL_SUBTEST(test_simple_shuffling<RowMajor>());
+   CALL_SUBTEST(test_expr_shuffling<ColMajor>());
+   CALL_SUBTEST(test_expr_shuffling<RowMajor>());
+   CALL_SUBTEST(test_shuffling_as_value<ColMajor>());
+   CALL_SUBTEST(test_shuffling_as_value<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp
index a70591c82..23855fca0 100644
--- a/unsupported/test/cxx11_tensor_simple.cpp
+++ b/unsupported/test/cxx11_tensor_simple.cpp
@@ -32,6 +32,7 @@ static void test_1d()
   vec1(5) = 42; vec2(5) = 5; vec3(5) = 0;
   vec4.setZero();
 
+  VERIFY_IS_EQUAL((vec1.rank()), 1);
   VERIFY_IS_EQUAL((vec1.size()), 6);
   VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6);
 
@@ -99,10 +100,12 @@ static void test_2d()
   mat2(1,1) = 4;
   mat2(1,2) = 5;
 
+  VERIFY_IS_EQUAL((mat1.rank()), 2);
   VERIFY_IS_EQUAL((mat1.size()), 6);
   VERIFY_IS_EQUAL((mat1.dimensions()[0]), 2);
   VERIFY_IS_EQUAL((mat1.dimensions()[1]), 3);
 
+  VERIFY_IS_EQUAL((mat2.rank()), 2);
   VERIFY_IS_EQUAL((mat2.size()), 6);
   VERIFY_IS_EQUAL((mat2.dimensions()[0]), 2);
   VERIFY_IS_EQUAL((mat2.dimensions()[1]), 3);
diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp
index 502569d1d..1feb39dca 100644
--- a/unsupported/test/cxx11_tensor_striding.cpp
+++ b/unsupported/test/cxx11_tensor_striding.cpp
@@ -13,9 +13,10 @@
 
 using Eigen::Tensor;
 
+template<int DataLayout>
 static void test_simple_striding()
 {
-  Tensor<float, 4> tensor(2,3,5,7);
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   tensor.setRandom();
   array<ptrdiff_t, 4> strides;
   strides[0] = 1;
@@ -23,7 +24,7 @@ static void test_simple_striding()
   strides[2] = 1;
   strides[3] = 1;
 
-  Tensor<float, 4> no_stride;
+  Tensor<float, 4, DataLayout> no_stride;
   no_stride = tensor.stride(strides);
 
   VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
@@ -45,7 +46,7 @@ static void test_simple_striding()
   strides[1] = 4;
   strides[2] = 2;
   strides[3] = 3;
-  Tensor<float, 4> stride;
+  Tensor<float, 4, DataLayout> stride;
   stride = tensor.stride(strides);
 
   VERIFY_IS_EQUAL(stride.dimension(0), 1);
@@ -65,7 +66,36 @@ static void test_simple_striding()
 }
 
 
+template<int DataLayout>
+static void test_striding_as_lvalue()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> strides;
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+  Tensor<float, 4, DataLayout> result(3, 12, 10, 21);
+  result.stride(strides) = tensor;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), result(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+}
+
+
 void test_cxx11_tensor_striding()
 {
-   CALL_SUBTEST(test_simple_striding());
+  CALL_SUBTEST(test_simple_striding<ColMajor>());
+  CALL_SUBTEST(test_simple_striding<RowMajor>());
+  CALL_SUBTEST(test_striding_as_lvalue<ColMajor>());
+  CALL_SUBTEST(test_striding_as_lvalue<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index f0de61f8b..e25912279 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -9,11 +9,11 @@
 
 #define EIGEN_USE_THREADS
 
-#include <iostream>
+
 #include "main.h"
+#include <iostream>
 #include <Eigen/CXX11/Tensor>
 
-
 using Eigen::Tensor;
 
 static void test_multithread_elementwise()
@@ -60,12 +60,12 @@ static void test_multithread_compound_assignment()
   }
 }
 
-
+template<int DataLayout>
 static void test_multithread_contraction()
 {
-  Tensor<float, 4> t_left(30, 50, 37, 31);
-  Tensor<float, 5> t_right(37, 31, 70, 2, 10);
-  Tensor<float, 5> t_result(30, 50, 70, 2, 10);
+  Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31);
+  Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10);
 
   t_left.setRandom();
   t_right.setRandom();
@@ -74,11 +74,10 @@ static void test_multithread_contraction()
   typedef Tensor<float, 1>::DimensionPair DimPair;
   Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
 
-
-  typedef Map<MatrixXf> MapXf;
+  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
   MapXf m_left(t_left.data(), 1500, 1147);
   MapXf m_right(t_right.data(), 1147, 1400);
-  MatrixXf m_result(1500, 1400);
+  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
 
   Eigen::ThreadPoolDevice thread_pool_device(4);
 
@@ -95,12 +94,12 @@ static void test_multithread_contraction()
   }
 }
 
-
+template<int DataLayout>
 static void test_contraction_corner_cases()
 {
-  Tensor<float, 2> t_left(32, 500);
-  Tensor<float, 2> t_right(32, 28*28);
-  Tensor<float, 2> t_result(500, 28*28);
+  Tensor<float, 2, DataLayout> t_left(32, 500);
+  Tensor<float, 2, DataLayout> t_right(32, 28*28);
+  Tensor<float, 2, DataLayout> t_result(500, 28*28);
 
   t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
   t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
@@ -110,10 +109,10 @@ static void test_contraction_corner_cases()
   typedef Tensor<float, 1>::DimensionPair DimPair;
   Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}};
 
-  typedef Map<MatrixXf> MapXf;
+  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
   MapXf m_left(t_left.data(), 32, 500);
   MapXf m_right(t_right.data(), 32, 28*28);
-  MatrixXf m_result(500, 28*28);
+  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28);
 
   Eigen::ThreadPoolDevice thread_pool_device(12);
 
@@ -181,18 +180,18 @@ static void test_contraction_corner_cases()
   }
 }
 
-
+template<int DataLayout>
 static void test_multithread_contraction_agrees_with_singlethread() {
   int contract_size = internal::random<int>(1, 5000);
 
-  Tensor<float, 3> left(internal::random<int>(1, 80),
-                        contract_size,
-                        internal::random<int>(1, 100));
+  Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80),
+                                    contract_size,
+                                    internal::random<int>(1, 100));
 
-  Tensor<float, 4> right(internal::random<int>(1, 25),
-                         internal::random<int>(1, 37),
-                         contract_size,
-                         internal::random<int>(1, 51));
+  Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25),
+                                     internal::random<int>(1, 37),
+                                     contract_size,
+                                     internal::random<int>(1, 51));
 
   left.setRandom();
   right.setRandom();
@@ -206,13 +205,13 @@ static void test_multithread_contraction_agrees_with_singlethread() {
 
   Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(2, 11));
 
-  Tensor<float, 5> st_result;
+  Tensor<float, 5, DataLayout> st_result;
   st_result = left.contract(right, dims);
 
-  Tensor<float, 5> tp_result(st_result.dimensions());
+  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
   tp_result.device(thread_pool_device) = left.contract(right, dims);
 
-  VERIFY(internal::dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
   for (ptrdiff_t i = 0; i < st_result.size(); i++) {
     // if both of the values are very small, then do nothing (because the test will fail
     // due to numerical precision issues when values are small)
@@ -241,17 +240,30 @@ static void test_memcpy() {
 }
 
 
+static void test_multithread_random()
+{
+  Eigen::ThreadPoolDevice device(2);
+  Tensor<float, 1> t(1 << 20);
+  t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();
+}
+
+
 void test_cxx11_tensor_thread_pool()
 {
   CALL_SUBTEST(test_multithread_elementwise());
   CALL_SUBTEST(test_multithread_compound_assignment());
 
-  CALL_SUBTEST(test_multithread_contraction());
+  CALL_SUBTEST(test_multithread_contraction<ColMajor>());
+  CALL_SUBTEST(test_multithread_contraction<RowMajor>());
 
-  CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread());
+  CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
+  CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
 
   // Exercise various cases that have been problematic in the past.
-  CALL_SUBTEST(test_contraction_corner_cases());
+  CALL_SUBTEST(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST(test_contraction_corner_cases<RowMajor>());
 
   CALL_SUBTEST(test_memcpy());
+
+  CALL_SUBTEST(test_multithread_random());
 }
-- 
cgit v1.2.3


From c03c73c9b7032f984bcd6c52d9ca3a430ce19c69 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 6 Feb 2015 14:26:12 +0100
Subject: Fix clang compilation

---
 unsupported/test/cxx11_tensor_thread_pool.cpp | 1 +
 1 file changed, 1 insertion(+)

(limited to 'unsupported/test/cxx11_tensor_thread_pool.cpp')

diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index e25912279..f49523683 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -15,6 +15,7 @@
 #include <Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
+using std::isnan;
 
 static void test_multithread_elementwise()
 {
-- 
cgit v1.2.3


From 4716c2c6666eb7018dac2e2ed050ead45c8933e1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 10 Feb 2015 12:06:19 -0800
Subject: Fixed compilation error

---
 unsupported/test/cxx11_tensor_thread_pool.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'unsupported/test/cxx11_tensor_thread_pool.cpp')

diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index f49523683..6fe65c7f9 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -15,7 +15,7 @@
 #include <Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
-using std::isnan;
+
 
 static void test_multithread_elementwise()
 {
@@ -122,7 +122,7 @@ static void test_contraction_corner_cases()
   m_result = m_left.transpose() * m_right;
 
   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
-    assert(!isnan(t_result.data()[i]));
+    assert(!std::isnan(t_result.data()[i]));
     if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
       std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
       assert(false);
@@ -137,7 +137,7 @@ static void test_contraction_corner_cases()
   new(&m_left) MapXf(t_left.data(), 32, 1);
   m_result = m_left.transpose() * m_right;
   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
-    assert(!isnan(t_result.data()[i]));
+    assert(!std::isnan(t_result.data()[i]));
     if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
       assert(false);
@@ -155,7 +155,7 @@ static void test_contraction_corner_cases()
   new(&m_right) MapXf(t_right.data(), 32, 4);
   m_result = m_left.transpose() * m_right;
   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
-    assert(!isnan(t_result.data()[i]));
+    assert(!std::isnan(t_result.data()[i]));
     if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
       assert(false);
@@ -173,7 +173,7 @@ static void test_contraction_corner_cases()
   new(&m_right) MapXf(t_right.data(), 32, 4);
   m_result = m_left.transpose() * m_right;
   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
-    assert(!isnan(t_result.data()[i]));
+    assert(!std::isnan(t_result.data()[i]));
     if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
       assert(false);
-- 
cgit v1.2.3