diff options
Diffstat (limited to 'unsupported/test')
34 files changed, 1760 insertions, 284 deletions
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 937acc432..f75bf9798 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -1,3 +1,17 @@ +# generate split test header file only if it does not yet exist +# in order to prevent a rebuild everytime cmake is configured +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "") + foreach(i RANGE 1 999) + file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h + "#ifdef EIGEN_TEST_PART_${i}\n" + "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n" + "#else\n" + "#define CALL_SUBTEST_${i}(FUNC)\n" + "#endif\n\n" + ) + endforeach() +endif() set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported") add_custom_target(BuildUnsupported) @@ -99,61 +113,104 @@ ei_add_test(kronecker_product) if(EIGEN_TEST_CXX11) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. 
- ei_add_test(cxx11_meta "-std=c++0x") - ei_add_test(cxx11_tensor_simple "-std=c++0x") -# ei_add_test(cxx11_tensor_symmetry "-std=c++0x") - ei_add_test(cxx11_tensor_assign "-std=c++0x") - ei_add_test(cxx11_tensor_dimension "-std=c++0x") - ei_add_test(cxx11_tensor_index_list "-std=c++0x") - ei_add_test(cxx11_tensor_mixed_indices "-std=c++0x") - ei_add_test(cxx11_tensor_comparisons "-std=c++0x") - ei_add_test(cxx11_tensor_contraction "-std=c++0x") - ei_add_test(cxx11_tensor_convolution "-std=c++0x") - ei_add_test(cxx11_tensor_expr "-std=c++0x") - ei_add_test(cxx11_tensor_math "-std=c++0x") - ei_add_test(cxx11_tensor_forced_eval "-std=c++0x") - ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") - ei_add_test(cxx11_tensor_const "-std=c++0x") - ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") - ei_add_test(cxx11_tensor_of_complex "-std=c++0x") - ei_add_test(cxx11_tensor_of_strings "-std=c++0x") - ei_add_test(cxx11_tensor_uint128 "-std=c++0x") - ei_add_test(cxx11_tensor_intdiv "-std=c++0x") - ei_add_test(cxx11_tensor_lvalue "-std=c++0x") - ei_add_test(cxx11_tensor_map "-std=c++0x") - ei_add_test(cxx11_tensor_broadcasting "-std=c++0x") - ei_add_test(cxx11_tensor_chipping "-std=c++0x") - ei_add_test(cxx11_tensor_concatenation "-std=c++0x") - ei_add_test(cxx11_tensor_inflation "-std=c++0x") - ei_add_test(cxx11_tensor_morphing "-std=c++0x") - ei_add_test(cxx11_tensor_padding "-std=c++0x") - ei_add_test(cxx11_tensor_patch "-std=c++0x") - ei_add_test(cxx11_tensor_image_patch "-std=c++0x") - ei_add_test(cxx11_tensor_volume_patch "-std=c++0x") - ei_add_test(cxx11_tensor_reduction "-std=c++0x") - ei_add_test(cxx11_tensor_argmax "-std=c++0x") - ei_add_test(cxx11_tensor_shuffling "-std=c++0x") - ei_add_test(cxx11_tensor_striding "-std=c++0x") + set(CMAKE_CXX_STANDARD 11) + + ei_add_test(cxx11_float16) + ei_add_test(cxx11_meta) + ei_add_test(cxx11_tensor_simple) +# ei_add_test(cxx11_tensor_symmetry) + ei_add_test(cxx11_tensor_assign) + ei_add_test(cxx11_tensor_dimension) + 
ei_add_test(cxx11_tensor_index_list) + ei_add_test(cxx11_tensor_mixed_indices) + ei_add_test(cxx11_tensor_comparisons) + ei_add_test(cxx11_tensor_contraction) + ei_add_test(cxx11_tensor_convolution) + ei_add_test(cxx11_tensor_expr) + ei_add_test(cxx11_tensor_math) + ei_add_test(cxx11_tensor_forced_eval) + ei_add_test(cxx11_tensor_fixed_size) + ei_add_test(cxx11_tensor_const) + ei_add_test(cxx11_tensor_of_const_values) + ei_add_test(cxx11_tensor_of_complex) + ei_add_test(cxx11_tensor_of_strings) + ei_add_test(cxx11_tensor_intdiv) + ei_add_test(cxx11_tensor_lvalue) + ei_add_test(cxx11_tensor_map) + ei_add_test(cxx11_tensor_broadcasting) + ei_add_test(cxx11_tensor_chipping) + ei_add_test(cxx11_tensor_concatenation) + ei_add_test(cxx11_tensor_inflation) + ei_add_test(cxx11_tensor_morphing) + ei_add_test(cxx11_tensor_padding) + ei_add_test(cxx11_tensor_patch) + ei_add_test(cxx11_tensor_image_patch) + ei_add_test(cxx11_tensor_volume_patch) + ei_add_test(cxx11_tensor_reduction) + ei_add_test(cxx11_tensor_argmax) + ei_add_test(cxx11_tensor_shuffling) + ei_add_test(cxx11_tensor_striding) + ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}") - ei_add_test(cxx11_tensor_ref "-std=c++0x") - ei_add_test(cxx11_tensor_random "-std=c++0x") - ei_add_test(cxx11_tensor_casts "-std=c++0x") - ei_add_test(cxx11_tensor_reverse "-std=c++0x") - ei_add_test(cxx11_tensor_layout_swap "-std=c++0x") - ei_add_test(cxx11_tensor_io "-std=c++0x") - ei_add_test(cxx11_tensor_generator "-std=c++0x") - ei_add_test(cxx11_tensor_custom_op "-std=c++0x") - ei_add_test(cxx11_tensor_custom_index "-std=c++0x") - ei_add_test(cxx11_tensor_sugar "-std=c++0x") - ei_add_test(cxx11_tensor_fft "-std=c++0x") - ei_add_test(cxx11_tensor_ifft "-std=c++0x") - - # These tests needs nvcc -# ei_add_test(cxx11_tensor_device "-std=c++0x") -# ei_add_test(cxx11_tensor_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_contract_cuda 
"-std=c++0x") -# ei_add_test(cxx11_tensor_reduction_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_random_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_argmax_cuda "-std=c++0x") + ei_add_test(cxx11_tensor_ref) + ei_add_test(cxx11_tensor_random) + ei_add_test(cxx11_tensor_casts) + ei_add_test(cxx11_tensor_roundings) + ei_add_test(cxx11_tensor_reverse) + ei_add_test(cxx11_tensor_layout_swap) + ei_add_test(cxx11_tensor_io) + ei_add_test(cxx11_tensor_generator) + ei_add_test(cxx11_tensor_custom_op) + ei_add_test(cxx11_tensor_custom_index) + ei_add_test(cxx11_tensor_sugar) + ei_add_test(cxx11_tensor_fft) + ei_add_test(cxx11_tensor_ifft) + ei_add_test(cxx11_tensor_empty) + + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + # This test requires __uint128_t which is only available on 64bit systems + ei_add_test(cxx11_tensor_uint128) + endif() + +endif() + +# These tests needs nvcc +find_package(CUDA 7.0) +if(CUDA_FOUND AND EIGEN_TEST_CUDA) + # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor + # and -fno-check-new flags since they trigger thousands of compilation warnings + # in the CUDA runtime + string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + + message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS}) + + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) + endif() + if(EIGEN_TEST_CUDA_CLANG) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}") + endif() + + set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\"") + cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") + 
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + + ei_add_test(cxx11_tensor_device) + ei_add_test(cxx11_tensor_cuda) + ei_add_test(cxx11_tensor_contract_cuda) + ei_add_test(cxx11_tensor_reduction_cuda) + ei_add_test(cxx11_tensor_argmax_cuda) + ei_add_test(cxx11_tensor_cast_float16_cuda) + + # The random number generation code requires arch 3.5 or greater. + if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34) + ei_add_test(cxx11_tensor_random_cuda) + endif() + + ei_add_test(cxx11_tensor_of_float16_cuda) + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp index 724ea7b5b..6a5ed057f 100644 --- a/unsupported/test/NonLinearOptimization.cpp +++ b/unsupported/test/NonLinearOptimization.cpp @@ -14,6 +14,9 @@ using std::sqrt; +// tolerance for checking number of iterations +#define LM_EVAL_COUNT_TOL 4/3 + int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag) { /* subroutine fcn for chkder example. 
*/ @@ -1023,7 +1026,8 @@ void testNistLanczos1(void) VERIFY_IS_EQUAL(lm.njev, 72); // check norm^2 std::cout.precision(30); - VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4290986055242372e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats + std::cout << lm.fvec.squaredNorm() << "\n"; + VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25); // check x VERIFY_IS_APPROX(x[0], 9.5100000027E-02); VERIFY_IS_APPROX(x[1], 1.0000000001E+00); @@ -1044,7 +1048,7 @@ void testNistLanczos1(void) VERIFY_IS_EQUAL(lm.nfev, 9); VERIFY_IS_EQUAL(lm.njev, 8); // check norm^2 - VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.430571737783119393e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats + VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25); // check x VERIFY_IS_APPROX(x[0], 9.5100000027E-02); VERIFY_IS_APPROX(x[1], 1.0000000001E+00); @@ -1354,8 +1358,12 @@ void testNistMGH17(void) // check return value VERIFY_IS_EQUAL(info, 2); - VERIFY(lm.nfev < 650); // 602 - VERIFY(lm.njev < 600); // 545 + ++g_test_level; + VERIFY_IS_EQUAL(lm.nfev, 602); // 602 + VERIFY_IS_EQUAL(lm.njev, 545); // 545 + --g_test_level; + VERIFY(lm.nfev < 602 * LM_EVAL_COUNT_TOL); + VERIFY(lm.njev < 545 * LM_EVAL_COUNT_TOL); /* * Second try diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp index 1aa1b3d2d..374f86df9 100644 --- a/unsupported/test/autodiff.cpp +++ b/unsupported/test/autodiff.cpp @@ -16,7 +16,7 @@ EIGEN_DONT_INLINE Scalar foo(const Scalar& x, const Scalar& y) using namespace std; // return x+std::sin(y); EIGEN_ASM_COMMENT("mybegin"); - return static_cast<Scalar>(x*2 - pow(x,2) + 2*sqrt(y*y) - 4 * sin(x) + 2 * cos(y) - exp(-0.5*x*x)); + return static_cast<Scalar>(x*2 - 1 + pow(1+x,2) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(-0.5*x*x+0)); //return x+2*y*x;//x*2 -std::pow(x,2);//(2*y/x);// - y*2; EIGEN_ASM_COMMENT("myend"); } diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp index 
ba4b5aec4..c631c734a 100644 --- a/unsupported/test/autodiff_scalar.cpp +++ b/unsupported/test/autodiff_scalar.cpp @@ -30,6 +30,10 @@ template<typename Scalar> void check_atan2() VERIFY_IS_APPROX(res.value(), x.value()); VERIFY_IS_APPROX(res.derivatives(), x.derivatives()); + + res = atan2(r*s+0, r*c+0); + VERIFY_IS_APPROX(res.value(), x.value()); + VERIFY_IS_APPROX(res.derivatives(), x.derivatives()); } diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp new file mode 100644 index 000000000..2dc0872d8 --- /dev/null +++ b/unsupported/test/cxx11_float16.cpp @@ -0,0 +1,155 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_float16 + +#include "main.h" +#include <Eigen/src/Core/arch/CUDA/Half.h> + +using Eigen::half; + +void test_conversion() +{ + // Conversion from float. + VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); + VERIFY_IS_EQUAL(half(0.5f).x, 0x3800); + VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555); + VERIFY_IS_EQUAL(half(0.0f).x, 0x0000); + VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000); + VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff); + VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00); // Becomes infinity. + + // Denormals. + VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001); + VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001); + VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); + + // Verify round-to-nearest-even behavior. + float val1 = float(half(__half{0x3c00})); + float val2 = float(half(__half{0x3c01})); + float val3 = float(half(__half{0x3c02})); + VERIFY_IS_EQUAL(half(0.5 * (val1 + val2)).x, 0x3c00); + VERIFY_IS_EQUAL(half(0.5 * (val2 + val3)).x, 0x3c02); + + // Conversion from int. 
+ VERIFY_IS_EQUAL(half(-1).x, 0xbc00); + VERIFY_IS_EQUAL(half(0).x, 0x0000); + VERIFY_IS_EQUAL(half(1).x, 0x3c00); + VERIFY_IS_EQUAL(half(2).x, 0x4000); + VERIFY_IS_EQUAL(half(3).x, 0x4200); + + // Conversion from bool. + VERIFY_IS_EQUAL(half(false).x, 0x0000); + VERIFY_IS_EQUAL(half(true).x, 0x3c00); + + // Conversion to float. + VERIFY_IS_EQUAL(float(half(__half{0x0000})), 0.0f); + VERIFY_IS_EQUAL(float(half(__half{0x3c00})), 1.0f); + + // Denormals. + VERIFY_IS_APPROX(float(half(__half{0x8001})), -5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half{0x0001})), 5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half{0x0002})), 1.19209e-07f); + + // NaNs and infinities. + VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. + VERIFY(!(numext::isnan)(float(half(0.0f)))); + VERIFY((numext::isinf)(float(half(__half{0xfc00})))); + VERIFY((numext::isnan)(float(half(__half{0xfc01})))); + VERIFY((numext::isinf)(float(half(__half{0x7c00})))); + VERIFY((numext::isnan)(float(half(__half{0x7c01})))); + +#if !EIGEN_COMP_MSVC + // Visual Studio errors out on divisions by 0 + VERIFY((numext::isnan)(float(half(0.0 / 0.0)))); + VERIFY((numext::isinf)(float(half(1.0 / 0.0)))); + VERIFY((numext::isinf)(float(half(-1.0 / 0.0)))); +#endif + + // Exactly same checks as above, just directly on the half representation. 
+ VERIFY(!(numext::isinf)(half(__half{0x7bff}))); + VERIFY(!(numext::isnan)(half(__half{0x0000}))); + VERIFY((numext::isinf)(half(__half{0xfc00}))); + VERIFY((numext::isnan)(half(__half{0xfc01}))); + VERIFY((numext::isinf)(half(__half{0x7c00}))); + VERIFY((numext::isnan)(half(__half{0x7c01}))); + +#if !EIGEN_COMP_MSVC + // Visual Studio errors out on divisions by 0 + VERIFY((numext::isnan)(half(0.0 / 0.0))); + VERIFY((numext::isinf)(half(1.0 / 0.0))); + VERIFY((numext::isinf)(half(-1.0 / 0.0))); +#endif +} + +void test_arithmetic() +{ + VERIFY_IS_EQUAL(float(half(2) + half(2)), 4); + VERIFY_IS_EQUAL(float(half(2) + half(-2)), 0); + VERIFY_IS_APPROX(float(half(0.33333f) + half(0.66667f)), 1.0f); + VERIFY_IS_EQUAL(float(half(2.0f) * half(-5.5f)), -11.0f); + VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f); + VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f); + VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f); +} + +void test_comparison() +{ + VERIFY(half(1.0f) > half(0.5f)); + VERIFY(half(0.5f) < half(1.0f)); + VERIFY(!(half(1.0f) < half(0.5f))); + VERIFY(!(half(0.5f) > half(1.0f))); + + VERIFY(!(half(4.0f) > half(4.0f))); + VERIFY(!(half(4.0f) < half(4.0f))); + + VERIFY(!(half(0.0f) < half(-0.0f))); + VERIFY(!(half(-0.0f) < half(0.0f))); + VERIFY(!(half(0.0f) > half(-0.0f))); + VERIFY(!(half(-0.0f) > half(0.0f))); + + VERIFY(half(0.2f) > half(-1.0f)); + VERIFY(half(-1.0f) < half(0.2f)); + VERIFY(half(-16.0f) < half(-15.0f)); + + VERIFY(half(1.0f) == half(1.0f)); + VERIFY(half(1.0f) != half(2.0f)); + + // Comparisons with NaNs and infinities. 
+ VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0))); + VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0)); + + VERIFY(!(half(1.0) == half(0.0 / 0.0))); + VERIFY(!(half(1.0) < half(0.0 / 0.0))); + VERIFY(!(half(1.0) > half(0.0 / 0.0))); + VERIFY(half(1.0) != half(0.0 / 0.0)); + + VERIFY(half(1.0) < half(1.0 / 0.0)); + VERIFY(half(1.0) > half(-1.0 / 0.0)); +} + +void test_functions() +{ + VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); + VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); + + VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); + VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI)); + + VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f); + VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f); +} + +void test_cxx11_float16() +{ + CALL_SUBTEST(test_conversion()); + CALL_SUBTEST(test_arithmetic()); + CALL_SUBTEST(test_comparison()); + CALL_SUBTEST(test_functions()); +} diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp index 4f45e1dd3..ecac3add1 100644 --- a/unsupported/test/cxx11_meta.cpp +++ b/unsupported/test/cxx11_meta.cpp @@ -9,6 +9,7 @@ #include "main.h" +#include <array> #include <Eigen/CXX11/Core> using Eigen::internal::is_same; @@ -249,8 +250,8 @@ static void test_is_same_gf() { VERIFY((!is_same_gf<dummy_a, dummy_b>::value)); VERIFY((!!is_same_gf<dummy_a, dummy_a>::value)); - VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_b>::global_flags), 0); - VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_a>::global_flags), 0); + VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_b>::global_flags), false); + VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_a>::global_flags), false); } static void test_apply_op() @@ -293,8 +294,8 @@ static void test_arg_reductions() static void test_array_reverse_and_reduce() { - std::array<int, 6> a{{4, 8, 15, 16, 23, 42}}; - std::array<int, 6> b{{42, 23, 16, 15, 8, 4}}; + array<int, 6> a{{4, 8, 15, 16, 23, 42}}; + array<int, 6> b{{42, 23, 16, 15, 8, 4}}; // there is no 
operator<< for std::array, so VERIFY_IS_EQUAL will // not compile @@ -308,11 +309,11 @@ static void test_array_reverse_and_reduce() static void test_array_zip_and_apply() { - std::array<int, 6> a{{4, 8, 15, 16, 23, 42}}; - std::array<int, 6> b{{0, 1, 2, 3, 4, 5}}; - std::array<int, 6> c{{4, 9, 17, 19, 27, 47}}; - std::array<int, 6> d{{0, 8, 30, 48, 92, 210}}; - std::array<int, 6> e{{0, 2, 4, 6, 8, 10}}; + array<int, 6> a{{4, 8, 15, 16, 23, 42}}; + array<int, 6> b{{0, 1, 2, 3, 4, 5}}; + array<int, 6> c{{4, 9, 17, 19, 27, 47}}; + array<int, 6> d{{0, 8, 30, 48, 92, 210}}; + array<int, 6> e{{0, 2, 4, 6, 8, 10}}; VERIFY((array_zip<sum_op>(a, b) == c)); VERIFY((array_zip<product_op>(a, b) == d)); @@ -325,8 +326,8 @@ static void test_array_zip_and_apply() static void test_array_misc() { - std::array<int, 3> a3{{1, 1, 1}}; - std::array<int, 6> a6{{2, 2, 2, 2, 2, 2}}; + array<int, 3> a3{{1, 1, 1}}; + array<int, 6> a6{{2, 2, 2, 2, 2, 2}}; VERIFY((repeat<3, int>(1) == a3)); VERIFY((repeat<6, int>(2) == a6)); diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cpp b/unsupported/test/cxx11_tensor_argmax_cuda.cu index d37490d15..41ccbe974 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cpp +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// TODO(mdevin): Free the cuda memory. 
+#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU @@ -56,6 +56,10 @@ void test_cuda_simple_argmax() VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1); VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0); + + cudaFree(d_in); + cudaFree(d_out_max); + cudaFree(d_out_min); } template <int DataLayout> @@ -141,6 +145,9 @@ void test_cuda_argmax_dim() // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } + + cudaFree(d_in); + cudaFree(d_out); } } @@ -227,15 +234,18 @@ void test_cuda_argmin_dim() // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } + + cudaFree(d_in); + cudaFree(d_out); } } void test_cxx11_tensor_cuda() { - CALL_SUBTEST(test_cuda_simple_argmax<RowMajor>()); - CALL_SUBTEST(test_cuda_simple_argmax<ColMajor>()); - CALL_SUBTEST(test_cuda_argmax_dim<RowMajor>()); - CALL_SUBTEST(test_cuda_argmax_dim<ColMajor>()); - CALL_SUBTEST(test_cuda_argmin_dim<RowMajor>()); - CALL_SUBTEST(test_cuda_argmin_dim<ColMajor>()); + CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>()); + CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>()); + CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>()); + CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>()); + CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>()); + CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>()); } diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index d16aaf847..e5cf61fe1 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -29,8 +29,8 @@ static void test_1d() int row_major[6]; memset(col_major, 0, 6*sizeof(int)); memset(row_major, 0, 6*sizeof(int)); - TensorMap<Tensor<int, 1>> vec3(col_major, 6); - TensorMap<Tensor<int, 1, RowMajor>> vec4(row_major, 6); + TensorMap<Tensor<int, 1> > vec3(col_major, 6); + 
TensorMap<Tensor<int, 1, RowMajor> > vec4(row_major, 6); vec3 = vec1; vec4 = vec2; @@ -92,8 +92,8 @@ static void test_2d() int row_major[6]; memset(col_major, 0, 6*sizeof(int)); memset(row_major, 0, 6*sizeof(int)); - TensorMap<Tensor<int, 2>> mat3(row_major, 2, 3); - TensorMap<Tensor<int, 2, RowMajor>> mat4(col_major, 2, 3); + TensorMap<Tensor<int, 2> > mat3(row_major, 2, 3); + TensorMap<Tensor<int, 2, RowMajor> > mat4(col_major, 2, 3); mat3 = mat1; mat4 = mat2; @@ -152,8 +152,8 @@ static void test_3d() int row_major[2*3*7]; memset(col_major, 0, 2*3*7*sizeof(int)); memset(row_major, 0, 2*3*7*sizeof(int)); - TensorMap<Tensor<int, 3>> mat3(col_major, 2, 3, 7); - TensorMap<Tensor<int, 3, RowMajor>> mat4(row_major, 2, 3, 7); + TensorMap<Tensor<int, 3> > mat3(col_major, 2, 3, 7); + TensorMap<Tensor<int, 3, RowMajor> > mat4(row_major, 2, 3, 7); mat3 = mat1; mat4 = mat2; diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu new file mode 100644 index 000000000..f22b99de8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -0,0 +1,80 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +void test_cuda_conversion() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + Tensor<float, 1> floats(num_elem); + floats.setRandom(); + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half( + d_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv( + d_conv, num_elem); + + gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float)); + + gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>(); + gpu_conv.device(gpu_device) = gpu_half.cast<float>(); + + Tensor<float, 1> initial(num_elem); + Tensor<float, 1> final(num_elem); + gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(initial(i), final(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_half); + gpu_device.deallocate(d_conv); +} + + +void test_fallback_conversion() { + int num_elem = 101; + Tensor<float, 1> floats(num_elem); + floats.setRandom(); + + Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>(); + Eigen::Tensor<float, 1> conv = halfs.cast<float>(); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(floats(i), conv(i)); + } +} + + +void 
test_cxx11_tensor_cast_float16_cuda() +{ + CALL_SUBTEST(test_cuda_conversion()); + CALL_SUBTEST(test_fallback_conversion()); +} diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp index 729e43327..3c6d0d2ff 100644 --- a/unsupported/test/cxx11_tensor_casts.cpp +++ b/unsupported/test/cxx11_tensor_casts.cpp @@ -24,12 +24,12 @@ static void test_simple_cast() cplextensor.setRandom(); chartensor = ftensor.cast<char>(); - cplextensor = ftensor.cast<std::complex<float>>(); + cplextensor = ftensor.cast<std::complex<float> >(); for (int i = 0; i < 20; ++i) { for (int j = 0; j < 30; ++j) { VERIFY_IS_EQUAL(chartensor(i,j), static_cast<char>(ftensor(i,j))); - VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float>>(ftensor(i,j))); + VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float> >(ftensor(i,j))); } } } diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cu index 035a093e6..6d1ef07f9 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cpp +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -22,16 +22,16 @@ using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; template<int DataLayout> -static void test_cuda_contraction(int m_size, int k_size, int n_size) +void test_cuda_contraction(int m_size, int k_size, int n_size) { - cout<<"Calling with ("<<m_size<<","<<k_size<<","<<n_size<<")"<<std::endl; + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size)); - Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size)); - Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size)); - Tensor<float, 2, DataLayout> 
t_result_gpu(Eigen::array<int, 2>(m_size, n_size)); + Tensor<float, 2, DataLayout> t_left(m_size, k_size); + Tensor<float, 2, DataLayout> t_right(k_size, n_size); + Tensor<float, 2, DataLayout> t_result(m_size, n_size); + Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size); Eigen::array<DimPair, 1> dims(DimPair(1, 0)); t_left.setRandom(); @@ -67,12 +67,16 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) t_result = t_left.contract(t_right, dims); cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); - for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { - if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) { - cout << "mismatch detected at index " << i << ": " << t_result.data()[i] - << " vs " << t_result_gpu.data()[i] << endl; - assert(false); + for (size_t i = 0; i < t_result.size(); i++) { + if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { + continue; } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); } cudaFree((void*)d_t_left); @@ -80,41 +84,69 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) cudaFree((void*)d_t_result); } - -void test_cxx11_tensor_cuda() -{ - cout<<"Calling contraction tests"<<std::endl; - CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128)); +template<int DataLayout> +void test_cuda_contraction_m() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128)); + test_cuda_contraction<ColMajor>(k, 128, 128); + test_cuda_contraction<RowMajor>(k, 128, 128); } +} + +template<int DataLayout> +void test_cuda_contraction_k() { for (int k = 32; k < 256; k++) { - 
CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k)); + test_cuda_contraction<ColMajor>(128, k, 128); + test_cuda_contraction<RowMajor>(128, k, 128); } +} + +template<int DataLayout> +void test_cuda_contraction_n() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128)); + test_cuda_contraction<ColMajor>(128, 128, k); + test_cuda_contraction<RowMajor>(128, 128, k); } +} - int m_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; - int n_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; - - int k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; - for (int i = 0; i <15; i++) - for (int j = 0; j < 15; j++) +template<int DataLayout> +void test_cuda_contraction_sizes() { + int m_sizes[] = { 31, 39, 63, 64, 65, + 127, 129, 255, 257 , 511, + 512, 513, 1023, 1024, 1025}; + + int n_sizes[] = { 31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025}; + + int k_sizes[] = { 31, 39, 63, 64, 65, + 95, 96, 127, 129, 255, + 257, 511, 512, 513, 1023, + 1024, 1025}; + + for (int i = 0; i < 15; i++) { + for (int j = 0; j < 15; j++) { for (int k = 0; k < 17; k++) { - CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k])); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k])); + test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); } + } + } +} + +void test_cxx11_tensor_cuda() +{ + CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128)); + + CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>()); + CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>()); + + CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>()); + 
CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>()); + + CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>()); + CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>()); + + CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>()); + CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index b0d52c6cf..0e16308a2 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -29,7 +29,7 @@ static void test_evals() Tensor<float, 2, DataLayout> mat4(3,3); mat4.setZero(); - Eigen::array<DimPair, 1> dims3({{DimPair(0, 0)}}); + Eigen::array<DimPair, 1> dims3 = {{DimPair(0, 0)}}; typedef TensorEvaluator<decltype(mat1.contract(mat2, dims3)), DefaultDevice> Evaluator; Evaluator eval(mat1.contract(mat2, dims3), DefaultDevice()); eval.evalTo(mat4.data()); @@ -49,7 +49,7 @@ static void test_evals() Tensor<float, 2, DataLayout> mat5(2,2); mat5.setZero(); - Eigen::array<DimPair, 1> dims4({{DimPair(1, 1)}}); + Eigen::array<DimPair, 1> dims4 = {{DimPair(1, 1)}}; typedef TensorEvaluator<decltype(mat1.contract(mat2, dims4)), DefaultDevice> Evaluator2; Evaluator2 eval2(mat1.contract(mat2, dims4), DefaultDevice()); eval2.evalTo(mat5.data()); @@ -64,7 +64,7 @@ static void test_evals() Tensor<float, 2, DataLayout> mat6(2,2); mat6.setZero(); - Eigen::array<DimPair, 1> dims6({{DimPair(1, 0)}}); + Eigen::array<DimPair, 1> dims6 = {{DimPair(1, 0)}}; typedef TensorEvaluator<decltype(mat1.contract(mat3, dims6)), DefaultDevice> Evaluator3; Evaluator3 eval3(mat1.contract(mat3, dims6), DefaultDevice()); eval3.evalTo(mat6.data()); @@ -89,7 +89,7 @@ static void test_scalar() Tensor<float, 1, DataLayout> scalar(1); scalar.setZero(); - Eigen::array<DimPair, 1> dims({{DimPair(0, 0)}}); + Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}}; typedef TensorEvaluator<decltype(vec1.contract(vec2, dims)), DefaultDevice> Evaluator; Evaluator 
eval(vec1.contract(vec2, dims), DefaultDevice()); eval.evalTo(scalar.data()); @@ -113,7 +113,7 @@ static void test_multidims() Tensor<float, 3, DataLayout> mat3(2, 2, 2); mat3.setZero(); - Eigen::array<DimPair, 2> dims({{DimPair(1, 2), DimPair(2, 3)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(1, 2), DimPair(2, 3)}}; typedef TensorEvaluator<decltype(mat1.contract(mat2, dims)), DefaultDevice> Evaluator; Evaluator eval(mat1.contract(mat2, dims), DefaultDevice()); eval.evalTo(mat3.data()); @@ -138,6 +138,26 @@ static void test_multidims() mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1)); VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) + mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1)); + + Tensor<float, 2, DataLayout> mat4(2, 2); + Tensor<float, 3, DataLayout> mat5(2, 2, 2); + + mat4.setRandom(); + mat5.setRandom(); + + Tensor<float, 1, DataLayout> mat6(2); + mat6.setZero(); + Eigen::array<DimPair, 2> dims2({{DimPair(0, 1), DimPair(1, 0)}}); + typedef TensorEvaluator<decltype(mat4.contract(mat5, dims2)), DefaultDevice> Evaluator2; + Evaluator2 eval2(mat4.contract(mat5, dims2), DefaultDevice()); + eval2.evalTo(mat6.data()); + EIGEN_STATIC_ASSERT(Evaluator2::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval2.dimensions()[0], 2); + + VERIFY_IS_APPROX(mat6(0), mat4(0,0)*mat5(0,0,0) + mat4(1,0)*mat5(0,1,0) + + mat4(0,1)*mat5(1,0,0) + mat4(1,1)*mat5(1,1,0)); + VERIFY_IS_APPROX(mat6(1), mat4(0,0)*mat5(0,0,1) + mat4(1,0)*mat5(0,1,1) + + mat4(0,1)*mat5(1,0,1) + mat4(1,1)*mat5(1,1,1)); } template<int DataLayout> @@ -147,7 +167,7 @@ static void test_holes() { t1.setRandom(); t2.setRandom(); - Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(3, 4)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(3, 4)}}; Tensor<float, 5, DataLayout> result = t1.contract(t2, dims); VERIFY_IS_EQUAL(result.dimension(0), 5); VERIFY_IS_EQUAL(result.dimension(1), 7); @@ -182,7 +202,7 @@ static void 
test_full_redux() t1.setRandom(); t2.setRandom(); - Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}}; Tensor<float, 1, DataLayout> result = t1.contract(t2, dims); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(1, 0, 0) @@ -212,7 +232,7 @@ static void test_contraction_of_contraction() t3.setRandom(); t4.setRandom(); - Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; auto contract1 = t1.contract(t2, dims); auto diff = t3 - contract1; auto contract2 = t1.contract(t4, dims); @@ -243,7 +263,7 @@ static void test_expr() Tensor<float, 2, DataLayout> mat3(2,2); - Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; mat3 = mat1.contract(mat2, dims); VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0)); @@ -263,7 +283,7 @@ static void test_out_of_order_contraction() Tensor<float, 2, DataLayout> mat3(2, 2); - Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(0, 2)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(0, 2)}}; mat3 = mat1.contract(mat2, dims); VERIFY_IS_APPROX(mat3(0, 0), @@ -279,7 +299,7 @@ static void test_out_of_order_contraction() mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) + mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1)); - Eigen::array<DimPair, 2> dims2({{DimPair(0, 2), DimPair(2, 0)}}); + Eigen::array<DimPair, 2> dims2 = {{DimPair(0, 2), DimPair(2, 0)}}; mat3 = mat1.contract(mat2, dims2); VERIFY_IS_APPROX(mat3(0, 0), @@ -311,8 +331,8 @@ static void test_consistency() Tensor<float, 4, DataLayout> mat4(2, 1, 5, 5); // contract on dimensions of size 4 and 3 - Eigen::array<DimPair, 2> dims1({{DimPair(0, 4), DimPair(1, 0)}}); - Eigen::array<DimPair, 2> dims2({{DimPair(4, 0), DimPair(0, 1)}}); + Eigen::array<DimPair, 2> dims1 = {{DimPair(0, 4), 
DimPair(1, 0)}}; + Eigen::array<DimPair, 2> dims2 = {{DimPair(4, 0), DimPair(0, 1)}}; mat3 = mat1.contract(mat2, dims1); mat4 = mat2.contract(mat1, dims2); @@ -354,7 +374,7 @@ static void test_large_contraction() Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); // this contraction should be equivalent to a single matrix multiplication - Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(3, 1)}}; // compute results by separate methods t_result = t_left.contract(t_right, dims); @@ -399,10 +419,10 @@ static void test_tensor_vector() { Tensor<float, 3, DataLayout> t_left(7, 13, 17); Tensor<float, 2, DataLayout> t_right(1, 7); - + t_left.setRandom(); t_right.setRandom(); - + typedef typename Tensor<float, 1, DataLayout>::DimensionPair DimensionPair; Eigen::array<DimensionPair, 1> dim_pair01{{{0, 1}}}; Tensor<float, 3, DataLayout> t_result = t_left.contract(t_right, dim_pair01); @@ -434,7 +454,7 @@ static void test_small_blocking_factors() Eigen::setCpuCacheSizes(896, 1920, 2944); // this contraction should be equivalent to a single matrix multiplication - Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(3, 1)}}; Tensor<float, 5, DataLayout> t_result; t_result = t_left.contract(t_right, dims); diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cu index 5ff082a3a..134359611 100644 --- a/unsupported/test/cxx11_tensor_cuda.cpp +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -7,8 +7,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// TODO(mdevin): Free the cuda memory. 
- #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_cuda @@ -63,6 +61,10 @@ void test_cuda_elementwise_small() { out(Eigen::array<int, 1>(i)), in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i))); } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); } void test_cuda_elementwise() @@ -113,6 +115,48 @@ void test_cuda_elementwise() } } } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); +} + +void test_cuda_props() { + Tensor<float, 1> in1(200); + Tensor<bool, 1> out(200); + in1.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(bool); + + float* d_in1; + bool* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( + d_in1, 200); + Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out( + d_out, 200); + + gpu_out.device(gpu_device) = (gpu_in1.isnan)(); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 200; ++i) { + VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i))); + } + + cudaFree(d_in1); + cudaFree(d_out); } void test_cuda_reduction() @@ -131,8 +175,7 @@ void test_cuda_reduction() cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); @@ -159,10 +202,13 @@ void test_cuda_reduction() VERIFY_IS_APPROX(out(i,j), expected); } } + + cudaFree(d_in1); + cudaFree(d_out); } 
template<int DataLayout> -static void test_cuda_contraction() +void test_cuda_contraction() { // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on @@ -189,8 +235,7 @@ static void test_cuda_contraction() cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); @@ -214,14 +259,18 @@ static void test_cuda_contraction() for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { - cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl; + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); } } + + cudaFree(d_t_left); + cudaFree(d_t_right); + cudaFree(d_t_result); } template<int DataLayout> -static void test_cuda_convolution_1d() +void test_cuda_convolution_1d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 1, DataLayout> kernel(4); @@ -243,8 +292,7 @@ static void test_cuda_convolution_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); @@ -269,9 +317,13 @@ static void test_cuda_convolution_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } -static void 
test_cuda_convolution_inner_dim_col_major_1d() +void test_cuda_convolution_inner_dim_col_major_1d() { Tensor<float, 4, ColMajor> input(74,9,11,7); Tensor<float, 1, ColMajor> kernel(4); @@ -293,8 +345,7 @@ static void test_cuda_convolution_inner_dim_col_major_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); @@ -319,9 +370,13 @@ static void test_cuda_convolution_inner_dim_col_major_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } -static void test_cuda_convolution_inner_dim_row_major_1d() +void test_cuda_convolution_inner_dim_row_major_1d() { Tensor<float, 4, RowMajor> input(7,9,11,74); Tensor<float, 1, RowMajor> kernel(4); @@ -343,8 +398,7 @@ static void test_cuda_convolution_inner_dim_row_major_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); @@ -369,10 +423,14 @@ static void test_cuda_convolution_inner_dim_row_major_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } template<int DataLayout> -static void test_cuda_convolution_2d() +void test_cuda_convolution_2d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 2, DataLayout> kernel(3,4); @@ -394,8 +452,7 @@ static void test_cuda_convolution_2d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); 
- cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); @@ -430,10 +487,14 @@ static void test_cuda_convolution_2d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } template<int DataLayout> -static void test_cuda_convolution_3d() +void test_cuda_convolution_3d() { Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17)); Tensor<float, 3, DataLayout> kernel(3,4,2); @@ -455,8 +516,7 @@ static void test_cuda_convolution_3d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); @@ -505,21 +565,507 @@ static void test_cuda_convolution_3d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); +} + + +template <typename Scalar> +void test_cuda_lgamma(const Scalar stddev) +{ + Tensor<Scalar, 2> in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor<Scalar, 2> out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.lgamma(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + 
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); + } + } + + cudaFree(d_in); + cudaFree(d_out); +} + +template <typename Scalar> +void test_cuda_digamma() +{ + Tensor<Scalar, 1> in(7); + Tensor<Scalar, 1> out(7); + Tensor<Scalar, 1> expected_out(7); + out.setZero(); + + in(0) = Scalar(1); + in(1) = Scalar(1.5); + in(2) = Scalar(4); + in(3) = Scalar(-10.5); + in(4) = Scalar(10000.5); + in(5) = Scalar(0); + in(6) = Scalar(-1); + + expected_out(0) = Scalar(-0.5772156649015329); + expected_out(1) = Scalar(0.03648997397857645); + expected_out(2) = Scalar(1.2561176684318); + expected_out(3) = Scalar(2.398239129535781); + expected_out(4) = Scalar(9.210340372392849); + expected_out(5) = std::numeric_limits<Scalar>::infinity(); + expected_out(6) = std::numeric_limits<Scalar>::infinity(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in.digamma(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 5; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + for (int i = 5; i < 7; ++i) { + VERIFY_IS_EQUAL(out(i), expected_out(i)); + } +} + +template <typename Scalar> +void test_cuda_zeta() +{ + Tensor<Scalar, 1> in_x(6); + Tensor<Scalar, 1> in_q(6); + Tensor<Scalar, 1> out(6); + Tensor<Scalar, 1> expected_out(6); + out.setZero(); + + in_x(0) = Scalar(1); + in_x(1) = 
Scalar(1.5); + in_x(2) = Scalar(4); + in_x(3) = Scalar(-10.5); + in_x(4) = Scalar(10000.5); + in_x(5) = Scalar(3); + + in_q(0) = Scalar(1.2345); + in_q(1) = Scalar(2); + in_q(2) = Scalar(1.5); + in_q(3) = Scalar(3); + in_q(4) = Scalar(1.0001); + in_q(5) = Scalar(-2.5); + + expected_out(0) = std::numeric_limits<Scalar>::infinity(); + expected_out(1) = Scalar(1.61237534869); + expected_out(2) = Scalar(0.234848505667); + expected_out(3) = Scalar(1.03086757337e-5); + expected_out(4) = Scalar(0.367879440865); + expected_out(5) = Scalar(0.054102025820864097); + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_q; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_q), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6); + + gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + VERIFY_IS_EQUAL(out(0), expected_out(0)); + VERIFY_IS_APPROX_OR_LESS_THAN(out(3), expected_out(3)); + + for (int i = 1; i < 6; ++i) { + if (i != 3) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + } +} + +template <typename Scalar> +void test_cuda_polygamma() +{ + Tensor<Scalar, 1> in_x(7); + Tensor<Scalar, 1> in_n(7); + Tensor<Scalar, 1> out(7); + Tensor<Scalar, 1> expected_out(7); + out.setZero(); + + in_n(0) = Scalar(1); + in_n(1) = Scalar(1); + in_n(2) = Scalar(1); + in_n(3) = Scalar(17); + in_n(4) = Scalar(31); + in_n(5) = Scalar(28); + 
in_n(6) = Scalar(8); + + in_x(0) = Scalar(2); + in_x(1) = Scalar(3); + in_x(2) = Scalar(25.5); + in_x(3) = Scalar(4.7); + in_x(4) = Scalar(11.8); + in_x(5) = Scalar(17.7); + in_x(6) = Scalar(30.2); + + expected_out(0) = Scalar(0.644934066848); + expected_out(1) = Scalar(0.394934066848); + expected_out(2) = Scalar(0.0399946696496); + expected_out(3) = Scalar(293.334565435); + expected_out(4) = Scalar(0.445487887616); + expected_out(5) = Scalar(-2.47810300902e-07); + expected_out(6) = Scalar(-8.29668781082e-09); + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_n; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_n), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 7; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } +} + +template <typename Scalar> +void test_cuda_igamma() +{ + Tensor<Scalar, 2> a(6, 6); + Tensor<Scalar, 2> x(6, 6); + Tensor<Scalar, 2> out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[j]; + } + } + + Scalar nan = 
std::numeric_limits<Scalar>::quiet_NaN(); + Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + + + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igamma_s[i][j])) { + VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]); + } + } + } +} + +template <typename Scalar> +void test_cuda_igammac() +{ + Tensor<Scalar, 2> a(6, 6); + Tensor<Scalar, 2> x(6, 6); + Tensor<Scalar, 2> out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) 
{ + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[j]; + } + } + + Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igammac_s[i][j])) { + VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]); + } + } + } +} + +template <typename Scalar> +void test_cuda_erf(const Scalar stddev) +{ + Tensor<Scalar, 2> in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor<Scalar, 2> out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + 
cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erf(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); + } + } + + cudaFree(d_in); + cudaFree(d_out); +} + +template <typename Scalar> +void test_cuda_erfc(const Scalar stddev) +{ + Tensor<Scalar, 2> in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor<Scalar, 2> out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erfc(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); + } + } + + cudaFree(d_in); + cudaFree(d_out); } void test_cxx11_tensor_cuda() { - CALL_SUBTEST(test_cuda_elementwise_small()); - CALL_SUBTEST(test_cuda_elementwise()); - CALL_SUBTEST(test_cuda_reduction()); - 
CALL_SUBTEST(test_cuda_contraction<ColMajor>()); - CALL_SUBTEST(test_cuda_contraction<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_1d<ColMajor>()); - CALL_SUBTEST(test_cuda_convolution_1d<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d()); - CALL_SUBTEST(test_cuda_convolution_2d<ColMajor>()); - CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>()); - CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>()); + CALL_SUBTEST_1(test_cuda_elementwise_small()); + CALL_SUBTEST_1(test_cuda_elementwise()); + CALL_SUBTEST_1(test_cuda_props()); + CALL_SUBTEST_1(test_cuda_reduction()); + CALL_SUBTEST_2(test_cuda_contraction<ColMajor>()); + CALL_SUBTEST_2(test_cuda_contraction<RowMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>()); + +#if __cplusplus > 199711L + // std::erf, std::erfc, and so on where only added in c++11. We use them + // as a golden reference to validate the results produced by Eigen. Therefore + // we can only run these tests if we use a c++11 compiler. 
+ CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f)); + CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f)); + CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f)); + CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f)); + + CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0)); + CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0)); + CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01)); + CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001)); + + CALL_SUBTEST_4(test_cuda_erf<float>(1.0f)); + CALL_SUBTEST_4(test_cuda_erf<float>(100.0f)); + CALL_SUBTEST_4(test_cuda_erf<float>(0.01f)); + CALL_SUBTEST_4(test_cuda_erf<float>(0.001f)); + + CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f)); + // CALL_SUBTEST(test_cuda_erfc<float>(100.0f)); + CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f)); + CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f)); + + CALL_SUBTEST_4(test_cuda_erf<double>(1.0)); + CALL_SUBTEST_4(test_cuda_erf<double>(100.0)); + CALL_SUBTEST_4(test_cuda_erf<double>(0.01)); + CALL_SUBTEST_4(test_cuda_erf<double>(0.001)); + + CALL_SUBTEST_4(test_cuda_erfc<double>(1.0)); + // CALL_SUBTEST(test_cuda_erfc<double>(100.0)); + CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST_4(test_cuda_erfc<double>(0.01)); + CALL_SUBTEST_4(test_cuda_erfc<double>(0.001)); + + CALL_SUBTEST_5(test_cuda_digamma<float>()); + CALL_SUBTEST_5(test_cuda_digamma<double>()); + + CALL_SUBTEST_5(test_cuda_polygamma<float>()); + CALL_SUBTEST_5(test_cuda_polygamma<double>()); + + CALL_SUBTEST_5(test_cuda_zeta<float>()); + CALL_SUBTEST_5(test_cuda_zeta<double>()); + + CALL_SUBTEST_5(test_cuda_igamma<float>()); + CALL_SUBTEST_5(test_cuda_igammac<float>()); + + CALL_SUBTEST_5(test_cuda_igamma<double>()); + CALL_SUBTEST_5(test_cuda_igammac<double>()); +#endif } diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp index 7e33c9580..8baa477cc 100644 --- 
a/unsupported/test/cxx11_tensor_custom_op.cpp +++ b/unsupported/test/cxx11_tensor_custom_op.cpp @@ -25,7 +25,9 @@ struct InsertZeros { template <typename Output, typename Device> void eval(const Tensor<float, 2>& input, Output& output, const Device& device) const { - array<DenseIndex, 2> strides{{2, 2}}; + array<DenseIndex, 2> strides; + strides[0] = 2; + strides[1] = 2; output.stride(strides).device(device) = input; Eigen::DSizes<DenseIndex, 2> offsets(1,1); @@ -70,7 +72,8 @@ struct BatchMatMul { Output& output, const Device& device) const { typedef Tensor<float, 3>::DimensionPair DimPair; - array<DimPair, 1> dims({{DimPair(1, 0)}}); + array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); for (int i = 0; i < output.dimension(2); ++i) { output.template chip<2>(i).device(device) = input1.chip<2>(i).contract(input2.chip<2>(i), dims); } @@ -88,9 +91,10 @@ static void test_custom_binary_op() Tensor<float, 3> result = tensor1.customOp(tensor2, BatchMatMul()); for (int i = 0; i < 5; ++i) { typedef Tensor<float, 3>::DimensionPair DimPair; - array<DimPair, 1> dims({{DimPair(1, 0)}}); + array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); Tensor<float, 2> reference = tensor1.chip<2>(i).contract(tensor2.chip<2>(i), dims); - TensorRef<Tensor<float, 2>> val = result.chip<2>(i); + TensorRef<Tensor<float, 2> > val = result.chip<2>(i); for (int j = 0; j < 2; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(val(j, k), reference(j, k)); diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cu index ed5dd7505..cbe9e6449 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cu @@ -109,19 +109,19 @@ struct GPUContext { // The actual expression to evaluate template <typename Context> -static void test_contextual_eval(Context* context) +void test_contextual_eval(Context* context) { context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); } 
template <typename Context> -static void test_forced_contextual_eval(Context* context) +void test_forced_contextual_eval(Context* context) { context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); } template <typename Context> -static void test_compound_assignment(Context* context) +void test_compound_assignment(Context* context) { context->out().device(context->device()) = context->in1().constant(2.718f); context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; @@ -129,7 +129,7 @@ static void test_compound_assignment(Context* context) template <typename Context> -static void test_contraction(Context* context) +void test_contraction(Context* context) { Eigen::array<std::pair<int, int>, 2> dims; dims[0] = std::make_pair(1, 1); @@ -145,7 +145,7 @@ static void test_contraction(Context* context) template <typename Context> -static void test_1d_convolution(Context* context) +void test_1d_convolution(Context* context) { Eigen::DSizes<int, 3> indices(0,0,0); Eigen::DSizes<int, 3> sizes(40,49,70); @@ -155,7 +155,7 @@ static void test_1d_convolution(Context* context) } template <typename Context> -static void test_2d_convolution(Context* context) +void test_2d_convolution(Context* context) { Eigen::DSizes<int, 3> indices(0,0,0); Eigen::DSizes<int, 3> sizes(40,49,69); @@ -165,7 +165,7 @@ static void test_2d_convolution(Context* context) } template <typename Context> -static void test_3d_convolution(Context* context) +void test_3d_convolution(Context* context) { Eigen::DSizes<int, 3> indices(0,0,0); Eigen::DSizes<int, 3> sizes(39,49,69); @@ -175,7 +175,7 @@ static void test_3d_convolution(Context* context) } -static void test_cpu() { +void test_cpu() { Eigen::Tensor<float, 3> in1(40,50,70); Eigen::Tensor<float, 3> in2(40,50,70); Eigen::Tensor<float, 3> out(40,50,70); @@ -267,7 +267,7 @@ static void test_cpu() { } } -static void test_gpu() { +void test_gpu() { 
Eigen::Tensor<float, 3> in1(40,50,70); Eigen::Tensor<float, 3> in2(40,50,70); Eigen::Tensor<float, 3> out(40,50,70); @@ -383,6 +383,6 @@ static void test_gpu() { void test_cxx11_tensor_device() { - CALL_SUBTEST(test_cpu()); - CALL_SUBTEST(test_gpu()); + CALL_SUBTEST_1(test_cpu()); + CALL_SUBTEST_2(test_gpu()); } diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp new file mode 100644 index 000000000..9130fff35 --- /dev/null +++ b/unsupported/test/cxx11_tensor_empty.cpp @@ -0,0 +1,40 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + + +static void test_empty_tensor() +{ + Tensor<float, 2> source; + Tensor<float, 2> tgt1 = source; + Tensor<float, 2> tgt2(source); + Tensor<float, 2> tgt3; + tgt3 = tgt1; + tgt3 = tgt2; +} + +static void test_empty_fixed_size_tensor() +{ + TensorFixedSize<float, Sizes<0>> source; + TensorFixedSize<float, Sizes<0>> tgt1 = source; + TensorFixedSize<float, Sizes<0>> tgt2(source); + TensorFixedSize<float, Sizes<0>> tgt3; + tgt3 = tgt1; + tgt3 = tgt2; +} + + +void test_cxx11_tensor_empty() +{ + CALL_SUBTEST(test_empty_tensor()); + CALL_SUBTEST(test_empty_fixed_size_tensor()); +} diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp index 0f6e09106..89874349f 100644 --- a/unsupported/test/cxx11_tensor_fft.cpp +++ b/unsupported/test/cxx11_tensor_fft.cpp @@ -14,7 +14,7 @@ using Eigen::Tensor; template <int DataLayout> static void test_fft_2D_golden() { - Tensor<float, 2, DataLayout, long> input(2, 3); + Tensor<float, 2, DataLayout> input(2, 3); input(0, 0) = 1; input(0, 1) = 2; input(0, 2) = 
3; @@ -22,11 +22,11 @@ static void test_fft_2D_golden() { input(1, 1) = 5; input(1, 2) = 6; - array<int, 2> fft; + array<ptrdiff_t, 2> fft; fft[0] = 0; fft[1] = 1; - Tensor<std::complex<float>, 2, DataLayout, long> output = input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft); + Tensor<std::complex<float>, 2, DataLayout> output = input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft); std::complex<float> output_golden[6]; // in ColMajor order output_golden[0] = std::complex<float>(21, 0); @@ -57,24 +57,24 @@ static void test_fft_2D_golden() { } static void test_fft_complex_input_golden() { - Tensor<std::complex<float>, 1, ColMajor, long> input(5); + Tensor<std::complex<float>, 1, ColMajor> input(5); input(0) = std::complex<float>(1, 1); input(1) = std::complex<float>(2, 2); input(2) = std::complex<float>(3, 3); input(3) = std::complex<float>(4, 4); input(4) = std::complex<float>(5, 5); - array<int, 1> fft; + array<ptrdiff_t, 1> fft; fft[0] = 0; - Tensor<std::complex<float>, 1, ColMajor, long> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft); - Tensor<std::complex<float>, 1, ColMajor, long> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft); + Tensor<std::complex<float>, 1, ColMajor> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft); + Tensor<std::complex<float>, 1, ColMajor> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft); - Tensor<float, 1, ColMajor, long> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft); - Tensor<float, 1, ColMajor, long> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft); + Tensor<float, 1, ColMajor> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft); + Tensor<float, 1, ColMajor> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft); - Tensor<float, 1, ColMajor, long> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft); - Tensor<float, 1, ColMajor, long> reverse_output_imag_part = 
input.fft<ImagPart, FFT_REVERSE>(fft); + Tensor<float, 1, ColMajor> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft); + Tensor<float, 1, ColMajor> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft); VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0)); VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0)); @@ -114,24 +114,24 @@ static void test_fft_complex_input_golden() { } static void test_fft_real_input_golden() { - Tensor<float, 1, ColMajor, long> input(5); + Tensor<float, 1, ColMajor> input(5); input(0) = 1.0; input(1) = 2.0; input(2) = 3.0; input(3) = 4.0; input(4) = 5.0; - array<int, 1> fft; + array<ptrdiff_t, 1> fft; fft[0] = 0; - Tensor<std::complex<float>, 1, ColMajor, long> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft); - Tensor<std::complex<float>, 1, ColMajor, long> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft); + Tensor<std::complex<float>, 1, ColMajor> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft); + Tensor<std::complex<float>, 1, ColMajor> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft); - Tensor<float, 1, ColMajor, long> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft); - Tensor<float, 1, ColMajor, long> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft); + Tensor<float, 1, ColMajor> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft); + Tensor<float, 1, ColMajor> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft); - Tensor<float, 1, ColMajor, long> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft); - Tensor<float, 1, ColMajor, long> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft); + Tensor<float, 1, ColMajor> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft); + Tensor<float, 1, ColMajor> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft); 
VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0)); VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0)); @@ -178,21 +178,21 @@ static void test_fft_real_input_golden() { template <int DataLayout, typename RealScalar, bool isComplexInput, int FFTResultType, int FFTDirection, int TensorRank> static void test_fft_real_input_energy() { - Eigen::DSizes<long, TensorRank> dimensions; - int total_size = 1; + Eigen::DSizes<ptrdiff_t, TensorRank> dimensions; + ptrdiff_t total_size = 1; for (int i = 0; i < TensorRank; ++i) { dimensions[i] = rand() % 20 + 1; total_size *= dimensions[i]; } - const DSizes<long, TensorRank> arr = dimensions; + const DSizes<ptrdiff_t, TensorRank> arr = dimensions; typedef typename internal::conditional<isComplexInput == true, std::complex<RealScalar>, RealScalar>::type InputScalar; - Tensor<InputScalar, TensorRank, DataLayout, long> input; + Tensor<InputScalar, TensorRank, DataLayout> input; input.resize(arr); input.setRandom(); - array<int, TensorRank> fft; + array<ptrdiff_t, TensorRank> fft; for (int i = 0; i < TensorRank; ++i) { fft[i] = i; } diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp new file mode 100644 index 000000000..c946007b8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_notification.cpp @@ -0,0 +1,81 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Vijay Vasudevan <vrv@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_USE_THREADS + +#include <stdlib.h> +#include "main.h" +#include <Eigen/CXX11/Tensor> + +#if EIGEN_OS_WIN || EIGEN_OS_WIN64 +#include <windows.h> +void sleep(int seconds) { + Sleep(seconds*1000); +} +#else +#include <unistd.h> +#endif + + +namespace { + +void WaitAndAdd(Eigen::Notification* n, int* counter) { + n->Wait(); + *counter = *counter + 1; +} + +} // namespace + +static void test_notification_single() +{ + ThreadPool thread_pool(1); + + int counter = 0; + Eigen::Notification n; + std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter); + thread_pool.Schedule(func); + sleep(1); + + // The thread should be waiting for the notification. + VERIFY_IS_EQUAL(counter, 0); + + // Unblock the thread + n.Notify(); + + sleep(1); + + // Verify the counter has been incremented + VERIFY_IS_EQUAL(counter, 1); +} + +// Like test_notification_single() but enqueues multiple threads to +// validate that all threads get notified by Notify(). +static void test_notification_multiple() +{ + ThreadPool thread_pool(1); + + int counter = 0; + Eigen::Notification n; + std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter); + thread_pool.Schedule(func); + thread_pool.Schedule(func); + thread_pool.Schedule(func); + thread_pool.Schedule(func); + sleep(1); + VERIFY_IS_EQUAL(counter, 0); + n.Notify(); + sleep(1); + VERIFY_IS_EQUAL(counter, 4); +} + +void test_cxx11_tensor_notification() +{ + CALL_SUBTEST(test_notification_single()); + CALL_SUBTEST(test_notification_multiple()); +} diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp index 8ad04f699..e9d1b2d3c 100644 --- a/unsupported/test/cxx11_tensor_of_complex.cpp +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -48,6 +48,25 @@ static void test_abs() } +static void test_conjugate() +{ + Tensor<std::complex<float>, 1> data1(3); + Tensor<std::complex<double>, 1> data2(3); + Tensor<int, 1> data3(3); + data1.setRandom(); + data2.setRandom(); + 
data3.setRandom(); + + Tensor<std::complex<float>, 1> conj1 = data1.conjugate(); + Tensor<std::complex<double>, 1> conj2 = data2.conjugate(); + Tensor<int, 1> conj3 = data3.conjugate(); + for (int i = 0; i < 3; ++i) { + VERIFY_IS_APPROX(conj1(i), std::conj(data1(i))); + VERIFY_IS_APPROX(conj2(i), std::conj(data2(i))); + VERIFY_IS_APPROX(conj3(i), data3(i)); + } +} + static void test_contractions() { Tensor<std::complex<float>, 4> t_left(30, 50, 8, 31); @@ -64,7 +83,9 @@ static void test_contractions() // This contraction should be equivalent to a regular matrix multiplication typedef Tensor<float, 1>::DimensionPair DimPair; - Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + Eigen::array<DimPair, 2> dims; + dims[0] = DimPair(2, 0); + dims[1] = DimPair(3, 1); t_result = t_left.contract(t_right, dims); m_result = m_left * m_right; for (int i = 0; i < t_result.dimensions().TotalSize(); i++) { @@ -77,5 +98,6 @@ void test_cxx11_tensor_of_complex() { CALL_SUBTEST(test_additions()); CALL_SUBTEST(test_abs()); + CALL_SUBTEST(test_conjugate()); CALL_SUBTEST(test_contractions()); } diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu new file mode 100644 index 000000000..cb917bb37 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -0,0 +1,256 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +#ifdef EIGEN_HAS_CUDA_FP16 + +void test_cuda_conversion() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half( + d_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv( + d_conv, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random(); + gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>(); + gpu_conv.device(gpu_device) = gpu_half.cast<float>(); + + Tensor<float, 1> initial(num_elem); + Tensor<float, 1> final(num_elem); + gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(initial(i), final(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_half); + gpu_device.deallocate(d_conv); +} + + +void test_cuda_unary() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + 
d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); + gpu_res_float.device(gpu_device) = gpu_float.abs(); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>(); + + Tensor<float, 1> half_prec(num_elem); + Tensor<float, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking unary " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + +void test_cuda_elementwise() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1( + d_float1, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2( + d_float2, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * 
gpu_float1; + gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>(); + + Tensor<float, 1> half_prec(num_elem); + Tensor<float, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking elemwise " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + +void test_cuda_contractions() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int rows = 23; + int cols = 23; + int num_elem = rows*cols; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( + d_float1, rows, cols); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( + d_float2, rows, cols); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_half( + d_res_half, rows, cols); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_float( + d_res_float, rows, cols); + + gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); + gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float1.constant(0.5f); + + typedef Tensor<float, 2>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims); + 
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims).cast<float>(); + + Tensor<float, 2> half_prec(rows, cols); + Tensor<float, 2> full_prec(rows, cols); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + std::cout << "Checking contract " << i << " " << j << std::endl; + VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); + } + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + +void test_cuda_reductions() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int size = 13; + int num_elem = size*size; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(size * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(size * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( + d_float1, size, size); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( + d_float2, size, size); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, size); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, size); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + + Eigen::array<int, 1> redux_dim = {{0}}; + gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim).cast<float>(); + + Tensor<float, 1> half_prec(size); + Tensor<float, 1> full_prec(size); 
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < size; ++i) { + std::cout << "Checking redux " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + +#endif + + +void test_cxx11_tensor_of_float16_cuda() +{ +#ifdef EIGEN_HAS_CUDA_FP16 + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice device(&stream); + if (device.majorDeviceVersion() > 5 || + (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) { + std::cout << "Running test on device with capability " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << std::endl; + + CALL_SUBTEST_1(test_cuda_conversion()); + CALL_SUBTEST_1(test_cuda_unary()); + CALL_SUBTEST_1(test_cuda_elementwise()); + CALL_SUBTEST_2(test_cuda_contractions()); + CALL_SUBTEST_3(test_cuda_reductions()); + } + else { + std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl; + } +#else + std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; +#endif +} diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp index 389896c54..0f3dc5787 100644 --- a/unsupported/test/cxx11_tensor_random.cpp +++ b/unsupported/test/cxx11_tensor_random.cpp @@ -48,7 +48,7 @@ struct MyGenerator { } // Same as above but generates several numbers at a time. 
- typename internal::packet_traits<int>::type packetOp( + internal::packet_traits<int>::type packetOp( Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { const int packetSize = internal::packet_traits<int>::size; EIGEN_ALIGN_MAX int values[packetSize]; diff --git a/unsupported/test/cxx11_tensor_random_cuda.cpp b/unsupported/test/cxx11_tensor_random_cuda.cu index 5d091de15..5d091de15 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cpp +++ b/unsupported/test/cxx11_tensor_random_cuda.cu diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index 0ec316991..6a128901a 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -9,6 +9,7 @@ #include "main.h" #include <limits> +#include <numeric> #include <Eigen/CXX11/Tensor> using Eigen::Tensor; diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cpp b/unsupported/test/cxx11_tensor_reduction_cuda.cu index 9e06eb126..cad0c08e0 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cpp +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -48,9 +48,12 @@ static void test_full_reductions() { // Check that the CPU and GPU reductions return the same result. 
VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); + + gpu_device.deallocate(gpu_in_ptr); + gpu_device.deallocate(gpu_out_ptr); } void test_cxx11_tensor_reduction_cuda() { - CALL_SUBTEST(test_full_reductions<ColMajor>()); - CALL_SUBTEST(test_full_reductions<RowMajor>()); + CALL_SUBTEST_1(test_full_reductions<ColMajor>()); + CALL_SUBTEST_2(test_full_reductions<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp index f96c21fa3..b35b8d29e 100644 --- a/unsupported/test/cxx11_tensor_reverse.cpp +++ b/unsupported/test/cxx11_tensor_reverse.cpp @@ -114,10 +114,18 @@ static void test_expr_reverse(bool LValue) Tensor<float, 4, DataLayout> result(2,3,5,7); - array<ptrdiff_t, 4> src_slice_dim{{2,3,1,7}}; - array<ptrdiff_t, 4> src_slice_start{{0,0,0,0}}; - array<ptrdiff_t, 4> dst_slice_dim{{2,3,1,7}}; - array<ptrdiff_t, 4> dst_slice_start{{0,0,0,0}}; + array<ptrdiff_t, 4> src_slice_dim; + src_slice_dim[0] = 2; + src_slice_dim[1] = 3; + src_slice_dim[2] = 1; + src_slice_dim[3] = 7; + array<ptrdiff_t, 4> src_slice_start; + src_slice_start[0] = 0; + src_slice_start[1] = 0; + src_slice_start[2] = 0; + src_slice_start[3] = 0; + array<ptrdiff_t, 4> dst_slice_dim = src_slice_dim; + array<ptrdiff_t, 4> dst_slice_start = src_slice_start; for (int i = 0; i < 5; ++i) { if (LValue) { diff --git a/unsupported/test/cxx11_tensor_roundings.cpp b/unsupported/test/cxx11_tensor_roundings.cpp new file mode 100644 index 000000000..2c26151ab --- /dev/null +++ b/unsupported/test/cxx11_tensor_roundings.cpp @@ -0,0 +1,62 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + + +static void test_float_rounding() +{ + Tensor<float, 2> ftensor(20,30); + ftensor = ftensor.random() * 100.f; + + Tensor<float, 2> result = ftensor.round(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(result(i,j), numext::round(ftensor(i,j))); + } + } +} + +static void test_float_flooring() +{ + Tensor<float, 2> ftensor(20,30); + ftensor = ftensor.random() * 100.f; + + Tensor<float, 2> result = ftensor.floor(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(result(i,j), numext::floor(ftensor(i,j))); + } + } +} + +static void test_float_ceiling() +{ + Tensor<float, 2> ftensor(20,30); + ftensor = ftensor.random() * 100.f; + + Tensor<float, 2> result = ftensor.ceil(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(result(i,j), numext::ceil(ftensor(i,j))); + } + } +} + +void test_cxx11_tensor_roundings() +{ + CALL_SUBTEST(test_float_rounding()); + CALL_SUBTEST(test_float_ceiling()); + CALL_SUBTEST(test_float_flooring()); +} diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp index 98671a986..a03f75cfe 100644 --- a/unsupported/test/cxx11_tensor_sugar.cpp +++ b/unsupported/test/cxx11_tensor_sugar.cpp @@ -18,7 +18,7 @@ static void test_comparison_sugar() { #define TEST_TENSOR_EQUAL(e1, e2) \ b = ((e1) == (e2)).all(); \ - VERIFY(b(0)) + VERIFY(b()) #define TEST_OP(op) TEST_TENSOR_EQUAL(t op 0, t op t.constant(0)) @@ -32,7 +32,30 @@ static void test_comparison_sugar() { #undef TEST_TENSOR_EQUAL } + +static void test_scalar_sugar() { + Tensor<float, 3> A(6, 7, 5); + Tensor<float, 3> B(6, 7, 5); + A.setRandom(); + B.setRandom(); + + const float alpha = 0.43f; + const float beta = 0.21f; + + Tensor<float, 3> R = A * A.constant(alpha) + B * B.constant(beta); + Tensor<float, 3> S = A * alpha + B * beta; + + // TODO: add enough syntactic sugar to support this + 
// Tensor<float, 3> T = alpha * A + beta * B; + + for (int i = 0; i < 6*7*5; ++i) { + VERIFY_IS_APPROX(R(i), S(i)); + } +} + + void test_cxx11_tensor_sugar() { CALL_SUBTEST(test_comparison_sugar()); + CALL_SUBTEST(test_scalar_sugar()); } diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e28cf55e2..e46197464 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -17,7 +17,7 @@ using Eigen::Tensor; -static void test_multithread_elementwise() +void test_multithread_elementwise() { Tensor<float, 3> in1(2,3,7); Tensor<float, 3> in2(2,3,7); @@ -40,7 +40,7 @@ static void test_multithread_elementwise() } -static void test_multithread_compound_assignment() +void test_multithread_compound_assignment() { Tensor<float, 3> in1(2,3,7); Tensor<float, 3> in2(2,3,7); @@ -64,7 +64,7 @@ static void test_multithread_compound_assignment() } template<int DataLayout> -static void test_multithread_contraction() +void test_multithread_contraction() { Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31); Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10); @@ -91,15 +91,20 @@ static void test_multithread_contraction() for (ptrdiff_t i = 0; i < t_result.size(); i++) { VERIFY(&t_result.data()[i] != &m_result.data()[i]); - if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { - std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; - assert(false); + if (fabs(t_result(i) - m_result(i)) < 1e-4) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) { + continue; } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << m_result(i) << std::endl; + assert(false); } } template<int DataLayout> -static void test_contraction_corner_cases() +void test_contraction_corner_cases() { Tensor<float, 2, DataLayout> t_left(32, 500); Tensor<float, 2, DataLayout> t_right(32, 
28*28); @@ -186,7 +191,7 @@ static void test_contraction_corner_cases() } template<int DataLayout> -static void test_multithread_contraction_agrees_with_singlethread() { +void test_multithread_contraction_agrees_with_singlethread() { int contract_size = internal::random<int>(1, 5000); Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80), @@ -229,7 +234,7 @@ static void test_multithread_contraction_agrees_with_singlethread() { template<int DataLayout> -static void test_multithreaded_reductions() { +void test_multithreaded_reductions() { const int num_threads = internal::random<int>(3, 11); ThreadPool thread_pool(num_threads); Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads); @@ -239,19 +244,19 @@ static void test_multithreaded_reductions() { Tensor<float, 2, DataLayout> t1(num_rows, num_cols); t1.setRandom(); - Tensor<float, 1, DataLayout> full_redux(1); + Tensor<float, 0, DataLayout> full_redux; full_redux = t1.sum(); - Tensor<float, 1, DataLayout> full_redux_tp(1); + Tensor<float, 0, DataLayout> full_redux_tp; full_redux_tp.device(thread_pool_device) = t1.sum(); // Check that the single threaded and the multi threaded reductions return // the same result. 
- VERIFY_IS_APPROX(full_redux(0), full_redux_tp(0)); + VERIFY_IS_APPROX(full_redux(), full_redux_tp()); } -static void test_memcpy() { +void test_memcpy() { for (int i = 0; i < 5; ++i) { const int num_threads = internal::random<int>(3, 11); @@ -270,7 +275,7 @@ static void test_memcpy() { } -static void test_multithread_random() +void test_multithread_random() { Eigen::ThreadPool tp(2); Eigen::ThreadPoolDevice device(&tp, 2); @@ -278,26 +283,52 @@ static void test_multithread_random() t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>(); } +template<int DataLayout> +void test_multithread_shuffle() +{ + Tensor<float, 4, DataLayout> tensor(17,5,7,11); + tensor.setRandom(); + + const int num_threads = internal::random<int>(2, 11); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor<float, 4, DataLayout> shuffle(7,5,11,17); + array<ptrdiff_t, 4> shuffles = {{2,1,3,0}}; + shuffle.device(device) = tensor.shuffle(shuffles); + + for (int i = 0; i < 17; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i)); + } + } + } + } +} + void test_cxx11_tensor_thread_pool() { - CALL_SUBTEST(test_multithread_elementwise()); - CALL_SUBTEST(test_multithread_compound_assignment()); + CALL_SUBTEST_1(test_multithread_elementwise()); + CALL_SUBTEST_1(test_multithread_compound_assignment()); - CALL_SUBTEST(test_multithread_contraction<ColMajor>()); - CALL_SUBTEST(test_multithread_contraction<RowMajor>()); + CALL_SUBTEST_2(test_multithread_contraction<ColMajor>()); + CALL_SUBTEST_2(test_multithread_contraction<RowMajor>()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<ColMajor>()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<RowMajor>()); + CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>()); + 
CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>()); // Exercise various cases that have been problematic in the past. - CALL_SUBTEST(test_contraction_corner_cases<ColMajor>()); - CALL_SUBTEST(test_contraction_corner_cases<RowMajor>()); - - CALL_SUBTEST(test_multithreaded_reductions<ColMajor>()); - CALL_SUBTEST(test_multithreaded_reductions<RowMajor>()); + CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>()); + CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>()); - CALL_SUBTEST(test_memcpy()); + CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>()); + CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>()); - CALL_SUBTEST(test_multithread_random()); + CALL_SUBTEST_6(test_memcpy()); + CALL_SUBTEST_6(test_multithread_random()); + CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>()); + CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp index ee3767e58..d2a1e8673 100644 --- a/unsupported/test/cxx11_tensor_uint128.cpp +++ b/unsupported/test/cxx11_tensor_uint128.cpp @@ -11,10 +11,20 @@ #include <Eigen/CXX11/Tensor> + +#if EIGEN_COMP_MSVC +#define EIGEN_NO_INT128 +#else +typedef __uint128_t uint128_t; +#endif + +// Only run the test on compilers that support 128bit integers natively +#ifndef EIGEN_NO_INT128 + using Eigen::internal::TensorUInt128; using Eigen::internal::static_val; -void VERIFY_EQUAL(TensorUInt128<uint64_t, uint64_t> actual, __uint128_t expected) { +void VERIFY_EQUAL(TensorUInt128<uint64_t, uint64_t> actual, uint128_t expected) { bool matchl = actual.lower() == static_cast<uint64_t>(expected); bool matchh = actual.upper() == static_cast<uint64_t>(expected >> 64); if (!matchl || !matchh) { @@ -32,13 +42,13 @@ void test_add() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128<uint64_t, uint64_t> i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) 
<< 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128<uint64_t, uint64_t> j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2); TensorUInt128<uint64_t, uint64_t> actual = i + j; - __uint128_t expected = a + b; + uint128_t expected = a + b; VERIFY_EQUAL(actual, expected); } } @@ -51,13 +61,13 @@ void test_sub() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128<uint64_t, uint64_t> i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128<uint64_t, uint64_t> j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2); TensorUInt128<uint64_t, uint64_t> actual = i - j; - __uint128_t expected = a - b; + uint128_t expected = a - b; VERIFY_EQUAL(actual, expected); } } @@ -70,13 +80,13 @@ void test_mul() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128<uint64_t, uint64_t> i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128<uint64_t, uint64_t> j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast<uint128_t>(j1) << 64) + 
static_cast<uint128_t>(j2); TensorUInt128<uint64_t, uint64_t> actual = i * j; - __uint128_t expected = a * b; + uint128_t expected = a * b; VERIFY_EQUAL(actual, expected); } } @@ -89,13 +99,13 @@ void test_div() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128<uint64_t, uint64_t> i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128<uint64_t, uint64_t> j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2); TensorUInt128<uint64_t, uint64_t> actual = i / j; - __uint128_t expected = a / b; + uint128_t expected = a / b; VERIFY_EQUAL(actual, expected); } } @@ -107,10 +117,10 @@ void test_misc1() { uint64_t incr = internal::random<uint64_t>(1, 9999999999); for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128<static_val<0>, uint64_t> i(0, i2); - __uint128_t a = static_cast<__uint128_t>(i2); + uint128_t a = static_cast<uint128_t>(i2); for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128<static_val<0>, uint64_t> j(0, j2); - __uint128_t b = static_cast<__uint128_t>(j2); + uint128_t b = static_cast<uint128_t>(j2); uint64_t actual = (i * j).upper(); uint64_t expected = (a * b) >> 64; VERIFY_IS_EQUAL(actual, expected); @@ -122,23 +132,29 @@ void test_misc2() { int64_t incr = internal::random<int64_t>(1, 100); for (int64_t log_div = 0; log_div < 63; ++log_div) { for (int64_t divider = 1; divider <= 1000000 * incr; divider += incr) { - uint64_t expected = (static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1; + uint64_t expected = (static_cast<uint128_t>(1) << (64+log_div)) / 
static_cast<uint128_t>(divider) - (static_cast<uint128_t>(1) << 64) + 1; uint64_t shift = 1ULL << log_div; TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1)); uint64_t actual = static_cast<uint64_t>(result); - VERIFY_EQUAL(actual, expected); + VERIFY_IS_EQUAL(actual, expected); } } } +#endif void test_cxx11_tensor_uint128() { +#ifdef EIGEN_NO_INT128 + // Skip the test on compilers that don't support 128bit integers natively + return; +#else CALL_SUBTEST_1(test_add()); CALL_SUBTEST_2(test_sub()); CALL_SUBTEST_3(test_mul()); CALL_SUBTEST_4(test_div()); CALL_SUBTEST_5(test_misc1()); CALL_SUBTEST_6(test_misc2()); +#endif } diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp index a2bdb99e4..6dc17bd17 100644 --- a/unsupported/test/levenberg_marquardt.cpp +++ b/unsupported/test/levenberg_marquardt.cpp @@ -23,6 +23,9 @@ using std::sqrt; +// tolerance for chekcing number of iterations +#define LM_EVAL_COUNT_TOL 4/3 + struct lmder_functor : DenseFunctor<double> { lmder_functor(void): DenseFunctor<double>(3,15) {} @@ -631,7 +634,7 @@ void testNistLanczos1(void) VERIFY_IS_EQUAL(lm.nfev(), 79); VERIFY_IS_EQUAL(lm.njev(), 72); // check norm^2 -// VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.430899764097e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats + VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25); // check x VERIFY_IS_APPROX(x[0], 9.5100000027E-02); VERIFY_IS_APPROX(x[1], 1.0000000001E+00); @@ -652,7 +655,7 @@ void testNistLanczos1(void) VERIFY_IS_EQUAL(lm.nfev(), 9); VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 -// VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.428595533845e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats + VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25); // 
check x VERIFY_IS_APPROX(x[0], 9.5100000027E-02); VERIFY_IS_APPROX(x[1], 1.0000000001E+00); @@ -789,7 +792,8 @@ void testNistMGH10(void) MGH10_functor functor; LevenbergMarquardt<MGH10_functor> lm(functor); info = lm.minimize(x); - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); + VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // was: VERIFY_IS_EQUAL(info, 1); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01); @@ -799,9 +803,13 @@ void testNistMGH10(void) VERIFY_IS_APPROX(x[2], 3.4522363462E+02); // check return value - //VERIFY_IS_EQUAL(info, 1); + + ++g_test_level; VERIFY_IS_EQUAL(lm.nfev(), 284 ); VERIFY_IS_EQUAL(lm.njev(), 249 ); + --g_test_level; + VERIFY(lm.nfev() < 284 * LM_EVAL_COUNT_TOL); + VERIFY(lm.njev() < 249 * LM_EVAL_COUNT_TOL); /* * Second try @@ -809,7 +817,10 @@ void testNistMGH10(void) x<< 0.02, 4000., 250.; // do the computation info = lm.minimize(x); + ++g_test_level; VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // was: VERIFY_IS_EQUAL(info, 1); + --g_test_level; // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01); @@ -819,9 +830,12 @@ void testNistMGH10(void) VERIFY_IS_APPROX(x[2], 3.4522363462E+02); // check return value - //VERIFY_IS_EQUAL(info, 1); + ++g_test_level; VERIFY_IS_EQUAL(lm.nfev(), 126); VERIFY_IS_EQUAL(lm.njev(), 116); + --g_test_level; + VERIFY(lm.nfev() < 126 * LM_EVAL_COUNT_TOL); + VERIFY(lm.njev() < 116 * LM_EVAL_COUNT_TOL); } @@ -896,8 +910,12 @@ void testNistBoxBOD(void) // check return value VERIFY_IS_EQUAL(info, 1); + ++g_test_level; VERIFY_IS_EQUAL(lm.nfev(), 16 ); VERIFY_IS_EQUAL(lm.njev(), 15 ); + --g_test_level; + VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL); + VERIFY(lm.njev() < 15 * LM_EVAL_COUNT_TOL); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03); // check x diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp index 
487d5a9b8..9a995f941 100644 --- a/unsupported/test/matrix_function.cpp +++ b/unsupported/test/matrix_function.cpp @@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A) MatrixType scaledA; RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff(); - if (maxImagPartOfSpectrum >= 0.9 * M_PI) - scaledA = A * 0.9 * M_PI / maxImagPartOfSpectrum; + if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI) + scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum; else scaledA = A; diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp index baf183d12..8e104ed1e 100644 --- a/unsupported/test/matrix_power.cpp +++ b/unsupported/test/matrix_power.cpp @@ -24,7 +24,7 @@ void test2dRotation(double tol) s = std::sin(angle); B << c, s, -s, c; - C = Apow(std::ldexp(angle,1) / M_PI); + C = Apow(std::ldexp(angle,1) / EIGEN_PI); std::cout << "test2dRotation: i = " << i << " error powerm = " << relerr(C,B) << '\n'; VERIFY(C.isApprox(B, tol)); } diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h index c4f6cf0cb..9b0cf7268 100644 --- a/unsupported/test/mpreal/mpreal.h +++ b/unsupported/test/mpreal/mpreal.h @@ -72,14 +72,12 @@ #define MPREAL_VERSION_STRING "3.6.2"
// Detect compiler using signatures from http://predef.sourceforge.net/
-#if defined(__GNUC__) && defined(__INTEL_COMPILER)
- #define IsInf(x) isinf(x) // Intel ICC compiler on Linux
-
+#if defined(__GNUC__)
+ #define IsInf(x) (isinf)(x) // GNU C++/Intel ICC compiler on Linux
#elif defined(_MSC_VER) // Microsoft Visual C++
#define IsInf(x) (!_finite(x))
-
#else
- #define IsInf(x) std::isinf EIGEN_NOT_A_MACRO (x) // GNU C/C++ (and/or other compilers), just hope for C99 conformance
+ #define IsInf(x) (std::isinf)(x) // GNU C/C++ (and/or other compilers), just hope for C99 conformance
#endif
// A Clang feature extension to determine compiler features.
@@ -3103,4 +3101,4 @@ namespace std }
-#endif /* __MPREAL_H__ */
\ No newline at end of file +#endif /* __MPREAL_H__ */
diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp index 97665af96..3be020434 100644 --- a/unsupported/test/splines.cpp +++ b/unsupported/test/splines.cpp @@ -239,7 +239,7 @@ void check_global_interpolation_with_derivatives2d() typedef Spline2d::PointType PointType; typedef Spline2d::KnotVectorType KnotVectorType; - const unsigned int numPoints = 100; + const Eigen::DenseIndex numPoints = 100; const unsigned int dimension = 2; const unsigned int degree = 3; |