11 files changed, 479 insertions, 13 deletions
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index f75bf9798..eed9f079e 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -116,6 +116,8 @@ if(EIGEN_TEST_CXX11)
   set(CMAKE_CXX_STANDARD 11)
 
   ei_add_test(cxx11_float16)
+  ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_meta)
   ei_add_test(cxx11_tensor_simple)
 #  ei_add_test(cxx11_tensor_symmetry)
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
new file mode 100644
index 000000000..f16cc6f07
--- /dev/null
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -0,0 +1,140 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+// Visual studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe
+int rand_reentrant(unsigned int* s) {
+#ifdef EIGEN_COMP_MSVC_STRICT
+  EIGEN_UNUSED_VARIABLE(s);
+  return rand();
+#else
+  return rand_r(s);
+#endif
+}
+
+static void test_basic_eventcount()
+{
+  std::vector<EventCount::Waiter> waiters(1);
+  EventCount ec(waiters);
+  EventCount::Waiter& w = waiters[0];
+  ec.Notify(false);
+  ec.Prewait(&w);
+  ec.Notify(true);
+  ec.CommitWait(&w);
+  ec.Prewait(&w);
+  ec.CancelWait(&w);
+}
+
+// Fake bounded counter-based queue.
+struct TestQueue {
+  std::atomic<int> val_;
+  static const int kQueueSize = 10;
+
+  TestQueue() : val_() {}
+
+  ~TestQueue() { VERIFY_IS_EQUAL(val_.load(), 0); }
+
+  bool Push() {
+    int val = val_.load(std::memory_order_relaxed);
+    for (;;) {
+      VERIFY_GE(val, 0);
+      VERIFY_LE(val, kQueueSize);
+      if (val == kQueueSize) return false;
+      if (val_.compare_exchange_weak(val, val + 1, std::memory_order_relaxed))
+        return true;
+    }
+  }
+
+  bool Pop() {
+    int val = val_.load(std::memory_order_relaxed);
+    for (;;) {
+      VERIFY_GE(val, 0);
+      VERIFY_LE(val, kQueueSize);
+      if (val == 0) return false;
+      if (val_.compare_exchange_weak(val, val - 1, std::memory_order_relaxed))
+        return true;
+    }
+  }
+
+  bool Empty() { return val_.load(std::memory_order_relaxed) == 0; }
+};
+
+const int TestQueue::kQueueSize;
+
+// A number of producers send messages to a set of consumers using a set of
+// fake queues. Ensure that it does not crash, consumers don't deadlock and
+// number of blocked and unblocked threads match.
+static void test_stress_eventcount()
+{
+  const int kThreads = std::thread::hardware_concurrency();
+  static const int kEvents = 1 << 16;
+  static const int kQueues = 10;
+
+  std::vector<EventCount::Waiter> waiters(kThreads);
+  EventCount ec(waiters);
+  TestQueue queues[kQueues];
+
+  std::vector<std::unique_ptr<std::thread>> producers;
+  for (int i = 0; i < kThreads; i++) {
+    producers.emplace_back(new std::thread([&ec, &queues]() {
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
+        if (queues[idx].Push()) {
+          ec.Notify(false);
+          continue;
+        }
+        EIGEN_THREAD_YIELD();
+        j--;
+      }
+    }));
+  }
+
+  std::vector<std::unique_ptr<std::thread>> consumers;
+  for (int i = 0; i < kThreads; i++) {
+    consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() {
+      EventCount::Waiter& w = waiters[i];
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
+        if (queues[idx].Pop()) continue;
+        j--;
+        ec.Prewait(&w);
+        bool empty = true;
+        for (int q = 0; q < kQueues; q++) {
+          if (!queues[q].Empty()) {
+            empty = false;
+            break;
+          }
+        }
+        if (!empty) {
+          ec.CancelWait(&w);
+          continue;
+        }
+        ec.CommitWait(&w);
+      }
+    }));
+  }
+
+  for (int i = 0; i < kThreads; i++) {
+    producers[i]->join();
+    consumers[i]->join();
+  }
+}
+
+void test_cxx11_eventcount()
+{
+  CALL_SUBTEST(test_basic_eventcount());
+  CALL_SUBTEST(test_stress_eventcount());
+}
diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp
index 2dc0872d8..273dcbc11 100644
--- a/unsupported/test/cxx11_float16.cpp
+++ b/unsupported/test/cxx11_float16.cpp
@@ -122,6 +122,8 @@ void test_comparison()
   VERIFY(half(1.0f) != half(2.0f));
 
   // Comparisons with NaNs and infinities.
+#if !EIGEN_COMP_MSVC
+  // Visual Studio errors out on divisions by 0
   VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0)));
   VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0));
 
@@ -132,13 +134,26 @@ void test_comparison()
 
   VERIFY(half(1.0) < half(1.0 / 0.0));
   VERIFY(half(1.0) > half(-1.0 / 0.0));
+#endif
 }
 
-void test_functions()
+void test_basic_functions()
 {
   VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f);
   VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f);
 
+  VERIFY_IS_EQUAL(float(numext::floor(half(3.5f))), 3.0f);
+  VERIFY_IS_EQUAL(float(numext::floor(half(-3.5f))), -4.0f);
+
+  VERIFY_IS_EQUAL(float(numext::ceil(half(3.5f))), 4.0f);
+  VERIFY_IS_EQUAL(float(numext::ceil(half(-3.5f))), -3.0f);
+
+  VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f);
+
+  VERIFY_IS_APPROX(float(numext::pow(half(0.0f), half(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::pow(half(2.0f), half(2.0f))), 4.0f);
+
   VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f);
   VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI));
 
@@ -146,10 +161,32 @@ void test_functions()
   VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f);
 }
 
+void test_trigonometric_functions()
+{
+  VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f)));
+  VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI)));
+  //VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2)));
+  //VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f)));
+
+  VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f)));
+  //  VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI)));
+  VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f)));
+
+  VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f)));
+  //  VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI)));
+  //  VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2)));
+  //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f)));
+}
+
 void test_cxx11_float16()
 {
   CALL_SUBTEST(test_conversion());
   CALL_SUBTEST(test_arithmetic());
   CALL_SUBTEST(test_comparison());
-  CALL_SUBTEST(test_functions());
+  CALL_SUBTEST(test_basic_functions());
+  CALL_SUBTEST(test_trigonometric_functions());
 }
diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp
index ecac3add1..8911c59d8 100644
--- a/unsupported/test/cxx11_meta.cpp
+++ b/unsupported/test/cxx11_meta.cpp
@@ -10,7 +10,7 @@
 #include "main.h"
 
 #include <array>
-#include <Eigen/CXX11/Core>
+#include <Eigen/CXX11/src/util/CXX11Meta.h>
 
 using Eigen::internal::is_same;
 using Eigen::internal::type_list;
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
new file mode 100644
index 000000000..d1770ee1b
--- /dev/null
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -0,0 +1,227 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include <cstdlib>
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+
+// Visual studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe
+int rand_reentrant(unsigned int* s) {
+#ifdef EIGEN_COMP_MSVC_STRICT
+  EIGEN_UNUSED_VARIABLE(s);
+  return rand();
+#else
+  return rand_r(s);
+#endif
+}
+
+void test_basic_runqueue()
+{
+  RunQueue<int, 4> q;
+  // Check empty state.
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PopFront());
+  std::vector<int> stolen;
+  VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(0u, stolen.size());
+  // Push one front, pop one front.
+  VERIFY_IS_EQUAL(0, q.PushFront(1));
+  VERIFY_IS_EQUAL(1, q.Size());
+  VERIFY_IS_EQUAL(1, q.PopFront());
+  VERIFY_IS_EQUAL(0, q.Size());
+  // Push front to overflow.
+  VERIFY_IS_EQUAL(0, q.PushFront(2));
+  VERIFY_IS_EQUAL(1, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(3));
+  VERIFY_IS_EQUAL(2, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(4));
+  VERIFY_IS_EQUAL(3, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(5));
+  VERIFY_IS_EQUAL(4, q.Size());
+  VERIFY_IS_EQUAL(6, q.PushFront(6));
+  VERIFY_IS_EQUAL(4, q.Size());
+  VERIFY_IS_EQUAL(5, q.PopFront());
+  VERIFY_IS_EQUAL(3, q.Size());
+  VERIFY_IS_EQUAL(4, q.PopFront());
+  VERIFY_IS_EQUAL(2, q.Size());
+  VERIFY_IS_EQUAL(3, q.PopFront());
+  VERIFY_IS_EQUAL(1, q.Size());
+  VERIFY_IS_EQUAL(2, q.PopFront());
+  VERIFY_IS_EQUAL(0, q.Size());
+  VERIFY_IS_EQUAL(0, q.PopFront());
+  // Push one back, pop one back.
+  VERIFY_IS_EQUAL(0, q.PushBack(7));
+  VERIFY_IS_EQUAL(1, q.Size());
+  VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1, stolen.size());
+  VERIFY_IS_EQUAL(7, stolen[0]);
+  VERIFY_IS_EQUAL(0, q.Size());
+  stolen.clear();
+  // Push back to overflow.
+  VERIFY_IS_EQUAL(0, q.PushBack(8));
+  VERIFY_IS_EQUAL(1, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(9));
+  VERIFY_IS_EQUAL(2, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(10));
+  VERIFY_IS_EQUAL(3, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(11));
+  VERIFY_IS_EQUAL(4, q.Size());
+  VERIFY_IS_EQUAL(12, q.PushBack(12));
+  VERIFY_IS_EQUAL(4, q.Size());
+  // Pop back in halves.
+  VERIFY_IS_EQUAL(2, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(2, stolen.size());
+  VERIFY_IS_EQUAL(10, stolen[0]);
+  VERIFY_IS_EQUAL(11, stolen[1]);
+  VERIFY_IS_EQUAL(2, q.Size());
+  stolen.clear();
+  VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1, stolen.size());
+  VERIFY_IS_EQUAL(9, stolen[0]);
+  VERIFY_IS_EQUAL(1, q.Size());
+  stolen.clear();
+  VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1, stolen.size());
+  VERIFY_IS_EQUAL(8, stolen[0]);
+  stolen.clear();
+  VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(0, stolen.size());
+  // Empty again.
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0, q.Size());
+}
+
+// Empty tests that the queue is not claimed to be empty when is is in fact not.
+// Emptiness property is crucial part of thread pool blocking scheme,
+// so we go to great effort to ensure this property. We create a queue with
+// 1 element and then push 1 element (either front or back at random) and pop
+// 1 element (either front or back at random). So queue always contains at least
+// 1 element, but otherwise changes chaotically. Another thread constantly tests
+// that the queue is not claimed to be empty.
+void test_empty_runqueue()
+{
+  RunQueue<int, 4> q;
+  q.PushFront(1);
+  std::atomic<bool> done(false);
+  std::thread mutator([&q, &done]() {
+    unsigned rnd = 0;
+    std::vector<int> stolen;
+    for (int i = 0; i < 1 << 18; i++) {
+      if (rand_reentrant(&rnd) % 2)
+        VERIFY_IS_EQUAL(0, q.PushFront(1));
+      else
+        VERIFY_IS_EQUAL(0, q.PushBack(1));
+      if (rand_reentrant(&rnd) % 2)
+        VERIFY_IS_EQUAL(1, q.PopFront());
+      else {
+        for (;;) {
+          if (q.PopBackHalf(&stolen) == 1) {
+            stolen.clear();
+            break;
+          }
+          VERIFY_IS_EQUAL(0, stolen.size());
+        }
+      }
+    }
+    done = true;
+  });
+  while (!done) {
+    VERIFY(!q.Empty());
+    int size = q.Size();
+    VERIFY_GE(size, 1);
+    VERIFY_LE(size, 2);
+  }
+  VERIFY_IS_EQUAL(1, q.PopFront());
+  mutator.join();
+}
+
+// Stress is a chaotic random test.
+// One thread (owner) calls PushFront/PopFront, other threads call PushBack/
+// PopBack. Ensure that we don't crash, deadlock, and all sanity checks pass.
+void test_stress_runqueue()
+{
+  static const int kEvents = 1 << 18;
+  RunQueue<int, 8> q;
+  std::atomic<int> total(0);
+  std::vector<std::unique_ptr<std::thread>> threads;
+  threads.emplace_back(new std::thread([&q, &total]() {
+    int sum = 0;
+    int pushed = 1;
+    int popped = 1;
+    while (pushed < kEvents || popped < kEvents) {
+      if (pushed < kEvents) {
+        if (q.PushFront(pushed) == 0) {
+          sum += pushed;
+          pushed++;
+        }
+      }
+      if (popped < kEvents) {
+        int v = q.PopFront();
+        if (v != 0) {
+          sum -= v;
+          popped++;
+        }
+      }
+    }
+    total += sum;
+  }));
+  for (int i = 0; i < 2; i++) {
+    threads.emplace_back(new std::thread([&q, &total]() {
+      int sum = 0;
+      for (int j = 1; j < kEvents; j++) {
+        if (q.PushBack(j) == 0) {
+          sum += j;
+          continue;
+        }
+        EIGEN_THREAD_YIELD();
+        j--;
+      }
+      total += sum;
+    }));
+    threads.emplace_back(new std::thread([&q, &total]() {
+      int sum = 0;
+      std::vector<int> stolen;
+      for (int j = 1; j < kEvents;) {
+        if (q.PopBackHalf(&stolen) == 0) {
+          EIGEN_THREAD_YIELD();
+          continue;
+        }
+        while (stolen.size() && j < kEvents) {
+          int v = stolen.back();
+          stolen.pop_back();
+          VERIFY_IS_NOT_EQUAL(v, 0);
+          sum += v;
+          j++;
+        }
+      }
+      while (stolen.size()) {
+        int v = stolen.back();
+        stolen.pop_back();
+        VERIFY_IS_NOT_EQUAL(v, 0);
+        while ((v = q.PushBack(v)) != 0) EIGEN_THREAD_YIELD();
+      }
+      total -= sum;
+    }));
+  }
+  for (size_t i = 0; i < threads.size(); i++) threads[i]->join();
+  VERIFY(q.Empty());
+  VERIFY(total.load() == 0);
+}
+
+void test_cxx11_runqueue()
+{
+  CALL_SUBTEST_1(test_basic_runqueue());
+  CALL_SUBTEST_2(test_empty_runqueue());
+  CALL_SUBTEST_3(test_stress_runqueue());
+}
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 134359611..4026f48f0 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -661,6 +661,9 @@ void test_cuda_digamma()
   for (int i = 5; i < 7; ++i) {
     VERIFY_IS_EQUAL(out(i), expected_out(i));
   }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
 }
 
 template <typename Scalar>
@@ -718,13 +721,17 @@ void test_cuda_zeta()
   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
 
   VERIFY_IS_EQUAL(out(0), expected_out(0));
-  VERIFY_IS_APPROX_OR_LESS_THAN(out(3), expected_out(3));
+  VERIFY((std::isnan)(out(3)));
 
   for (int i = 1; i < 6; ++i) {
     if (i != 3) {
       VERIFY_IS_APPROX(out(i), expected_out(i));
     }
   }
+
+  cudaFree(d_in_x);
+  cudaFree(d_in_q);
+  cudaFree(d_out);
 }
 
 template <typename Scalar>
@@ -787,6 +794,10 @@ void test_cuda_polygamma()
   for (int i = 0; i < 7; ++i) {
     VERIFY_IS_APPROX(out(i), expected_out(i));
   }
+
+  cudaFree(d_in_x);
+  cudaFree(d_in_n);
+  cudaFree(d_out);
 }
 
 template <typename Scalar>
@@ -826,9 +837,9 @@ void test_cuda_igamma()
   Scalar* d_a;
   Scalar* d_x;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_a), bytes);
-  cudaMalloc((void**)(&d_x), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
+  assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
+  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
 
   cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
   cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
@@ -854,6 +865,10 @@ void test_cuda_igamma()
       }
     }
   }
+
+  cudaFree(d_a);
+  cudaFree(d_x);
+  cudaFree(d_out);
 }
 
 template <typename Scalar>
@@ -920,6 +935,10 @@ void test_cuda_igammac()
       }
     }
   }
+
+  cudaFree(d_a);
+  cudaFree(d_x);
+  cudaFree(d_out);
 }
 
 template <typename Scalar>
@@ -935,8 +954,8 @@ void test_cuda_erf(const Scalar stddev)
 
   Scalar* d_in;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
+  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
 
   cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
 
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index 1c33fefb3..5fe164859 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -20,6 +20,8 @@ static void test_0d()
   TensorFixedSize<float, Sizes<> > scalar1;
   TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
   VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+  VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
 
   scalar1() = 7.0;
   scalar2() = 13.0;
diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp
index d247bebaa..61c742a16 100644
--- a/unsupported/test/cxx11_tensor_math.cpp
+++ b/unsupported/test/cxx11_tensor_math.cpp
@@ -16,7 +16,7 @@ using Eigen::RowMajor;
 
 static void test_tanh()
 {
-  Tensor<float, 1> vec1({6});
+  Tensor<float, 1> vec1(6);
   vec1.setRandom();
 
   Tensor<float, 1> vec2 = vec1.tanh();
@@ -28,7 +28,7 @@ static void test_tanh()
 
 static void test_sigmoid()
 {
-  Tensor<float, 1> vec1({6});
+  Tensor<float, 1> vec1(6);
   vec1.setRandom();
 
   Tensor<float, 1> vec2 = vec1.sigmoid();
diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp
index 72f826216..4fba6fdd1 100644
--- a/unsupported/test/cxx11_tensor_mixed_indices.cpp
+++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp
@@ -14,8 +14,8 @@
 
 static void test_simple()
 {
-  Tensor<float, 1, ColMajor> vec1({6});
-  Tensor<float, 1, ColMajor, int> vec2({6});
+  Tensor<float, 1, ColMajor> vec1(6);
+  Tensor<float, 1, ColMajor, int> vec2(6);
 
   vec1(0) = 4.0;  vec2(0) = 0.0;
   vec1(1) = 8.0;  vec2(1) = 1.0;
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index cb917bb37..154a72d5c 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -228,6 +228,42 @@ void test_cuda_reductions() {
   gpu_device.deallocate(d_res_float);
 }
 
+void test_cuda_forced_evals() {
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.abs();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking unary " << i << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
 
 #endif
 
@@ -246,6 +282,7 @@ void test_cxx11_tensor_of_float16_cuda()
     CALL_SUBTEST_1(test_cuda_elementwise());
     CALL_SUBTEST_2(test_cuda_contractions());
     CALL_SUBTEST_3(test_cuda_reductions());
+    CALL_SUBTEST_4(test_cuda_forced_evals());
   }
   else {
    std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl;
diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index 6dc17bd17..64f168c16 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -792,7 +792,9 @@ void testNistMGH10(void)
   MGH10_functor functor;
   LevenbergMarquardt<MGH10_functor> lm(functor);
   info = lm.minimize(x);
+  ++g_test_level;
   VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  --g_test_level;
   // was: VERIFY_IS_EQUAL(info, 1);
 
   // check norm^2