9 files changed, 0 insertions, 3042 deletions
diff --git a/unsupported/test/cxx11_tensor_argmax_hip.cu b/unsupported/test/cxx11_tensor_argmax_hip.cu
deleted file mode 100644
index 57d6ca000..000000000
--- a/unsupported/test/cxx11_tensor_argmax_hip.cu
+++ /dev/null
@@ -1,251 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_hip
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-template <int Layout>
-void test_hip_simple_argmax()
-{
-  Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
-  Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
-  Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
-  in.setRandom();
-  in *= in.constant(100.0);
-  in(0, 0, 0) = -1000.0;
-  in(71, 52, 96) = 1000.0;
-
-  std::size_t in_bytes = in.size() * sizeof(double);
-  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
-
-  double* d_in;
-  DenseIndex* d_out_max;
-  DenseIndex* d_out_min;
-  hipMalloc((void**)(&d_in), in_bytes);
-  hipMalloc((void**)(&d_out_max), out_bytes);
-  hipMalloc((void**)(&d_out_min), out_bytes);
-
-  hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
-  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
-
-  gpu_out_max.device(gpu_device) = gpu_in.argmax();
-  gpu_out_min.device(gpu_device) = gpu_in.argmin();
-
-  assert(hipMemcpyAsync(out_max.data(), d_out_max, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipMemcpyAsync(out_min.data(), d_out_min, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
-  VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
-
-  hipFree(d_in);
-  hipFree(d_out_max);
-  hipFree(d_out_min);
-}
-
-template <int DataLayout>
-void test_hip_argmax_dim()
-{
-  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
-  std::vector<int> dims;
-  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
-
-  for (int dim = 0; dim < 4; ++dim) {
-    tensor.setRandom();
-    tensor = (tensor + tensor.constant(0.5)).log();
-
-    array<DenseIndex, 3> out_shape;
-    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
-
-    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
-
-    array<DenseIndex, 4> ix;
-    for (int i = 0; i < 2; ++i) {
-      for (int j = 0; j < 3; ++j) {
-        for (int k = 0; k < 5; ++k) {
-          for (int l = 0; l < 7; ++l) {
-            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
-            if (ix[dim] != 0) continue;
-            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
-            tensor(ix) = 10.0;
-          }
-        }
-      }
-    }
-
-    std::size_t in_bytes = tensor.size() * sizeof(float);
-    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
-
-    float* d_in;
-    DenseIndex* d_out;
-    hipMalloc((void**)(&d_in), in_bytes);
-    hipMalloc((void**)(&d_out), out_bytes);
-
-    hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
-    Eigen::HipStreamDevice stream;
-    Eigen::GpuDevice gpu_device(&stream);
-
-    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
-    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
-
-    gpu_out.device(gpu_device) = gpu_in.argmax(dim);
-
-    assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-    assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-    VERIFY_IS_EQUAL(tensor_arg.size(),
-                    size_t(2*3*5*7 / tensor.dimension(dim)));
-
-    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
-      // Expect max to be in the first index of the reduced dimension
-      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
-    }
-
-    for (int i = 0; i < 2; ++i) {
-      for (int j = 0; j < 3; ++j) {
-        for (int k = 0; k < 5; ++k) {
-          for (int l = 0; l < 7; ++l) {
-            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
-            if (ix[dim] != tensor.dimension(dim) - 1) continue;
-            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
-            tensor(ix) = 20.0;
-          }
-        }
-      }
-    }
-
-    hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
-    gpu_out.device(gpu_device) = gpu_in.argmax(dim);
-
-    assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-    assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
-      // Expect max to be in the last index of the reduced dimension
-      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
-    }
-
-    hipFree(d_in);
-    hipFree(d_out);
-  }
-}
-
-template <int DataLayout>
-void test_hip_argmin_dim()
-{
-  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
-  std::vector<int> dims;
-  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
-
-  for (int dim = 0; dim < 4; ++dim) {
-    tensor.setRandom();
-    tensor = (tensor + tensor.constant(0.5)).log();
-
-    array<DenseIndex, 3> out_shape;
-    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
-
-    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
-
-    array<DenseIndex, 4> ix;
-    for (int i = 0; i < 2; ++i) {
-      for (int j = 0; j < 3; ++j) {
-        for (int k = 0; k < 5; ++k) {
-          for (int l = 0; l < 7; ++l) {
-            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
-            if (ix[dim] != 0) continue;
-            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
-            tensor(ix) = -10.0;
-          }
-        }
-      }
-    }
-
-    std::size_t in_bytes = tensor.size() * sizeof(float);
-    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
-
-    float* d_in;
-    DenseIndex* d_out;
-    hipMalloc((void**)(&d_in), in_bytes);
-    hipMalloc((void**)(&d_out), out_bytes);
-
-    hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
-    Eigen::HipStreamDevice stream;
-    Eigen::GpuDevice gpu_device(&stream);
-
-    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
-    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
-
-    gpu_out.device(gpu_device) = gpu_in.argmin(dim);
-
-    assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-    assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-    VERIFY_IS_EQUAL(tensor_arg.size(),
-                    2*3*5*7 / tensor.dimension(dim));
-
-    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
-      // Expect min to be in the first index of the reduced dimension
-      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
-    }
-
-    for (int i = 0; i < 2; ++i) {
-      for (int j = 0; j < 3; ++j) {
-        for (int k = 0; k < 5; ++k) {
-          for (int l = 0; l < 7; ++l) {
-            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
-            if (ix[dim] != tensor.dimension(dim) - 1) continue;
-            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
-            tensor(ix) = -20.0;
-          }
-        }
-      }
-    }
-
-    hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
-    gpu_out.device(gpu_device) = gpu_in.argmin(dim);
-
-    assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-    assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
-      // Expect max to be in the last index of the reduced dimension
-      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
-    }
-
-    hipFree(d_in);
-    hipFree(d_out);
-  }
-}
-
-void test_cxx11_tensor_hip()
-{
-  CALL_SUBTEST(test_hip_simple_argmax<RowMajor>());
-  CALL_SUBTEST(test_hip_simple_argmax<ColMajor>());
-  CALL_SUBTEST(test_hip_argmax_dim<RowMajor>());
-  CALL_SUBTEST(test_hip_argmax_dim<ColMajor>());
-  CALL_SUBTEST(test_hip_argmin_dim<RowMajor>());
-  CALL_SUBTEST(test_hip_argmin_dim<ColMajor>());
-}
diff --git a/unsupported/test/cxx11_tensor_cast_float16_hip.cu b/unsupported/test/cxx11_tensor_cast_float16_hip.cu
deleted file mode 100644
index bf6a49df4..000000000
--- a/unsupported/test/cxx11_tensor_cast_float16_hip.cu
+++ /dev/null
@@ -1,79 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-void test_hip_conversion() {
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = 101;
-
-  Tensor<float, 1> floats(num_elem);
-  floats.setRandom();
-
-  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
-      d_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
-      d_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
-      d_conv, num_elem);
-
-  gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
-
-  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
-  gpu_conv.device(gpu_device) = gpu_half.cast<float>();
-
-  Tensor<float, 1> initial(num_elem);
-  Tensor<float, 1> final(num_elem);
-  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    VERIFY_IS_APPROX(initial(i), final(i));
-  }
-
-  gpu_device.deallocate(d_float);
-  gpu_device.deallocate(d_half);
-  gpu_device.deallocate(d_conv);
-}
-
-
-void test_fallback_conversion() {
-  int num_elem = 101;
-  Tensor<float, 1> floats(num_elem);
-  floats.setRandom();
-
-  Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
-  Eigen::Tensor<float, 1> conv = halfs.cast<float>();
-
-  for (int i = 0; i < num_elem; ++i) {
-    VERIFY_IS_APPROX(floats(i), conv(i));
-  }
-}
-
-
-void test_cxx11_tensor_cast_float16_hip()
-{
-  CALL_SUBTEST(test_hip_conversion());
-  CALL_SUBTEST(test_fallback_conversion());
-}
diff --git a/unsupported/test/cxx11_tensor_contract_hip.cu b/unsupported/test/cxx11_tensor_contract_hip.cu
deleted file mode 100644
index 652af0ab0..000000000
--- a/unsupported/test/cxx11_tensor_contract_hip.cu
+++ /dev/null
@@ -1,215 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-using Eigen::Tensor;
-typedef Tensor<float, 1>::DimensionPair DimPair;
-
-template<int DataLayout>
-void test_hip_contraction(int m_size, int k_size, int n_size)
-{
-  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
-  // with these dimensions, the output has 300 * 140 elements, which is
-  // more than 30 * 1024, which is the number of threads in blocks on
-  // a 15 SM GK110 GPU
-  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
-  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
-  Tensor<float, 2, DataLayout> t_result(m_size, n_size);
-  Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
-  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
-
-  t_left.setRandom();
-  t_right.setRandom();
-
-  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
-  std::size_t t_right_bytes = t_right.size() * sizeof(float);
-  std::size_t t_result_bytes = t_result.size() * sizeof(float);
-
-  float* d_t_left;
-  float* d_t_right;
-  float* d_t_result;
-
-  hipMalloc((void**)(&d_t_left), t_left_bytes);
-  hipMalloc((void**)(&d_t_right), t_right_bytes);
-  hipMalloc((void**)(&d_t_result), t_result_bytes);
-
-  hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
-      gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
-  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
-      gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
-  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
-      gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
-
-
-  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
-  t_result = t_left.contract(t_right, dims);
-
-  hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
-  for (DenseIndex i = 0; i < t_result.size(); i++) {
-    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
-      continue;
-    }
-    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
-      continue;
-    }
-    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
-              << " vs " <<  t_result_gpu(i) << std::endl;
-    assert(false);
-  }
-
-  hipFree((void*)d_t_left);
-  hipFree((void*)d_t_right);
-  hipFree((void*)d_t_result);
-}
-
-
-template<int DataLayout>
-void test_scalar(int m_size, int k_size, int n_size)
-{
-  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
-  // with these dimensions, the output has 300 * 140 elements, which is
-  // more than 30 * 1024, which is the number of threads in blocks on
-  // a 15 SM GK110 GPU
-  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
-  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
-  Tensor<float, 0, DataLayout> t_result;
-  Tensor<float, 0, DataLayout> t_result_gpu;
-  Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
-
-  t_left.setRandom();
-  t_right.setRandom();
-
-  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
-  std::size_t t_right_bytes = t_right.size() * sizeof(float);
-  std::size_t t_result_bytes = sizeof(float);
-
-  float* d_t_left;
-  float* d_t_right;
-  float* d_t_result;
-
-  hipMalloc((void**)(&d_t_left), t_left_bytes);
-  hipMalloc((void**)(&d_t_right), t_right_bytes);
-  hipMalloc((void**)(&d_t_result), t_result_bytes);
-
-  hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
-      gpu_t_left(d_t_left, m_size, k_size);
-  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
-      gpu_t_right(d_t_right, k_size, n_size);
-  Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
-      gpu_t_result(d_t_result);
-
-  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
-  t_result = t_left.contract(t_right, dims);
-
-  hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
-  if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
-      !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
-    std::cout << "mismatch detected: " << t_result()
-              << " vs " <<  t_result_gpu() << std::endl;
-    assert(false);
-  }
-
-  hipFree((void*)d_t_left);
-  hipFree((void*)d_t_right);
-  hipFree((void*)d_t_result);
-}
-
-
-template<int DataLayout>
-void test_hip_contraction_m() {
-  for (int k = 32; k < 256; k++) {
-    test_hip_contraction<ColMajor>(k, 128, 128);
-    test_hip_contraction<RowMajor>(k, 128, 128);
-  }
-}
-
-template<int DataLayout>
-void test_hip_contraction_k() {
-  for (int k = 32; k < 256; k++) {
-    test_hip_contraction<ColMajor>(128, k, 128);
-    test_hip_contraction<RowMajor>(128, k, 128);
-  }
-}
-
-template<int DataLayout>
-void test_hip_contraction_n() {
-  for (int k = 32; k < 256; k++) {
-    test_hip_contraction<ColMajor>(128, 128, k);
-    test_hip_contraction<RowMajor>(128, 128, k);
-  }
-}
-
-
-template<int DataLayout>
-void test_hip_contraction_sizes() {
-  int m_sizes[] = { 31,  39,   63,   64,   65,
-                   127, 129,  255,  257 , 511,
-                   512, 513, 1023, 1024, 1025};
-
-  int n_sizes[] = { 31,  39,   63,   64,   65,
-                   127, 129,  255,  257,  511,
-                   512, 513, 1023, 1024, 1025};
-
-  int k_sizes[] = {  31,   39,  63,  64,   65,
-                     95,   96, 127, 129,  255,
-                    257,  511, 512, 513, 1023,
-                   1024, 1025};
-
-  for (int i = 0; i < 15; i++) {
-    for (int j = 0; j < 15; j++) {
-      for (int k = 0; k < 17; k++) {
-        test_hip_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
-      }
-    }
-  }
-}
-
-void test_cxx11_tensor_hip()
-{
-  CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128));
-  CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128));
-
-  CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128));
-  CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128));
-
-  CALL_SUBTEST(test_hip_contraction_m<ColMajor>());
-  CALL_SUBTEST(test_hip_contraction_m<RowMajor>());
-
-  CALL_SUBTEST(test_hip_contraction_k<ColMajor>());
-  CALL_SUBTEST(test_hip_contraction_k<RowMajor>());
-
-  CALL_SUBTEST(test_hip_contraction_n<ColMajor>());
-  CALL_SUBTEST(test_hip_contraction_n<RowMajor>());
-
-  // Commenting out these tests due to long runtimes
-  // CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>());
-  // CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>());
-}
diff --git a/unsupported/test/cxx11_tensor_device_hip.cu b/unsupported/test/cxx11_tensor_device_hip.cu
deleted file mode 100644
index b98c481ff..000000000
--- a/unsupported/test/cxx11_tensor_device_hip.cu
+++ /dev/null
@@ -1,389 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-using Eigen::RowMajor;
-
-// Context for evaluation on cpu
-struct CPUContext {
-  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
-    kernel_1d_(0) = 3.14f;
-    kernel_1d_(1) = 2.7f;
-
-    kernel_2d_(0,0) = 3.14f;
-    kernel_2d_(1,0) = 2.7f;
-    kernel_2d_(0,1) = 0.2f;
-    kernel_2d_(1,1) = 7.0f;
-
-    kernel_3d_(0,0,0) = 3.14f;
-    kernel_3d_(0,1,0) = 2.7f;
-    kernel_3d_(0,0,1) = 0.2f;
-    kernel_3d_(0,1,1) = 7.0f;
-    kernel_3d_(1,0,0) = -1.0f;
-    kernel_3d_(1,1,0) = -0.3f;
-    kernel_3d_(1,0,1) = -0.7f;
-    kernel_3d_(1,1,1) = -0.5f;
-  }
-
-  const Eigen::DefaultDevice& device() const { return cpu_device_; }
-
-  const Eigen::Tensor<float, 3>& in1() const { return in1_; }
-  const Eigen::Tensor<float, 3>& in2() const { return in2_; }
-  Eigen::Tensor<float, 3>& out() { return out_; }
-  const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
-  const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
-  const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
-
- private:
-  const Eigen::Tensor<float, 3>& in1_;
-  const Eigen::Tensor<float, 3>& in2_;
-  Eigen::Tensor<float, 3>& out_;
-
-  Eigen::Tensor<float, 1> kernel_1d_;
-  Eigen::Tensor<float, 2> kernel_2d_;
-  Eigen::Tensor<float, 3> kernel_3d_;
-
-  Eigen::DefaultDevice cpu_device_;
-};
-
-
-// Context for evaluation on GPU
-struct GPUContext {
-  GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
-    assert(hipMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == hipSuccess);
-    float kernel_1d_val[] = {3.14f, 2.7f};
-    assert(hipMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
-
-    assert(hipMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == hipSuccess);
-    float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
-    assert(hipMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
-
-    assert(hipMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == hipSuccess);
-    float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
-    assert(hipMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
-  }
-  ~GPUContext() {
-    assert(hipFree(kernel_1d_) == hipSuccess);
-    assert(hipFree(kernel_2d_) == hipSuccess);
-    assert(hipFree(kernel_3d_) == hipSuccess);
-  }
-
-  const Eigen::GpuDevice& device() const { return gpu_device_; }
-
-  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
-  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
-  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
-  Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
-  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
-
- private:
-  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
-  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
-  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
-
-  float* kernel_1d_;
-  float* kernel_2d_;
-  float* kernel_3d_;
-
-  Eigen::HipStreamDevice stream_;
-  Eigen::GpuDevice gpu_device_;
-};
-
-
-// The actual expression to evaluate
-template <typename Context>
-void test_contextual_eval(Context* context)
-{
-  context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
-}
-
-template <typename Context>
-void test_forced_contextual_eval(Context* context)
-{
-  context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
-}
-
-template <typename Context>
-void test_compound_assignment(Context* context)
-{
-  context->out().device(context->device()) = context->in1().constant(2.718f);
-  context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
-}
-
-
-template <typename Context>
-void test_contraction(Context* context)
-{
-  Eigen::array<std::pair<int, int>, 2> dims;
-  dims[0] = std::make_pair(1, 1);
-  dims[1] = std::make_pair(2, 2);
-
-  Eigen::array<int, 2> shape(40, 50*70);
-
-  Eigen::DSizes<int, 2> indices(0,0);
-  Eigen::DSizes<int, 2> sizes(40,40);
-
-  context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
-}
-
-
-template <typename Context>
-void test_1d_convolution(Context* context)
-{
-  Eigen::DSizes<int, 3> indices(0,0,0);
-  Eigen::DSizes<int, 3> sizes(40,49,70);
-
-  Eigen::array<int, 1> dims(1);
-  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
-}
-
-template <typename Context>
-void test_2d_convolution(Context* context)
-{
-  Eigen::DSizes<int, 3> indices(0,0,0);
-  Eigen::DSizes<int, 3> sizes(40,49,69);
-
-  Eigen::array<int, 2> dims(1,2);
-  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
-}
-
-template <typename Context>
-void test_3d_convolution(Context* context)
-{
-  Eigen::DSizes<int, 3> indices(0,0,0);
-  Eigen::DSizes<int, 3> sizes(39,49,69);
-
-  Eigen::array<int, 3> dims(0,1,2);
-  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
-}
-
-
-void test_cpu() {
-  Eigen::Tensor<float, 3> in1(40,50,70);
-  Eigen::Tensor<float, 3> in2(40,50,70);
-  Eigen::Tensor<float, 3> out(40,50,70);
-
-  in1 = in1.random() + in1.constant(10.0f);
-  in2 = in2.random() + in2.constant(10.0f);
-
-  CPUContext context(in1, in2, out);
-  test_contextual_eval(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_forced_contextual_eval(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_compound_assignment(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_contraction(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 40; ++j) {
-      const float result = out(i,j,0);
-      float expected = 0;
-      for (int k = 0; k < 50; ++k) {
-        for (int l = 0; l < 70; ++l) {
-          expected += in1(i, k, l) * in2(j, k, l);
-        }
-      }
-      VERIFY_IS_APPROX(expected, result);
-    }
-  }
-
-  test_1d_convolution(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
-      }
-    }
-  }
-
-  test_2d_convolution(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 69; ++k) {
-        const float result = out(i,j,k);
-        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
-                               (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
-        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
-          continue;
-        }
-        VERIFY_IS_APPROX(expected, result);
-      }
-    }
-  }
-
-  test_3d_convolution(&context);
-  for (int i = 0; i < 39; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 69; ++k) {
-        const float result = out(i,j,k);
-        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
-                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
-                               (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
-                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
-        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
-          continue;
-        }
-        VERIFY_IS_APPROX(expected, result);
-      }
-    }
-  }
-}
-
-void test_gpu() {
-  Eigen::Tensor<float, 3> in1(40,50,70);
-  Eigen::Tensor<float, 3> in2(40,50,70);
-  Eigen::Tensor<float, 3> out(40,50,70);
-  in1 = in1.random() + in1.constant(10.0f);
-  in2 = in2.random() + in2.constant(10.0f);
-
-  std::size_t in1_bytes = in1.size() * sizeof(float);
-  std::size_t in2_bytes = in2.size() * sizeof(float);
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_in1;
-  float* d_in2;
-  float* d_out;
-  hipMalloc((void**)(&d_in1), in1_bytes);
-  hipMalloc((void**)(&d_in2), in2_bytes);
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
-
-  GPUContext context(gpu_in1, gpu_in2, gpu_out);
-  test_contextual_eval(&context);
-  assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_forced_contextual_eval(&context);
-  assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_compound_assignment(&context);
-  assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_contraction(&context);
-  assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 40; ++j) {
-      const float result = out(i,j,0);
-      float expected = 0;
-      for (int k = 0; k < 50; ++k) {
-        for (int l = 0; l < 70; ++l) {
-          expected += in1(i, k, l) * in2(j, k, l);
-        }
-      }
-      VERIFY_IS_APPROX(expected, result);
-    }
-  }
-
-  test_1d_convolution(&context);
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
-  assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
-      }
-    }
-  }
-
-  test_2d_convolution(&context);
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
-  assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 69; ++k) {
-        const float result = out(i,j,k);
-        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
-                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
-        VERIFY_IS_APPROX(expected, result);
-      }
-    }
-  }
-
-  /*
-  test_3d_convolution(&context);
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
-  assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
-  for (int i = 0; i < 39; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 69; ++k) {
-       const float result = out(i,j,k);
-        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
-                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
-                                in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
-                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
-        VERIFY_IS_APPROX(expected, result);
-      }
-    }
-  }
-  */
-}
-
-
-void test_cxx11_tensor_device()
-{
-  CALL_SUBTEST(test_cpu());
-  CALL_SUBTEST(test_gpu());
-}
diff --git a/unsupported/test/cxx11_tensor_hip.cu b/unsupported/test/cxx11_tensor_hip.cu
deleted file mode 100644
index b28840267..000000000
--- a/unsupported/test/cxx11_tensor_hip.cu
+++ /dev/null
@@ -1,1295 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_hip
-#define EIGEN_USE_GPU
-
-#ifdef __NVCC__
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
-#endif
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-void test_hip_nullary() {
-  Tensor<float, 1, 0, int> in1(2);
-  Tensor<float, 1, 0, int> in2(2);
-  in1.setRandom();
-  in2.setRandom();
-
-  std::size_t tensor_bytes = in1.size() * sizeof(float);
-
-  float* d_in1;
-  float* d_in2;
-  hipMalloc((void**)(&d_in1), tensor_bytes);
-  hipMalloc((void**)(&d_in2), tensor_bytes);
-  hipMemcpy(d_in1, in1.data(), tensor_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_in2, in2.data(), tensor_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
-      d_in1, 2);
-  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
-      d_in2, 2);
-
-  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
-  gpu_in2.device(gpu_device) = gpu_in2.random();
-
-  Tensor<float, 1, 0, int> new1(2);
-  Tensor<float, 1, 0, int> new2(2);
-
-  assert(hipMemcpyAsync(new1.data(), d_in1, tensor_bytes, hipMemcpyDeviceToHost,
-                         gpu_device.stream()) == hipSuccess);
-  assert(hipMemcpyAsync(new2.data(), d_in2, tensor_bytes, hipMemcpyDeviceToHost,
-                         gpu_device.stream()) == hipSuccess);
-
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 2; ++i) {
-    VERIFY_IS_APPROX(new1(i), 3.14f);
-    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
-  }
-
-  hipFree(d_in1);
-  hipFree(d_in2);
-}
-
-void test_hip_elementwise_small() {
-  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>{2});
-  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>{2});
-  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>{2});
-  in1.setRandom();
-  in2.setRandom();
-
-  std::size_t in1_bytes = in1.size() * sizeof(float);
-  std::size_t in2_bytes = in2.size() * sizeof(float);
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_in1;
-  float* d_in2;
-  float* d_out;
-  hipMalloc((void**)(&d_in1), in1_bytes);
-  hipMalloc((void**)(&d_in2), in2_bytes);
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
-      d_in1, Eigen::array<Eigen::DenseIndex, 1>{2});
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
-      d_in2, Eigen::array<Eigen::DenseIndex, 1>{2});
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
-      d_out, Eigen::array<Eigen::DenseIndex, 1>{2});
-
-  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
-
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
-                         gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 2; ++i) {
-    VERIFY_IS_APPROX(
-        out(Eigen::array<Eigen::DenseIndex, 1>{i}),
-        in1(Eigen::array<Eigen::DenseIndex, 1>{i}) + in2(Eigen::array<Eigen::DenseIndex, 1>{i}));
-  }
-
-  hipFree(d_in1);
-  hipFree(d_in2);
-  hipFree(d_out);
-}
-
-void test_hip_elementwise()
-{
-  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
-  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
-  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
-  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
-  in1.setRandom();
-  in2.setRandom();
-  in3.setRandom();
-
-  std::size_t in1_bytes = in1.size() * sizeof(float);
-  std::size_t in2_bytes = in2.size() * sizeof(float);
-  std::size_t in3_bytes = in3.size() * sizeof(float);
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_in1;
-  float* d_in2;
-  float* d_in3;
-  float* d_out;
-  hipMalloc((void**)(&d_in1), in1_bytes);
-  hipMalloc((void**)(&d_in2), in2_bytes);
-  hipMalloc((void**)(&d_in3), in3_bytes);
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_in3, in3.data(), in3_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
-
-  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
-
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 72; ++i) {
-    for (int j = 0; j < 53; ++j) {
-      for (int k = 0; k < 97; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}), in1(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) + in2(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) * in3(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}));
-      }
-    }
-  }
-
-  hipFree(d_in1);
-  hipFree(d_in2);
-  hipFree(d_in3);
-  hipFree(d_out);
-}
-
-void test_hip_props() {
-  Tensor<float, 1> in1(200);
-  Tensor<bool, 1> out(200);
-  in1.setRandom();
-
-  std::size_t in1_bytes = in1.size() * sizeof(float);
-  std::size_t out_bytes = out.size() * sizeof(bool);
-
-  float* d_in1;
-  bool* d_out;
-  hipMalloc((void**)(&d_in1), in1_bytes);
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
-      d_in1, 200);
-  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
-      d_out, 200);
-
-  gpu_out.device(gpu_device) = (gpu_in1.isnan)();
-
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
-                         gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 200; ++i) {
-    VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
-  }
-
-  hipFree(d_in1);
-  hipFree(d_out);
-}
-
-void test_hip_reduction()
-{
-  Tensor<float, 4> in1(72,53,97,113);
-  Tensor<float, 2> out(72,97);
-  in1.setRandom();
-
-  std::size_t in1_bytes = in1.size() * sizeof(float);
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_in1;
-  float* d_out;
-  hipMalloc((void**)(&d_in1), in1_bytes);
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
-  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
-
-  array<Eigen::DenseIndex, 2> reduction_axis;
-  reduction_axis[0] = 1;
-  reduction_axis[1] = 3;
-
-  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
-
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 72; ++i) {
-    for (int j = 0; j < 97; ++j) {
-      float expected = 0;
-      for (int k = 0; k < 53; ++k) {
-        for (int l = 0; l < 113; ++l) {
-          expected =
-              std::max<float>(expected, in1(i, k, j, l));
-        }
-      }
-      VERIFY_IS_APPROX(out(i,j), expected);
-    }
-  }
-
-  hipFree(d_in1);
-  hipFree(d_out);
-}
-
-template<int DataLayout>
-void test_hip_contraction()
-{
-  // with these dimensions, the output has 300 * 140 elements, which is
-  // more than 30 * 1024, which is the number of threads in blocks on
-  // a 15 SM GK110 GPU
-  Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
-  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>{3, 31, 7, 20, 1});
-  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>{6, 50, 7, 20, 1});
-
-  t_left.setRandom();
-  t_right.setRandom();
-
-  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
-  std::size_t t_right_bytes = t_right.size() * sizeof(float);
-  std::size_t t_result_bytes = t_result.size() * sizeof(float);
-
-  float* d_t_left;
-  float* d_t_right;
-  float* d_t_result;
-
-  hipMalloc((void**)(&d_t_left), t_left_bytes);
-  hipMalloc((void**)(&d_t_right), t_right_bytes);
-  hipMalloc((void**)(&d_t_result), t_result_bytes);
-
-  hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
-  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
-  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
-
-  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
-  MapXf m_left(t_left.data(), 300, 93);
-  MapXf m_right(t_right.data(), 93, 140);
-  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
-
-  typedef Tensor<float, 1>::DimensionPair DimPair;
-  Eigen::array<DimPair, 2> dims;
-  dims[0] = DimPair(2, 0);
-  dims[1] = DimPair(3, 1);
-
-  m_result = m_left * m_right;
-  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
-
-  hipMemcpy(t_result.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
-
-  for (DenseIndex i = 0; i < t_result.size(); i++) {
-    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
-      std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
-      assert(false);
-    }
-  }
-
-  hipFree(d_t_left);
-  hipFree(d_t_right);
-  hipFree(d_t_result);
-}
-
-
-template<int DataLayout>
-void test_hip_convolution_1d()
-{
-  Tensor<float, 4, DataLayout> input(74,37,11,137);
-  Tensor<float, 1, DataLayout> kernel(4);
-  Tensor<float, 4, DataLayout> out(74,34,11,137);
-  input = input.constant(10.0f) + input.random();
-  kernel = kernel.constant(7.0f) + kernel.random();
-
-  std::size_t input_bytes = input.size() * sizeof(float);
-  std::size_t kernel_bytes = kernel.size() * sizeof(float);
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_input;
-  float* d_kernel;
-  float* d_out;
-  hipMalloc((void**)(&d_input), input_bytes);
-  hipMalloc((void**)(&d_kernel), kernel_bytes);
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
-  Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
-  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
-
-  Eigen::array<Eigen::DenseIndex, 1> dims{1};
-  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 74; ++i) {
-    for (int j = 0; j < 34; ++j) {
-      for (int k = 0; k < 11; ++k) {
-        for (int l = 0; l < 137; ++l) {
-          const float result = out(i,j,k,l);
-          const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
-                                 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
-          VERIFY_IS_APPROX(result, expected);
-        }
-      }
-    }
-  }
-
-  hipFree(d_input);
-  hipFree(d_kernel);
-  hipFree(d_out);
-}
-
-void test_hip_convolution_inner_dim_col_major_1d()
-{
-  Tensor<float, 4, ColMajor> input(74,9,11,7);
-  Tensor<float, 1, ColMajor> kernel(4);
-  Tensor<float, 4, ColMajor> out(71,9,11,7);
-  input = input.constant(10.0f) + input.random();
-  kernel = kernel.constant(7.0f) + kernel.random();
-
-  std::size_t input_bytes = input.size() * sizeof(float);
-  std::size_t kernel_bytes = kernel.size() * sizeof(float);
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_input;
-  float* d_kernel;
-  float* d_out;
-  hipMalloc((void**)(&d_input), input_bytes);
-  hipMalloc((void**)(&d_kernel), kernel_bytes);
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
-  Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
-  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
-
-  Eigen::array<Eigen::DenseIndex, 1> dims{0};
-  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 71; ++i) {
-    for (int j = 0; j < 9; ++j) {
-      for (int k = 0; k < 11; ++k) {
-        for (int l = 0; l < 7; ++l) {
-          const float result = out(i,j,k,l);
-          const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
-                                 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
-          VERIFY_IS_APPROX(result, expected);
-        }
-      }
-    }
-  }
-
-  hipFree(d_input);
-  hipFree(d_kernel);
-  hipFree(d_out);
-}
-
-void test_hip_convolution_inner_dim_row_major_1d()
-{
-  Tensor<float, 4, RowMajor> input(7,9,11,74);
-  Tensor<float, 1, RowMajor> kernel(4);
-  Tensor<float, 4, RowMajor> out(7,9,11,71);
-  input = input.constant(10.0f) + input.random();
-  kernel = kernel.constant(7.0f) + kernel.random();
-
-  std::size_t input_bytes = input.size() * sizeof(float);
-  std::size_t kernel_bytes = kernel.size() * sizeof(float);
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_input;
-  float* d_kernel;
-  float* d_out;
-  hipMalloc((void**)(&d_input), input_bytes);
-  hipMalloc((void**)(&d_kernel), kernel_bytes);
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
-  Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
-  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
-
-  Eigen::array<Eigen::DenseIndex, 1> dims{3};
-  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 7; ++i) {
-    for (int j = 0; j < 9; ++j) {
-      for (int k = 0; k < 11; ++k) {
-        for (int l = 0; l < 71; ++l) {
-          const float result = out(i,j,k,l);
-          const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
-                                 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
-          VERIFY_IS_APPROX(result, expected);
-        }
-      }
-    }
-  }
-
-  hipFree(d_input);
-  hipFree(d_kernel);
-  hipFree(d_out);
-}
-
-template<int DataLayout>
-void test_hip_convolution_2d()
-{
-  Tensor<float, 4, DataLayout> input(74,37,11,137);
-  Tensor<float, 2, DataLayout> kernel(3,4);
-  Tensor<float, 4, DataLayout> out(74,35,8,137);
-  input = input.constant(10.0f) + input.random();
-  kernel = kernel.constant(7.0f) + kernel.random();
-
-  std::size_t input_bytes = input.size() * sizeof(float);
-  std::size_t kernel_bytes = kernel.size() * sizeof(float);
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_input;
-  float* d_kernel;
-  float* d_out;
-  hipMalloc((void**)(&d_input), input_bytes);
-  hipMalloc((void**)(&d_kernel), kernel_bytes);
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
-  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
-  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
-
-  Eigen::array<Eigen::DenseIndex, 2> dims{1,2};
-  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 74; ++i) {
-    for (int j = 0; j < 35; ++j) {
-      for (int k = 0; k < 8; ++k) {
-        for (int l = 0; l < 137; ++l) {
-          const float result = out(i,j,k,l);
-          const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
-                                 input(i,j+1,k+0,l) * kernel(1,0) +
-                                 input(i,j+2,k+0,l) * kernel(2,0) +
-                                 input(i,j+0,k+1,l) * kernel(0,1) +
-                                 input(i,j+1,k+1,l) * kernel(1,1) +
-                                 input(i,j+2,k+1,l) * kernel(2,1) +
-                                 input(i,j+0,k+2,l) * kernel(0,2) +
-                                 input(i,j+1,k+2,l) * kernel(1,2) +
-                                 input(i,j+2,k+2,l) * kernel(2,2) +
-                                 input(i,j+0,k+3,l) * kernel(0,3) +
-                                 input(i,j+1,k+3,l) * kernel(1,3) +
-                                 input(i,j+2,k+3,l) * kernel(2,3);
-          VERIFY_IS_APPROX(result, expected);
-        }
-      }
-    }
-  }
-
-  hipFree(d_input);
-  hipFree(d_kernel);
-  hipFree(d_out);
-}
-
-template<int DataLayout>
-void test_hip_convolution_3d()
-{
-  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>{74,37,11,137,17});
-  Tensor<float, 3, DataLayout> kernel(3,4,2);
-  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>{74,35,8,136,17});
-  input = input.constant(10.0f) + input.random();
-  kernel = kernel.constant(7.0f) + kernel.random();
-
-  std::size_t input_bytes = input.size() * sizeof(float);
-  std::size_t kernel_bytes = kernel.size() * sizeof(float);
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_input;
-  float* d_kernel;
-  float* d_out;
-  hipMalloc((void**)(&d_input), input_bytes);
-  hipMalloc((void**)(&d_kernel), kernel_bytes);
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;    
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
-  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
-  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
-
-  Eigen::array<Eigen::DenseIndex, 3> dims{1,2,3};
-  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 74; ++i) {
-    for (int j = 0; j < 35; ++j) {
-      for (int k = 0; k < 8; ++k) {
-        for (int l = 0; l < 136; ++l) {
-          for (int m = 0; m < 17; ++m) {
-            const float result = out(i,j,k,l,m);
-            const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
-                                   input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
-                                   input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
-                                   input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
-                                   input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
-                                   input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
-                                   input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
-                                   input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
-                                   input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
-                                   input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
-                                   input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
-                                   input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
-                                   input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
-                                   input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
-                                   input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
-                                   input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
-                                   input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
-                                   input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
-                                   input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
-                                   input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
-                                   input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
-                                   input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
-                                   input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
-                                   input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
-            VERIFY_IS_APPROX(result, expected);
-          }
-        }
-      }
-    }
-  }
-
-  hipFree(d_input);
-  hipFree(d_kernel);
-  hipFree(d_out);
-}
-
-
-template <typename Scalar>
-void test_hip_lgamma(const Scalar stddev)
-{
-  Tensor<Scalar, 2> in(72,97);
-  in.setRandom();
-  in *= in.constant(stddev);
-  Tensor<Scalar, 2> out(72,97);
-  out.setZero();
-
-  std::size_t bytes = in.size() * sizeof(Scalar);
-
-  Scalar* d_in;
-  Scalar* d_out;
-  hipMalloc((void**)(&d_in), bytes);
-  hipMalloc((void**)(&d_out), bytes);
-
-  hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
-
-  gpu_out.device(gpu_device) = gpu_in.lgamma();
-
-  assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 72; ++i) {
-    for (int j = 0; j < 97; ++j) {
-      VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
-    }
-  }
-
-  hipFree(d_in);
-  hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_digamma()
-{
-  Tensor<Scalar, 1> in(7);
-  Tensor<Scalar, 1> out(7);
-  Tensor<Scalar, 1> expected_out(7);
-  out.setZero();
-
-  in(0) = Scalar(1);
-  in(1) = Scalar(1.5);
-  in(2) = Scalar(4);
-  in(3) = Scalar(-10.5);
-  in(4) = Scalar(10000.5);
-  in(5) = Scalar(0);
-  in(6) = Scalar(-1);
-
-  expected_out(0) = Scalar(-0.5772156649015329);
-  expected_out(1) = Scalar(0.03648997397857645);
-  expected_out(2) = Scalar(1.2561176684318);
-  expected_out(3) = Scalar(2.398239129535781);
-  expected_out(4) = Scalar(9.210340372392849);
-  expected_out(5) = std::numeric_limits<Scalar>::infinity();
-  expected_out(6) = std::numeric_limits<Scalar>::infinity();
-
-  std::size_t bytes = in.size() * sizeof(Scalar);
-
-  Scalar* d_in;
-  Scalar* d_out;
-  hipMalloc((void**)(&d_in), bytes);
-  hipMalloc((void**)(&d_out), bytes);
-
-  hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
-
-  gpu_out.device(gpu_device) = gpu_in.digamma();
-
-  assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 5; ++i) {
-    VERIFY_IS_APPROX(out(i), expected_out(i));
-  }
-  for (int i = 5; i < 7; ++i) {
-    VERIFY_IS_EQUAL(out(i), expected_out(i));
-  }
-
-  hipFree(d_in);
-  hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_zeta()
-{
-  Tensor<Scalar, 1> in_x(6);
-  Tensor<Scalar, 1> in_q(6);
-  Tensor<Scalar, 1> out(6);
-  Tensor<Scalar, 1> expected_out(6);
-  out.setZero();
-
-  in_x(0) = Scalar(1);
-  in_x(1) = Scalar(1.5);
-  in_x(2) = Scalar(4);
-  in_x(3) = Scalar(-10.5);
-  in_x(4) = Scalar(10000.5);
-  in_x(5) = Scalar(3);
-  
-  in_q(0) = Scalar(1.2345);
-  in_q(1) = Scalar(2);
-  in_q(2) = Scalar(1.5);
-  in_q(3) = Scalar(3);
-  in_q(4) = Scalar(1.0001);
-  in_q(5) = Scalar(-2.5);
-
-  expected_out(0) = std::numeric_limits<Scalar>::infinity();
-  expected_out(1) = Scalar(1.61237534869);
-  expected_out(2) = Scalar(0.234848505667);
-  expected_out(3) = Scalar(1.03086757337e-5);
-  expected_out(4) = Scalar(0.367879440865);
-  expected_out(5) = Scalar(0.054102025820864097);
-
-  std::size_t bytes = in_x.size() * sizeof(Scalar);
-
-  Scalar* d_in_x;
-  Scalar* d_in_q;
-  Scalar* d_out;
-  hipMalloc((void**)(&d_in_x), bytes);
-  hipMalloc((void**)(&d_in_q), bytes);
-  hipMalloc((void**)(&d_out), bytes);
-
-  hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_in_q, in_q.data(), bytes, hipMemcpyHostToDevice);
-  
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
-
-  gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
-
-  assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  VERIFY_IS_EQUAL(out(0), expected_out(0));
-  VERIFY((std::isnan)(out(3)));
-
-  for (int i = 1; i < 6; ++i) {
-    if (i != 3) {
-      VERIFY_IS_APPROX(out(i), expected_out(i));
-    }
-  }
-
-  hipFree(d_in_x);
-  hipFree(d_in_q);
-  hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_polygamma()
-{
-  Tensor<Scalar, 1> in_x(7);
-  Tensor<Scalar, 1> in_n(7);
-  Tensor<Scalar, 1> out(7);
-  Tensor<Scalar, 1> expected_out(7);
-  out.setZero();
-
-  in_n(0) = Scalar(1);
-  in_n(1) = Scalar(1);
-  in_n(2) = Scalar(1);
-  in_n(3) = Scalar(17);
-  in_n(4) = Scalar(31);
-  in_n(5) = Scalar(28);
-  in_n(6) = Scalar(8);
-  
-  in_x(0) = Scalar(2);
-  in_x(1) = Scalar(3);
-  in_x(2) = Scalar(25.5);
-  in_x(3) = Scalar(4.7);
-  in_x(4) = Scalar(11.8);
-  in_x(5) = Scalar(17.7);
-  in_x(6) = Scalar(30.2);
-
-  expected_out(0) = Scalar(0.644934066848);
-  expected_out(1) = Scalar(0.394934066848);
-  expected_out(2) = Scalar(0.0399946696496);
-  expected_out(3) = Scalar(293.334565435);
-  expected_out(4) = Scalar(0.445487887616);
-  expected_out(5) = Scalar(-2.47810300902e-07);
-  expected_out(6) = Scalar(-8.29668781082e-09);
-
-  std::size_t bytes = in_x.size() * sizeof(Scalar);
-
-  Scalar* d_in_x;
-  Scalar* d_in_n;
-  Scalar* d_out;
-  hipMalloc((void**)(&d_in_x), bytes);
-  hipMalloc((void**)(&d_in_n), bytes);
-  hipMalloc((void**)(&d_out), bytes);
-
-  hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_in_n, in_n.data(), bytes, hipMemcpyHostToDevice);
-  
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
-
-  gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
-
-  assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 7; ++i) {
-    VERIFY_IS_APPROX(out(i), expected_out(i));
-  }
-
-  hipFree(d_in_x);
-  hipFree(d_in_n);
-  hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_igamma()
-{
-  Tensor<Scalar, 2> a(6, 6);
-  Tensor<Scalar, 2> x(6, 6);
-  Tensor<Scalar, 2> out(6, 6);
-  out.setZero();
-
-  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
-  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
-
-  for (int i = 0; i < 6; ++i) {
-    for (int j = 0; j < 6; ++j) {
-      a(i, j) = a_s[i];
-      x(i, j) = x_s[j];
-    }
-  }
-
-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-  Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
-                          {0.0, 0.6321205588285578, 0.7768698398515702,
-                           0.9816843611112658, 9.999500016666262e-05, 1.0},
-                          {0.0, 0.4275932955291202, 0.608374823728911,
-                           0.9539882943107686, 7.522076445089201e-07, 1.0},
-                          {0.0, 0.01898815687615381, 0.06564245437845008,
-                           0.5665298796332909, 4.166333347221828e-18, 1.0},
-                          {0.0, 0.9999780593618628, 0.9999899967080838,
-                           0.9999996219837988, 0.9991370418689945, 1.0},
-                          {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
-
-
-
-  std::size_t bytes = a.size() * sizeof(Scalar);
-
-  Scalar* d_a;
-  Scalar* d_x;
-  Scalar* d_out;
-  assert(hipMalloc((void**)(&d_a), bytes) == hipSuccess);
-  assert(hipMalloc((void**)(&d_x), bytes) == hipSuccess);
-  assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
-
-  hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
-
-  gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
-
-  assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 6; ++i) {
-    for (int j = 0; j < 6; ++j) {
-      if ((std::isnan)(igamma_s[i][j])) {
-        VERIFY((std::isnan)(out(i, j)));
-      } else {
-        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
-      }
-    }
-  }
-
-  hipFree(d_a);
-  hipFree(d_x);
-  hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_igammac()
-{
-  Tensor<Scalar, 2> a(6, 6);
-  Tensor<Scalar, 2> x(6, 6);
-  Tensor<Scalar, 2> out(6, 6);
-  out.setZero();
-
-  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
-  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
-
-  for (int i = 0; i < 6; ++i) {
-    for (int j = 0; j < 6; ++j) {
-      a(i, j) = a_s[i];
-      x(i, j) = x_s[j];
-    }
-  }
-
-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-  Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
-                           {1.0, 0.36787944117144233, 0.22313016014842982,
-                            0.018315638888734182, 0.9999000049998333, 0.0},
-                           {1.0, 0.5724067044708798, 0.3916251762710878,
-                            0.04601170568923136, 0.9999992477923555, 0.0},
-                           {1.0, 0.9810118431238462, 0.9343575456215499,
-                            0.4334701203667089, 1.0, 0.0},
-                           {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
-                            3.7801620118431334e-07, 0.0008629581310054535,
-                            0.0},
-                           {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
-
-  std::size_t bytes = a.size() * sizeof(Scalar);
-
-  Scalar* d_a;
-  Scalar* d_x;
-  Scalar* d_out;
-  hipMalloc((void**)(&d_a), bytes);
-  hipMalloc((void**)(&d_x), bytes);
-  hipMalloc((void**)(&d_out), bytes);
-
-  hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
-
-  gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
-
-  assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 6; ++i) {
-    for (int j = 0; j < 6; ++j) {
-      if ((std::isnan)(igammac_s[i][j])) {
-        VERIFY((std::isnan)(out(i, j)));
-      } else {
-        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
-      }
-    }
-  }
-
-  hipFree(d_a);
-  hipFree(d_x);
-  hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_erf(const Scalar stddev)
-{
-  Tensor<Scalar, 2> in(72,97);
-  in.setRandom();
-  in *= in.constant(stddev);
-  Tensor<Scalar, 2> out(72,97);
-  out.setZero();
-
-  std::size_t bytes = in.size() * sizeof(Scalar);
-
-  Scalar* d_in;
-  Scalar* d_out;
-  assert(hipMalloc((void**)(&d_in), bytes) == hipSuccess);
-  assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
-
-  hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
-
-  gpu_out.device(gpu_device) = gpu_in.erf();
-
-  assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 72; ++i) {
-    for (int j = 0; j < 97; ++j) {
-      VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
-    }
-  }
-
-  hipFree(d_in);
-  hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_erfc(const Scalar stddev)
-{
-  Tensor<Scalar, 2> in(72,97);
-  in.setRandom();
-  in *= in.constant(stddev);
-  Tensor<Scalar, 2> out(72,97);
-  out.setZero();
-
-  std::size_t bytes = in.size() * sizeof(Scalar);
-
-  Scalar* d_in;
-  Scalar* d_out;
-  hipMalloc((void**)(&d_in), bytes);
-  hipMalloc((void**)(&d_out), bytes);
-
-  hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
-
-  gpu_out.device(gpu_device) = gpu_in.erfc();
-
-  assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 0; i < 72; ++i) {
-    for (int j = 0; j < 97; ++j) {
-      VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
-    }
-  }
-
-  hipFree(d_in);
-  hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_betainc()
-{
-  Tensor<Scalar, 1> in_x(125);
-  Tensor<Scalar, 1> in_a(125);
-  Tensor<Scalar, 1> in_b(125);
-  Tensor<Scalar, 1> out(125);
-  Tensor<Scalar, 1> expected_out(125);
-  out.setZero();
-
-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-
-  Array<Scalar, 1, Dynamic> x(125);
-  Array<Scalar, 1, Dynamic> a(125);
-  Array<Scalar, 1, Dynamic> b(125);
-  Array<Scalar, 1, Dynamic> v(125);
-
-  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
-      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
-      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
-      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
-      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
-      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
-
-  b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
-      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
-      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
-      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
-      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
-      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
-      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
-      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
-      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
-      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
-      999.999, 999.999, 999.999;
-
-  x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
-      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
-      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
-      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
-      0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
-      -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
-      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
-      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
-      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
-
-  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
-      nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
-      nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
-      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
-      0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
-      0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
-      nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
-      0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
-      0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
-      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
-      0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
-      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
-      nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
-      nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
-      nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
-      nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
-      2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
-      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
-      2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
-      0.5000000000004947, 0.9999999999999999, nan;
-
-  for (int i = 0; i < 125; ++i) {
-    in_x(i) = x(i);
-    in_a(i) = a(i);
-    in_b(i) = b(i);
-    expected_out(i) = v(i);
-  }
-
-  std::size_t bytes = in_x.size() * sizeof(Scalar);
-
-  Scalar* d_in_x;
-  Scalar* d_in_a;
-  Scalar* d_in_b;
-  Scalar* d_out;
-  hipMalloc((void**)(&d_in_x), bytes);
-  hipMalloc((void**)(&d_in_a), bytes);
-  hipMalloc((void**)(&d_in_b), bytes);
-  hipMalloc((void**)(&d_out), bytes);
-
-  hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_in_a, in_a.data(), bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_in_b, in_b.data(), bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
-
-  gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
-
-  assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  for (int i = 1; i < 125; ++i) {
-    if ((std::isnan)(expected_out(i))) {
-      VERIFY((std::isnan)(out(i)));
-    } else {
-      VERIFY_IS_APPROX(out(i), expected_out(i));
-    }
-  }
-
-  hipFree(d_in_x);
-  hipFree(d_in_a);
-  hipFree(d_in_b);
-  hipFree(d_out);
-}
-
-
-void test_cxx11_tensor_hip()
-{
-  CALL_SUBTEST(test_hip_nullary());
-  CALL_SUBTEST(test_hip_elementwise_small());
-  CALL_SUBTEST(test_hip_elementwise());
-  CALL_SUBTEST(test_hip_props());
-  CALL_SUBTEST(test_hip_reduction());
-  CALL_SUBTEST(test_hip_contraction<ColMajor>());
-  CALL_SUBTEST(test_hip_contraction<RowMajor>());
-  CALL_SUBTEST(test_hip_convolution_1d<ColMajor>());
-  CALL_SUBTEST(test_hip_convolution_1d<RowMajor>());
-  CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d());
-  CALL_SUBTEST(test_hip_convolution_inner_dim_row_major_1d());
-  CALL_SUBTEST(test_hip_convolution_2d<ColMajor>());
-  CALL_SUBTEST(test_hip_convolution_2d<RowMajor>());
-  // The following two tests commented out due to long runtime
-  // CALL_SUBTEST(test_hip_convolution_3d<ColMajor>());
-  // CALL_SUBTEST(test_hip_convolution_3d<RowMajor>());
-
-#if __cplusplus > 199711L
-  // std::erf, std::erfc, and so on where only added in c++11. We use them
-  // as a golden reference to validate the results produced by Eigen. Therefore
-  // we can only run these tests if we use a c++11 compiler.
-
-  CALL_SUBTEST(test_hip_lgamma<float>(1.0f));
-  CALL_SUBTEST(test_hip_lgamma<float>(100.0f));
-  CALL_SUBTEST(test_hip_lgamma<float>(0.01f));
-  CALL_SUBTEST(test_hip_lgamma<float>(0.001f));
-
-  CALL_SUBTEST(test_hip_lgamma<double>(1.0));
-  CALL_SUBTEST(test_hip_lgamma<double>(100.0));
-  CALL_SUBTEST(test_hip_lgamma<double>(0.01));
-  CALL_SUBTEST(test_hip_lgamma<double>(0.001));
-
-  CALL_SUBTEST(test_hip_erf<float>(1.0f));
-  CALL_SUBTEST(test_hip_erf<float>(100.0f));
-  CALL_SUBTEST(test_hip_erf<float>(0.01f));
-  CALL_SUBTEST(test_hip_erf<float>(0.001f));
-
-  CALL_SUBTEST(test_hip_erfc<float>(1.0f));
-  // CALL_SUBTEST(test_hip_erfc<float>(100.0f)); // HIP erfc lacks precision for large inputs
-  CALL_SUBTEST(test_hip_erfc<float>(5.0f)); 
-  CALL_SUBTEST(test_hip_erfc<float>(0.01f));
-  CALL_SUBTEST(test_hip_erfc<float>(0.001f));
-
-  CALL_SUBTEST(test_hip_erf<double>(1.0));
-  CALL_SUBTEST(test_hip_erf<double>(100.0));
-  CALL_SUBTEST(test_hip_erf<double>(0.01));
-  CALL_SUBTEST(test_hip_erf<double>(0.001));
-
-  CALL_SUBTEST(test_hip_erfc<double>(1.0));
-  // CALL_SUBTEST(test_hip_erfc<double>(100.0)); // HIP erfc lacks precision for large inputs
-  CALL_SUBTEST(test_hip_erfc<double>(5.0)); 
-  CALL_SUBTEST(test_hip_erfc<double>(0.01));
-  CALL_SUBTEST(test_hip_erfc<double>(0.001));
-
-  // Following tests have functional failures on some seeds
-  // CALL_SUBTEST(test_hip_digamma<float>());
-  // CALL_SUBTEST(test_hip_digamma<double>());
-
-  // Following tests have functional failures on some seeds
-  // CALL_SUBTEST(test_hip_polygamma<float>());
-  // CALL_SUBTEST(test_hip_polygamma<double>());
-
-  // Following tests have functional failures on some seeds
-  // CALL_SUBTEST(test_hip_zeta<float>());
-  // CALL_SUBTEST(test_hip_zeta<double>());
-
-  CALL_SUBTEST(test_hip_igamma<float>());
-  CALL_SUBTEST(test_hip_igammac<float>());
-
-  CALL_SUBTEST(test_hip_igamma<double>());
-  CALL_SUBTEST(test_hip_igammac<double>());
-
-  CALL_SUBTEST(test_hip_betainc<float>());
-  CALL_SUBTEST(test_hip_betainc<double>());
-#endif
-}
diff --git a/unsupported/test/cxx11_tensor_of_float16_hip.cu b/unsupported/test/cxx11_tensor_of_float16_hip.cu
deleted file mode 100644
index 6c1a401bf..000000000
--- a/unsupported/test/cxx11_tensor_of_float16_hip.cu
+++ /dev/null
@@ -1,498 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-using Eigen::Tensor;
-
-template<typename>
-void test_hip_numext() {
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = 101;
-
-  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
-  bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
-      d_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
-      d_res_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
-      d_res_float, num_elem);
-
-  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
-  gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
-  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
-
-  Tensor<bool, 1> half_prec(num_elem);
-  Tensor<bool, 1> full_prec(num_elem);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking numext " << i << std::endl;
-    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
-  }
-
-  gpu_device.deallocate(d_float);
-  gpu_device.deallocate(d_res_half);
-  gpu_device.deallocate(d_res_float);
-}
-
-
-#ifdef EIGEN_HAS_HIP_FP16
-
-template<typename>
-void test_hip_conversion() {
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = 101;
-  
-  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
-      d_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
-      d_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
-      d_conv, num_elem);
-
-  gpu_float.device(gpu_device) = gpu_float.random();
-  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
-  gpu_conv.device(gpu_device) = gpu_half.cast<float>();
-
-  Tensor<float, 1> initial(num_elem);
-  Tensor<float, 1> final(num_elem);
-  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
-
-  for (int i = 0; i < num_elem; ++i) {
-    VERIFY_IS_APPROX(initial(i), final(i));
-  }
-
-  gpu_device.deallocate(d_float);
-  gpu_device.deallocate(d_half);
-  gpu_device.deallocate(d_conv);
-}
-
-template<typename>
-void test_hip_unary() {
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = 101;
-
-  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
-      d_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
-      d_res_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
-      d_res_float, num_elem);
-
-  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
-  gpu_res_float.device(gpu_device) = gpu_float.abs();
-  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
-
-  Tensor<float, 1> half_prec(num_elem);
-  Tensor<float, 1> full_prec(num_elem);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking unary " << i << std::endl;
-    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
-  }
-
-  gpu_device.deallocate(d_float);
-  gpu_device.deallocate(d_res_half);
-  gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_elementwise() {
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = 101;
-
-  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
-      d_float1, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
-      d_float2, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
-      d_res_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
-      d_res_float, num_elem);
-
-  gpu_float1.device(gpu_device) = gpu_float1.random();
-  gpu_float2.device(gpu_device) = gpu_float2.random();
-  gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
-  gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
-
-  Tensor<float, 1> half_prec(num_elem);
-  Tensor<float, 1> full_prec(num_elem);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
-    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
-  }
-
-  gpu_device.deallocate(d_float1);
-  gpu_device.deallocate(d_float2);
-  gpu_device.deallocate(d_res_half);
-  gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_trancendental() {
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = 101;
-
-  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
-
-  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
-  gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
-  gpu_float3.device(gpu_device) = gpu_float3.random();
-  gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
-  gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
-  gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
-  gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
-
-  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
-  gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
-
-  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
-  gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
-
-  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
-  gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
-
-  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
-  gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
-
-  Tensor<float, 1> input1(num_elem);
-  Tensor<Eigen::half, 1> half_prec1(num_elem);
-  Tensor<Eigen::half, 1> full_prec1(num_elem);
-  Tensor<float, 1> input2(num_elem);
-  Tensor<Eigen::half, 1> half_prec2(num_elem);
-  Tensor<Eigen::half, 1> full_prec2(num_elem);
-  Tensor<float, 1> input3(num_elem);
-  Tensor<Eigen::half, 1> half_prec3(num_elem);
-  Tensor<Eigen::half, 1> full_prec3(num_elem);
-  gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
-    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
-  }
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
-    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
-      VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
-    else
-      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
-  }
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
-    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
-  }
-  gpu_device.deallocate(d_float1);
-  gpu_device.deallocate(d_float2);
-  gpu_device.deallocate(d_float3);
-  gpu_device.deallocate(d_res1_half);
-  gpu_device.deallocate(d_res1_float);
-  gpu_device.deallocate(d_res2_half);
-  gpu_device.deallocate(d_res2_float);
-  gpu_device.deallocate(d_res3_float);
-  gpu_device.deallocate(d_res3_half);
-}
-
-template<typename>
-void test_hip_contractions() {
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int rows = 23;
-  int cols = 23;
-  int num_elem = rows*cols;
-
-  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
-      d_float1, rows, cols);
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
-      d_float2, rows, cols);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
-      d_res_half, rows, cols);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
-      d_res_float, rows, cols);
-
-  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
-  gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
-
-  typedef Tensor<float, 2>::DimensionPair DimPair;
-  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
-  gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
-
-  Tensor<Eigen::half, 2> half_prec(rows, cols);
-  Tensor<Eigen::half, 2> full_prec(rows, cols);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
-  gpu_device.synchronize();
-
-  for (int i = 0; i < rows; ++i) {
-    for (int j = 0; j < cols; ++j) {
-      std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
-      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
-        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
-      }
-    }
-  }
-
-  gpu_device.deallocate(d_float1);
-  gpu_device.deallocate(d_float2);
-  gpu_device.deallocate(d_res_half);
-  gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_reductions(int size1, int size2, int redux) {
-
-   std::cout << "Reducing " << size1 << " by " << size2
-             << " tensor along dim " << redux << std::endl; 
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = size1*size2;
-  int result_size = (redux == 1 ? size1 : size2);
-
-  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
-  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
-      d_float1, size1, size2);
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
-      d_float2, size1, size2);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
-      d_res_half, result_size);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
-      d_res_float, result_size);
-
-  gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
-  gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
-
-  Eigen::array<int, 1> redux_dim(redux);
-  gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
-
-  Tensor<Eigen::half, 1> half_prec(result_size);
-  Tensor<Eigen::half, 1> full_prec(result_size);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
-  gpu_device.synchronize();
-
-  for (int i = 0; i < result_size; ++i) {
-    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
-    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
-  }
-
-  gpu_device.deallocate(d_float1);
-  gpu_device.deallocate(d_float2);
-  gpu_device.deallocate(d_res_half);
-  gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_reductions() {
-  test_hip_reductions<void>(13, 13, 0);
-  test_hip_reductions<void>(13, 13, 1);
-
-  test_hip_reductions<void>(35, 36, 0);
-  test_hip_reductions<void>(35, 36, 1);
-
-  test_hip_reductions<void>(36, 35, 0);
-  test_hip_reductions<void>(36, 35, 1);
-}
-
-template<typename>
-void test_hip_full_reductions() {
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int size = 13;
-  int num_elem = size*size;
-
-  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
-  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
-      d_float1, size, size);
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
-      d_float2, size, size);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
-      d_res_half);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
-      d_res_float);
-
-  gpu_float1.device(gpu_device) = gpu_float1.random();
-  gpu_float2.device(gpu_device) = gpu_float2.random();
-
-  gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
-
-  Tensor<Eigen::half, 0> half_prec;
-  Tensor<Eigen::half, 0> full_prec;
-  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
-  gpu_device.synchronize();
-
-  VERIFY_IS_APPROX(full_prec(), half_prec());
-
-  gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
-  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
-  gpu_device.synchronize();
-
-  VERIFY_IS_APPROX(full_prec(), half_prec());
-
-  gpu_device.deallocate(d_float1);
-  gpu_device.deallocate(d_float2);
-  gpu_device.deallocate(d_res_half);
-  gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_forced_evals() {
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = 101;
-
-  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
-      d_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
-      d_res_half1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
-      d_res_half2, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
-      d_res_float, num_elem);
-
-  Eigen::array<int, 1> no_bcast;
-  no_bcast[0] = 1;
-
-  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
-  gpu_res_float.device(gpu_device) = gpu_float.abs();
-  gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
-  gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
-
-  Tensor<float, 1> half_prec1(num_elem);
-  Tensor<float, 1> half_prec2(num_elem);
-  Tensor<float, 1> full_prec(num_elem);
-  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
-    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
-    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
-  }
-
-  gpu_device.deallocate(d_float);
-  gpu_device.deallocate(d_res_half1);
-  gpu_device.deallocate(d_res_half2);
-  gpu_device.deallocate(d_res_float);
-}
-#endif
-
-
-void test_cxx11_tensor_of_float16_hip()
-{
-  CALL_SUBTEST(test_hip_numext<void>());
-
-#ifdef EIGEN_HAS_HIP_FP16
-  CALL_SUBTEST(test_hip_conversion<void>());
-  CALL_SUBTEST(test_hip_unary<void>());
-  CALL_SUBTEST(test_hip_elementwise<void>());
-  CALL_SUBTEST(test_hip_trancendental<void>());
-  CALL_SUBTEST(test_hip_contractions<void>());
-  CALL_SUBTEST(test_hip_reductions<void>());
-  CALL_SUBTEST(test_hip_full_reductions<void>());
-  CALL_SUBTEST(test_hip_forced_evals<void>());
-#else
-  std::cout << "Half floats are not supported by this version of hip: skipping the test" << std::endl;
-#endif
-}
diff --git a/unsupported/test/cxx11_tensor_random_hip.cu b/unsupported/test/cxx11_tensor_random_hip.cu
deleted file mode 100644
index 7d7e72ca2..000000000
--- a/unsupported/test/cxx11_tensor_random_hip.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_random_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <Eigen/CXX11/Tensor>
-
-
-void test_hip_random_uniform()
-{
-  Tensor<float, 2> out(72,97);
-  out.setZero();
-
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_out;
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
-
-  gpu_out.device(gpu_device) = gpu_out.random();
-
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  // For now we just check thes code doesn't crash.
-  // TODO: come up with a valid test of randomness
-}
-
-
-void test_hip_random_normal()
-{
-  Tensor<float, 2> out(72,97);
-  out.setZero();
-
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_out;
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
-
-  Eigen::internal::NormalRandomGenerator<float> gen(true);
-  gpu_out.device(gpu_device) = gpu_out.random(gen);
-
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-}
-
-static void test_complex()
-{
-  Tensor<std::complex<float>, 1> vec(6);
-  vec.setRandom();
-
-  // Fixme: we should check that the generated numbers follow a uniform
-  // distribution instead.
-  for (int i = 1; i < 6; ++i) {
-    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
-  }
-}
-
-
-void test_cxx11_tensor_random_hip()
-{
-  CALL_SUBTEST(test_hip_random_uniform());
-  CALL_SUBTEST(test_hip_random_normal());
-  CALL_SUBTEST(test_complex());
-}
diff --git a/unsupported/test/cxx11_tensor_reduction_hip.cu b/unsupported/test/cxx11_tensor_reduction_hip.cu
deleted file mode 100644
index c5aad05be..000000000
--- a/unsupported/test/cxx11_tensor_reduction_hip.cu
+++ /dev/null
@@ -1,154 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_hip
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-template<typename Type, int DataLayout>
-static void test_full_reductions() {
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  const int num_rows = internal::random<int>(1024, 5*1024);
-  const int num_cols = internal::random<int>(1024, 5*1024);
-
-  Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
-  in.setRandom();
-
-  Tensor<Type, 0, DataLayout> full_redux;
-  full_redux = in.sum();
-
-  std::size_t in_bytes = in.size() * sizeof(Type);
-  std::size_t out_bytes = full_redux.size() * sizeof(Type);
-  Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
-  Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
-  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
-
-  TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
-  TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
-
-  out_gpu.device(gpu_device) = in_gpu.sum();
-
-  Tensor<Type, 0, DataLayout> full_redux_gpu;
-  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
-  gpu_device.synchronize();
-
-  // Check that the CPU and GPU reductions return the same result.
-  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
-
-  gpu_device.deallocate(gpu_in_ptr);
-  gpu_device.deallocate(gpu_out_ptr);
-}
-
-template<typename Type, int DataLayout>
-static void test_first_dim_reductions() {
-  int dim_x = 33;
-  int dim_y = 1;
-  int dim_z = 128;
-
-  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
-  in.setRandom();
-
-  Eigen::array<int, 1> red_axis;
-  red_axis[0] = 0;
-  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
-
-  // Create device
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice dev(&stream);
-  
-  // Create data(T)
-  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
-  Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
-  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
-  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
-  
-  // Perform operation
-  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
-  gpu_out.device(dev) = gpu_in.sum(red_axis);
-  gpu_out.device(dev) += gpu_in.sum(red_axis);
-  Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
-  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
-  dev.synchronize();
-
-  // Check that the CPU and GPU reductions return the same result.
-  for (int i = 0; i < gpu_out.size(); ++i) {
-    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
-  }
-
-  dev.deallocate(in_data);
-  dev.deallocate(out_data);
-}
-
-template<typename Type, int DataLayout>
-static void test_last_dim_reductions() {
-  int dim_x = 128;
-  int dim_y = 1;
-  int dim_z = 33;
-
-  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
-  in.setRandom();
-
-  Eigen::array<int, 1> red_axis;
-  red_axis[0] = 2;
-  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
-
-  // Create device
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice dev(&stream);
-  
-  // Create data
-  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
-  Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
-  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
-  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
-  
-  // Perform operation
-  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
-  gpu_out.device(dev) = gpu_in.sum(red_axis);
-  gpu_out.device(dev) += gpu_in.sum(red_axis);
-  Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
-  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
-  dev.synchronize();
-
-  // Check that the CPU and GPU reductions return the same result.
-  for (int i = 0; i < gpu_out.size(); ++i) {
-    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
-  }
-
-  dev.deallocate(in_data);
-  dev.deallocate(out_data);
-}
-
-
-void test_cxx11_tensor_reduction_hip() {
-  CALL_SUBTEST((test_full_reductions<float, ColMajor>()));
-  CALL_SUBTEST((test_full_reductions<double, ColMajor>()));
-  CALL_SUBTEST((test_full_reductions<float, RowMajor>()));
-  CALL_SUBTEST((test_full_reductions<double, RowMajor>()));
-  
-  CALL_SUBTEST((test_first_dim_reductions<float, ColMajor>()));
-  CALL_SUBTEST((test_first_dim_reductions<double, ColMajor>()));
-  CALL_SUBTEST((test_first_dim_reductions<float, RowMajor>()));
-// Outer reductions of doubles aren't supported just yet.  					      
-//  CALL_SUBTEST((test_first_dim_reductions<double, RowMajor>()))
-
-  CALL_SUBTEST((test_last_dim_reductions<float, ColMajor>()));
-// Outer reductions of doubles aren't supported just yet.  					      
-//  CALL_SUBTEST((test_last_dim_reductions<double, ColMajor>()));
-  CALL_SUBTEST((test_last_dim_reductions<float, RowMajor>()));
-  CALL_SUBTEST((test_last_dim_reductions<double, RowMajor>()));
-}
diff --git a/unsupported/test/cxx11_tensor_scan_hip.cu b/unsupported/test/cxx11_tensor_scan_hip.cu
deleted file mode 100644
index f4c4f59b9..000000000
--- a/unsupported/test/cxx11_tensor_scan_hip.cu
+++ /dev/null
@@ -1,76 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_scan_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-typedef Tensor<float, 1>::DimensionPair DimPair;
-
-template<int DataLayout>
-void test_hip_cumsum(int m_size, int k_size, int n_size)
-{
-  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
-  Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
-  Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
-  Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);
-
-  t_input.setRandom();
-
-  std::size_t t_input_bytes = t_input.size()  * sizeof(float);
-  std::size_t t_result_bytes = t_result.size() * sizeof(float);
-
-  float* d_t_input;
-  float* d_t_result;
-
-  hipMalloc((void**)(&d_t_input), t_input_bytes);
-  hipMalloc((void**)(&d_t_result), t_result_bytes);
-
-  hipMemcpy(d_t_input, t_input.data(), t_input_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
-      gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
-  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
-      gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
-
-  gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
-  t_result = t_input.cumsum(1);
-
-  hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
-  for (DenseIndex i = 0; i < t_result.size(); i++) {
-    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
-      continue;
-    }
-    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
-      continue;
-    }
-    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
-              << " vs " <<  t_result_gpu(i) << std::endl;
-    assert(false);
-  }
-
-  hipFree((void*)d_t_input);
-  hipFree((void*)d_t_result);
-}
-
-
-void test_cxx11_tensor_scan_hip()
-{
-  CALL_SUBTEST(test_hip_cumsum<ColMajor>(128, 128, 128));
-  CALL_SUBTEST(test_hip_cumsum<RowMajor>(128, 128, 128));
-}