path: root/unsupported
author    Deven Desai <deven.desai.amd@gmail.com>    2018-06-06 10:12:58 -0400
committer Deven Desai <deven.desai.amd@gmail.com>    2018-06-06 10:12:58 -0400
commit    8fbd47052bcafea612b8ae2841c1de5db738f042 (patch)
tree      1e8d3f8ab0bc9e48e18b0502b7d51500e72a7266 /unsupported
parent    e206f8d4a401fe2060bada4d4b5d92e3bf3b561c (diff)
Adding support for using Eigen in HIP kernels.
This commit enables the use of Eigen in HIP kernels / on AMD GPUs. Support has been added along the same lines as the existing support for using Eigen in CUDA kernels / on NVidia GPUs.

Application code needs to explicitly define EIGEN_USE_HIP when using Eigen in HIP kernels. This is because some of the CUDA headers are picked up by default during an Eigen compile, irrespective of whether the underlying compiler is CUDACC/NVCC (for example Eigen/src/Core/arch/CUDA/Half.h). To keep that behavior intact, the EIGEN_USE_HIP macro is used to switch those includes over to their HIP counterparts (see Eigen/Core and unsupported/Eigen/CXX11/Tensor).

Use the "-DEIGEN_TEST_HIP" cmake option to enable the HIP-specific unit tests.
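For orientation, a minimal sketch of what application code compiled with hipcc might look like is given below. It is not part of this commit: the file name, the scale_kernel function, and the launch configuration are made up for illustration, and only the EIGEN_USE_HIP / EIGEN_USE_GPU defines and the hipcc toolchain are implied by the commit message above.

    // scale_hip.cu -- hypothetical example, assuming a HIP toolchain and an Eigen checkout with this patch.
    // Build (roughly): hipcc -DEIGEN_USE_HIP -DEIGEN_USE_GPU -I<eigen-root> scale_hip.cu -o scale_hip
    #define EIGEN_USE_HIP
    #define EIGEN_USE_GPU
    #include <hip/hip_runtime.h>
    #include <unsupported/Eigen/CXX11/Tensor>

    // Each thread scales one element through an Eigen::TensorMap wrapping device memory.
    __global__ void scale_kernel(float* data, int n, float alpha) {
      const int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
      if (i < n) {
        Eigen::TensorMap<Eigen::Tensor<float, 1> > t(data, n);
        t(i) = alpha * t(i);
      }
    }

    int main() {
      const int n = 1024;
      float* d_data = NULL;
      hipMalloc((void**)&d_data, n * sizeof(float));
      hipMemset(d_data, 0, n * sizeof(float));
      hipLaunchKernelGGL(scale_kernel, dim3(n / 256), dim3(256), 0, 0, d_data, n, 2.0f);
      hipDeviceSynchronize();
      hipFree(d_data);
      return 0;
    }

The HIP unit tests added under unsupported/test (enabled via "-DEIGEN_TEST_HIP") exercise these code paths far more thoroughly.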
Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/Tensor                                   |   40
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h           |   10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h   |    5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h        | 1521
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h        | 1119
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h         |   15
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h             |  352
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h              |   16
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h            |    5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h             |    6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h                |    2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h                  |    2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h              |    5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h                |    2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h             |   24
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h          |  815
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h                  |    9
-rw-r--r--  unsupported/Eigen/CXX11/src/util/CXX11Meta.h                     |   12
-rw-r--r--  unsupported/Eigen/CXX11/src/util/EmulateArray.h                  |    2
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h    |    4
-rw-r--r--  unsupported/test/CMakeLists.txt                                  |   52
-rw-r--r--  unsupported/test/cxx11_tensor_argmax_hip.cu                      |  251
-rw-r--r--  unsupported/test/cxx11_tensor_cast_float16_hip.cu                |   79
-rw-r--r--  unsupported/test/cxx11_tensor_contract_hip.cu                    |  215
-rw-r--r--  unsupported/test/cxx11_tensor_device_hip.cu                      |  389
-rw-r--r--  unsupported/test/cxx11_tensor_hip.cu                             | 1296
-rw-r--r--  unsupported/test/cxx11_tensor_of_float16_hip.cu                  |  498
-rw-r--r--  unsupported/test/cxx11_tensor_random_hip.cu                      |   85
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_hip.cu                   |  154
-rw-r--r--  unsupported/test/cxx11_tensor_scan_hip.cu                        |   76
30 files changed, 7025 insertions, 36 deletions
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index d243fe035..4b7c7d724 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -80,12 +80,16 @@ typedef unsigned __int64 uint64_t;
#endif
#ifdef EIGEN_USE_GPU
-#include <iostream>
-#include <cuda_runtime.h>
-#if __cplusplus >= 201103L
-#include <atomic>
-#include <unistd.h>
-#endif
+ #include <iostream>
+ #if defined(EIGEN_USE_HIP)
+ #include <hip/hip_runtime.h>
+ #else
+ #include <cuda_runtime.h>
+ #endif
+ #if __cplusplus >= 201103L
+ #include <atomic>
+ #include <unistd.h>
+ #endif
#endif
#include "src/Tensor/TensorMacros.h"
@@ -95,7 +99,11 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorDeviceDefault.h"
#include "src/Tensor/TensorDeviceThreadPool.h"
-#include "src/Tensor/TensorDeviceCuda.h"
+#if defined(EIGEN_USE_HIP)
+ #include "src/Tensor/TensorDeviceHip.h"
+#else
+ #include "src/Tensor/TensorDeviceCuda.h"
+#endif
#include "src/Tensor/TensorDeviceSycl.h"
#include "src/Tensor/TensorIndexList.h"
#include "src/Tensor/TensorDimensionList.h"
@@ -112,16 +120,28 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
-#include "src/Tensor/TensorReductionCuda.h"
+#if defined(EIGEN_USE_HIP)
+ #include "src/Tensor/TensorReductionHip.h"
+#else
+ #include "src/Tensor/TensorReductionCuda.h"
+#endif
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"
-#include "src/Tensor/TensorContractionCuda.h"
+#if defined(EIGEN_USE_HIP)
+ #include "src/Tensor/TensorContractionHip.h"
+#else
+ #include "src/Tensor/TensorContractionCuda.h"
+#endif
#include "src/Tensor/TensorConversion.h"
-#include "src/Tensor/TensorConvolution.h"
+#if defined(EIGEN_USE_HIP)
+ #include "src/Tensor/TensorConvolutionHip.h"
+#else
+ #include "src/Tensor/TensorConvolution.h"
+#endif
#include "src/Tensor/TensorFFT.h"
#include "src/Tensor/TensorPatch.h"
#include "src/Tensor/TensorImagePatch.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index e72ddb4a9..979fcf4d9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -448,7 +448,10 @@ struct TensorContractionEvaluatorBase
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ void evalGemv(Scalar* buffer) const {
const Index rows = m_i_size;
const Index cols = m_k_size;
@@ -489,7 +492,10 @@ struct TensorContractionEvaluatorBase
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ void evalGemm(Scalar* buffer) const {
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
if (m_can_use_xsmm) {
evalGemmXSMM(buffer);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index d34f9caee..4853dd37b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -28,7 +28,10 @@ class TensorContractionBlocking {
typedef typename LhsMapper::Scalar LhsScalar;
typedef typename RhsMapper::Scalar RhsScalar;
- EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
kc_(k), mc_(m), nc_(n)
{
if (ShardingType == ShardByCol) {
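Both hunks above, and the evalGemv/evalGemm hunks in TensorContraction.h before them, apply the same guard: when the compiler is hipcc (which defines EIGEN_HIPCC), the EIGEN_DEVICE_FUNC annotation is dropped and the member stays a plain host function. The snippet below is a hypothetical, self-contained illustration of that guard pattern only; the BlockingSketch class and the EIGEN_DEVICE_FUNC fallback are made up so it builds outside of Eigen, and the commit itself does not state why hipcc needs the annotation removed here.

    #include <cstddef>

    // EIGEN_DEVICE_FUNC normally expands to __host__ __device__ in Eigen's GPU builds.
    // Provide a no-op fallback so this standalone sketch compiles as a host-only program.
    #ifndef EIGEN_DEVICE_FUNC
    #define EIGEN_DEVICE_FUNC
    #endif

    template <typename Index>
    class BlockingSketch {
     public:
      // Mirror of the guard added above: under hipcc (EIGEN_HIPCC defined) the
      // constructor is deliberately left without the device annotation.
      #if !defined(EIGEN_HIPCC)
      EIGEN_DEVICE_FUNC
      #endif
      BlockingSketch(Index k, Index m, Index n) : kc_(k), mc_(m), nc_(n) {}

      Index kc() const { return kc_; }
      Index mc() const { return mc_; }
      Index nc() const { return nc_; }

     private:
      Index kc_, mc_, nc_;
    };

    int main() {
      BlockingSketch<std::size_t> blocking(64, 128, 128);
      return blocking.kc() == 64 ? 0 : 1;
    }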
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h
new file mode 100644
index 000000000..7561846a3
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h
@@ -0,0 +1,1521 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
+// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool needs_edge_check>
+__device__ EIGEN_STRONG_INLINE void
+EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
+ const Index m_size, const Index n_size, const Index k_size) {
+
+ const Index m_block_idx = hipBlockIdx_x;
+ const Index n_block_idx = hipBlockIdx_y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ // declare and initialize 64 registers for output 8x8 block
+
+ // prefetch registers
+ Scalar lhs_pf0;
+ Scalar lhs_pf1;
+ Scalar lhs_pf2;
+ Scalar lhs_pf3;
+ Scalar lhs_pf4;
+ Scalar lhs_pf5;
+ Scalar lhs_pf6;
+ Scalar lhs_pf7;
+
+ Scalar rhs_pf0;
+ Scalar rhs_pf1;
+ Scalar rhs_pf2;
+ Scalar rhs_pf3;
+ Scalar rhs_pf4;
+ Scalar rhs_pf5;
+ Scalar rhs_pf6;
+ Scalar rhs_pf7;
+
+ // shared memory is formatted
+ // (contract idx in block, nocontract idx in block, block idx)
+ // where block idx is column major. This transposition limits the number of
+ // bank conflicts when reading the LHS. The core idea is that since the contracting
+ // index is shared by both sides, then the contracting index should be in hipThreadIdx_x.
+
+ // On the LHS, we pad each row inside of each block with an extra element. This makes
+ // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
+ // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+ // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+ // conflicts on writes and also none on reads.
+
+ // storage indices
+ const Index lhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_x * 9 + hipThreadIdx_z;
+ const Index rhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_z * 8 + hipThreadIdx_x;
+
+ const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+ const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+ const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+ const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+ const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+ const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+ const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+ const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+ const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+ const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+ const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+ const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+ const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+ const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+ const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+ const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+ // in the loading code, the following variables are important:
+ // hipThreadIdx_x: the vertical position in an 8x8 block
+ // hipThreadIdx_y: the vertical index of the 8x8 block in the grid
+ // hipThreadIdx_z: the horizontal position in an 8x8 block
+ // k: the horizontal index of the 8x8 block in the grid
+ //
+ // The k parameter is implicit (it was the loop counter for a loop that went
+  // from 0 to <8, but that loop is now unrolled in the code below).
+
+ const Index load_idx_vert = hipThreadIdx_x + 8 * hipThreadIdx_y;
+ const Index lhs_vert = base_m + load_idx_vert;
+
+#define prefetchIntoRegisters(base_k) \
+ { \
+ lhs_pf0 = conv(0); \
+ lhs_pf1 = conv(0); \
+ lhs_pf2 = conv(0); \
+ lhs_pf3 = conv(0); \
+ lhs_pf4 = conv(0); \
+ lhs_pf5 = conv(0); \
+ lhs_pf6 = conv(0); \
+ lhs_pf7 = conv(0); \
+ \
+ rhs_pf0 = conv(0); \
+ rhs_pf1 = conv(0); \
+ rhs_pf2 = conv(0); \
+ rhs_pf3 = conv(0); \
+ rhs_pf4 = conv(0); \
+ rhs_pf5 = conv(0); \
+ rhs_pf6 = conv(0); \
+ rhs_pf7 = conv(0); \
+ \
+ if (!needs_edge_check || lhs_vert < m_size) { \
+ const Index lhs_horiz_0 = base_k + hipThreadIdx_z + 0 * 8; \
+ const Index lhs_horiz_1 = base_k + hipThreadIdx_z + 1 * 8; \
+ const Index lhs_horiz_2 = base_k + hipThreadIdx_z + 2 * 8; \
+ const Index lhs_horiz_3 = base_k + hipThreadIdx_z + 3 * 8; \
+ const Index lhs_horiz_4 = base_k + hipThreadIdx_z + 4 * 8; \
+ const Index lhs_horiz_5 = base_k + hipThreadIdx_z + 5 * 8; \
+ const Index lhs_horiz_6 = base_k + hipThreadIdx_z + 6 * 8; \
+ const Index lhs_horiz_7 = base_k + hipThreadIdx_z + 7 * 8; \
+ \
+ if (!needs_edge_check || lhs_horiz_7 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
+ lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
+ } else if (lhs_horiz_6 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
+ } else if (lhs_horiz_5 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ } else if (lhs_horiz_4 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ } else if (lhs_horiz_3 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ } else if (lhs_horiz_2 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ } else if (lhs_horiz_1 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ } else if (lhs_horiz_0 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ } \
+ } \
+ \
+ const Index rhs_vert = base_k + load_idx_vert; \
+ if (!needs_edge_check || rhs_vert < k_size) { \
+ const Index rhs_horiz_0 = base_n + hipThreadIdx_z + 0 * 8; \
+ const Index rhs_horiz_1 = base_n + hipThreadIdx_z + 1 * 8; \
+ const Index rhs_horiz_2 = base_n + hipThreadIdx_z + 2 * 8; \
+ const Index rhs_horiz_3 = base_n + hipThreadIdx_z + 3 * 8; \
+ const Index rhs_horiz_4 = base_n + hipThreadIdx_z + 4 * 8; \
+ const Index rhs_horiz_5 = base_n + hipThreadIdx_z + 5 * 8; \
+ const Index rhs_horiz_6 = base_n + hipThreadIdx_z + 6 * 8; \
+ const Index rhs_horiz_7 = base_n + hipThreadIdx_z + 7 * 8; \
+ \
+ if (rhs_horiz_7 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
+ rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
+ } else if (rhs_horiz_6 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
+ } else if (rhs_horiz_5 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ } else if (rhs_horiz_4 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ } else if (rhs_horiz_3 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ } else if (rhs_horiz_2 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ } else if (rhs_horiz_1 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ } else if (rhs_horiz_0 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ } \
+ } \
+ } \
+
+#define writeRegToShmem(_) \
+ lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
+ rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
+ \
+ lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
+ rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
+ \
+ lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
+ rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
+ \
+ lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
+ rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
+ \
+ lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
+ rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
+ \
+ lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
+ rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
+ \
+ lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
+ rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
+ \
+ lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
+ rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
+
+ // declare and initialize result array
+#define res(i, j) _res_##i##j
+#define initResultRow(i) \
+ Scalar res(i, 0) = conv(0); \
+ Scalar res(i, 1) = conv(0); \
+ Scalar res(i, 2) = conv(0); \
+ Scalar res(i, 3) = conv(0); \
+ Scalar res(i, 4) = conv(0); \
+ Scalar res(i, 5) = conv(0); \
+ Scalar res(i, 6) = conv(0); \
+ Scalar res(i, 7) = conv(0); \
+
+ internal::scalar_cast_op<int, Scalar> conv;
+ initResultRow(0);
+ initResultRow(1);
+ initResultRow(2);
+ initResultRow(3);
+ initResultRow(4);
+ initResultRow(5);
+ initResultRow(6);
+ initResultRow(7);
+#undef initResultRow
+
+ for (Index base_k = 0; base_k < k_size; base_k += 64) {
+ // wait for previous iteration to finish with shmem. Despite common sense,
+    // the code is a bit faster with this here than at the bottom of the loop
+ __syncthreads();
+
+ prefetchIntoRegisters(base_k);
+ writeRegToShmem();
+
+ #undef prefetchIntoRegisters
+ #undef writeRegToShmem
+
+ // wait for shared mem packing to be done before starting computation
+ __syncthreads();
+
+ // compute 8x8 matrix product by outer product. This involves packing one column
+ // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+ Scalar lcol(0);
+ Scalar lcol(1);
+ Scalar lcol(2);
+ Scalar lcol(3);
+ Scalar lcol(4);
+ Scalar lcol(5);
+ Scalar lcol(6);
+ Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+ Scalar rrow(0);
+ Scalar rrow(1);
+ Scalar rrow(2);
+ Scalar rrow(3);
+ Scalar rrow(4);
+ Scalar rrow(5);
+ Scalar rrow(6);
+ Scalar rrow(7);
+
+ // Now x corresponds to k, y to m, and z to n
+ const Scalar* lhs_block = &lhs_shmem[hipThreadIdx_x + 9 * hipThreadIdx_y];
+ const Scalar* rhs_block = &rhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j) \
+ lcol(0) = lhs_element(0, j); \
+ rrow(0) = rhs_element(i, 0); \
+ lcol(1) = lhs_element(1, j); \
+ rrow(1) = rhs_element(i, 1); \
+ lcol(2) = lhs_element(2, j); \
+ rrow(2) = rhs_element(i, 2); \
+ lcol(3) = lhs_element(3, j); \
+ rrow(3) = rhs_element(i, 3); \
+ lcol(4) = lhs_element(4, j); \
+ rrow(4) = rhs_element(i, 4); \
+ lcol(5) = lhs_element(5, j); \
+ rrow(5) = rhs_element(i, 5); \
+ lcol(6) = lhs_element(6, j); \
+ rrow(6) = rhs_element(i, 6); \
+ lcol(7) = lhs_element(7, j); \
+ rrow(7) = rhs_element(i, 7); \
+
+#define computeCol(j) \
+ res(0, j) += lcol(0) * rrow(j); \
+ res(1, j) += lcol(1) * rrow(j); \
+ res(2, j) += lcol(2) * rrow(j); \
+ res(3, j) += lcol(3) * rrow(j); \
+ res(4, j) += lcol(4) * rrow(j); \
+ res(5, j) += lcol(5) * rrow(j); \
+ res(6, j) += lcol(6) * rrow(j); \
+ res(7, j) += lcol(7) * rrow(j); \
+
+#define computePass(i) \
+ loadData(i, i); \
+ \
+ computeCol(0); \
+ computeCol(1); \
+ computeCol(2); \
+ computeCol(3); \
+ computeCol(4); \
+ computeCol(5); \
+ computeCol(6); \
+ computeCol(7); \
+
+ computePass(0);
+ computePass(1);
+ computePass(2);
+ computePass(3);
+ computePass(4);
+ computePass(5);
+ computePass(6);
+ computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+ } // end loop over k
+
+ // we've now iterated over all of the large (ie width 64) k blocks and
+ // accumulated results in registers. At this point thread (x, y, z) contains
+ // the sum across all big k blocks of the product of little k block of index (x, y)
+ // with block of index (y, z). To compute the final output, we need to reduce
+ // the 8 threads over y by summation.
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+
+#define reduceRow(i, mask) \
+ shuffleInc(i, 0, mask); \
+ shuffleInc(i, 1, mask); \
+ shuffleInc(i, 2, mask); \
+ shuffleInc(i, 3, mask); \
+ shuffleInc(i, 4, mask); \
+ shuffleInc(i, 5, mask); \
+ shuffleInc(i, 6, mask); \
+ shuffleInc(i, 7, mask); \
+
+#define reduceMatrix(mask) \
+ reduceRow(0, mask); \
+ reduceRow(1, mask); \
+ reduceRow(2, mask); \
+ reduceRow(3, mask); \
+ reduceRow(4, mask); \
+ reduceRow(5, mask); \
+ reduceRow(6, mask); \
+ reduceRow(7, mask); \
+
+ // actually perform the reduction, now each thread of index (_, y, z)
+ // contains the correct values in its registers that belong in the output
+ // block
+ reduceMatrix(1);
+ reduceMatrix(2);
+ reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+ // now we need to copy the 64 values into main memory. We can't split work
+  // among threads because all variables are in registers. There are two ways
+ // to do this:
+ // (1) have 1 thread do 64 writes from registers into global memory
+ // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+ // each do 8 writes into global memory. We can just overwrite the shared
+ // memory from the problem we just solved.
+ // (2) is slightly faster than (1) due to less branching and more ILP
+
+ // TODO: won't yield much gain, but could just use currently unused shared mem
+ // and then we won't have to sync
+ // wait for shared mem to be out of use
+ __syncthreads();
+
+#define writeResultShmem(i, j) \
+ lhs_shmem[i + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j] = res(i, j); \
+
+#define writeRow(i) \
+ writeResultShmem(i, 0); \
+ writeResultShmem(i, 1); \
+ writeResultShmem(i, 2); \
+ writeResultShmem(i, 3); \
+ writeResultShmem(i, 4); \
+ writeResultShmem(i, 5); \
+ writeResultShmem(i, 6); \
+ writeResultShmem(i, 7); \
+
+ if (hipThreadIdx_x == 0) {
+ writeRow(0);
+ writeRow(1);
+ writeRow(2);
+ writeRow(3);
+ writeRow(4);
+ writeRow(5);
+ writeRow(6);
+ writeRow(7);
+ }
+#undef writeResultShmem
+#undef writeRow
+
+ const int max_i_write = numext::mini((int)((m_size - base_m - hipThreadIdx_y + 7) / 8), 8);
+ const int max_j_write = numext::mini((int)((n_size - base_n - hipThreadIdx_z + 7) / 8), 8);
+
+ if (hipThreadIdx_x < max_i_write) {
+ if (max_j_write == 8) {
+ // TODO: can i trade bank conflicts for coalesced writes?
+ Scalar val0 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 0];
+ Scalar val1 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 1];
+ Scalar val2 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 2];
+ Scalar val3 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 3];
+ Scalar val4 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 4];
+ Scalar val5 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 5];
+ Scalar val6 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 6];
+ Scalar val7 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 7];
+
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 0) = val0;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 1) = val1;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 2) = val2;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 3) = val3;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 4) = val4;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 5) = val5;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 6) = val6;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 7) = val7;
+ } else {
+#pragma unroll 7
+ for (int j = 0; j < max_j_write; j++) {
+ Scalar val = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j];
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * j) = val;
+ }
+ }
+ }
+#undef res
+}
+
+
+template<typename Scalar, typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(512, 1)
+EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ Scalar lhs_shmem[72 * 64];
+ __shared__ Scalar rhs_shmem[72 * 64];
+
+ const Index m_block_idx = hipBlockIdx_x;
+ const Index n_block_idx = hipBlockIdx_y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ if (base_m + 63 < m_size && base_n + 63 < n_size) {
+ EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+ } else {
+ EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+ bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, float2 lhs_shmem2[][16],
+ float2 rhs_shmem2[][8], const Index m_size,
+ const Index n_size, const Index k_size,
+ const Index base_m, const Index base_n) {
+
+ // prefetch registers
+ float4 lhs_pf0, rhs_pf0;
+
+ float4 results[4];
+ for (int i=0; i < 4; i++) {
+ results[i].x = results[i].y = results[i].z = results[i].w = 0;
+ }
+
+
+#define prefetch_lhs(reg, row, col) \
+ if (!CHECK_LHS_BOUNDARY) { \
+ if (col < k_size) { \
+ /*reg = lhs.template loadPacket<Unaligned>(row, col);*/ \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ reg.w =lhs(row + 3, col); \
+ } \
+ } else { \
+ if (col < k_size) { \
+ if (row + 3 < m_size) { \
+ /*reg =lhs.template loadPacket<Unaligned>(row, col);*/ \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ reg.w =lhs(row + 3, col); \
+ } else if (row + 2 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ } else if (row + 1 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ } else if (row < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ } \
+ } \
+ } \
+
+
+ Index lhs_vert = base_m+hipThreadIdx_x*4;
+
+ for (Index k = 0; k < k_size; k += 16) {
+ //lhs_pf0 = internal::pset1<float4>(0);
+ //rhs_pf0 = internal::pset1<float4>(0);
+ lhs_pf0 = make_float4(0, 0, 0, 0);
+ rhs_pf0 = make_float4(0, 0, 0, 0);
+
+ Index lhs_horiz = hipThreadIdx_y+k;
+ prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
+
+ Index rhs_vert = k+(hipThreadIdx_x%4)*4;
+ Index rhs_horiz0 = (hipThreadIdx_x>>2)+hipThreadIdx_y*4+base_n;
+
+ if (!CHECK_RHS_BOUNDARY) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if (rhs_vert + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ } else {
+ if (rhs_horiz0 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
+ } else if ((rhs_vert + 2) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if ((rhs_vert + 1) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ }
+ }
+ float x1, x2 ;
+ // the following can be a bitwise operation..... some day.
+ if((hipThreadIdx_x%8) < 4) {
+ x1 = rhs_pf0.y;
+ x2 = rhs_pf0.w;
+ } else {
+ x1 = rhs_pf0.x;
+ x2 = rhs_pf0.z;
+ }
+ x1 = __shfl_xor(x1, 4);
+ x2 = __shfl_xor(x2, 4);
+ if((hipThreadIdx_x%8) < 4) {
+ rhs_pf0.y = x1;
+ rhs_pf0.w = x2;
+ } else {
+ rhs_pf0.x = x1;
+ rhs_pf0.z = x2;
+ }
+
+ // We have 64 features.
+ // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
+ // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
+ // ...
+ // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
+ // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
+ // ...
+ rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2][hipThreadIdx_x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
+ rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2+32][hipThreadIdx_x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
+
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // ...
+ // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
+ // ...
+
+ lhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(lhs_pf0.x, lhs_pf0.y);
+ lhs_shmem2[hipThreadIdx_y+16][hipThreadIdx_x] = make_float2(lhs_pf0.z, lhs_pf0.w);
+
+
+#define add_vals(fl1, fl2, fr1, fr2)\
+ results[0].x += fl1.x * fr1.x;\
+ results[0].y += fl1.y * fr1.x;\
+ results[0].z += fl2.x * fr1.x;\
+ results[0].w += fl2.y * fr1.x;\
+\
+ results[1].x += fl1.x * fr1.y;\
+ results[1].y += fl1.y * fr1.y;\
+ results[1].z += fl2.x * fr1.y;\
+ results[1].w += fl2.y * fr1.y;\
+\
+ results[2].x += fl1.x * fr2.x;\
+ results[2].y += fl1.y * fr2.x;\
+ results[2].z += fl2.x * fr2.x;\
+ results[2].w += fl2.y * fr2.x;\
+\
+ results[3].x += fl1.x * fr2.y;\
+ results[3].y += fl1.y * fr2.y;\
+ results[3].z += fl2.x * fr2.y;\
+ results[3].w += fl2.y * fr2.y;\
+
+ __syncthreads();
+
+ // Do the multiplies.
+ #pragma unroll
+ for (int koff = 0; koff < 16; koff ++) {
+ // 32 x threads.
+ float2 fl1 = lhs_shmem2[koff][hipThreadIdx_x];
+ float2 fl2 = lhs_shmem2[koff + 16][hipThreadIdx_x];
+
+ int start_feature = hipThreadIdx_y * 4;
+ float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+ float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+
+ add_vals(fl1, fl2, fr1, fr2)
+ }
+ __syncthreads();
+ }
+
+#undef prefetch_lhs
+#undef add_vals
+
+ Index horiz_base = hipThreadIdx_y*4+base_n;
+ if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (!CHECK_RHS_BOUNDARY) {
+ // CHECK LHS
+ if (lhs_vert + 3 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ }
+ } else if (lhs_vert < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ }
+ }
+ } else if (!CHECK_LHS_BOUNDARY) {
+ // CHECK RHS
+ /*
+ int ncols_rem = fminf(n_size- horiz_base, 4);
+ for (int i = 0; i < ncols_rem; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }*/
+ for (int i = 0; i < 4; i++) {
+ if (horiz_base+i < n_size) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ } else {
+ // CHECK both boundaries.
+ for (int i = 0; i < 4; i++) {
+ if (horiz_base+i < n_size) {
+ if (lhs_vert < m_size)
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ if (lhs_vert + 1 < m_size)
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ if (lhs_vert + 2 < m_size)
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ if (lhs_vert + 3 < m_size)
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+ bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, float2 lhs_shmem2[][32],
+ float2 rhs_shmem2[][8], const Index m_size,
+ const Index n_size, const Index k_size,
+ const Index base_m, const Index base_n) {
+
+ // prefetch registers
+ float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
+ float4 rhs_pf0, rhs_pf1;
+
+ float4 results[8];
+ for (int i=0; i < 8; i++) {
+ results[i].x = results[i].y = results[i].z = results[i].w = 0;
+ }
+
+
+ Index lhs_vert = base_m+hipThreadIdx_x*4+(hipThreadIdx_y%4)*32;
+ for (Index k = 0; k < k_size; k += 32) {
+ /*lhs_pf0 = internal::pset1<float4>(0);
+ lhs_pf1 = internal::pset1<float4>(0);
+ lhs_pf2 = internal::pset1<float4>(0);
+ lhs_pf3 = internal::pset1<float4>(0);
+
+ rhs_pf0 = internal::pset1<float4>(0);
+ rhs_pf1 = internal::pset1<float4>(0);*/
+
+
+ lhs_pf0 = make_float4(0, 0, 0, 0);
+ lhs_pf1 = make_float4(0, 0, 0, 0);
+ lhs_pf2 = make_float4(0, 0, 0, 0);
+ lhs_pf3 = make_float4(0, 0, 0, 0);
+
+ rhs_pf0 = make_float4(0, 0, 0, 0);
+ rhs_pf1 = make_float4(0, 0, 0, 0);
+
+ if (!CHECK_LHS_BOUNDARY) {
+ if ((hipThreadIdx_y/4+k+24) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
+ //lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+24));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+        lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+        lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+        lhs_pf2.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
+        lhs_pf3.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+24));
+ } else if ((hipThreadIdx_y/4+k+16) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+        lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+        lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+        lhs_pf2.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+16));
+ } else if ((hipThreadIdx_y/4+k+8) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+        lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+        lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8));
+ } else if ((hipThreadIdx_y/4+k) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+        lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k));
+ }
+ } else {
+ // just CHECK_LHS_BOUNDARY
+ if (lhs_vert + 3 < m_size) {
+ if ((hipThreadIdx_y/4+k+24) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
+ //lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+24));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+          lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+          lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+          lhs_pf2.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
+          lhs_pf3.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+24));
+ } else if ((hipThreadIdx_y/4+k+16) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+          lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+          lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+          lhs_pf2.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+16));
+ } else if ((hipThreadIdx_y/4+k+8) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+          lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+          lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8));
+ } else if ((hipThreadIdx_y/4+k) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+          lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k));
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ if ((hipThreadIdx_y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
+ } else if ((hipThreadIdx_y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ } else if ((hipThreadIdx_y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ } else if ((hipThreadIdx_y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ if ((hipThreadIdx_y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
+ } else if ((hipThreadIdx_y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ } else if ((hipThreadIdx_y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ } else if ((hipThreadIdx_y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ }
+ } else if (lhs_vert < m_size) {
+ if ((hipThreadIdx_y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
+ } else if ((hipThreadIdx_y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ } else if ((hipThreadIdx_y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ } else if ((hipThreadIdx_y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ }
+ }
+ }
+ __syncthreads();
+ Index rhs_vert = k+hipThreadIdx_x*4;
+ Index rhs_horiz0 = hipThreadIdx_y*2+base_n;
+ Index rhs_horiz1 = hipThreadIdx_y*2+1+base_n;
+ if (!CHECK_RHS_BOUNDARY) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ //rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ } else if (rhs_vert + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ }
+ } else {
+ if (rhs_horiz1 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ //rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ } else if (k+hipThreadIdx_x*4 + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ } else if (k+hipThreadIdx_x*4 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ }
+ } else if (rhs_horiz0 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
+ } else if ((rhs_vert + 2) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if ((rhs_vert + 1) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ }
+ }
+ __syncthreads();
+ // Loaded. Do computation
+ // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
+ // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
+ // ..
+ // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
+ rhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(rhs_pf0.x, rhs_pf1.x);
+ // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
+ // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
+ // ..
+ rhs_shmem2[hipThreadIdx_y+32][hipThreadIdx_x] = make_float2(rhs_pf0.y, rhs_pf1.y);
+ // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
+ // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
+ rhs_shmem2[hipThreadIdx_y+64][hipThreadIdx_x] = make_float2(rhs_pf0.z, rhs_pf1.z);
+ // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
+ // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
+ rhs_shmem2[hipThreadIdx_y+96][hipThreadIdx_x] = make_float2(rhs_pf0.w, rhs_pf1.w);
+
+ // LHS.
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
+ // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
+ // ...
+ // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
+ // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
+
+
+#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
+ results[0].x += a_feat1.x * f1.x;\
+ results[1].x += a_feat1.x * f1.y;\
+ results[2].x += a_feat1.x * f2.x;\
+ results[3].x += a_feat1.x * f2.y;\
+ results[4].x += a_feat1.x * f3.x;\
+ results[5].x += a_feat1.x * f3.y;\
+ results[6].x += a_feat1.x * f4.x;\
+ results[7].x += a_feat1.x * f4.y;\
+\
+ results[0].y += a_feat1.y * f1.x;\
+ results[1].y += a_feat1.y * f1.y;\
+ results[2].y += a_feat1.y * f2.x;\
+ results[3].y += a_feat1.y * f2.y;\
+ results[4].y += a_feat1.y * f3.x;\
+ results[5].y += a_feat1.y * f3.y;\
+ results[6].y += a_feat1.y * f4.x;\
+ results[7].y += a_feat1.y * f4.y;\
+\
+ results[0].z += a_feat2.x * f1.x;\
+ results[1].z += a_feat2.x * f1.y;\
+ results[2].z += a_feat2.x * f2.x;\
+ results[3].z += a_feat2.x * f2.y;\
+ results[4].z += a_feat2.x * f3.x;\
+ results[5].z += a_feat2.x * f3.y;\
+ results[6].z += a_feat2.x * f4.x;\
+ results[7].z += a_feat2.x * f4.y;\
+\
+ results[0].w += a_feat2.y * f1.x;\
+ results[1].w += a_feat2.y * f1.y;\
+ results[2].w += a_feat2.y * f2.x;\
+ results[3].w += a_feat2.y * f2.y;\
+ results[4].w += a_feat2.y * f3.x;\
+ results[5].w += a_feat2.y * f3.y;\
+ results[6].w += a_feat2.y * f4.x;\
+ results[7].w += a_feat2.y * f4.y;\
+
+ lhs_shmem2[hipThreadIdx_y/4][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
+ lhs_shmem2[hipThreadIdx_y/4+8][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
+ lhs_shmem2[hipThreadIdx_y/4+16][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
+ lhs_shmem2[hipThreadIdx_y/4+24][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
+
+ lhs_shmem2[hipThreadIdx_y/4 + 32][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
+ lhs_shmem2[hipThreadIdx_y/4 + 40][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
+ lhs_shmem2[hipThreadIdx_y/4 + 48][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
+ lhs_shmem2[hipThreadIdx_y/4 + 56][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
+
+ __syncthreads();
+
+ // Do the multiplies.
+ #pragma unroll
+ for (int koff = 0; koff < 32; koff ++) {
+ float2 a3 = lhs_shmem2[koff][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8];
+ float2 a4 = lhs_shmem2[koff + 32][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8];
+
+ // first feature is at (hipThreadIdx_y/4) * 8 last is at start + 8.
+ int start_feature = (hipThreadIdx_y / 4) * 8;
+
+ float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
+ float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
+ float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
+ float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
+
+ add_vals(a3, a4, br1, br2, br3, br4)
+ }
+ __syncthreads();
+ } // end loop over k
+
+
+ __syncthreads();
+ Index horiz_base = (hipThreadIdx_y/4)*8+base_n;
+ if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (!CHECK_RHS_BOUNDARY) {
+ if (lhs_vert + 3 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ }
+ } else if (lhs_vert < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ }
+ }
+ } else if (!CHECK_LHS_BOUNDARY) {
+ // CHECK BOUNDARY_B
+ for (int i = 0; i < 8; i++) {
+ if (horiz_base + i < n_size) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ } else {
+ // CHECK both boundaries.
+ for (int i = 0; i < 8; i++) {
+ if (horiz_base + i < n_size) {
+ if (lhs_vert < m_size)
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ if (lhs_vert + 1 < m_size)
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ if (lhs_vert + 2 < m_size)
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ if (lhs_vert + 3 < m_size)
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(256, 1)
+EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ float2 lhs_shmem[64*32];
+ __shared__ float2 rhs_shmem[128*8];
+
+ typedef float2 LHS_MEM[64][32];
+ typedef float2 RHS_MEM[128][8];
+
+ typedef float2 LHS_MEM16x16[32][16];
+ typedef float2 RHS_MEM16x16[64][8];
+
+ const Index m_block_idx = hipBlockIdx_x;
+ const Index n_block_idx = hipBlockIdx_y;
+
+ const Index base_m = 128 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ bool check_rhs = (base_n + 63) >= n_size;
+ bool check_lhs128 = (base_m + 127) >= m_size;
+
+ if (!check_rhs) {
+ if (!check_lhs128) {
+ // >= 128 rows left
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ }
+ } else {
+ if (!check_lhs128) {
+ // >= 128 rows left
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ }
+ }
+}
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(256, 1)
+EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ float2 lhs_shmem[32][16];
+ __shared__ float2 rhs_shmem[64][8];
+
+ const Index m_block_idx = hipBlockIdx_x;
+ const Index n_block_idx = hipBlockIdx_y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ if (base_m + 63 < m_size) {
+ if (base_n + 63 < n_size) {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ }
+ } else {
+ if (base_n + 63 < n_size) {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ }
+ }
+}
+
+
+template<typename Indices, typename LeftArgType, typename RightArgType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
+ public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
+
+ typedef GpuDevice Device;
+
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+ typedef TensorContractionEvaluatorBase<Self> Base;
+
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+
+ enum {
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ };
+
+ // Most of the code is assuming that both input tensors are ColMajor. If the
+ // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+ // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+ // will pretend B is LHS and A is RHS.
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+ static const int LDims =
+ internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+ static const int RDims =
+ internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+ static const int ContractDims = internal::array_size<Indices>::value;
+
+ typedef array<Index, LDims> left_dim_mapper_t;
+ typedef array<Index, RDims> right_dim_mapper_t;
+
+ typedef array<Index, ContractDims> contract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ // typedefs needed in evalTo
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+ typedef typename LeftEvaluator::Dimensions LeftDimensions;
+ typedef typename RightEvaluator::Dimensions RightDimensions;
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+ Base(op, device) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {}
+
+ // We need to redefine this method to make hipcc happy
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+ this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ evalTo(data);
+ return false;
+ } else {
+ this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
+ evalTo(this->m_result);
+ return true;
+ }
+ }
+
+ void evalTo(Scalar* buffer) const {
+ if (this->m_lhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, true, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<true, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, false, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<true, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, true, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<false, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, false, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<false, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ }
+
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
+ static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+ const Index m_blocks = (m + 63) / 64;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(8, 8, 8);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k);
+ }
+ };
+
+ template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
+ static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+ if (m < 768 || n < 768) {
+ const Index m_blocks = (m + 63) / 64;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(16, 16, 1);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k);
+ } else {
+ const Index m_blocks = (m + 127) / 128;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(8, 32, 1);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k);
+ }
+ }
+ };
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ void evalTyped(Scalar* buffer) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+ EIGEN_UNUSED_VARIABLE(k)
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
+
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+ this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, 4,
+ lhs_inner_dim_contiguous,
+ false, Unaligned> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, 4,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+ OutputMapper output(buffer, m);
+
+ setHipSharedMemConfig(hipSharedMemBankSizeEightByte);
+ LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_GPU and EIGEN_HIPCC
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H
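
For reference, a minimal host-side sketch (not part of the patch) of the launch-geometry choice made by the float/float LaunchKernels specialization above. The struct and function names here are made up for illustration; only the tile and block arithmetic is taken from the code: small problems use 16x16 threads per 64x64 output tile, larger ones 8x32 threads per 128x64 tile.

  #include <hip/hip_runtime.h>

  struct LaunchGeometry { dim3 grid; dim3 block; };

  // Mirrors LaunchKernels<float, float, ...>::Run above.
  inline LaunchGeometry selectFloatContractionLaunch(long m, long n) {
    if (m < 768 || n < 768) {
      // One 16x16 thread block per 64x64 output tile.
      return { dim3((m + 63) / 64, (n + 63) / 64, 1), dim3(16, 16, 1) };
    }
    // One 8x32 thread block per 128x64 output tile.
    return { dim3((m + 127) / 128, (n + 63) / 64, 1), dim3(8, 32, 1) };
  }
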
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h
new file mode 100644
index 000000000..ba9971050
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h
@@ -0,0 +1,1119 @@
+//#include "hip/hip_runtime.h"
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+
+namespace Eigen {
+
+/** \class TensorConvolution
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor convolution class.
+ *
+ *
+ */
+namespace internal {
+
+template <typename Index, typename InputDims, int NumKernelDims, int Layout>
+class IndexMapper {
+ public:
+ IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
+ const array<Index, NumKernelDims>& indices) {
+
+ array<Index, NumDims> dimensions = input_dims;
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = indices[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ dimensions[index] = result_dim;
+ }
+
+ array<Index, NumDims> inputStrides;
+ array<Index, NumDims> outputStrides;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ inputStrides[0] = 1;
+ outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
+ outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
+ }
+ } else {
+ inputStrides[NumDims - 1] = 1;
+ outputStrides[NumDims - 1] = 1;
+ for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
+ inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+ outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
+ }
+ }
+
+ array<Index, NumDims> hipInputDimensions;
+ array<Index, NumDims> hipOutputDimensions;
+ array<Index, NumDims> tmp = dimensions;
+ array<Index, NumDims> ordering;
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = i + offset;
+ ordering[index] = indices[i];
+ tmp[indices[i]] = -1;
+ hipInputDimensions[index] = input_dims[indices[i]];
+ hipOutputDimensions[index] = dimensions[indices[i]];
+ }
+
+ int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? NumKernelDims
+ : 0;
+ for (int i = 0; i < NumDims; ++i) {
+ if (tmp[i] >= 0) {
+ ordering[written] = i;
+ hipInputDimensions[written] = input_dims[i];
+ hipOutputDimensions[written] = dimensions[i];
+ ++written;
+ }
+ }
+
+ for (int i = 0; i < NumDims; ++i) {
+ m_inputStrides[i] = inputStrides[ordering[i]];
+ m_outputStrides[i] = outputStrides[ordering[i]];
+ }
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumDims; ++i) {
+ if (i > NumKernelDims) {
+ m_hipInputStrides[i] =
+ m_hipInputStrides[i - 1] * hipInputDimensions[i - 1];
+ m_hipOutputStrides[i] =
+ m_hipOutputStrides[i - 1] * hipOutputDimensions[i - 1];
+ } else {
+ m_hipInputStrides[i] = 1;
+ m_hipOutputStrides[i] = 1;
+ }
+ }
+ } else {
+ for (int i = NumDims - 1; i >= 0; --i) {
+ if (i + 1 < offset) {
+ m_hipInputStrides[i] =
+ m_hipInputStrides[i + 1] * hipInputDimensions[i + 1];
+ m_hipOutputStrides[i] =
+ m_hipOutputStrides[i + 1] * hipOutputDimensions[i + 1];
+ } else {
+ m_hipInputStrides[i] = 1;
+ m_hipOutputStrides[i] = 1;
+ }
+ }
+ }
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputPlaneToTensorInputOffset(Index p) const {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int d = NumDims - 1; d > NumKernelDims; --d) {
+ const Index idx = p / m_hipInputStrides[d];
+ inputIndex += idx * m_inputStrides[d];
+ p -= idx * m_hipInputStrides[d];
+ }
+ inputIndex += p * m_inputStrides[NumKernelDims];
+ } else {
+ std::ptrdiff_t limit = 0;
+ if (NumKernelDims < NumDims) {
+ limit = NumDims - NumKernelDims - 1;
+ }
+ for (int d = 0; d < limit; ++d) {
+ const Index idx = p / m_hipInputStrides[d];
+ inputIndex += idx * m_inputStrides[d];
+ p -= idx * m_hipInputStrides[d];
+ }
+ inputIndex += p * m_inputStrides[limit];
+ }
+ return inputIndex;
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputPlaneToTensorOutputOffset(Index p) const {
+ Index outputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int d = NumDims - 1; d > NumKernelDims; --d) {
+ const Index idx = p / m_hipOutputStrides[d];
+ outputIndex += idx * m_outputStrides[d];
+ p -= idx * m_hipOutputStrides[d];
+ }
+ outputIndex += p * m_outputStrides[NumKernelDims];
+ } else {
+ std::ptrdiff_t limit = 0;
+ if (NumKernelDims < NumDims) {
+ limit = NumDims - NumKernelDims - 1;
+ }
+ for (int d = 0; d < limit; ++d) {
+ const Index idx = p / m_hipOutputStrides[d];
+ outputIndex += idx * m_outputStrides[d];
+ p -= idx * m_hipOutputStrides[d];
+ }
+ outputIndex += p * m_outputStrides[limit];
+ }
+ return outputIndex;
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_inputStrides[offset];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_outputStrides[offset];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
+ k * m_inputStrides[offset + 2];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
+ k * m_outputStrides[offset + 2];
+ }
+
+ private:
+ static const int NumDims = internal::array_size<InputDims>::value;
+ array<Index, NumDims> m_inputStrides;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_hipInputStrides;
+ array<Index, NumDims> m_hipOutputStrides;
+};
+
+
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename promote_storage_type<typename InputXprType::Scalar,
+ typename KernelXprType::Scalar>::ret Scalar;
+ typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
+ typename traits<KernelXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<InputXprType>::Index,
+ typename traits<KernelXprType>::Index>::type Index;
+ typedef typename InputXprType::Nested LhsNested;
+ typedef typename KernelXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+ static const int NumDimensions = traits<InputXprType>::NumDimensions;
+ static const int Layout = traits<InputXprType>::Layout;
+
+ enum {
+ Flags = 0
+ };
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
+{
+ typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
+{
+ typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Indices, typename InputXprType, typename KernelXprType>
+class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
+ typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
+ : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Indices& indices() const { return m_indices; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const typename internal::remove_all<typename InputXprType::Nested>::type&
+ inputExpression() const { return m_input_xpr; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const typename internal::remove_all<typename KernelXprType::Nested>::type&
+ kernelExpression() const { return m_kernel_xpr; }
+
+ protected:
+ typename InputXprType::Nested m_input_xpr;
+ typename KernelXprType::Nested m_kernel_xpr;
+ const Indices m_indices;
+};
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
+{
+ typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+ static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
+ static const int NumKernelDims = internal::array_size<Indices>::value;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+ enum {
+ IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<InputArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
+ const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStride[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
+ }
+ } else {
+ m_inputStride[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
+ }
+ }
+
+ m_dimensions = m_inputImpl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ if (i > 0) {
+ m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
+ } else {
+ m_kernelStride[0] = 1;
+ }
+ m_indexStride[i] = m_inputStride[index];
+ }
+
+ m_outputStride[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
+ }
+ } else {
+ for (int i = NumKernelDims - 1; i >= 0; --i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ if (i < NumKernelDims - 1) {
+ m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
+ } else {
+ m_kernelStride[NumKernelDims - 1] = 1;
+ }
+ m_indexStride[i] = m_inputStride[index];
+ }
+
+ m_outputStride[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ m_inputImpl.evalSubExprsIfNeeded(NULL);
+ preloadKernel();
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_inputImpl.cleanup();
+ if (m_local_kernel) {
+ m_device.deallocate((void*)m_kernel);
+ m_local_kernel = false;
+ }
+ m_kernel = NULL;
+ }
+
+ void evalTo(typename XprType::Scalar* buffer) {
+ evalSubExprsIfNeeded(NULL);
+ for (int i = 0; i < dimensions().TotalSize(); ++i) {
+ buffer[i] += coeff(i);
+ }
+ cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ CoeffReturnType result = CoeffReturnType(0);
+ convolve(firstInput(index), 0, NumKernelDims-1, result);
+ return result;
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
+ {
+ Index indices[2] = {index, index+PacketSize-1};
+ Index startInputs[2] = {0, 0};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / m_outputStride[i];
+ const Index idx1 = indices[1] / m_outputStride[i];
+ startInputs[0] += idx0 * m_inputStride[i];
+ startInputs[1] += idx1 * m_inputStride[i];
+ indices[0] -= idx0 * m_outputStride[i];
+ indices[1] -= idx1 * m_outputStride[i];
+ }
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx0 = indices[0] / m_outputStride[i];
+ const Index idx1 = indices[1] / m_outputStride[i];
+ startInputs[0] += idx0 * m_inputStride[i];
+ startInputs[1] += idx1 * m_inputStride[i];
+ indices[0] -= idx0 * m_outputStride[i];
+ indices[1] -= idx1 * m_outputStride[i];
+ }
+ }
+ startInputs[0] += indices[0];
+ startInputs[1] += indices[1];
+
+ if (startInputs[1]-startInputs[0] == PacketSize-1) {
+ PacketReturnType result = internal::pset1<PacketReturnType>(0);
+ convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
+ return result;
+ } else {
+ EIGEN_ALIGN_MAX Scalar data[PacketSize];
+ data[0] = Scalar(0);
+ convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
+ for (int i = 1; i < PacketSize-1; ++i) {
+ data[i] = Scalar(0);
+ convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
+ }
+ data[PacketSize-1] = Scalar(0);
+ convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
+ return internal::pload<PacketReturnType>(data);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost =
+ TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+ m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+ PacketSize));
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ private:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+ Index startInput = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStride[i];
+ startInput += idx * m_inputStride[i];
+ index -= idx * m_outputStride[i];
+ }
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStride[i];
+ startInput += idx * m_inputStride[i];
+ index -= idx * m_outputStride[i];
+ }
+ }
+ startInput += index;
+ return startInput;
+ }
+
+ EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
+ for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+ const Index input = firstIndex + j * m_indexStride[DimIndex];
+ const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+ if (DimIndex > 0) {
+ convolve(input, kernel, DimIndex-1, accum);
+ } else {
+ accum += m_inputImpl.coeff(input) * m_kernel[kernel];
+ }
+ }
+ }
+
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
+ for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+ const Index input = firstIndex + j * m_indexStride[DimIndex];
+ const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+ if (DimIndex > 0) {
+ convolvePacket(input, kernel, DimIndex-1, accum);
+ } else {
+ accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
+ // Don't make a local copy of the kernel unless we have to (i.e. it's an
+ // expression that needs to be evaluated)
+ const Scalar* in_place = m_kernelImpl.data();
+ if (in_place) {
+ m_kernel = in_place;
+ m_local_kernel = false;
+ } else {
+ size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ typedef TensorEvalToOp<const KernelArgType> EvalTo;
+ EvalTo evalToTmp(local, m_kernelArg);
+ const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
+ internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
+
+ m_kernel = local;
+ m_local_kernel = true;
+ }
+ }
+
+ array<Index, NumDims> m_inputStride;
+ array<Index, NumDims> m_outputStride;
+
+ array<Index, NumKernelDims> m_indexStride;
+ array<Index, NumKernelDims> m_kernelStride;
+ TensorEvaluator<InputArgType, Device> m_inputImpl;
+ TensorEvaluator<KernelArgType, Device> m_kernelImpl;
+ Dimensions m_dimensions;
+
+ KernelArgType m_kernelArg;
+ const Scalar* m_kernel;
+ bool m_local_kernel;
+ const Device& m_device;
+};
+
+
+
+
+// Use an optimized implementation of the evaluation code for GPUs whenever possible.
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
+
+template <int StaticKernelSize>
+struct GetKernelSize {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
+ return StaticKernelSize;
+ }
+};
+template <>
+struct GetKernelSize<Dynamic> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
+ return kernelSize;
+ }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims,
+ int StaticKernelSize>
+__global__ void EigenConvolutionKernel1D(
+ InputEvaluator eval,
+ const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
+ indexMapper,
+ const float* __restrict kernel, const int numPlanes, const int numX,
+ const int maxX, const int kernelSize, float* buffer) {
+ HIP_DYNAMIC_SHARED( float, s)
+
+ const int first_x = hipBlockIdx_x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
+ const int num_x_output = last_x - first_x + 1;
+
+ const int first_plane = hipBlockIdx_y * hipBlockDim_y;
+ const int plane_stride = hipBlockDim_y * hipGridDim_y;
+
+ for (int p = first_plane + hipThreadIdx_y; p < numPlanes; p += plane_stride) {
+ // Load inputs to shared memory
+ const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = hipThreadIdx_y * num_x_input;
+ #pragma unroll
+ for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x);
+ s[i + plane_kernel_offset] = eval.coeff(tensor_index);
+ }
+
+ __syncthreads();
+
+ // Compute the convolution
+ const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p);
+
+ #pragma unroll
+ for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) {
+ const int kernel_offset = plane_kernel_offset + i;
+ float result = 0.0f;
+ #pragma unroll
+ for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
+ result += s[k + kernel_offset] * kernel[k];
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x);
+ buffer[tensor_index] = result;
+ }
+ __syncthreads();
+ }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims,
+ int StaticKernelSizeX, int StaticKernelSizeY>
+__global__ void EigenConvolutionKernel2D(
+ InputEvaluator eval,
+ const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
+ indexMapper,
+ const float* __restrict kernel, const int numPlanes, const int numX,
+ const int maxX, const int numY, const int maxY, const int kernelSizeX,
+ const int kernelSizeY, float* buffer) {
+ HIP_DYNAMIC_SHARED( float, s)
+
+ const int first_x = hipBlockIdx_x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
+ const int num_x_output = last_x - first_x + 1;
+
+ const int first_y = hipBlockIdx_y * maxY;
+ const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+ const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
+ const int num_y_output = last_y - first_y + 1;
+
+ const int first_plane = hipBlockIdx_z * hipBlockDim_z;
+ const int plane_stride = hipBlockDim_z * hipGridDim_z;
+
+ for (int p = first_plane + hipThreadIdx_z; p < numPlanes; p += plane_stride) {
+
+ const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = hipThreadIdx_z * num_y_input;
+
+ // Load inputs to shared memory
+ #pragma unroll
+ for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) {
+ const int input_offset = num_x_input * (j + plane_kernel_offset);
+ #pragma unroll
+ for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y);
+ s[i + input_offset] = eval.coeff(tensor_index);
+ }
+ }
+
+ __syncthreads();
+
+ // Convolution
+ const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p);
+
+ #pragma unroll
+ for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) {
+ #pragma unroll
+ for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) {
+ float result = 0.0f;
+ #pragma unroll
+ for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
+ const int kernel_offset = kernelSizeX * l;
+ const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
+ #pragma unroll
+ for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
+ result += s[k + input_offset] * kernel[k + kernel_offset];
+ }
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+ buffer[tensor_index] = result;
+ }
+ }
+
+ __syncthreads();
+ }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims>
+__global__ void EigenConvolutionKernel3D(
+ InputEvaluator eval,
+ const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
+ indexMapper,
+ const float* __restrict kernel, const size_t numPlanes, const size_t numX,
+ const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
+ const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
+ const size_t kernelSizeZ, float* buffer) {
+ HIP_DYNAMIC_SHARED( float, s)
+
+ // Load inputs to shared memory
+ const int first_x = hipBlockIdx_x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + kernelSizeX;
+
+ const int first_y = hipBlockIdx_y * maxY;
+ const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+ const int num_y_input = last_y - first_y + kernelSizeY;
+
+ const int first_z = hipBlockIdx_z * maxZ;
+ const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
+ const int num_z_input = last_z - first_z + kernelSizeZ;
+
+ for (int p = 0; p < numPlanes; ++p) {
+
+ const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = 0;
+
+ for (int k = hipThreadIdx_z; k < num_z_input; k += hipBlockDim_z) {
+ for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) {
+ for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+ s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
+ }
+ }
+ }
+
+ __syncthreads();
+
+ // Convolution
+ const int num_z_output = last_z - first_z + 1;
+ const int num_y_output = last_y - first_y + 1;
+ const int num_x_output = last_x - first_x + 1;
+ const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p);
+
+ for (int k = hipThreadIdx_z; k < num_z_output; k += hipBlockDim_z) {
+ for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) {
+ for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) {
+ float result = 0.0f;
+ for (int n = 0; n < kernelSizeZ; ++n) {
+ for (int m = 0; m < kernelSizeY; ++m) {
+ for (int l = 0; l < kernelSizeX; ++l) {
+ result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
+ }
+ }
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+ buffer[tensor_index] = result;
+ }
+ }
+ }
+ __syncthreads();
+ }
+};
+
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
+{
+ typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+ static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
+ static const int NumKernelDims = internal::array_size<Indices>::value;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
+
+ enum {
+ IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
+ PacketAccess = false,
+ Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
+ : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
+ const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+ m_dimensions = m_inputImpl.dimensions();
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {}
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+ typedef typename InputArgType::Scalar Scalar;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ preloadKernel();
+ m_inputImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ executeEval(data);
+ return false;
+ } else {
+ m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
+ executeEval(m_buf);
+ return true;
+ }
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_inputImpl.cleanup();
+ if (m_buf) {
+ m_device.deallocate(m_buf);
+ m_buf = NULL;
+ }
+ if (m_local_kernel) {
+ m_device.deallocate((void*)m_kernel);
+ m_local_kernel = false;
+ }
+ m_kernel = NULL;
+ }
+
+ EIGEN_STRONG_INLINE void preloadKernel() {
+ // Don't make a local copy of the kernel unless we have to (i.e. it's an
+ // expression that needs to be evaluated)
+ const Scalar* in_place = m_kernelImpl.data();
+ if (in_place) {
+ m_kernel = in_place;
+ m_local_kernel = false;
+ } else {
+ size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ typedef TensorEvalToOp<const KernelArgType> EvalTo;
+ EvalTo evalToTmp(local, m_kernelArg);
+ const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
+ internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
+
+ m_kernel = local;
+ m_local_kernel = true;
+ }
+ }
+
+ static unsigned int ceil(unsigned int num, unsigned int denom) {
+ const unsigned int rounded_toward_zero = num / denom;
+ if (num > rounded_toward_zero * denom) {
+ return rounded_toward_zero + 1;
+ }
+ return rounded_toward_zero;
+ }
+
+ void executeEval(Scalar* data) const {
+ typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
+
+ const int maxSharedMem = m_device.sharedMemPerBlock();
+ const int maxThreadsPerBlock = m_device.maxHipThreadsPerBlock();
+ const int maxBlocksPerProcessor = m_device.maxHipThreadsPerMultiProcessor() / maxThreadsPerBlock;
+ const int numMultiProcessors = m_device.getNumHipMultiProcessors();
+ const int hipWarpSize = 32;
+
+ switch (NumKernelDims) {
+ case 1: {
+ const int kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+ const int numX = dimensions()[m_indices[0]];
+ const int numP = dimensions().TotalSize() / numX;
+ int maxX;
+ dim3 block_size;
+
+ const int single_stride_dim =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : m_inputImpl.dimensions().rank() - 1;
+ if (m_indices[0] == single_stride_dim) {
+          // Maximize the reuse
+ const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
+ maxX = numext::mini<int>(inner_dim, numX);
+ const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
+ block_size.x = numext::mini(maxThreadsPerBlock, maxX);
+ block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
+ }
+ else {
+          // Read as much as possible along the innermost dimension, that is the plane
+ const int inner_dim = maxSharedMem / ((hipWarpSize + kernel_size) * sizeof(Scalar));
+ const int maxP = numext::mini<int>(inner_dim, numP);
+ maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
+
+ block_size.x = numext::mini(hipWarpSize, maxX);
+ block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
+ }
+
+ const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
+ assert(shared_mem <= maxSharedMem);
+
+ const int num_x_blocks = ceil(numX, maxX);
+ const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+ const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
+
+ dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
+
+
+ //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+ const array<Index, 1> indices(m_indices[0]);
+ const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
+ internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
+ m_inputImpl.dimensions(), kernel_dims, indices);
+ switch(kernel_size) {
+ case 4: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+ break;
+ }
+ case 7: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+ break;
+ }
+ default: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+ }
+ }
+ break;
+ }
+
+ case 2: {
+ const int idxX =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
+ const int idxY =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
+ const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
+ const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
+
+ const int numX = dimensions()[m_indices[idxX]];
+ const int numY = dimensions()[m_indices[idxY]];
+ const int numP = dimensions().TotalSize() / (numX*numY);
+
+ const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
+
+ // Snap maxX to warp size
+ int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
+ const int maxX = numext::mini<int>(inner_dim, numX);
+ const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
+ const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
+
+ dim3 block_size;
+ block_size.x = numext::mini(1024, maxX);
+ block_size.y = numext::mini<int>(1024/block_size.x, maxY);
+ block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
+
+ const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
+ assert(shared_mem <= maxSharedMem);
+
+ const int num_x_blocks = ceil(numX, maxX);
+ const int num_y_blocks = ceil(numY, maxY);
+ const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+ const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
+
+ dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
+
+
+ //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+ const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
+ const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
+ m_kernelImpl.dimensions()[idxY]);
+ internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
+ m_inputImpl.dimensions(), kernel_dims, indices);
+ switch (kernel_size_x) {
+ case 4: {
+ switch (kernel_size_y) {
+ case 7: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+ break;
+ }
+ default: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+ case 7: {
+ switch (kernel_size_y) {
+ case 4: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
+ break;
+ }
+ default: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+ default: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+
+ case 3: {
+ const int idxX =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
+ const int idxY =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
+ const int idxZ =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
+
+ const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
+ const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
+ const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
+
+ const int numX = dimensions()[m_indices[idxX]];
+ const int numY = dimensions()[m_indices[idxY]];
+ const int numZ = dimensions()[m_indices[idxZ]];
+ const int numP = dimensions().TotalSize() / (numX*numY*numZ);
+
+ const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
+ const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
+ const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
+
+ dim3 block_size;
+ block_size.x = numext::mini(32, maxX);
+ block_size.y = numext::mini(32, maxY);
+ block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
+ dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
+
+ const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
+ assert(shared_mem <= maxSharedMem);
+
+ //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+ const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
+ m_indices[idxZ]);
+ const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
+ m_kernelImpl.dimensions()[idxY],
+ m_kernelImpl.dimensions()[idxZ]);
+ internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
+ m_inputImpl.dimensions(), kernel_dims, indices);
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel,
+ numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+ break;
+ }
+
+ default: {
+ EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ eigen_assert(m_buf);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return m_buf[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
+ {
+ eigen_assert(m_buf);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+ // model.
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost =
+ TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+ m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+ PacketSize));
+ }
+
+ private:
+ // No assignment (copies are needed by the kernels)
+ TensorEvaluator& operator = (const TensorEvaluator&);
+
+ TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
+ TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
+ KernelArgType m_kernelArg;
+ Indices m_indices;
+ Dimensions m_dimensions;
+ Scalar* m_buf;
+ const Scalar* m_kernel;
+ bool m_local_kernel;
+
+ const GpuDevice& m_device;
+};
+#endif
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
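
For context, a hedged usage sketch (in the spirit of the accompanying HIP tensor tests, not part of this header) showing how the GpuDevice convolution evaluator above gets exercised for a 1D convolution along dimension 0. The helper name and buffer handling are illustrative only; EIGEN_USE_GPU and EIGEN_USE_HIP must be defined by the application.

  #define EIGEN_USE_GPU
  #define EIGEN_USE_HIP
  #include <hip/hip_runtime.h>
  #include <unsupported/Eigen/CXX11/Tensor>

  // Computes host_out = convolve(host_in, host_kernel) on the GPU, going
  // through the GpuDevice evaluator and EigenConvolutionKernel1D above.
  void convolve1dOnGpu(const float* host_in, int num_in,
                       const float* host_kernel, int num_k,
                       float* host_out) {
    const int num_out = num_in - num_k + 1;

    float *d_in = NULL, *d_kernel = NULL, *d_out = NULL;
    hipMalloc((void**)&d_in, num_in * sizeof(float));
    hipMalloc((void**)&d_kernel, num_k * sizeof(float));
    hipMalloc((void**)&d_out, num_out * sizeof(float));
    hipMemcpy(d_in, host_in, num_in * sizeof(float), hipMemcpyHostToDevice);
    hipMemcpy(d_kernel, host_kernel, num_k * sizeof(float), hipMemcpyHostToDevice);

    Eigen::HipStreamDevice stream;        // declared in TensorDeviceHip.h below
    Eigen::GpuDevice gpu_device(&stream);

    Eigen::TensorMap<Eigen::Tensor<float, 1> > in(d_in, num_in);
    Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel(d_kernel, num_k);
    Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, num_out);

    Eigen::array<Eigen::DenseIndex, 1> dims;
    dims[0] = 0;
    out.device(gpu_device) = in.convolve(kernel, dims);

    gpu_device.synchronize();
    hipMemcpy(host_out, d_out, num_out * sizeof(float), hipMemcpyDeviceToHost);
    hipFree(d_in); hipFree(d_kernel); hipFree(d_out);
  }
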
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index 341889e88..e94e577fc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -35,9 +35,12 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
-#ifndef EIGEN_CUDA_ARCH
+#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on the host CPU
return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ return 64;
#else
// Running on a CUDA device
return 32;
@@ -45,7 +48,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on the host CPU
return l1CacheSize();
#else
@@ -55,7 +58,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running single threaded on the host CPU
return l3CacheSize();
#else
@@ -65,10 +68,14 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef EIGEN_CUDA_ARCH
+#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running single threaded on the host CPU
// Should return an enum that encodes the ISA supported by the CPU
return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ // return 1 as major for HIP
+ return 1;
#else
// Running on a CUDA device
return EIGEN_CUDA_ARCH / 100;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h
new file mode 100644
index 000000000..c0e110987
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h
@@ -0,0 +1,352 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H
+
+#if defined(EIGEN_HIPCC)
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#endif
+#include <unistd.h> //for sleep function
+
+namespace Eigen {
+
+static const int kHipScratchSize = 1024;
+
+// This defines an interface that GPUDevice can take to use
+// HIP streams underneath.
+class StreamInterface {
+ public:
+ virtual ~StreamInterface() {}
+
+ virtual const hipStream_t& stream() const = 0;
+ virtual const hipDeviceProp_t& deviceProperties() const = 0;
+
+ // Allocate memory on the actual device where the computation will run
+ virtual void* allocate(size_t num_bytes) const = 0;
+ virtual void deallocate(void* buffer) const = 0;
+
+ // Return a scratchpad buffer of size 1k
+ virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initialized to 0, and each kernel
+  // using it is responsible for resetting it to 0 upon completion, to
+  // maintain the invariant that the semaphore is always equal to 0 when
+  // each kernel starts.
+ virtual unsigned int* semaphore() const = 0;
+};
+
+static hipDeviceProp_t* m_deviceProperties;
+static bool m_devicePropInitialized = false;
+
+static void initializeDeviceProp() {
+ if (!m_devicePropInitialized) {
+ // Attempts to ensure proper behavior in the case of multiple threads
+ // calling this function simultaneously. This would be trivial to
+    // implement if we could use std::mutex, but unfortunately mutexes don't
+ // compile with nvcc, so we resort to atomics and thread fences instead.
+    // Note that if the caller uses a compiler that doesn't support C++11,
+    // we can't ensure that the initialization is thread safe.
+#if 0 && __cplusplus >= 201103L
+ static std::atomic<bool> first(true);
+ if (first.exchange(false)) {
+#else
+ static bool first = true;
+ if (first) {
+ first = false;
+#endif
+ // We're the first thread to reach this point.
+ int num_devices;
+ hipError_t status = hipGetDeviceCount(&num_devices);
+ if (status != hipSuccess) {
+ std::cerr << "Failed to get the number of HIP devices: "
+ << hipGetErrorString(status)
+ << std::endl;
+ assert(status == hipSuccess);
+ }
+ m_deviceProperties = new hipDeviceProp_t[num_devices];
+ for (int i = 0; i < num_devices; ++i) {
+ status = hipGetDeviceProperties(&m_deviceProperties[i], i);
+ if (status != hipSuccess) {
+ std::cerr << "Failed to initialize HIP device #"
+ << i
+ << ": "
+ << hipGetErrorString(status)
+ << std::endl;
+ assert(status == hipSuccess);
+ }
+ }
+
+#if 0 && __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_release);
+#endif
+ m_devicePropInitialized = true;
+ } else {
+      // Wait for the other thread to initialize the properties.
+ while (!m_devicePropInitialized) {
+#if 0 && __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+ sleep(1);
+ }
+ }
+ }
+}
+
+static const hipStream_t default_stream = 0x00;//TODO: Use hipStreamDefault instead of 0x00;
+
+class HipStreamDevice : public StreamInterface {
+ public:
+ // Use the default stream on the current device
+ HipStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
+ hipGetDevice(&device_);
+ initializeDeviceProp();
+ }
+ // Use the default stream on the specified device
+ HipStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+ initializeDeviceProp();
+ }
+  // Use the specified stream. Note that it's the
+  // caller's responsibility to ensure that the stream can run on
+  // the specified device. If no device is specified, the code
+  // assumes that the stream is associated with the current GPU device.
+ HipStreamDevice(const hipStream_t* stream, int device = -1)
+ : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+ if (device < 0) {
+ hipGetDevice(&device_);
+ } else {
+ int num_devices;
+ hipError_t err = hipGetDeviceCount(&num_devices);
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ assert(device < num_devices);
+ device_ = device;
+ }
+ initializeDeviceProp();
+ }
+
+ virtual ~HipStreamDevice() {
+ if (scratch_) {
+ deallocate(scratch_);
+ }
+ }
+
+ const hipStream_t& stream() const { return *stream_; }
+ const hipDeviceProp_t& deviceProperties() const {
+ return m_deviceProperties[device_];
+ }
+ virtual void* allocate(size_t num_bytes) const {
+ hipError_t err = hipSetDevice(device_);
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ void* result;
+ err = hipMalloc(&result, num_bytes);
+ assert(err == hipSuccess);
+ assert(result != NULL);
+ return result;
+ }
+ virtual void deallocate(void* buffer) const {
+ hipError_t err = hipSetDevice(device_);
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ assert(buffer != NULL);
+ err = hipFree(buffer);
+ assert(err == hipSuccess);
+ }
+
+ virtual void* scratchpad() const {
+ if (scratch_ == NULL) {
+ scratch_ = allocate(kHipScratchSize + sizeof(unsigned int));
+ }
+ return scratch_;
+ }
+
+ virtual unsigned int* semaphore() const {
+ if (semaphore_ == NULL) {
+ char* scratch = static_cast<char*>(scratchpad()) + kHipScratchSize;
+ semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+ //hipError_t err = hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+ hipError_t err = hipMemset(semaphore_, 0, sizeof(unsigned int));
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ }
+ return semaphore_;
+ }
+
+ private:
+ const hipStream_t* stream_;
+ int device_;
+ mutable void* scratch_;
+ mutable unsigned int* semaphore_;
+};
+
+struct GpuDevice {
+ // The StreamInterface is not owned: the caller is
+ // responsible for its initialization and eventual destruction.
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+ eigen_assert(stream);
+ }
+ explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+ eigen_assert(stream);
+ }
+ // TODO(bsteiner): This is an internal API, we should not expose it.
+ EIGEN_STRONG_INLINE const hipStream_t& stream() const {
+ return stream_->stream();
+ }
+
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return stream_->allocate(num_bytes);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ stream_->deallocate(buffer);
+ }
+
+ EIGEN_STRONG_INLINE void* scratchpad() const {
+ return stream_->scratchpad();
+ }
+
+ EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+ return stream_->semaphore();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#if !defined(EIGEN_HIP_DEVICE_COMPILE)
+ hipError_t err = hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToDevice,
+ stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+ hipError_t err =
+ hipMemcpyAsync(dst, src, n, hipMemcpyHostToDevice, stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ }
+
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+ hipError_t err =
+ hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#if !defined(EIGEN_HIP_DEVICE_COMPILE)
+ //TODO:hipError_t err = hipMemsetAsync(buffer, c, n, stream_->stream());
+ hipError_t err = hipMemset(buffer, c, n);
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE size_t numThreads() const {
+ // FIXME
+ return 32;
+ }
+
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ // FIXME
+ return 48*1024;
+ }
+
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the L2 cache for the time being, and
+    // there is no L3 cache on HIP devices.
+ return firstLevelCacheSize();
+ }
+
+// FIXME - this will move into HIP
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#undef assert
+#define assert(COND)
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#if defined(EIGEN_HIPCC) && \
+ !defined(EIGEN_HIP_DEVICE_COMPILE)
+ hipError_t err = hipStreamSynchronize(stream_->stream());
+ if (err != hipSuccess) {
+ std::cerr << "Error detected in HIP stream: "
+ << hipGetErrorString(err)
+ << std::endl;
+ assert(err == hipSuccess);
+ }
+#else
+ assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE int getNumHipMultiProcessors() const {
+ return stream_->deviceProperties().multiProcessorCount;
+ }
+ EIGEN_STRONG_INLINE int maxHipThreadsPerBlock() const {
+ return stream_->deviceProperties().maxThreadsPerBlock;
+ }
+ EIGEN_STRONG_INLINE int maxHipThreadsPerMultiProcessor() const {
+ return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+ }
+ EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+ return stream_->deviceProperties().sharedMemPerBlock;
+ }
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+ return stream_->deviceProperties().major;
+ }
+ EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+ return stream_->deviceProperties().minor;
+ }
+
+ EIGEN_STRONG_INLINE int maxBlocks() const {
+ return max_blocks_;
+ }
+
+ // This function checks if the HIP runtime recorded an error for the
+ // underlying stream device.
+ inline bool ok() const {
+#if defined(EIGEN_HIPCC)
+ hipError_t error = hipStreamQuery(stream_->stream());
+ return (error == hipSuccess) || (error == hipErrorNotReady);
+#else
+ return false;
+#endif
+ }
+
+ private:
+ const StreamInterface* stream_;
+ int max_blocks_;
+};
+
+#define LAUNCH_HIP_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), (__VA_ARGS__)); \
+ assert(hipGetLastError() == hipSuccess);
+
+
+// FIXME: Should be device and kernel specific.
+#if defined(EIGEN_HIPCC)
+static EIGEN_DEVICE_FUNC inline void setHipSharedMemConfig(hipSharedMemConfig config) {
+#if !defined(EIGEN_HIP_DEVICE_COMPILE)
+ hipError_t status = hipDeviceSetSharedMemConfig(config);
+ EIGEN_UNUSED_VARIABLE(status)
+ assert(status == hipSuccess);
+#else
+ EIGEN_UNUSED_VARIABLE(config)
+#endif
+}
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H
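For reference, here is a minimal host-side sketch of how the device defined above is typically driven: wrap a HipStreamDevice in a GpuDevice, obtain device buffers through the stream interface, and assign a tensor expression via .device(). The function and buffer names are illustrative (not part of the patch); the pattern mirrors the HIP unit tests added later in this commit, and it assumes the translation unit is compiled with hipcc with EIGEN_USE_GPU defined.

// Illustrative sketch only -- assumes hipcc and EIGEN_USE_GPU.
#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

void scale_on_gpu(const float* host_in, float* host_out, int n) {
  const std::size_t bytes = n * sizeof(float);

  Eigen::HipStreamDevice stream;         // default HIP stream on the current device
  Eigen::GpuDevice gpu_device(&stream);  // the stream is not owned by the device

  // Device buffers go through the StreamInterface allocator.
  float* d_in  = static_cast<float*>(gpu_device.allocate(bytes));
  float* d_out = static_cast<float*>(gpu_device.allocate(bytes));
  gpu_device.memcpyHostToDevice(d_in, host_in, bytes);

  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_in(d_in, n);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_out(d_out, n);

  // Evaluated on the GPU through the TensorExecutor / hipLaunchKernelGGL path.
  gpu_out.device(gpu_device) = gpu_in * gpu_in.constant(2.0f);

  gpu_device.memcpyDeviceToHost(host_out, d_out, bytes);
  gpu_device.synchronize();
  gpu_device.deallocate(d_in);
  gpu_device.deallocate(d_out);
}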
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 0ffe68ab3..24a57970a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
};
-#if defined(EIGEN_CUDACC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE
@@ -250,6 +250,17 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
+#if defined(EIGEN_HIPCC)
+ const int block_size = device.maxHipThreadsPerBlock();
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / block_size;
+ const Index size = array_prod(evaluator.dimensions());
+      // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), evaluator, size);
+#else
const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size;
@@ -260,11 +271,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
num_blocks, block_size, 0, device, evaluator, size);
+#endif
}
evaluator.cleanup();
}
-#endif // EIGEN_CUDACC
+#endif // EIGEN_CUDACC || EIGEN_HIPCC
#endif // EIGEN_USE_GPU
// SYCL Executor policy
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index c015ce196..b8f0bc798 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -109,7 +109,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
const Index numValues = internal::array_prod(m_impl.dimensions());
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Should initialize the memory in case we're dealing with non POD types.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 3209fecd3..835efbf72 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -350,7 +350,11 @@ struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
namespace internal {
-template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+template<typename FirstType, typename... OtherTypes>
+ #if defined(EIGEN_HIPCC)
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+ #endif
+ size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
size_t result = 1;
for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
result *= sizes[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index c9e61f359..8e1ba486d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -27,7 +27,7 @@
*/
// SFINAE requires variadic templates
-#ifndef EIGEN_CUDACC
+#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
#if EIGEN_HAS_VARIADIC_TEMPLATES
// SFINAE doesn't work for gcc <= 4.7
#ifdef EIGEN_COMP_GNUC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 5431eb740..de1075cc1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
};
// For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)
+#if defined(EIGEN_USE_GPU) && ((defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)) || (defined(EIGEN_HIPCC) && defined(EIGEN_HAS_HIP_FP16)))
template <>
struct PacketType<half, GpuDevice> {
typedef half2 type;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index e59074506..2a979845b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -858,7 +858,10 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
}
return inputIndex;
}
-
+
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
#ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value));
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 230915db2..71536a4b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -16,7 +16,7 @@ namespace internal {
namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
// We don't support 3d kernels since we currently only use 1 and
// 2d kernels.
assert(threadIdx.z == 0);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index da0ffe728..d2fb3fd32 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -334,12 +334,12 @@ struct OuterReducer {
};
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
-#ifdef EIGEN_HAS_CUDA_FP16
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
template <typename S, typename R, typename I>
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I>
@@ -495,7 +495,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
+ EIGEN_STRONG_INLINE
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
m_impl.evalSubExprsIfNeeded(NULL);
// Use the FullReducer if possible.
@@ -694,9 +698,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
#ifdef EIGEN_USE_THREADS
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
-#ifdef EIGEN_HAS_CUDA_FP16
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
@@ -774,14 +778,22 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
// Indexed by reduced dimensions.
array<Index, NumReducedDims> m_reducedDims;
+#if defined(EIGEN_HIPCC)
+ public:
+#endif
+
// Evaluator for the input expression.
TensorEvaluator<ArgType, Device> m_impl;
+#if defined(EIGEN_HIPCC)
+ private:
+#endif
+
// Operation to apply for computing the reduction.
Op m_reducer;
// For full reductions
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h
new file mode 100644
index 000000000..5304a22c5
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h
@@ -0,0 +1,815 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+#endif
+
+#if defined(EIGEN_HIPCC)
+#define HIP_WARP_SIZE 64
+#endif
+
+namespace Eigen {
+namespace internal {
+
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple HIP threads to safely accumulate at the same
+// output address. It reads the current value of the output variable and attempts to
+// update it with the new value. If, in the meantime, another HIP thread has updated the
+// content of the output address, it tries again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+ if (sizeof(T) == 4)
+ {
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+ }
+ else if (sizeof(T) == 8) {
+ unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+ unsigned long long newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned long long readback;
+ while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+ }
+ else {
+ assert(0 && "Wordsize not supported");
+ }
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+ return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+ unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+ return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#if defined(EIGEN_HAS_HIP_FP16)
+template <template <typename T> class R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+}
+#endif
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
+ defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+ atomicAdd(output, accum);
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+
+template <typename CoeffType, typename Index>
+__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+ const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ const Index num_threads = hipBlockDim_x * hipGridDim_x;
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = val;
+ }
+}
+
+
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void FullReductionKernel(const Self input, Index num_coeffs,
+ typename Self::CoeffReturnType* output, unsigned int* semaphore, Reducer reducer) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
+ defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+ // Initialize the output value
+ const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + hipThreadIdx_x;
+ if (hipGridDim_x == 1) {
+ if (first_index == 0) {
+ *output = reducer.initialize();
+ }
+ }
+ else {
+ if (hipThreadIdx_x == 0) {
+ unsigned int block = atomicCAS(semaphore, 0u, 1u);
+ if (block == 0) {
+ // We're the first block to run, initialize the output value
+ atomicExchCustom(output, reducer.initialize());
+ __threadfence();
+ atomicExch(semaphore, 2u);
+ }
+ else {
+ // Wait for the first block to initialize the output value.
+ // Use atomicCAS here to ensure that the reads aren't cached
+ unsigned int val;
+ do {
+ val = atomicCAS(semaphore, 2u, 2u);
+ }
+ while (val < 2u);
+ }
+ }
+ }
+
+ __syncthreads();
+
+ eigen_assert(hipGridDim_x == 1 || *semaphore >= 2u);
+
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
+ for (Index i = 0; i < max_iter; i+=BlockSize) {
+ const Index index = first_index + i;
+ eigen_assert(index < num_coeffs);
+ typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+ reducer.reduce(val, &accum);
+ }
+
+#pragma unroll
+ for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
+    // Select the float or int __shfl_down overload based on the type of accum.
+ if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
+ reducer.reduce(__shfl_down(static_cast<float>(accum), offset, HIP_WARP_SIZE), &accum);
+ } else {
+ reducer.reduce(__shfl_down(static_cast<int>(accum), offset, HIP_WARP_SIZE), &accum);
+ }
+ }
+
+ if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
+ atomicReduce(output, accum, reducer);
+ }
+
+ if (hipGridDim_x > 1 && hipThreadIdx_x == 0) {
+ // Let the last block reset the semaphore
+ atomicInc(semaphore, hipGridDim_x + 1);
+ __threadfence_system();
+ }
+
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+
+#if defined(EIGEN_HAS_HIP_FP16)
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
+ eigen_assert(hipBlockDim_x == 1);
+ eigen_assert(hipGridDim_x == 1);
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
+}
+
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+ const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ const Index num_threads = hipBlockDim_x * hipGridDim_x;
+ const Index num_packets = num_coeffs / 2;
+ for (Index i = thread_id; i < num_packets; i += num_threads) {
+ ((half2*)output)[i] = reducer.template initializePacket<half2>();
+ }
+
+ if (thread_id == 0 && num_coeffs % 2 != 0) {
+ output[num_coeffs-1] = reducer.initialize();
+ }
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+ half* output, half2* scratch) {
+ eigen_assert(NumPerThread % 2 == 0);
+
+ const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + 2*hipThreadIdx_x;
+
+ // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+ if (hipGridDim_x == 1 && first_index == 0) {
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
+ __syncthreads();
+ }
+
+ half2 accum = reducer.template initializePacket<half2>();
+ const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
+ for (Index i = 0; i < max_iter; i += BlockSize) {
+ const Index index = first_index + 2*i;
+ eigen_assert(index + 1 < num_coeffs);
+ half2 val = input.m_impl.template packet<Unaligned>(index);
+ reducer.reducePacket(val, &accum);
+ }
+
+#pragma unroll
+ for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+ union { int i; half2 h; } wka_in, wka_out;
+ wka_in.h = accum;
+ wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
+ reducer.reducePacket(wka_out.h, &accum);
+ }
+
+ if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
+ atomicReduce(scratch, accum, reducer);
+ }
+
+ __syncthreads();
+
+ if (hipGridDim_x == 1 && first_index == 0) {
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+ }
+}
+
+template <typename Op>
+__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
+ eigen_assert(hipThreadIdx_x == 1);
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+}
+
+#endif
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+ static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+ assert(false && "Should only be called on doubles, floats and half floats");
+ }
+};
+
+namespace {
+ std::mutex __eigen_reduction_hip_mutex;
+}
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
+ // guard FullReductionLauncher with a mutex so only 1 FullReductionKernel
+ // is dispatched at a time
+ std::lock_guard<std::mutex> lock(__eigen_reduction_hip_mutex);
+
+ typedef typename Self::Index Index;
+ typedef typename Self::CoeffReturnType Scalar;
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+
+ unsigned int* semaphore = NULL;
+ if (num_blocks > 1) {
+ semaphore = device.semaphore();
+
+ unsigned int semaphore_host = 0xFF;
+ hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
+ if (semaphore_host != 0) {
+ std::cerr << "[WARN][EIGEN][FullReductionLauncher] incorrect semaphore value: "
+ << semaphore_host << "\n";
+ // wait for all commands on the device to complete so semaphore value
+ // is reset to 0
+ hipDeviceSynchronize();
+
+ // read again
+ hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
+ if (semaphore_host != 0) {
+ std::cerr << "[ERROR][EIGEN][FullReductionLauncher] CRITICAL incorrect semaphore value: "
+ << semaphore_host << ", apply manual override to 0\n";
+
+ // force set semaphore value to be 0
+ semaphore_host = 0;
+ hipMemcpy(semaphore, &semaphore_host, sizeof(unsigned int), hipMemcpyHostToDevice);
+ }
+ }
+ }
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), self, num_coeffs, output, semaphore, reducer);
+ }
+};
+
+#if defined(EIGEN_HAS_HIP_FP16)
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+ static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+ assert(false && "Should not be called since there is no packet accessor");
+ }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+ typedef typename Self::Index Index;
+
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ half2* scratch = static_cast<half2*>(device.scratchpad());
+
+ if (num_blocks > 1) {
+      // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
+      // won't be a race condition between multiple thread blocks.
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+ dim3(1), dim3(1), 0, device.stream(), reducer, self, num_coeffs, scratch);
+ }
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs, output, scratch);
+
+ if (num_blocks > 1) {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionCleanupKernelHalfFloat<Op>),
+ dim3(1), dim3(1), 0, device.stream(), reducer, output, scratch);
+ }
+ }
+};
+#endif
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately the GPU backend doesn't handle exotic types such as complex
+  // numbers well, so reduce the scope of the optimized version of the code to
+  // the simple cases of doubles, floats and half floats.
+#if defined(EIGEN_HAS_HIP_FP16)
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+ template <typename OutputType>
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return;
+ }
+
+ FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+ }
+};
+
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ typename Self::CoeffReturnType* output) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
+ defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+ typedef typename Self::CoeffReturnType Type;
+ eigen_assert(hipBlockDim_y == 1);
+ eigen_assert(hipBlockDim_z == 1);
+ eigen_assert(hipGridDim_y == 1);
+ eigen_assert(hipGridDim_z == 1);
+
+ const int unroll_times = 16;
+ eigen_assert(NumPerThread % unroll_times == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread);
+ const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+ const Index num_threads = hipBlockDim_x * hipGridDim_x;
+ const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (hipGridDim_x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
+ const Index row = i / input_col_blocks;
+
+ if (row < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin = col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x;
+
+ Type reduced_val = reducer.initialize();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1);
+ if (last_col >= num_coeffs_to_reduce) {
+ for (Index col = col_begin + hipBlockDim_x * j; col < num_coeffs_to_reduce; col += hipBlockDim_x) {
+ const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ reducer.reduce(val, &reduced_val);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + hipBlockDim_x * (j + k);
+ reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
+        // Select the float or int __shfl_down overload based on the type of reduced_val.
+ if (std::is_floating_point<Type>::value) {
+ reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
+ } else {
+ reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
+ }
+ }
+
+ if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
+ atomicReduce(&(output[row]), reduced_val, reducer);
+ }
+ }
+ }
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+#if defined(EIGEN_HAS_HIP_FP16)
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ half* output) {
+ eigen_assert(hipBlockDim_y == 1);
+ eigen_assert(hipBlockDim_z == 1);
+ eigen_assert(hipGridDim_y == 1);
+ eigen_assert(hipGridDim_z == 1);
+
+ const int unroll_times = 16;
+ eigen_assert(NumPerThread % unroll_times == 0);
+ eigen_assert(unroll_times % 2 == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread * 2);
+ const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+
+ const Index num_threads = hipBlockDim_x * hipGridDim_x;
+ const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (hipGridDim_x == 1) {
+ Index i = 2*thread_id;
+ for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
+ half* loc = output + i;
+ *((half2*)loc) = reducer.template initializePacket<half2>();
+ }
+ if (i < num_preserved_coeffs) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
+ const Index row = 2 * (i / input_col_blocks);
+
+ if (row + 1 < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin = 2 * (col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x);
+
+ half2 reduced_val1 = reducer.template initializePacket<half2>();
+ half2 reduced_val2 = reducer.template initializePacket<half2>();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1) * 2;
+ if (last_col >= num_coeffs_to_reduce) {
+ Index col = col_begin + hipBlockDim_x * j;
+ for (; col + 1 < num_coeffs_to_reduce; col += hipBlockDim_x) {
+ const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val1, &reduced_val1);
+ const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val2, &reduced_val2);
+ }
+ if (col < num_coeffs_to_reduce) {
+ // Peel;
+ const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ const half2 val1 = __halves2half2(last1, reducer.initialize());
+ reducer.reducePacket(val1, &reduced_val1);
+ const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
+ const half2 val2 = __halves2half2(last2, reducer.initialize());
+ reducer.reducePacket(val2, &reduced_val2);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + hipBlockDim_x * (j + k) * 2;
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+ union { int i; half2 h; } wka_in, wka_out;
+
+ wka_in.h = reduced_val1;
+ wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
+ reducer.reducePacket(wka_out.h, &reduced_val1);
+
+ wka_in.h = reduced_val2;
+ wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
+ reducer.reducePacket(wka_out.h, &reduced_val2);
+ }
+
+ half val1 = __low2half(reduced_val1);
+ reducer.reduce(__high2half(reduced_val1), &val1);
+ half val2 = __low2half(reduced_val2);
+ reducer.reduce(__high2half(reduced_val2), &val2);
+ half2 val = __halves2half2(val1, val2);
+
+ if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
+ half* loc = output + row;
+ atomicReduce((half2*)loc, val, reducer);
+ }
+ }
+ }
+}
+
+#endif
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {
+ static bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
+ return true;
+ }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race condition between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<OutputType, Index>),
+ dim3(num_blocks), dim3(1024), 0, device.stream(),
+ reducer.initialize(), num_preserved_vals, output);
+ }
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernel<num_per_thread, Self, Op, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self,
+ num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+
+#if defined(EIGEN_HAS_HIP_FP16)
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+ static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should not be called since there is no packet accessor");
+ return true;
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ if (num_preserved_vals % 2 != 0) {
+ // Not supported yet, revert to the slower code path
+ return true;
+ }
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = /*256*/128;
+ const int num_per_thread = /*128*/64;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race condition between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernelHalfFloat<Self, Op, Index>),
+ dim3(1), dim3(1), 0, device.stream(), reducer, self, num_preserved_vals, output);
+ }
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+#endif
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+  // Unfortunately the GPU backend doesn't handle exotic types such as complex
+  // numbers well, so reduce the scope of the optimized version of the code to
+  // the simple cases of doubles, floats and half floats.
+#if defined(EIGEN_HAS_HIP_FP16)
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+ template <typename OutputType>
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return true;
+ }
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 128) {
+ return true;
+ }
+
+ return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+ }
+};
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ typename Self::CoeffReturnType* output) {
+ const Index num_threads = hipBlockDim_x * hipGridDim_x;
+ const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (hipGridDim_x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ // Do the reduction.
+ const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
+ for (Index i = thread_id; i < max_iter; i += num_threads) {
+ const Index input_col = i % num_preserved_coeffs;
+ const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
+ typename Self::CoeffReturnType reduced_val = reducer.initialize();
+ const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
+ for (Index j = input_row; j < max_row; j++) {
+ typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
+ reducer.reduce(val, &reduced_val);
+ }
+ atomicReduce(&(output[input_col]), reduced_val, reducer);
+ }
+}
+
+
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, GpuDevice> {
+  // Unfortunately the GPU backend doesn't handle exotic types such as complex
+  // numbers well, so reduce the scope of the optimized version of the code to
+  // the simple cases of floats and doubles.
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+ template <typename Device, typename OutputType>
+ static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+ return true;
+ }
+
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 32) {
+ return true;
+ }
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = 256;
+ const int num_per_thread = 16;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race condition between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<float, Index>),
+ dim3(num_blocks), dim3(1024), 0, device.stream(),
+ reducer.initialize(), num_preserved_vals, output);
+ }
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(OuterReductionKernel<num_per_thread, Self, Op, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+
+#endif
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
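For orientation, a sketch of the kind of expression these launchers serve: a full sum reduction of a float tensor evaluated on the GpuDevice, which is routed through FullReducer and the float FullReductionLauncher specialization above when compiled with hipcc. The setup (hipcc, EIGEN_USE_GPU, the Tensor header, the device wiring) is the same as in the sketch after TensorDeviceHip.h; the names are illustrative and the pattern mirrors the reduction unit test added below.

// Illustrative sketch only -- not part of the patch.
void sum_on_gpu(const float* host_in, float* host_sum, int rows, int cols) {
  Eigen::HipStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const std::size_t in_bytes = static_cast<std::size_t>(rows) * cols * sizeof(float);
  float* d_in  = static_cast<float*>(gpu_device.allocate(in_bytes));
  float* d_sum = static_cast<float*>(gpu_device.allocate(sizeof(float)));
  gpu_device.memcpyHostToDevice(d_in, host_in, in_bytes);

  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_in(d_in, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<float, 0> > gpu_sum(d_sum);

  // Full reduction: dispatched to FullReductionKernel via FullReductionLauncher.
  gpu_sum.device(gpu_device) = gpu_in.sum();

  gpu_device.memcpyDeviceToHost(host_sum, d_sum, sizeof(float));
  gpu_device.synchronize();
  gpu_device.deallocate(d_in);
  gpu_device.deallocate(d_sum);
}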
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 1f545ef1a..174a6a064 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -242,7 +242,7 @@ struct ScanLauncher {
}
};
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
@@ -278,10 +278,15 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
Index total_size = internal::array_prod(self.dimensions());
Index num_blocks = (total_size / self.size() + 63) / 64;
Index block_size = 64;
+#if defined(EIGEN_HIPCC)
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ScanKernel<Self, Reducer>), dim3(num_blocks),
+ dim3(block_size), 0, self.device().stream(), self, total_size, data);
+#else
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+#endif
}
};
-#endif // EIGEN_USE_GPU && EIGEN_CUDACC
+#endif // EIGEN_USE_GPU && (EIGEN_CUDACC || EIGEN_HIPCC)
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index 49d315a66..bb584e3f9 100644
--- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -268,6 +268,9 @@ template<
typename Reducer
> struct reduce<Reducer>
{
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
constexpr static inline int run() { return Reducer::Identity; }
};
@@ -276,6 +279,9 @@ template<
typename A
> struct reduce<Reducer, A>
{
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
constexpr static inline A run(A a) { return a; }
};
@@ -285,6 +291,9 @@ template<
typename... Ts
> struct reduce<Reducer, A, Ts...>
{
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
}
@@ -324,6 +333,9 @@ struct greater_equal_zero_op { template<typename A> constexpr static inline auto
// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
// does...
template<typename... Ts>
+#if defined(EIGEN_HIPCC)
+EIGEN_DEVICE_FUNC
+#endif
constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
{
return reduce<product_op, Ts...>::run(ts...);
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 96b3a8261..18b76350b 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -15,7 +15,7 @@
// The array class is only available starting with cxx11. Emulate our own here
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY)
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) || defined(EIGEN_AVOID_STL_ARRAY)
namespace Eigen {
template <typename T, size_t n> class array {
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 6c7ac3f3b..5ab67dc1f 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -190,7 +190,7 @@ template <>
struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
-#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
int dummy;
return ::lgammaf_r(x, &dummy);
#else
@@ -203,7 +203,7 @@ template <>
struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
-#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
int dummy;
return ::lgamma_r(x, &dummy);
#else
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 42b790d55..c3dba5f2a 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -297,3 +297,55 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
+
+# Add HIP specific tests
+if (EIGEN_TEST_HIP)
+
+ set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+ if (EXISTS ${HIP_PATH})
+
+ list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+ find_package(HIP REQUIRED)
+ if (HIP_FOUND)
+
+ execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+ if (${HIP_PLATFORM} STREQUAL "hcc")
+
+ include_directories(${CMAKE_CURRENT_BINARY_DIR})
+ include_directories(${HIP_PATH}/include)
+
+ set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+
+ # ei_add_test(cxx11_tensor_complex_hip)
+ # ei_add_test(cxx11_tensor_complex_cwise_ops_hip)
+ ei_add_test(cxx11_tensor_reduction_hip)
+ ei_add_test(cxx11_tensor_argmax_hip)
+ ei_add_test(cxx11_tensor_cast_float16_hip)
+ ei_add_test(cxx11_tensor_scan_hip)
+ ei_add_test(cxx11_tensor_device_hip)
+ ei_add_test(cxx11_tensor_hip)
+ ei_add_test(cxx11_tensor_contract_hip)
+ ei_add_test(cxx11_tensor_of_float16_hip)
+ ei_add_test(cxx11_tensor_random_hip)
+
+ unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+ elseif (${HIP_PLATFORM} STREQUAL "nvcc")
+ message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+ else ()
+ message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+ endif()
+
+ endif(HIP_FOUND)
+
+ else ()
+
+ message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+ endif()
+
+endif(EIGEN_TEST_HIP)
+
diff --git a/unsupported/test/cxx11_tensor_argmax_hip.cu b/unsupported/test/cxx11_tensor_argmax_hip.cu
new file mode 100644
index 000000000..57d6ca000
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_argmax_hip.cu
@@ -0,0 +1,251 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_hip
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int Layout>
+void test_hip_simple_argmax()
+{
+ Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
+ Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
+ Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
+ in.setRandom();
+ in *= in.constant(100.0);
+ in(0, 0, 0) = -1000.0;
+ in(71, 52, 96) = 1000.0;
+
+ std::size_t in_bytes = in.size() * sizeof(double);
+ std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
+
+ double* d_in;
+ DenseIndex* d_out_max;
+ DenseIndex* d_out_min;
+ hipMalloc((void**)(&d_in), in_bytes);
+ hipMalloc((void**)(&d_out_max), out_bytes);
+ hipMalloc((void**)(&d_out_min), out_bytes);
+
+ hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
+
+ gpu_out_max.device(gpu_device) = gpu_in.argmax();
+ gpu_out_min.device(gpu_device) = gpu_in.argmin();
+
+ assert(hipMemcpyAsync(out_max.data(), d_out_max, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipMemcpyAsync(out_min.data(), d_out_min, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
+ VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
+
+ hipFree(d_in);
+ hipFree(d_out_max);
+ hipFree(d_out_min);
+}
+
+template <int DataLayout>
+void test_hip_argmax_dim()
+{
+ Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+ std::vector<int> dims;
+ dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
+
+ for (int dim = 0; dim < 4; ++dim) {
+ tensor.setRandom();
+ tensor = (tensor + tensor.constant(0.5)).log();
+
+ array<DenseIndex, 3> out_shape;
+ for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+ Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
+
+ array<DenseIndex, 4> ix;
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+ if (ix[dim] != 0) continue;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+ tensor(ix) = 10.0;
+ }
+ }
+ }
+ }
+
+ std::size_t in_bytes = tensor.size() * sizeof(float);
+ std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+ float* d_in;
+ DenseIndex* d_out;
+ hipMalloc((void**)(&d_in), in_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
+
+ gpu_out.device(gpu_device) = gpu_in.argmax(dim);
+
+ assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ VERIFY_IS_EQUAL(tensor_arg.size(),
+ size_t(2*3*5*7 / tensor.dimension(dim)));
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect max to be in the first index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+ if (ix[dim] != tensor.dimension(dim) - 1) continue;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+ tensor(ix) = 20.0;
+ }
+ }
+ }
+ }
+
+ hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
+
+ gpu_out.device(gpu_device) = gpu_in.argmax(dim);
+
+ assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect max to be in the last index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+ }
+}
+
+template <int DataLayout>
+void test_hip_argmin_dim()
+{
+ Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+ std::vector<int> dims;
+ dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
+
+ for (int dim = 0; dim < 4; ++dim) {
+ tensor.setRandom();
+ tensor = (tensor + tensor.constant(0.5)).log();
+
+ array<DenseIndex, 3> out_shape;
+ for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+ Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
+
+ array<DenseIndex, 4> ix;
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+ if (ix[dim] != 0) continue;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
+ tensor(ix) = -10.0;
+ }
+ }
+ }
+ }
+
+ std::size_t in_bytes = tensor.size() * sizeof(float);
+ std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+ float* d_in;
+ DenseIndex* d_out;
+ hipMalloc((void**)(&d_in), in_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
+
+ gpu_out.device(gpu_device) = gpu_in.argmin(dim);
+
+ assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ VERIFY_IS_EQUAL(tensor_arg.size(),
+ 2*3*5*7 / tensor.dimension(dim));
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect min to be in the first index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+ if (ix[dim] != tensor.dimension(dim) - 1) continue;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
+ tensor(ix) = -20.0;
+ }
+ }
+ }
+ }
+
+ hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
+
+ gpu_out.device(gpu_device) = gpu_in.argmin(dim);
+
+ assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect min to be in the last index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+ }
+}
+
+void test_cxx11_tensor_hip()
+{
+ CALL_SUBTEST(test_hip_simple_argmax<RowMajor>());
+ CALL_SUBTEST(test_hip_simple_argmax<ColMajor>());
+ CALL_SUBTEST(test_hip_argmax_dim<RowMajor>());
+ CALL_SUBTEST(test_hip_argmax_dim<ColMajor>());
+ CALL_SUBTEST(test_hip_argmin_dim<RowMajor>());
+ CALL_SUBTEST(test_hip_argmin_dim<ColMajor>());
+}
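
The argmax test above follows the same host/device round trip as the other new HIP tests: allocate device buffers with the HIP runtime, wrap them in TensorMap, evaluate the expression through .device() on a GpuDevice backed by a HipStreamDevice, then copy the result back asynchronously on that device's stream and synchronize before reading it. A minimal sketch of that pattern, not part of the patch (the function name sketch_hip_argmax and the tensor sizes are invented, and it assumes the same hipcc/Eigen build configuration as the tests):

    #define EIGEN_USE_GPU
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <hip/hip_runtime.h>  // hipMalloc/hipMemcpy; a HIP build may already pull this in
    #include <cassert>

    // Hypothetical name; mirrors the round trip used by the argmax test above.
    void sketch_hip_argmax() {
      Eigen::Tensor<float, 3> in(8, 8, 8);
      in.setRandom();
      Eigen::Tensor<Eigen::DenseIndex, 1> out(1);

      const std::size_t in_bytes  = in.size() * sizeof(float);
      const std::size_t out_bytes = out.size() * sizeof(Eigen::DenseIndex);

      float* d_in = NULL;
      Eigen::DenseIndex* d_out = NULL;
      hipMalloc((void**)(&d_in), in_bytes);
      hipMalloc((void**)(&d_out), out_bytes);
      hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);

      Eigen::HipStreamDevice stream;
      Eigen::GpuDevice gpu_device(&stream);

      Eigen::TensorMap<Eigen::Tensor<float, 3>, Eigen::Aligned> gpu_in(d_in, 8, 8, 8);
      Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 1>, Eigen::Aligned> gpu_out(d_out, 1);

      // Assigning through .device() evaluates the expression on the GPU.
      gpu_out.device(gpu_device) = gpu_in.argmax();

      // Copy back on the device's stream and wait before reading the result.
      assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
                            gpu_device.stream()) == hipSuccess);
      assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);

      hipFree(d_in);
      hipFree(d_out);
    }
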
diff --git a/unsupported/test/cxx11_tensor_cast_float16_hip.cu b/unsupported/test/cxx11_tensor_cast_float16_hip.cu
new file mode 100644
index 000000000..bf6a49df4
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_cast_float16_hip.cu
@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_hip
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_hip_conversion() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ Tensor<float, 1> floats(num_elem);
+ floats.setRandom();
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+ d_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+ d_conv, num_elem);
+
+ gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
+
+ gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
+ gpu_conv.device(gpu_device) = gpu_half.cast<float>();
+
+ Tensor<float, 1> initial(num_elem);
+ Tensor<float, 1> final(num_elem);
+ gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ VERIFY_IS_APPROX(initial(i), final(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_half);
+ gpu_device.deallocate(d_conv);
+}
+
+
+void test_fallback_conversion() {
+ int num_elem = 101;
+ Tensor<float, 1> floats(num_elem);
+ floats.setRandom();
+
+ Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
+ Eigen::Tensor<float, 1> conv = halfs.cast<float>();
+
+ for (int i = 0; i < num_elem; ++i) {
+ VERIFY_IS_APPROX(floats(i), conv(i));
+ }
+}
+
+
+void test_cxx11_tensor_cast_float16_hip()
+{
+ CALL_SUBTEST(test_hip_conversion());
+ CALL_SUBTEST(test_fallback_conversion());
+}
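
Unlike the argmax test, the float16 cast test above manages device memory through the GpuDevice itself (allocate / memcpyHostToDevice / memcpyDeviceToHost / synchronize / deallocate) rather than calling the HIP runtime directly. A condensed sketch of that variant, not part of the patch (sketch_half_roundtrip and the element count are invented; a HIP-capable Eigen build is assumed):

    #define EIGEN_USE_GPU
    #include <unsupported/Eigen/CXX11/Tensor>

    // Hypothetical name; same device-managed allocation scheme as the test above.
    void sketch_half_roundtrip() {
      Eigen::HipStreamDevice stream;
      Eigen::GpuDevice gpu_device(&stream);
      const int n = 64;

      Eigen::Tensor<float, 1> host_in(n);
      host_in.setRandom();
      Eigen::Tensor<float, 1> host_out(n);

      float* d_float = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));
      Eigen::half* d_half = static_cast<Eigen::half*>(gpu_device.allocate(n * sizeof(Eigen::half)));
      float* d_conv = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));

      Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(d_float, n);
      Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(d_half, n);
      Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(d_conv, n);

      gpu_device.memcpyHostToDevice(d_float, host_in.data(), n * sizeof(float));
      gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();  // float -> half on the GPU
      gpu_conv.device(gpu_device) = gpu_half.cast<float>();         // half -> float on the GPU
      gpu_device.memcpyDeviceToHost(host_out.data(), d_conv, n * sizeof(float));
      gpu_device.synchronize();  // the copies above are asynchronous

      gpu_device.deallocate(d_float);
      gpu_device.deallocate(d_half);
      gpu_device.deallocate(d_conv);
    }
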
diff --git a/unsupported/test/cxx11_tensor_contract_hip.cu b/unsupported/test/cxx11_tensor_contract_hip.cu
new file mode 100644
index 000000000..652af0ab0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contract_hip.cu
@@ -0,0 +1,215 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_hip
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+void test_hip_contraction(int m_size, int k_size, int n_size)
+{
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+ // The output of this contraction has m_size * n_size elements, which for the
+ // sizes used below is far more than a single thread block can cover, so the
+ // evaluation is spread over many blocks on the GPU.
+ Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+ Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+ Tensor<float, 2, DataLayout> t_result(m_size, n_size);
+ Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
+ Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(float);
+ std::size_t t_right_bytes = t_right.size() * sizeof(float);
+ std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+ float* d_t_left;
+ float* d_t_right;
+ float* d_t_result;
+
+ hipMalloc((void**)(&d_t_left), t_left_bytes);
+ hipMalloc((void**)(&d_t_right), t_right_bytes);
+ hipMalloc((void**)(&d_t_result), t_result_bytes);
+
+ hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
+
+
+ gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+ t_result = t_left.contract(t_right, dims);
+
+ hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
+ for (DenseIndex i = 0; i < t_result.size(); i++) {
+ if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+ continue;
+ }
+ std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ assert(false);
+ }
+
+ hipFree((void*)d_t_left);
+ hipFree((void*)d_t_right);
+ hipFree((void*)d_t_result);
+}
+
+
+template<int DataLayout>
+void test_scalar(int m_size, int k_size, int n_size)
+{
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+ // Contracting over both dimensions of the inputs reduces the result to a
+ // single scalar.
+ Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+ Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+ Tensor<float, 0, DataLayout> t_result;
+ Tensor<float, 0, DataLayout> t_result_gpu;
+ Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(float);
+ std::size_t t_right_bytes = t_right.size() * sizeof(float);
+ std::size_t t_result_bytes = sizeof(float);
+
+ float* d_t_left;
+ float* d_t_right;
+ float* d_t_result;
+
+ hipMalloc((void**)(&d_t_left), t_left_bytes);
+ hipMalloc((void**)(&d_t_right), t_right_bytes);
+ hipMalloc((void**)(&d_t_result), t_result_bytes);
+
+ hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_left(d_t_left, m_size, k_size);
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_right(d_t_right, k_size, n_size);
+ Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
+ gpu_t_result(d_t_result);
+
+ gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+ t_result = t_left.contract(t_right, dims);
+
+ hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
+ if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
+ !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
+ std::cout << "mismatch detected: " << t_result()
+ << " vs " << t_result_gpu() << std::endl;
+ assert(false);
+ }
+
+ hipFree((void*)d_t_left);
+ hipFree((void*)d_t_right);
+ hipFree((void*)d_t_result);
+}
+
+
+template<int DataLayout>
+void test_hip_contraction_m() {
+ for (int k = 32; k < 256; k++) {
+ test_hip_contraction<ColMajor>(k, 128, 128);
+ test_hip_contraction<RowMajor>(k, 128, 128);
+ }
+}
+
+template<int DataLayout>
+void test_hip_contraction_k() {
+ for (int k = 32; k < 256; k++) {
+ test_hip_contraction<ColMajor>(128, k, 128);
+ test_hip_contraction<RowMajor>(128, k, 128);
+ }
+}
+
+template<int DataLayout>
+void test_hip_contraction_n() {
+ for (int k = 32; k < 256; k++) {
+ test_hip_contraction<ColMajor>(128, 128, k);
+ test_hip_contraction<RowMajor>(128, 128, k);
+ }
+}
+
+
+template<int DataLayout>
+void test_hip_contraction_sizes() {
+ int m_sizes[] = { 31, 39, 63, 64, 65,
+ 127, 129, 255, 257 , 511,
+ 512, 513, 1023, 1024, 1025};
+
+ int n_sizes[] = { 31, 39, 63, 64, 65,
+ 127, 129, 255, 257, 511,
+ 512, 513, 1023, 1024, 1025};
+
+ int k_sizes[] = { 31, 39, 63, 64, 65,
+ 95, 96, 127, 129, 255,
+ 257, 511, 512, 513, 1023,
+ 1024, 1025};
+
+ for (int i = 0; i < 15; i++) {
+ for (int j = 0; j < 15; j++) {
+ for (int k = 0; k < 17; k++) {
+ test_hip_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
+ }
+ }
+ }
+}
+
+void test_cxx11_tensor_hip()
+{
+ CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128));
+ CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128));
+
+ CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128));
+ CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128));
+
+ CALL_SUBTEST(test_hip_contraction_m<ColMajor>());
+ CALL_SUBTEST(test_hip_contraction_m<RowMajor>());
+
+ CALL_SUBTEST(test_hip_contraction_k<ColMajor>());
+ CALL_SUBTEST(test_hip_contraction_k<RowMajor>());
+
+ CALL_SUBTEST(test_hip_contraction_n<ColMajor>());
+ CALL_SUBTEST(test_hip_contraction_n<RowMajor>());
+
+ // Commenting out these tests due to long runtimes
+ // CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>());
+ // CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>());
+}
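
The contraction tests above accept a GPU value if it is either within an absolute tolerance of the CPU reference or passes Eigen::internal::isApprox with the same tolerance, since single-precision contractions over a long inner dimension can legitimately differ from the reference in the last few bits. That check could be factored into a small helper along these lines (a sketch only; close_enough is a hypothetical name, not something the patch adds):

    #include <Eigen/Core>
    #include <cmath>

    // Hypothetical helper mirroring the check in the test above: accept either a
    // small absolute error or a small relative error.
    inline bool close_enough(float cpu, float gpu, float tol = 1e-4f) {
      return std::fabs(cpu - gpu) < tol ||
             Eigen::internal::isApprox(cpu, gpu, tol);
    }
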
diff --git a/unsupported/test/cxx11_tensor_device_hip.cu b/unsupported/test/cxx11_tensor_device_hip.cu
new file mode 100644
index 000000000..b98c481ff
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_device_hip.cu
@@ -0,0 +1,389 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_device
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+// Context for evaluation on cpu
+struct CPUContext {
+ CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
+ kernel_1d_(0) = 3.14f;
+ kernel_1d_(1) = 2.7f;
+
+ kernel_2d_(0,0) = 3.14f;
+ kernel_2d_(1,0) = 2.7f;
+ kernel_2d_(0,1) = 0.2f;
+ kernel_2d_(1,1) = 7.0f;
+
+ kernel_3d_(0,0,0) = 3.14f;
+ kernel_3d_(0,1,0) = 2.7f;
+ kernel_3d_(0,0,1) = 0.2f;
+ kernel_3d_(0,1,1) = 7.0f;
+ kernel_3d_(1,0,0) = -1.0f;
+ kernel_3d_(1,1,0) = -0.3f;
+ kernel_3d_(1,0,1) = -0.7f;
+ kernel_3d_(1,1,1) = -0.5f;
+ }
+
+ const Eigen::DefaultDevice& device() const { return cpu_device_; }
+
+ const Eigen::Tensor<float, 3>& in1() const { return in1_; }
+ const Eigen::Tensor<float, 3>& in2() const { return in2_; }
+ Eigen::Tensor<float, 3>& out() { return out_; }
+ const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
+ const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
+ const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
+
+ private:
+ const Eigen::Tensor<float, 3>& in1_;
+ const Eigen::Tensor<float, 3>& in2_;
+ Eigen::Tensor<float, 3>& out_;
+
+ Eigen::Tensor<float, 1> kernel_1d_;
+ Eigen::Tensor<float, 2> kernel_2d_;
+ Eigen::Tensor<float, 3> kernel_3d_;
+
+ Eigen::DefaultDevice cpu_device_;
+};
+
+
+// Context for evaluation on GPU
+struct GPUContext {
+ GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
+ assert(hipMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == hipSuccess);
+ float kernel_1d_val[] = {3.14f, 2.7f};
+ assert(hipMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
+
+ assert(hipMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == hipSuccess);
+ float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
+ assert(hipMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
+
+ assert(hipMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == hipSuccess);
+ float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
+ assert(hipMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
+ }
+ ~GPUContext() {
+ assert(hipFree(kernel_1d_) == hipSuccess);
+ assert(hipFree(kernel_2d_) == hipSuccess);
+ assert(hipFree(kernel_3d_) == hipSuccess);
+ }
+
+ const Eigen::GpuDevice& device() const { return gpu_device_; }
+
+ const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
+ const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
+ Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
+ Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
+ Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
+
+ private:
+ const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
+ const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
+ Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
+
+ float* kernel_1d_;
+ float* kernel_2d_;
+ float* kernel_3d_;
+
+ Eigen::HipStreamDevice stream_;
+ Eigen::GpuDevice gpu_device_;
+};
+
+
+// The actual expression to evaluate
+template <typename Context>
+void test_contextual_eval(Context* context)
+{
+ context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
+}
+
+template <typename Context>
+void test_forced_contextual_eval(Context* context)
+{
+ context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
+}
+
+template <typename Context>
+void test_compound_assignment(Context* context)
+{
+ context->out().device(context->device()) = context->in1().constant(2.718f);
+ context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
+}
+
+
+template <typename Context>
+void test_contraction(Context* context)
+{
+ Eigen::array<std::pair<int, int>, 2> dims;
+ dims[0] = std::make_pair(1, 1);
+ dims[1] = std::make_pair(2, 2);
+
+ Eigen::array<int, 2> shape(40, 50*70);
+
+ Eigen::DSizes<int, 2> indices(0,0);
+ Eigen::DSizes<int, 2> sizes(40,40);
+
+ context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
+}
+
+
+template <typename Context>
+void test_1d_convolution(Context* context)
+{
+ Eigen::DSizes<int, 3> indices(0,0,0);
+ Eigen::DSizes<int, 3> sizes(40,49,70);
+
+ Eigen::array<int, 1> dims(1);
+ context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
+}
+
+template <typename Context>
+void test_2d_convolution(Context* context)
+{
+ Eigen::DSizes<int, 3> indices(0,0,0);
+ Eigen::DSizes<int, 3> sizes(40,49,69);
+
+ Eigen::array<int, 2> dims(1,2);
+ context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
+}
+
+template <typename Context>
+void test_3d_convolution(Context* context)
+{
+ Eigen::DSizes<int, 3> indices(0,0,0);
+ Eigen::DSizes<int, 3> sizes(39,49,69);
+
+ Eigen::array<int, 3> dims(0,1,2);
+ context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
+}
+
+
+void test_cpu() {
+ Eigen::Tensor<float, 3> in1(40,50,70);
+ Eigen::Tensor<float, 3> in2(40,50,70);
+ Eigen::Tensor<float, 3> out(40,50,70);
+
+ in1 = in1.random() + in1.constant(10.0f);
+ in2 = in2.random() + in2.constant(10.0f);
+
+ CPUContext context(in1, in2, out);
+ test_contextual_eval(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_forced_contextual_eval(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_compound_assignment(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_contraction(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 40; ++j) {
+ const float result = out(i,j,0);
+ float expected = 0;
+ for (int k = 0; k < 50; ++k) {
+ for (int l = 0; l < 70; ++l) {
+ expected += in1(i, k, l) * in2(j, k, l);
+ }
+ }
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+
+ test_1d_convolution(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
+ }
+ }
+ }
+
+ test_2d_convolution(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 69; ++k) {
+ const float result = out(i,j,k);
+ const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
+ (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
+ if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
+ continue;
+ }
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+ }
+
+ test_3d_convolution(&context);
+ for (int i = 0; i < 39; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 69; ++k) {
+ const float result = out(i,j,k);
+ const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+ in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
+ (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+ in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+ if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
+ continue;
+ }
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+ }
+}
+
+void test_gpu() {
+ Eigen::Tensor<float, 3> in1(40,50,70);
+ Eigen::Tensor<float, 3> in2(40,50,70);
+ Eigen::Tensor<float, 3> out(40,50,70);
+ in1 = in1.random() + in1.constant(10.0f);
+ in2 = in2.random() + in2.constant(10.0f);
+
+ std::size_t in1_bytes = in1.size() * sizeof(float);
+ std::size_t in2_bytes = in2.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_in1;
+ float* d_in2;
+ float* d_out;
+ hipMalloc((void**)(&d_in1), in1_bytes);
+ hipMalloc((void**)(&d_in2), in2_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
+
+ GPUContext context(gpu_in1, gpu_in2, gpu_out);
+ test_contextual_eval(&context);
+ assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_forced_contextual_eval(&context);
+ assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_compound_assignment(&context);
+ assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_contraction(&context);
+ assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 40; ++j) {
+ const float result = out(i,j,0);
+ float expected = 0;
+ for (int k = 0; k < 50; ++k) {
+ for (int l = 0; l < 70; ++l) {
+ expected += in1(i, k, l) * in2(j, k, l);
+ }
+ }
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+
+ test_1d_convolution(&context);
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
+ assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
+ }
+ }
+ }
+
+ test_2d_convolution(&context);
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
+ assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 69; ++k) {
+ const float result = out(i,j,k);
+ const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+ in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+ }
+
+ /*
+ test_3d_convolution(&context);
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
+ assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
+ for (int i = 0; i < 39; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 69; ++k) {
+ const float result = out(i,j,k);
+ const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+ in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
+ in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+ in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+ }
+ */
+}
+
+
+void test_cxx11_tensor_device()
+{
+ CALL_SUBTEST(test_cpu());
+ CALL_SUBTEST(test_gpu());
+}
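
The device test above relies on a Context abstraction so that the same templated expressions (test_contextual_eval, test_contraction, the convolutions) run unchanged on either backend: CPUContext exposes a DefaultDevice, GPUContext exposes a GpuDevice built on a HipStreamDevice, and only device() differs. A stripped-down CPU-only illustration of that pattern (a sketch; MiniCpuContext and fill_with_constant are invented names, not part of the patch):

    #include <unsupported/Eigen/CXX11/Tensor>

    // Invented names for illustration; the real test uses CPUContext/GPUContext.
    struct MiniCpuContext {
      explicit MiniCpuContext(Eigen::Tensor<float, 3>& out) : out_(out) {}
      const Eigen::DefaultDevice& device() const { return device_; }
      Eigen::Tensor<float, 3>& out() { return out_; }
     private:
      Eigen::Tensor<float, 3>& out_;
      Eigen::DefaultDevice device_;
    };

    template <typename Context>
    void fill_with_constant(Context* ctx) {
      // The identical line works when device() returns an Eigen::GpuDevice instead.
      ctx->out().device(ctx->device()) = ctx->out().constant(2.718f);
    }

    // Usage:
    //   Eigen::Tensor<float, 3> out(4, 5, 6);
    //   MiniCpuContext ctx(out);
    //   fill_with_constant(&ctx);
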
diff --git a/unsupported/test/cxx11_tensor_hip.cu b/unsupported/test/cxx11_tensor_hip.cu
new file mode 100644
index 000000000..f1794aa87
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_hip.cu
@@ -0,0 +1,1296 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_hip
+#define EIGEN_USE_GPU
+
+#ifdef __NVCC__
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_hip_nullary() {
+ Tensor<float, 1, 0, int> in1(2);
+ Tensor<float, 1, 0, int> in2(2);
+ in1.setRandom();
+ in2.setRandom();
+
+ std::size_t tensor_bytes = in1.size() * sizeof(float);
+
+ float* d_in1;
+ float* d_in2;
+ hipMalloc((void**)(&d_in1), tensor_bytes);
+ hipMalloc((void**)(&d_in2), tensor_bytes);
+ hipMemcpy(d_in1, in1.data(), tensor_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in2, in2.data(), tensor_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
+ d_in1, 2);
+ Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
+ d_in2, 2);
+
+ gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
+ gpu_in2.device(gpu_device) = gpu_in2.random();
+
+ Tensor<float, 1, 0, int> new1(2);
+ Tensor<float, 1, 0, int> new2(2);
+
+ assert(hipMemcpyAsync(new1.data(), d_in1, tensor_bytes, hipMemcpyDeviceToHost,
+ gpu_device.stream()) == hipSuccess);
+ assert(hipMemcpyAsync(new2.data(), d_in2, tensor_bytes, hipMemcpyDeviceToHost,
+ gpu_device.stream()) == hipSuccess);
+
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 2; ++i) {
+ VERIFY_IS_APPROX(new1(i), 3.14f);
+ VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
+ }
+
+ hipFree(d_in1);
+ hipFree(d_in2);
+}
+
+void test_hip_elementwise_small() {
+ Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>{2});
+ Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>{2});
+ Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>{2});
+ in1.setRandom();
+ in2.setRandom();
+
+ std::size_t in1_bytes = in1.size() * sizeof(float);
+ std::size_t in2_bytes = in2.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_in1;
+ float* d_in2;
+ float* d_out;
+ hipMalloc((void**)(&d_in1), in1_bytes);
+ hipMalloc((void**)(&d_in2), in2_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
+ d_in1, Eigen::array<Eigen::DenseIndex, 1>{2});
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
+ d_in2, Eigen::array<Eigen::DenseIndex, 1>{2});
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
+ d_out, Eigen::array<Eigen::DenseIndex, 1>{2});
+
+ gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
+ gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 2; ++i) {
+ VERIFY_IS_APPROX(
+ out(Eigen::array<Eigen::DenseIndex, 1>{i}),
+ in1(Eigen::array<Eigen::DenseIndex, 1>{i}) + in2(Eigen::array<Eigen::DenseIndex, 1>{i}));
+ }
+
+ hipFree(d_in1);
+ hipFree(d_in2);
+ hipFree(d_out);
+}
+
+void test_hip_elementwise()
+{
+ Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ in1.setRandom();
+ in2.setRandom();
+ in3.setRandom();
+
+ std::size_t in1_bytes = in1.size() * sizeof(float);
+ std::size_t in2_bytes = in2.size() * sizeof(float);
+ std::size_t in3_bytes = in3.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_in1;
+ float* d_in2;
+ float* d_in3;
+ float* d_out;
+ hipMalloc((void**)(&d_in1), in1_bytes);
+ hipMalloc((void**)(&d_in2), in2_bytes);
+ hipMalloc((void**)(&d_in3), in3_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in3, in3.data(), in3_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+
+ gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 72; ++i) {
+ for (int j = 0; j < 53; ++j) {
+ for (int k = 0; k < 97; ++k) {
+ VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}), in1(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) + in2(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) * in3(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}));
+ }
+ }
+ }
+
+ hipFree(d_in1);
+ hipFree(d_in2);
+ hipFree(d_in3);
+ hipFree(d_out);
+}
+
+void test_hip_props() {
+ Tensor<float, 1> in1(200);
+ Tensor<bool, 1> out(200);
+ in1.setRandom();
+
+ std::size_t in1_bytes = in1.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(bool);
+
+ float* d_in1;
+ bool* d_out;
+ hipMalloc((void**)(&d_in1), in1_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
+ d_in1, 200);
+ Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
+ d_out, 200);
+
+ gpu_out.device(gpu_device) = (gpu_in1.isnan)();
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
+ gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 200; ++i) {
+ VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
+ }
+
+ hipFree(d_in1);
+ hipFree(d_out);
+}
+
+void test_hip_reduction()
+{
+ Tensor<float, 4> in1(72,53,97,113);
+ Tensor<float, 2> out(72,97);
+ in1.setRandom();
+
+ std::size_t in1_bytes = in1.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_in1;
+ float* d_out;
+ hipMalloc((void**)(&d_in1), in1_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
+ Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+ array<Eigen::DenseIndex, 2> reduction_axis;
+ reduction_axis[0] = 1;
+ reduction_axis[1] = 3;
+
+ gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 72; ++i) {
+ for (int j = 0; j < 97; ++j) {
+ float expected = 0;
+ for (int k = 0; k < 53; ++k) {
+ for (int l = 0; l < 113; ++l) {
+ expected =
+ std::max<float>(expected, in1(i, k, j, l));
+ }
+ }
+ VERIFY_IS_APPROX(out(i,j), expected);
+ }
+ }
+
+ hipFree(d_in1);
+ hipFree(d_out);
+}
+
+template<int DataLayout>
+void test_hip_contraction()
+{
+ // With these dimensions the output has 300 * 140 elements, which is more
+ // than 30 * 1024, the maximum number of threads that can be resident at
+ // once on a 15 SM GK110 GPU.
+ Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
+ Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>{3, 31, 7, 20, 1});
+ Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>{6, 50, 7, 20, 1});
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(float);
+ std::size_t t_right_bytes = t_right.size() * sizeof(float);
+ std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+ float* d_t_left;
+ float* d_t_right;
+ float* d_t_result;
+
+ hipMalloc((void**)(&d_t_left), t_left_bytes);
+ hipMalloc((void**)(&d_t_right), t_right_bytes);
+ hipMalloc((void**)(&d_t_result), t_result_bytes);
+
+ hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
+ Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
+ Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
+
+ typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
+ MapXf m_left(t_left.data(), 300, 93);
+ MapXf m_right(t_right.data(), 93, 140);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
+
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+ Eigen::array<DimPair, 2> dims;
+ dims[0] = DimPair(2, 0);
+ dims[1] = DimPair(3, 1);
+
+ m_result = m_left * m_right;
+ gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+
+ hipMemcpy(t_result.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
+
+ for (DenseIndex i = 0; i < t_result.size(); i++) {
+ if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+ std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
+ assert(false);
+ }
+ }
+
+ hipFree(d_t_left);
+ hipFree(d_t_right);
+ hipFree(d_t_result);
+}
+
+
+template<int DataLayout>
+void test_hip_convolution_1d()
+{
+ Tensor<float, 4, DataLayout> input(74,37,11,137);
+ Tensor<float, 1, DataLayout> kernel(4);
+ Tensor<float, 4, DataLayout> out(74,34,11,137);
+ input = input.constant(10.0f) + input.random();
+ kernel = kernel.constant(7.0f) + kernel.random();
+
+ std::size_t input_bytes = input.size() * sizeof(float);
+ std::size_t kernel_bytes = kernel.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_input;
+ float* d_kernel;
+ float* d_out;
+ hipMalloc((void**)(&d_input), input_bytes);
+ hipMalloc((void**)(&d_kernel), kernel_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
+ Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
+
+ Eigen::array<Eigen::DenseIndex, 1> dims{1};
+ gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 74; ++i) {
+ for (int j = 0; j < 34; ++j) {
+ for (int k = 0; k < 11; ++k) {
+ for (int l = 0; l < 137; ++l) {
+ const float result = out(i,j,k,l);
+ const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
+ input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
+ VERIFY_IS_APPROX(result, expected);
+ }
+ }
+ }
+ }
+
+ hipFree(d_input);
+ hipFree(d_kernel);
+ hipFree(d_out);
+}
+
+void test_hip_convolution_inner_dim_col_major_1d()
+{
+ Tensor<float, 4, ColMajor> input(74,9,11,7);
+ Tensor<float, 1, ColMajor> kernel(4);
+ Tensor<float, 4, ColMajor> out(71,9,11,7);
+ input = input.constant(10.0f) + input.random();
+ kernel = kernel.constant(7.0f) + kernel.random();
+
+ std::size_t input_bytes = input.size() * sizeof(float);
+ std::size_t kernel_bytes = kernel.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_input;
+ float* d_kernel;
+ float* d_out;
+ hipMalloc((void**)(&d_input), input_bytes);
+ hipMalloc((void**)(&d_kernel), kernel_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
+ Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
+ Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
+
+ Eigen::array<Eigen::DenseIndex, 1> dims{0};
+ gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 71; ++i) {
+ for (int j = 0; j < 9; ++j) {
+ for (int k = 0; k < 11; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ const float result = out(i,j,k,l);
+ const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
+ input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
+ VERIFY_IS_APPROX(result, expected);
+ }
+ }
+ }
+ }
+
+ hipFree(d_input);
+ hipFree(d_kernel);
+ hipFree(d_out);
+}
+
+void test_hip_convolution_inner_dim_row_major_1d()
+{
+ Tensor<float, 4, RowMajor> input(7,9,11,74);
+ Tensor<float, 1, RowMajor> kernel(4);
+ Tensor<float, 4, RowMajor> out(7,9,11,71);
+ input = input.constant(10.0f) + input.random();
+ kernel = kernel.constant(7.0f) + kernel.random();
+
+ std::size_t input_bytes = input.size() * sizeof(float);
+ std::size_t kernel_bytes = kernel.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_input;
+ float* d_kernel;
+ float* d_out;
+ hipMalloc((void**)(&d_input), input_bytes);
+ hipMalloc((void**)(&d_kernel), kernel_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
+ Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
+ Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
+
+ Eigen::array<Eigen::DenseIndex, 1> dims{3};
+ gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 7; ++i) {
+ for (int j = 0; j < 9; ++j) {
+ for (int k = 0; k < 11; ++k) {
+ for (int l = 0; l < 71; ++l) {
+ const float result = out(i,j,k,l);
+ const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
+ input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
+ VERIFY_IS_APPROX(result, expected);
+ }
+ }
+ }
+ }
+
+ hipFree(d_input);
+ hipFree(d_kernel);
+ hipFree(d_out);
+}
+
+template<int DataLayout>
+void test_hip_convolution_2d()
+{
+ Tensor<float, 4, DataLayout> input(74,37,11,137);
+ Tensor<float, 2, DataLayout> kernel(3,4);
+ Tensor<float, 4, DataLayout> out(74,35,8,137);
+ input = input.constant(10.0f) + input.random();
+ kernel = kernel.constant(7.0f) + kernel.random();
+
+ std::size_t input_bytes = input.size() * sizeof(float);
+ std::size_t kernel_bytes = kernel.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_input;
+ float* d_kernel;
+ float* d_out;
+ hipMalloc((void**)(&d_input), input_bytes);
+ hipMalloc((void**)(&d_kernel), kernel_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
+
+ Eigen::array<Eigen::DenseIndex, 2> dims{1,2};
+ gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 74; ++i) {
+ for (int j = 0; j < 35; ++j) {
+ for (int k = 0; k < 8; ++k) {
+ for (int l = 0; l < 137; ++l) {
+ const float result = out(i,j,k,l);
+ const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
+ input(i,j+1,k+0,l) * kernel(1,0) +
+ input(i,j+2,k+0,l) * kernel(2,0) +
+ input(i,j+0,k+1,l) * kernel(0,1) +
+ input(i,j+1,k+1,l) * kernel(1,1) +
+ input(i,j+2,k+1,l) * kernel(2,1) +
+ input(i,j+0,k+2,l) * kernel(0,2) +
+ input(i,j+1,k+2,l) * kernel(1,2) +
+ input(i,j+2,k+2,l) * kernel(2,2) +
+ input(i,j+0,k+3,l) * kernel(0,3) +
+ input(i,j+1,k+3,l) * kernel(1,3) +
+ input(i,j+2,k+3,l) * kernel(2,3);
+ VERIFY_IS_APPROX(result, expected);
+ }
+ }
+ }
+ }
+
+ hipFree(d_input);
+ hipFree(d_kernel);
+ hipFree(d_out);
+}
+
+template<int DataLayout>
+void test_hip_convolution_3d()
+{
+ Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>{74,37,11,137,17});
+ Tensor<float, 3, DataLayout> kernel(3,4,2);
+ Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>{74,35,8,136,17});
+ input = input.constant(10.0f) + input.random();
+ kernel = kernel.constant(7.0f) + kernel.random();
+
+ std::size_t input_bytes = input.size() * sizeof(float);
+ std::size_t kernel_bytes = kernel.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_input;
+ float* d_kernel;
+ float* d_out;
+ hipMalloc((void**)(&d_input), input_bytes);
+ hipMalloc((void**)(&d_kernel), kernel_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
+ Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
+ Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
+
+ Eigen::array<Eigen::DenseIndex, 3> dims{1,2,3};
+ gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 74; ++i) {
+ for (int j = 0; j < 35; ++j) {
+ for (int k = 0; k < 8; ++k) {
+ for (int l = 0; l < 136; ++l) {
+ for (int m = 0; m < 17; ++m) {
+ const float result = out(i,j,k,l,m);
+ const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
+ input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
+ input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
+ input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
+ input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
+ input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
+ input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
+ input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
+ input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
+ input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
+ input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
+ input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
+ input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
+ input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
+ input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
+ input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
+ input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
+ input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
+ input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
+ input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
+ input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
+ input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
+ input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
+ input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
+ VERIFY_IS_APPROX(result, expected);
+ }
+ }
+ }
+ }
+ }
+
+ hipFree(d_input);
+ hipFree(d_kernel);
+ hipFree(d_out);
+}
+
+
+template <typename Scalar>
+void test_hip_lgamma(const Scalar stddev)
+{
+ Tensor<Scalar, 2> in(72,97);
+ in.setRandom();
+ in *= in.constant(stddev);
+ Tensor<Scalar, 2> out(72,97);
+ out.setZero();
+
+ std::size_t bytes = in.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+ gpu_out.device(gpu_device) = gpu_in.lgamma();
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 72; ++i) {
+ for (int j = 0; j < 97; ++j) {
+ VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
+ }
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_digamma()
+{
+ Tensor<Scalar, 1> in(7);
+ Tensor<Scalar, 1> out(7);
+ Tensor<Scalar, 1> expected_out(7);
+ out.setZero();
+
+ in(0) = Scalar(1);
+ in(1) = Scalar(1.5);
+ in(2) = Scalar(4);
+ in(3) = Scalar(-10.5);
+ in(4) = Scalar(10000.5);
+ in(5) = Scalar(0);
+ in(6) = Scalar(-1);
+
+ expected_out(0) = Scalar(-0.5772156649015329);
+ expected_out(1) = Scalar(0.03648997397857645);
+ expected_out(2) = Scalar(1.2561176684318);
+ expected_out(3) = Scalar(2.398239129535781);
+ expected_out(4) = Scalar(9.210340372392849);
+ expected_out(5) = std::numeric_limits<Scalar>::infinity();
+ expected_out(6) = std::numeric_limits<Scalar>::infinity();
+
+ std::size_t bytes = in.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
+
+ gpu_out.device(gpu_device) = gpu_in.digamma();
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 5; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+ for (int i = 5; i < 7; ++i) {
+ VERIFY_IS_EQUAL(out(i), expected_out(i));
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_zeta()
+{
+ Tensor<Scalar, 1> in_x(6);
+ Tensor<Scalar, 1> in_q(6);
+ Tensor<Scalar, 1> out(6);
+ Tensor<Scalar, 1> expected_out(6);
+ out.setZero();
+
+ in_x(0) = Scalar(1);
+ in_x(1) = Scalar(1.5);
+ in_x(2) = Scalar(4);
+ in_x(3) = Scalar(-10.5);
+ in_x(4) = Scalar(10000.5);
+ in_x(5) = Scalar(3);
+
+ in_q(0) = Scalar(1.2345);
+ in_q(1) = Scalar(2);
+ in_q(2) = Scalar(1.5);
+ in_q(3) = Scalar(3);
+ in_q(4) = Scalar(1.0001);
+ in_q(5) = Scalar(-2.5);
+
+ expected_out(0) = std::numeric_limits<Scalar>::infinity();
+ expected_out(1) = Scalar(1.61237534869);
+ expected_out(2) = Scalar(0.234848505667);
+ expected_out(3) = Scalar(1.03086757337e-5);
+ expected_out(4) = Scalar(0.367879440865);
+ expected_out(5) = Scalar(0.054102025820864097);
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in_x;
+ Scalar* d_in_q;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in_x), bytes);
+ hipMalloc((void**)(&d_in_q), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in_q, in_q.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+
+ gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
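+  // x = 1 is the pole of the zeta function (expected to be +infinity), and
+  // x = -10.5 is expected to produce NaN on the device, so those two entries
+  // are checked separately from the rest.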
+ VERIFY_IS_EQUAL(out(0), expected_out(0));
+ VERIFY((std::isnan)(out(3)));
+
+ for (int i = 1; i < 6; ++i) {
+ if (i != 3) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+ }
+
+ hipFree(d_in_x);
+ hipFree(d_in_q);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_polygamma()
+{
+ Tensor<Scalar, 1> in_x(7);
+ Tensor<Scalar, 1> in_n(7);
+ Tensor<Scalar, 1> out(7);
+ Tensor<Scalar, 1> expected_out(7);
+ out.setZero();
+
+ in_n(0) = Scalar(1);
+ in_n(1) = Scalar(1);
+ in_n(2) = Scalar(1);
+ in_n(3) = Scalar(17);
+ in_n(4) = Scalar(31);
+ in_n(5) = Scalar(28);
+ in_n(6) = Scalar(8);
+
+ in_x(0) = Scalar(2);
+ in_x(1) = Scalar(3);
+ in_x(2) = Scalar(25.5);
+ in_x(3) = Scalar(4.7);
+ in_x(4) = Scalar(11.8);
+ in_x(5) = Scalar(17.7);
+ in_x(6) = Scalar(30.2);
+
+ expected_out(0) = Scalar(0.644934066848);
+ expected_out(1) = Scalar(0.394934066848);
+ expected_out(2) = Scalar(0.0399946696496);
+ expected_out(3) = Scalar(293.334565435);
+ expected_out(4) = Scalar(0.445487887616);
+ expected_out(5) = Scalar(-2.47810300902e-07);
+ expected_out(6) = Scalar(-8.29668781082e-09);
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in_x;
+ Scalar* d_in_n;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in_x), bytes);
+ hipMalloc((void**)(&d_in_n), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in_n, in_n.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
+
+ gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 7; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+
+ hipFree(d_in_x);
+ hipFree(d_in_n);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_igamma()
+{
+ Tensor<Scalar, 2> a(6, 6);
+ Tensor<Scalar, 2> x(6, 6);
+ Tensor<Scalar, 2> out(6, 6);
+ out.setZero();
+
+ Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+ Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 6; ++j) {
+ a(i, j) = a_s[i];
+ x(i, j) = x_s[j];
+ }
+ }
+
+ Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+ Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
+ {0.0, 0.6321205588285578, 0.7768698398515702,
+ 0.9816843611112658, 9.999500016666262e-05, 1.0},
+ {0.0, 0.4275932955291202, 0.608374823728911,
+ 0.9539882943107686, 7.522076445089201e-07, 1.0},
+ {0.0, 0.01898815687615381, 0.06564245437845008,
+ 0.5665298796332909, 4.166333347221828e-18, 1.0},
+ {0.0, 0.9999780593618628, 0.9999899967080838,
+ 0.9999996219837988, 0.9991370418689945, 1.0},
+ {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+
+
+
+ std::size_t bytes = a.size() * sizeof(Scalar);
+
+ Scalar* d_a;
+ Scalar* d_x;
+ Scalar* d_out;
+ assert(hipMalloc((void**)(&d_a), bytes) == hipSuccess);
+ assert(hipMalloc((void**)(&d_x), bytes) == hipSuccess);
+ assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
+
+ hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+ gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 6; ++j) {
+ if ((std::isnan)(igamma_s[i][j])) {
+ VERIFY((std::isnan)(out(i, j)));
+ } else {
+ VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
+ }
+ }
+ }
+
+ hipFree(d_a);
+ hipFree(d_x);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_igammac()
+{
+ Tensor<Scalar, 2> a(6, 6);
+ Tensor<Scalar, 2> x(6, 6);
+ Tensor<Scalar, 2> out(6, 6);
+ out.setZero();
+
+ Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+ Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 6; ++j) {
+ a(i, j) = a_s[i];
+ x(i, j) = x_s[j];
+ }
+ }
+
+ Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+ Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
+ {1.0, 0.36787944117144233, 0.22313016014842982,
+ 0.018315638888734182, 0.9999000049998333, 0.0},
+ {1.0, 0.5724067044708798, 0.3916251762710878,
+ 0.04601170568923136, 0.9999992477923555, 0.0},
+ {1.0, 0.9810118431238462, 0.9343575456215499,
+ 0.4334701203667089, 1.0, 0.0},
+ {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+ 3.7801620118431334e-07, 0.0008629581310054535,
+ 0.0},
+ {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+
+ std::size_t bytes = a.size() * sizeof(Scalar);
+
+ Scalar* d_a;
+ Scalar* d_x;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_a), bytes);
+ hipMalloc((void**)(&d_x), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+ gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 6; ++j) {
+ if ((std::isnan)(igammac_s[i][j])) {
+ VERIFY((std::isnan)(out(i, j)));
+ } else {
+ VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
+ }
+ }
+ }
+
+ hipFree(d_a);
+ hipFree(d_x);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_erf(const Scalar stddev)
+{
+ Tensor<Scalar, 2> in(72,97);
+ in.setRandom();
+ in *= in.constant(stddev);
+ Tensor<Scalar, 2> out(72,97);
+ out.setZero();
+
+ std::size_t bytes = in.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ assert(hipMalloc((void**)(&d_in), bytes) == hipSuccess);
+ assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
+
+ hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+ gpu_out.device(gpu_device) = gpu_in.erf();
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 72; ++i) {
+ for (int j = 0; j < 97; ++j) {
+ VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
+ }
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_erfc(const Scalar stddev)
+{
+ Tensor<Scalar, 2> in(72,97);
+ in.setRandom();
+ in *= in.constant(stddev);
+ Tensor<Scalar, 2> out(72,97);
+ out.setZero();
+
+ std::size_t bytes = in.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+ gpu_out.device(gpu_device) = gpu_in.erfc();
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 72; ++i) {
+ for (int j = 0; j < 97; ++j) {
+ VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
+ }
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_betainc()
+{
+ Tensor<Scalar, 1> in_x(125);
+ Tensor<Scalar, 1> in_a(125);
+ Tensor<Scalar, 1> in_b(125);
+ Tensor<Scalar, 1> out(125);
+ Tensor<Scalar, 1> expected_out(125);
+ out.setZero();
+
+ Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+
+ Array<Scalar, 1, Dynamic> x(125);
+ Array<Scalar, 1, Dynamic> a(125);
+ Array<Scalar, 1, Dynamic> b(125);
+ Array<Scalar, 1, Dynamic> v(125);
+
+ a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
+
+ b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999;
+
+ x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+ 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+ 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+ 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+ 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+ -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+ 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+ 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+ 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
+
+ v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+ nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+ nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+ 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+ 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+ 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
+ nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+ 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+ 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+ 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+ 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+ 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
+ nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
+ nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
+ nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
+ nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
+ 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
+ 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
+ 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
+ 0.5000000000004947, 0.9999999999999999, nan;
+
+ for (int i = 0; i < 125; ++i) {
+ in_x(i) = x(i);
+ in_a(i) = a(i);
+ in_b(i) = b(i);
+ expected_out(i) = v(i);
+ }
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in_x;
+ Scalar* d_in_a;
+ Scalar* d_in_b;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in_x), bytes);
+ hipMalloc((void**)(&d_in_a), bytes);
+ hipMalloc((void**)(&d_in_b), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in_a, in_a.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in_b, in_b.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
+
+ gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 1; i < 125; ++i) {
+ if ((std::isnan)(expected_out(i))) {
+ VERIFY((std::isnan)(out(i)));
+ } else {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+ }
+
+ hipFree(d_in_x);
+ hipFree(d_in_a);
+ hipFree(d_in_b);
+ hipFree(d_out);
+}
+
+
+void test_cxx11_tensor_hip()
+{
+ CALL_SUBTEST(test_hip_nullary());
+ CALL_SUBTEST(test_hip_elementwise_small());
+ CALL_SUBTEST(test_hip_elementwise());
+ CALL_SUBTEST(test_hip_props());
+ CALL_SUBTEST(test_hip_reduction());
+  // FIXME : uncommenting the following tests results in a compile failure
+ // CALL_SUBTEST(test_hip_contraction<ColMajor>());
+ // CALL_SUBTEST(test_hip_contraction<RowMajor>());
+ CALL_SUBTEST(test_hip_convolution_1d<ColMajor>());
+ CALL_SUBTEST(test_hip_convolution_1d<RowMajor>());
+ CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d());
+ CALL_SUBTEST(test_hip_convolution_inner_dim_row_major_1d());
+ CALL_SUBTEST(test_hip_convolution_2d<ColMajor>());
+ CALL_SUBTEST(test_hip_convolution_2d<RowMajor>());
+  // The following two tests are commented out due to their long runtime
+ // CALL_SUBTEST(test_hip_convolution_3d<ColMajor>());
+ // CALL_SUBTEST(test_hip_convolution_3d<RowMajor>());
+
+#if __cplusplus > 199711L
+  // std::erf, std::erfc, and so on were only added in C++11. We use them
+  // as a golden reference to validate the results produced by Eigen. Therefore
+  // we can only run these tests if we use a C++11 compiler.
+
+ CALL_SUBTEST(test_hip_lgamma<float>(1.0f));
+ CALL_SUBTEST(test_hip_lgamma<float>(100.0f));
+ CALL_SUBTEST(test_hip_lgamma<float>(0.01f));
+ CALL_SUBTEST(test_hip_lgamma<float>(0.001f));
+
+ CALL_SUBTEST(test_hip_lgamma<double>(1.0));
+ CALL_SUBTEST(test_hip_lgamma<double>(100.0));
+ CALL_SUBTEST(test_hip_lgamma<double>(0.01));
+ CALL_SUBTEST(test_hip_lgamma<double>(0.001));
+
+ CALL_SUBTEST(test_hip_erf<float>(1.0f));
+ CALL_SUBTEST(test_hip_erf<float>(100.0f));
+ CALL_SUBTEST(test_hip_erf<float>(0.01f));
+ CALL_SUBTEST(test_hip_erf<float>(0.001f));
+
+ CALL_SUBTEST(test_hip_erfc<float>(1.0f));
+ // CALL_SUBTEST(test_hip_erfc<float>(100.0f)); // HIP erfc lacks precision for large inputs
+ CALL_SUBTEST(test_hip_erfc<float>(5.0f));
+ CALL_SUBTEST(test_hip_erfc<float>(0.01f));
+ CALL_SUBTEST(test_hip_erfc<float>(0.001f));
+
+ CALL_SUBTEST(test_hip_erf<double>(1.0));
+ CALL_SUBTEST(test_hip_erf<double>(100.0));
+ CALL_SUBTEST(test_hip_erf<double>(0.01));
+ CALL_SUBTEST(test_hip_erf<double>(0.001));
+
+ CALL_SUBTEST(test_hip_erfc<double>(1.0));
+ // CALL_SUBTEST(test_hip_erfc<double>(100.0)); // HIP erfc lacks precision for large inputs
+ CALL_SUBTEST(test_hip_erfc<double>(5.0));
+ CALL_SUBTEST(test_hip_erfc<double>(0.01));
+ CALL_SUBTEST(test_hip_erfc<double>(0.001));
+
+  // The following tests have functional failures on some seeds
+ // CALL_SUBTEST(test_hip_digamma<float>());
+ // CALL_SUBTEST(test_hip_digamma<double>());
+
+  // The following tests have functional failures on some seeds
+ // CALL_SUBTEST(test_hip_polygamma<float>());
+ // CALL_SUBTEST(test_hip_polygamma<double>());
+
+  // The following tests have functional failures on some seeds
+ // CALL_SUBTEST(test_hip_zeta<float>());
+ // CALL_SUBTEST(test_hip_zeta<double>());
+
+ CALL_SUBTEST(test_hip_igamma<float>());
+ CALL_SUBTEST(test_hip_igammac<float>());
+
+ CALL_SUBTEST(test_hip_igamma<double>());
+ CALL_SUBTEST(test_hip_igammac<double>());
+
+ CALL_SUBTEST(test_hip_betainc<float>());
+ CALL_SUBTEST(test_hip_betainc<double>());
+#endif
+}
diff --git a/unsupported/test/cxx11_tensor_of_float16_hip.cu b/unsupported/test/cxx11_tensor_of_float16_hip.cu
new file mode 100644
index 000000000..6c1a401bf
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_float16_hip.cu
@@ -0,0 +1,498 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_hip
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+using Eigen::Tensor;
+
+template<typename>
+void test_hip_numext() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+ bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
+ d_res_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, num_elem);
+
+ gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+ gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
+ gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
+
+ Tensor<bool, 1> half_prec(num_elem);
+ Tensor<bool, 1> full_prec(num_elem);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking numext " << i << std::endl;
+ VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+
+#ifdef EIGEN_HAS_HIP_FP16
+
+template<typename>
+void test_hip_conversion() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+ d_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+ d_conv, num_elem);
+
+ gpu_float.device(gpu_device) = gpu_float.random();
+ gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
+ gpu_conv.device(gpu_device) = gpu_half.cast<float>();
+
+ Tensor<float, 1> initial(num_elem);
+ Tensor<float, 1> final(num_elem);
+ gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+
+ for (int i = 0; i < num_elem; ++i) {
+ VERIFY_IS_APPROX(initial(i), final(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_half);
+ gpu_device.deallocate(d_conv);
+}
+
+template<typename>
+void test_hip_unary() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+ d_res_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, num_elem);
+
+ gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+ gpu_res_float.device(gpu_device) = gpu_float.abs();
+ gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
+
+ Tensor<float, 1> half_prec(num_elem);
+ Tensor<float, 1> full_prec(num_elem);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking unary " << i << std::endl;
+ VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_hip_elementwise() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
+ d_float1, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
+ d_float2, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+ d_res_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, num_elem);
+
+ gpu_float1.device(gpu_device) = gpu_float1.random();
+ gpu_float2.device(gpu_device) = gpu_float2.random();
+ gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
+ gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
+
+ Tensor<float, 1> half_prec(num_elem);
+ Tensor<float, 1> full_prec(num_elem);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
+ VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
+ }
+
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_hip_trancendental() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
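+  // Note that gpu_res4_half/gpu_res4_float alias the d_res3_* buffers, so the
+  // expm1 results written below overwrite the log1p results and the final
+  // "plog1" check effectively validates expm1.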
+
+ gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+ gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+ gpu_float3.device(gpu_device) = gpu_float3.random();
+ gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
+ gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
+ gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+ gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
+
+ gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
+ gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
+
+ gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
+ gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
+
+ gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+ gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
+
+ gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+ gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
+
+ Tensor<float, 1> input1(num_elem);
+ Tensor<Eigen::half, 1> half_prec1(num_elem);
+ Tensor<Eigen::half, 1> full_prec1(num_elem);
+ Tensor<float, 1> input2(num_elem);
+ Tensor<Eigen::half, 1> half_prec2(num_elem);
+ Tensor<Eigen::half, 1> full_prec2(num_elem);
+ Tensor<float, 1> input3(num_elem);
+ Tensor<Eigen::half, 1> half_prec3(num_elem);
+ Tensor<Eigen::half, 1> full_prec3(num_elem);
+ gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
+ }
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
+    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy near 1
+ VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
+ else
+ VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+ }
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
+ }
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_float3);
+ gpu_device.deallocate(d_res1_half);
+ gpu_device.deallocate(d_res1_float);
+ gpu_device.deallocate(d_res2_half);
+ gpu_device.deallocate(d_res2_float);
+ gpu_device.deallocate(d_res3_float);
+ gpu_device.deallocate(d_res3_half);
+}
+
+template<typename>
+void test_hip_contractions() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int rows = 23;
+ int cols = 23;
+ int num_elem = rows*cols;
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+ d_float1, rows, cols);
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+ d_float2, rows, cols);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
+ d_res_half, rows, cols);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
+ d_res_float, rows, cols);
+
+ gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+ gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
+
+ typedef Tensor<float, 2>::DimensionPair DimPair;
+ Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+ gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
+
+ Tensor<Eigen::half, 2> half_prec(rows, cols);
+ Tensor<Eigen::half, 2> full_prec(rows, cols);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < rows; ++i) {
+ for (int j = 0; j < cols; ++j) {
+ std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
+ if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
+ VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+ }
+ }
+ }
+
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_hip_reductions(int size1, int size2, int redux) {
+
+ std::cout << "Reducing " << size1 << " by " << size2
+ << " tensor along dim " << redux << std::endl;
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = size1*size2;
+ int result_size = (redux == 1 ? size1 : size2);
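+  // Reducing along dimension `redux` collapses that dimension, so the result
+  // has one entry per element of the remaining dimension.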
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
+ Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+ d_float1, size1, size2);
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+ d_float2, size1, size2);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
+ d_res_half, result_size);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, result_size);
+
+ gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
+ gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
+
+ Eigen::array<int, 1> redux_dim(redux);
+ gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
+
+ Tensor<Eigen::half, 1> half_prec(result_size);
+ Tensor<Eigen::half, 1> full_prec(result_size);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < result_size; ++i) {
+ std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+ }
+
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_hip_reductions() {
+ test_hip_reductions<void>(13, 13, 0);
+ test_hip_reductions<void>(13, 13, 1);
+
+ test_hip_reductions<void>(35, 36, 0);
+ test_hip_reductions<void>(35, 36, 1);
+
+ test_hip_reductions<void>(36, 35, 0);
+ test_hip_reductions<void>(36, 35, 1);
+}
+
+template<typename>
+void test_hip_full_reductions() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int size = 13;
+ int num_elem = size*size;
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+ Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+ d_float1, size, size);
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+ d_float2, size, size);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
+ d_res_half);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
+ d_res_float);
+
+ gpu_float1.device(gpu_device) = gpu_float1.random();
+ gpu_float2.device(gpu_device) = gpu_float2.random();
+
+ gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
+
+ Tensor<Eigen::half, 0> half_prec;
+ Tensor<Eigen::half, 0> full_prec;
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ VERIFY_IS_APPROX(full_prec(), half_prec());
+
+ gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ VERIFY_IS_APPROX(full_prec(), half_prec());
+
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_hip_forced_evals() {
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
+ d_res_half1, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+ d_res_half2, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, num_elem);
+
+ Eigen::array<int, 1> no_bcast;
+ no_bcast[0] = 1;
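+  // Broadcasting by a factor of 1 leaves the values unchanged; together with
+  // .eval() it forces the intermediate half-precision result to be materialized
+  // before the cast back to float.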
+
+ gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+ gpu_res_float.device(gpu_device) = gpu_float.abs();
+ gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+ gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
+
+ Tensor<float, 1> half_prec1(num_elem);
+ Tensor<float, 1> half_prec2(num_elem);
+ Tensor<float, 1> full_prec(num_elem);
+ gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
+ VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_res_half1);
+ gpu_device.deallocate(d_res_half2);
+ gpu_device.deallocate(d_res_float);
+}
+#endif
+
+
+void test_cxx11_tensor_of_float16_hip()
+{
+ CALL_SUBTEST(test_hip_numext<void>());
+
+#ifdef EIGEN_HAS_HIP_FP16
+ CALL_SUBTEST(test_hip_conversion<void>());
+ CALL_SUBTEST(test_hip_unary<void>());
+ CALL_SUBTEST(test_hip_elementwise<void>());
+ CALL_SUBTEST(test_hip_trancendental<void>());
+ CALL_SUBTEST(test_hip_contractions<void>());
+ CALL_SUBTEST(test_hip_reductions<void>());
+ CALL_SUBTEST(test_hip_full_reductions<void>());
+ CALL_SUBTEST(test_hip_forced_evals<void>());
+#else
+ std::cout << "Half floats are not supported by this version of hip: skipping the test" << std::endl;
+#endif
+}
diff --git a/unsupported/test/cxx11_tensor_random_hip.cu b/unsupported/test/cxx11_tensor_random_hip.cu
new file mode 100644
index 000000000..7d7e72ca2
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random_hip.cu
@@ -0,0 +1,85 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_random_hip
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+
+void test_hip_random_uniform()
+{
+ Tensor<float, 2> out(72,97);
+ out.setZero();
+
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_out;
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+ gpu_out.device(gpu_device) = gpu_out.random();
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+  // For now we just check that this code doesn't crash.
+ // TODO: come up with a valid test of randomness
+}
+
+
+void test_hip_random_normal()
+{
+ Tensor<float, 2> out(72,97);
+ out.setZero();
+
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_out;
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+ Eigen::internal::NormalRandomGenerator<float> gen(true);
+ gpu_out.device(gpu_device) = gpu_out.random(gen);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+}
+
+static void test_complex()
+{
+ Tensor<std::complex<float>, 1> vec(6);
+ vec.setRandom();
+
+ // Fixme: we should check that the generated numbers follow a uniform
+ // distribution instead.
+ for (int i = 1; i < 6; ++i) {
+ VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+ }
+}
+
+
+void test_cxx11_tensor_random_hip()
+{
+ CALL_SUBTEST(test_hip_random_uniform());
+ CALL_SUBTEST(test_hip_random_normal());
+ CALL_SUBTEST(test_complex());
+}
diff --git a/unsupported/test/cxx11_tensor_reduction_hip.cu b/unsupported/test/cxx11_tensor_reduction_hip.cu
new file mode 100644
index 000000000..c5aad05be
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reduction_hip.cu
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_reduction_hip
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+template<typename Type, int DataLayout>
+static void test_full_reductions() {
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ const int num_rows = internal::random<int>(1024, 5*1024);
+ const int num_cols = internal::random<int>(1024, 5*1024);
+
+ Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
+ in.setRandom();
+
+ Tensor<Type, 0, DataLayout> full_redux;
+ full_redux = in.sum();
+
+ std::size_t in_bytes = in.size() * sizeof(Type);
+ std::size_t out_bytes = full_redux.size() * sizeof(Type);
+ Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
+ Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
+ gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+ TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+ TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
+
+ out_gpu.device(gpu_device) = in_gpu.sum();
+
+ Tensor<Type, 0, DataLayout> full_redux_gpu;
+ gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+ gpu_device.synchronize();
+
+ // Check that the CPU and GPU reductions return the same result.
+ VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+ gpu_device.deallocate(gpu_in_ptr);
+ gpu_device.deallocate(gpu_out_ptr);
+}
+
+template<typename Type, int DataLayout>
+static void test_first_dim_reductions() {
+ int dim_x = 33;
+ int dim_y = 1;
+ int dim_z = 128;
+
+ Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+ in.setRandom();
+
+ Eigen::array<int, 1> red_axis;
+ red_axis[0] = 0;
+ Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+ // Create device
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice dev(&stream);
+
+ // Create data(T)
+ Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+ Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
+ Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+ Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
+
+ // Perform operation
+ dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+ gpu_out.device(dev) = gpu_in.sum(red_axis);
+ gpu_out.device(dev) += gpu_in.sum(red_axis);
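+  // The reduction is evaluated twice (assign, then accumulate), so the check
+  // below compares against 2*redux.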
+ Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
+ dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+ dev.synchronize();
+
+ // Check that the CPU and GPU reductions return the same result.
+ for (int i = 0; i < gpu_out.size(); ++i) {
+ VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+ }
+
+ dev.deallocate(in_data);
+ dev.deallocate(out_data);
+}
+
+template<typename Type, int DataLayout>
+static void test_last_dim_reductions() {
+ int dim_x = 128;
+ int dim_y = 1;
+ int dim_z = 33;
+
+ Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+ in.setRandom();
+
+ Eigen::array<int, 1> red_axis;
+ red_axis[0] = 2;
+ Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+ // Create device
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice dev(&stream);
+
+ // Create data
+ Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+ Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
+ Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+ Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
+
+ // Perform operation
+ dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+ gpu_out.device(dev) = gpu_in.sum(red_axis);
+ gpu_out.device(dev) += gpu_in.sum(red_axis);
+ Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
+ dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+ dev.synchronize();
+
+ // Check that the CPU and GPU reductions return the same result.
+ for (int i = 0; i < gpu_out.size(); ++i) {
+ VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+ }
+
+ dev.deallocate(in_data);
+ dev.deallocate(out_data);
+}
+
+
+void test_cxx11_tensor_reduction_hip() {
+ CALL_SUBTEST((test_full_reductions<float, ColMajor>()));
+ CALL_SUBTEST((test_full_reductions<double, ColMajor>()));
+ CALL_SUBTEST((test_full_reductions<float, RowMajor>()));
+ CALL_SUBTEST((test_full_reductions<double, RowMajor>()));
+
+ CALL_SUBTEST((test_first_dim_reductions<float, ColMajor>()));
+ CALL_SUBTEST((test_first_dim_reductions<double, ColMajor>()));
+ CALL_SUBTEST((test_first_dim_reductions<float, RowMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+// CALL_SUBTEST((test_first_dim_reductions<double, RowMajor>()))
+
+ CALL_SUBTEST((test_last_dim_reductions<float, ColMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+// CALL_SUBTEST((test_last_dim_reductions<double, ColMajor>()));
+ CALL_SUBTEST((test_last_dim_reductions<float, RowMajor>()));
+ CALL_SUBTEST((test_last_dim_reductions<double, RowMajor>()));
+}
diff --git a/unsupported/test/cxx11_tensor_scan_hip.cu b/unsupported/test/cxx11_tensor_scan_hip.cu
new file mode 100644
index 000000000..f4c4f59b9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan_hip.cu
@@ -0,0 +1,76 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_scan_hip
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+void test_hip_cumsum(int m_size, int k_size, int n_size)
+{
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+ Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
+ Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
+ Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);
+
+ t_input.setRandom();
+
+ std::size_t t_input_bytes = t_input.size() * sizeof(float);
+ std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+ float* d_t_input;
+ float* d_t_result;
+
+ hipMalloc((void**)(&d_t_input), t_input_bytes);
+ hipMalloc((void**)(&d_t_result), t_result_bytes);
+
+ hipMemcpy(d_t_input, t_input.data(), t_input_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+ gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
+ Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+ gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
+
+ gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
+ t_result = t_input.cumsum(1);
+
+ hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
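+  // Accept a match under either an absolute or a relative tolerance of 1e-4,
+  // since the GPU and CPU may accumulate the running sums in a different order.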
+ for (DenseIndex i = 0; i < t_result.size(); i++) {
+ if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+ continue;
+ }
+ std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ assert(false);
+ }
+
+ hipFree((void*)d_t_input);
+ hipFree((void*)d_t_result);
+}
+
+
+void test_cxx11_tensor_scan_hip()
+{
+ CALL_SUBTEST(test_hip_cumsum<ColMajor>(128, 128, 128));
+ CALL_SUBTEST(test_hip_cumsum<RowMajor>(128, 128, 128));
+}